60139e00c27a8dcf3556529a6006223c9c452c95
[melted] / src / modules / xine / deinterlace.c
1 /*
2 * Copyright (C) 2001 the xine project
3 *
4 * This file is part of xine, a free video player.
5 *
6 * xine is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * xine is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
19 *
20 * Deinterlace routines by Miguel Freitas
21 * based on DScaler project sources (deinterlace.sourceforge.net)
22 *
23 * Currently only available for Xv driver and MMX extensions
24 *
25 * small todo list:
26 * - implement non-MMX versions for all methods
27 * - support MMX2 instructions
28 * - move some generic code from xv driver to this file
29 * - make it also work for yuy2 frames
30 *
31 */
32
33 #include <stdio.h>
34 #include <string.h>
35 #include "deinterlace.h"
36 #include "xineutils.h"
37
38 #define xine_fast_memcpy memcpy
39
/*
 * DeinterlaceFieldBob algorithm
 * Based on Virtual Dub plugin by Gunnar Thalin
 * MMX asm version from dscaler project (deinterlace.sourceforge.net)
 * Linux version for Xine player by Miguel Freitas
 *
 * Builds the output frame from psrc[0] only.  One field's lines are copied
 * verbatim; each line in between is, per 16-bit word, either kept from the
 * source or replaced by the average of the lines directly above and below
 * it, depending on an edge/jaggie test.
 *
 *   pdst   - destination buffer, addressed as lines of `width` bytes
 *   psrc   - source frames; only psrc[0] is read here
 *   width  - bytes per line
 *   height - number of lines
 *
 * Notes:
 *   - The body exists only when ARCH_X86 or ARCH_X86_64 is defined; in any
 *     other build the function does nothing.
 *   - The inner loop stores 8 bytes per step (width >> 3 steps), so bytes
 *     past the last multiple of 8 of an interpolated line are not written.
 *   - height is halved with integer division, so with an odd height the
 *     final line is not written.
 */
static void deinterlace_bob_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
                                     int width, int height )
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
  int Line;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *Dest;
  uint8_t *pEvenLines = psrc[0];
  uint8_t *pOddLines = psrc[0] + width;
  int LineLength = width;
  int SourcePitch = width * 2;      /* distance between lines of one field */
  int IsOdd = 1;                    /* field selection is fixed here */
  long EdgeDetect = 625;
  long JaggieThreshold = 73;

  int n;

  uint64_t qwEdgeDetect;
  uint64_t qwThreshold;

  /* selects byte 0 of every 16-bit word (the "intensity" used by the test) */
  static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
  /* clears bit 0 of every byte so a word-wise shift cannot leak between bytes */
  static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};

  /*
   * The code below unconditionally copies the first two lines, so a frame
   * with fewer than two lines would overrun both buffers.  There is nothing
   * to interpolate in that case: pass through what exists and stop.
   */
  if (height < 2)
  {
    if (height == 1)
      xine_fast_memcpy(pdst, pEvenLines, LineLength);
    return;
  }

  /* replicate each 16-bit constant into all four words of a quadword */
  qwEdgeDetect = EdgeDetect;
  qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  qwThreshold = JaggieThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);

  // copy first even line no matter what, and the first odd line if we're
  // processing an odd field.
  xine_fast_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);

  height = height / 2;              /* from here on: lines per field */
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that IsOdd is true).  The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
    // the words' meanings invert for the other field.

    // Copy the odd line to the overlay verbatim.
    xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;            /* whole quadwords per line */
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);     // mm0 = O1 (line above)
      movq_m2r (*YVal2++, mm1);     // mm1 = E  (line being decided)
      movq_m2r (*YVal3++, mm2);     // mm2 = O2 (line below)

      // get intensities in mm3 - 4
      movq_r2r ( mm0, mm3 );
      pand_m2r ( YMask, mm3 );
      movq_r2r ( mm1, mm4 );
      pand_m2r ( YMask, mm4 );
      movq_r2r ( mm2, mm5 );
      pand_m2r ( YMask, mm5 );

      // get average in mm0
      pand_m2r ( Mask, mm0 );
      pand_m2r ( Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
      // result will be in mm6

      psrlw_i2r ( 01, mm3 );
      psrlw_i2r ( 01, mm4 );
      psrlw_i2r ( 01, mm5 );

      movq_r2r ( mm3, mm6 );
      psubw_r2r ( mm4, mm6 );       // mm6 = O1 - E

      movq_r2r ( mm5, mm7 );
      psubw_r2r ( mm4, mm7 );       // mm7 = O2 - E

      pmullw_r2r ( mm7, mm6 );      // mm6 = (O1 - E) * (O2 - E)

      movq_r2r ( mm3, mm7 );
      psubw_r2r ( mm5, mm7 );       // mm7 = (O1 - O2)
      pmullw_r2r ( mm7, mm7 );      // mm7 = (O1 - O2) ^ 2
      psrlw_i2r ( 12, mm7 );        // mm7 = (O1 - O2) ^ 2 >> 12
      pmullw_m2r ( *&qwEdgeDetect, mm7 ); // mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12

      psubw_r2r ( mm7, mm6 );       // mm6 is what we want

      // mm6 = 0xffff in every word whose value exceeds JaggieThreshold
      pcmpgtw_m2r ( *&qwThreshold, mm6 );

      movq_r2r ( mm6, mm7 );

      pand_r2r ( mm6, mm0 );        // averaged data where the test fired

      pandn_r2r ( mm1, mm7 );       // original E data everywhere else

      por_r2r ( mm0, mm7 );

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an even field.
  if (! IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
                     pOddLines + (height - 1) * SourcePitch,
                     LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();
#endif
}
184
/* Deinterlace the latest field, with a tendency to weave rather than bob.
   Good for high detail on low-movement scenes.
   Seems to produce bad output in general case, need to check if this
   is normal or if the code is broken.

   pdst   - destination buffer, addressed as lines of `width` bytes
   psrc   - source frames; psrc[0] is the current frame and psrc[1] the
            frame compared against as the "previous" data
   width  - bytes per line
   height - number of lines

   Returns 0 without touching pdst when psrc[0] or psrc[1] is NULL or when
   the frame has fewer than two lines, and 1 otherwise.  When neither
   ARCH_X86 nor ARCH_X86_64 is defined the body is compiled out and the
   function just returns 1.

   The inner loop stores 8 bytes per step (width >> 3 steps), so bytes past
   the last multiple of 8 of a computed line are not written.
*/
static int deinterlace_weave_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
                                      int width, int height )
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)

  int Line;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *YVal4;
  uint64_t *Dest;
  uint8_t *pEvenLines = psrc[0];
  uint8_t *pOddLines = psrc[0] + width;
  uint8_t *pPrevLines;

  int LineLength = width;
  int SourcePitch = width * 2;      /* distance between lines of one field */
  int IsOdd = 1;                    /* field selection is fixed here */

  long TemporalTolerance = 300;
  long SpatialTolerance = 600;
  long SimilarityThreshold = 25;

  int n;

  uint64_t qwSpatialTolerance;
  uint64_t qwTemporalTolerance;
  uint64_t qwThreshold;

  /* selects byte 0 of every 16-bit word (the "intensity" used by the test) */
  static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
  /* clears bit 0 of every byte so a word-wise shift cannot leak between bytes */
  static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};


  // Make sure we have all the data we need.
  if ( psrc[0] == NULL || psrc[1] == NULL )
    return 0;

  /*
   * With fewer than two lines height/2 is 0, and the final "last odd line"
   * copy below would then target pdst + (0 * 2 - 1) * LineLength, i.e. one
   * line BEFORE the destination buffer.  Report failure instead so the
   * caller can fall back to a plain copy.
   */
  if ( height < 2 )
    return 0;

  if (IsOdd)
    pPrevLines = psrc[1] + width;
  else
    pPrevLines = psrc[1];

  // Since the code uses MMX to process 4 pixels at a time, we need our constants
  // to be represented 4 times per quadword.
  qwSpatialTolerance = SpatialTolerance;
  qwSpatialTolerance += (qwSpatialTolerance << 48) + (qwSpatialTolerance << 32) + (qwSpatialTolerance << 16);
  qwTemporalTolerance = TemporalTolerance;
  qwTemporalTolerance += (qwTemporalTolerance << 48) + (qwTemporalTolerance << 32) + (qwTemporalTolerance << 16);
  qwThreshold = SimilarityThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);

  // copy first even line no matter what, and the first odd line if we're
  // processing an even field.
  xine_fast_memcpy(pdst, pEvenLines, LineLength);
  if (!IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);

  height = height / 2;              /* from here on: lines per field */
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal4 = (uint64_t *)(pPrevLines + Line * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      YVal4 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that IsOdd is true).  The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
    // the words' meanings invert for the other field.

    // Copy the even scanline below this one to the overlay buffer, since we'll be
    // adapting the current scanline to the even lines surrounding it.  The scanline
    // above has already been copied by the previous pass through the loop.
    xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;            /* whole quadwords per line */
    while( n-- )
    {
      movq_m2r ( *YVal1++, mm0 );   // mm0 = E1
      movq_m2r ( *YVal2++, mm1 );   // mm1 = O
      movq_m2r ( *YVal3++, mm2 );   // mm2 = E2

      movq_r2r ( mm0, mm3 );        // mm3 = intensity(E1)
      movq_r2r ( mm1, mm4 );        // mm4 = intensity(O)
      movq_r2r ( mm2, mm6 );        // mm6 = intensity(E2)

      pand_m2r ( YMask, mm3 );
      pand_m2r ( YMask, mm4 );
      pand_m2r ( YMask, mm6 );

      // Average E1 and E2 for interpolated bobbing.
      // leave result in mm0
      pand_m2r ( Mask, mm0 );       // mm0 = E1 with lower chroma bit stripped off
      pand_m2r ( Mask, mm2 );       // mm2 = E2 with lower chroma bit stripped off
      psrlw_i2r ( 01, mm0 );        // mm0 = E1 / 2
      psrlw_i2r ( 01, mm2 );        // mm2 = E2 / 2
      paddb_r2r ( mm2, mm0 );

      // The meat of the work is done here.  We want to see whether this pixel is
      // close in luminosity to ANY of: its top neighbor, its bottom neighbor,
      // or its predecessor.  To do this without branching, we use MMX's
      // saturation feature, which gives us Z(x) = x if x>=0, or 0 if x<0.
      //
      // The formula we're computing here is
      //    Z(ST - (E1 - O) ^ 2) + Z(ST - (E2 - O) ^ 2) + Z(TT - (Oold - O) ^ 2)
      // where ST is spatial tolerance and TT is temporal tolerance.  The idea
      // is that if a pixel is similar to none of its neighbors, the resulting
      // value will be pretty low, probably zero.  A high value therefore indicates
      // that the pixel had a similar neighbor.  The pixel in the same position
      // in the field before last (Oold) is considered a neighbor since we want
      // to be able to display 1-pixel-high horizontal lines.

      movq_m2r ( *&qwSpatialTolerance, mm7 );
      movq_r2r ( mm3, mm5 );        // mm5 = E1
      psubsw_r2r ( mm4, mm5 );      // mm5 = E1 - O
      psraw_i2r ( 1, mm5 );
      pmullw_r2r ( mm5, mm5 );      // mm5 = (E1 - O) ^ 2
      psubusw_r2r ( mm5, mm7 );     // mm7 = ST - (E1 - O) ^ 2, or 0 if that's negative

      movq_m2r ( *&qwSpatialTolerance, mm3 );
      movq_r2r ( mm6, mm5 );        // mm5 = E2
      psubsw_r2r ( mm4, mm5 );      // mm5 = E2 - O
      psraw_i2r ( 1, mm5 );
      pmullw_r2r ( mm5, mm5 );      // mm5 = (E2 - O) ^ 2
      psubusw_r2r ( mm5, mm3 );     // mm3 = ST - (E2 - O) ^ 2, or 0 if that's negative
      paddusw_r2r ( mm3, mm7 );     // mm7 = (ST - (E1 - O) ^ 2) + (ST - (E2 - O) ^ 2)

      movq_m2r ( *&qwTemporalTolerance, mm3 );
      movq_m2r ( *YVal4++, mm5 );   // mm5 = Oold
      pand_m2r ( YMask, mm5 );
      psubsw_r2r ( mm4, mm5 );      // mm5 = Oold - O
      psraw_i2r ( 1, mm5 );         // XXX
      pmullw_r2r ( mm5, mm5 );      // mm5 = (Oold - O) ^ 2
      psubusw_r2r ( mm5, mm3 );     /* mm3 = TT - (Oold - O) ^ 2, or 0 if that's negative */
      paddusw_r2r ( mm3, mm7 );     // mm7 = our magic number

      /*
       * Now compare the similarity totals against our threshold.  The pcmpgtw
       * instruction will populate the target register with a bunch of mask bits,
       * filling words where the comparison is true with 1s and ones where it's
       * false with 0s.  A few ANDs and NOTs and an OR later, we have bobbed
       * values for pixels under the similarity threshold and weaved ones for
       * pixels over the threshold.
       */

      pcmpgtw_m2r( *&qwThreshold, mm7 ); // mm7 = 0xffff where we're greater than the threshold, 0 elsewhere
      movq_r2r ( mm7, mm6 );        // mm6 = 0xffff where we're greater than the threshold, 0 elsewhere
      pand_r2r ( mm1, mm7 );        // mm7 = weaved data where we're greater than the threshold, 0 elsewhere
      pandn_r2r ( mm0, mm6 );       // mm6 = bobbed data where we're not greater than the threshold, 0 elsewhere
      por_r2r ( mm6, mm7 );         // mm7 = bobbed and weaved data

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an odd field.
  if (IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
                     pOddLines + (height - 1) * SourcePitch,
                     LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();

#endif

  return 1;
}
376
377
// This is a simple lightweight DeInterlace method that uses little CPU time
// but gives very good results for low or intermediate motion. (MORE CPU THAN BOB)
// It defers frames by one field, but that does not seem to produce noticeable
// lip sync problems.
//
// The method used is to take either the older or newer weave pixel depending
// upon which gives the smaller comb factor, and then clip to avoid large damage
// when wrong.
//
// I'd intended this to be part of a larger more elaborate method added to
// Blended Clip but this gives too good results for the CPU to ignore here.
//
//   pdst   - destination buffer, addressed as lines of `width` bytes
//   psrc   - source frames; psrc[0] is the current frame and psrc[1] the
//            frame used as the "previous" weave candidate
//   width  - bytes per line
//   height - number of lines
//
// Returns 0 without touching pdst when psrc[0] or psrc[1] is NULL or when the
// frame has fewer than two lines, and 1 otherwise.  When neither ARCH_X86 nor
// ARCH_X86_64 is defined the body is compiled out and the function just
// returns 1.
//
// The inner loop stores 8 bytes per step (width / 8 steps), so bytes past the
// last multiple of 8 of a computed line are not written.
static int deinterlace_greedy_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
                                       int width, int height )
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
  int Line;
  int LoopCtr;
  uint64_t *L1;                     // ptr to Line1, of 3
  uint64_t *L2;                     // ptr to Line2, the weave line
  uint64_t *L3;                     // ptr to Line3
  uint64_t *LP2;                    // ptr to prev Line2
  uint64_t *Dest;
  uint8_t *pEvenLines = psrc[0];
  uint8_t *pOddLines = psrc[0] + width;
  uint8_t *pPrevLines;

  /* clears bit 0 of every byte so a word-wise shift cannot leak between bytes */
  static mmx_t ShiftMask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};

  /*
   * How badly do we let it weave? 0-255, replicated into all eight bytes.
   * This used to be filled in by a loop on every call; as the value never
   * changes, initialising it once avoids rewriting a shared static each time
   * the function runs.
   */
  static mmx_t MaxComb = {ub:{15,15,15,15,15,15,15,15}};

  int LineLength = width;
  int SourcePitch = width * 2;      /* distance between lines of one field */
  int IsOdd = 1;                    /* field selection is fixed here */

  if ( psrc[0] == NULL || psrc[1] == NULL )
    return 0;

  /*
   * With fewer than two lines height/2 is 0, and the final "last odd line"
   * copy below would then target pdst + (0 * 2 - 1) * LineLength, i.e. one
   * line BEFORE the destination buffer.  Report failure instead so the
   * caller can fall back to a plain copy.
   */
  if ( height < 2 )
    return 0;

  if (IsOdd)
    pPrevLines = psrc[1] + width;
  else
    pPrevLines = psrc[1];

  // copy first even line no matter what, and the first odd line if we're
  // processing an EVEN field. (note diff from other deint rtns.)
  xine_fast_memcpy(pdst, pEvenLines, LineLength);   //DL0
  if (!IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);   //DL1

  height = height / 2;              /* from here on: lines per field */
  for (Line = 0; Line < height - 1; ++Line)
  {
    LoopCtr = LineLength / 8;       // there are LineLength / 8 qwords per line

    if (IsOdd)
    {
      L1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      L2 = (uint64_t *)(pOddLines + Line * SourcePitch);
      L3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      LP2 = (uint64_t *)(pPrevLines + Line * SourcePitch);   // prev Odd lines
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }
    else
    {
      L1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      L2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      L3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      LP2 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch);   //prev even lines
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }

    /* the line below the one being computed is taken over unchanged */
    xine_fast_memcpy((char *)Dest + LineLength, L3, LineLength);

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that IsOdd is true).  Assume the obvious for even lines..

    while( LoopCtr-- )
    {
      movq_m2r ( *L1++, mm1 );
      movq_m2r ( *L2++, mm2 );
      movq_m2r ( *L3++, mm3 );
      movq_m2r ( *LP2++, mm0 );

      // average L1 and L3 leave result in mm4
      movq_r2r ( mm1, mm4 );        // L1

      pand_m2r ( ShiftMask, mm4 );
      psrlw_i2r ( 01, mm4 );
      movq_r2r ( mm3, mm5 );        // L3
      pand_m2r ( ShiftMask, mm5 );
      psrlw_i2r ( 01, mm5 );
      paddb_r2r ( mm5, mm4 );       // the average, for computing comb

      // get abs value of possible L2 comb
      movq_r2r ( mm2, mm7 );        // L2
      psubusb_r2r ( mm4, mm7 );     // L2 - avg
      movq_r2r ( mm4, mm5 );        // avg
      psubusb_r2r ( mm2, mm5 );     // avg - L2
      por_r2r ( mm7, mm5 );         // abs(avg-L2)
      movq_r2r ( mm4, mm6 );        // copy of avg for later

      // get abs value of possible LP2 comb
      movq_r2r ( mm0, mm7 );        // LP2
      psubusb_r2r ( mm4, mm7 );     // LP2 - avg
      psubusb_r2r ( mm0, mm4 );     // avg - LP2
      por_r2r ( mm7, mm4 );         // abs(avg-LP2)

      // use L2 or LP2 depending upon which makes smaller comb
      psubusb_r2r ( mm5, mm4 );     // see if it goes to zero
      psubusb_r2r ( mm5, mm5 );     // 0
      pcmpeqb_r2r ( mm5, mm4 );     // if (mm4=0) then FF else 0
      pcmpeqb_r2r ( mm4, mm5 );     // opposite of mm4

      // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
      pand_r2r ( mm2, mm5 );        // use L2 if mm5 == ff, else 0
      pand_r2r ( mm0, mm4 );        // use LP2 if mm4 = ff, else 0
      por_r2r ( mm5, mm4 );         // may the best win

      // Now lets clip our chosen value to be not outside of the range
      // of the high/low range L1-L3 by more than MaxComb.
      // This allows some comb but limits the damages and also allows more
      // detail than a boring oversmoothed clip.

      movq_r2r ( mm1, mm2 );        // copy L1
      psubusb_r2r ( mm3, mm2 );     // - L3, with saturation
      paddusb_r2r ( mm3, mm2 );     // now = Max(L1,L3)

      pcmpeqb_r2r ( mm7, mm7 );     // all ffffffff
      psubusb_r2r ( mm1, mm7 );     // - L1
      paddusb_r2r ( mm7, mm3 );     // add, may sat at fff..
      psubusb_r2r ( mm7, mm3 );     // now = Min(L1,L3)

      // allow the value to be above the high or below the low by amt of MaxComb
      paddusb_m2r ( MaxComb, mm2 ); // increase max by diff
      psubusb_m2r ( MaxComb, mm3 ); // lower min by diff

      psubusb_r2r ( mm3, mm4 );     // best - Min
      paddusb_r2r ( mm3, mm4 );     // now = Max(best, lowered Min)

      pcmpeqb_r2r ( mm7, mm7 );     // all ffffffff
      psubusb_r2r ( mm4, mm7 );     // - Max(best, lowered Min)
      paddusb_r2r ( mm7, mm2 );     // add may sat at FFF..
      psubusb_r2r ( mm7, mm2 );     // now = Min(Max(best, lowered Min), raised Max) = best, clipped

      movq_r2m ( mm2, *Dest++ );    // move in our clipped best

    }
  }

  /* Copy last odd line if we're processing an Odd field. */
  if (IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
                     pOddLines + (height - 1) * SourcePitch,
                     LineLength);
  }

  /* clear out the MMX registers ready for doing floating point again */
  emms();

#endif

  return 1;
}
547
/* Use one field to interpolate the other (low cpu utilization)
   Will lose resolution but does not produce weaving effect
   (good for fast moving scenes) also known as "linear interpolation"

   pdst   - destination buffer, addressed as lines of `width` bytes
   psrc   - source frames; only psrc[0] is read here
   width  - bytes per line
   height - number of lines

   Notes:
   - The body exists only when ARCH_X86 or ARCH_X86_64 is defined; in any
     other build the function does nothing.
   - The inner loop stores 8 bytes per step (width >> 3 steps), so bytes past
     the last multiple of 8 of an interpolated line are not written.
   - height is halved with integer division, so with an odd height the final
     line is not written.
*/
static void deinterlace_onefield_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
                                          int width, int height )
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
  int Line;
  uint64_t *YVal1;
  uint64_t *YVal3;
  uint64_t *Dest;
  uint8_t *pEvenLines = psrc[0];
  uint8_t *pOddLines = psrc[0] + width;
  int LineLength = width;
  int SourcePitch = width * 2;      /* distance between lines of one field */
  int IsOdd = 1;                    /* field selection is fixed here */

  int n;

  /* clears bit 0 of every byte so a word-wise shift cannot leak between bytes */
  static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};

  /*
   * The code below unconditionally copies the first two lines, so a frame
   * with fewer than two lines would overrun both buffers.  There is nothing
   * to interpolate in that case: pass through what exists and stop.
   */
  if (height < 2)
  {
    if (height == 1)
      xine_fast_memcpy(pdst, pEvenLines, LineLength);
    return;
  }

  /*
   * copy first even line no matter what, and the first odd line if we're
   * processing an odd field.
   */

  xine_fast_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);

  height = height / 2;              /* from here on: lines per field */
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
      YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
      Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // Copy the odd line to the overlay verbatim.
    xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;            /* whole quadwords per line */
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);     /* line above, same field */
      movq_m2r (*YVal3++, mm2);     /* line below, same field */

      // get average in mm0
      pand_m2r ( Mask, mm0 );
      pand_m2r ( Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      movq_r2m ( mm0, *Dest++ );
    }
  }

  /* Copy last odd line if we're processing an even field. */
  if (! IsOdd)
  {
    xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
                     pOddLines + (height - 1) * SourcePitch,
                     LineLength);
  }

  /* clear out the MMX registers ready for doing floating point
   * again
   */
  emms();
#endif
}
629
/* Linear Blend filter - does a kind of vertical blurring on the image.
   (idea borrowed from mplayer's sources)

   Every output line except the first and the last is a weighted blend of
   the source line and its two vertical neighbours (weights 1:2:1); the
   first and last lines are copied from psrc[0] unchanged.

   pdst   - destination buffer, addressed as lines of `width` bytes
   psrc   - source frames; only psrc[0] is read here
   width  - bytes per line
   height - number of lines

   Notes:
   - The body exists only when ARCH_X86 or ARCH_X86_64 is defined; in any
     other build the function does nothing.
   - The inner loop stores 8 bytes per step (width >> 3 steps), so bytes past
     the last multiple of 8 of a blended line are not written.
*/
static void deinterlace_linearblend_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
                                             int width, int height )
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
  int Line;
  uint64_t *YVal1;
  uint64_t *YVal2;
  uint64_t *YVal3;
  uint64_t *Dest;
  int LineLength = width;

  int n;

  /*
   * The first and the last line are both copied unconditionally below; with
   * fewer than two lines the "last line" copy would read and write line 1,
   * which does not exist.  Pass through what exists and stop.
   */
  if (height < 2)
  {
    if (height == 1)
      xine_fast_memcpy(pdst, psrc[0], LineLength);
    return;
  }

  /* Copy first line */
  xine_fast_memcpy(pdst, psrc[0], LineLength);

  for (Line = 1; Line < height - 1; ++Line)
  {
    YVal1 = (uint64_t *)(psrc[0] + (Line - 1) * LineLength);
    YVal2 = (uint64_t *)(psrc[0] + (Line) * LineLength);
    YVal3 = (uint64_t *)(psrc[0] + (Line + 1) * LineLength);
    Dest = (uint64_t *)(pdst + Line * LineLength);

    n = LineLength >> 3;            /* whole quadwords per line */
    while( n-- )
    {
      /* load data from 3 lines */
      movq_m2r (*YVal1++, mm0);
      movq_m2r (*YVal2++, mm1);
      movq_m2r (*YVal3++, mm2);

      /* expand bytes to words: each source byte lands in the HIGH byte of
       * its word; the shifts below bring it back down */
      punpckhbw_r2r (mm0, mm3);
      punpckhbw_r2r (mm1, mm4);
      punpckhbw_r2r (mm2, mm5);
      punpcklbw_r2r (mm0, mm0);
      punpcklbw_r2r (mm1, mm1);
      punpcklbw_r2r (mm2, mm2);

      /*
       * deinterlacing:
       * deint_line = (line0 + 2*line1 + line2) / 4
       */
      psrlw_i2r (07, mm0);
      psrlw_i2r (06, mm1);
      psrlw_i2r (07, mm2);
      psrlw_i2r (07, mm3);
      psrlw_i2r (06, mm4);
      psrlw_i2r (07, mm5);
      paddw_r2r (mm1, mm0);
      paddw_r2r (mm2, mm0);
      paddw_r2r (mm4, mm3);
      paddw_r2r (mm5, mm3);
      psrlw_i2r (03, mm0);
      psrlw_i2r (03, mm3);

      /* pack 8 words to 8 bytes in mm0 */
      packuswb_r2r (mm3, mm0);

      movq_r2m ( mm0, *Dest++ );
    }
  }

  /* Copy last line (Line == height - 1 once the loop has finished) */
  xine_fast_memcpy(pdst + Line * LineLength,
                   psrc[0] + Line * LineLength, LineLength);

  /* clear out the MMX registers ready for doing floating point
   * again
   */
  emms();
#endif
}
706
/* Linear Blend filter - C version contributed by Rogerio Brito.
   This algorithm has the same interface as the other functions.

   The destination "screen" (pdst) is constructed from the source
   screen (psrc[0]) line by line.

   The i-th line of the destination screen is the average of 3 lines
   from the source screen: the (i-1)-th, i-th and (i+1)-th lines, with
   the i-th line having weight 2 in the computation.

   pdst   - destination buffer, height lines of width bytes
   psrc   - source frames; only psrc[0] is read here
   width  - bytes per line
   height - number of lines

   Remarks:
   * each line on pdst doesn't depend on previous lines;
   * due to the way the algorithm is defined, the first & last lines of the
     screen aren't deinterlaced: they are copied from the source unchanged;
   * nothing is written when width or height is not positive, and a
     one-line frame is simply copied.
*/
static void deinterlace_linearblend_yuv( uint8_t *pdst, uint8_t *psrc[],
                                         int width, int height )
{
  const uint8_t *above;   /* source line y-1 */
  const uint8_t *cur;     /* source line y   */
  const uint8_t *below;   /* source line y+1 */
  uint8_t *out;           /* destination line y */
  int x, y;

  if (width <= 0 || height <= 0)
    return;

  /* Copy the first line (memcpy is what xine_fast_memcpy maps to in this file) */
  memcpy(pdst, psrc[0], (size_t)width);
  if (height == 1)
    return;

  above = psrc[0];
  cur = above + width;
  out = pdst + width;

  for (y = 1; y < height - 1; ++y) {
    below = cur + width;

    /* computes avg of: above + 2*cur + below */
    for (x = 0; x < width; ++x) {
      out[x] = (uint8_t)((above[x] + (cur[x] << 1) + below[x]) >> 2);
    }

    /* slide the three-line window down by one line */
    above = cur;
    cur = below;
    out += width;
  }

  /*
   * Copy the last line.  At this point `cur` addresses source line
   * height-1 and `out` destination line height-1.  (The previous code
   * copied the line before it, so the bottom line of the picture was a
   * duplicate of source line height-2.)
   */
  memcpy(out, cur, (size_t)width);
}
753
/*
 * Tell whether the MMX code paths may be used.
 *
 * Returns 1 when the flags reported by xine_mm_accel() include
 * MM_ACCEL_X86_MMX, 0 otherwise.  In builds where neither ARCH_X86 nor
 * ARCH_X86_64 is defined the answer is always 0.
 */
static int check_for_mmx(void)
{
#if defined(ARCH_X86) || defined(ARCH_X86_64)
  /* -1 marks "not probed yet"; the probe result is kept for later calls */
  static int accel = -1;

  if ( accel == -1 )
    accel = xine_mm_accel();

  return ( accel & MM_ACCEL_X86_MMX ) ? 1 : 0;
#else
  return 0;
#endif
}
768
769 /* generic YUV deinterlacer
770 pdst -> pointer to destination bitmap
771 psrc -> array of pointers to source bitmaps ([0] = most recent)
772 width,height -> dimension for bitmaps
773 method -> DEINTERLACE_xxx
774 */
775
776 void deinterlace_yuv( uint8_t *pdst, uint8_t *psrc[],
777 int width, int height, int method )
778 {
779 switch( method ) {
780 case DEINTERLACE_NONE:
781 xine_fast_memcpy(pdst,psrc[0],width*height);
782 break;
783 case DEINTERLACE_BOB:
784 if( check_for_mmx() )
785 deinterlace_bob_yuv_mmx(pdst,psrc,width,height);
786 else /* FIXME: provide an alternative? */
787 xine_fast_memcpy(pdst,psrc[0],width*height);
788 break;
789 case DEINTERLACE_WEAVE:
790 if( check_for_mmx() )
791 {
792 if( !deinterlace_weave_yuv_mmx(pdst,psrc,width,height) )
793 xine_fast_memcpy(pdst,psrc[0],width*height);
794 }
795 else /* FIXME: provide an alternative? */
796 xine_fast_memcpy(pdst,psrc[0],width*height);
797 break;
798 case DEINTERLACE_GREEDY:
799 if( check_for_mmx() )
800 {
801 if( !deinterlace_greedy_yuv_mmx(pdst,psrc,width,height) )
802 xine_fast_memcpy(pdst,psrc[0],width*height);
803 }
804 else /* FIXME: provide an alternative? */
805 xine_fast_memcpy(pdst,psrc[0],width*height);
806 break;
807 case DEINTERLACE_ONEFIELD:
808 if( check_for_mmx() )
809 deinterlace_onefield_yuv_mmx(pdst,psrc,width,height);
810 else /* FIXME: provide an alternative? */
811 xine_fast_memcpy(pdst,psrc[0],width*height);
812 break;
813 case DEINTERLACE_ONEFIELDXV:
814 lprintf("ONEFIELDXV must be handled by the video driver.\n");
815 break;
816 case DEINTERLACE_LINEARBLEND:
817 if( check_for_mmx() )
818 deinterlace_linearblend_yuv_mmx(pdst,psrc,width,height);
819 else
820 deinterlace_linearblend_yuv(pdst,psrc,width,height);
821 break;
822 default:
823 lprintf("unknow method %d.\n",method);
824 break;
825 }
826 }
827
828 int deinterlace_yuv_supported ( int method )
829 {
830 switch( method ) {
831 case DEINTERLACE_NONE:
832 return 1;
833 case DEINTERLACE_BOB:
834 case DEINTERLACE_WEAVE:
835 case DEINTERLACE_GREEDY:
836 case DEINTERLACE_ONEFIELD:
837 return check_for_mmx();
838 case DEINTERLACE_ONEFIELDXV:
839 lprintf ("ONEFIELDXV must be handled by the video driver.\n");
840 return 0;
841 case DEINTERLACE_LINEARBLEND:
842 return 1;
843 }
844
845 return 0;
846 }
847
/*
 * Names of the deinterlace methods, terminated by a NULL entry.
 * NOTE(review): the order looks like it mirrors the DEINTERLACE_xxx
 * constants - confirm against deinterlace.h before reordering.
 */
char *deinterlace_methods[] = {
  "none", "bob", "weave", "greedy",
  "onefield", "onefield_xv", "linearblend",
  NULL   /* end of list */
};
858
859