research.m1stereo.tv/gitweb/ - melted/blob - src/modules/xine/deinterlace.c

   1  /*
   2  * Copyright (C) 2001 the xine project
   3  *
   4  * This file is part of xine, a free video player.
   5  *
   6  * xine is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * xine is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  19  *
  20  * Deinterlace routines by Miguel Freitas
  21  * based on DScaler project sources (deinterlace.sourceforge.net)
  22  *
  23  * Currently only available for Xv driver and MMX extensions
  24  *
  25  * small todo list:
  26  * - implement non-MMX versions for all methods
  27  * - support MMX2 instructions
  28  * - move some generic code from xv driver to this file
  29  * - make it also work for yuy2 frames
  30  *
  31  */
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include "deinterlace.h"
  36 #include "xineutils.h"
  37
  38 #define xine_fast_memcpy memcpy
  39
  40 /*
  41    DeinterlaceFieldBob algorithm
  42    Based on Virtual Dub plugin by Gunnar Thalin
  43    MMX asm version from dscaler project (deinterlace.sourceforge.net)
  44    Linux version for Xine player by Miguel Freitas
  45 */
  46 static void deinterlace_bob_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
  47     int width, int height )
  48 {
  49 #if defined(ARCH_X86) || defined(ARCH_X86_64)
  50   int Line;
  51   uint64_t *YVal1;
  52   uint64_t *YVal2;
  53   uint64_t *YVal3;
  54   uint64_t *Dest;
  55   uint8_t* pEvenLines = psrc[0];
  56   uint8_t* pOddLines = psrc[0]+width;
  57   int LineLength = width;
  58   int SourcePitch = width * 2;
  59   int IsOdd = 1;
  60   long EdgeDetect = 625;
  61   long JaggieThreshold = 73;
  62
  63   int n;
  64
  65   uint64_t qwEdgeDetect;
  66   uint64_t qwThreshold;
  67
  68   static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
  69   static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
  70
  71   qwEdgeDetect = EdgeDetect;
  72   qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  73   qwThreshold = JaggieThreshold;
  74   qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);
  75
  76
  77   // copy first even line no matter what, and the first odd line if we're
  78   // processing an odd field.
  79   xine_fast_memcpy(pdst, pEvenLines, LineLength);
  80   if (IsOdd)
  81     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);
  82
  83   height = height / 2;
  84   for (Line = 0; Line < height - 1; ++Line)
  85   {
  86     if (IsOdd)
  87     {
  88       YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
  89       YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
  90       YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
  91       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
  92     }
  93     else
  94     {
  95       YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
  96       YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
  97       YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
  98       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
  99     }
 100
 101     // For ease of reading, the comments below assume that we're operating on an odd
 102     // field (i.e., that bIsOdd is true).  The exact same processing is done when we
 103     // operate on an even field, but the roles of the odd and even fields are reversed.
 104     // It's just too cumbersome to explain the algorithm in terms of "the next odd
 105     // line if we're doing an odd field, or the next even line if we're doing an
 106     // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
 107     // half the time this function is called, those words' meanings will invert.
 108
 109     // Copy the odd line to the overlay verbatim.
 110     xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);
 111
 112     n = LineLength >> 3;
 113     while( n-- )
 114     {
 115       movq_m2r (*YVal1++, mm0);
 116       movq_m2r (*YVal2++, mm1);
 117       movq_m2r (*YVal3++, mm2);
 118
 119       // get intensities in mm3 - 4
 120       movq_r2r ( mm0, mm3 );
 121       pand_m2r ( YMask, mm3 );
 122       movq_r2r ( mm1, mm4 );
 123       pand_m2r ( YMask, mm4 );
 124       movq_r2r ( mm2, mm5 );
 125       pand_m2r ( YMask, mm5 );
 126
 127       // get average in mm0
 128       pand_m2r ( Mask, mm0 );
 129       pand_m2r ( Mask, mm2 );
 130       psrlw_i2r ( 01, mm0 );
 131       psrlw_i2r ( 01, mm2 );
 132       paddw_r2r ( mm2, mm0 );
 133
 134       // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
 135       // result will be in mm6
 136
 137       psrlw_i2r ( 01, mm3 );
 138       psrlw_i2r ( 01, mm4 );
 139       psrlw_i2r ( 01, mm5 );
 140
 141       movq_r2r ( mm3, mm6 );
 142       psubw_r2r ( mm4, mm6 );   //mm6 = O1 - E
 143
 144       movq_r2r ( mm5, mm7 );
 145       psubw_r2r ( mm4, mm7 );   //mm7 = O2 - E
 146
 147       pmullw_r2r ( mm7, mm6 );          // mm6 = (O1 - E) * (O2 - E)
 148
 149       movq_r2r ( mm3, mm7 );
 150       psubw_r2r ( mm5, mm7 );           // mm7 = (O1 - O2)
 151       pmullw_r2r ( mm7, mm7 );  // mm7 = (O1 - O2) ^ 2
 152       psrlw_i2r ( 12, mm7 );            // mm7 = (O1 - O2) ^ 2 >> 12
 153       pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7  = EdgeDetect * (O1 - O2) ^ 2 >> 12
 154
 155       psubw_r2r ( mm7, mm6 );      // mm6 is what we want
 156
 157       pcmpgtw_m2r ( *&qwThreshold, mm6 );
 158
 159       movq_r2r ( mm6, mm7 );
 160
 161       pand_r2r ( mm6, mm0 );
 162
 163       pandn_r2r ( mm1, mm7 );
 164
 165       por_r2r ( mm0, mm7 );
 166
 167       movq_r2m ( mm7, *Dest++ );
 168     }
 169   }
 170
 171   // Copy last odd line if we're processing an even field.
 172   if (! IsOdd)
 173   {
 174     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 175                       pOddLines + (height - 1) * SourcePitch,
 176                       LineLength);
 177   }
 178
 179   // clear out the MMX registers ready for doing floating point
 180   // again
 181   emms();
 182 #endif
 183 }
 184
 185 /* Deinterlace the latest field, with a tendency to weave rather than bob.
 186    Good for high detail on low-movement scenes.
 187    Seems to produce bad output in general case, need to check if this
 188    is normal or if the code is broken.
 189 */
 190 static int deinterlace_weave_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 191     int width, int height )
 192 {
 193 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 194
 195   int Line;
 196   uint64_t *YVal1;
 197   uint64_t *YVal2;
 198   uint64_t *YVal3;
 199   uint64_t *YVal4;
 200   uint64_t *Dest;
 201   uint8_t* pEvenLines = psrc[0];
 202   uint8_t* pOddLines = psrc[0]+width;
 203   uint8_t* pPrevLines;
 204
 205   int LineLength = width;
 206   int SourcePitch = width * 2;
 207   int IsOdd = 1;
 208
 209   long TemporalTolerance = 300;
 210   long SpatialTolerance = 600;
 211   long SimilarityThreshold = 25;
 212
 213   int n;
 214
 215   uint64_t qwSpatialTolerance;
 216   uint64_t qwTemporalTolerance;
 217   uint64_t qwThreshold;
 218
 219   static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
 220   static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
 221
 222
 223   // Make sure we have all the data we need.
 224   if ( psrc[0] == NULL || psrc[1] == NULL )
 225     return 0;
 226
 227   if (IsOdd)
 228     pPrevLines = psrc[1] + width;
 229   else
 230     pPrevLines = psrc[1];
 231
 232   // Since the code uses MMX to process 4 pixels at a time, we need our constants
 233   // to be represented 4 times per quadword.
 234   qwSpatialTolerance = SpatialTolerance;
 235   qwSpatialTolerance += (qwSpatialTolerance << 48) + (qwSpatialTolerance << 32) + (qwSpatialTolerance << 16);
 236   qwTemporalTolerance = TemporalTolerance;
 237   qwTemporalTolerance += (qwTemporalTolerance << 48) + (qwTemporalTolerance << 32) + (qwTemporalTolerance << 16);
 238   qwThreshold = SimilarityThreshold;
 239   qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);
 240
 241   // copy first even line no matter what, and the first odd line if we're
 242   // processing an even field.
 243   xine_fast_memcpy(pdst, pEvenLines, LineLength);
 244   if (!IsOdd)
 245     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);
 246
 247   height = height / 2;
 248   for (Line = 0; Line < height - 1; ++Line)
 249   {
 250     if (IsOdd)
 251     {
 252       YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
 253       YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
 254       YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 255       YVal4 = (uint64_t *)(pPrevLines + Line * SourcePitch);
 256       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
 257     }
 258     else
 259     {
 260       YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
 261       YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 262       YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
 263       YVal4 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch);
 264       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
 265     }
 266
 267     // For ease of reading, the comments below assume that we're operating on an odd
 268     // field (i.e., that bIsOdd is true).  The exact same processing is done when we
 269     // operate on an even field, but the roles of the odd and even fields are reversed.
 270     // It's just too cumbersome to explain the algorithm in terms of "the next odd
 271     // line if we're doing an odd field, or the next even line if we're doing an
 272     // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
 273     // half the time this function is called, those words' meanings will invert.
 274
 275     // Copy the even scanline below this one to the overlay buffer, since we'll be
 276     // adapting the current scanline to the even lines surrounding it.  The scanline
 277     // above has already been copied by the previous pass through the loop.
 278     xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);
 279
 280     n = LineLength >> 3;
 281     while( n-- )
 282     {
 283       movq_m2r ( *YVal1++, mm0 );    // mm0 = E1
 284       movq_m2r ( *YVal2++, mm1 );    // mm1 = O
 285       movq_m2r ( *YVal3++, mm2 );    // mm2 = E2
 286
 287       movq_r2r ( mm0, mm3 );       // mm3 = intensity(E1)
 288       movq_r2r ( mm1, mm4 );       // mm4 = intensity(O)
 289       movq_r2r ( mm2, mm6 );       // mm6 = intensity(E2)
 290
 291       pand_m2r ( YMask, mm3 );
 292       pand_m2r ( YMask, mm4 );
 293       pand_m2r ( YMask, mm6 );
 294
 295       // Average E1 and E2 for interpolated bobbing.
 296       // leave result in mm0
 297       pand_m2r ( Mask, mm0 ); // mm0 = E1 with lower chroma bit stripped off
 298       pand_m2r ( Mask, mm2 ); // mm2 = E2 with lower chroma bit stripped off
 299       psrlw_i2r ( 01, mm0 );    // mm0 = E1 / 2
 300       psrlw_i2r ( 01, mm2 );    // mm2 = E2 / 2
 301       paddb_r2r ( mm2, mm0 );
 302
 303       // The meat of the work is done here.  We want to see whether this pixel is
 304       // close in luminosity to ANY of: its top neighbor, its bottom neighbor,
 305       // or its predecessor.  To do this without branching, we use MMX's
 306       // saturation feature, which gives us Z(x) = x if x>=0, or 0 if x<0.
 307       //
 308       // The formula we're computing here is
 309       //                Z(ST - (E1 - O) ^ 2) + Z(ST - (E2 - O) ^ 2) + Z(TT - (Oold - O) ^ 2)
 310       // where ST is spatial tolerance and TT is temporal tolerance.  The idea
 311       // is that if a pixel is similar to none of its neighbors, the resulting
 312       // value will be pretty low, probably zero.  A high value therefore indicates
 313       // that the pixel had a similar neighbor.  The pixel in the same position
 314       // in the field before last (Oold) is considered a neighbor since we want
 315       // to be able to display 1-pixel-high horizontal lines.
 316
 317       movq_m2r ( *&qwSpatialTolerance, mm7 );
 318       movq_r2r ( mm3, mm5 );     // mm5 = E1
 319       psubsw_r2r ( mm4, mm5 );   // mm5 = E1 - O
 320       psraw_i2r ( 1, mm5 );
 321       pmullw_r2r ( mm5, mm5 );   // mm5 = (E1 - O) ^ 2
 322       psubusw_r2r ( mm5, mm7 );  // mm7 = ST - (E1 - O) ^ 2, or 0 if that's negative
 323
 324       movq_m2r ( *&qwSpatialTolerance, mm3 );
 325       movq_r2r ( mm6, mm5 );    // mm5 = E2
 326       psubsw_r2r ( mm4, mm5 );  // mm5 = E2 - O
 327       psraw_i2r ( 1, mm5 );
 328       pmullw_r2r ( mm5, mm5 );  // mm5 = (E2 - O) ^ 2
 329       psubusw_r2r ( mm5, mm3 ); // mm0 = ST - (E2 - O) ^ 2, or 0 if that's negative
 330       paddusw_r2r ( mm3, mm7 ); // mm7 = (ST - (E1 - O) ^ 2) + (ST - (E2 - O) ^ 2)
 331
 332       movq_m2r ( *&qwTemporalTolerance, mm3 );
 333       movq_m2r ( *YVal4++, mm5 ); // mm5 = Oold
 334       pand_m2r ( YMask, mm5 );
 335       psubsw_r2r ( mm4, mm5 );  // mm5 = Oold - O
 336       psraw_i2r ( 1, mm5 ); // XXX
 337       pmullw_r2r ( mm5, mm5 );  // mm5 = (Oold - O) ^ 2
 338       psubusw_r2r ( mm5, mm3 ); /* mm0 = TT - (Oold - O) ^ 2, or 0 if that's negative */
 339       paddusw_r2r ( mm3, mm7 ); // mm7 = our magic number
 340
 341       /*
 342        * Now compare the similarity totals against our threshold.  The pcmpgtw
 343        * instruction will populate the target register with a bunch of mask bits,
 344        * filling words where the comparison is true with 1s and ones where it's
 345        * false with 0s.  A few ANDs and NOTs and an OR later, we have bobbed
 346        * values for pixels under the similarity threshold and weaved ones for
 347        * pixels over the threshold.
 348        */
 349
 350       pcmpgtw_m2r( *&qwThreshold, mm7 ); // mm7 = 0xffff where we're greater than the threshold, 0 elsewhere
 351       movq_r2r ( mm7, mm6 );  // mm6 = 0xffff where we're greater than the threshold, 0 elsewhere
 352       pand_r2r ( mm1, mm7 );  // mm7 = weaved data where we're greater than the threshold, 0 elsewhere
 353       pandn_r2r ( mm0, mm6 ); // mm6 = bobbed data where we're not greater than the threshold, 0 elsewhere
 354       por_r2r ( mm6, mm7 );   // mm7 = bobbed and weaved data
 355
 356       movq_r2m ( mm7, *Dest++ );
 357     }
 358   }
 359
 360   // Copy last odd line if we're processing an odd field.
 361   if (IsOdd)
 362   {
 363     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 364                       pOddLines + (height - 1) * SourcePitch,
 365                       LineLength);
 366   }
 367
 368   // clear out the MMX registers ready for doing floating point
 369   // again
 370   emms();
 371
 372 #endif
 373
 374   return 1;
 375 }
 376
 377
 378 // This is a simple lightweight DeInterlace method that uses little CPU time
 379 // but gives very good results for low or intermedite motion. (MORE CPU THAN BOB)
 380 // It defers frames by one field, but that does not seem to produce noticeable
 381 // lip sync problems.
 382 //
 383 // The method used is to take either the older or newer weave pixel depending
 384 // upon which give the smaller comb factor, and then clip to avoid large damage
 385 // when wrong.
 386 //
 387 // I'd intended this to be part of a larger more elaborate method added to
 388 // Blended Clip but this give too good results for the CPU to ignore here.
 389 static int deinterlace_greedy_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 390     int width, int height )
 391 {
 392 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 393   int Line;
 394   int   LoopCtr;
 395   uint64_t *L1;                                 // ptr to Line1, of 3
 396   uint64_t *L2;                                 // ptr to Line2, the weave line
 397   uint64_t *L3;                                 // ptr to Line3
 398   uint64_t *LP2;                                        // ptr to prev Line2
 399   uint64_t *Dest;
 400   uint8_t* pEvenLines = psrc[0];
 401   uint8_t* pOddLines = psrc[0]+width;
 402   uint8_t* pPrevLines;
 403
 404   static mmx_t ShiftMask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
 405
 406   int LineLength = width;
 407   int SourcePitch = width * 2;
 408   int IsOdd = 1;
 409   long GreedyMaxComb = 15;
 410   static mmx_t MaxComb;
 411   int i;
 412
 413   if ( psrc[0] == NULL || psrc[1] == NULL )
 414     return 0;
 415
 416   if (IsOdd)
 417     pPrevLines = psrc[1] + width;
 418   else
 419     pPrevLines = psrc[1];
 420
 421
 422   for( i = 0; i < 8; i++ )
 423     MaxComb.ub[i] = GreedyMaxComb; // How badly do we let it weave? 0-255
 424
 425
 426   // copy first even line no matter what, and the first odd line if we're
 427   // processing an EVEN field. (note diff from other deint rtns.)
 428   xine_fast_memcpy(pdst, pEvenLines, LineLength); //DL0
 429   if (!IsOdd)
 430     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength); //DL1
 431
 432   height = height / 2;
 433   for (Line = 0; Line < height - 1; ++Line)
 434   {
 435     LoopCtr = LineLength / 8;                           // there are LineLength / 8 qwords per line
 436
 437     if (IsOdd)
 438     {
 439       L1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
 440       L2 = (uint64_t *)(pOddLines + Line * SourcePitch);
 441       L3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 442       LP2 = (uint64_t *)(pPrevLines + Line * SourcePitch); // prev Odd lines
 443       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
 444     }
 445     else
 446     {
 447       L1 = (uint64_t *)(pOddLines + Line * SourcePitch);
 448       L2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 449       L3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
 450       LP2 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch); //prev even lines
 451       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
 452     }
 453
 454     xine_fast_memcpy((char *)Dest + LineLength, L3, LineLength);
 455
 456 // For ease of reading, the comments below assume that we're operating on an odd
 457 // field (i.e., that info->IsOdd is true).  Assume the obvious for even lines..
 458
 459     while( LoopCtr-- )
 460     {
 461       movq_m2r ( *L1++, mm1 );
 462       movq_m2r ( *L2++, mm2 );
 463       movq_m2r ( *L3++, mm3 );
 464       movq_m2r ( *LP2++, mm0 );
 465
 466       // average L1 and L3 leave result in mm4
 467       movq_r2r ( mm1, mm4 );    // L1
 468
 469       pand_m2r ( ShiftMask, mm4 );
 470       psrlw_i2r ( 01, mm4 );
 471       movq_r2r ( mm3, mm5 );  // L3
 472       pand_m2r ( ShiftMask, mm5 );
 473       psrlw_i2r ( 01, mm5 );
 474       paddb_r2r ( mm5, mm4 );  // the average, for computing comb
 475
 476       // get abs value of possible L2 comb
 477       movq_r2r  ( mm2, mm7 );                           // L2
 478       psubusb_r2r ( mm4, mm7 );                         // L2 - avg
 479       movq_r2r ( mm4, mm5 );                            // avg
 480       psubusb_r2r ( mm2, mm5 );                         // avg - L2
 481       por_r2r ( mm7, mm5 );                             // abs(avg-L2)
 482       movq_r2r ( mm4, mm6 );     // copy of avg for later
 483
 484       // get abs value of possible LP2 comb
 485       movq_r2r ( mm0, mm7 );                            // LP2
 486       psubusb_r2r ( mm4, mm7 );                         // LP2 - avg
 487       psubusb_r2r ( mm0, mm4 );                         // avg - LP2
 488       por_r2r ( mm7, mm4 );                             // abs(avg-LP2)
 489
 490       // use L2 or LP2 depending upon which makes smaller comb
 491       psubusb_r2r ( mm5, mm4 );                         // see if it goes to zero
 492       psubusb_r2r ( mm5, mm5 );                         // 0
 493       pcmpeqb_r2r ( mm5, mm4 );                         // if (mm4=0) then FF else 0
 494       pcmpeqb_r2r ( mm4, mm5 );                         // opposite of mm4
 495
 496       // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
 497       pand_r2r ( mm2, mm5 );                            // use L2 if mm5 == ff, else 0
 498       pand_r2r ( mm0, mm4 );                            // use LP2 if mm4 = ff, else 0
 499       por_r2r ( mm5, mm4 );                             // may the best win
 500
 501       // Now lets clip our chosen value to be not outside of the range
 502       // of the high/low range L1-L3 by more than abs(L1-L3)
 503       // This allows some comb but limits the damages and also allows more
 504       // detail than a boring oversmoothed clip.
 505
 506       movq_r2r ( mm1, mm2 );                            // copy L1
 507       psubusb_r2r ( mm3, mm2 );                         // - L3, with saturation
 508       paddusb_r2r ( mm3, mm2 );                // now = Max(L1,L3)
 509
 510       pcmpeqb_r2r ( mm7, mm7 );                         // all ffffffff
 511       psubusb_r2r ( mm1, mm7 );                         // - L1
 512       paddusb_r2r ( mm7, mm3 );                         // add, may sat at fff..
 513       psubusb_r2r ( mm7, mm3 );                         // now = Min(L1,L3)
 514
 515       // allow the value to be above the high or below the low by amt of MaxComb
 516       paddusb_m2r ( MaxComb, mm2 );                     // increase max by diff
 517       psubusb_m2r ( MaxComb, mm3 );                     // lower min by diff
 518
 519       psubusb_r2r ( mm3, mm4 );                         // best - Min
 520       paddusb_r2r ( mm3, mm4 );                         // now = Max(best,Min(L1,L3)
 521
 522       pcmpeqb_r2r ( mm7, mm7 );                         // all ffffffff
 523       psubusb_r2r ( mm4, mm7 );                         // - Max(best,Min(best,L3)
 524       paddusb_r2r ( mm7, mm2 );                         // add may sat at FFF..
 525       psubusb_r2r ( mm7, mm2 );                         // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
 526
 527       movq_r2m ( mm2, *Dest++ );        // move in our clipped best
 528
 529     }
 530   }
 531
 532   /* Copy last odd line if we're processing an Odd field. */
 533   if (IsOdd)
 534   {
 535     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 536                       pOddLines + (height - 1) * SourcePitch,
 537                       LineLength);
 538   }
 539
 540   /* clear out the MMX registers ready for doing floating point again */
 541   emms();
 542
 543 #endif
 544
 545   return 1;
 546 }
 547
 548 /* Use one field to interpolate the other (low cpu utilization)
 549    Will lose resolution but does not produce weaving effect
 550    (good for fast moving scenes) also know as "linear interpolation"
 551 */
 552 static void deinterlace_onefield_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 553     int width, int height )
 554 {
 555 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 556   int Line;
 557   uint64_t *YVal1;
 558   uint64_t *YVal3;
 559   uint64_t *Dest;
 560   uint8_t* pEvenLines = psrc[0];
 561   uint8_t* pOddLines = psrc[0]+width;
 562   int LineLength = width;
 563   int SourcePitch = width * 2;
 564   int IsOdd = 1;
 565
 566   int n;
 567
 568   static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
 569
 570   /*
 571    * copy first even line no matter what, and the first odd line if we're
 572    * processing an odd field.
 573    */
 574
 575   xine_fast_memcpy(pdst, pEvenLines, LineLength);
 576   if (IsOdd)
 577     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);
 578
 579   height = height / 2;
 580   for (Line = 0; Line < height - 1; ++Line)
 581   {
 582     if (IsOdd)
 583     {
 584       YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
 585       YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
 586       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
 587     }
 588     else
 589     {
 590       YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
 591       YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 592       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
 593     }
 594
 595     // Copy the odd line to the overlay verbatim.
 596     xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);
 597
 598     n = LineLength >> 3;
 599     while( n-- )
 600     {
 601       movq_m2r (*YVal1++, mm0);
 602       movq_m2r (*YVal3++, mm2);
 603
 604       // get average in mm0
 605       pand_m2r ( Mask, mm0 );
 606       pand_m2r ( Mask, mm2 );
 607       psrlw_i2r ( 01, mm0 );
 608       psrlw_i2r ( 01, mm2 );
 609       paddw_r2r ( mm2, mm0 );
 610
 611       movq_r2m ( mm0, *Dest++ );
 612     }
 613   }
 614
 615   /* Copy last odd line if we're processing an even field. */
 616   if (! IsOdd)
 617   {
 618     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 619                       pOddLines + (height - 1) * SourcePitch,
 620                       LineLength);
 621   }
 622
 623   /* clear out the MMX registers ready for doing floating point
 624    * again
 625    */
 626   emms();
 627 #endif
 628 }
 629
 630 /* Linear Blend filter - does a kind of vertical blurring on the image.
 631    (idea borrowed from mplayer's sources)
 632 */
 633 static void deinterlace_linearblend_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 634     int width, int height )
 635 {
 636 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 637   int Line;
 638   uint64_t *YVal1;
 639   uint64_t *YVal2;
 640   uint64_t *YVal3;
 641   uint64_t *Dest;
 642   int LineLength = width;
 643
 644   int n;
 645
 646   /* Copy first line */
 647   xine_fast_memcpy(pdst, psrc[0], LineLength);
 648
 649   for (Line = 1; Line < height - 1; ++Line)
 650   {
 651     YVal1 = (uint64_t *)(psrc[0] + (Line - 1) * LineLength);
 652     YVal2 = (uint64_t *)(psrc[0] + (Line) * LineLength);
 653     YVal3 = (uint64_t *)(psrc[0] + (Line + 1) * LineLength);
 654     Dest = (uint64_t *)(pdst + Line * LineLength);
 655
 656     n = LineLength >> 3;
 657     while( n-- )
 658     {
 659       /* load data from 3 lines */
 660       movq_m2r (*YVal1++, mm0);
 661       movq_m2r (*YVal2++, mm1);
 662       movq_m2r (*YVal3++, mm2);
 663
 664       /* expand bytes to words */
 665       punpckhbw_r2r (mm0, mm3);
 666       punpckhbw_r2r (mm1, mm4);
 667       punpckhbw_r2r (mm2, mm5);
 668       punpcklbw_r2r (mm0, mm0);
 669       punpcklbw_r2r (mm1, mm1);
 670       punpcklbw_r2r (mm2, mm2);
 671
 672       /*
 673        * deinterlacing:
 674        * deint_line = (line0 + 2*line1 + line2) / 4
 675        */
 676       psrlw_i2r (07, mm0);
 677       psrlw_i2r (06, mm1);
 678       psrlw_i2r (07, mm2);
 679       psrlw_i2r (07, mm3);
 680       psrlw_i2r (06, mm4);
 681       psrlw_i2r (07, mm5);
 682       paddw_r2r (mm1, mm0);
 683       paddw_r2r (mm2, mm0);
 684       paddw_r2r (mm4, mm3);
 685       paddw_r2r (mm5, mm3);
 686       psrlw_i2r (03, mm0);
 687       psrlw_i2r (03, mm3);
 688
 689       /* pack 8 words to 8 bytes in mm0 */
 690       packuswb_r2r (mm3, mm0);
 691
 692       movq_r2m ( mm0, *Dest++ );
 693     }
 694   }
 695
 696   /* Copy last line */
 697   xine_fast_memcpy(pdst + Line * LineLength,
 698                    psrc[0] + Line * LineLength, LineLength);
 699
 700   /* clear out the MMX registers ready for doing floating point
 701    * again
 702    */
 703   emms();
 704 #endif
 705 }
 706
 707 /* Linear Blend filter - C version contributed by Rogerio Brito.
 708    This algorithm has the same interface as the other functions.
 709
 710    The destination "screen" (pdst) is constructed from the source
 711    screen (psrc[0]) line by line.
 712
 713    The i-th line of the destination screen is the average of 3 lines
 714    from the source screen: the (i-1)-th, i-th and (i+1)-th lines, with
 715    the i-th line having weight 2 in the computation.
 716
 717    Remarks:
 718    * each line on pdst doesn't depend on previous lines;
 719    * due to the way the algorithm is defined, the first & last lines of the
 720      screen aren't deinterlaced.
 721
 722 */
 723 static void deinterlace_linearblend_yuv( uint8_t *pdst, uint8_t *psrc[],
 724                                          int width, int height )
 725 {
 726   register int x, y;
 727   register uint8_t *l0, *l1, *l2, *l3;
 728
 729   l0 = pdst;            /* target line */
 730   l1 = psrc[0];         /* 1st source line */
 731   l2 = l1 + width;      /* 2nd source line = line that follows l1 */
 732   l3 = l2 + width;      /* 3rd source line = line that follows l2 */
 733
 734   /* Copy the first line */
 735   xine_fast_memcpy(l0, l1, width);
 736   l0 += width;
 737
 738   for (y = 1; y < height-1; ++y) {
 739     /* computes avg of: l1 + 2*l2 + l3 */
 740
 741     for (x = 0; x < width; ++x) {
 742       l0[x] = (l1[x] + (l2[x]<<1) + l3[x]) >> 2;
 743     }
 744
 745     /* updates the line pointers */
 746     l1 = l2; l2 = l3; l3 += width;
 747     l0 += width;
 748   }
 749
 750   /* Copy the last line */
 751   xine_fast_memcpy(l0, l1, width);
 752 }
 753
 754 static int check_for_mmx(void)
 755 {
 756 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 757 static int config_flags = -1;
 758
 759   if ( config_flags == -1 )
 760     config_flags = xine_mm_accel();
 761   if (config_flags & MM_ACCEL_X86_MMX)
 762     return 1;
 763   return 0;
 764 #else
 765   return 0;
 766 #endif
 767 }
 768
 769 /* generic YUV deinterlacer
 770    pdst -> pointer to destination bitmap
 771    psrc -> array of pointers to source bitmaps ([0] = most recent)
 772    width,height -> dimension for bitmaps
 773    method -> DEINTERLACE_xxx
 774 */
 775
 776 void deinterlace_yuv( uint8_t *pdst, uint8_t *psrc[],
 777     int width, int height, int method )
 778 {
 779   switch( method ) {
 780     case DEINTERLACE_NONE:
 781       xine_fast_memcpy(pdst,psrc[0],width*height);
 782       break;
 783     case DEINTERLACE_BOB:
 784       if( check_for_mmx() )
 785         deinterlace_bob_yuv_mmx(pdst,psrc,width,height);
 786       else /* FIXME: provide an alternative? */
 787         xine_fast_memcpy(pdst,psrc[0],width*height);
 788       break;
 789     case DEINTERLACE_WEAVE:
 790       if( check_for_mmx() )
 791       {
 792         if( !deinterlace_weave_yuv_mmx(pdst,psrc,width,height) )
 793           xine_fast_memcpy(pdst,psrc[0],width*height);
 794       }
 795       else /* FIXME: provide an alternative? */
 796         xine_fast_memcpy(pdst,psrc[0],width*height);
 797       break;
 798     case DEINTERLACE_GREEDY:
 799       if( check_for_mmx() )
 800       {
 801         if( !deinterlace_greedy_yuv_mmx(pdst,psrc,width,height) )
 802           xine_fast_memcpy(pdst,psrc[0],width*height);
 803       }
 804       else /* FIXME: provide an alternative? */
 805         xine_fast_memcpy(pdst,psrc[0],width*height);
 806       break;
 807     case DEINTERLACE_ONEFIELD:
 808       if( check_for_mmx() )
 809         deinterlace_onefield_yuv_mmx(pdst,psrc,width,height);
 810       else /* FIXME: provide an alternative? */
 811         xine_fast_memcpy(pdst,psrc[0],width*height);
 812       break;
 813     case DEINTERLACE_ONEFIELDXV:
 814       lprintf("ONEFIELDXV must be handled by the video driver.\n");
 815       break;
 816     case DEINTERLACE_LINEARBLEND:
 817       if( check_for_mmx() )
 818         deinterlace_linearblend_yuv_mmx(pdst,psrc,width,height);
 819       else
 820         deinterlace_linearblend_yuv(pdst,psrc,width,height);
 821       break;
 822     default:
 823       lprintf("unknow method %d.\n",method);
 824       break;
 825   }
 826 }
 827
 828 int deinterlace_yuv_supported ( int method )
 829 {
 830   switch( method ) {
 831     case DEINTERLACE_NONE:
 832       return 1;
 833     case DEINTERLACE_BOB:
 834     case DEINTERLACE_WEAVE:
 835     case DEINTERLACE_GREEDY:
 836     case DEINTERLACE_ONEFIELD:
 837       return check_for_mmx();
 838     case DEINTERLACE_ONEFIELDXV:
 839       lprintf ("ONEFIELDXV must be handled by the video driver.\n");
 840       return 0;
 841     case DEINTERLACE_LINEARBLEND:
 842       return 1;
 843   }
 844
 845   return 0;
 846 }
 847
 848 char *deinterlace_methods[] = {
 849   "none",
 850   "bob",
 851   "weave",
 852   "greedy",
 853   "onefield",
 854   "onefield_xv",
 855   "linearblend",
 856   NULL
 857 };
 858
 859