research.m1stereo.tv/gitweb/ - melted/blob - src/modules/xine/deinterlace.c

   1  /*
   2  * Copyright (C) 2001 the xine project
   3  *
   4  * This file is part of xine, a free video player.
   5  *
   6  * xine is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * xine is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  19  *
  20  * Deinterlace routines by Miguel Freitas
  21  * based on DScaler project sources (deinterlace.sourceforge.net)
  22  *
  23  * Currently only available for Xv driver and MMX extensions
  24  *
  25  * small todo list:
  26  * - implement non-MMX versions for all methods
  27  * - support MMX2 instructions
  28  * - move some generic code from xv driver to this file
  29  * - make it also work for yuy2 frames
  30  *
  31  */
  32
  33 #include <stdio.h>
  34 #include <string.h>
  35 #include "deinterlace.h"
  36 #include "xineutils.h"
  37
  38 #define xine_fast_memcpy memcpy
  39 #define xine_fast_memmove memmove
  40
  41 /*
  42    DeinterlaceFieldBob algorithm
  43    Based on Virtual Dub plugin by Gunnar Thalin
  44    MMX asm version from dscaler project (deinterlace.sourceforge.net)
  45    Linux version for Xine player by Miguel Freitas
  46 */
  47 static void deinterlace_bob_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
  48     int width, int height )
  49 {
       /*
        * Edge-adaptive bob: builds a frame in pdst from psrc[0] alone.
        * The whole body is inside #ifdef USE_MMX, so without USE_MMX this
        * function compiles to nothing and leaves pdst untouched.
        *
        *   pdst   - destination buffer, written as rows of `width` bytes
        *   psrc   - source pointer array; only psrc[0] is read here
        *   width  - bytes per row (used both as row length and row stride)
        *   height - number of rows; halved below to count even/odd row pairs
        *
        * psrc[0] is walked as interleaved rows: even rows start at psrc[0],
        * odd rows at psrc[0] + width, both with a pitch of 2 * width.
        * Rows 0 and 1 and every following odd row are copied verbatim.  Each
        * even output row from row 2 on is built per 16-bit word: where the
        * edge test in the inner loop exceeds the threshold the average of the
        * odd rows above and below is used, otherwise the original even-row
        * data is kept.
        *
        * NOTE(review): rows are accessed through uint64_t pointers and only
        * width >> 3 quadwords per row are processed, so this presumably
        * expects width to be a multiple of 8 -- confirm with callers.
        */
  50 #ifdef USE_MMX
  51   int Line;
  52   uint64_t *YVal1;
  53   uint64_t *YVal2;
  54   uint64_t *YVal3;
  55   uint64_t *Dest;
  56   uint8_t* pEvenLines = psrc[0];
  57   uint8_t* pOddLines = psrc[0]+width;
  58   int LineLength = width;
  59   int SourcePitch = width * 2;
  60   int IsOdd = 1;   /* hard-coded: the !IsOdd paths below never execute */
  61   long EdgeDetect = 625;
  62   long JaggieThreshold = 73;
  63
  64   int n;
  65
  66   uint64_t qwEdgeDetect;
  67   uint64_t qwThreshold;
  68
       /* YMask keeps bytes 0,2,4,6 of a quadword; Mask clears bit 0 of every byte */
  69   static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
  70   static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
  71
       /* replicate each constant into all four 16-bit lanes of a quadword */
  72   qwEdgeDetect = EdgeDetect;
  73   qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  74   qwThreshold = JaggieThreshold;
  75   qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);
  76
  77
  78   // copy first even line no matter what, and the first odd line if we're
  79   // processing an odd field.
  80   xine_fast_memcpy(pdst, pEvenLines, LineLength);
  81   if (IsOdd)
  82     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);
  83
       /* from here on `height` counts even/odd row pairs, not rows */
  84   height = height / 2;
  85   for (Line = 0; Line < height - 1; ++Line)
  86   {
         /* YVal1/YVal3: kept-field rows above/below; YVal2: the row being rebuilt */
  87     if (IsOdd)
  88     {
  89       YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
  90       YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
  91       YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
  92       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
  93     }
  94     else
  95     {
  96       YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
  97       YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
  98       YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
  99       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
 100     }
 101
 102     // For ease of reading, the comments below assume that we're operating on an odd
 103     // field (i.e., that IsOdd is true).  The exact same processing is done when we
 104     // operate on an even field, but the roles of the odd and even fields are reversed.
 105     // It's just too cumbersome to explain the algorithm in terms of "the next odd
 106     // line if we're doing an odd field, or the next even line if we're doing an
 107     // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
 108     // the words' meanings invert when IsOdd is false.
 109
 110     // Copy the odd line (the row just below Dest) to the output verbatim.
 111     xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);
 112
         /* 8 bytes per iteration; a remainder of LineLength % 8 bytes is not written */
 113     n = LineLength >> 3;
 114     while( n-- )
 115     {
 116       movq_m2r (*YVal1++, mm0);
 117       movq_m2r (*YVal2++, mm1);
 118       movq_m2r (*YVal3++, mm2);
 119
 120       // get intensities in mm3 - 5 (O1, E, O2)
 121       movq_r2r ( mm0, mm3 );
 122       pand_m2r ( YMask, mm3 );
 123       movq_r2r ( mm1, mm4 );
 124       pand_m2r ( YMask, mm4 );
 125       movq_r2r ( mm2, mm5 );
 126       pand_m2r ( YMask, mm5 );
 127
 128       // get average of the two odd lines in mm0
 129       pand_m2r ( Mask, mm0 );
 130       pand_m2r ( Mask, mm2 );
 131       psrlw_i2r ( 01, mm0 );
 132       psrlw_i2r ( 01, mm2 );
 133       paddw_r2r ( mm2, mm0 );
 134
 135       // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
 136       // result will be in mm6
 137
           /* halve the intensities before the multiplications below */
 138       psrlw_i2r ( 01, mm3 );
 139       psrlw_i2r ( 01, mm4 );
 140       psrlw_i2r ( 01, mm5 );
 141
 142       movq_r2r ( mm3, mm6 );
 143       psubw_r2r ( mm4, mm6 );   //mm6 = O1 - E
 144
 145       movq_r2r ( mm5, mm7 );
 146       psubw_r2r ( mm4, mm7 );   //mm7 = O2 - E
 147
 148       pmullw_r2r ( mm7, mm6 );          // mm6 = (O1 - E) * (O2 - E)
 149
 150       movq_r2r ( mm3, mm7 );
 151       psubw_r2r ( mm5, mm7 );           // mm7 = (O1 - O2)
 152       pmullw_r2r ( mm7, mm7 );  // mm7 = (O1 - O2) ^ 2
 153       psrlw_i2r ( 12, mm7 );            // mm7 = (O1 - O2) ^ 2 >> 12
 154       pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7  = EdgeDetect * (O1 - O2) ^ 2 >> 12
 155
 156       psubw_r2r ( mm7, mm6 );      // mm6 is what we want
 157
 158       pcmpgtw_m2r ( *&qwThreshold, mm6 );   // mm6 = per-word mask: result > JaggieThreshold
 159
 160       movq_r2r ( mm6, mm7 );
 161
 162       pand_r2r ( mm6, mm0 );   // averaged odd lines where the mask is set
 163
 164       pandn_r2r ( mm1, mm7 );   // original even line where the mask is clear
 165
 166       por_r2r ( mm0, mm7 );
 167
 168       movq_r2m ( mm7, *Dest++ );
 169     }
 170   }
 171
 172   // Copy last odd line if we're processing an even field.
 173   if (! IsOdd)
 174   {
 175     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 176                       pOddLines + (height - 1) * SourcePitch,
 177                       LineLength);
 178   }
 179
 180   // clear out the MMX registers ready for doing floating point
 181   // again
 182   emms();
 183 #endif
 184 }
 185
 186 /* Deinterlace the latest field, with a tendency to weave rather than bob.
 187    Good for high detail on low-movement scenes.
 188    Seems to produce bad output in general case, need to check if this
 189    is normal or if the code is broken.
 190 */
 191 static int deinterlace_weave_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 192     int width, int height )
 193 {
       /*
        * Adaptive weave: keeps the current odd-row data where it resembles a
        * vertical neighbour or the same position in psrc[1], and falls back
        * to the average of the even rows above and below elsewhere.
        *
        *   pdst   - destination buffer, written as rows of `width` bytes
        *   psrc   - psrc[0] and psrc[1] are both read; both must be non-NULL
        *   width  - bytes per row (used both as row length and row stride)
        *   height - number of rows; halved below to count even/odd row pairs
        *
        * Returns 0, without writing pdst, when psrc[0] or psrc[1] is NULL;
        * returns 1 otherwise.  When USE_MMX is not defined the body is empty
        * and the function returns 1 without touching pdst.
        *
        * Even rows and the last odd row are copied verbatim; the other odd
        * rows are rebuilt per 16-bit word by the inner loop.
        *
        * NOTE(review): rows are accessed through uint64_t pointers and only
        * width >> 3 quadwords per row are processed, so this presumably
        * expects width to be a multiple of 8 -- confirm with callers.
        */
 194 #ifdef USE_MMX
 195
 196   int Line;
 197   uint64_t *YVal1;
 198   uint64_t *YVal2;
 199   uint64_t *YVal3;
 200   uint64_t *YVal4;
 201   uint64_t *Dest;
 202   uint8_t* pEvenLines = psrc[0];
 203   uint8_t* pOddLines = psrc[0]+width;
 204   uint8_t* pPrevLines;
 205
 206   int LineLength = width;
 207   int SourcePitch = width * 2;
 208   int IsOdd = 1;   /* hard-coded: the !IsOdd paths below never execute */
 209
 210   long TemporalTolerance = 300;
 211   long SpatialTolerance = 600;
 212   long SimilarityThreshold = 25;
 213
 214   int n;
 215
 216   uint64_t qwSpatialTolerance;
 217   uint64_t qwTemporalTolerance;
 218   uint64_t qwThreshold;
 219
       /* YMask keeps bytes 0,2,4,6 of a quadword; Mask clears bit 0 of every byte */
 220   static mmx_t YMask = {ub:{0xff,0,0xff,0,0xff,0,0xff,0}};
 221   static mmx_t Mask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
 222
 223
 224   // Make sure we have all the data we need.
 225   if ( psrc[0] == NULL || psrc[1] == NULL )
 226     return 0;
 227
       /* rows of psrc[1] that line up with the rows being rebuilt */
 228   if (IsOdd)
 229     pPrevLines = psrc[1] + width;
 230   else
 231     pPrevLines = psrc[1];
 232
 233   // Since the code uses MMX to process 4 pixels at a time, we need our constants
 234   // to be represented 4 times per quadword.
 235   qwSpatialTolerance = SpatialTolerance;
 236   qwSpatialTolerance += (qwSpatialTolerance << 48) + (qwSpatialTolerance << 32) + (qwSpatialTolerance << 16);
 237   qwTemporalTolerance = TemporalTolerance;
 238   qwTemporalTolerance += (qwTemporalTolerance << 48) + (qwTemporalTolerance << 32) + (qwTemporalTolerance << 16);
 239   qwThreshold = SimilarityThreshold;
 240   qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);
 241
 242   // copy first even line no matter what, and the first odd line if we're
 243   // processing an even field.
 244   xine_fast_memcpy(pdst, pEvenLines, LineLength);
 245   if (!IsOdd)
 246     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength);
 247
       /* from here on `height` counts even/odd row pairs, not rows */
 248   height = height / 2;
 249   for (Line = 0; Line < height - 1; ++Line)
 250   {
         /* YVal1/YVal3: rows above/below; YVal2: current row; YVal4: same row in psrc[1] */
 251     if (IsOdd)
 252     {
 253       YVal1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
 254       YVal2 = (uint64_t *)(pOddLines + Line * SourcePitch);
 255       YVal3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 256       YVal4 = (uint64_t *)(pPrevLines + Line * SourcePitch);
 257       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
 258     }
 259     else
 260     {
 261       YVal1 = (uint64_t *)(pOddLines + Line * SourcePitch);
 262       YVal2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 263       YVal3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
 264       YVal4 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch);
 265       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
 266     }
 267
 268     // For ease of reading, the comments below assume that we're operating on an odd
 269     // field (i.e., that IsOdd is true).  The exact same processing is done when we
 270     // operate on an even field, but the roles of the odd and even fields are reversed.
 271     // It's just too cumbersome to explain the algorithm in terms of "the next odd
 272     // line if we're doing an odd field, or the next even line if we're doing an
 273     // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
 274     // the words' meanings invert when IsOdd is false.
 275
 276     // Copy the even scanline below this one to the overlay buffer, since we'll be
 277     // adapting the current scanline to the even lines surrounding it.  The scanline
 278     // above has already been copied by the previous pass through the loop.
 279     xine_fast_memcpy((char *)Dest + LineLength, YVal3, LineLength);
 280
         /* 8 bytes per iteration; a remainder of LineLength % 8 bytes is not written */
 281     n = LineLength >> 3;
 282     while( n-- )
 283     {
 284       movq_m2r ( *YVal1++, mm0 );    // mm0 = E1
 285       movq_m2r ( *YVal2++, mm1 );    // mm1 = O
 286       movq_m2r ( *YVal3++, mm2 );    // mm2 = E2
 287
 288       movq_r2r ( mm0, mm3 );       // mm3 = intensity(E1)
 289       movq_r2r ( mm1, mm4 );       // mm4 = intensity(O)
 290       movq_r2r ( mm2, mm6 );       // mm6 = intensity(E2)
 291
 292       pand_m2r ( YMask, mm3 );
 293       pand_m2r ( YMask, mm4 );
 294       pand_m2r ( YMask, mm6 );
 295
 296       // Average E1 and E2 for interpolated bobbing.
 297       // leave result in mm0
 298       pand_m2r ( Mask, mm0 ); // mm0 = E1 with bit 0 of every byte stripped off
 299       pand_m2r ( Mask, mm2 ); // mm2 = E2 with bit 0 of every byte stripped off
 300       psrlw_i2r ( 01, mm0 );    // mm0 = E1 / 2
 301       psrlw_i2r ( 01, mm2 );    // mm2 = E2 / 2
 302       paddb_r2r ( mm2, mm0 );
 303
 304       // The meat of the work is done here.  We want to see whether this pixel is
 305       // close in luminosity to ANY of: its top neighbor, its bottom neighbor,
 306       // or its predecessor.  To do this without branching, we use MMX's
 307       // saturation feature, which gives us Z(x) = x if x>=0, or 0 if x<0.
 308       //
 309       // The formula we're computing here is
 310       //                Z(ST - (E1 - O) ^ 2) + Z(ST - (E2 - O) ^ 2) + Z(TT - (Oold - O) ^ 2)
 311       // where ST is spatial tolerance and TT is temporal tolerance.  The idea
 312       // is that if a pixel is similar to none of its neighbors, the resulting
 313       // value will be pretty low, probably zero.  A high value therefore indicates
 314       // that the pixel had a similar neighbor.  The pixel in the same position
 315       // in the field before last (Oold) is considered a neighbor since we want
 316       // to be able to display 1-pixel-high horizontal lines.
 317       //
 318       // Note that each difference is halved (psraw 1) before it is squared.
 319
 318       movq_m2r ( *&qwSpatialTolerance, mm7 );
 319       movq_r2r ( mm3, mm5 );     // mm5 = E1
 320       psubsw_r2r ( mm4, mm5 );   // mm5 = E1 - O
 321       psraw_i2r ( 1, mm5 );      // mm5 = (E1 - O) / 2
 322       pmullw_r2r ( mm5, mm5 );   // mm5 = ((E1 - O) / 2) ^ 2
 323       psubusw_r2r ( mm5, mm7 );  // mm7 = ST - ((E1 - O) / 2) ^ 2, or 0 if that's negative
 324
 325       movq_m2r ( *&qwSpatialTolerance, mm3 );
 326       movq_r2r ( mm6, mm5 );    // mm5 = E2
 327       psubsw_r2r ( mm4, mm5 );  // mm5 = E2 - O
 328       psraw_i2r ( 1, mm5 );     // mm5 = (E2 - O) / 2
 329       pmullw_r2r ( mm5, mm5 );  // mm5 = ((E2 - O) / 2) ^ 2
 330       psubusw_r2r ( mm5, mm3 ); // mm3 = ST - ((E2 - O) / 2) ^ 2, or 0 if that's negative
 331       paddusw_r2r ( mm3, mm7 ); // mm7 = sum of the two spatial terms
 332
 333       movq_m2r ( *&qwTemporalTolerance, mm3 );
 334       movq_m2r ( *YVal4++, mm5 ); // mm5 = Oold
 335       pand_m2r ( YMask, mm5 );
 336       psubsw_r2r ( mm4, mm5 );  // mm5 = Oold - O
 337       psraw_i2r ( 1, mm5 ); // XXX
 338       pmullw_r2r ( mm5, mm5 );  // mm5 = ((Oold - O) / 2) ^ 2
 339       psubusw_r2r ( mm5, mm3 ); /* mm3 = TT - ((Oold - O) / 2) ^ 2, or 0 if that's negative */
 340       paddusw_r2r ( mm3, mm7 ); // mm7 = our magic number
 341
 342       /*
 343        * Now compare the similarity totals against our threshold.  The pcmpgtw
 344        * instruction will populate the target register with a bunch of mask bits,
 345        * filling words where the comparison is true with 1s and ones where it's
 346        * false with 0s.  A few ANDs and NOTs and an OR later, we have bobbed
 347        * values for pixels under the similarity threshold and weaved ones for
 348        * pixels over the threshold.
 349        */
 350
 351       pcmpgtw_m2r( *&qwThreshold, mm7 ); // mm7 = 0xffff where we're greater than the threshold, 0 elsewhere
 352       movq_r2r ( mm7, mm6 );  // mm6 = 0xffff where we're greater than the threshold, 0 elsewhere
 353       pand_r2r ( mm1, mm7 );  // mm7 = weaved data where we're greater than the threshold, 0 elsewhere
 354       pandn_r2r ( mm0, mm6 ); // mm6 = bobbed data where we're not greater than the threshold, 0 elsewhere
 355       por_r2r ( mm6, mm7 );   // mm7 = bobbed and weaved data
 356
 357       movq_r2m ( mm7, *Dest++ );
 358     }
 359   }
 360
 361   // Copy last odd line if we're processing an odd field.
 362   if (IsOdd)
 363   {
 364     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 365                       pOddLines + (height - 1) * SourcePitch,
 366                       LineLength);
 367   }
 368
 369   // clear out the MMX registers ready for doing floating point
 370   // again
 371   emms();
 372
 373 #endif
 374
 375   return 1;
 376 }
 377
 378
 379 // This is a simple lightweight DeInterlace method that uses little CPU time
 380 // but gives very good results for low or intermediate motion. (MORE CPU THAN BOB)
 381 // It defers frames by one field, but that does not seem to produce noticeable
 382 // lip sync problems.
 383 //
 384 // The method used is to take either the older or newer weave pixel depending
 385 // upon which gives the smaller comb factor, and then clip to avoid large damage
 386 // when wrong.
 387 //
 388 // I'd intended this to be part of a larger more elaborate method added to
 389 // Blended Clip but this gives results too good for the CPU cost to ignore here.
 390 static int deinterlace_greedy_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 391     int width, int height )
 392 {
       /*
        * Greedy weave: for every byte of a rebuilt row, picks either the
        * current row's value (L2) or the value at the same position in
        * psrc[1] (LP2), whichever is closer to the average of the rows above
        * and below, then clamps it to
        * [Min(L1,L3) - MaxComb, Max(L1,L3) + MaxComb] with saturation.
        *
        *   pdst   - destination buffer, written as rows of `width` bytes
        *   psrc   - psrc[0] and psrc[1] are both read; both must be non-NULL
        *   width  - bytes per row (used both as row length and row stride)
        *   height - number of rows; halved below to count even/odd row pairs
        *
        * Returns 0, without writing pdst, when psrc[0] or psrc[1] is NULL;
        * returns 1 otherwise.  When USE_MMX is not defined the body is empty
        * and the function returns 1 without touching pdst.
        *
        * NOTE(review): rows are accessed through uint64_t pointers and only
        * LineLength / 8 quadwords per row are processed, so this presumably
        * expects width to be a multiple of 8 -- confirm with callers.
        */
 393 #ifdef USE_MMX
 394   int Line;
 395   int   LoopCtr;
 396   uint64_t *L1;                                 // ptr to Line1, of 3
 397   uint64_t *L2;                                 // ptr to Line2, the weave line
 398   uint64_t *L3;                                 // ptr to Line3
 399   uint64_t *LP2;                                        // ptr to prev Line2
 400   uint64_t *Dest;
 401   uint8_t* pEvenLines = psrc[0];
 402   uint8_t* pOddLines = psrc[0]+width;
 403   uint8_t* pPrevLines;
 404
       /* clears bit 0 of every byte ahead of the word-wide shift used for averaging */
 405   static mmx_t ShiftMask = {ub:{0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe}};
 406
 407   int LineLength = width;
 408   int SourcePitch = width * 2;
 409   int IsOdd = 1;   /* hard-coded: the !IsOdd paths below never execute */
 410   long GreedyMaxComb = 15;
 411   static mmx_t MaxComb;   /* static storage, refilled below on every call */
 412   int i;
 413
 414   if ( psrc[0] == NULL || psrc[1] == NULL )
 415     return 0;
 416
       /* rows of psrc[1] that line up with the rows being rebuilt */
 417   if (IsOdd)
 418     pPrevLines = psrc[1] + width;
 419   else
 420     pPrevLines = psrc[1];
 421
 422
 423   for( i = 0; i < 8; i++ )
 424     MaxComb.ub[i] = GreedyMaxComb; // How badly do we let it weave? 0-255
 425
 426
 427   // copy first even line no matter what, and the first odd line if we're
 428   // processing an EVEN field. (note diff from other deint rtns.)
 429   xine_fast_memcpy(pdst, pEvenLines, LineLength); //DL0
 430   if (!IsOdd)
 431     xine_fast_memcpy(pdst + LineLength, pOddLines, LineLength); //DL1
 432
       /* from here on `height` counts even/odd row pairs, not rows */
 433   height = height / 2;
 434   for (Line = 0; Line < height - 1; ++Line)
 435   {
 436     LoopCtr = LineLength / 8;                           // there are LineLength / 8 qwords per line
 437
 438     if (IsOdd)
 439     {
 440       L1 = (uint64_t *)(pEvenLines + Line * SourcePitch);
 441       L2 = (uint64_t *)(pOddLines + Line * SourcePitch);
 442       L3 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 443       LP2 = (uint64_t *)(pPrevLines + Line * SourcePitch); // prev Odd lines
 444       Dest = (uint64_t *)(pdst + (Line * 2 + 1) * LineLength);
 445     }
 446     else
 447     {
 448       L1 = (uint64_t *)(pOddLines + Line * SourcePitch);
 449       L2 = (uint64_t *)(pEvenLines + (Line + 1) * SourcePitch);
 450       L3 = (uint64_t *)(pOddLines + (Line + 1) * SourcePitch);
 451       LP2 = (uint64_t *)(pPrevLines + (Line + 1) * SourcePitch); //prev even lines
 452       Dest = (uint64_t *)(pdst + (Line * 2 + 2) * LineLength);
 453     }
 454
         /* the row below Dest (L3) goes to the output unchanged */
 455     xine_fast_memcpy((char *)Dest + LineLength, L3, LineLength);
 456
 457 // For ease of reading, the comments below assume that we're operating on an odd
 458 // field (i.e., that IsOdd is true).  Assume the obvious for even lines..
 459
 460     while( LoopCtr-- )
 461     {
 462       movq_m2r ( *L1++, mm1 );
 463       movq_m2r ( *L2++, mm2 );
 464       movq_m2r ( *L3++, mm3 );
 465       movq_m2r ( *LP2++, mm0 );
 466
 467       // average L1 and L3 leave result in mm4
 468       movq_r2r ( mm1, mm4 );    // L1
 469
 470       pand_m2r ( ShiftMask, mm4 );
 471       psrlw_i2r ( 01, mm4 );
 472       movq_r2r ( mm3, mm5 );  // L3
 473       pand_m2r ( ShiftMask, mm5 );
 474       psrlw_i2r ( 01, mm5 );
 475       paddb_r2r ( mm5, mm4 );  // the average, for computing comb
 476
 477       // get abs value of possible L2 comb
 478       movq_r2r  ( mm2, mm7 );                           // L2
 479       psubusb_r2r ( mm4, mm7 );                         // L2 - avg
 480       movq_r2r ( mm4, mm5 );                            // avg
 481       psubusb_r2r ( mm2, mm5 );                         // avg - L2
 482       por_r2r ( mm7, mm5 );                             // abs(avg-L2)
 483       movq_r2r ( mm4, mm6 );     // copy of avg for later
 484
 485       // get abs value of possible LP2 comb
 486       movq_r2r ( mm0, mm7 );                            // LP2
 487       psubusb_r2r ( mm4, mm7 );                         // LP2 - avg
 488       psubusb_r2r ( mm0, mm4 );                         // avg - LP2
 489       por_r2r ( mm7, mm4 );                             // abs(avg-LP2)
 490
 491       // use L2 or LP2 depending upon which makes smaller comb
 492       psubusb_r2r ( mm5, mm4 );                         // see if it goes to zero
 493       psubusb_r2r ( mm5, mm5 );                         // 0
 494       pcmpeqb_r2r ( mm5, mm4 );                         // if (mm4=0) then FF else 0
 495       pcmpeqb_r2r ( mm4, mm5 );                         // opposite of mm4
 496
 497       // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
 498       pand_r2r ( mm2, mm5 );                            // use L2 if mm5 == ff, else 0
 499       pand_r2r ( mm0, mm4 );                            // use LP2 if mm4 = ff, else 0
 500       por_r2r ( mm5, mm4 );                             // may the best win
 501
 502       // Now lets clip our chosen value to be not outside of the range
 503       // of the high/low range L1-L3 by more than MaxComb.
 504       // This allows some comb but limits the damages and also allows more
 505       // detail than a boring oversmoothed clip.
 506
 507       movq_r2r ( mm1, mm2 );                            // copy L1
 508       psubusb_r2r ( mm3, mm2 );                         // - L3, with saturation
 509       paddusb_r2r ( mm3, mm2 );                // now = Max(L1,L3)
 510
 511       pcmpeqb_r2r ( mm7, mm7 );                         // all ffffffff
 512       psubusb_r2r ( mm1, mm7 );                         // - L1
 513       paddusb_r2r ( mm7, mm3 );                         // add, may sat at fff..
 514       psubusb_r2r ( mm7, mm3 );                         // now = Min(L1,L3)
 515
 516       // allow the value to be above the high or below the low by amt of MaxComb
 517       paddusb_m2r ( MaxComb, mm2 );                     // increase max by diff
 518       psubusb_m2r ( MaxComb, mm3 );                     // lower min by diff
 519
 520       psubusb_r2r ( mm3, mm4 );                         // best - Min
 521       paddusb_r2r ( mm3, mm4 );                         // now = Max(best, lowered Min(L1,L3))
 522
 523       pcmpeqb_r2r ( mm7, mm7 );                         // all ffffffff
 524       psubusb_r2r ( mm4, mm7 );                         // - Max(best, lowered Min(L1,L3))
 525       paddusb_r2r ( mm7, mm2 );                         // add may sat at FFF..
 526       psubusb_r2r ( mm7, mm2 );                         // now = Min( Max(best, lowered Min), raised Max ) = best, clipped
 527
 528       movq_r2m ( mm2, *Dest++ );        // move in our clipped best
 529
 530     }
 531   }
 532
 533   /* Copy last odd line if we're processing an Odd field. */
 534   if (IsOdd)
 535   {
 536     xine_fast_memcpy(pdst + (height * 2 - 1) * LineLength,
 537                       pOddLines + (height - 1) * SourcePitch,
 538                       LineLength);
 539   }
 540
 541   /* clear out the MMX registers ready for doing floating point again */
 542   emms();
 543
 544 #endif
 545
 546   return 1;
 547 }
 548
 549 /* Use one field to interpolate the other (low cpu utilization)
 550    Will lose resolution but does not produce weaving effect
 551    (good for fast moving scenes) also known as "linear interpolation"
 552 */
 static void deinterlace_onefield_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
     int width, int height )
 {
   /*
    * Single-field linear interpolation: the rows of one field of psrc[0]
    * are copied to pdst unchanged and every row between two of them is
    * filled with their average.  Only psrc[0] is read.
    *
    * The body exists only when USE_MMX is defined; otherwise this function
    * does nothing and pdst is left untouched.
    */
 #ifdef USE_MMX
   /* 0xfe in every byte: clears bit 0 before the word-wide halving shift */
   static mmx_t low_bit_off = { .ub = {0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe} };

   const int row_bytes   = width;
   const int field_pitch = width * 2;   /* distance between rows of one field */
   const int odd_field   = 1;           /* fixed: the odd field is the one kept */

   uint8_t *even_rows = psrc[0];
   uint8_t *odd_rows  = psrc[0] + width;

   /* rows of the kept field, and the output row its first average lands on */
   uint8_t *kept_rows    = odd_field ? odd_rows : even_rows;
   const int first_avg_row = odd_field ? 2 : 1;

   int pair;
   int pairs;

   /*
    * The first even row is always copied; the first odd row too when the
    * odd field is the one being kept.
    */
   xine_fast_memcpy(pdst, even_rows, row_bytes);
   if (odd_field)
     xine_fast_memcpy(pdst + row_bytes, odd_rows, row_bytes);

   pairs = height / 2;
   for (pair = 0; pair < pairs - 1; ++pair)
   {
     uint64_t *upper = (uint64_t *)(kept_rows + pair * field_pitch);
     uint64_t *lower = (uint64_t *)(kept_rows + (pair + 1) * field_pitch);
     uint64_t *out   = (uint64_t *)(pdst + (pair * 2 + first_avg_row) * row_bytes);
     int quads = row_bytes >> 3;   /* 8 bytes are handled per iteration */

     /* the kept row below the averaged one goes out verbatim */
     xine_fast_memcpy((char *)out + row_bytes, lower, row_bytes);

     while( quads-- )
     {
       movq_m2r (*upper++, mm0);
       movq_m2r (*lower++, mm2);

       /* mm0 = upper / 2 + lower / 2 */
       pand_m2r ( low_bit_off, mm0 );
       pand_m2r ( low_bit_off, mm2 );
       psrlw_i2r ( 01, mm0 );
       psrlw_i2r ( 01, mm2 );
       paddw_r2r ( mm2, mm0 );

       movq_r2m ( mm0, *out++ );
     }
   }

   /* When the even field is kept, the last odd row still has to be copied. */
   if (!odd_field)
   {
     xine_fast_memcpy(pdst + (pairs * 2 - 1) * row_bytes,
                      odd_rows + (pairs - 1) * field_pitch,
                      row_bytes);
   }

   /* leave MMX state so floating point can be used again */
   emms();
 #endif
 }
 630
 631 /* Linear Blend filter - does a kind of vertical blurring on the image.
 632    (idea borrowed from mplayer's sources)
 633 */
 634 static void deinterlace_linearblend_yuv_mmx( uint8_t *pdst, uint8_t *psrc[],
 635     int width, int height )
 636 {
       /*
        * MMX linear blend: output row i is built from source rows i-1, i and
        * i+1 of psrc[0], with row i weighted twice; the first and the last
        * row are copied unchanged.  Only psrc[0] is read.
        *
        *   pdst   - destination buffer, written as rows of `width` bytes
        *   psrc   - source pointer array; only psrc[0] is used
        *   width  - bytes per row (used both as row length and row stride)
        *   height - number of rows
        *
        * The body exists only when USE_MMX is defined; otherwise the
        * function does nothing.
        *
        * NOTE(review): only width >> 3 quadwords per row are blended, so
        * this presumably expects width to be a multiple of 8 -- confirm.
        * NOTE(review): when height < 2 the loop is skipped with Line == 1,
        * so the final copy addresses row 1 of both buffers -- confirm that
        * callers never pass such a height.
        */
 637 #ifdef USE_MMX
 638   int Line;
 639   uint64_t *YVal1;
 640   uint64_t *YVal2;
 641   uint64_t *YVal3;
 642   uint64_t *Dest;
 643   int LineLength = width;
 644
 645   int n;
 646
 647   /* Copy first line */
 648   xine_fast_memmove(pdst, psrc[0], LineLength);
 649
 650   for (Line = 1; Line < height - 1; ++Line)
 651   {
         /* YVal1/YVal2/YVal3 = source rows Line-1, Line, Line+1 */
 652     YVal1 = (uint64_t *)(psrc[0] + (Line - 1) * LineLength);
 653     YVal2 = (uint64_t *)(psrc[0] + (Line) * LineLength);
 654     YVal3 = (uint64_t *)(psrc[0] + (Line + 1) * LineLength);
 655     Dest = (uint64_t *)(pdst + Line * LineLength);
 656
 657     n = LineLength >> 3;
 658     while( n-- )
 659     {
 660       /* load data from 3 lines */
 661       movq_m2r (*YVal1++, mm0);
 662       movq_m2r (*YVal2++, mm1);
 663       movq_m2r (*YVal3++, mm2);
 664
 665       /* expand bytes to words */
           /* upper four bytes of each line go to mm3-mm5, lower four stay in mm0-mm2 */
 666       punpckhbw_r2r (mm0, mm3);
 667       punpckhbw_r2r (mm1, mm4);
 668       punpckhbw_r2r (mm2, mm5);
 669       punpcklbw_r2r (mm0, mm0);
 670       punpcklbw_r2r (mm1, mm1);
 671       punpcklbw_r2r (mm2, mm2);
 672
 673       /*
 674        * deinterlacing:
 675        * deint_line = (line0 + 2*line1 + line2) / 4
 676        */
           /* the middle line (mm1/mm4) is shifted one bit less, which gives it double weight */
 677       psrlw_i2r (07, mm0);
 678       psrlw_i2r (06, mm1);
 679       psrlw_i2r (07, mm2);
 680       psrlw_i2r (07, mm3);
 681       psrlw_i2r (06, mm4);
 682       psrlw_i2r (07, mm5);
 683       paddw_r2r (mm1, mm0);
 684       paddw_r2r (mm2, mm0);
 685       paddw_r2r (mm4, mm3);
 686       paddw_r2r (mm5, mm3);
 687       psrlw_i2r (03, mm0);
 688       psrlw_i2r (03, mm3);
 689
 690       /* pack 8 words to 8 bytes in mm0 */
 691       packuswb_r2r (mm3, mm0);
 692
 693       movq_r2m ( mm0, *Dest++ );
 694     }
 695   }
 696
 697   /* Copy last line */
       /* Line is left at height - 1 by the loop whenever height >= 2 */
 698   xine_fast_memmove(pdst + Line * LineLength,
 699                    psrc[0] + Line * LineLength, LineLength);
 700
 701   /* clear out the MMX registers ready for doing floating point
 702    * again
 703    */
 704   emms();
 705 #endif
 706 }
 707
 708 /* Linear Blend filter - C version contributed by Rogerio Brito.
 709    This algorithm has the same interface as the other functions.
 710
 711    The destination "screen" (pdst) is constructed from the source
 712    screen (psrc[0]) line by line.
 713
 714    The i-th line of the destination screen is the average of 3 lines
 715    from the source screen: the (i-1)-th, i-th and (i+1)-th lines, with
 716    the i-th line having weight 2 in the computation.
 717
 718    Remarks:
 719    * each line on pdst doesn't depend on previous lines;
 720    * due to the way the algorithm is defined, the first & last lines of the
 721      screen aren't deinterlaced.
 722
 723 */
 static void deinterlace_linearblend_yuv( uint8_t *pdst, uint8_t *psrc[],
                                          int width, int height )
 {
   /*
    * Portable linear blend of psrc[0] into pdst.
    *
    *   pdst   - destination, `height` rows of `width` bytes
    *   psrc   - source pointer array; only psrc[0] is read
    *   width  - bytes per row (also the row stride)
    *   height - number of rows
    *
    * Output row i (0 < i < height-1) is
    *   (src[i-1] + 2*src[i] + src[i+1]) >> 2
    * and the first and last rows are copied from the matching source rows.
    * Nothing is written when width or height is not positive, and a
    * one-row image is simply copied.
    */
   int x, y;
   const uint8_t *l1, *l2, *l3;   /* source rows y-1, y, y+1 */
   uint8_t *l0;                   /* current output row */

   if (width <= 0 || height <= 0)
     return;

   l0 = pdst;
   l1 = psrc[0];

   /* First row is copied as-is (plain memcpy: xine_fast_memcpy is
    * #defined to memcpy in this file). */
   memcpy(l0, l1, (size_t)width);
   if (height == 1)
     return;                      /* there is no separate last row to copy */

   l0 += width;
   l2 = l1 + width;

   for (y = 1; y < height - 1; ++y) {
     l3 = l2 + width;

     for (x = 0; x < width; ++x) {
       l0[x] = (uint8_t)((l1[x] + (l2[x] << 1) + l3[x]) >> 2);
     }

     /* slide the three-row window down by one row */
     l1 = l2;
     l2 = l3;
     l0 += width;
   }

   /*
    * Last row: l0 is output row height-1 and l2 is source row height-1.
    * (l1 is row height-2; copying it would duplicate the wrong row.)
    */
   memcpy(l0, l2, (size_t)width);
 }
 754
 static int check_for_mmx(void)
 {
   /*
    * Returns 1 when the MMX code paths may be used, 0 otherwise.
    * Builds without USE_MMX always report 0.
    */
 #ifndef USE_MMX
   return 0;
 #else
   /* result of xine_mm_accel(), fetched on the first call; -1 = not fetched yet */
   static int accel_flags = -1;

   if ( accel_flags == -1 )
     accel_flags = xine_mm_accel();

   return (accel_flags & MM_ACCEL_X86_MMX) ? 1 : 0;
 #endif
 }
 769
 770 /* generic YUV deinterlacer
 771    pdst -> pointer to destination bitmap
 772    psrc -> array of pointers to source bitmaps ([0] = most recent)
 773    width,height -> dimension for bitmaps
 774    method -> DEINTERLACE_xxx
 775 */
 776
 777 void deinterlace_yuv( uint8_t *pdst, uint8_t *psrc[],
 778     int width, int height, int method )
 779 {
 780   switch( method ) {
 781     case DEINTERLACE_NONE:
 782       xine_fast_memcpy(pdst,psrc[0],width*height);
 783       break;
 784     case DEINTERLACE_BOB:
 785       if( check_for_mmx() )
 786         deinterlace_bob_yuv_mmx(pdst,psrc,width,height);
 787       else /* FIXME: provide an alternative? */
 788         xine_fast_memcpy(pdst,psrc[0],width*height);
 789       break;
 790     case DEINTERLACE_WEAVE:
 791       if( check_for_mmx() )
 792       {
 793         if( !deinterlace_weave_yuv_mmx(pdst,psrc,width,height) )
 794           xine_fast_memcpy(pdst,psrc[0],width*height);
 795       }
 796       else /* FIXME: provide an alternative? */
 797         xine_fast_memcpy(pdst,psrc[0],width*height);
 798       break;
 799     case DEINTERLACE_GREEDY:
 800       if( check_for_mmx() )
 801       {
 802         if( !deinterlace_greedy_yuv_mmx(pdst,psrc,width,height) )
 803           xine_fast_memcpy(pdst,psrc[0],width*height);
 804       }
 805       else /* FIXME: provide an alternative? */
 806         xine_fast_memcpy(pdst,psrc[0],width*height);
 807       break;
 808     case DEINTERLACE_ONEFIELD:
 809       if( check_for_mmx() )
 810         deinterlace_onefield_yuv_mmx(pdst,psrc,width,height);
 811       else /* FIXME: provide an alternative? */
 812         xine_fast_memcpy(pdst,psrc[0],width*height);
 813       break;
 814     case DEINTERLACE_ONEFIELDXV:
 815       lprintf("ONEFIELDXV must be handled by the video driver.\n");
 816       break;
 817     case DEINTERLACE_LINEARBLEND:
 818       if( check_for_mmx() )
 819         deinterlace_linearblend_yuv_mmx(pdst,psrc,width,height);
 820       else
 821         deinterlace_linearblend_yuv(pdst,psrc,width,height);
 822       break;
 823     default:
 824       lprintf("unknown method %d.\n",method);
 825       break;
 826   }
 827 }
 828
 829 int deinterlace_yuv_supported ( int method )
 830 {
 831   switch( method ) {
 832     case DEINTERLACE_NONE:
 833       return 1;
 834     case DEINTERLACE_BOB:
 835     case DEINTERLACE_WEAVE:
 836     case DEINTERLACE_GREEDY:
 837     case DEINTERLACE_ONEFIELD:
 838       return check_for_mmx();
 839     case DEINTERLACE_ONEFIELDXV:
 840       lprintf ("ONEFIELDXV must be handled by the video driver.\n");
 841       return 0;
 842     case DEINTERLACE_LINEARBLEND:
 843       return 1;
 844   }
 845
 846   return 0;
 847 }
 848
 849 const char *deinterlace_methods[] = {
       /*
        * NULL-terminated list of method names.
        * NOTE(review): presumably indexed by the DEINTERLACE_xxx method value
        * (NONE, BOB, WEAVE, GREEDY, ONEFIELD, ONEFIELDXV, LINEARBLEND); the
        * constants are declared outside this file, so confirm the order
        * matches before adding or reordering entries.
        */
 850   "none",
 851   "bob",
 852   "weave",
 853   "greedy",
 854   "onefield",
 855   "onefield_xv",
 856   "linearblend",
 857   NULL
 858 };
 859
 860
 859
 860