From 5f0547204a5a08185e9d8162016cc2ec9a670486 Mon Sep 17 00:00:00 2001
From: ddennedy <ddennedy@d19143bc-622f-0410-bfdd-b5b2a6649095>
Date: Fri, 30 Jan 2004 13:51:05 +0000
Subject: [PATCH] some bugfixes and rescale filter

git-svn-id: https://mlt.svn.sourceforge.net/svnroot/mlt/trunk/mlt@101 d19143bc-622f-0410-bfdd-b5b2a6649095
---
 src/framework/mlt_frame.c                |   22 +-
 src/modules/avformat/producer_avformat.c |    1 +
 src/modules/gtk2/Makefile                |   21 +-
 src/modules/gtk2/configure               |    4 +
 src/modules/gtk2/factory.c               |    3 +
 src/modules/gtk2/filter_rescale.c        |  143 ++++++
 src/modules/gtk2/filter_rescale.h        |   28 ++
 src/modules/gtk2/have_mmx.S              |   53 ++
 src/modules/gtk2/pixops.c                |  781 ++++++++++++++++++++++++++++++
 src/modules/gtk2/pixops.h                |   69 +++
 src/modules/gtk2/producer_pango.c        |    2 +-
 src/modules/gtk2/scale_line_22_33_mmx.S  |  183 +++++++
 src/modules/vorbis/Makefile              |    2 +-
 13 files changed, 1295 insertions(+), 17 deletions(-)
 create mode 100644 src/modules/gtk2/filter_rescale.c
 create mode 100644 src/modules/gtk2/filter_rescale.h
 create mode 100644 src/modules/gtk2/have_mmx.S
 create mode 100644 src/modules/gtk2/pixops.c
 create mode 100644 src/modules/gtk2/pixops.h
 create mode 100644 src/modules/gtk2/scale_line_22_33_mmx.S

diff --git a/src/framework/mlt_frame.c b/src/framework/mlt_frame.c
index 3f583d9..632d73a 100644
--- a/src/framework/mlt_frame.c
+++ b/src/framework/mlt_frame.c
@@ -723,15 +723,18 @@ int mlt_frame_mix_audio( mlt_frame this, mlt_frame that, float weight, int16_t *
 	int16_t *src, *dest;
 	//static int16_t *extra_src = NULL, *extra_dest = NULL;
 	static int extra_src_samples = 0, extra_dest_samples = 0;
-	int frequency_src = 0, frequency_dest = 0;
-	int channels_src = 0, channels_dest = 0;
-	int samples_src = 0, samples_dest = 0;
+	int frequency_src = *channels, frequency_dest = *channels;
+	int channels_src = *channels, channels_dest = *channels;
+	int samples_src = *samples, samples_dest = *samples;
 	int i, j;
+	double d = 0, s = 0;
 
 	mlt_frame_get_audio( this, &p_dest, format, &frequency_dest, &channels_dest, &samples_dest );
-	//fprintf( stderr, "frame dest samples %d channels %d position %f\n", samples_dest, channels_dest, mlt_properties_get_position( mlt_frame_properties( this ), "position" ) );
+	fprintf( stderr, "frame dest samples %d channels %d position %lld\n", samples_dest, channels_dest, mlt_properties_get_position( mlt_frame_properties( this ), "position" ) );
 	mlt_frame_get_audio( that, &p_src, format, &frequency_src, &channels_src, &samples_src );
-	//fprintf( stderr, "frame src  samples %d channels %d\n", samples_src, channels_src );
+	fprintf( stderr, "frame src  samples %d channels %d\n", samples_src, channels_src );
+	src = p_src;
+	dest = p_dest;
 	if ( channels_src > 6 )
 		channels_src = 0;
 	if ( channels_dest > 6 )
@@ -759,9 +762,6 @@ int mlt_frame_mix_audio( mlt_frame this, mlt_frame that, float weight, int16_t *
 	}
 	else
 		src = p_src;
-#else
-	src = p_src;
-	dest = p_dest;
 #endif
 
 	// determine number of samples to process	
@@ -778,8 +778,10 @@ int mlt_frame_mix_audio( mlt_frame this, mlt_frame that, float weight, int16_t *
 	{
 		for ( j = 0; j < *channels; j++ )
 		{
-			double d = (double) dest[ i * channels_dest + j ];
-			double s = (double) src[ i * channels_src + j ];
+			if ( j < channels_dest )
+				d = (double) dest[ i * channels_dest + j ];
+			if ( j < channels_src )
+				s = (double) src[ i * channels_src + j ];
 			dest[ i * channels_dest + j ] = s * weight + d * ( 1.0 - weight );
 		}
 	}
diff --git a/src/modules/avformat/producer_avformat.c b/src/modules/avformat/producer_avformat.c
index d19c165..bbefd2a 100644
--- a/src/modules/avformat/producer_avformat.c
+++ b/src/modules/avformat/producer_avformat.c
@@ -699,6 +699,7 @@ static int producer_get_audio( mlt_frame frame, int16_t **buffer, mlt_audio_form
 		}
 
 		// Now handle the audio if we have enough
+
 		if ( audio_used >= *samples )
 		{
 			*buffer = malloc( *samples * *channels * sizeof( int16_t ) );
diff --git a/src/modules/gtk2/Makefile b/src/modules/gtk2/Makefile
index 3130918..cd001dd 100644
--- a/src/modules/gtk2/Makefile
+++ b/src/modules/gtk2/Makefile
@@ -3,9 +3,14 @@ TARGET = ../libmltgtk2.so
 
 OBJS = factory.o \
 	   producer_pixbuf.o \
-	   producer_pango.o
+	   producer_pango.o \
+	   pixops.o \
+	   filter_rescale.o
 
-CFLAGS = `pkg-config gdk-pixbuf-2.0 --cflags` `pkg-config pangoft2 --cflags` -I../../ -Wall -g -D_FILE_OFFSET_BITS=64 -pthread
+ASM_OBJS = have_mmx.o \
+		   scale_line_22_33_mmx.o
+
+CFLAGS = -O3 -DUSE_MMX `pkg-config gdk-pixbuf-2.0 --cflags` `pkg-config pangoft2 --cflags` -I../../ -Wall -g -D_FILE_OFFSET_BITS=64 -pthread
 
 LDFLAGS = `pkg-config gdk-pixbuf-2.0 --libs` `pkg-config pangoft2 --libs`
 
@@ -13,8 +18,14 @@ SRCS := $(OBJS:.o=.c)
 
 all: 	$(TARGET)
 
-$(TARGET): $(OBJS)
-		$(CC) -shared -o $@ $(OBJS) $(LDFLAGS)
+$(TARGET): $(OBJS) $(ASM_OBJS)
+		$(CC) -shared -o $@ $(OBJS) $(ASM_OBJS) $(LDFLAGS)
+
+have_mmx.o: have_mmx.S
+	$(CC) -o $@ -c have_mmx.S
+
+scale_line_22_33_mmx.o: scale_line_22_33_mmx.S
+	$(CC) -o $@ -c scale_line_22_33_mmx.S
 
 depend:	$(SRCS)
 		$(CC) -MM $(CFLAGS) $^ 1>.depend
@@ -23,7 +34,7 @@ dist-clean:	clean
 		rm -f .depend
 
 clean:	
-		rm -f $(OBJS) $(TARGET)
+		rm -f $(OBJS) $(ASM_OBJS) $(TARGET)
 
 ifneq ($(wildcard .depend),)
 include .depend
diff --git a/src/modules/gtk2/configure b/src/modules/gtk2/configure
index 04a232a..34f5515 100755
--- a/src/modules/gtk2/configure
+++ b/src/modules/gtk2/configure
@@ -8,5 +8,9 @@ pixbuf			libmltgtk2.so
 pango			libmltgtk2.so
 EOF
 
+cat << EOF >> ../filters.dat
+rescale			libmltgtk2.so
+EOF
+
 fi
 
diff --git a/src/modules/gtk2/factory.c b/src/modules/gtk2/factory.c
index 38e1692..bbddaa2 100644
--- a/src/modules/gtk2/factory.c
+++ b/src/modules/gtk2/factory.c
@@ -22,6 +22,7 @@
 
 #include "producer_pixbuf.h"
 #include "producer_pango.h"
+#include "filter_rescale.h"
 
 void *mlt_create_producer( char *id, void *arg )
 {
@@ -34,6 +35,8 @@ void *mlt_create_producer( char *id, void *arg )
 
 void *mlt_create_filter( char *id, void *arg )
 {
+	if ( !strcmp( id, "rescale" ) )
+		return filter_rescale_init( arg );
 	return NULL;
 }
 
diff --git a/src/modules/gtk2/filter_rescale.c b/src/modules/gtk2/filter_rescale.c
new file mode 100644
index 0000000..cf0905e
--- /dev/null
+++ b/src/modules/gtk2/filter_rescale.c
@@ -0,0 +1,143 @@
+/*
+ * filter_rescale.c -- scale the producer video frame size to match the consumer
+ * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
+ * Author: Dan Dennedy <dan@dennedy.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include "filter_rescale.h"
+#include "pixops.h"
+
+#include <framework/mlt_frame.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <gdk-pixbuf/gdk-pixbuf.h>
+
+/** Fetch the frame image and rescale it to the requested *width x *height.
+	Formats other than yuv422, rgb24 and rgb24a are handed back unscaled. */
+
+static int filter_get_image( mlt_frame this, uint8_t **image, mlt_image_format *format, int *width, int *height, int writable )
+{
+	if ( *width == 0 )
+		*width = 720;
+	if ( *height == 0 )
+		*height = 576;
+
+	mlt_properties properties = mlt_frame_properties( this );
+	int iwidth = *width;
+	int iheight = *height;
+	int owidth = *width;
+	int oheight = *height;
+	uint8_t *input = NULL;
+	// A missing or unrecognised "rescale.interp" property means bilinear
+	char *interps = mlt_properties_get( properties, "rescale.interp" );
+	int interp = PIXOPS_INTERP_BILINEAR;
+	if ( interps != NULL && strcmp( interps, "nearest" ) == 0 )
+		interp = PIXOPS_INTERP_NEAREST;
+	else if ( interps != NULL && strcmp( interps, "tiles" ) == 0 )
+		interp = PIXOPS_INTERP_TILES;
+	else if ( interps != NULL && strcmp( interps, "hyper" ) == 0 )
+		interp = PIXOPS_INTERP_HYPER;
+
+	mlt_frame_get_image( this, &input, format, &iwidth, &iheight, 0 );
+	*image = input;	// default result: the image exactly as it was delivered
+	// If width and height are correct, don't do anything
+	if ( iwidth != owidth || iheight != oheight )
+	{
+		if ( *format == mlt_image_yuv422 )
+		{
+			// Create the output image
+			uint8_t *output = malloc( owidth * oheight * 2 );
+
+			// Calculate strides
+			int istride = iwidth * 2;
+			int ostride = owidth * 2;
+
+			yuv422_scale_simple( output, owidth, oheight, ostride, input, iwidth, iheight, istride, interp );
+			// Now update the frame
+			mlt_properties_set_data( properties, "image", output, owidth * oheight * 2, free, NULL );
+			mlt_properties_set_int( properties, "width", owidth );
+			mlt_properties_set_int( properties, "height", oheight );
+			// Return the output
+			*image = output;
+		}
+		else if ( *format == mlt_image_rgb24 || *format == mlt_image_rgb24a )
+		{
+			int bpp = (*format == mlt_image_rgb24a ? 4 : 3 );
+			// bits_per_sample is per channel: gdk-pixbuf only accepts 8
+			GdkPixbuf *pixbuf = gdk_pixbuf_new_from_data( input, GDK_COLORSPACE_RGB,
+				(*format == mlt_image_rgb24a), 8, iwidth, iheight,
+				iwidth * bpp, NULL, NULL );
+			GdkPixbuf *scaled = gdk_pixbuf_scale_simple( pixbuf, owidth, oheight, interp );
+
+			// Create the output image
+			uint8_t *output = malloc( owidth * oheight * bpp );
+
+			int i;
+			for ( i = 0; i < oheight; i++ )
+				memcpy( output + i * owidth * bpp,
+						gdk_pixbuf_get_pixels( scaled ) + i * gdk_pixbuf_get_rowstride( scaled ),
+						gdk_pixbuf_get_width( scaled ) * bpp );
+
+			g_object_unref( pixbuf );
+			g_object_unref( scaled );
+			// Now update the frame
+			mlt_properties_set_data( properties, "image", output, owidth * oheight * bpp, free, NULL );
+			mlt_properties_set_int( properties, "width", owidth );
+			mlt_properties_set_int( properties, "height", oheight );
+			// Return the output
+			*image = output;
+		}
+		else
+		{
+			// Unsupported format: report the size of the unscaled image
+			*width = iwidth;
+			*height = iheight;
+		}
+	}
+	return 0;
+}
+
+/** Filter processing.
+*/
+
+static mlt_frame filter_process( mlt_filter this, mlt_frame frame )
+{
+	mlt_frame_push_get_image( frame, filter_get_image );
+	char *mode = mlt_properties_get( mlt_filter_properties( this ), "interpolation" );
+	mlt_properties_set( mlt_frame_properties( frame ), "rescale.interp", mode );
+	return frame;
+}
+
+/** Constructor for the filter; arg optionally names the interpolation mode
+	stored in the "interpolation" property, which defaults to "bilinear".
+*/
+
+mlt_filter filter_rescale_init( char *arg )
+{
+	mlt_filter this = calloc( 1, sizeof( struct mlt_filter_s ) );
+	// calloc can fail: never pass a NULL filter on to mlt_filter_init
+	if ( this != NULL && mlt_filter_init( this, this ) == 0 )
+	{
+		this->process = filter_process;
+		mlt_properties_set( mlt_filter_properties( this ), "interpolation",
+			arg != NULL ? arg : "bilinear" );
+	}
+	return this;
+}
+
diff --git a/src/modules/gtk2/filter_rescale.h b/src/modules/gtk2/filter_rescale.h
new file mode 100644
index 0000000..58340ff
--- /dev/null
+++ b/src/modules/gtk2/filter_rescale.h
@@ -0,0 +1,28 @@
+/*
+ * filter_rescale.h -- scale the producer video frame size to match the consumer
+ * Copyright (C) 2003-2004 Ushodaya Enterprises Limited
+ * Author: Dan Dennedy <dan@dennedy.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef _FILTER_RESCALE_H_
+#define _FILTER_RESCALE_H_
+
+#include <framework/mlt_filter.h>
+
+extern mlt_filter filter_rescale_init( char *arg );
+
+#endif
diff --git a/src/modules/gtk2/have_mmx.S b/src/modules/gtk2/have_mmx.S
new file mode 100644
index 0000000..4f8f5d8
--- /dev/null
+++ b/src/modules/gtk2/have_mmx.S
@@ -0,0 +1,53 @@
+	.file	"have_mmx.S"
+	.version	"01.01"
+gcc2_compiled.:
+.text
+	.align 16
+
+#if !defined(__MINGW32__) && !defined(__CYGWIN__)	
+/* int pixops_have_mmx (void): leaves 1 in eax when the feature test passes, else 0 */
+.globl pixops_have_mmx
+	.type	 pixops_have_mmx,@function
+pixops_have_mmx:
+
+#else
+/* MinGW and Cygwin builds export the same routine with a leading underscore */
+.globl _pixops_have_mmx
+_pixops_have_mmx:
+
+#endif
+	
+	push	%ebx			/* ebx is used as scratch below and restored before ret */
+
+# Check if bit 21 in flags word is writeable
+
+	pushfl				/* copy the flags word into eax via the stack */
+	popl	%eax
+	movl	%eax,%ebx		/* keep the original flags for the comparison */
+	xorl	$0x00200000, %eax	/* toggle bit 21 */
+	pushl   %eax
+	popfl				/* try to write the toggled value back */
+	pushfl
+	popl	%eax			/* re-read the flags word */
+
+	cmpl	%eax, %ebx
+
+	je .notfound			/* unchanged: bit 21 is not writeable */
+
+# OK, we have CPUID
+
+	movl	$1, %eax		/* CPUID leaf 1 */
+	cpuid
+	
+	test	$0x00800000, %edx	/* bit 23 of edx; presumably the MMX flag, given the name */
+	jz	.notfound
+
+	movl	$1, %eax		/* return value 1 */
+	jmp	.out
+
+.notfound:
+	movl  	$0, %eax		/* return value 0 */
+.out:	
+	popl	%ebx
+	ret
+
diff --git a/src/modules/gtk2/pixops.c b/src/modules/gtk2/pixops.c
new file mode 100644
index 0000000..9fe02b0
--- /dev/null
+++ b/src/modules/gtk2/pixops.c
@@ -0,0 +1,781 @@
+/* GdkPixbuf library - Scaling and compositing functions
+ *
+ * Copyright (C) 1999 The Free Software Foundation
+ *
+ * Author: Owen Taylor <otaylor@redhat.com>
+ * Modified for YUV422 by Dan Dennedy <dan@dennedy.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+ 
+#include <math.h>
+#include <glib.h>
+#include <stdio.h>
+
+#include "pixops.h"
+
+#define SUBSAMPLE_BITS 4
+#define SUBSAMPLE (1 << SUBSAMPLE_BITS)
+#define SUBSAMPLE_MASK ((1 << SUBSAMPLE_BITS)-1)
+#define SCALE_SHIFT 16
+
+typedef struct _PixopsFilter PixopsFilter;
+typedef struct _PixopsFilterDimension PixopsFilterDimension;
+
+struct _PixopsFilterDimension
+{
+	int n;			/* number of source samples combined per output sample on this axis */
+	double offset;		/* source-space offset in pixels, added to the start position (fixed-point scaled by the users) */
+	double *weights;	/* SUBSAMPLE * n weights: one run of n per sub-pixel offset */
+};
+
+struct _PixopsFilter
+{
+	PixopsFilterDimension x;	/* horizontal filter */
+	PixopsFilterDimension y;	/* vertical filter */
+	double overall_alpha;		/* factor multiplied into every weight by make_filter_table */
+};
+
+typedef guchar *( *PixopsLineFunc ) ( int *weights, int n_x, int n_y,
+                                      guchar *dest, int dest_x, guchar *dest_end,
+                                      guchar **src,
+                                      int x_init, int x_step, int src_width );
+
+typedef void ( *PixopsPixelFunc ) ( guchar *dest, guint y1, guint cr, guint y2, guint cb );
+
+
+/* mmx function declarations */
+#ifdef USE_MMX
+guchar *pixops_scale_line_22_33_mmx ( guint32 weights[ 16 ][ 8 ], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init );
+int pixops_have_mmx ( void );
+#endif
+
+static inline int
+get_check_shift ( int check_size )
+{
+	int check_shift = 0;	/* counts the trailing zero bits of check_size */
+	/* zero has no set bit, so the loop below would never terminate on it */
+	g_return_val_if_fail ( check_size > 0, 4 );
+	while ( !( check_size & 1 ) )
+	{
+		check_shift++;
+		check_size >>= 1;
+	}
+
+	return check_shift;
+}
+
+static inline void
+pixops_scale_nearest ( guchar *dest_buf,
+                       int render_x0,
+                       int render_y0,
+                       int render_x1,
+                       int render_y1,
+                       int dest_rowstride,
+                       const guchar *src_buf,
+                       int src_width,
+                       int src_height,
+                       int src_rowstride,
+                       double scale_x,
+                       double scale_y )
+{
+	int i, j;
+	int x;
+	int x_step = ( 1 << SCALE_SHIFT ) / scale_x;
+	int y_step = ( 1 << SCALE_SHIFT ) / scale_y;
+
+	for ( i = 0; i < ( render_y1 - render_y0 ); i++ )
+	{
+		const guchar *src = src_buf + ( ( ( i + render_y0 ) * y_step + y_step / 2 ) >> SCALE_SHIFT ) * src_rowstride;
+		guchar *dest = dest_buf + i * dest_rowstride;
+
+		x = render_x0 * x_step + x_step / 2;
+
+		for ( j = 0; j < ( render_x1 - render_x0 ); j++ )
+		{
+			const guchar *p = src + ( x >> SCALE_SHIFT ) * 4;
+			guint32 *p32;
+
+			p32 = ( guint32 * ) dest;
+			*p32 = *( ( guint32 * ) p );
+
+			dest += 4;
+			x += x_step;
+		}
+	}
+}
+
+
+static inline guchar *
+scale_line ( int *weights, int n_x, int n_y,
+             guchar *dest, int dest_x, guchar *dest_end,
+             guchar **src,
+             int x_init, int x_step, int src_width )
+{
+	int x = x_init;
+	int i, j;
+
+	while ( dest < dest_end )
+	{
+		int x_scaled = x >> SCALE_SHIFT;
+		int *pixel_weights;
+
+		pixel_weights = weights + ( ( x >> ( SCALE_SHIFT - SUBSAMPLE_BITS ) ) & SUBSAMPLE_MASK ) * n_x * n_y;
+
+		unsigned int y1 = 0, cb = 0, y2 = 0, cr = 0;
+		for ( i = 0; i < n_y; i++ )
+		{
+			guchar *q = src[ i ] + x_scaled * 4;
+			int *line_weights = pixel_weights + n_x * i;
+
+			for ( j = 0; j < n_x; j++ )
+			{
+				unsigned int ta = line_weights[ j ];
+
+				y1 += ta * q[ 0 ];
+				cb += ta * q[ 1 ];
+				y2 += ta * q[ 2 ];
+				cr += ta * q[ 3 ];
+
+				q += 4;
+			}
+		}
+
+		dest[ 0 ] = ( y1 + 0xffff ) >> 16;
+		dest[ 1 ] = ( cb + 0xffff ) >> 16;
+		dest[ 2 ] = ( y2 + 0xffff ) >> 16;
+		dest[ 3 ] = ( cr + 0xffff ) >> 16;
+
+		dest += 4;
+
+		x += x_step;
+	}
+
+	return dest;
+}
+
+#ifdef USE_MMX
+static inline guchar *
+scale_line_22_33_mmx_stub ( int *weights, int n_x, int n_y,
+                            guchar *dest, int dest_x, guchar *dest_end,
+                            guchar **src,
+                            int x_init, int x_step, int src_width )
+{
+	guint32 mmx_weights[ 16 ][ 8 ];
+	int j;
+
+	for ( j = 0; j < 16; j++ )
+	{
+		mmx_weights[ j ][ 0 ] = 0x00010001 * ( weights[ 4 * j ] >> 8 );
+		mmx_weights[ j ][ 1 ] = 0x00010001 * ( weights[ 4 * j ] >> 8 );
+		mmx_weights[ j ][ 2 ] = 0x00010001 * ( weights[ 4 * j + 1 ] >> 8 );
+		mmx_weights[ j ][ 3 ] = 0x00010001 * ( weights[ 4 * j + 1 ] >> 8 );
+		mmx_weights[ j ][ 4 ] = 0x00010001 * ( weights[ 4 * j + 2 ] >> 8 );
+		mmx_weights[ j ][ 5 ] = 0x00010001 * ( weights[ 4 * j + 2 ] >> 8 );
+		mmx_weights[ j ][ 6 ] = 0x00010001 * ( weights[ 4 * j + 3 ] >> 8 );
+		mmx_weights[ j ][ 7 ] = 0x00010001 * ( weights[ 4 * j + 3 ] >> 8 );
+	}
+
+	return pixops_scale_line_22_33_mmx ( mmx_weights, dest, src[ 0 ], src[ 1 ], x_step, dest_end, x_init );
+}
+#endif /* USE_MMX */
+
+static inline guchar *
+scale_line_22_33 ( int *weights, int n_x, int n_y,
+                   guchar *dest, int dest_x, guchar *dest_end,
+                   guchar **src,
+                   int x_init, int x_step, int src_width )
+{
+	int x = x_init;
+	guchar *src0 = src[ 0 ];
+	guchar *src1 = src[ 1 ];
+
+	while ( dest < dest_end )
+	{
+		unsigned int y1, cb, y2, cr;
+		int x_scaled = x >> SCALE_SHIFT;
+		int *pixel_weights;
+		guchar *q0, *q1;
+		int w1, w2, w3, w4;
+
+		q0 = src0 + x_scaled * 4;
+		q1 = src1 + x_scaled * 4;
+
+		pixel_weights = weights + ( ( x >> ( SCALE_SHIFT - SUBSAMPLE_BITS ) ) & SUBSAMPLE_MASK ) * 4;
+
+		w1 = pixel_weights[ 0 ];
+		w2 = pixel_weights[ 1 ];
+		w3 = pixel_weights[ 2 ];
+		w4 = pixel_weights[ 3 ];
+
+		y1 = w1 * q0[ 0 ];
+		cb = w1 * q0[ 1 ];
+		y2 = w1 * q0[ 2 ];
+		cr = w1 * q0[ 3 ];
+
+		y1 += w2 * q0[ 4 ];
+		cb += w2 * q0[ 5 ];
+		y2 += w2 * q0[ 6 ];
+		cr += w2 * q0[ 7 ];
+
+		y1 += w3 * q1[ 0 ];
+		cb += w3 * q1[ 1 ];
+		y2 += w3 * q1[ 2 ];
+		cr += w3 * q1[ 3 ];
+
+		y1 += w4 * q1[ 4 ];
+		cb += w4 * q1[ 5 ];
+		y2 += w4 * q1[ 6 ];
+		cr += w4 * q1[ 7 ];
+
+		dest[ 0 ] = ( y1 + 0x8000 ) >> 16;
+		dest[ 1 ] = ( cb + 0x8000 ) >> 16;
+		dest[ 2 ] = ( y2 + 0x8000 ) >> 16;
+		dest[ 3 ] = ( cr + 0x8000 ) >> 16;
+
+		dest += 4;
+		x += x_step;
+	}
+
+	return dest;
+}
+
+
+static inline void
+process_pixel ( int *weights, int n_x, int n_y,
+                guchar *dest, int dest_x, int dest_channels,
+                guchar **src, int src_channels,
+                int x_start, int src_width )
+{
+	unsigned int y1 = 0, cb = 0, y2 = 0, cr = 0;
+	int i, j;
+
+	for ( i = 0; i < n_y; i++ )
+	{
+		int *line_weights = weights + n_x * i;
+
+		for ( j = 0; j < n_x; j++ )
+		{
+			unsigned int ta;
+			guchar *q;
+
+			if ( x_start + j < 0 )
+				q = src[ i ];
+			else if ( x_start + j < src_width )
+				q = src[ i ] + ( x_start + j ) * src_channels;
+			else
+				q = src[ i ] + ( src_width - 1 ) * src_channels;
+
+			ta = 0xff * line_weights[ j ];
+
+			y1 += ta * q[ 0 ];
+			cb += ta * q[ 1 ];
+			y2 += ta * q[ 2 ];
+			cr += ta * q[ 3 ];
+		}
+	}
+
+	dest[ 0 ] = ( y1 + 0xffffff ) >> 24;
+	dest[ 1 ] = ( cb + 0xffffff ) >> 24;
+	dest[ 2 ] = ( y2 + 0xffffff ) >> 24;
+	dest[ 3 ] = ( cr + 0xffffff ) >> 24;
+}
+
+
+static inline void
+correct_total ( int *weights,
+                int n_x,
+                int n_y,
+                int total,
+                double overall_alpha )
+{
+	int correction = ( int ) ( 0.5 + 65536 * overall_alpha ) - total;
+	int remaining, c, d, i;
+
+	if ( correction != 0 )
+	{
+		remaining = correction;
+		for ( d = 1, c = correction; c != 0 && remaining != 0; d++, c = correction / d )
+			for ( i = n_x * n_y - 1; i >= 0 && c != 0 && remaining != 0; i-- )
+				if ( *( weights + i ) + c >= 0 )
+				{
+					*( weights + i ) += c;
+					remaining -= c;
+					if ( ( 0 < remaining && remaining < c ) ||
+					        ( 0 > remaining && remaining > c ) )
+						c = remaining;
+				}
+	}
+}
+
+
+static inline int *
+make_filter_table ( PixopsFilter *filter )
+{
+	int i_offset, j_offset;
+	int n_x = filter->x.n;
+	int n_y = filter->y.n;
+	int *weights = g_new ( int, SUBSAMPLE * SUBSAMPLE * n_x * n_y );
+
+	for ( i_offset = 0; i_offset < SUBSAMPLE; i_offset++ )
+		for ( j_offset = 0; j_offset < SUBSAMPLE; j_offset++ )
+		{
+			double weight;
+			int *pixel_weights = weights + ( ( i_offset * SUBSAMPLE ) + j_offset ) * n_x * n_y;
+			int total = 0;
+			int i, j;
+
+			for ( i = 0; i < n_y; i++ )
+				for ( j = 0; j < n_x; j++ )
+				{
+					weight = filter->x.weights[ ( j_offset * n_x ) + j ] *
+					         filter->y.weights[ ( i_offset * n_y ) + i ] *
+					         filter->overall_alpha * 65536 + 0.5;
+
+					total += ( int ) weight;
+
+					*( pixel_weights + n_x * i + j ) = weight;
+				}
+
+			correct_total ( pixel_weights, n_x, n_y, total, filter->overall_alpha );
+		}
+
+	return weights;
+}
+
+
+static inline void
+pixops_process ( guchar *dest_buf,
+                 int render_x0,
+                 int render_y0,
+                 int render_x1,
+                 int render_y1,
+                 int dest_rowstride,
+                 int dest_channels,
+                 gboolean dest_has_alpha,
+                 const guchar *src_buf,
+                 int src_width,
+                 int src_height,
+                 int src_rowstride,
+                 int src_channels,
+                 gboolean src_has_alpha,
+                 double scale_x,
+                 double scale_y,
+                 int check_x,
+                 int check_y,
+                 int check_size,
+                 guint32 color1,
+                 guint32 color2,
+                 PixopsFilter *filter,
+                 PixopsLineFunc line_func )
+{
+	int i, j;
+	int x, y;			/* X and Y position in source (fixed_point) */
+
+	guchar **line_bufs = g_new ( guchar *, filter->y.n );
+	int *filter_weights = make_filter_table ( filter );
+
+	int x_step = ( 1 << SCALE_SHIFT ) / scale_x; /* X step in source (fixed point) */
+	int y_step = ( 1 << SCALE_SHIFT ) / scale_y; /* Y step in source (fixed point) */
+
+	int check_shift = check_size ? get_check_shift ( check_size ) : 0;
+
+	int scaled_x_offset = floor ( filter->x.offset * ( 1 << SCALE_SHIFT ) );
+
+	/* Compute the index where we run off the end of the source buffer. The furthest
+	 * source pixel we access at index i is:
+	 *
+	 *  ((render_x0 + i) * x_step + scaled_x_offset) >> SCALE_SHIFT + filter->x.n - 1
+	 *
+	 * So, run_end_index is the smallest i for which this pixel is src_width, i.e, for which:
+	 *
+	 *  (i + render_x0) * x_step >= ((src_width - filter->x.n + 1) << SCALE_SHIFT) - scaled_x_offset
+	 *
+	 */
+#define MYDIV(a,b) ((a) > 0 ? (a) / (b) : ((a) - (b) + 1) / (b))    /* Division so that -1/5 = -1 */
+
+	int run_end_x = ( ( ( src_width - filter->x.n + 1 ) << SCALE_SHIFT ) - scaled_x_offset );
+	int run_end_index = MYDIV ( run_end_x + x_step - 1, x_step ) - render_x0;
+	run_end_index = MIN ( run_end_index, render_x1 - render_x0 );
+
+	y = render_y0 * y_step + floor ( filter->y.offset * ( 1 << SCALE_SHIFT ) );
+	for ( i = 0; i < ( render_y1 - render_y0 ); i++ )
+	{
+		int dest_x;
+		int y_start = y >> SCALE_SHIFT;
+		int x_start;
+		int *run_weights = filter_weights +
+		                   ( ( y >> ( SCALE_SHIFT - SUBSAMPLE_BITS ) ) & SUBSAMPLE_MASK ) *
+		                   filter->x.n * filter->y.n * SUBSAMPLE;
+		guchar *new_outbuf;
+		guint32 tcolor1, tcolor2;
+
+		guchar *outbuf = dest_buf + dest_rowstride * i;
+		guchar *outbuf_end = outbuf + dest_channels * ( render_x1 - render_x0 );
+
+		if ( ( ( i + check_y ) >> check_shift ) & 1 )
+		{
+			tcolor1 = color2;
+			tcolor2 = color1;
+		}
+		else
+		{
+			tcolor1 = color1;
+			tcolor2 = color2;
+		}
+
+		for ( j = 0; j < filter->y.n; j++ )
+		{
+			if ( y_start < 0 )
+				line_bufs[ j ] = ( guchar * ) src_buf;
+			else if ( y_start < src_height )
+				line_bufs[ j ] = ( guchar * ) src_buf + src_rowstride * y_start;
+			else
+				line_bufs[ j ] = ( guchar * ) src_buf + src_rowstride * ( src_height - 1 );
+
+			y_start++;
+		}
+
+		dest_x = check_x;
+		x = render_x0 * x_step + scaled_x_offset;
+		x_start = x >> SCALE_SHIFT;
+
+		while ( x_start < 0 && outbuf < outbuf_end )
+		{
+			process_pixel ( run_weights + ( ( x >> ( SCALE_SHIFT - SUBSAMPLE_BITS ) ) & SUBSAMPLE_MASK ) * ( filter->x.n * filter->y.n ),
+			                filter->x.n, filter->y.n,
+			                outbuf, dest_x, dest_channels,
+			                line_bufs, src_channels,
+			                x >> SCALE_SHIFT, src_width );
+
+			x += x_step;
+			x_start = x >> SCALE_SHIFT;
+			dest_x++;
+			outbuf += dest_channels;
+		}
+
+		new_outbuf = ( *line_func ) ( run_weights, filter->x.n, filter->y.n,
+		                              outbuf, dest_x,
+		                              dest_buf + dest_rowstride * i + run_end_index * dest_channels,
+		                              line_bufs,
+		                              x, x_step, src_width );
+
+		dest_x += ( new_outbuf - outbuf ) / dest_channels;
+
+		x = ( dest_x - check_x + render_x0 ) * x_step + scaled_x_offset;
+		outbuf = new_outbuf;
+
+		while ( outbuf < outbuf_end )
+		{
+			process_pixel ( run_weights + ( ( x >> ( SCALE_SHIFT - SUBSAMPLE_BITS ) ) & SUBSAMPLE_MASK ) * ( filter->x.n * filter->y.n ),
+			                filter->x.n, filter->y.n,
+			                outbuf, dest_x, dest_channels,
+			                line_bufs, src_channels,
+			                x >> SCALE_SHIFT, src_width );
+
+			x += x_step;
+			dest_x++;
+			outbuf += dest_channels;
+		}
+
+		y += y_step;
+	}
+
+	g_free ( line_bufs );
+	g_free ( filter_weights );
+}
+
+
+/* Compute weights for reconstruction by replication followed by
+ * sampling with a box filter
+ */
+static inline void
+tile_make_weights ( PixopsFilterDimension *dim,
+                    double scale )
+{
+	int n = ceil ( 1 / scale + 1 );
+	double *pixel_weights = g_new ( double, SUBSAMPLE * n );
+	int offset;
+	int i;
+
+	dim->n = n;
+	dim->offset = 0;
+	dim->weights = pixel_weights;
+
+	for ( offset = 0; offset < SUBSAMPLE; offset++ )
+	{
+		double x = ( double ) offset / SUBSAMPLE;
+		double a = x + 1 / scale;
+
+		for ( i = 0; i < n; i++ )
+		{
+			if ( i < x )
+			{
+				if ( i + 1 > x )
+					* ( pixel_weights++ ) = ( MIN ( i + 1, a ) - x ) * scale;
+				else
+					*( pixel_weights++ ) = 0;
+			}
+			else
+			{
+				if ( a > i )
+					* ( pixel_weights++ ) = ( MIN ( i + 1, a ) - i ) * scale;
+				else
+					*( pixel_weights++ ) = 0;
+			}
+		}
+	}
+}
+
+/* Compute weights for a filter that, for minification
+ * is the same as 'tiles', and for magnification, is bilinear
+ * reconstruction followed by a sampling with a delta function.
+ */
+static inline void
+bilinear_magnify_make_weights ( PixopsFilterDimension *dim,
+                                double scale )
+{
+	double * pixel_weights;
+	int n;
+	int offset;
+	int i;
+
+	if ( scale > 1.0 )              /* Linear */
+	{
+		n = 2;
+		dim->offset = 0.5 * ( 1 / scale - 1 );
+	}
+	else                          /* Tile */
+	{
+		n = ceil ( 1.0 + 1.0 / scale );
+		dim->offset = 0.0;
+	}
+
+	dim->n = n;
+	dim->weights = g_new ( double, SUBSAMPLE * n );
+
+	pixel_weights = dim->weights;
+
+	for ( offset = 0; offset < SUBSAMPLE; offset++ )
+	{
+		double x = ( double ) offset / SUBSAMPLE;
+
+		if ( scale > 1.0 )        /* Linear */
+		{
+			for ( i = 0; i < n; i++ )
+				*( pixel_weights++ ) = ( ( ( i == 0 ) ? ( 1 - x ) : x ) / scale ) * scale;
+		}
+		else                  /* Tile */
+		{
+			double a = x + 1 / scale;
+
+			/*           x
+			 * ---------|--.-|----|--.-|-------  SRC
+			 * ------------|---------|---------  DEST
+			 */
+			for ( i = 0; i < n; i++ )
+			{
+				if ( i < x )
+				{
+					if ( i + 1 > x )
+						* ( pixel_weights++ ) = ( MIN ( i + 1, a ) - x ) * scale;
+					else
+						*( pixel_weights++ ) = 0;
+				}
+				else
+				{
+					if ( a > i )
+						* ( pixel_weights++ ) = ( MIN ( i + 1, a ) - i ) * scale;
+					else
+						*( pixel_weights++ ) = 0;
+				}
+			}
+		}
+	}
+}
+
+/* Computes the integral from b0 to b1 of
+ *
+ * f(x) = x; 0 <= x < 1
+ * f(x) = 0; otherwise
+ *
+ * We combine two of these to compute the convolution of
+ * a box filter with a triangular spike.
+ */
+static inline double
+linear_box_half ( double b0, double b1 )
+{
+	/* f is non-zero only on [0, 1), so integrate x over the part of
+	 * [b0, b1] that overlaps the unit interval; lo and hi are the
+	 * clipped integration bounds.
+	 */
+	double lo, hi;
+
+	if ( b0 > 0. )
+	{
+		/* Window starts right of the origin; no overlap once b0 reaches 1 */
+		if ( !( b0 < 1. ) )
+			return 0;
+
+		lo = b0;
+	}
+	else
+	{
+		/* Window starts at or left of the origin; no overlap unless b1 > 0 */
+		if ( !( b1 > 0. ) )
+			return 0;
+
+		lo = 0.;
+	}
+
+	/* Upper bound: the smaller of 1 and b1 */
+	hi = ( 1. < b1 ) ? 1. : b1;
+
+	/* The antiderivative of x is x * x / 2 */
+	return 0.5 * ( hi * hi - lo * lo );
+}
+
+/* Compute weights for reconstructing with bilinear
+ * interpolation, then sampling with a box filter
+ */
+static inline void
+bilinear_box_make_weights ( PixopsFilterDimension *dim,
+                            double scale )
+{
+	int n = ceil ( 1 / scale + 2.0 );
+	double *pixel_weights = g_new ( double, SUBSAMPLE * n );
+	double w;
+	int offset, i;
+
+	dim->offset = -1.0;
+	dim->n = n;
+	dim->weights = pixel_weights;
+
+	for ( offset = 0 ; offset < SUBSAMPLE; offset++ )
+	{
+		double x = ( double ) offset / SUBSAMPLE;
+		double a = x + 1 / scale;
+
+		for ( i = 0; i < n; i++ )
+		{
+			w = linear_box_half ( 0.5 + i - a, 0.5 + i - x );
+			w += linear_box_half ( 1.5 + x - i, 1.5 + a - i );
+
+			*( pixel_weights++ ) = w * scale;
+		}
+	}
+}
+
+
+static inline void
+make_weights ( PixopsFilter *filter,
+               PixopsInterpType interp_type,
+               double scale_x,
+               double scale_y )
+{
+	switch ( interp_type )
+	{
+	case PIXOPS_INTERP_NEAREST:
+		g_assert_not_reached ();
+		break;
+
+	case PIXOPS_INTERP_TILES:
+		tile_make_weights ( &filter->x, scale_x );
+		tile_make_weights ( &filter->y, scale_y );
+		break;
+
+	case PIXOPS_INTERP_BILINEAR:
+		bilinear_magnify_make_weights ( &filter->x, scale_x );
+		bilinear_magnify_make_weights ( &filter->y, scale_y );
+		break;
+
+	case PIXOPS_INTERP_HYPER:
+		bilinear_box_make_weights ( &filter->x, scale_x );
+		bilinear_box_make_weights ( &filter->y, scale_y );
+		break;
+	}
+}
+
+
+void
+yuv422_scale ( guchar *dest_buf,
+               int render_x0,
+               int render_y0,
+               int render_x1,
+               int render_y1,
+               int dest_rowstride,
+               int dest_channels,
+               gboolean dest_has_alpha,
+               const guchar *src_buf,
+               int src_width,
+               int src_height,
+               int src_rowstride,
+               int src_channels,
+               gboolean src_has_alpha,
+               double scale_x,
+               double scale_y,
+               PixopsInterpType interp_type )
+{
+	PixopsFilter filter;
+	PixopsLineFunc line_func;
+
+#ifdef USE_MMX
+	gboolean found_mmx = pixops_have_mmx();
+#endif
+
+	/* Scales the source image by scale_x / scale_y and writes the destination
+	 * rectangle render_x0..render_x1 by render_y0..render_y1 into dest_buf.
+	 * A zero scale factor returns immediately and leaves dest_buf untouched. */
+
+	if ( scale_x == 0 || scale_y == 0 )
+		return ;
+
+	if ( interp_type == PIXOPS_INTERP_NEAREST )
+	{
+		pixops_scale_nearest ( dest_buf, render_x0, render_y0, render_x1, render_y1,
+		                       dest_rowstride,
+		                       src_buf, src_width, src_height, src_rowstride,
+		                       scale_x, scale_y );
+		return ;
+	}
+
+	filter.overall_alpha = 1.0;
+	make_weights ( &filter, interp_type, scale_x, scale_y );
+
+	/* A 2x2 filter gets the unrolled line function; anything else the generic one */
+	if ( filter.x.n == 2 && filter.y.n == 2 )
+	{
+#ifdef USE_MMX
+		if ( 0 && found_mmx )	/* MMX line function is compiled in but switched off here */
+			line_func = scale_line_22_33_mmx_stub;
+		else
+#endif
+
+			line_func = scale_line_22_33;
+	}
+	else
+		line_func = scale_line;
+
+	pixops_process ( dest_buf, render_x0, render_y0, render_x1, render_y1,
+	                 dest_rowstride, dest_channels, dest_has_alpha,
+	                 src_buf, src_width, src_height, src_rowstride, src_channels,
+	                 src_has_alpha, scale_x, scale_y, 0, 0, 0, 0, 0,
+	                 &filter, line_func );
+
+	g_free ( filter.x.weights );
+	g_free ( filter.y.weights );
+}
+
diff --git a/src/modules/gtk2/pixops.h b/src/modules/gtk2/pixops.h
new file mode 100644
index 0000000..37d6f37
--- /dev/null
+++ b/src/modules/gtk2/pixops.h
@@ -0,0 +1,69 @@
+/* GdkPixbuf library - Scaling and compositing functions
+ *
+ * Copyright (C) 1999 The Free Software Foundation
+ *
+ * Author: Owen Taylor <otaylor@redhat.com>
+ * Modified for YUV422 by: Dan Dennedy <dan@dennedy.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef PIXOPS_H
+#define PIXOPS_H
+
+#include <glib.h>
+
+/* Interpolation modes; numeric values must match GdkInterpType */
+typedef enum {
+	PIXOPS_INTERP_NEAREST = 0,
+	PIXOPS_INTERP_TILES = 1,
+	PIXOPS_INTERP_BILINEAR = 2,
+	PIXOPS_INTERP_HYPER = 3
+} PixopsInterpType;
+
+/* Scale src_buf (src_width x src_height) by scale_x, scale_y and composite
+ * the rectangle render_x0, render_y0 .. render_x1, render_y1 of the scaled
+ * coordinate system into dest_buf starting at 0, 0. A zero scale factor
+ * makes the call a no-op; PIXOPS_INTERP_NEAREST takes a separate path
+ * that ignores the channel and alpha arguments. */
+void yuv422_scale     (guchar         *dest_buf,
+		       int             render_x0,
+		       int             render_y0,
+		       int             render_x1,
+		       int             render_y1,
+		       int             dest_rowstride,
+		       int             dest_channels,
+		       gboolean        dest_has_alpha,
+		       const guchar   *src_buf,
+		       int             src_width,
+		       int             src_height,
+		       int             src_rowstride,
+		       int             src_channels,
+		       gboolean        src_has_alpha,
+		       double          scale_x,
+		       double          scale_y,
+		       PixopsInterpType   interp_type);
+
+/* Scale a whole image: widths are halved and 4 channels passed to yuv422_scale(); no trailing ';' so it acts as one expression. */
+#define yuv422_scale_simple( dest_buf, dest_width, dest_height, dest_rowstride, src_buf, src_width, src_height, src_rowstride, interp_type ) \
+	yuv422_scale( (dest_buf), 0, 0, (dest_width)/2, (dest_height), \
+		(dest_rowstride), 4, 0, \
+		(src_buf), (src_width)/2, (src_height), \
+		(src_rowstride), 4, 0, \
+		(double) (dest_width) / (src_width), (double) (dest_height) / (src_height), \
+		(PixopsInterpType) (interp_type) )
+
+#endif
diff --git a/src/modules/gtk2/producer_pango.c b/src/modules/gtk2/producer_pango.c
index f1952dc..7fa3764 100644
--- a/src/modules/gtk2/producer_pango.c
+++ b/src/modules/gtk2/producer_pango.c
@@ -103,7 +103,7 @@ mlt_producer producer_pango_init( const char *filename )
 					size += strlen( line ) + 1;
 					if ( markup )
 					{
-						realloc( markup, size );
+						markup = realloc( markup, size );
 						strcat( markup, line );
 					}
 					else
diff --git a/src/modules/gtk2/scale_line_22_33_mmx.S b/src/modules/gtk2/scale_line_22_33_mmx.S
new file mode 100644
index 0000000..f389217
--- /dev/null
+++ b/src/modules/gtk2/scale_line_22_33_mmx.S
@@ -0,0 +1,183 @@
+	.file	"scale_line_22_33_mmx.S"
+	.version	"01.01"
+gcc2_compiled.:
+.text
+	.align 16
+
+#if !defined(__MINGW32__) && !defined(__CYGWIN__)	
+/* All other targets: unprefixed symbol, marked as a function via .type */
+.globl pixops_scale_line_22_33_mmx
+	.type	 pixops_scale_line_22_33_mmx,@function
+pixops_scale_line_22_33_mmx:
+	
+#else
+/* MinGW / Cygwin: symbol gets a leading underscore and no .type directive */
+.globl _pixops_scale_line_22_33_mmx
+_pixops_scale_line_22_33_mmx:
+	
+#endif
+/*
+ * Writes 3-byte pixels from p up to p_end, each a weighted sum of two
+ * adjacent pixels from q1 and two from q2. Final p is returned in %eax.
+ * Arguments:
+ * weights:	 8(%ebp)	16-bit weights, 32 bytes per subsample phase
+ * p:	        12(%ebp)	%esi, destination write pointer
+ * q1:	        16(%ebp)	first source pointer (feeds %mm0, %mm1)
+ * q2:	        20(%ebp)	second source pointer (feeds %mm2, %mm3)
+ * xstep:       24(%ebp)	added to x after every pixel written
+ * p_end:       28(%ebp)	writing stops when p equals this
+ * xinit:       32(%ebp)	start value of x; x >> 16 is the source index
+*/
+
+/*
+ * Function call entry
+ */
+	pushl %ebp
+	movl %esp,%ebp
+	subl $28,%esp            # reserve locals; x_scaled lives at -24(%ebp)
+	pushl %edi
+	pushl %esi
+	pushl %ebx
+/* Locals:	
+ * int x                      %ebx
+ * int x_scaled             -24(%ebp)
+ */
+
+/*
+ * Setup
+ */
+/* Initialize variables: x = xinit (%ebx), %edx = x >> 16, %esi = p */
+	movl 32(%ebp),%ebx
+	movl 32(%ebp),%edx
+	sarl $16,%edx
+	movl 12(%ebp),%esi
+
+	cmpl 28(%ebp),%esi
+	jnb  .out                # p >= p_end (unsigned): nothing to write
+
+/* For the body of this loop, %mm0, %mm1, %mm2, %mm3 hold the 4 adjoining
+ * points we are interpolating between, as:
+ *
+ *  000000BB00GG00RR
+ */	
+	
+/* Load initial values into %mm1, %mm3 */
+	leal (%edx,%edx,2),%edx  # Multiply by 3
+
+	movl 16(%ebp),%edi
+	pxor %mm4, %mm4          # %mm4 = 0, the zero partner for the byte unpacks
+	movzbl 2(%edi,%edx),%ecx
+	shll $16,%ecx
+	movzwl (%edi,%edx),%eax
+	orl %eax,%ecx
+	movd %ecx, %mm1
+	punpcklbw %mm4, %mm1
+
+	movl 20(%ebp),%edi
+	movzbl 2(%edi,%edx),%ecx
+	shll $16,%ecx
+	movzwl (%edi,%edx),%eax
+	orl %eax,%ecx
+	movd %ecx, %mm3
+	punpcklbw %mm4, %mm3
+
+	addl $65536,%ebx         # x += 1 << 16, so x >> 16 moves on by one
+	movl %ebx,%edx
+	sarl $16,%edx
+
+	jmp .newx
+	.p2align 4,,7
+.loop:
+/* short *pixel_weights = weights + ((x >> (SCALE_SHIFT - SUBSAMPLE_BITS)) & SUBSAMPLE_MASK) * n_x * n_y
+ *                                             16             4                  0xf            2     2
+ */
+	movl %ebx,%eax
+	andl $0xf000,%eax
+	shrl $7,%eax             # byte offset into weights: phase * 32
+
+/* At this point, %edi holds weights. Load the 4 weights into %mm4,%mm5,%mm6,%mm7, multiply and
+ * accumulate.
+ */
+	movq (%edi,%eax),%mm4
+	pmullw %mm0,%mm4
+	movq 8(%edi,%eax),%mm5
+	pmullw %mm1,%mm5
+	movq 16(%edi,%eax),%mm6
+	movq 24(%edi,%eax),%mm7
+	pmullw %mm2,%mm6
+	pmullw %mm3,%mm7
+	paddw %mm4, %mm5
+	paddw %mm6, %mm7
+	paddw %mm5, %mm7
+
+/* %mm7	holds the accumulated sum. Compute (C + 0x80) / 256
+ */
+	pxor %mm4, %mm4          # also leaves %mm4 = 0 for the unpacks at .newx
+	movl $8421504, %eax  # 0x00808080
+	movd %eax, %mm6  
+	punpcklbw %mm4, %mm6
+	paddw %mm6, %mm7
+	psrlw $8, %mm7
+
+/* Pack into %eax and store result
+ */	
+	packuswb %mm7, %mm7
+	movd %mm7, %eax
+	
+	movb %al, (%esi)
+	shrl $8, %eax
+	movw %ax, 1(%esi)
+	addl $3, %esi
+		
+	cmpl %esi,28(%ebp)
+	je   .out                # p reached p_end
+
+/* x += x_step; */
+	addl 24(%ebp),%ebx
+/* x_scaled = x >> 16; */
+	movl %ebx,%edx
+	sarl $16,%edx
+
+	cmpl %edx,-24(%ebp)
+	je   .loop               # x_scaled unchanged: keep the loaded pixels
+
+.newx:
+	movl %edx,-24(%ebp)
+/*
+ * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
+ */
+	movq %mm1, %mm0
+	movq %mm3, %mm2
+	
+	leal (%edx,%edx,2),%edx  # Multiply by 3
+
+	movl 16(%ebp),%edi
+	movzbl 2(%edi,%edx),%ecx
+	shll $16,%ecx
+	movzwl (%edi,%edx),%eax
+	orl %eax,%ecx
+	movd %ecx, %mm1
+	punpcklbw %mm4, %mm1
+
+	movl 20(%ebp),%edi
+	movzbl 2(%edi,%edx),%ecx
+	shll $16,%ecx
+	movzwl (%edi,%edx),%eax
+	orl %eax,%ecx
+	movd %ecx, %mm3
+	punpcklbw %mm4, %mm3
+	
+	movl 8(%ebp),%edi        # %edi = weights, as .loop expects
+	
+	jmp .loop
+
+.out:
+	movl %esi,%eax           # return value: final p
+	emms
+	leal -40(%ebp),%esp      # 28 bytes of locals + 3 saved registers below %ebp
+	popl %ebx
+	popl %esi
+	popl %edi
+	movl %ebp,%esp
+	popl %ebp
+	ret
diff --git a/src/modules/vorbis/Makefile b/src/modules/vorbis/Makefile
index ac457c0..6e28934 100644
--- a/src/modules/vorbis/Makefile
+++ b/src/modules/vorbis/Makefile
@@ -6,7 +6,7 @@ OBJS = factory.o \
 
 CFLAGS = -I../../ -Wall -g -D_FILE_OFFSET_BITS=64
 
-LDFLAGS = -lvorbisfile
+LDFLAGS = -lvorbisfile -lvorbis  # dependent library first, then the one it needs
 
 SRCS := $(OBJS:.o=.c)
 
-- 
1.7.4.4