ffmpeg-2.1.1: move directory

git-svn-id: svn://kolibrios.org@6148 a494cfbc-eb01-0410-851d-a64ba20cac60
Sergey Semyonov (Serge)
2016-02-05 22:14:10 +00:00
parent a4b787f4b8
commit ecf3e862ea
4011 changed files with 1868 additions and 4 deletions


@@ -0,0 +1,19 @@
include $(SUBDIR)../config.mak

NAME = swscale
FFLIBS = avutil

HEADERS = swscale.h \
          version.h \

OBJS = input.o \
       options.o \
       output.o \
       rgb2rgb.o \
       swscale.o \
       swscale_unscaled.o \
       utils.o \
       yuv2rgb.o \

TESTPROGS = colorspace \
            swscale \


@@ -0,0 +1,3 @@
OBJS += bfin/internal_bfin.o \
        bfin/swscale_bfin.o  \
        bfin/yuv2rgb_bfin.o  \


@@ -0,0 +1,613 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007
*
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
  YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
  and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.

  The following calculation is used for the conversion:

    r = clipz((y - oy) * cy + crv * (v - 128))
    g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
    b = clipz((y - oy) * cy + cbu * (u - 128))

  y, u, v are prescaled by a factor of 4, i.e. left-shifted to gain precision.

  New factorization to eliminate the truncation error which was
  occurring due to the byteop3p:

  1) Use byteop16m to subtract quad bytes; since this operates on U8 data,
     the offsets need to be renormalized to 8 bits.
  2) Scale operands up by a factor of 4, not 8, because Blackfin
     multiplies include a shift.
  3) Compute into the accumulators cy * yx0, cy * yx1.
  4) Compute each of the linear equations:

       r = clipz((y - oy) * cy + crv * (v - 128))
       g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
       b = clipz((y - oy) * cy + cbu * (u - 128))

     Reuse of the accumulators requires that we actually multiply
     twice, once with an addition and the second time with a subtraction.
     Because of this we need to compute the equations in the order R, B,
     then G, saving the writes for B in the case of 24/32-bit color
     formats.

  API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                     int dW, uint32_t *coeffs);

         A            B
        ---          ---
    i2 = cb      i3 = cr
    i1 = coeff   i0 = y

  where coeffs has the following layout in memory (the gmask word follows
  cgv, giving the 11 words of COEFF_LEN):

    uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;

  coeffs is a pointer to oy.

  The {rgb} masks are only utilized by the 565 packing algorithm. Note that
  data replication is used to simplify the internal algorithms for the
  dual-MAC architecture of Blackfin.

  All routines are exported with an _ff_bfin_ symbol prefix.

  Rough performance gain compared against -O3:

    2779809/1484290 = 187.28%

  which translates to ~33 c/pel to ~57 c/pel for the reference vs ~17.5 c/pel
  for the optimized implementations. Not sure why there is such a huge
  variation in the reference code on Blackfin; I guess it must have to do
  with the memory system.
*/
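/* For reference, a minimal scalar model of the conversion described above
 * (a sketch using the usual BT.601 integer coefficients; the real code below
 * instead keeps y/u/v prescaled by 4 in fract16 registers and takes its
 * channel placement from the rmask/gmask/bmask words it is handed):
 *
 *   static inline int clipz(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }
 *
 *   static uint16_t yuv_to_rgb565(int y, int u, int v)
 *   {
 *       int c = y - 16, d = u - 128, e = v - 128;
 *       int r = clipz((298 * c + 409 * e + 128) >> 8);
 *       int g = clipz((298 * c - 100 * d - 208 * e + 128) >> 8);
 *       int b = clipz((298 * c + 516 * d + 128) >> 8);
 *       return (uint16_t)((r >> 3) << 11 | (g >> 2) << 5 | (b >> 3));
 *   }
 */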
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1
#define DEFUN(fname,where,interface) \
.section where; \
.global _ff_bfin_ ## fname; \
.type _ff_bfin_ ## fname, STT_FUNC; \
.align 8; \
_ff_bfin_ ## fname
#define DEFUN_END(fname) \
.size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
.text
#define COEFF_LEN 11*4
#define COEFF_REL_CY_OFF 4*4
#define ARG_OUT 20
#define ARG_W 24
#define ARG_COEFF 28
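/* The coefficient block walked by i1 below can be pictured as the following
 * C struct (order inferred from the loads in the loops; COEFF_LEN = 11 words,
 * and l1 = COEFF_LEN turns i1 into a circular pointer over it):
 *
 *   struct yuv2rgb_coeffs {
 *       uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;
 *   };
 */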
DEFUN(yuv2rgb565_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF];
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 4Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0565, .L1565) lc0 = p0;
/*
   uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv (+ gmask)

   r0 -- used to load 4 ys
   r1 -- used to load 2 us, 2 vs
   r4 -- y3, y2
   r5 -- y1, y0
   r6 -- u1, u0
   r7 -- v1, v0
*/
r2=[i1++]; // oy
.L0565:
/*
   rrrrrrrr gggggggg bbbbbbbb
   5432109876543210

   bbbbb    >> 3
   gggggggg << 3
   rrrrrrrr << 8

   rrrrrggggggbbbbb
*/
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 8 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
r2 = r2 << 3 (v);
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1=[i1++]; // cy
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 8 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y
r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1.h = w[i3++]; // 2v
.L1565: r2=[i1++]; // oy
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb565_line)
DEFUN(yuv2rgb555_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF];
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 4Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0555, .L1555) lc0 = p0;
/*
   uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv (+ gmask)

   r0 -- used to load 4 ys
   r1 -- used to load 2 us, 2 vs
   r4 -- y3, y2
   r5 -- y1, y0
   r6 -- u1, u0
   r7 -- v1, v0
*/
r2=[i1++]; // oy
.L0555:
/*
   rrrrrrrr gggggggg bbbbbbbb
   5432109876543210

   bbbbb    >> 3
   gggggggg << 2
   rrrrrrrr << 7

   xrrrrrgggggbbbbb
*/
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 7 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
r2 = r2 << 2 (v);
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1=[i1++]; // cy
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 7 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1.h=w[i3++]; // 2v
.L1555: r2=[i1++]; // oy
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb555_line)
DEFUN(yuv2rgb24_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
p2 = p1;
p2 += 3;
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF]; // coeff buffer
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 4Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0888, .L1888) lc0 = p0;
/*
   uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv (+ gmask)

   r0 -- used to load 4 ys
   r1 -- used to load 2 us, 2 vs
   r4 -- y3, y2
   r5 -- y1, y0
   r6 -- u1, u0
   r7 -- v1, v0
*/
r2=[i1++]; // oy
.L0888:
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
r3=r3>>16 || B[p1++]=r3;
B[p2++]=r3 || r1=[i1++]; // cy
p1+=3;
p2+=3;
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
B[p2++]=r2 || r1.l = w[i2++]; // 2u
r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
B[p2++]=r3 || r2=[i1++]; // oy
p1+=3;
.L1888: p2+=3;
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb24_line)
#define ARG_vdst 20
#define ARG_width 24
#define ARG_height 28
#define ARG_lumStride 32
#define ARG_chromStride 36
#define ARG_srcStride 40
DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)):
link 0;
[--sp] = (r7:4,p5:4);
p0 = r1; // Y top even
i2 = r2; // *u
r2 = [fp + ARG_vdst];
i3 = r2; // *v
r1 = [fp + ARG_srcStride];
r2 = r0 + r1;
i0 = r0; // uyvy_T even
i1 = r2; // uyvy_B odd
p2 = [fp + ARG_lumStride];
p1 = p0 + p2; // Y bot odd
p5 = [fp + ARG_width];
p4 = [fp + ARG_height];
r0 = p5;
p4 = p4 >> 1;
p5 = p5 >> 2;
r2 = r0 << 1;
r1 = r1 << 1;
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
r1 += -8; // i0,i1 is pre read need to correct
m0 = r1;
r2 = [fp + ARG_chromStride];
r0 = r0 >> 1;
r2 = r2 - r0;
m1 = r2;
/* I0,I1 - src input line pointers
* p0,p1 - luma output line pointers
* I2 - dstU
* I3 - dstV
*/
lsetup (0f, 1f) lc1 = p4; // H/2
0: r0 = [i0++] || r2 = [i1++];
r1 = [i0++] || r3 = [i1++];
r4 = byteop1p(r1:0, r3:2);
r5 = byteop1p(r1:0, r3:2) (r);
lsetup (2f, 3f) lc0 = p5; // W/4
2: r0 = r0 >> 8(v);
r1 = r1 >> 8(v);
r2 = r2 >> 8(v);
r3 = r3 >> 8(v);
r0 = bytepack(r0, r1);
r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
i0 += m0;
i1 += m0;
i2 += m1;
i3 += m1;
p0 = p0 + p2;
1: p1 = p1 + p2;
(r7:4,p5:4) = [sp++];
unlink;
rts;
DEFUN_END(uyvytoyv12)
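/* Plain-C model of the routine above (a sketch; luma is copied straight
 * through, while chroma is averaged across the two source rows as byteop1p
 * does, whose exact rounding the +1 below only approximates). For
 * yuyvtoyv12 further down, only the byte positions change
 * (YUYV: Y0 U0 Y1 V0 instead of UYVY: U0 Y0 V0 Y1):
 *
 *   for (int y = 0; y < height; y += 2)
 *       for (int x = 0; x < width; x += 2) {
 *           const uint8_t *t = src + y * srcStride + 2 * x;   // top row
 *           const uint8_t *b = t + srcStride;                 // bottom row
 *           ydst[y * lumStride + x]           = t[1];
 *           ydst[y * lumStride + x + 1]       = t[3];
 *           ydst[(y + 1) * lumStride + x]     = b[1];
 *           ydst[(y + 1) * lumStride + x + 1] = b[3];
 *           udst[(y / 2) * chromStride + x / 2] = (t[0] + b[0] + 1) >> 1;
 *           vdst[(y / 2) * chromStride + x / 2] = (t[2] + b[2] + 1) >> 1;
 *       }
 */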
DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)):
link 0;
[--sp] = (r7:4,p5:4);
p0 = r1; // Y top even
i2 = r2; // *u
r2 = [fp + ARG_vdst];
i3 = r2; // *v
r1 = [fp + ARG_srcStride];
r2 = r0 + r1;
i0 = r0; // uyvy_T even
i1 = r2; // uyvy_B odd
p2 = [fp + ARG_lumStride];
p1 = p0 + p2; // Y bot odd
p5 = [fp + ARG_width];
p4 = [fp + ARG_height];
r0 = p5;
p4 = p4 >> 1;
p5 = p5 >> 2;
r2 = r0 << 1;
r1 = r1 << 1;
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
r1 += -8; // i0,i1 is pre read need to correct
m0 = r1;
r2 = [fp + ARG_chromStride];
r0 = r0 >> 1;
r2 = r2 - r0;
m1 = r2;
/* I0,I1 - src input line pointers
* p0,p1 - luma output line pointers
* I2 - dstU
* I3 - dstV
*/
lsetup (0f, 1f) lc1 = p4; // H/2
0: r0 = [i0++] || r2 = [i1++];
r1 = [i0++] || r3 = [i1++];
r4 = bytepack(r0, r1);
r5 = bytepack(r2, r3);
lsetup (2f, 3f) lc0 = p5; // W/4
2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
r2 = r2 >> 8(v);
r3 = r3 >> 8(v);
r4 = byteop1p(r1:0, r3:2);
r5 = byteop1p(r1:0, r3:2) (r);
r6 = pack(r5.l, r4.l);
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
i0 += m0;
i1 += m0;
i2 += m1;
i3 += m1;
p0 = p0 + p2;
1: p1 = p1 + p2;
(r7:4,p5:4) = [sp++];
unlink;
rts;
DEFUN_END(yuyvtoyv12)


@@ -0,0 +1,87 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin software video scaler operations
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libswscale/swscale_internal.h"
#if defined (__FDPIC__) && CONFIG_SRAM
#define L1CODE __attribute__((l1_text))
#else
#define L1CODE
#endif
int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height,
int lumStride, int chromStride, int srcStride) L1CODE;
int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height,
int lumStride, int chromStride, int srcStride) L1CODE;
static int uyvytoyv12_unscaled(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
int w = dstStride[0];
ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
static int yuyvtoyv12_unscaled(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
int w = dstStride[0];
ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
av_cold void ff_get_unscaled_swscale_bfin(SwsContext *c)
{
if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_UYVY422) {
av_log(NULL, AV_LOG_VERBOSE,
"selecting Blackfin optimized uyvytoyv12_unscaled\n");
c->swscale = uyvytoyv12_unscaled;
}
if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_YUYV422) {
av_log(NULL, AV_LOG_VERBOSE,
"selecting Blackfin optimized yuyvtoyv12_unscaled\n");
c->swscale = yuyvtoyv12_unscaled;
}
}
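/* Usage sketch (hypothetical caller, not part of this file): once
 * ff_get_unscaled_swscale_bfin() has installed a fast path, it is reached
 * through the normal public API, e.g. (buffer setup omitted):
 *
 *   struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_UYVY422,
 *                                           w, h, AV_PIX_FMT_YUV420P,
 *                                           SWS_BILINEAR, NULL, NULL, NULL);
 *   sws_scale(sws, src, srcStride, 0, h, dst, dstStride);
 *   sws_freeContext(sws);
 */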


@@ -0,0 +1,203 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/pixdesc.h"
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libswscale/swscale_internal.h"
#if defined(__FDPIC__) && CONFIG_SRAM
#define L1CODE __attribute__((l1_text))
#else
#define L1CODE
#endif
void ff_bfin_yuv2rgb555_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
void ff_bfin_yuv2rgb565_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
void ff_bfin_yuv2rgb24_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
typedef void (*ltransform)(const uint8_t *Y, const uint8_t *U, const uint8_t *V,
uint8_t *out, int w, uint32_t *coeffs);
static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
{
int oy;
oy = c->yOffset & 0xffff;
oy = oy >> 3; // keep everything U8.0 for offset calculation
c->oc = 128 * 0x01010101U;
c->oy = oy * 0x01010101U;
/* copy 64bit vector coeffs down to 32bit vector coeffs */
c->cy = c->yCoeff;
c->zero = 0;
if (rgb) {
c->crv = c->vrCoeff;
c->cbu = c->ubCoeff;
c->cgu = c->ugCoeff;
c->cgv = c->vgCoeff;
} else {
c->crv = c->ubCoeff;
c->cbu = c->vrCoeff;
c->cgu = c->vgCoeff;
c->cgv = c->ugCoeff;
}
if (masks == 555) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x03e0 * 0x00010001U;
c->bmask = 0x7c00 * 0x00010001U;
} else if (masks == 565) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x07e0 * 0x00010001U;
c->bmask = 0xf800 * 0x00010001U;
}
}
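/* The multiplications above are plain lane replication, so that both
 * Blackfin MACs see the same constant, e.g.:
 *
 *   128    * 0x01010101U == 0x80808080   // 128 in every byte lane
 *   0x07e0 * 0x00010001U == 0x07e007e0   // green mask in both halfword lanes
 */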
static int core_yuv420_rgb(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH, uint8_t **oplanes,
int *outstrides, ltransform lcscf,
int rgb, int masks)
{
const uint8_t *py, *pu, *pv;
uint8_t *op;
int w = instrides[0];
int h2 = srcSliceH >> 1;
int i;
bfin_prepare_coefficients(c, rgb, masks);
py = in[0];
pu = in[1 + (1 ^ rgb)];
pv = in[1 + (0 ^ rgb)];
op = oplanes[0] + srcSliceY * outstrides[0];
for (i = 0; i < h2; i++) {
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
op += outstrides[0];
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
pu += instrides[1];
pv += instrides[2];
op += outstrides[0];
}
return srcSliceH;
}
static int bfin_yuv420_rgb555(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
}
static int bfin_yuv420_bgr555(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
}
static int bfin_yuv420_rgb24(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
}
static int bfin_yuv420_bgr24(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
}
static int bfin_yuv420_rgb565(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
}
static int bfin_yuv420_bgr565(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
}
av_cold SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c)
{
SwsFunc f;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB555:
f = bfin_yuv420_rgb555;
break;
case AV_PIX_FMT_BGR555:
f = bfin_yuv420_bgr555;
break;
case AV_PIX_FMT_RGB565:
f = bfin_yuv420_rgb565;
break;
case AV_PIX_FMT_BGR565:
f = bfin_yuv420_bgr565;
break;
case AV_PIX_FMT_RGB24:
f = bfin_yuv420_rgb24;
break;
case AV_PIX_FMT_BGR24:
f = bfin_yuv420_bgr24;
break;
default:
return 0;
}
av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
av_get_pix_fmt_name(c->dstFormat));
return f;
}


@@ -0,0 +1,170 @@
/*
* Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <string.h> /* for memset() */
#include <stdlib.h>
#include <inttypes.h>
#include "swscale.h"
#include "rgb2rgb.h"
#include "libavutil/mem.h"
#define SIZE 1000
#define srcByte 0x55
#define dstByte 0xBB
#define FUNC(s, d, n) { s, d, #n, n }
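/* For example, FUNC(2, 3, rgb15to24) expands to the initializer
 *
 *   { 2, 3, "rgb15to24", rgb15to24 }
 *
 * so every table entry carries its source/destination bytes-per-pixel and
 * its own name for the failure messages below. */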
int main(int argc, char **argv)
{
int i, funcNum;
uint8_t *srcBuffer = av_malloc(SIZE);
uint8_t *dstBuffer = av_malloc(SIZE);
int failedNum = 0;
int passedNum = 0;
if (!srcBuffer || !dstBuffer)
return -1;
av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
sws_rgb2rgb_init();
for (funcNum = 0; ; funcNum++) {
struct func_info_s {
int src_bpp;
int dst_bpp;
const char *name;
void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
} func_info[] = {
FUNC(2, 2, rgb12to15),
FUNC(2, 2, rgb15to16),
FUNC(2, 3, rgb15to24),
FUNC(2, 4, rgb15to32),
FUNC(2, 3, rgb16to24),
FUNC(2, 4, rgb16to32),
FUNC(3, 2, rgb24to15),
FUNC(3, 2, rgb24to16),
FUNC(3, 4, rgb24to32),
FUNC(4, 2, rgb32to15),
FUNC(4, 2, rgb32to16),
FUNC(4, 3, rgb32to24),
FUNC(2, 2, rgb16to15),
FUNC(2, 2, rgb12tobgr12),
FUNC(2, 2, rgb15tobgr15),
FUNC(2, 2, rgb15tobgr16),
FUNC(2, 3, rgb15tobgr24),
FUNC(2, 4, rgb15tobgr32),
FUNC(2, 2, rgb16tobgr15),
FUNC(2, 2, rgb16tobgr16),
FUNC(2, 3, rgb16tobgr24),
FUNC(2, 4, rgb16tobgr32),
FUNC(3, 2, rgb24tobgr15),
FUNC(3, 2, rgb24tobgr16),
FUNC(3, 3, rgb24tobgr24),
FUNC(3, 4, rgb24tobgr32),
FUNC(4, 2, rgb32tobgr15),
FUNC(4, 2, rgb32tobgr16),
FUNC(4, 3, rgb32tobgr24),
FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
FUNC(6, 6, rgb48tobgr48_nobswap),
FUNC(6, 6, rgb48tobgr48_bswap),
FUNC(8, 6, rgb64to48_nobswap),
FUNC(8, 6, rgb64to48_bswap),
FUNC(8, 6, rgb64tobgr48_nobswap),
FUNC(8, 6, rgb64tobgr48_bswap),
FUNC(0, 0, NULL)
};
int width;
int failed = 0;
int srcBpp = 0;
int dstBpp = 0;
if (!func_info[funcNum].func)
break;
av_log(NULL, AV_LOG_INFO, ".");
memset(srcBuffer, srcByte, SIZE);
for (width = 63; width > 0; width--) {
int dstOffset;
for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
int srcOffset;
memset(dstBuffer, dstByte, SIZE);
for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
uint8_t *src = srcBuffer + srcOffset;
uint8_t *dst = dstBuffer + dstOffset;
const char *name = NULL;
// don't fill the screen with shit ...
if (failed)
break;
srcBpp = func_info[funcNum].src_bpp;
dstBpp = func_info[funcNum].dst_bpp;
name = func_info[funcNum].name;
func_info[funcNum].func(src, dst, width * srcBpp);
if (!srcBpp)
break;
for (i = 0; i < SIZE; i++) {
if (srcBuffer[i] != srcByte) {
av_log(NULL, AV_LOG_INFO,
"src damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = 0; i < dstOffset; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
}
}
}
if (failed)
failedNum++;
else if (srcBpp)
passedNum++;
}
av_log(NULL, AV_LOG_INFO,
"\n%d converters passed, %d converters randomly overwrote memory\n",
passedNum, failedNum);
return failedNum;
}

File diff suppressed because it is too large


@@ -0,0 +1,14 @@
prefix=/usr/local
exec_prefix=${prefix}
libdir=${prefix}/lib
includedir=${prefix}/include
Name: libswscale
Description: FFmpeg image rescaling library
Version: 2.5.101
Requires:
Requires.private: libavutil = 52.48.101
Conflicts:
Libs: -L${libdir} -lswscale
Libs.private: -lm
Cflags: -I${includedir}


@@ -0,0 +1,4 @@
LIBSWSCALE_$MAJOR {
    global: swscale_*; sws_*;
    local: *;
};


@@ -0,0 +1,4 @@
LIBSWSCALE_2 {
    global: swscale_*; sws_*;
    local: *;
};


@@ -0,0 +1,91 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/pixfmt.h"
#include "swscale.h"
#include "swscale_internal.h"
static const char *sws_context_to_name(void *ptr)
{
return "swscaler";
}
#define OFFSET(x) offsetof(SwsContext, x)
#define DEFAULT 0
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption swscale_options[] = {
{ "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = SWS_BICUBIC }, 0, UINT_MAX, VE, "sws_flags" },
{ "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "experimental", "experimental", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_X }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_POINT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "area", "averaging area", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_AREA }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicublin", "luma bicubic, chroma bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBLIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "gauss", "gaussian", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_GAUSS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "sinc", "sinc", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SINC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_LANCZOS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "spline", "natural bicubic spline", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "print_info", "print info", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "accurate_rnd", "accurate rounding", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dstw", "destination width", OFFSET(dstW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dsth", "destination height", OFFSET(dstH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "src_format", "source format", OFFSET(srcFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "dst_format", "destination format", OFFSET(dstFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "src_range", "source range", OFFSET(srcRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "dst_range", "destination range", OFFSET(dstRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "src_v_chr_pos", "source vertical chroma position in luma grid/256" , OFFSET(src_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "src_h_chr_pos", "source horizontal chroma position in luma grid/256", OFFSET(src_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "dst_v_chr_pos", "destination vertical chroma position in luma grid/256" , OFFSET(dst_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "dst_h_chr_pos", "destination horizontal chroma position in luma grid/256", OFFSET(dst_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "sws_dither", "set dithering algorithm", OFFSET(dither), AV_OPT_TYPE_INT, { .i64 = SWS_DITHER_AUTO }, 0, NB_SWS_DITHER, VE, "sws_dither" },
{ "auto", "leave choice to sws", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_AUTO }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "bayer", "bayer dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_BAYER }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "ed", "error diffusion", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_ED }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ NULL }
};
const AVClass sws_context_class = {
.class_name = "SWScaler",
.item_name = sws_context_to_name,
.option = swscale_options,
.category = AV_CLASS_CATEGORY_SWSCALER,
.version = LIBAVUTIL_VERSION_INT,
};
const AVClass *sws_get_class(void)
{
return &sws_context_class;
}
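/* Usage sketch (hypothetical caller, not part of this file): the options
 * above are set through the generic AVOption API on a context obtained
 * from sws_alloc_context(), e.g.:
 *
 *   struct SwsContext *sws = sws_alloc_context();
 *   av_opt_set_int(sws, "srcw", 640, 0);
 *   av_opt_set_int(sws, "srch", 480, 0);
 *   av_opt_set_int(sws, "src_format", AV_PIX_FMT_YUV420P, 0);
 *   av_opt_set_int(sws, "dstw", 320, 0);
 *   av_opt_set_int(sws, "dsth", 240, 0);
 *   av_opt_set_int(sws, "dst_format", AV_PIX_FMT_RGB24, 0);
 *   av_opt_set(sws, "sws_flags", "bicubic", 0);
 *   if (sws_init_context(sws, NULL, NULL) < 0)
 *       sws_freeContext(sws);   // init failed
 */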

File diff suppressed because it is too large


@@ -0,0 +1,3 @@
OBJS += ppc/swscale_altivec.o \
        ppc/yuv2rgb_altivec.o \
        ppc/yuv2yuv_altivec.o \


@@ -0,0 +1,332 @@
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#if HAVE_ALTIVEC
#define vzero vec_splat_s32(0)
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do { \
vector signed short l2 = vec_ld(((x) << 1) + 16, src); \
vector signed short ls = vec_perm(l1, l2, perm); \
vector signed int i1 = vec_mule(filter, ls); \
vector signed int i2 = vec_mulo(filter, ls); \
vector signed int vf1 = vec_mergeh(i1, i2); \
vector signed int vf2 = vec_mergel(i1, i2); \
d1 = vec_add(d1, vf1); \
d2 = vec_add(d2, vf2); \
l1 = l2; \
} while (0)
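/* Scalar picture of yuv2planeX_8: for eight adjacent output pixels it adds
 * one filter tap's contribution into 32-bit accumulators, i.e.
 *
 *   for (k = 0; k < 8; k++)
 *       sum[x + k] += filter[j] * src[j][x + k];
 *
 * vec_mule/vec_mulo form the even/odd 16x16->32 products and the two
 * vec_merge calls restore pixel order before the adds. */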
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest,
const uint8_t *dither, int offset, int x)
{
register int i, j;
DECLARE_ALIGNED(16, int, val)[16];
vector signed int vo1, vo2, vo3, vo4;
vector unsigned short vs1, vs2;
vector unsigned char vf;
vector unsigned int altivec_vectorShiftInt19 =
vec_add(vec_splat_u32(10), vec_splat_u32(9));
for (i = 0; i < 16; i++)
val[i] = dither[(x + i + offset) & 7] << 12;
vo1 = vec_ld(0, val);
vo2 = vec_ld(16, val);
vo3 = vec_ld(32, val);
vo4 = vec_ld(48, val);
for (j = 0; j < filterSize; j++) {
vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
perm = vec_lvsl(x << 1, src[j]);
l1 = vec_ld(x << 1, src[j]);
yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
}
vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
vs1 = vec_packsu(vo1, vo2);
vs2 = vec_packsu(vo3, vo4);
vf = vec_packsu(vs1, vs2);
vec_st(vf, 0, dest);
}
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset, int x)
{
int i, j;
for (i = x; i < dstW; i++) {
int t = dither[(i + offset) & 7] << 12;
for (j = 0; j < filterSize; j++)
t += src[j][i] * filter[j];
dest[i] = av_clip_uint8(t >> 19);
}
}
static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int dst_u = -(uintptr_t)dest & 15;
int i;
yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
for (i = dst_u; i < dstW - 15; i += 16)
yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
offset, i);
yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
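/* The wrapper above uses the standard AltiVec head/body/tail split: scalar
 * code up to the first 16-byte-aligned destination byte, whole vectors
 * through the middle, scalar code for the remainder. Generically
 * (do_scalar/do_vector are hypothetical helpers):
 *
 *   int head = -(uintptr_t)dest & 15;  // bytes until dest is 16-aligned
 *   int i;
 *   do_scalar(0, head);
 *   for (i = head; i < n - 15; i += 16)
 *       do_vector(i);                  // 16 output pixels per iteration
 *   do_scalar(i, n);
 */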
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
register int i;
DECLARE_ALIGNED(16, int, tempo)[4];
if (filterSize % 4) {
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
register int val = 0;
for (j = 0; j < filterSize; j++)
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
}
} else
switch (filterSize) {
case 4:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v;
vector signed int val_vEven, val_s;
if ((((uintptr_t)src + srcPos) % 16) > 12) {
src_v1 = vec_ld(srcPos + 16, src);
}
src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
// now put our elements in the even slots
src_v = vec_mergeh(src_v, (vector signed short)vzero);
filter_v = vec_ld(i << 3, filter);
// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
if ((i << 3) % 16)
filter_v = vec_mergel(filter_v, (vector signed short)vzero);
else
filter_v = vec_mergeh(filter_v, (vector signed short)vzero);
val_vEven = vec_mule(src_v, filter_v);
val_s = vec_sums(val_vEven, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 8:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v;
vector signed int val_v, val_s;
if ((((uintptr_t)src + srcPos) % 16) > 8) {
src_v1 = vec_ld(srcPos + 16, src);
}
src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
filter_v = vec_ld(i << 4, filter);
// the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 16:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
vector signed short filter_v0 = vec_ld(i << 5, filter);
vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
// the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)
vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
vector signed int val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
default:
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
vector signed int val_s, val_v = (vector signed int)vzero;
vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char permS = vec_lvsl(srcPos, src);
for (j = 0; j < filterSize - 15; j += 16) {
vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);
vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
val_v = vec_msums(src_vB, filter_v1, val_acc);
filter_v0R = filter_v2R;
src_v0 = src_v1;
}
if (j < filterSize - 7) {
// loading src_v0 is useless, it's already done above
// vector unsigned char src_v0 = vec_ld(srcPos + j, src);
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v1R, filter_v;
if ((((uintptr_t)src + srcPos) % 16) > 8) {
src_v1 = vec_ld(srcPos + j + 16, src);
}
src_vF = vec_perm(src_v0, src_v1, permS);
src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
// loading filter_v0R is useless, it's already done above
// vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
filter_v = vec_perm(filter_v0R, filter_v1R, permF);
val_v = vec_msums(src_v, filter_v, val_v);
}
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
}
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
enum AVPixelFormat dstFormat = c->dstFormat;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (c->srcBpc == 8 && c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale_altivec_real;
}
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
!c->alpPixBuf) {
c->yuv2planeX = yuv2planeX_altivec;
}
/* The following list of supported dstFormat values should
* match what's found in the body of ff_yuv2packedX_altivec() */
if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
switch (c->dstFormat) {
case AV_PIX_FMT_ABGR:
c->yuv2packedX = ff_yuv2abgr_X_altivec;
break;
case AV_PIX_FMT_BGRA:
c->yuv2packedX = ff_yuv2bgra_X_altivec;
break;
case AV_PIX_FMT_ARGB:
c->yuv2packedX = ff_yuv2argb_X_altivec;
break;
case AV_PIX_FMT_RGBA:
c->yuv2packedX = ff_yuv2rgba_X_altivec;
break;
case AV_PIX_FMT_BGR24:
c->yuv2packedX = ff_yuv2bgr24_X_altivec;
break;
case AV_PIX_FMT_RGB24:
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
}
#endif /* HAVE_ALTIVEC */
}


@@ -0,0 +1,868 @@
/*
* AltiVec acceleration for colorspace conversion
*
* copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
 * Convert I420 YV12 to RGB in various formats. It rejects images that are
 * not in 420 formats, images whose widths are not multiples of 16, and
 * images whose heights are not multiples of 2. A reject defers to the C
 * simulation code.
 *
 * Lots of optimizations to be done here.
 *
 * 1. Need to fix saturation code. I just couldn't get it to fly with
 *    packs and adds, so we currently use max/min to clip.
 *
 * 2. The inefficient use of chroma loading needs a bit of brushing up.
 *
 * 3. Analysis of pipeline stalls needs to be done. Use Shark to identify
 *    pipeline stalls.
 *
 * MODIFIED to calculate coeffs from currently selected color space.
 * MODIFIED core to be a macro where you specify the output format.
 * ADDED UYVY conversion which is never called due to something in swscale.
 * CORRECTED algorithm selection to be strict on input formats.
 * ADDED runtime detection of AltiVec.
 *
 * ADDED altivec_yuv2packedX vertical scale + RGB converter
 *
 * March 27, 2004
 * PERFORMANCE ANALYSIS
 *
 * The C version uses 25% of the processor, or ~250 Mips, for D1 video
 * (rawvideo used as the test).
 * The AltiVec version uses 10% of the processor, or ~100 Mips, for the
 * same sequence.
 *
 * 720 * 480 * 30 ~ 10 Mpixels/s
 *
 * so we have roughly 10 clocks per pixel. This is too high; something
 * has to be wrong.
 *
 * OPTIMIZED clip codes to utilize vec_max and vec_packs, removing the
 * need for vec_min.
 *
 * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed
 * to have the input video frame; it was just decompressed, so it probably
 * resides in L1 caches. However, we are creating the output video stream.
 * This needs to use the DSTST instruction to optimize for the cache. We
 * couple this with the fact that we are not going to be visiting the input
 * buffer again, so we mark it Least Recently Used. This shaves 25% of the
 * processor cycles off.
 *
 * Now memcpy is the largest mips consumer in the system, probably due
 * to the inefficient X11 stuff.
 *
 * GL libraries seem to be very slow on this machine, a 1.33 GHz PB running
 * Jaguar; this is not the case for my 1 GHz PB. I thought it might be a
 * versioning issue, however I have libGL.1.2.dylib for both machines.
 * (We need to figure this out now.)
 *
 * GL2 libraries work now with the patch for RGB32.
 *
 * NOTE: the quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
 *
 * Integrated luma prescaling for saturation/contrast/brightness
 * adjustment.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"
#if HAVE_ALTIVEC
#undef PROFILE_THE_BEAST
#undef INC_SCALING
typedef unsigned char ubyte;
typedef signed char sbyte;
/* RGB interleaver: 16 planar pels, 8-bit samples per channel, held in
 * homogeneous vector registers x0, x1, x2, are interleaved with the
 * following technique:
*
* o0 = vec_mergeh(x0, x1);
* o1 = vec_perm(o0, x2, perm_rgb_0);
* o2 = vec_perm(o0, x2, perm_rgb_1);
* o3 = vec_mergel(x0, x1);
* o4 = vec_perm(o3, o2, perm_rgb_2);
* o5 = vec_perm(o3, o2, perm_rgb_3);
*
* perm_rgb_0: o0(RG).h v1(B) --> o1*
* 0 1 2 3 4
* rgbr|gbrg|brgb|rgbr
* 0010 0100 1001 0010
* 0102 3145 2673 894A
*
* perm_rgb_1: o0(RG).h v1(B) --> o2
* 0 1 2 3 4
* gbrg|brgb|bbbb|bbbb
* 0100 1001 1111 1111
* B5CD 6EF7 89AB CDEF
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
* 0 1 2 3 4
* gbrg|brgb|rgbr|gbrg
* 1111 1111 0010 0100
* 89AB CDEF 0182 3945
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
* 0 1 2 3 4
* brgb|rgbr|gbrg|brgb
* 1001 0010 0100 1001
* a67b 89cA BdCD eEFf
*
*/
static const vector unsigned char
perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
#define vec_merge3(x2, x1, x0, y0, y1, y2) \
do { \
__typeof__(x0) o0, o2, o3; \
o0 = vec_mergeh(x0, x1); \
y0 = vec_perm(o0, x2, perm_rgb_0); \
o2 = vec_perm(o0, x2, perm_rgb_1); \
o3 = vec_mergel(x0, x1); \
y1 = vec_perm(o3, o2, perm_rgb_2); \
y2 = vec_perm(o3, o2, perm_rgb_3); \
} while (0)
#define vec_mstbgr24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x0, x1, x2, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
#define vec_mstrgb24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x2, x1, x0, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
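/* Net effect of the two store macros above, in scalar C: three 16-byte
 * planar vectors become 48 interleaved bytes. vec_mstbgr24(r, g, b, p)
 * behaves like
 *
 *   for (int i = 0; i < 16; i++) {
 *       p8[3 * i + 0] = b[i];
 *       p8[3 * i + 1] = g[i];
 *       p8[3 * i + 2] = r[i];
 *   }
 *   p8 += 48;   // p advances by three vectors
 *
 * while vec_mstrgb24() emits the same bytes with r and b swapped. */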
/* pack the pixels in rgb0 format
* msb R
* lsb 0
*/
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
do { \
T _0, _1, _2, _3; \
_0 = vec_mergeh(x0, x1); \
_1 = vec_mergeh(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 0 * 16, (T *) ptr); \
vec_st(_3, 1 * 16, (T *) ptr); \
_0 = vec_mergel(x0, x1); \
_1 = vec_mergel(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 2 * 16, (T *) ptr); \
vec_st(_3, 3 * 16, (T *) ptr); \
ptr += 4; \
} while (0)
/*
 * [R]   [ 1    0       1.4021 ]   [ Y ]
 * [G] = [ 1   -0.3441 -0.7142 ] x [ Cb]
 * [B]   [ 1    1.7718  0      ]   [ Cr]
 *
 * Y:     [-128, 127]
 * Cb/Cr: [-128, 127]
 *
 * Typical YUV conversion works on Y: 0-255; this version has been
 * optimized for JPEG decoding.
 */
#define vec_unh(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
#define vec_unl(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#define vec_clip_s16(x) \
vec_max(vec_min(x, ((vector signed short) { \
235, 235, 235, 235, 235, 235, 235, 235 })), \
((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
#define vec_packclp(x, y) \
(vector unsigned char) \
vec_packs((vector unsigned short) \
vec_max(x, ((vector signed short) { 0 })), \
(vector unsigned short) \
vec_max(y, ((vector signed short) { 0 })))
static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G,
vector signed short *B)
{
vector signed short vx, ux, uvx;
Y = vec_mradds(Y, c->CY, c->OY);
U = vec_sub(U, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
V = vec_sub(V, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
// ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
ux = vec_sl(U, c->CSHIFT);
*B = vec_mradds(ux, c->CBU, Y);
// vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
vx = vec_sl(V, c->CSHIFT);
*R = vec_mradds(vx, c->CRV, Y);
// uvx = ((CGU * u) + (CGV * v)) >> 15;
uvx = vec_mradds(U, c->CGU, Y);
*G = vec_mradds(V, c->CGV, uvx);
}
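/* Scalar model of cvtyuvtoRGB() above (a sketch; vec_mradds(a, b, c)
 * computes ((a * b + 0x4000) >> 15) + c with saturation, which the shifts
 * below stand in for; saturation and clipping omitted):
 *
 *   y  = ((y * cy + 0x4000) >> 15) + oy;
 *   u -= 128;
 *   v -= 128;
 *   r  = y + (((v << cshift) * crv + 0x4000) >> 15);
 *   b  = y + (((u << cshift) * cbu + 0x4000) >> 15);
 *   g  = y + ((u * cgu + 0x4000) >> 15);
 *   g  = g + ((v * cgv + 0x4000) >> 15);
 */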
/*
* ------------------------------------------------------------------------------
* CS converters
* ------------------------------------------------------------------------------
*/
#define DEFCSP420_CVT(name, out_pixels) \
static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
int *instrides, int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i, j; \
int instrides_scl[3]; \
vector unsigned char y0, y1; \
\
vector signed char u, v; \
\
vector signed short Y0, Y1, Y2, Y3; \
vector signed short U, V; \
vector signed short vx, ux, uvx; \
vector signed short vx0, ux0, uvx0; \
vector signed short vx1, ux1, uvx1; \
vector signed short R0, G0, B0; \
vector signed short R1, G1, B1; \
vector unsigned char R, G, B; \
\
const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
vector unsigned char align_perm; \
\
vector signed short lCY = c->CY; \
vector signed short lOY = c->OY; \
vector signed short lCRV = c->CRV; \
vector signed short lCBU = c->CBU; \
vector signed short lCGU = c->CGU; \
vector signed short lCGV = c->CGV; \
vector unsigned short lCSHIFT = c->CSHIFT; \
\
const ubyte *y1i = in[0]; \
const ubyte *y2i = in[0] + instrides[0]; \
const ubyte *ui = in[1]; \
const ubyte *vi = in[2]; \
\
vector unsigned char *oute, *outo; \
\
/* loop moves y{1, 2}i by w */ \
instrides_scl[0] = instrides[0] * 2 - w; \
/* loop moves ui by w / 2 */ \
instrides_scl[1] = instrides[1] - w / 2; \
/* loop moves vi by w / 2 */ \
instrides_scl[2] = instrides[2] - w / 2; \
\
for (i = 0; i < h / 2; i++) { \
oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
(srcSliceY + i * 2)); \
outo = oute + (outstrides[0] >> 4); \
vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
\
for (j = 0; j < w / 16; j++) { \
y1ivP = (const vector unsigned char *) y1i; \
y2ivP = (const vector unsigned char *) y2i; \
uivP = (const vector unsigned char *) ui; \
vivP = (const vector unsigned char *) vi; \
\
align_perm = vec_lvsl(0, y1i); \
y0 = (vector unsigned char) \
vec_perm(y1ivP[0], y1ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, y2i); \
y1 = (vector unsigned char) \
vec_perm(y2ivP[0], y2ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, ui); \
u = (vector signed char) \
vec_perm(uivP[0], uivP[1], align_perm); \
\
align_perm = vec_lvsl(0, vi); \
v = (vector signed char) \
vec_perm(vivP[0], vivP[1], align_perm); \
\
u = (vector signed char) \
vec_sub(u, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
v = (vector signed char) \
vec_sub(v, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
\
U = vec_unpackh(u); \
V = vec_unpackh(v); \
\
Y0 = vec_unh(y0); \
Y1 = vec_unl(y0); \
Y2 = vec_unh(y1); \
Y3 = vec_unl(y1); \
\
Y0 = vec_mradds(Y0, lCY, lOY); \
Y1 = vec_mradds(Y1, lCY, lOY); \
Y2 = vec_mradds(Y2, lCY, lOY); \
Y3 = vec_mradds(Y3, lCY, lOY); \
\
/* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
ux = vec_sl(U, lCSHIFT); \
ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
ux0 = vec_mergeh(ux, ux); \
ux1 = vec_mergel(ux, ux); \
\
/* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
vx = vec_sl(V, lCSHIFT); \
vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
vx0 = vec_mergeh(vx, vx); \
vx1 = vec_mergel(vx, vx); \
\
/* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
uvx = vec_mradds(V, lCGV, uvx); \
uvx0 = vec_mergeh(uvx, uvx); \
uvx1 = vec_mergel(uvx, uvx); \
\
R0 = vec_add(Y0, vx0); \
G0 = vec_add(Y0, uvx0); \
B0 = vec_add(Y0, ux0); \
R1 = vec_add(Y1, vx1); \
G1 = vec_add(Y1, uvx1); \
B1 = vec_add(Y1, ux1); \
\
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
out_pixels(R, G, B, oute); \
\
R0 = vec_add(Y2, vx0); \
G0 = vec_add(Y2, uvx0); \
B0 = vec_add(Y2, ux0); \
R1 = vec_add(Y3, vx1); \
G1 = vec_add(Y3, uvx1); \
B1 = vec_add(Y3, ux1); \
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
\
out_pixels(R, G, B, outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
} \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
#define out_abgr(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
#define out_rgba(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
#define out_argb(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
DEFCSP420_CVT(yuv2_abgr, out_abgr)
DEFCSP420_CVT(yuv2_bgra, out_bgra)
DEFCSP420_CVT(yuv2_rgba, out_rgba)
DEFCSP420_CVT(yuv2_argb, out_argb)
DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
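/* A sketch of what the out_* selectors above amount to per pixel, assuming
 * vec_mstrgb32() interleaves its four vector operands in argument order
 * (alpha is forced to opaque 0xFF, since YUV carries no alpha):
 *
 *     dst[0] = R;    dst[1] = G; dst[2] = B; dst[3] = 0xFF;   // out_rgba
 *     dst[0] = 0xFF; dst[1] = B; dst[2] = G; dst[3] = R;      // out_abgr
 *
 * The 24-bit variants store the same three channels without the alpha byte.
 */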
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static const vector unsigned char
demux_u = { 0x10, 0x00, 0x10, 0x00,
0x10, 0x04, 0x10, 0x04,
0x10, 0x08, 0x10, 0x08,
0x10, 0x0c, 0x10, 0x0c },
demux_v = { 0x10, 0x02, 0x10, 0x02,
0x10, 0x06, 0x10, 0x06,
0x10, 0x0A, 0x10, 0x0A,
0x10, 0x0E, 0x10, 0x0E },
demux_y = { 0x10, 0x01, 0x10, 0x03,
0x10, 0x05, 0x10, 0x07,
0x10, 0x09, 0x10, 0x0B,
0x10, 0x0D, 0x10, 0x0F };
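/* A scalar model of the three permutations above (a sketch; the 0x10
 * entries select byte 0 of the zero vector passed as the second vec_perm
 * operand, so each 8-bit sample lands in the low byte of a big-endian
 * 16-bit lane). For one 16-byte UYVY block:
 *
 *     for (k = 0; k < 8; k++) {
 *         U[k] = uyvy[4 * (k / 2) + 0];   // u0 u0 u1 u1 u2 u2 u3 u3
 *         V[k] = uyvy[4 * (k / 2) + 2];   // v0 v0 v1 v1 v2 v2 v3 v3
 *         Y[k] = uyvy[2 * k + 1];         // y0 y1 y2 y3 y4 y5 y6 y7
 *     }
 *
 * i.e. each chroma sample is duplicated for the two luma samples it covers.
 */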
/*
 * This is here so we can play live CCIR raw video.
 */
static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
int *instrides, int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides)
{
int w = c->srcW;
int h = srcSliceH;
int i, j;
vector unsigned char uyvy;
vector signed short Y, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out;
const ubyte *img;
img = in[0];
out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
for (i = 0; i < h; i++)
for (j = 0; j < w / 16; j++) {
uyvy = vec_ld(0, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
uyvy = vec_ld(16, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
// vec_mstbgr24 (R,G,B, out);
out_rgba(R, G, B, out);
img += 32;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
/* Currently the acceleration routines only support
 * inputs whose width is a multiple of 16
 * and whose height is a multiple of 2.
 *
 * For everything else we just fall back to the C code.
 */
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return NULL;
/*
 * The width restriction does not seem to matter much in practice: I tried
 * a bunch of videos with abnormal widths and MPlayer crashes elsewhere
 * anyway, e.g.
 * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 * dies with an X11 "bad match" error.
 */
if ((c->srcW & 0xf) != 0)
return NULL;
switch (c->srcFormat) {
case AV_PIX_FMT_YUV410P:
case AV_PIX_FMT_YUV420P:
/*case IMGFMT_CLPL: ??? */
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_NV12:
case AV_PIX_FMT_NV21:
if ((c->srcH & 0x1) != 0)
return NULL;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
return altivec_yuv2_rgb24;
case AV_PIX_FMT_BGR24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
return altivec_yuv2_bgr24;
case AV_PIX_FMT_ARGB:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
return altivec_yuv2_argb;
case AV_PIX_FMT_ABGR:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
return altivec_yuv2_abgr;
case AV_PIX_FMT_RGBA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
return altivec_yuv2_rgba;
case AV_PIX_FMT_BGRA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
return altivec_yuv2_bgra;
default: return NULL;
}
break;
case AV_PIX_FMT_UYVY422:
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
return altivec_uyvy_rgb32;
default: return NULL;
}
break;
}
#endif /* HAVE_ALTIVEC */
return NULL;
}
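/* A minimal usage sketch for the initializer above (the fallback name is
 * hypothetical; the real dispatch lives in libswscale's generic yuv2rgb
 * init code). A NULL return means "format or geometry not accelerated
 * here, use the C path":
 *
 *     SwsFunc f = ff_yuv2rgb_init_ppc(c);
 *     if (!f)
 *         f = generic_c_yuv2rgb(c);   // hypothetical C fallback
 */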
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c,
const int inv_table[4],
int brightness,
int contrast,
int saturation)
{
#if HAVE_ALTIVEC
union {
DECLARE_ALIGNED(16, signed short, tmp)[8];
vector signed short vec;
} buf;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
buf.tmp[1] = -256 * brightness; // oy
buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
c->CY = vec_splat((vector signed short) buf.vec, 0);
c->OY = vec_splat((vector signed short) buf.vec, 1);
c->CRV = vec_splat((vector signed short) buf.vec, 2);
c->CBU = vec_splat((vector signed short) buf.vec, 3);
c->CGU = vec_splat((vector signed short) buf.vec, 4);
c->CGV = vec_splat((vector signed short) buf.vec, 5);
return;
#endif /* HAVE_ALTIVEC */
}
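/* A worked example of the fixed-point setup above, assuming the neutral
 * settings contrast = saturation = 1 << 16 and brightness = 0:
 *
 *     cy  = ((0xffff * 65536) >> 8) >> 9 = 0xffff00 >> 9 = 0x7fff
 *     oy  = -256 * 0 = 0
 *     crv = inv_table[0] >> 3    (contrast >> 16 and saturation >> 16 are 1)
 *
 * so CY is ~1.0 in the 1.15 fixed-point format consumed by vec_mradds(),
 * and the chroma coefficients reduce to the raw inv_table entries scaled
 * down by 8.
 */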
#if HAVE_ALTIVEC
static av_always_inline void yuv2packedX_altivec(SwsContext *c,
const int16_t *lumFilter,
const int16_t **lumSrc,
int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize,
const int16_t **alpSrc,
uint8_t *dest,
int dstW, int dstY,
enum AVPixelFormat target)
{
int i, j;
vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out, *nout;
vector signed short RND = vec_splat_s16(1 << 3);
vector unsigned short SCL = vec_splat_u16(4);
DECLARE_ALIGNED(16, unsigned int, scratch)[16];
vector signed short *YCoeffs, *CCoeffs;
YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
out = (vector unsigned char *) dest;
for (i = 0; i < dstW; i += 16) {
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, out);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, out);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, out);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, out);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, out);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, out);
break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
* instead. */
static int printed_error_message;
if (!printed_error_message) {
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
printed_error_message = 1;
}
return;
}
}
}
if (i < dstW) {
i -= 16;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
nout = (vector unsigned char *) scratch;
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, nout);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, nout);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, nout);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, nout);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, nout);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, nout);
break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
return;
}
memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) * 4); /* 4 bytes per remaining pixel */
}
}
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, int dstW, int dstY) \
{ \
yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, \
chrFilterSize, alpSrc, \
dest, dstW, dstY, pixfmt); \
}
YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
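/* Each wrapper above is a thin monomorphization of yuv2packedX_altivec;
 * e.g. YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA) expands to (sketch):
 *
 *     void ff_yuv2rgba_X_altivec(SwsContext *c, ...)
 *     {
 *         yuv2packedX_altivec(c, ..., AV_PIX_FMT_RGBA);
 *     }
 *
 * Since yuv2packedX_altivec is av_always_inline and target is a
 * compile-time constant in each wrapper, the switch (target) inside it
 * folds down to a single out_* store per wrapper.
 */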
#endif /* HAVE_ALTIVEC */

View File

@@ -0,0 +1,51 @@
/*
* AltiVec-enhanced yuv2packedX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#include <stdint.h>
#include "libswscale/swscale_internal.h"
#define YUV2PACKEDX_HEADER(suffix) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, \
int dstW, int dstY);
YUV2PACKEDX_HEADER(abgr);
YUV2PACKEDX_HEADER(bgra);
YUV2PACKEDX_HEADER(argb);
YUV2PACKEDX_HEADER(rgba);
YUV2PACKEDX_HEADER(rgb24);
YUV2PACKEDX_HEADER(bgr24);
#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */

View File

@@ -0,0 +1,204 @@
/*
* AltiVec-enhanced yuv-to-yuv conversion routines.
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_ALTIVEC
static int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const vector unsigned char yperm = vec_lvsl(0, ysrc);
const int vertLumPerChroma = 2;
register unsigned int y;
/* This code assumes:
*
* 1) dst is 16-byte aligned
* 2) dstStride is a multiple of 16
* 3) width is a multiple of 16
* 4) luma & chroma strides are multiples of 8
*/
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
vec_st(v_yuy2_2, (i << 1) + 32, dst);
vec_st(v_yuy2_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
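/* A scalar model of one output line of the routine above (a sketch): two
 * luma bytes and one U/V pair are woven into the YUY2 pattern Y0 U0 Y1 V0.
 * The UYVY variant below differs only in byte order (U0 Y0 V0 Y1), i.e.
 * vec_mergeh(uv, y) instead of vec_mergeh(y, uv):
 *
 *     for (x = 0; x < width / 2; x++) {
 *         dst[4 * x + 0] = ysrc[2 * x + 0];
 *         dst[4 * x + 1] = usrc[x];
 *         dst[4 * x + 2] = ysrc[2 * x + 1];
 *         dst[4 * x + 3] = vsrc[x];
 *     }
 */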
static int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const int vertLumPerChroma = 2;
const vector unsigned char yperm = vec_lvsl(0, ysrc);
register unsigned int y;
/* This code assumes:
*
* 1) dst is 16-byte aligned
* 2) dstStride is a multiple of 16
* 3) width is a multiple of 16
* 4) luma & chroma strides are multiples of 8
*/
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
vec_st(v_uyvy_2, (i << 1) + 32, dst);
vec_st(v_uyvy_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_get_unscaled_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (!(c->srcW & 15) && !(c->flags & SWS_BITEXACT) &&
c->srcFormat == AV_PIX_FMT_YUV420P) {
enum AVPixelFormat dstFormat = c->dstFormat;
// unscaled YV12 -> packed YUV, we want speed
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = yv12toyuy2_unscaled_altivec;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = yv12touyvy_unscaled_altivec;
}
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,390 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converters
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "libavutil/attributes.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
#define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
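/* A worked example for the coefficients above, assuming RGB2YUV_SHIFT is
 * 15 (its value in swscale_internal.h): RY = (int)(0.257 * 32768 + 0.5)
 * = 8421, GY = 16515, BY = 3211. Feeding full-range white r = g = b = 255
 * through the luma equation used by ff_rgb24toyv12_c() gives
 *
 *     Y = ((8421 + 16515 + 3211) * 255 >> 15) + 16 = 219 + 16 = 235
 *
 * i.e. the top of the studio-range [16..235] luma scale, as intended.
 */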
//plain C versions
#include "rgb2rgb_template.c"
/*
* RGB15->RGB16: original by Strepto/Astral,
* ported to gcc & bugfixed: A'rpi,
* MMXEXT and 3DNOW optimizations by Nick Kurshev,
* 32-bit C version and the and&add trick by Michael Niedermayer.
*/
av_cold void sws_rgb2rgb_init(void)
{
rgb2rgb_init_c();
if (ARCH_X86)
rgb2rgb_init_x86();
}
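/* A minimal usage sketch: the converters declared as function pointers in
 * rgb2rgb.h are unset until the dispatch table is initialized, so callers
 * do (assuming src holds src_size bytes of RGB32):
 *
 *     sws_rgb2rgb_init();               // installs C or x86 implementations
 *     rgb32to24(src, dst, src_size);    // now safe to call
 */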
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 2;
for (i = 0; i < num_pixels; i++) {
#if HAVE_BIGENDIAN
/* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
dst[3 * i + 0] = src[4 * i + 1];
dst[3 * i + 1] = src[4 * i + 2];
dst[3 * i + 2] = src[4 * i + 3];
#else
dst[3 * i + 0] = src[4 * i + 2];
dst[3 * i + 1] = src[4 * i + 1];
dst[3 * i + 2] = src[4 * i + 0];
#endif
}
}
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size)
{
int i;
for (i = 0; 3 * i < src_size; i++) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> BGR32 (= A, R, G, B) */
dst[4 * i + 0] = 255;
dst[4 * i + 1] = src[3 * i + 0];
dst[4 * i + 2] = src[3 * i + 1];
dst[4 * i + 3] = src[3 * i + 2];
#else
dst[4 * i + 0] = src[3 * i + 2];
dst[4 * i + 1] = src[3 * i + 1];
dst[4 * i + 2] = src[3 * i + 0];
dst[4 * i + 3] = 255;
#endif
}
}
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
#else
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t rgb, r, g, b;
uint16_t *d = (uint16_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
rgb = *s++;
r = rgb & 0xF00;
g = rgb & 0x0F0;
b = rgb & 0x00F;
r = (r << 3) | ((r & 0x800) >> 1);
g = (g << 2) | ((g & 0x080) >> 2);
b = (b << 1) | ( b >> 3);
*d++ = r | g | b;
}
}
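/* A worked example for the channel widening above: each 4-bit channel
 * gains one bit by replicating its MSB into the new LSB, e.g.
 * r = 0xF00 -> (0xF00 << 3) | (0x800 >> 1) = 0x7800 | 0x0400 = 0x7C00,
 * and likewise g = 0x0F0 -> 0x3E0, b = 0x00F -> 0x1F, so 12-bit white
 * 0xFFF maps exactly to 15-bit white 0x7FFF. */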
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | (rgb & 0x7E0) | (rgb << 11);
}
}
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | ((rgb & 0x7C0) >> 1) | ((rgb & 0x1F) << 10);
}
}
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
#else
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = ((rgb & 0x7C00) >> 10) | ((rgb & 0x3E0) << 1) | (rgb << 11);
}
}
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
unsigned br = rgb & 0x7C1F;
((uint16_t *)dst)[i] = (br >> 10) | (rgb & 0x3E0) | (br << 10);
}
}
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
uint16_t *s = (uint16_t *)src;
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = s[i];
d[i] = (rgb << 8 | rgb & 0xF0 | rgb >> 8) & 0xFFF;
}
}
#define DEFINE_SHUFFLE_BYTES(a, b, c, d) \
void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
int i; \
\
for (i = 0; i < src_size; i += 4) { \
dst[i + 0] = src[i + a]; \
dst[i + 1] = src[i + b]; \
dst[i + 2] = src[i + c]; \
dst[i + 3] = src[i + d]; \
} \
}
DEFINE_SHUFFLE_BYTES(0, 3, 2, 1)
DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
#define DEFINE_RGB48TOBGR48(need_bswap, swap) \
void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 1; \
\
for (i = 0; i < num_pixels; i += 3) { \
d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \
d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \
d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \
} \
}
DEFINE_RGB48TOBGR48(nobswap, 0)
DEFINE_RGB48TOBGR48(bswap, 1)
#define DEFINE_RGB64TOBGR48(need_bswap, swap) \
void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
} \
}
DEFINE_RGB64TOBGR48(nobswap, 0)
DEFINE_RGB64TOBGR48(bswap, 1)
#define DEFINE_RGB64TO48(need_bswap, swap) \
void rgb64to48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
} \
}
DEFINE_RGB64TO48(nobswap, 0)
DEFINE_RGB64TO48(bswap, 1)

View File

@@ -0,0 +1,167 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converters
* Software YUV to YUV converter
* Software YUV to RGB converter
* Written by Nick Kurshev.
* YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_RGB2RGB_H
#define SWSCALE_RGB2RGB_H
#include <inttypes.h>
#include "libavutil/avutil.h"
#include "swscale.h"
/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size);
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write high quality version.
*/
extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
extern void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
void sws_rgb2rgb_init(void);
void rgb2rgb_init_x86(void);
#endif /* SWSCALE_RGB2RGB_H */

View File

@@ -0,0 +1,932 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converters
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "libavutil/attributes.h"
static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> RGB32 (= A, B, G, R) */
*dest++ = 255;
*dest++ = s[2];
*dest++ = s[1];
*dest++ = s[0];
s += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 255;
#endif
}
}
static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB32 (= A, B, G, R) -> RGB24 (= R, G, B) */
s++;
dest[2] = *s++;
dest[1] = *s++;
dest[0] = *s++;
dest += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
#endif
}
}
/*
* Original by Strepto/Astral,
* ported to gcc & bugfixed: A'rpi,
* MMXEXT and 3DNOW optimizations by Nick Kurshev,
* 32-bit C version and the and&add trick by Michael Niedermayer.
*/
static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register unsigned x = *((const uint32_t *)s);
*((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
d += 4;
s += 4;
}
if (s < end) {
register unsigned short x = *((const uint16_t *)s);
*((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
}
}
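/* How the and&add trick above works: for a 15-bit pixel
 * x = 0rrrrrgggggbbbbb, (x & 0x7FFF) + (x & 0x7FE0) adds the R/G field to
 * itself, shifting bits 5..14 up by one while leaving B in place:
 *
 *     0rrrrrgggggbbbbb  ->  rrrrrggggg0bbbbb   (RGB565, new green LSB = 0)
 *
 * Two pixels per 32-bit word can be processed at once because each 16-bit
 * half sums to at most 0x7FFF + 0x7FE0 = 0xFFDF, so no carry crosses into
 * the neighbouring pixel.
 */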
static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register uint32_t x = *((const uint32_t *)s);
*((uint32_t *)d) = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
s += 4;
d += 4;
}
if (s < end) {
register uint16_t x = *((const uint16_t *)s);
*((uint16_t *)d) = ((x >> 1) & 0x7FE0) | (x & 0x001F);
}
}
static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 8);
}
}
static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 8) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 9);
}
}
static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 7) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
#endif
}
}
static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
#endif
}
}
static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
for (; idx < 15; idx += 4) {
register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
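/* The idx bias above is just a compact word loop; on a little-endian host
 * the function is equivalent to this sketch, which swaps bytes 0 and 2 of
 * every 32-bit pixel (BGRA <-> RGBA) and copies the other two:
 *
 *     for (k = 0; k + 3 < src_size; k += 4) {
 *         dst[k + 0] = src[k + 2];
 *         dst[k + 1] = src[k + 1];
 *         dst[k + 2] = src[k + 0];
 *         dst[k + 3] = src[k + 3];
 *     }
 */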
static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
for (i = 0; i < src_size; i += 3) {
register uint8_t x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (unsigned)(vc[0] << 24);
uint64_t l = yc[2] + (uc[1] << 8) +
(yc[3] << 16) + (unsigned)(vc[1] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (yc[0] << 24) + (uc[0] << 16) +
(yc[1] << 8) + (vc[0] << 0);
#else
*idst++ = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (unsigned)(yc[1] << 24);
uint64_t l = uc[1] + (yc[2] << 8) +
(vc[1] << 16) + (unsigned)(yc[3] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (uc[0] << 24) + (yc[0] << 16) +
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12touyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yuy2toyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
udst[i] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 2];
vdst[i] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
ydst[2 * i + 1] = src[4 * i + 2];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
static inline void planar2x_c(const uint8_t *src, uint8_t *dst, int srcWidth,
int srcHeight, int srcStride, int dstStride)
{
int x, y;
dst[0] = src[0];
// first line
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
dst += dstStride;
for (y = 1; y < srcHeight; y++) {
const int mmxSize = 1;
dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
for (x = mmxSize - 1; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride]) >> 2;
dst[2 * x + 2] = (src[x + 1] * 3 + src[x + srcStride]) >> 2;
}
dst[srcWidth * 2 - 1] = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
dst += dstStride * 2;
src += srcStride;
}
// last line
dst[0] = src[0];
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
}
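/* planar2x_c above is a fixed-kernel 2x bilinear upscaler using [3 1]/4
 * and [1 3]/4 taps (mmxSize is apparently a leftover from a removed MMX
 * path); e.g. neighbouring source samples 100 and 200 expand to
 * (3*100 + 200) >> 2 = 125 and (100 + 3*200) >> 2 = 175, i.e. new samples
 * at 1/4 and 3/4 of the way between the originals. */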
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
udst[i] = src[4 * i + 0];
ydst[2 * i + 0] = src[4 * i + 1];
vdst[i] = src[4 * i + 2];
ydst[2 * i + 1] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 3];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
udst[i] = U;
vdst[i] = V;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
ydst += lumStride;
src += srcStride;
if (y+1 == height)
break;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
{
int h;
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dest[2 * w + 0] = src1[w];
dest[2 * w + 1] = src2[w];
}
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
}
static inline void vu9_to_vu12_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
int x, y;
int w = width / 2;
int h = height / 2;
for (y = 0; y < h; y++) {
const uint8_t *s1 = src1 + srcStride1 * (y >> 1);
uint8_t *d = dst1 + dstStride1 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s1[x];
}
for (y = 0; y < h; y++) {
const uint8_t *s2 = src2 + srcStride2 * (y >> 1);
uint8_t *d = dst2 + dstStride2 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s2[x];
}
}
static inline void yvu9_to_yuy2_c(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
int x, y;
int w = width / 2;
int h = height;
for (y = 0; y < h; y++) {
const uint8_t *yp = src1 + srcStride1 * y;
const uint8_t *up = src2 + srcStride2 * (y >> 2);
const uint8_t *vp = src3 + srcStride3 * (y >> 2);
uint8_t *d = dst + dstStride * y;
for (x = 0; x < w; x++) {
const int x2 = x << 2;
d[8 * x + 0] = yp[x2];
d[8 * x + 1] = up[x];
d[8 * x + 2] = yp[x2 + 1];
d[8 * x + 3] = vp[x];
d[8 * x + 4] = yp[x2 + 2];
d[8 * x + 5] = up[x];
d[8 * x + 6] = yp[x2 + 3];
d[8 * x + 7] = vp[x];
}
}
}
static void extract_even_c(const uint8_t *src, uint8_t *dst, int count)
{
dst += count;
src += count * 2;
count = -count;
while (count < 0) {
dst[count] = src[2 * count];
count++;
}
}
static void extract_even2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_even2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void extract_odd2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
src++;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_odd2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
src0++;
src1++;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
if (y & 1) {
extract_odd2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
extract_odd2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
if (y & 1) {
extract_even2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
extract_even2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static av_cold void rgb2rgb_init_c(void)
{
rgb15to16 = rgb15to16_c;
rgb15tobgr24 = rgb15tobgr24_c;
rgb15to32 = rgb15to32_c;
rgb16tobgr24 = rgb16tobgr24_c;
rgb16to32 = rgb16to32_c;
rgb16to15 = rgb16to15_c;
rgb24tobgr16 = rgb24tobgr16_c;
rgb24tobgr15 = rgb24tobgr15_c;
rgb24tobgr32 = rgb24tobgr32_c;
rgb32to16 = rgb32to16_c;
rgb32to15 = rgb32to15_c;
rgb32tobgr24 = rgb32tobgr24_c;
rgb24to15 = rgb24to15_c;
rgb24to16 = rgb24to16_c;
rgb24tobgr24 = rgb24tobgr24_c;
shuffle_bytes_2103 = shuffle_bytes_2103_c;
rgb32tobgr16 = rgb32tobgr16_c;
rgb32tobgr15 = rgb32tobgr15_c;
yv12toyuy2 = yv12toyuy2_c;
yv12touyvy = yv12touyvy_c;
yuv422ptoyuy2 = yuv422ptoyuy2_c;
yuv422ptouyvy = yuv422ptouyvy_c;
yuy2toyv12 = yuy2toyv12_c;
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
interleaveBytes = interleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
yvu9_to_yuy2 = yvu9_to_yuy2_c;
uyvytoyuv420 = uyvytoyuv420_c;
uyvytoyuv422 = uyvytoyuv422_c;
yuyvtoyuv420 = yuyvtoyuv420_c;
yuyvtoyuv422 = yuyvtoyuv422_c;
}

View File

@@ -0,0 +1 @@
VIS-OBJS += sparc/yuv2rgb_vis.o \

View File

@@ -0,0 +1,212 @@
/*
* VIS optimized software YUV to RGB converter
* Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include <stdlib.h>
#include "libavutil/attributes.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#define YUV2RGB_INIT \
"wr %%g0, 0x10, %%gsr \n\t" \
"ldd [%5], %%f32 \n\t" \
"ldd [%5 + 8], %%f34 \n\t" \
"ldd [%5 + 16], %%f36 \n\t" \
"ldd [%5 + 24], %%f38 \n\t" \
"ldd [%5 + 32], %%f40 \n\t" \
"ldd [%5 + 40], %%f42 \n\t" \
"ldd [%5 + 48], %%f44 \n\t" \
"ldd [%5 + 56], %%f46 \n\t" \
"ldd [%5 + 64], %%f48 \n\t" \
"ldd [%5 + 72], %%f50 \n\t"
#define YUV2RGB_KERNEL \
/* ^^^^ f0=Y f3=u f5=v */ \
"fmul8x16 %%f3, %%f48, %%f6 \n\t" \
"fmul8x16 %%f19, %%f48, %%f22 \n\t" \
"fmul8x16 %%f5, %%f44, %%f8 \n\t" \
"fmul8x16 %%f21, %%f44, %%f24 \n\t" \
"fmul8x16 %%f0, %%f42, %%f0 \n\t" \
"fmul8x16 %%f16, %%f42, %%f16 \n\t" \
"fmul8x16 %%f3, %%f50, %%f2 \n\t" \
"fmul8x16 %%f19, %%f50, %%f18 \n\t" \
"fmul8x16 %%f5, %%f46, %%f4 \n\t" \
"fmul8x16 %%f21, %%f46, %%f20 \n\t" \
\
"fpsub16 %%f6, %%f34, %%f6 \n\t" /* 1 */ \
"fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
"fpsub16 %%f8, %%f38, %%f8 \n\t" /* 3 */ \
"fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
"fpsub16 %%f0, %%f32, %%f0 \n\t" /* 0 */ \
"fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
"fpsub16 %%f2, %%f36, %%f2 \n\t" /* 2 */ \
"fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
"fpsub16 %%f4, %%f40, %%f4 \n\t" /* 4 */ \
"fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
\
"fpadd16 %%f0, %%f8, %%f8 \n\t" /* Gt */ \
"fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
"fpadd16 %%f0, %%f4, %%f4 \n\t" /* R */ \
"fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
"fpadd16 %%f0, %%f6, %%f6 \n\t" /* B */ \
"fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
"fpadd16 %%f8, %%f2, %%f2 \n\t" /* G */ \
"fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
\
"fpack16 %%f4, %%f4 \n\t" \
"fpack16 %%f20, %%f20 \n\t" \
"fpack16 %%f6, %%f6 \n\t" \
"fpack16 %%f22, %%f22 \n\t" \
"fpack16 %%f2, %%f2 \n\t" \
"fpack16 %%f18, %%f18 \n\t"
// FIXME: must be changed to set alpha to 255 instead of 0
static int vis_420P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, out1, out2, out3, out4, out5, out6;
for (y = 0; y < srcSliceH; ++y)
__asm__ volatile (
YUV2RGB_INIT
"wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
"1: \n\t"
"ldda [%1] %%asi, %%f2 \n\t"
"ldda [%1 + 2] %%asi, %%f18 \n\t"
"ldda [%2] %%asi, %%f4 \n\t"
"ldda [%2 + 2] %%asi, %%f20 \n\t"
"ld [%0], %%f0 \n\t"
"ld [%0+4], %%f16 \n\t"
"fpmerge %%f3, %%f3, %%f2 \n\t"
"fpmerge %%f19, %%f19, %%f18 \n\t"
"fpmerge %%f5, %%f5, %%f4 \n\t"
"fpmerge %%f21, %%f21, %%f20 \n\t"
YUV2RGB_KERNEL
"fzero %%f0 \n\t"
"fpmerge %%f4, %%f6, %%f8 \n\t" // r, b, t1
"fpmerge %%f20, %%f22, %%f24 \n\t" // r, b, t1
"fpmerge %%f0, %%f2, %%f10 \n\t" // 0, g, t2
"fpmerge %%f0, %%f18, %%f26 \n\t" // 0, g, t2
"fpmerge %%f10, %%f8, %%f4 \n\t" // t2, t1, msb
"fpmerge %%f26, %%f24, %%f20 \n\t" // t2, t1, msb
"fpmerge %%f11, %%f9, %%f6 \n\t" // t2, t1, lsb
"fpmerge %%f27, %%f25, %%f22 \n\t" // t2, t1, lsb
"std %%f4, [%3] \n\t"
"std %%f20, [%3 + 16] \n\t"
"std %%f6, [%3 + 8] \n\t"
"std %%f22, [%3 + 24] \n\t"
"add %0, 8, %0 \n\t"
"add %1, 4, %1 \n\t"
"add %2, 4, %2 \n\t"
"subcc %4, 8, %4 \n\t"
"bne 1b \n\t"
"add %3, 32, %3 \n\t" // delay slot
: "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
: "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + ((y + srcSliceY) >> 1) * srcStride[1]),
"2" (src[2] + ((y + srcSliceY) >> 1) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
"4" (c->dstW),
"5" (c->sparc_coeffs)
);
return srcSliceH;
}
// FIXME: must be changed to set alpha to 255 instead of 0
static int vis_422P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, out1, out2, out3, out4, out5, out6;
for (y = 0; y < srcSliceH; ++y)
__asm__ volatile (
YUV2RGB_INIT
"wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
"1: \n\t"
"ldda [%1] %%asi, %%f2 \n\t"
"ldda [%1 + 2] %%asi, %%f18 \n\t"
"ldda [%2] %%asi, %%f4 \n\t"
"ldda [%2 + 2] %%asi, %%f20 \n\t"
"ld [%0], %%f0 \n\t"
"ld [%0 + 4], %%f16 \n\t"
"fpmerge %%f3, %%f3, %%f2 \n\t"
"fpmerge %%f19, %%f19, %%f18 \n\t"
"fpmerge %%f5, %%f5, %%f4 \n\t"
"fpmerge %%f21, %%f21, %%f20 \n\t"
YUV2RGB_KERNEL
"fzero %%f0 \n\t"
"fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1
"fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1
"fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2
"fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2
"fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb
"fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb
"fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb
"fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb
"std %%f4, [%3] \n\t"
"std %%f20, [%3 + 16] \n\t"
"std %%f6, [%3 + 8] \n\t"
"std %%f22, [%3 + 24] \n\t"
"add %0, 8, %0 \n\t"
"add %1, 4, %1 \n\t"
"add %2, 4, %2 \n\t"
"subcc %4, 8, %4 \n\t"
"bne 1b \n\t"
"add %3, 32, %3 \n\t" //delay slot
: "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
: "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + (y + srcSliceY) * srcStride[1]),
"2" (src[2] + (y + srcSliceY) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
"4" (c->dstW),
"5" (c->sparc_coeffs)
);
return srcSliceH;
}
av_cold SwsFunc ff_yuv2rgb_init_vis(SwsContext *c)
{
c->sparc_coeffs[5] = c->yCoeff;
c->sparc_coeffs[6] = c->vgCoeff;
c->sparc_coeffs[7] = c->vrCoeff;
c->sparc_coeffs[8] = c->ubCoeff;
c->sparc_coeffs[9] = c->ugCoeff;
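    /* Each offset is folded into its coefficient ((offset * coeff) >> 11)
     * and the resulting 16-bit value is replicated into all four 16-bit
     * lanes of a 64-bit word (multiplying by 0x0001000100010001ULL
     * duplicates the low 16 bits), so the VIS kernel can apply it with a
     * single fpsub16. */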
c->sparc_coeffs[0] = (((int16_t)c->yOffset * (int16_t)c->yCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[1] = (((int16_t)c->uOffset * (int16_t)c->ubCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[2] = (((int16_t)c->uOffset * (int16_t)c->ugCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[3] = (((int16_t)c->vOffset * (int16_t)c->vgCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[4] = (((int16_t)c->vOffset * (int16_t)c->vrCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV422P && (c->dstW & 7) == 0) {
av_log(c, AV_LOG_INFO,
"SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
return vis_422P_ARGB32;
} else if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV420P && (c->dstW & 7) == 0) {
av_log(c, AV_LOG_INFO,
"SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
return vis_420P_ARGB32;
}
return NULL;
}

View File

@@ -0,0 +1,37 @@
EXPORTS
sws_addVec
sws_allocVec
sws_alloc_context
sws_cloneVec
sws_context_class DATA
sws_convVec
sws_convertPalette8ToPacked24
sws_convertPalette8ToPacked32
sws_format_name
sws_freeContext
sws_freeFilter
sws_freeVec
sws_getCachedContext
sws_getCoefficients
sws_getColorspaceDetails
sws_getConstVec
sws_getContext
sws_getDefaultFilter
sws_getGaussianVec
sws_getIdentityVec
sws_get_class
sws_init_context
sws_isSupportedEndiannessConversion
sws_isSupportedInput
sws_isSupportedOutput
sws_normalizeVec
sws_printVec2
sws_rgb2rgb_init
sws_scale
sws_scaleVec
sws_setColorspaceDetails
sws_shiftVec
sws_subVec
swscale_configuration
swscale_license
swscale_version

View File

@@ -0,0 +1,37 @@
EXPORTS
sws_addVec @1
sws_allocVec @2
sws_alloc_context @3
sws_cloneVec @4
sws_context_class @5 DATA
sws_convVec @6
sws_convertPalette8ToPacked24 @7
sws_convertPalette8ToPacked32 @8
sws_format_name @9
sws_freeContext @10
sws_freeFilter @11
sws_freeVec @12
sws_getCachedContext @13
sws_getCoefficients @14
sws_getColorspaceDetails @15
sws_getConstVec @16
sws_getContext @17
sws_getDefaultFilter @18
sws_getGaussianVec @19
sws_getIdentityVec @20
sws_get_class @21
sws_init_context @22
sws_isSupportedEndiannessConversion @23
sws_isSupportedInput @24
sws_isSupportedOutput @25
sws_normalizeVec @26
sws_printVec2 @27
sws_rgb2rgb_init @28
sws_scale @29
sws_scaleVec @30
sws_setColorspaceDetails @31
sws_shiftVec @32
sws_subVec @33
swscale_configuration @34
swscale_license @35
swscale_version @36

View File

@@ -0,0 +1,415 @@
/*
* Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdarg.h>
#undef HAVE_AV_CONFIG_H
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/avutil.h"
#include "libavutil/crc.h"
#include "libavutil/pixdesc.h"
#include "libavutil/lfg.h"
#include "swscale.h"
/* HACK Duplicated from swscale_internal.h.
* Should be removed when a cleaner pixel format system exists. */
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_Y400A || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE)
#define hasChroma(x) \
(!(isGray(x) || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE))
#define isALPHA(x) \
((x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_YUVA420P)
static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2, int stride1,
int stride2, int w, int h)
{
int x, y;
uint64_t ssd = 0;
for (y = 0; y < h; y++) {
for (x = 0; x < w; x++) {
int d = src1[x + y * stride1] - src2[x + y * stride2];
ssd += d * d;
}
}
return ssd;
}
struct Results {
uint64_t ssdY;
uint64_t ssdU;
uint64_t ssdV;
uint64_t ssdA;
uint32_t crc;
};
// test by ref -> src -> dst -> out & compare out against ref
// ref & out are YV12
static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat, enum AVPixelFormat dstFormat,
int srcW, int srcH, int dstW, int dstH, int flags,
struct Results *r)
{
const AVPixFmtDescriptor *desc_yuva420p = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
static enum AVPixelFormat cur_srcFormat;
static int cur_srcW, cur_srcH;
static uint8_t *src[4];
static int srcStride[4];
uint8_t *dst[4] = { 0 };
uint8_t *out[4] = { 0 };
int dstStride[4] = {0};
int i;
uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
struct SwsContext *dstContext = NULL, *outContext = NULL;
uint32_t crc = 0;
int res = 0;
if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
struct SwsContext *srcContext = NULL;
int p;
for (p = 0; p < 4; p++)
av_freep(&src[p]);
av_image_fill_linesizes(srcStride, srcFormat, srcW);
for (p = 0; p < 4; p++) {
srcStride[p] = FFALIGN(srcStride[p], 16);
if (srcStride[p])
src[p] = av_mallocz(srcStride[p] * srcH + 16);
if (srcStride[p] && !src[p]) {
perror("Malloc");
res = -1;
goto end;
}
}
srcContext = sws_getContext(w, h, AV_PIX_FMT_YUVA420P, srcW, srcH,
srcFormat, SWS_BILINEAR, NULL, NULL, NULL);
if (!srcContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_yuva420p->name,
desc_src->name);
res = -1;
goto end;
}
sws_scale(srcContext, (const uint8_t * const*)ref, refStride, 0, h, src, srcStride);
sws_freeContext(srcContext);
cur_srcFormat = srcFormat;
cur_srcW = srcW;
cur_srcH = srcH;
}
av_image_fill_linesizes(dstStride, dstFormat, dstW);
for (i = 0; i < 4; i++) {
/* Image buffers passed into libswscale can be allocated any way you
* prefer, as long as they're aligned enough for the architecture, and
* they're freed appropriately (such as using av_free for buffers
* allocated with av_malloc). */
/* An extra 16 bytes is being allocated because some scalers may write
* out of bounds. */
dstStride[i] = FFALIGN(dstStride[i], 16);
if (dstStride[i])
dst[i] = av_mallocz(dstStride[i] * dstH + 16);
if (dstStride[i] && !dst[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
flags, NULL, NULL, NULL);
if (!dstContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_src->name, desc_dst->name);
res = -1;
goto end;
}
printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
desc_src->name, srcW, srcH,
desc_dst->name, dstW, dstH,
flags);
fflush(stdout);
sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
for (i = 0; i < 4 && dstStride[i]; i++)
crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
dstStride[i] * dstH);
if (r && crc == r->crc) {
ssdY = r->ssdY;
ssdU = r->ssdU;
ssdV = r->ssdV;
ssdA = r->ssdA;
} else {
for (i = 0; i < 4; i++) {
refStride[i] = FFALIGN(refStride[i], 16);
if (refStride[i])
out[i] = av_mallocz(refStride[i] * h);
if (refStride[i] && !out[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR,
NULL, NULL, NULL);
if (!outContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_dst->name,
desc_yuva420p->name);
res = -1;
goto end;
}
sws_scale(outContext, (const uint8_t * const*)dst, dstStride, 0, dstH, out, refStride);
ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
//FIXME check that output is really gray
ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
(w + 1) >> 1, (h + 1) >> 1);
ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
(w + 1) >> 1, (h + 1) >> 1);
}
if (isALPHA(srcFormat) && isALPHA(dstFormat))
ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
ssdY /= w * h;
ssdU /= w * h / 4;
ssdV /= w * h / 4;
ssdA /= w * h;
sws_freeContext(outContext);
for (i = 0; i < 4; i++)
if (refStride[i])
av_free(out[i]);
}
printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
crc, ssdY, ssdU, ssdV, ssdA);
end:
sws_freeContext(dstContext);
for (i = 0; i < 4; i++)
if (dstStride[i])
av_free(dst[i]);
return res;
}
static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
SWS_X, SWS_POINT, SWS_AREA, 0 };
const int srcW = w;
const int srcH = h;
const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
enum AVPixelFormat srcFormat, dstFormat;
const AVPixFmtDescriptor *desc_src, *desc_dst;
for (srcFormat = srcFormat_in != AV_PIX_FMT_NONE ? srcFormat_in : 0;
srcFormat < AV_PIX_FMT_NB; srcFormat++) {
if (!sws_isSupportedInput(srcFormat) ||
!sws_isSupportedOutput(srcFormat))
continue;
desc_src = av_pix_fmt_desc_get(srcFormat);
for (dstFormat = dstFormat_in != AV_PIX_FMT_NONE ? dstFormat_in : 0;
dstFormat < AV_PIX_FMT_NB; dstFormat++) {
int i, j, k;
int res = 0;
if (!sws_isSupportedInput(dstFormat) ||
!sws_isSupportedOutput(dstFormat))
continue;
desc_dst = av_pix_fmt_desc_get(dstFormat);
printf("%s -> %s\n", desc_src->name, desc_dst->name);
fflush(stdout);
for (k = 0; flags[k] && !res; k++)
for (i = 0; dstW[i] && !res; i++)
for (j = 0; dstH[j] && !res; j++)
res = doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW[i], dstH[j], flags[k],
NULL);
if (dstFormat_in != AV_PIX_FMT_NONE)
break;
}
if (srcFormat_in != AV_PIX_FMT_NONE)
break;
}
}
static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
char buf[256];
while (fgets(buf, sizeof(buf), fp)) {
struct Results r;
enum AVPixelFormat srcFormat;
char srcStr[12];
int srcW, srcH;
enum AVPixelFormat dstFormat;
char dstStr[12];
int dstW, dstH;
int flags;
int ret;
ret = sscanf(buf,
" %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
" SSD=%"SCNd64 ", %"SCNd64 ", %"SCNd64 ", %"SCNd64 "\n",
srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
&flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
if (ret != 12) {
srcStr[0] = dstStr[0] = 0;
ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
}
srcFormat = av_get_pix_fmt(srcStr);
dstFormat = av_get_pix_fmt(dstStr);
if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
fprintf(stderr, "malformed input file\n");
return -1;
}
if ((srcFormat_in != AV_PIX_FMT_NONE && srcFormat_in != srcFormat) ||
(dstFormat_in != AV_PIX_FMT_NONE && dstFormat_in != dstFormat))
continue;
if (ret != 12) {
printf("%s", buf);
continue;
}
doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW, dstH, flags,
&r);
}
return 0;
}
#define W 96
#define H 96
int main(int argc, char **argv)
{
enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
uint8_t *rgb_data = av_malloc(W * H * 4);
const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
int rgb_stride[4] = { 4 * W, 0, 0, 0 };
uint8_t *data = av_malloc(4 * W * H);
uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
int res = -1;
int i;
FILE *fp = NULL;
if (!rgb_data || !data)
return -1;
for (i = 1; i < argc; i += 2) {
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
fp = fopen(argv[i + 1], "r");
if (!fp) {
fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto error;
}
} else if (!strcmp(argv[i], "-src")) {
srcFormat = av_get_pix_fmt(argv[i + 1]);
if (srcFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
dstFormat = av_get_pix_fmt(argv[i + 1]);
if (dstFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else {
bad_option:
fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
goto error;
}
}
sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
av_lfg_init(&rand, 1);
for (y = 0; y < H; y++)
for (x = 0; x < W * 4; x++)
            rgb_data[x + y * 4 * W] = av_lfg_get(&rand);
sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
sws_freeContext(sws);
av_free(rgb_data);
    if (fp) {
res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
} else {
selfTest(src, stride, W, H, srcFormat, dstFormat);
res = 0;
}
error:
av_free(data);
return res;
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,362 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_SWSCALE_H
#define SWSCALE_SWSCALE_H
/**
* @file
* @ingroup lsws
* external API header
*/
/**
* @defgroup lsws Libswscale
* @{
*/
#include <stdint.h>
#include "libavutil/avutil.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "version.h"
/**
* Return the LIBSWSCALE_VERSION_INT constant.
*/
unsigned swscale_version(void);
/**
* Return the libswscale build-time configuration.
*/
const char *swscale_configuration(void);
/**
* Return the libswscale license.
*/
const char *swscale_license(void);
/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC 4
#define SWS_X 8
#define SWS_POINT 0x10
#define SWS_AREA 0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS 0x80
#define SWS_SINC 0x100
#define SWS_LANCZOS 0x200
#define SWS_SPLINE 0x400
#define SWS_SRC_V_CHR_DROP_MASK 0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16
#define SWS_PARAM_DEFAULT 123456
#define SWS_PRINT_INFO 0x1000
//the following 3 flags are not completely implemented
//internal chrominance subsampling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000
#define SWS_ACCURATE_RND 0x40000
#define SWS_BITEXACT 0x80000
#define SWS_ERROR_DIFFUSION 0x800000
#if FF_API_SWS_CPU_CAPS
/**
* CPU caps are autodetected now, those flags
* are only provided for API compatibility.
*/
#define SWS_CPU_CAPS_MMX 0x80000000
#define SWS_CPU_CAPS_MMXEXT 0x20000000
#define SWS_CPU_CAPS_MMX2 0x20000000
#define SWS_CPU_CAPS_3DNOW 0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
#define SWS_CPU_CAPS_BFIN 0x01000000
#define SWS_CPU_CAPS_SSE2 0x02000000
#endif
#define SWS_MAX_REDUCE_CUTOFF 0.002
#define SWS_CS_ITU709 1
#define SWS_CS_FCC 4
#define SWS_CS_ITU601 5
#define SWS_CS_ITU624 5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT 5
/**
* Return a pointer to yuv<->rgb coefficients for the given colorspace
* suitable for sws_setColorspaceDetails().
*
* @param colorspace One of the SWS_CS_* macros. If invalid,
* SWS_CS_DEFAULT is used.
*/
const int *sws_getCoefficients(int colorspace);
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct SwsVector {
double *coeff; ///< pointer to the list of coefficients
int length; ///< number of coefficients in the vector
} SwsVector;
// vectors can be shared
typedef struct SwsFilter {
SwsVector *lumH;
SwsVector *lumV;
SwsVector *chrH;
SwsVector *chrV;
} SwsFilter;
struct SwsContext;
/**
* Return a positive value if pix_fmt is a supported input format, 0
* otherwise.
*/
int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
/**
* Return a positive value if pix_fmt is a supported output format, 0
* otherwise.
*/
int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
/**
* @param[in] pix_fmt the pixel format
* @return a positive value if an endianness conversion for pix_fmt is
* supported, 0 otherwise.
*/
int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt);
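/**
 * Example (an illustrative sketch only): probing format support before
 * requesting a conversion.
 * @code
 * if (sws_isSupportedInput(AV_PIX_FMT_YUV420P) &&
 *     sws_isSupportedOutput(AV_PIX_FMT_RGB24)) {
 *     // both ends of a YUV420P -> RGB24 conversion are available
 * }
 * @endcode
 */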
/**
* Allocate an empty SwsContext. This must be filled and passed to
* sws_init_context(). For filling see AVOptions, options.c and
* sws_setColorspaceDetails().
*/
struct SwsContext *sws_alloc_context(void);
/**
* Initialize the swscaler context sws_context.
*
* @return zero or positive value on success, a negative value on
* error
*/
int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
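/**
 * Example (a minimal sketch of the AVOptions path; av_opt_set_int()
 * requires libavutil/opt.h, and the option names ("srcw", "srch",
 * "src_format", etc.) are the ones defined in options.c, so verify them
 * against the build in use):
 * @code
 * struct SwsContext *ctx = sws_alloc_context();
 * av_opt_set_int(ctx, "srcw", 640, 0);
 * av_opt_set_int(ctx, "srch", 480, 0);
 * av_opt_set_int(ctx, "src_format", AV_PIX_FMT_YUV420P, 0);
 * av_opt_set_int(ctx, "dstw", 320, 0);
 * av_opt_set_int(ctx, "dsth", 240, 0);
 * av_opt_set_int(ctx, "dst_format", AV_PIX_FMT_RGB24, 0);
 * av_opt_set_int(ctx, "sws_flags", SWS_BILINEAR, 0);
 * if (sws_init_context(ctx, NULL, NULL) < 0)
 *     sws_freeContext(ctx); // init failed; context is unusable
 * @endcode
 */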
/**
* Free the swscaler context swsContext.
* If swsContext is NULL, then does nothing.
*/
void sws_freeContext(struct SwsContext *swsContext);
#if FF_API_SWS_GETCONTEXT
/**
* Allocate and return an SwsContext. You need it to perform
* scaling/conversion operations using sws_scale().
*
* @param srcW the width of the source image
* @param srcH the height of the source image
* @param srcFormat the source image format
* @param dstW the width of the destination image
* @param dstH the height of the destination image
* @param dstFormat the destination image format
* @param flags specify which algorithm and options to use for rescaling
* @return a pointer to an allocated context, or NULL in case of error
* @note this function is to be removed after a saner alternative is
* written
* @deprecated Use sws_getCachedContext() instead.
*/
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
#endif
/**
* Scale the image slice in srcSlice and put the resulting scaled
* slice in the image in dst. A slice is a sequence of consecutive
* rows in an image.
*
* Slices have to be provided in sequential order, either in
* top-bottom or bottom-top order. If slices are provided in
* non-sequential order the behavior of the function is undefined.
*
* @param c the scaling context previously created with
* sws_getContext()
* @param srcSlice the array containing the pointers to the planes of
* the source slice
* @param srcStride the array containing the strides for each plane of
* the source image
* @param srcSliceY the position in the source image of the slice to
* process, that is the number (counted starting from
* zero) in the image of the first row of the slice
* @param srcSliceH the height of the source slice, that is the number
* of rows in the slice
* @param dst the array containing the pointers to the planes of
* the destination image
* @param dstStride the array containing the strides for each plane of
* the destination image
* @return the height of the output slice
*/
int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]);
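/**
 * Example (an illustrative sketch, not normative: the sizes, the formats
 * and the use of av_image_alloc() from libavutil/imgutils.h are
 * assumptions; error checks are omitted for brevity):
 * @code
 * uint8_t *src_data[4], *dst_data[4];
 * int src_linesize[4], dst_linesize[4];
 * struct SwsContext *ctx;
 *
 * av_image_alloc(src_data, src_linesize, 640, 480, AV_PIX_FMT_BGR24, 16);
 * av_image_alloc(dst_data, dst_linesize, 320, 240, AV_PIX_FMT_YUV420P, 16);
 * ctx = sws_getContext(640, 480, AV_PIX_FMT_BGR24,
 *                      320, 240, AV_PIX_FMT_YUV420P,
 *                      SWS_BILINEAR, NULL, NULL, NULL);
 * if (ctx) {
 *     // fill src_data[0] with 640x480 BGR24 pixels, then:
 *     sws_scale(ctx, (const uint8_t * const *)src_data, src_linesize,
 *               0, 480, dst_data, dst_linesize);
 *     sws_freeContext(ctx);
 * }
 * av_freep(&src_data[0]);
 * av_freep(&dst_data[0]);
 * @endcode
 */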
/**
 * @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
 * @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
* @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
* @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
* @param brightness 16.16 fixed point brightness correction
* @param contrast 16.16 fixed point contrast correction
* @param saturation 16.16 fixed point saturation correction
* @return -1 if not supported
*/
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
int srcRange, const int table[4], int dstRange,
int brightness, int contrast, int saturation);
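/**
 * Example (sketch; assumes "ctx" is an already created context): switch to
 * BT.709 coefficients with limited-range input and full-range output. The
 * 1 << 16 values are the 16.16 fixed-point identity for contrast and
 * saturation.
 * @code
 * const int *coefs = sws_getCoefficients(SWS_CS_ITU709);
 * if (sws_setColorspaceDetails(ctx, coefs, 0, // limited-range input
 *                              coefs, 1,      // full-range output
 *                              0, 1 << 16, 1 << 16) < 0) {
 *     // conversion not supported for this context
 * }
 * @endcode
 */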
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
int *srcRange, int **table, int *dstRange,
int *brightness, int *contrast, int *saturation);
/**
* Allocate and return an uninitialized vector with length coefficients.
*/
SwsVector *sws_allocVec(int length);
/**
 * Return a normalized Gaussian curve used for filtering;
 * quality = 3 is high quality, lower is lower quality.
*/
SwsVector *sws_getGaussianVec(double variance, double quality);
/**
* Allocate and return a vector with length coefficients, all
* with the same value c.
*/
SwsVector *sws_getConstVec(double c, int length);
/**
* Allocate and return a vector with just one coefficient, with
* value 1.0.
*/
SwsVector *sws_getIdentityVec(void);
/**
* Scale all the coefficients of a by the scalar value.
*/
void sws_scaleVec(SwsVector *a, double scalar);
/**
* Scale all the coefficients of a so that their sum equals height.
*/
void sws_normalizeVec(SwsVector *a, double height);
void sws_convVec(SwsVector *a, SwsVector *b);
void sws_addVec(SwsVector *a, SwsVector *b);
void sws_subVec(SwsVector *a, SwsVector *b);
void sws_shiftVec(SwsVector *a, int shift);
/**
* Allocate and return a clone of the vector a, that is a vector
* with the same coefficients as a.
*/
SwsVector *sws_cloneVec(SwsVector *a);
/**
* Print with av_log() a textual representation of the vector a
* if log_level <= av_log_level.
*/
void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
void sws_freeVec(SwsVector *a);
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose);
void sws_freeFilter(SwsFilter *filter);
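/**
 * Example (sketch; "w", "h" and "fmt" are placeholders): Gaussian blur on
 * the luma plane only. One vector may be shared between filter slots
 * (vectors can be shared, coefficient arrays cannot).
 * @code
 * SwsVector *g = sws_getGaussianVec(2.0, 3.0); // variance 2.0, quality 3
 * SwsFilter f = { g, g, NULL, NULL };          // lumH, lumV, chrH, chrV
 * struct SwsContext *ctx = sws_getContext(w, h, fmt, w, h, fmt,
 *                                         SWS_BICUBIC, &f, NULL, NULL);
 * // ... scale with ctx ...
 * sws_freeContext(ctx);
 * sws_freeVec(g);
 * @endcode
 */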
/**
* Check if context can be reused, otherwise reallocate a new one.
*
* If context is NULL, just calls sws_getContext() to get a new
* context. Otherwise, checks if the parameters are the ones already
* saved in context. If that is the case, returns the current
* context. Otherwise, frees context and gets a new context with
* the new parameters.
*
* Be warned that srcFilter and dstFilter are not checked, they
* are assumed to remain the same.
*/
struct SwsContext *sws_getCachedContext(struct SwsContext *context,
int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
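/**
 * Example (sketch; "frames", its fields and the out_* variables are
 * hypothetical): reusing one context across frames whose dimensions may
 * change mid-stream.
 * @code
 * struct SwsContext *ctx = NULL;
 * int i;
 * for (i = 0; i < nb_frames; i++) {
 *     ctx = sws_getCachedContext(ctx,
 *                                frames[i].w, frames[i].h, frames[i].fmt,
 *                                out_w, out_h, AV_PIX_FMT_YUV420P,
 *                                SWS_BILINEAR, NULL, NULL, NULL);
 *     if (!ctx)
 *         break;
 *     sws_scale(ctx, frames[i].data, frames[i].linesize,
 *               0, frames[i].h, out_data, out_linesize);
 * }
 * sws_freeContext(ctx);
 * @endcode
 */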
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
*
* The output frame will have the same packed format as the palette.
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
*
* With the palette format "ABCD", the destination frame ends up with the format "ABC".
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
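/**
 * Example (sketch; "src", "dst32" and "num_pixels" are placeholders, and
 * taking the palette from AVFrame.data[1] is an assumption about the
 * decoder's PAL8 output):
 * @code
 * uint8_t pal[256 * 4];
 * memcpy(pal, frame->data[1], sizeof(pal)); // 256 entries, 4 bytes each
 * sws_convertPalette8ToPacked32(src, dst32, num_pixels, pal);
 * @endcode
 */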
/**
* Get the AVClass for swsContext. It can be used in combination with
* AV_OPT_SEARCH_FAKE_OBJ for examining options.
*
* @see av_opt_find().
*/
const AVClass *sws_get_class(void);
/**
* @}
*/
#endif /* SWSCALE_SWSCALE_H */

View File

@@ -0,0 +1,875 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_SWSCALE_INTERNAL_H
#define SWSCALE_SWSCALE_INTERNAL_H
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
#define YUVRGB_TABLE_HEADROOM 128
#define MAX_FILTER_SIZE 256
#define DITHER1XBPP
#if HAVE_BIGENDIAN
#define ALT32_CORR (-1)
#else
#define ALT32_CORR 1
#endif
#if ARCH_X86_64
# define APCK_PTR2 8
# define APCK_COEF 16
# define APCK_SIZE 24
#else
# define APCK_PTR2 4
# define APCK_COEF 8
# define APCK_SIZE 16
#endif
struct SwsContext;
typedef enum SwsDither {
SWS_DITHER_NONE = 0,
SWS_DITHER_AUTO,
SWS_DITHER_BAYER,
SWS_DITHER_ED,
NB_SWS_DITHER,
} SwsDither;
typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]);
/**
* Write one line of horizontally scaled data to planar output
* without any additional vertical scaling (or point-scaling).
*
* @param src scaled source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of destination in pixels
 * @param dither ordered dither array of type uint8_t and size 8
* @param offset Dither offset
*/
typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
/**
* Write one line of horizontally scaled data to planar output
* with multi-point vertical scaling between input pixels.
*
* @param filter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param src scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param filterSize number of vertical input lines to scale
* @param dest pointer to output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of destination pixels
* @param offset Dither offset
*/
typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
/**
* Write one line of horizontally scaled chroma to interleaved output
* with multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param dest pointer to the output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of chroma planes
*/
typedef void (*yuv2interleavedX_fn)(struct SwsContext *c,
const int16_t *chrFilter,
int chrFilterSize,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
uint8_t *dest, int dstW);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output without any additional vertical scaling (or point-scaling). Note
* that this function may do chroma scaling, see the "uvalpha" argument.
*
* @param c SWS scaling context
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param uvalpha chroma scaling coefficient for the second line of chroma
* pixels, either 2048 or 0. If 0, one chroma input is used
 *                for 2 output pixels (or if the SWS_FULL_CHR_H_INT flag
 *                is set, it generates 1 output pixel). If 2048, two chroma
 *                input pixels should be averaged for 2 output pixels (this
 *                only happens if SWS_FULL_CHR_H_INT is not set)
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packed1_fn)(struct SwsContext *c, const int16_t *lumSrc,
const int16_t *chrUSrc[2],
const int16_t *chrVSrc[2],
const int16_t *alpSrc, uint8_t *dest,
int dstW, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing bilinear scaling between two input lines.
*
* @param c SWS scaling context
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param yalpha luma/alpha scaling coefficients for the second input line.
* The first line's coefficients can be calculated by using
* 4096 - yalpha
* @param uvalpha chroma scaling coefficient for the second input line. The
* first line's coefficients can be calculated by using
* 4096 - uvalpha
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packed2_fn)(struct SwsContext *c, const int16_t *lumSrc[2],
const int16_t *chrUSrc[2],
const int16_t *chrVSrc[2],
const int16_t *alpSrc[2],
uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param lumFilterSize number of vertical luma/alpha input lines to scale
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
 *                  for some output formats.
*/
typedef void (*yuv2packedX_fn)(struct SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest,
int dstW, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param lumFilterSize number of vertical luma/alpha input lines to scale
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output planes. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
 *                  for some output formats.
*/
typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t **dest,
int dstW, int y);
/* This struct should be aligned on at least a 32-byte boundary. */
typedef struct SwsContext {
/**
* info on struct for av_log
*/
const AVClass *av_class;
/**
* Note that src, dst, srcStride, dstStride will be copied in the
* sws_scale() wrapper so they can be freely modified here.
*/
SwsFunc swscale;
int srcW; ///< Width of source luma/alpha planes.
int srcH; ///< Height of source luma/alpha planes.
int dstH; ///< Height of destination luma/alpha planes.
int chrSrcW; ///< Width of source chroma planes.
int chrSrcH; ///< Height of source chroma planes.
int chrDstW; ///< Width of destination chroma planes.
int chrDstH; ///< Height of destination chroma planes.
int lumXInc, chrXInc;
int lumYInc, chrYInc;
enum AVPixelFormat dstFormat; ///< Destination pixel format.
enum AVPixelFormat srcFormat; ///< Source pixel format.
int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.
int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.
int dstBpc, srcBpc;
int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.
int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.
int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.
int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
double param[2]; ///< Input parameters for scaling algorithms that need them.
uint32_t pal_yuv[256];
uint32_t pal_rgb[256];
/**
* @name Scaled horizontal lines ring buffer.
* The horizontal scaler keeps just enough scaled lines in a ring buffer
* so they may be passed to the vertical scaler. The pointers to the
* allocated buffers for each line are duplicated in sequence in the ring
* buffer to simplify indexing and avoid wrapping around between lines
* inside the vertical scaler code. The wrapping is done before the
* vertical scaler is called.
*/
//@{
int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
//@}
uint8_t *formatConvBuffer;
/**
* @name Horizontal and vertical filters.
* To better understand the following fields, here is a pseudo-code of
* their usage in filtering a horizontal line:
* @code
* for (i = 0; i < width; i++) {
* dst[i] = 0;
* for (j = 0; j < filterSize; j++)
* dst[i] += src[ filterPos[i] + j ] * filter[ filterSize * i + j ];
* dst[i] >>= FRAC_BITS; // The actual implementation is fixed-point.
* }
* @endcode
*/
//@{
int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.
int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
int vChrFilterSize; ///< Vertical filter size for chroma pixels.
//@}
int lumMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for luma/alpha planes.
int chrMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for chroma planes.
uint8_t *lumMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for luma/alpha planes.
uint8_t *chrMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for chroma planes.
int canMMXEXTBeUsed;
int dstY; ///< Last destination vertical line output from last slice.
int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
    void *yuvTable; // pointer to the yuv->rgb table start so that it can be freed
uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
    DECLARE_ALIGNED(16, int32_t, input_rgb2yuv_table)[16+40*4]; // This table can contain both C and SIMD formatted values, the C values are always at the XY_IDX points
#define RY_IDX 0
#define GY_IDX 1
#define BY_IDX 2
#define RU_IDX 3
#define GU_IDX 4
#define BU_IDX 5
#define RV_IDX 6
#define GV_IDX 7
#define BV_IDX 8
#define RGB2YUV_SHIFT 15
int *dither_error[4];
//Colorspace stuff
int contrast, brightness, saturation; // for sws_getColorspaceDetails
int srcColorspaceTable[4];
int dstColorspaceTable[4];
int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).
int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
int src0Alpha;
int dst0Alpha;
int srcXYZ;
int dstXYZ;
int src_h_chr_pos;
int dst_h_chr_pos;
int src_v_chr_pos;
int dst_v_chr_pos;
int yuv2rgb_y_offset;
int yuv2rgb_y_coeff;
int yuv2rgb_v2r_coeff;
int yuv2rgb_v2g_coeff;
int yuv2rgb_u2g_coeff;
int yuv2rgb_u2b_coeff;
#define RED_DITHER "0*8"
#define GREEN_DITHER "1*8"
#define BLUE_DITHER "2*8"
#define Y_COEFF "3*8"
#define VR_COEFF "4*8"
#define UB_COEFF "5*8"
#define VG_COEFF "6*8"
#define UG_COEFF "7*8"
#define Y_OFFSET "8*8"
#define U_OFFSET "9*8"
#define V_OFFSET "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
#define ESP_OFFSET "11*8+4*4*256*2+8"
#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
#define U_TEMP "11*8+4*4*256*2+24"
#define V_TEMP "11*8+4*4*256*2+32"
#define Y_TEMP "11*8+4*4*256*2+40"
#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
#define UV_OFF_PX "11*8+4*4*256*3+48"
#define UV_OFF_BYTE "11*8+4*4*256*3+56"
#define DITHER16 "11*8+4*4*256*3+64"
#define DITHER32 "11*8+4*4*256*3+80"
DECLARE_ALIGNED(8, uint64_t, redDither);
DECLARE_ALIGNED(8, uint64_t, greenDither);
DECLARE_ALIGNED(8, uint64_t, blueDither);
DECLARE_ALIGNED(8, uint64_t, yCoeff);
DECLARE_ALIGNED(8, uint64_t, vrCoeff);
DECLARE_ALIGNED(8, uint64_t, ubCoeff);
DECLARE_ALIGNED(8, uint64_t, vgCoeff);
DECLARE_ALIGNED(8, uint64_t, ugCoeff);
DECLARE_ALIGNED(8, uint64_t, yOffset);
DECLARE_ALIGNED(8, uint64_t, uOffset);
DECLARE_ALIGNED(8, uint64_t, vOffset);
int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
int dstW; ///< Width of destination luma/alpha planes.
DECLARE_ALIGNED(8, uint64_t, esp);
DECLARE_ALIGNED(8, uint64_t, vRounder);
DECLARE_ALIGNED(8, uint64_t, u_temp);
DECLARE_ALIGNED(8, uint64_t, v_temp);
DECLARE_ALIGNED(8, uint64_t, y_temp);
int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
    // alignment of these values is not necessary, but merely here
    // to maintain the same offset across x86-32 and x86-64. Once we
    // use proper offset macros in the asm, they can be removed.
DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
DECLARE_ALIGNED(8, uint16_t, dither16)[8];
DECLARE_ALIGNED(8, uint32_t, dither32)[8];
const uint8_t *chrDither8, *lumDither8;
#if HAVE_ALTIVEC
vector signed short CY;
vector signed short CRV;
vector signed short CBU;
vector signed short CGU;
vector signed short CGV;
vector signed short OY;
vector unsigned short CSHIFT;
vector signed short *vYCoeffsBank, *vCCoeffsBank;
#endif
#if ARCH_BFIN
DECLARE_ALIGNED(4, uint32_t, oy);
DECLARE_ALIGNED(4, uint32_t, oc);
DECLARE_ALIGNED(4, uint32_t, zero);
DECLARE_ALIGNED(4, uint32_t, cy);
DECLARE_ALIGNED(4, uint32_t, crv);
DECLARE_ALIGNED(4, uint32_t, rmask);
DECLARE_ALIGNED(4, uint32_t, cbu);
DECLARE_ALIGNED(4, uint32_t, bmask);
DECLARE_ALIGNED(4, uint32_t, cgu);
DECLARE_ALIGNED(4, uint32_t, cgv);
DECLARE_ALIGNED(4, uint32_t, gmask);
#endif
#if HAVE_VIS
DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
#endif
int use_mmx_vfilter;
/* pre defined color-spaces gamma */
#define XYZ_GAMMA (2.6f)
#define RGB_GAMMA (2.2f)
int16_t *xyzgamma;
int16_t *rgbgamma;
int16_t *xyzgammainv;
int16_t *rgbgammainv;
int16_t xyz2rgb_matrix[3][4];
int16_t rgb2xyz_matrix[3][4];
/* function pointers for swscale() */
yuv2planar1_fn yuv2plane1;
yuv2planarX_fn yuv2planeX;
yuv2interleavedX_fn yuv2nv12cX;
yuv2packed1_fn yuv2packed1;
yuv2packed2_fn yuv2packed2;
yuv2packedX_fn yuv2packedX;
yuv2anyX_fn yuv2anyX;
/// Unscaled conversion of luma plane to YV12 for horizontal scaler.
void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/**
* Functions to read planar input, such as planar RGB, and convert
* internally to Y/UV/A.
*/
/** @{ */
void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv);
void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
/** @} */
/**
* Scale one horizontal line of input data using a bilinear filter
 * to produce one line of output data. Compared to SwsContext->hyScale()/hcScale(),
* please take note of the following caveats when using these:
* - Scaling is done using only 7bit instead of 14bit coefficients.
* - You can use no more than 5 input pixels to produce 4 output
* pixels. Therefore, this filter should not be used for downscaling
* by more than ~20% in width (because that equals more than 5/4th
* downscaling and thus more than 5 pixels input per 4 pixels output).
* - In general, bilinear filters create artifacts during downscaling
* (even when <20%), because one output pixel will span more than one
* input pixel, and thus some pixels will need edges of both neighbor
* pixels to interpolate the output pixel. Since you can use at most
* two input pixels per output pixel in bilinear scaling, this is
* impossible and thus downscaling by any size will create artifacts.
* To enable this type of scaling, set SWS_FAST_BILINEAR
* in SwsContext->flags.
*/
/** @{ */
void (*hyscale_fast)(struct SwsContext *c,
int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc);
void (*hcscale_fast)(struct SwsContext *c,
int16_t *dst1, int16_t *dst2, int dstWidth,
const uint8_t *src1, const uint8_t *src2,
int srcW, int xInc);
/** @} */
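/* For orientation: the C fallback for hyscale_fast() is essentially the
 * following 16.16 fixed-point loop with 7-bit interpolation weights
 * (a sketch of the reference behaviour, not part of this header's API):
 *
 *     for (i = 0; i < dstWidth; i++) {
 *         unsigned xx     = xpos >> 16;           // integer source index
 *         unsigned xalpha = (xpos & 0xFFFF) >> 9; // 7-bit fractional weight
 *         dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
 *         xpos  += xInc;                          // 16.16 step ~ srcW/dstW
 *     }
 */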
/**
* Scale one horizontal line of input data using a filter over the input
* lines, to produce one (differently sized) line of output data.
*
* @param dst pointer to destination buffer for horizontally scaled
* data. If the number of bits per component of one
* destination pixel (SwsContext->dstBpc) is <= 10, data
* will be 15bpc in 16bits (int16_t) width. Else (i.e.
* SwsContext->dstBpc == 16), data will be 19bpc in
* 32bits (int32_t) width.
* @param dstW width of destination image
* @param src pointer to source data to be scaled. If the number of
* bits per component of a source pixel (SwsContext->srcBpc)
* is 8, this is 8bpc in 8bits (uint8_t) width. Else
* (i.e. SwsContext->srcBpc > 8), this is native depth
* in 16bits (uint16_t) width. In other words, for 9-bit
* YUV input, this is 9bpc, for 10-bit YUV input, this is
* 10bpc, and for 16-bit RGB or YUV, this is 16bpc.
* @param filter filter coefficients to be used per output pixel for
* scaling. This contains 14bpp filtering coefficients.
* Guaranteed to contain dstW * filterSize entries.
* @param filterPos position of the first input pixel to be used for
* each output pixel during scaling. Guaranteed to
* contain dstW entries.
* @param filterSize the number of input coefficients to be used (and
* thus the number of input pixels to be used) for
* creating a single output pixel. Is aligned to 4
* (and input coefficients thus padded with zeroes)
* to simplify creating SIMD code.
*/
/** @{ */
void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
/** @} */
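/* Reference semantics in scalar C for the 8bpc-in, 15bpc-out case described
 * above (a sketch of the C fallback; per-arch implementations override it):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int j, val = 0;
 *         for (j = 0; j < filterSize; j++)
 *             val += (int)src[filterPos[i] + j] * filter[filterSize * i + j];
 *         dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // 14-bit coeffs -> 15bpc
 *     }
 */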
/// Color range conversion function for luma plane if needed.
void (*lumConvertRange)(int16_t *dst, int width);
/// Color range conversion function for chroma planes if needed.
void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
int needs_hcscale; ///< Set if there are chroma planes to be converted.
SwsDither dither;
} SwsContext;
//FIXME check init (where 0)
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation);
void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
int brightness, int contrast, int saturation);
void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_vis(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c);
#if FF_API_SWS_FORMAT_NAME
/**
* @deprecated Use av_get_pix_fmt_name() instead.
*/
attribute_deprecated
const char *sws_format_name(enum AVPixelFormat format);
#endif
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->comp[0].depth_minus1 == 15;
}
static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->comp[0].depth_minus1 >= 8 && desc->comp[0].depth_minus1 <= 13;
}
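/* Note: despite the name, this matches any component depth from 9 to 14
 * bits, hence the isNBPS() alias below. */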
#define isNBPS(x) is9_OR_10BPS(x)
static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->flags & AV_PIX_FMT_FLAG_BE;
}
static av_always_inline int isYUV(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
}
static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & AV_PIX_FMT_FLAG_PLANAR) && isYUV(pix_fmt));
}
static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->flags & AV_PIX_FMT_FLAG_RGB);
}
#if 0 // FIXME
#define isGray(x) \
(!(av_pix_fmt_desc_get(x)->flags & AV_PIX_FMT_FLAG_PAL) && \
av_pix_fmt_desc_get(x)->nb_components <= 2)
#else
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_Y400A || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE)
#endif
#define isRGBinInt(x) \
( \
(x) == AV_PIX_FMT_RGB48BE || \
(x) == AV_PIX_FMT_RGB48LE || \
(x) == AV_PIX_FMT_RGBA64BE || \
(x) == AV_PIX_FMT_RGBA64LE || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_RGB24 || \
(x) == AV_PIX_FMT_RGB565BE || \
(x) == AV_PIX_FMT_RGB565LE || \
(x) == AV_PIX_FMT_RGB555BE || \
(x) == AV_PIX_FMT_RGB555LE || \
(x) == AV_PIX_FMT_RGB444BE || \
(x) == AV_PIX_FMT_RGB444LE || \
(x) == AV_PIX_FMT_RGB8 || \
(x) == AV_PIX_FMT_RGB4 || \
(x) == AV_PIX_FMT_RGB4_BYTE || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE \
)
#define isBGRinInt(x) \
( \
(x) == AV_PIX_FMT_BGR48BE || \
(x) == AV_PIX_FMT_BGR48LE || \
(x) == AV_PIX_FMT_BGRA64BE || \
(x) == AV_PIX_FMT_BGRA64LE || \
(x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_BGR24 || \
(x) == AV_PIX_FMT_BGR565BE || \
(x) == AV_PIX_FMT_BGR565LE || \
(x) == AV_PIX_FMT_BGR555BE || \
(x) == AV_PIX_FMT_BGR555LE || \
(x) == AV_PIX_FMT_BGR444BE || \
(x) == AV_PIX_FMT_BGR444LE || \
(x) == AV_PIX_FMT_BGR8 || \
(x) == AV_PIX_FMT_BGR4 || \
(x) == AV_PIX_FMT_BGR4_BYTE || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE \
)
#define isRGBinBytes(x) ( \
(x) == AV_PIX_FMT_RGB48BE \
|| (x) == AV_PIX_FMT_RGB48LE \
|| (x) == AV_PIX_FMT_RGBA64BE \
|| (x) == AV_PIX_FMT_RGBA64LE \
|| (x) == AV_PIX_FMT_RGBA \
|| (x) == AV_PIX_FMT_ARGB \
|| (x) == AV_PIX_FMT_RGB24 \
)
#define isBGRinBytes(x) ( \
(x) == AV_PIX_FMT_BGR48BE \
|| (x) == AV_PIX_FMT_BGR48LE \
|| (x) == AV_PIX_FMT_BGRA64BE \
|| (x) == AV_PIX_FMT_BGRA64LE \
|| (x) == AV_PIX_FMT_BGRA \
|| (x) == AV_PIX_FMT_ABGR \
|| (x) == AV_PIX_FMT_BGR24 \
)
#define isAnyRGB(x) \
( \
isRGBinInt(x) || \
isBGRinInt(x) || \
isRGB(x) \
)
static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
if (pix_fmt == AV_PIX_FMT_PAL8)
return 1;
return desc->flags & AV_PIX_FMT_FLAG_ALPHA;
}
#if 1
#define isPacked(x) ( \
(x)==AV_PIX_FMT_PAL8 \
|| (x)==AV_PIX_FMT_YUYV422 \
|| (x)==AV_PIX_FMT_UYVY422 \
|| (x)==AV_PIX_FMT_Y400A \
|| isRGBinInt(x) \
|| isBGRinInt(x) \
)
#else
static av_always_inline int isPacked(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->nb_components >= 2 && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR)) ||
pix_fmt == AV_PIX_FMT_PAL8);
}
#endif
static av_always_inline int isPlanar(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->nb_components >= 2 && (desc->flags & AV_PIX_FMT_FLAG_PLANAR));
}
static av_always_inline int isPackedRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) == AV_PIX_FMT_FLAG_RGB);
}
static av_always_inline int isPlanarRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) ==
(AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB));
}
static av_always_inline int usePal(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->flags & AV_PIX_FMT_FLAG_PAL) || (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL);
}
extern const uint64_t ff_dither4[2];
extern const uint64_t ff_dither8[2];
extern const uint8_t ff_dither_2x2_4[3][8];
extern const uint8_t ff_dither_2x2_8[3][8];
extern const uint8_t ff_dither_4x4_16[5][8];
extern const uint8_t ff_dither_8x8_32[9][8];
extern const uint8_t ff_dither_8x8_73[9][8];
extern const uint8_t ff_dither_8x8_128[9][8];
extern const uint8_t ff_dither_8x8_220[9][8];
extern const int32_t ff_yuv2rgb_coeffs[8][4];
extern const AVClass sws_context_class;
/**
* Set c->swscale to an unscaled converter if one exists for the specific
* source and destination formats, bit depths, flags, etc.
*/
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_bfin(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
/**
* Return function pointer to fastest main scaler path function depending
* on architecture and available optimizations.
*/
SwsFunc ff_getSwsFunc(SwsContext *c);
void ff_sws_init_input_funcs(SwsContext *c);
void ff_sws_init_output_funcs(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
void ff_sws_init_swscale_ppc(SwsContext *c);
void ff_sws_init_swscale_x86(SwsContext *c);
static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
int alpha, int bits, const int big_endian)
{
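/* 'bits' is the component depth minus one: alpha planes are filled with
 * the per-depth maximum ((1 << (bits + 1)) - 1), other planes with the
 * neutral mid-range value (1 << bits). */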
int i, j;
uint8_t *ptr = plane + stride * y;
int v = alpha ? 0xFFFF>>(15-bits) : (1<<bits);
for (i = 0; i < height; i++) {
#define FILL(wfunc) \
for (j = 0; j < width; j++) {\
wfunc(ptr+2*j, v);\
}
if (big_endian) {
FILL(AV_WB16);
} else {
FILL(AV_WL16);
}
ptr += stride;
}
}
#endif /* SWSCALE_SWSCALE_INTERNAL_H */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,59 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_VERSION_H
#define SWSCALE_VERSION_H
/**
* @file
* swscale version macros
*/
#include "libavutil/avutil.h"
#define LIBSWSCALE_VERSION_MAJOR 2
#define LIBSWSCALE_VERSION_MINOR 5
#define LIBSWSCALE_VERSION_MICRO 101
#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
#ifndef FF_API_SWS_GETCONTEXT
#define FF_API_SWS_GETCONTEXT (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#ifndef FF_API_SWS_CPU_CAPS
#define FF_API_SWS_CPU_CAPS (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#ifndef FF_API_SWS_FORMAT_NAME
#define FF_API_SWS_FORMAT_NAME (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#endif /* SWSCALE_VERSION_H */
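/* AV_VERSION_INT packs the three components into one comparable integer,
 * (major << 16) | (minor << 8) | micro, so for this header:
 *
 *     (2 << 16) | (5 << 8) | 101 == 0x020565
 *
 * which lets version checks reduce to plain integer comparisons, e.g.
 * #if LIBSWSCALE_VERSION_INT >= AV_VERSION_INT(2, 5, 0). */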

View File

@@ -0,0 +1,11 @@
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
YASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \

View File

@@ -0,0 +1,696 @@
;******************************************************************************
;* x86-optimized input routines: shuffle packed YUV formats into
;* individual planes, and convert RGB into YUV planes as well.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4
rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
; rgba_Ycoeff_rb: times 4 dw RY, BY
; rgba_Ycoeff_br: times 4 dw BY, RY
; rgba_Ycoeff_ga: times 4 dw GY, 0
; rgba_Ycoeff_ag: times 4 dw 0, GY
; rgba_Ucoeff_rb: times 4 dw RU, BU
; rgba_Ucoeff_br: times 4 dw BU, RU
; rgba_Ucoeff_ga: times 4 dw GU, 0
; rgba_Ucoeff_ag: times 4 dw 0, GU
; rgba_Vcoeff_rb: times 4 dw RV, BV
; rgba_Vcoeff_br: times 4 dw BV, RV
; rgba_Vcoeff_ga: times 4 dw GV, 0
; rgba_Vcoeff_ag: times 4 dw 0, GV
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
SECTION .text
;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
pxor m7, m7
%endif ; !cpuflag(ssse3)
mova m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
%endmacro
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
mova m10, [%2_Vcoeff_12x4]
mova m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
mova m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
pxor m7, m7
%endif
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
psrad m0, 9
psrad m2, 9
psrad m1, 9
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB24_FUNCS 2
RGB24_TO_Y_FN %1, rgb
RGB24_TO_Y_FN %1, bgr, rgb
RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
%else ; %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, wd
%endif
lea srcq, [srcq+wq*4]
add wq, wq
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
paddd m0, m4 ; += rgb_Yrnd
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; %0 == 6
%endmacro
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
mova m10, [rgba_Vcoeff_%2%4]
mova m11, [rgba_Vcoeff_%3%5]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [rgba_Ucoeff_%2%4]
%define coeffU2 [rgba_Ucoeff_%3%5]
%define coeffV1 [rgba_Vcoeff_%2%4]
%define coeffV2 [rgba_Vcoeff_%3%5]
%endif ; x86-64/32
%if ARCH_X86_64 && %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
psrad m2, 9
psrad m4, 9
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 6
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB32_FUNCS 2
RGB32_TO_Y_FN %1, r, g, b, a
RGB32_TO_Y_FN %1, b, g, r, a, rgba
RGB32_TO_Y_FN %1, a, r, g, b, rgba
RGB32_TO_Y_FN %1, a, b, g, r, rgba
RGB32_TO_UV_FN %2, r, g, b, a
RGB32_TO_UV_FN %2, b, g, r, a, rgba
RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB32_FUNCS 8, 12
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
mova [dstq+wq], m0
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
add dstq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_Y a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V0, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
%else ; mmsize == 8
packuswb m1, m1 ; (byte) { U0, ... U3 }
packuswb m0, m0 ; (byte) { V0, ... V3 }
movh [dstUq+wq], m1
movh [dstVq+wq], m0
%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16 && %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
packuswb m0, m1 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
mova [dstUq+wq], m2
mova [dstVq+wq], m0
%else ; nv21
mova [dstVq+wq], m2
mova [dstUq+wq], m0
%endif ; nv12/21
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%endif
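; For reference, the 24ToY kernels above mirror this scalar C loop (a sketch;
; the function name is illustrative, but the constants are the RY/GY/BY words
; and the rgb_Yrnd bias from SECTION_RODATA, with the same >>9 at the end):
;
;     static void rgb24_to_y_ref(int16_t *dst, const uint8_t *src, int w)
;     {
;         for (int i = 0; i < w; i++) {
;             int r = src[3 * i + 0], g = src[3 * i + 1], b = src[3 * i + 2];
;             dst[i] = (0x20DE * r + 0x4087 * g + 0x0C88 * b + 0x80100) >> 9;
;         }
;     }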

View File

@@ -0,0 +1,413 @@
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
SECTION .text
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
; const int16_t **src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
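; Scalar reference for the 8-bit yuv2planeX case (a sketch of the C
; fallback; av_clip_uint8() is the usual libavutil helper):
;
;     for (i = 0; i < dstW; i++) {
;         int val = dither[(i + offset) & 7] << 12;
;         for (j = 0; j < filterSize; j++)
;             val += src[j][i] * filter[j];
;         dest[i] = av_clip_uint8(val >> 19);
;     }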
%macro yuv2planeX_fn 3
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32
; create registers holding dither
movq m_dith, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
punpcklbw m_dith, m6
%if ARCH_X86_64
punpcklwd m8, m_dith, m6
pslld m8, 12
%else ; x86-32
punpcklwd m5, m_dith, m6
pslld m5, 12
%endif ; x86-32/64
punpckhwd m_dith, m6
pslld m_dith, 12
%if ARCH_X86_32
mova [rsp+ 0], m5
mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
punpcklbw m5, m_dith, m6
punpckhbw m_dith, m6
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpcklwd m3, m_dith, m6
punpckhwd m_dith, m6
pslld m4, 12
pslld m5, 12
pslld m3, 12
pslld m_dith, 12
mova [rsp+ 0], m4
mova [rsp+ 8], m5
mova [rsp+16], m3
mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8
xor r5, r5
.pixelloop:
%assign %%i 0
; the rep here is for the 8bit output mmx case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif
%rep %%repcnt
%if %1 == 8
%if ARCH_X86_32
mova m2, [rsp+mmsize*(0+%%i)]
mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
mova m2, m8
mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
mova m1, [yuv2yuvX_%1_start]
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
.filterloop_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
mova m3, [r6+r5*4]
mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
mova m4, [r6+r5*4]
mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16
; coefficients
movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
pshuflw m7, m0, 0 ; coeff[0]
pshuflw m0, m0, 0x55 ; coeff[1]
pmovsxwd m7, m7 ; word -> dword
pmovsxwd m0, m0 ; word -> dword
pmulld m3, m7
pmulld m5, m7
pmulld m4, m0
pmulld m6, m0
paddd m2, m3
paddd m1, m5
paddd m2, m4
paddd m1, m6
%else ; %1 == 10/9/8
punpcklwd m5, m3, m4
punpckhwd m3, m4
SPLATD m0
pmaddwd m5, m0
pmaddwd m3, m0
paddd m2, m5
paddd m1, m3
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
jg .filterloop_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
psrad m1, 31 - %1
%else ; %1 == 10/9/8
psrad m2, 27 - %1
psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16
%if %1 == 8
packssdw m2, m1
packuswb m2, m2
movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
packssdw m2, m1
paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
packusdw m2, m1
%else ; mmxext/sse2
packssdw m2, m1
pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
mova [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
sub wd, mmsize/2
%assign %%i %%i+2
%endrep
jg .pixelloop
%if %1 == 8
%if ARCH_X86_32
ADD rsp, pad
RET
%else ; x86-64
REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
REP_RET
%endif ; %1 == 8/9/10/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif
INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m3, [srcq+wq*2+mmsize*1]
psraw m0, 7
psraw m1, 7
packuswb m0, m1
mov%2 [dstq+wq], m0
%elif %1 == 16
paddd m0, m4, [srcq+wq*4+mmsize*0]
paddd m1, m4, [srcq+wq*4+mmsize*1]
paddd m2, m4, [srcq+wq*4+mmsize*2]
paddd m3, m4, [srcq+wq*4+mmsize*3]
psrad m0, 3
psrad m1, 3
psrad m2, 3
psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
packusdw m0, m1
packusdw m2, m3
%else ; mmx/sse2
packssdw m0, m1
packssdw m2, m3
paddw m0, m5
paddw m2, m5
%endif ; mmx/sse2/sse4/avx
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m2, [srcq+wq*2+mmsize*1]
psraw m0, 15 - %1
psraw m1, 15 - %1
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m3
pminsw m1, m3
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m1
%endif
add wq, mmsize
jl .loop_%2
%endmacro
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
movsxdifnidn wq, wd
add wq, mmsize - 1
and wq, ~(mmsize - 1)
%if %1 == 8
add dstq, wq
%else ; %1 != 8
lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
lea srcq, [srcq+wq*4]
%else ; %1 != 16
lea srcq, [srcq+wq*2]
%endif ; %1 == 16
neg wq
%if %1 == 8
pxor m4, m4 ; zero
; create registers holding dither
movq m3, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m3, m3
%endif ; mmsize == 16
PALIGNR m3, m3, 3, m2
.no_rot:
%if mmsize == 8
mova m2, m3
punpckhbw m3, m4 ; byte->word
punpcklbw m2, m4 ; byte->word
%else
punpcklbw m3, m4
mova m2, m3
%endif
%elif %1 == 9
pxor m4, m4
mova m3, [pw_512]
mova m2, [pw_32]
%elif %1 == 10
pxor m4, m4
mova m3, [pw_1024]
mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
mova m4, [pd_4]
%else ; mmx/sse2
mova m4, [pd_4min0x40000]
mova m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
; actual pixel scaling
%if mmsize == 8
yuv2plane1_mainloop %1, a
%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2plane1_mainloop %1, a
REP_RET
.unaligned:
yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3
INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif
INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif
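; For reference, the 8-bit yuv2plane1 kernels above correspond to this
; scalar C loop (a sketch of the C fallback: add the 8-entry dither,
; shift the 15-bit intermediate down by 7, clip to a byte):
;
;     for (i = 0; i < dstW; i++) {
;         int val = src[i] + dither[(i + offset) & 7];
;         dest[i] = av_clip_uint8(val >> 7);
;     }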

View File

@@ -0,0 +1,148 @@
/*
* software RGB to RGB converter
* pluralize by software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one.
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
//MMX versions
#undef RENAME
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _MMXEXT
#include "rgb2rgb_template.c"
//SSE2 versions
#undef RENAME
#undef COMPILE_TEMPLATE_SSE2
#define COMPILE_TEMPLATE_SSE2 1
#define RENAME(a) a ## _SSE2
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3DNOW
#include "rgb2rgb_template.c"
/*
RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMXEXT, 3DNOW optimization by Nick Kurshev
32-bit C version, and the and&add trick by Michael Niedermayer
*/
#endif /* HAVE_INLINE_ASM */
av_cold void rgb2rgb_init_x86(void)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
rgb2rgb_init_MMX();
if (INLINE_AMD3DNOW(cpu_flags))
rgb2rgb_init_3DNOW();
if (INLINE_MMXEXT(cpu_flags))
rgb2rgb_init_MMXEXT();
if (INLINE_SSE2(cpu_flags))
rgb2rgb_init_SSE2();
#endif /* HAVE_INLINE_ASM */
}
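/* The repeated #includes of rgb2rgb_template.c above rely on token pasting:
 * each pass redefines RENAME() so that every function in the template gets
 * an instruction-set suffix. A minimal sketch of the pattern (the function
 * name here is hypothetical):
 *
 *     #define RENAME(a) a ## _SSE2
 *     static void RENAME(shuffle)(void);   // expands to shuffle_SSE2
 *
 * rgb2rgb_init_x86() then installs the best variants at runtime based on
 * av_get_cpu_flags(). */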

File diff suppressed because it is too large

View File

@@ -0,0 +1,431 @@
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort: times 8 dw 0x8000
unicoeff: times 4 dd 0x20000000
SECTION .text
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bits width or 16-bits width
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we have to
; downscale before multiplying). Filter is 14-bits. Output is either 15bits
; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
mova m2, [max_19bit_int]
%elif cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
mova m7, [unicoeff]
%elif %1 == 8
pxor m3, m3
%endif ; %1 == 8/16
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16
%ifnidn %3, X
; setup loop
%if %3 == 8
shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
lea filterq, [filterq+wq*8]
%if %2 == 15
lea dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
lea dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
lea fltposq, [fltposq+wq*(4>>wshr)]
neg wq
.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif
mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
punpckldq m0, m4
punpckldq m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
%endif ; %1 == 8
; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
movq m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd m0, m4
%elif notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd m0, m4
%else ; ssse3/sse4
phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
punpcklbw m4, m3 ; byte -> word
punpcklbw m5, m3 ; byte -> word
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
psubw m4, m6
psubw m5, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
paddd m0, m1
paddd m4, m5
movq m1, m0
punpckldq m0, m4
punpckhdq m1, m4
paddd m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
; emulate horizontal add as transpose + vertical add
mova mex, m0
punpckldq m0, m1
punpckhdq mex, m1
paddd m0, mex
mova m1, m4
punpckldq m4, m5
punpckhdq m1, m5
paddd m4, m1
mova m1, m0
punpcklqdq m0, m4
punpckhqdq m1, m4
paddd m0, m1
%else ; ssse3/sse4
; FIXME if we rearrange the filter in pairs of 4, we can
; load pixels likewise and use 2 x paddd + phaddd instead
; of 3 x phaddd here, faster on older cpus
phaddd m0, m1
phaddd m4, m5
phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq r8
%define pos1q r7
%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq srcmemq
%define pos1q dstq
%define srcendq r6m
lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
mov srcendq, pos0q
%endif ; x86-32/64
lea fltposq, [fltposq+wq*4]
%if %2 == 15
lea dstq, [dstq+wq*2]
%else ; %2 == 19
lea dstq, [dstq+wq*4]
%endif ; %2 == 15/19
movifnidn dstmp, dstq
neg wq
.loop:
mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4
pxor m5, m5
mov srcq, srcmemmp
.innerloop:
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
punpcklbw m0, m3
punpcklbw m1, m3
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
paddd m4, m0
paddd m5, m1
add filterq, mmsize
add srcq, srcmul*mmsize/2
cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
jl .innerloop
%ifidn %4, X4
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
%if %1 > 8
movhps m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
movd m1, [srcq+(pos1q+dlt)*srcmul]
punpckldq m0, m1
%endif ; %1 == 8
%if %1 == 8
punpcklbw m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
%endif ; %1 == 16
pmaddwd m0, [filterq]
%endif ; %4 == X4
lea filterq, [filterq+(fltsizeq+dlt)*2]
%if mmsize == 8 ; mmx
movq m0, m4
punpckldq m4, m5
punpckhdq m0, m5
paddd m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
punpckhqdq m1, m5
paddd m4, m1
%else ; ssse3/sse4
phaddd m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
paddd m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
pshufd m4, m4, 11011000b
movhlps m0, m4
paddd m0, m4
%else ; ssse3/sse4
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
paddd m0, m7
%endif ; %1 == 16
; clip, store
psrad m0, 14 + %1 - %2
%ifidn %3, X
movifnidn dstq, dstmp
%endif ; %3 == X
%if %2 == 15
packssdw m0, m0
%ifnidn %3, X
movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
PMINSD_MMX m0, m2, m4
%elif cpuflag(sse4)
pminsd m0, m2
%else ; sse2/ssse3
cvtdq2ps m0, m0
minps m0, m2
cvtps2dq m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
add wq, 2
%endif ; %3 ==/!= X
jl .loop
REP_RET
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, 7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
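; For example, under INIT_XMM sse2 below, "SCALE_FUNCS 8, 15, 6" emits the
; 4-tap, 8-tap and X4/X8 any-filter-size variants of the 8 bpc -> 15-bit
; scaler, declared on the C side as ff_hscale8to15_{4,8,X4,X8}_sse2.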
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 6, 7, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8

View File

@@ -0,0 +1,580 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#if HAVE_INLINE_ASM
#define DITHER1XBPP
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _MMXEXT
#include "swscale_template.c"
#endif
void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf)
{
const int dstH= c->dstH;
const int flags= c->flags;
int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrUPixBuf= c->chrUPixBuf;
int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
int32_t *vLumFilterPos= c->vLumFilterPos;
int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
int32_t *chrMmxFilter= c->chrMmxFilter;
int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int chrDstY= dstY>>c->chrDstVSubSample;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
c->blueDither= ff_dither8[dstY&1];
if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
c->greenDither= ff_dither8[dstY&1];
else
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
for (i = 0; i < neg; i++)
tmpY[i] = lumSrcPtr[neg];
for ( ; i < end; i++)
tmpY[i] = lumSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpY[i] = tmpY[i-1];
lumSrcPtr = tmpY;
if (alpSrcPtr) {
const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
for (i = 0; i < neg; i++)
tmpA[i] = alpSrcPtr[neg];
for ( ; i < end; i++)
tmpA[i] = alpSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpA[i] = tmpA[i - 1];
alpSrcPtr = tmpA;
}
}
if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
for (i = 0; i < neg; i++) {
tmpU[i] = chrUSrcPtr[neg];
}
for ( ; i < end; i++) {
tmpU[i] = chrUSrcPtr[i];
}
for ( ; i < vChrFilterSize; i++) {
tmpU[i] = tmpU[i - 1];
}
chrUSrcPtr = tmpU;
}
if (flags & SWS_ACCURATE_RND) {
int s= APCK_SIZE / 8;
for (i=0; i<vLumFilterSize; i+=2) {
*(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
*(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
lumMmxFilter[s*i+APCK_COEF/4 ]=
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
alpMmxFilter[s*i+APCK_COEF/4 ]=
alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
}
}
for (i=0; i<vChrFilterSize; i+=2) {
*(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
chrMmxFilter[s*i+APCK_COEF/4 ]=
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
}
} else {
for (i=0; i<vLumFilterSize; i++) {
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
}
}
for (i=0; i<vChrFilterSize; i++) {
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
#if HAVE_MMXEXT
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
if (((uintptr_t)dest) & 15) {
/* movntdq below needs 16-byte-aligned stores, so fall back to the MMXEXT path */
return yuv2yuvX_MMXEXT(filter, filterSize, src, dest, dstW, dither, offset);
}
if (offset) {
__asm__ volatile("movq (%0), %%xmm3\n\t"
"movdqa %%xmm3, %%xmm4\n\t"
"psrlq $24, %%xmm3\n\t"
"psllq $40, %%xmm4\n\t"
"por %%xmm4, %%xmm3\n\t"
:: "r"(dither)
);
} else {
__asm__ volatile("movq (%0), %%xmm3\n\t"
:: "r"(dither)
);
}
filterSize--;
__asm__ volatile(
"pxor %%xmm0, %%xmm0\n\t"
"punpcklbw %%xmm0, %%xmm3\n\t"
"movd %0, %%xmm1\n\t"
"punpcklwd %%xmm1, %%xmm1\n\t"
"punpckldq %%xmm1, %%xmm1\n\t"
"punpcklqdq %%xmm1, %%xmm1\n\t"
"psllw $3, %%xmm1\n\t"
"paddw %%xmm1, %%xmm3\n\t"
"psraw $4, %%xmm3\n\t"
::"m"(filterSize)
);
__asm__ volatile(
"movdqa %%xmm3, %%xmm4\n\t"
"movdqa %%xmm3, %%xmm7\n\t"
"movl %3, %%ecx\n\t"
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
"movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
"movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%xmm0, %%xmm2 \n\t"\
"pmulhw %%xmm0, %%xmm5 \n\t"\
"paddw %%xmm2, %%xmm3 \n\t"\
"paddw %%xmm5, %%xmm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%xmm3 \n\t"\
"psraw $3, %%xmm4 \n\t"\
"packuswb %%xmm4, %%xmm3 \n\t"
"movntdq %%xmm3, (%1, %%"REG_c")\n\t"
"add $16, %%"REG_c" \n\t"\
"cmp %2, %%"REG_c" \n\t"\
"movdqa %%xmm7, %%xmm3\n\t"
"movdqa %%xmm7, %%xmm4\n\t"
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
: "%"REG_d, "%"REG_S, "%"REG_c
);
}
#endif
#endif /* HAVE_INLINE_ASM */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
SCALE_FUNC(filter_n, 12, 15, opt); \
SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
SCALE_FUNC(filter_n, 12, 19, opt); \
SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X, opt)
#define SCALE_FUNCS_SSE(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)
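/* Illustration: one SCALE_FUNC instance above, e.g. SCALE_FUNC(4, 8, 15, sse2),
 * expands to the prototype
 *   void ff_hscale8to15_4_sse2(SwsContext *c, int16_t *data, int dstW,
 *                              const uint8_t *src, const int16_t *filter,
 *                              const int32_t *filterPos, int filterSize);
 */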
#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
#define VSCALEX_FUNC(size, opt) \
void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt) \
VSCALEX_FUNC(8, opt); \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)
#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
VSCALEX_FUNCS(avx);
#define VSCALE_FUNC(size, opt) \
void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
const uint8_t *dither, int offset)
#define VSCALE_FUNCS(opt1, opt2) \
VSCALE_FUNC(8, opt1); \
VSCALE_FUNC(9, opt2); \
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)
#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, \
const uint8_t *src1, \
const uint8_t *src2, \
int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
#define INPUT_FUNCS(opt) \
INPUT_FUNC(uyvy, opt); \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_FUNC(rgba, opt); \
INPUT_FUNC(bgra, opt); \
INPUT_FUNC(argb, opt); \
INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (cpu_flags & AV_CPU_FLAG_MMX)
sws_init_swscale_MMX(c);
#endif
#if HAVE_MMXEXT_INLINE
if (cpu_flags & AV_CPU_FLAG_MMXEXT)
sws_init_swscale_MMXEXT(c);
if (cpu_flags & AV_CPU_FLAG_SSE3){
if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
c->yuv2planeX = yuv2yuvX_sse3;
}
#endif
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 12) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
ff_hscale12to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
ff_hscale14to19_ ## filtersize ## _ ## opt1; \
} else { /* c->srcBpc == 16 */ \
av_assert0(c->srcBpc == 16);\
hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
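/* e.g. with c->srcBpc == 8, c->dstBpc <= 14 and a 4-tap luma filter,
 * ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx) resolves
 * to c->hyScale = ff_hscale8to15_4_mmx. */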
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
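/* e.g. case_rgb(rgb24, RGB24, mmx) expands to
 *   case AV_PIX_FMT_RGB24:
 *       c->lumToYV12 = ff_rgb24ToY_mmx;
 *       if (!c->chrSrcHSubSample)
 *           c->chrToYV12 = ff_rgb24ToUV_mmx;
 *       break;
 */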
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
switch (c->srcFormat) {
case AV_PIX_FMT_Y400A:
c->lumToYV12 = ff_yuyvToY_mmx;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_mmx;
c->chrToYV12 = ff_yuyvToUV_mmx;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_mmx;
c->chrToYV12 = ff_uyvyToUV_mmx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_mmx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
case_rgb(bgra, BGRA, mmx);
case_rgb(rgba, RGBA, mmx);
case_rgb(abgr, ABGR, mmx);
case_rgb(argb, ARGB, mmx);
default:
break;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
}
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
break; \
}
if (EXTERNAL_SSE2(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_Y400A:
c->lumToYV12 = ff_yuyvToY_sse2;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_sse2;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_sse2;
c->chrToYV12 = ff_yuyvToUV_sse2;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_sse2;
c->chrToYV12 = ff_uyvyToUV_sse2;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_sse2;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_sse2;
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
case_rgb(bgra, BGRA, sse2);
case_rgb(rgba, RGBA, sse2);
case_rgb(abgr, ABGR, sse2);
case_rgb(argb, ARGB, sse2);
default:
break;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
switch (c->srcFormat) {
case_rgb(rgb24, RGB24, ssse3);
case_rgb(bgr24, BGR24, ssse3);
default:
break;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
/* the Xto15 scalers don't need special sse4 functions */
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
HAVE_ALIGNED_STACK || ARCH_X86_64);
if (c->dstBpc == 16 && !isBE(c->dstFormat))
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_YUYV422:
c->chrToYV12 = ff_yuyvToUV_avx;
break;
case AV_PIX_FMT_UYVY422:
c->chrToYV12 = ff_uyvyToUV_avx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_avx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_avx;
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
case_rgb(bgra, BGRA, avx);
case_rgb(rgba, RGBA, avx);
case_rgb(abgr, ABGR, avx);
case_rgb(argb, ARGB, avx);
default:
break;
}
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,31 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/w64xmmtest.h"
#include "libswscale/swscale.h"
wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]))
{
testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
srcSliceH, dst, dstStride);
}

View File

@@ -0,0 +1,112 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* MMX/MMXEXT template stuff (needed for fast movntq support),
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/cpu.h"
#if HAVE_INLINE_ASM
#define DITHER1XBPP // only for MMX
/* hope these constant values are cache line aligned */
DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _MMX
#include "yuv2rgb_template.c"
#endif /* HAVE_MMX_INLINE */
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _MMXEXT
#include "yuv2rgb_template.c"
#endif /* HAVE_MMXEXT_INLINE */
#endif /* HAVE_INLINE_ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMXEXT_INLINE
if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_MMXEXT;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_MMXEXT;
}
}
#endif
if (cpu_flags & AV_CPU_FLAG_MMX) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_MMX;
#endif
break;
} else return yuv420_rgb32_MMX;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_MMX;
#endif
break;
} else return yuv420_bgr32_MMX;
case AV_PIX_FMT_RGB24: return yuv420_rgb24_MMX;
case AV_PIX_FMT_BGR24: return yuv420_bgr24_MMX;
case AV_PIX_FMT_RGB565: return yuv420_rgb16_MMX;
case AV_PIX_FMT_RGB555: return yuv420_rgb15_MMX;
}
}
#endif /* HAVE_MMX_INLINE */
return NULL;
}

View File

@@ -0,0 +1,451 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
#define REG_BLUE "0"
#define REG_RED "1"
#define REG_GREEN "2"
#define REG_ALPHA "3"
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
__asm__ volatile ( \
"movq (%5, %0, 2), %%mm6\n\t" \
"movd (%2, %0), %%mm0\n\t" \
"movd (%3, %0), %%mm1\n\t" \
"1: \n\t" \
/* YUV2RGB core
* Conversion is performed in the usual way:
* R = Y' * Ycoef + Vred * V'
* G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
* B = Y' * Ycoef + Ublue * U'
*
* where X' = X * 8 - Xoffset (multiplication is performed to increase
* precision a bit).
* Since it operates in YUV420 colorspace, Y component is additionally
* split into Y1 and Y2 for even and odd pixels.
*
* Input:
* mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
* Output:
* mm1 - R, mm2 - G, mm0 - B
*/
#define YUV2RGB \
/* convert Y, U, V into Y1', Y2', U', V' */ \
"movq %%mm6, %%mm7\n\t" \
"punpcklbw %%mm4, %%mm0\n\t" \
"punpcklbw %%mm4, %%mm1\n\t" \
"pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
"psrlw $8, %%mm7\n\t" \
"psllw $3, %%mm0\n\t" \
"psllw $3, %%mm1\n\t" \
"psllw $3, %%mm6\n\t" \
"psllw $3, %%mm7\n\t" \
"psubsw "U_OFFSET"(%4), %%mm0\n\t" \
"psubsw "V_OFFSET"(%4), %%mm1\n\t" \
"psubw "Y_OFFSET"(%4), %%mm6\n\t" \
"psubw "Y_OFFSET"(%4), %%mm7\n\t" \
\
/* multiply by coefficients */ \
"movq %%mm0, %%mm2\n\t" \
"movq %%mm1, %%mm3\n\t" \
"pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
"pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
"pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
"pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
"paddsw %%mm3, %%mm2\n\t" \
/* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
/* mm6 = Y1, mm7 = Y2 */ \
\
/* produce RGB */ \
"movq %%mm7, %%mm3\n\t" \
"movq %%mm7, %%mm5\n\t" \
"paddsw %%mm0, %%mm3\n\t" \
"paddsw %%mm1, %%mm5\n\t" \
"paddsw %%mm2, %%mm7\n\t" \
"paddsw %%mm6, %%mm0\n\t" \
"paddsw %%mm6, %%mm1\n\t" \
"paddsw %%mm6, %%mm2\n\t" \
#define RGB_PACK_INTERLEAVE \
/* pack and interleave even/odd pixels */ \
"packuswb %%mm1, %%mm0\n\t" \
"packuswb %%mm5, %%mm3\n\t" \
"packuswb %%mm2, %%mm2\n\t" \
"movq %%mm0, %%mm1\n\n" \
"packuswb %%mm7, %%mm7\n\t" \
"punpcklbw %%mm3, %%mm0\n\t" \
"punpckhbw %%mm3, %%mm1\n\t" \
"punpcklbw %%mm7, %%mm2\n\t" \
#define YUV2RGB_ENDLOOP(depth) \
"movq 8 (%5, %0, 2), %%mm6\n\t" \
"movd 4 (%3, %0), %%mm1\n\t" \
"movd 4 (%2, %0), %%mm0\n\t" \
"add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
"add $4, %0\n\t" \
"js 1b\n\t" \
#define YUV2RGB_OPERANDS \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
: "memory" \
); \
} \
#define YUV2RGB_OPERANDS_ALPHA \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
: "memory" \
); \
} \
#define YUV2RGB_ENDFUNC \
__asm__ volatile (SFENCE"\n\t" \
"emms \n\t"); \
return srcSliceH; \
#define IF0(x)
#define IF1(x) x
#define RGB_PACK16(gmask, is15) \
"pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
"pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
"movq %%mm2, %%mm3\n\t" \
"psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
"psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
"psrlw $3, %%mm0\n\t" \
IF##is15("psrlw $1, %%mm1\n\t") \
"pand "MANGLE(pb_e0)", %%mm2\n\t" \
"pand "MANGLE(gmask)", %%mm3\n\t" \
"por %%mm2, %%mm0\n\t" \
"por %%mm3, %%mm1\n\t" \
"movq %%mm0, %%mm2\n\t" \
"punpcklbw %%mm1, %%mm0\n\t" \
"punpckhbw %%mm1, %%mm2\n\t" \
MOVNTQ " %%mm0, (%1)\n\t" \
MOVNTQ " %%mm2, 8(%1)\n\t" \
#define DITHER_RGB \
"paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
"paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
"paddusb "RED_DITHER"(%4), %%mm1\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_03, 1)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_07, 0)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */
#define RGB_PACK24(blue, red)\
"packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
"packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
"packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
"movq %%mm"red", %%mm3 \n"\
"movq %%mm"blue", %%mm6 \n"\
"psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
"punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
"punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
"movq %%mm3, %%mm5 \n"\
"punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
"punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
"punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
RGB_PACK24_B
#if COMPILE_TEMPLATE_MMXEXT
DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"pshufw $0xc6, %%mm2, %%mm1 \n"\
"pshufw $0x84, %%mm3, %%mm6 \n"\
"pshufw $0x38, %%mm5, %%mm7 \n"\
"pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
"movq %%mm1, %%mm0 \n"\
"pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
"movq %%mm1, %%mm2 \n"\
"pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
"psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
"pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
"psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
"pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
"por %%mm3, %%mm1 \n"\
"por %%mm6, %%mm0 \n"\
"por %%mm5, %%mm1 \n"\
"por %%mm7, %%mm2 \n"\
MOVNTQ" %%mm0, (%1) \n"\
MOVNTQ" %%mm1, 8(%1) \n"\
MOVNTQ" %%mm2, 16(%1) \n"\
#else
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
"movd %%mm2, 4(%1) \n" /* G1 B1 */\
"psrlq $32, %%mm3 \n"\
"psrlq $16, %%mm2 \n"\
"movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
"movd %%mm2, 10(%1) \n" /* G3 B3 */\
"psrlq $16, %%mm2 \n"\
"movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
"movd %%mm2, 16(%1) \n" /* G5 B5 */\
"psrlq $32, %%mm5 \n"\
"movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
"movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
#endif
static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_BLUE, REG_RED)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_RED, REG_BLUE)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#define SET_EMPTY_ALPHA \
"pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
#define LOAD_ALPHA \
"movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
#define RGB_PACK32(red, green, blue, alpha) \
"movq %%mm"blue", %%mm5\n\t" \
"movq %%mm"red", %%mm6\n\t" \
"punpckhbw %%mm"green", %%mm5\n\t" \
"punpcklbw %%mm"green", %%mm"blue"\n\t" \
"punpckhbw %%mm"alpha", %%mm6\n\t" \
"punpcklbw %%mm"alpha", %%mm"red"\n\t" \
"movq %%mm"blue", %%mm"green"\n\t" \
"movq %%mm5, %%mm"alpha"\n\t" \
"punpcklwd %%mm"red", %%mm"blue"\n\t" \
"punpckhwd %%mm"red", %%mm"green"\n\t" \
"punpcklwd %%mm6, %%mm5\n\t" \
"punpckhwd %%mm6, %%mm"alpha"\n\t" \
MOVNTQ " %%mm"blue", 0(%1)\n\t" \
MOVNTQ " %%mm"green", 8(%1)\n\t" \
MOVNTQ " %%mm5, 16(%1)\n\t" \
MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
#endif /* !COMPILE_TEMPLATE_MMXEXT */

View File

@@ -0,0 +1,920 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "libavutil/pixdesc.h"
const int32_t ff_yuv2rgb_coeffs[8][4] = {
{ 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */
{ 117504, 138453, 13954, 34903 }, /* ITU-R Rec. 709 (1990) */
{ 104597, 132201, 25675, 53279 }, /* unspecified */
{ 104597, 132201, 25675, 53279 }, /* reserved */
{ 104448, 132798, 24759, 53109 }, /* FCC */
{ 104597, 132201, 25675, 53279 }, /* ITU-R Rec. 624-4 System B, G */
{ 104597, 132201, 25675, 53279 }, /* SMPTE 170M */
{ 117579, 136230, 16907, 35559 } /* SMPTE 240M (1987) */
};
const int *sws_getCoefficients(int colorspace)
{
if (colorspace > 7 || colorspace < 0)
colorspace = SWS_CS_DEFAULT;
return ff_yuv2rgb_coeffs[colorspace];
}
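/* Minimal caller-side usage sketch (illustrative; sws_ctx and the neutral
 * brightness/contrast/saturation values are example inputs, not from this
 * file):
 *
 *     const int *coeffs = sws_getCoefficients(SWS_CS_ITU709);
 *     sws_setColorspaceDetails(sws_ctx, coeffs, 0, coeffs, 0,
 *                              0, 1 << 16, 1 << 16);
 */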
#define LOADCHROMA(i) \
U = pu[i]; \
V = pv[i]; \
r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
#define PUTRGB(dst, src, i) \
Y = src[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
#define PUTRGB24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = r[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = b[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = r[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = b[Y];
#define PUTBGR24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = b[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = r[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = b[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = r[Y];
#define PUTRGBA(dst, ysrc, asrc, i, s) \
Y = ysrc[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y] + (asrc[2 * i] << s); \
Y = ysrc[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
#define PUTRGB48(dst, src, i) \
Y = src[ 2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = r[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = b[Y]; \
Y = src[ 2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = r[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
#define PUTBGR48(dst, src, i) \
Y = src[2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = b[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = r[Y]; \
Y = src[2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = b[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
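/* How the PUT* macros above fit together: LOADCHROMA turns U/V into base
 * pointers into precomputed ramps, so each channel contribution is a single
 * lookup on Y. For packed destinations the tables already hold the channel
 * value shifted into place; e.g. for RGB565 (the 15/16 bpp case in
 * ff_yuv2rgb_c_init_tables below) r[Y] is red << 11, g[Y] is green << 5 and
 * b[Y] is blue, so r[Y] + g[Y] + b[Y] assembles the whole pixel with plain
 * additions. */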
#define YUV2RGBFUNC(func_name, dst_type, alpha) \
static int func_name(SwsContext *c, const uint8_t *src[], \
int srcStride[], int srcSliceY, int srcSliceH, \
uint8_t *dst[], int dstStride[]) \
{ \
int y; \
\
if (!alpha && c->srcFormat == AV_PIX_FMT_YUV422P) { \
srcStride[1] *= 2; \
srcStride[2] *= 2; \
} \
for (y = 0; y < srcSliceH; y += 2) { \
dst_type *dst_1 = \
(dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]); \
dst_type *dst_2 = \
(dst_type *)(dst[0] + (y + srcSliceY + 1) * dstStride[0]); \
dst_type av_unused *r, *g, *b; \
const uint8_t *py_1 = src[0] + y * srcStride[0]; \
const uint8_t *py_2 = py_1 + srcStride[0]; \
const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
const uint8_t av_unused *pa_1, *pa_2; \
unsigned int h_size = c->dstW >> 3; \
if (alpha) { \
pa_1 = src[3] + y * srcStride[3]; \
pa_2 = pa_1 + srcStride[3]; \
} \
while (h_size--) { \
int av_unused U, V, Y; \
#define ENDYUV2RGBLINE(dst_delta, ss) \
pu += 4 >> ss; \
pv += 4 >> ss; \
py_1 += 8 >> ss; \
py_2 += 8 >> ss; \
dst_1 += dst_delta >> ss; \
dst_2 += dst_delta >> ss; \
} \
if (c->dstW & (4 >> ss)) { \
int av_unused Y, U, V; \
#define ENDYUV2RGBFUNC() \
} \
} \
return srcSliceH; \
}
#define CLOSEYUV2RGBFUNC(dst_delta) \
ENDYUV2RGBLINE(dst_delta, 0) \
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_48, uint8_t, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB48(dst_1, py_1, 2);
PUTRGB48(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB48(dst_2, py_2, 3);
PUTRGB48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_bgr48, uint8_t, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
LOADCHROMA(2);
PUTBGR48(dst_1, py_1, 2);
PUTBGR48(dst_2, py_2, 2);
LOADCHROMA(3);
PUTBGR48(dst_2, py_2, 3);
PUTBGR48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_32, uint32_t, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB(dst_1, py_1, 2);
PUTRGB(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB(dst_2, py_2, 3);
PUTRGB(dst_1, py_1, 3);
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuva2rgba_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 24);
PUTRGBA(dst_2, py_2, pa_2, 2, 24);
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 24);
PUTRGBA(dst_1, py_1, pa_1, 3, 24);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuva2argb_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 0);
PUTRGBA(dst_2, py_2, pa_2, 2, 0);
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 0);
PUTRGBA(dst_1, py_1, pa_1, 3, 0);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB24(dst_1, py_1, 2);
PUTRGB24(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB24(dst_2, py_2, 3);
PUTRGB24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
// only trivial mods from yuv2rgb_c_24_rgb
YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
LOADCHROMA(2);
PUTBGR24(dst_1, py_1, 2);
PUTBGR24(dst_2, py_2, 2);
LOADCHROMA(3);
PUTBGR24(dst_2, py_2, 3);
PUTBGR24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_4[y & 1];
const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1];
#define PUTRGB16(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + e16[0 + o]] + \
b[Y + f16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + e16[1 + o]] + \
b[Y + f16[1 + o]];
LOADCHROMA(0);
PUTRGB16(dst_1, py_1, 0, 0);
PUTRGB16(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB16(dst_2, py_2, 1, 2 + 8);
PUTRGB16(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB16(dst_1, py_1, 2, 4);
PUTRGB16(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB16(dst_2, py_2, 3, 6 + 8);
PUTRGB16(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1];
#define PUTRGB15(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + e16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + e16[1 + o]];
LOADCHROMA(0);
PUTRGB15(dst_1, py_1, 0, 0);
PUTRGB15(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB15(dst_2, py_2, 1, 2 + 8);
PUTRGB15(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB15(dst_1, py_1, 2, 4);
PUTRGB15(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB15(dst_2, py_2, 3, 6 + 8);
PUTRGB15(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
// r, g, b, dst_1, dst_2
YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_4x4_16[y & 3];
#define PUTRGB12(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + d16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + d16[1 + o]];
LOADCHROMA(0);
PUTRGB12(dst_1, py_1, 0, 0);
PUTRGB12(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB12(dst_2, py_2, 1, 2 + 8);
PUTRGB12(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB12(dst_1, py_1, 2, 4);
PUTRGB12(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB12(dst_2, py_2, 3, 6 + 8);
PUTRGB12(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
// r, g, b, dst_1, dst_2
YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
#define PUTRGB8(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d32[0 + o]] + \
g[Y + d32[0 + o]] + \
b[Y + d64[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d32[1 + o]] + \
g[Y + d32[1 + o]] + \
b[Y + d64[1 + o]];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB8(dst_2, py_2, 1, 2 + 8);
PUTRGB8(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB8(dst_1, py_1, 2, 4);
PUTRGB8(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB8(dst_2, py_2, 3, 6 + 8);
PUTRGB8(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
#define PUTRGB4D(dst, src, i, o) \
Y = src[2 * i]; \
acc = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
acc |= (r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]]) << 4; \
dst[i] = acc;
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4D(dst_2, py_2, 1, 2 + 8);
PUTRGB4D(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB4D(dst_1, py_1, 2, 4);
PUTRGB4D(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB4D(dst_2, py_2, 3, 6 + 8);
PUTRGB4D(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(4)
YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
#define PUTRGB4DB(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
PUTRGB4DB(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB4DB(dst_1, py_1, 2, 4);
PUTRGB4DB(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB4DB(dst_2, py_2, 3, 6 + 8);
PUTRGB4DB(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
char out_1 = 0, out_2 = 0;
g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
#define PUTRGB1(out, src, i, o) \
Y = src[2 * i]; \
out += out + g[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
out += out + g[Y + d128[1 + o]];
PUTRGB1(out_1, py_1, 0, 0);
PUTRGB1(out_2, py_2, 0, 0 + 8);
PUTRGB1(out_2, py_2, 1, 2 + 8);
PUTRGB1(out_1, py_1, 1, 2);
PUTRGB1(out_1, py_1, 2, 4);
PUTRGB1(out_2, py_2, 2, 4 + 8);
PUTRGB1(out_2, py_2, 3, 6 + 8);
PUTRGB1(out_1, py_1, 3, 6);
dst_1[0] = out_1;
dst_2[0] = out_2;
CLOSEYUV2RGBFUNC(1)
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
{
SwsFunc t = NULL;
if (ARCH_BFIN)
t = ff_yuv2rgb_init_bfin(c);
if (ARCH_PPC)
t = ff_yuv2rgb_init_ppc(c);
if (HAVE_VIS)
t = ff_yuv2rgb_init_vis(c);
if (ARCH_X86)
t = ff_yuv2rgb_init_x86(c);
if (t)
return t;
av_log(c, AV_LOG_WARNING,
"No accelerated colorspace conversion found from %s to %s.\n",
av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
switch (c->dstFormat) {
case AV_PIX_FMT_BGR48BE:
case AV_PIX_FMT_BGR48LE:
return yuv2rgb_c_bgr48;
case AV_PIX_FMT_RGB48BE:
case AV_PIX_FMT_RGB48LE:
return yuv2rgb_c_48;
case AV_PIX_FMT_ARGB:
case AV_PIX_FMT_ABGR:
if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
return yuva2argb_c;
case AV_PIX_FMT_RGBA:
case AV_PIX_FMT_BGRA:
return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
case AV_PIX_FMT_RGB24:
return yuv2rgb_c_24_rgb;
case AV_PIX_FMT_BGR24:
return yuv2rgb_c_24_bgr;
case AV_PIX_FMT_RGB565:
case AV_PIX_FMT_BGR565:
return yuv2rgb_c_16_ordered_dither;
case AV_PIX_FMT_RGB555:
case AV_PIX_FMT_BGR555:
return yuv2rgb_c_15_ordered_dither;
case AV_PIX_FMT_RGB444:
case AV_PIX_FMT_BGR444:
return yuv2rgb_c_12_ordered_dither;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
return yuv2rgb_c_8_ordered_dither;
case AV_PIX_FMT_RGB4:
case AV_PIX_FMT_BGR4:
return yuv2rgb_c_4_ordered_dither;
case AV_PIX_FMT_RGB4_BYTE:
case AV_PIX_FMT_BGR4_BYTE:
return yuv2rgb_c_4b_ordered_dither;
case AV_PIX_FMT_MONOBLACK:
return yuv2rgb_c_1_ordered_dither;
}
return NULL;
}
static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
const int64_t inc, void *y_tab)
{
int i;
uint8_t *y_table = y_tab;
y_table -= elemsize * (inc >> 9);
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
table[i] = y_table + elemsize * (cb >> 16);
}
}
static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
{
int i;
int off = -(inc >> 9);
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
table[i] = elemsize * (off + (cb >> 16));
}
}
static uint16_t roundToInt16(int64_t f)
{
int r = (f + (1 << 15)) >> 16;
if (r < -0x7FFF)
return 0x8000;
else if (r > 0x7FFF)
return 0x7FFF;
else
return r;
}
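/* e.g. roundToInt16(0x18000) = (0x18000 + 0x8000) >> 16 = 2; results outside
 * the signed 16-bit range saturate to 0x7FFF / 0x8000. */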
av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation)
{
const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32 ||
c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR24 ||
c->dstFormat == AV_PIX_FMT_RGB565BE ||
c->dstFormat == AV_PIX_FMT_RGB565LE ||
c->dstFormat == AV_PIX_FMT_RGB555BE ||
c->dstFormat == AV_PIX_FMT_RGB555LE ||
c->dstFormat == AV_PIX_FMT_RGB444BE ||
c->dstFormat == AV_PIX_FMT_RGB444LE ||
c->dstFormat == AV_PIX_FMT_RGB8 ||
c->dstFormat == AV_PIX_FMT_RGB4 ||
c->dstFormat == AV_PIX_FMT_RGB4_BYTE ||
c->dstFormat == AV_PIX_FMT_MONOBLACK;
const int isNotNe = c->dstFormat == AV_PIX_FMT_NE(RGB565LE, RGB565BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB555LE, RGB555BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB444LE, RGB444BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR565LE, BGR565BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR555LE, BGR555BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR444LE, BGR444BE);
const int bpp = c->dstFormatBpp;
uint8_t *y_table;
uint16_t *y_table16;
uint32_t *y_table32;
int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
const int yoffs = fullRange ? 384 : 326;
int64_t crv = inv_table[0];
int64_t cbu = inv_table[1];
int64_t cgu = -inv_table[2];
int64_t cgv = -inv_table[3];
int64_t cy = 1 << 16;
int64_t oy = 0;
int64_t yb = 0;
if (!fullRange) {
cy = (cy * 255) / 219;
oy = 16 << 16;
} else {
crv = (crv * 224) / 255;
cbu = (cbu * 224) / 255;
cgu = (cgu * 224) / 255;
cgv = (cgv * 224) / 255;
}
cy = (cy * contrast) >> 16;
crv = (crv * contrast * saturation) >> 32;
cbu = (cbu * contrast * saturation) >> 32;
cgu = (cgu * contrast * saturation) >> 32;
cgv = (cgv * contrast * saturation) >> 32;
oy -= 256 * brightness;
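/* Example with neutral settings (brightness 0, contrast and saturation both
 * 1 << 16) on limited-range input: cy = ((1 << 16) * 255) / 219 = 76309 and
 * oy = 16 << 16, i.e. the usual 16..235 luma expansion with no extra
 * adjustment applied. */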
c->uOffset = 0x0400040004000400LL;
c->vOffset = 0x0400040004000400LL;
c->yCoeff = roundToInt16(cy * 8192) * 0x0001000100010001ULL;
c->vrCoeff = roundToInt16(crv * 8192) * 0x0001000100010001ULL;
c->ubCoeff = roundToInt16(cbu * 8192) * 0x0001000100010001ULL;
c->vgCoeff = roundToInt16(cgv * 8192) * 0x0001000100010001ULL;
c->ugCoeff = roundToInt16(cgu * 8192) * 0x0001000100010001ULL;
c->yOffset = roundToInt16(oy * 8) * 0x0001000100010001ULL;
c->yuv2rgb_y_coeff = (int16_t)roundToInt16(cy << 13);
c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
c->yuv2rgb_v2r_coeff = (int16_t)roundToInt16(crv << 13);
c->yuv2rgb_v2g_coeff = (int16_t)roundToInt16(cgv << 13);
c->yuv2rgb_u2g_coeff = (int16_t)roundToInt16(cgu << 13);
c->yuv2rgb_u2b_coeff = (int16_t)roundToInt16(cbu << 13);
//scale coefficients by cy
crv = ((crv << 16) + 0x8000) / FFMAX(cy, 1);
cbu = ((cbu << 16) + 0x8000) / FFMAX(cy, 1);
cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1);
cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1);
av_freep(&c->yuvTable);
switch (bpp) {
case 1:
c->yuvTable = av_malloc(1024);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024 - 110; i++) {
y_table[i + 110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
yb += cy;
}
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 4:
case 4 | 128:
rbase = isRgb ? 3 : 0;
gbase = 1;
bbase = isRgb ? 0 : 3;
c->yuvTable = av_malloc(1024 * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024 - 110; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 110] = (yval >> 7) << rbase;
y_table[i + 37 + 1024] = ((yval + 43) / 85) << gbase;
y_table[i + 110 + 2048] = (yval >> 7) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 8:
rbase = isRgb ? 5 : 0;
gbase = isRgb ? 2 : 3;
bbase = isRgb ? 0 : 6;
c->yuvTable = av_malloc(1024 * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024 - 38; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 16] = ((yval + 18) / 36) << rbase;
y_table[i + 16 + 1024] = ((yval + 18) / 36) << gbase;
y_table[i + 37 + 2048] = ((yval + 43) / 85) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 12:
rbase = isRgb ? 8 : 0;
gbase = 4;
bbase = isRgb ? 0 : 8;
c->yuvTable = av_malloc(1024 * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 4) << rbase;
y_table16[i + 1024] = (yval >> 4) << gbase;
y_table16[i + 2048] = (yval >> 4) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < 1024 * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 15:
case 16:
rbase = isRgb ? bpp - 5 : 0;
gbase = 5;
bbase = isRgb ? 0 : (bpp - 5);
c->yuvTable = av_malloc(1024 * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 3) << rbase;
y_table16[i + 1024] = (yval >> (18 - bpp)) << gbase;
y_table16[i + 2048] = (yval >> 3) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < 1024 * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 24:
case 48:
c->yuvTable = av_malloc(1024);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_table(c->table_bU, 1, cbu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 32:
case 64:
base = (c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR32_1) ? 8 : 0;
rbase = base + (isRgb ? 16 : 0);
gbase = base + 8;
bbase = base + (isRgb ? 0 : 16);
needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
if (!needAlpha)
abase = (base + 24) & 31;
c->yuvTable = av_malloc(1024 * 3 * 4);
y_table32 = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table32[i] = (yval << rbase) +
(needAlpha ? 0 : (255u << abase));
y_table32[i + 1024] = yval << gbase;
y_table32[i + 2048] = yval << bbase;
yb += cy;
}
fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
fill_gv_table(c->table_gV, 4, cgv);
break;
default:
if(!isPlanar(c->dstFormat) || bpp <= 24)
av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
return -1;
}
return 0;
}