ffmpeg-2.1.1: move directory
git-svn-id: svn://kolibrios.org@6148 a494cfbc-eb01-0410-851d-a64ba20cac60
@@ -0,0 +1,8 @@
OBJS += x86/cpu.o            \
        x86/float_dsp_init.o \
        x86/lls_init.o       \

YASM-OBJS += x86/cpuid.o     \
             x86/emms.o      \
             x86/float_dsp.o \
             x86/lls.o       \
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/asm.h (new file, 112 lines)
@@ -0,0 +1,112 @@
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_ASM_H
#define AVUTIL_X86_ASM_H

#include <stdint.h>
#include "config.h"

typedef struct xmm_reg { uint64_t a, b; } xmm_reg;

#if ARCH_X86_64
#    define OPSIZE "q"
#    define REG_a "rax"
#    define REG_b "rbx"
#    define REG_c "rcx"
#    define REG_d "rdx"
#    define REG_D "rdi"
#    define REG_S "rsi"
#    define PTR_SIZE "8"
typedef int64_t x86_reg;

#    define REG_SP "rsp"
#    define REG_BP "rbp"
#    define REGBP   rbp
#    define REGa    rax
#    define REGb    rbx
#    define REGc    rcx
#    define REGd    rdx
#    define REGSP   rsp

#elif ARCH_X86_32

#    define OPSIZE "l"
#    define REG_a "eax"
#    define REG_b "ebx"
#    define REG_c "ecx"
#    define REG_d "edx"
#    define REG_D "edi"
#    define REG_S "esi"
#    define PTR_SIZE "4"
typedef int32_t x86_reg;

#    define REG_SP "esp"
#    define REG_BP "ebp"
#    define REGBP   ebp
#    define REGa    eax
#    define REGb    ebx
#    define REGc    ecx
#    define REGd    edx
#    define REGSP   esp
#else
typedef int x86_reg;
#endif

#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE))
#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE))

#if ARCH_X86_64 && defined(PIC)
#    define BROKEN_RELOCATIONS 1
#endif

/*
 * If gcc is not set to support sse (-msse) it will not accept xmm registers
 * in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm
 * registers to be marked as clobbered and evaluates to nothing if they are
 * not supported, or to the list itself if they are supported. Since a clobber
 * list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm
 * registers are the only ones in the clobber list.
 * For example a list with "eax" and "xmm0" as clobbers should become:
 * : XMM_CLOBBERS("xmm0",) "eax"
 * and a list with only "xmm0" should become:
 * XMM_CLOBBERS_ONLY("xmm0")
 */
#if HAVE_XMM_CLOBBERS
#    define XMM_CLOBBERS(...)        __VA_ARGS__
#    define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__
#else
#    define XMM_CLOBBERS(...)
#    define XMM_CLOBBERS_ONLY(...)
#endif

/* Use to export labels from asm. */
#define LABEL_MANGLE(a) EXTERN_PREFIX #a

// Use rip-relative addressing if compiling PIC code on x86-64.
#if ARCH_X86_64 && defined(PIC)
#    define LOCAL_MANGLE(a) #a "(%%rip)"
#else
#    define LOCAL_MANGLE(a) #a
#endif

#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)

#endif /* AVUTIL_X86_ASM_H */
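
As a quick illustration of the clobber macros above (a sketch of my own, not part of the imported file): XMM_CLOBBERS_ONLY supplies the entire third colon section of an extended-asm statement only when the compiler accepts xmm clobbers. The helper name zero16 and its store target are invented for the example.

#include "libavutil/x86/asm.h"

/* Hypothetical helper: zero 16 bytes with SSE. XMM_CLOBBERS_ONLY expands
 * to ': "xmm0"' when xmm clobbers are supported and to nothing otherwise. */
static void zero16(xmm_reg *dst)
{
    __asm__ volatile ("xorps  %%xmm0, %%xmm0 \n\t"
                      "movups %%xmm0, %0     \n\t"
                      : "=m" (*dst)
                      : /* no inputs */
                      XMM_CLOBBERS_ONLY("xmm0"));
}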
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/bswap.h (new file, 61 lines)
@@ -0,0 +1,61 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * byte swapping routines
 */

#ifndef AVUTIL_X86_BSWAP_H
#define AVUTIL_X86_BSWAP_H

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"

#if HAVE_INLINE_ASM

#if !AV_GCC_VERSION_AT_LEAST(4,1)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
    __asm__("rorw $8, %w0" : "+r"(x));
    return x;
}
#endif /* !AV_GCC_VERSION_AT_LEAST(4,1) */

#if !AV_GCC_VERSION_AT_LEAST(4,5)
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
    __asm__("bswap %0" : "+r" (x));
    return x;
}

#if ARCH_X86_64
#define av_bswap64 av_bswap64
static inline uint64_t av_const av_bswap64(uint64_t x)
{
    __asm__("bswap %0": "=r" (x) : "0" (x));
    return x;
}
#endif

#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */

#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */
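
For reference, a minimal caller of the routines above (illustrative only; the inline-asm versions and the generic C fallback behave identically):

#include <stdio.h>
#include "libavutil/bswap.h"   /* dispatches to the x86 versions above */

int main(void)
{
    printf("%#010x\n", av_bswap32(0x11223344));  /* prints 0x44332211 */
    printf("%#06x\n",  av_bswap16(0x1122));      /* prints 0x2211 */
    return 0;
}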
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/cpu.c (new file, 210 lines)
@@ -0,0 +1,210 @@
/*
 * CPU detection code, extracted from mmx.h
 * (c)1997-99 by H. Dietz and R. Fisher
 * Converted to C and improved by Fabrice Bellard.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdlib.h>
#include <string.h>

#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"

#if HAVE_YASM

#define cpuid(index, eax, ebx, ecx, edx)        \
    ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)

#define xgetbv(index, eax, edx)                 \
    ff_cpu_xgetbv(index, &eax, &edx)

#elif HAVE_INLINE_ASM

/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
#define cpuid(index, eax, ebx, ecx, edx)                        \
    __asm__ volatile (                                          \
        "mov    %%"REG_b", %%"REG_S" \n\t"                      \
        "cpuid                       \n\t"                      \
        "xchg   %%"REG_b", %%"REG_S                             \
        : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)        \
        : "0" (index))

#define xgetbv(index, eax, edx)                                 \
    __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))

#define get_eflags(x)                           \
    __asm__ volatile ("pushfl     \n"           \
                      "pop    %0  \n"           \
                      : "=r"(x))

#define set_eflags(x)                           \
    __asm__ volatile ("push    %0 \n"           \
                      "popfl      \n"           \
                      :: "r"(x))

#endif /* HAVE_INLINE_ASM */

#if ARCH_X86_64

#define cpuid_test() 1

#elif HAVE_YASM

#define cpuid_test ff_cpu_cpuid_test

#elif HAVE_INLINE_ASM

static int cpuid_test(void)
{
    x86_reg a, c;

    /* Check if CPUID is supported by attempting to toggle the ID bit in
     * the EFLAGS register. */
    get_eflags(a);
    set_eflags(a ^ 0x200000);
    get_eflags(c);

    return a != c;
}
#endif

/* Function to test if multimedia instructions are supported... */
int ff_get_cpu_flags_x86(void)
{
    int rval = 0;

#ifdef cpuid

    int eax, ebx, ecx, edx;
    int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
    int family = 0, model = 0;
    union { int i[3]; char c[12]; } vendor;

    if (!cpuid_test())
        return 0; /* CPUID not supported */

    cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);

    if (max_std_level >= 1) {
        cpuid(1, eax, ebx, ecx, std_caps);
        family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
        model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
        if (std_caps & (1 << 15))
            rval |= AV_CPU_FLAG_CMOV;
        if (std_caps & (1 << 23))
            rval |= AV_CPU_FLAG_MMX;
        if (std_caps & (1 << 25))
            rval |= AV_CPU_FLAG_MMXEXT;
#if HAVE_SSE
        if (std_caps & (1 << 25))
            rval |= AV_CPU_FLAG_SSE;
        if (std_caps & (1 << 26))
            rval |= AV_CPU_FLAG_SSE2;
        if (ecx & 1)
            rval |= AV_CPU_FLAG_SSE3;
        if (ecx & 0x00000200)
            rval |= AV_CPU_FLAG_SSSE3;
        if (ecx & 0x00080000)
            rval |= AV_CPU_FLAG_SSE4;
        if (ecx & 0x00100000)
            rval |= AV_CPU_FLAG_SSE42;
#if HAVE_AVX
        /* Check OSXSAVE and AVX bits */
        if ((ecx & 0x18000000) == 0x18000000) {
            /* Check for OS support */
            xgetbv(0, eax, edx);
            if ((eax & 0x6) == 0x6)
                rval |= AV_CPU_FLAG_AVX;
        }
#if HAVE_AVX2
        if (max_std_level >= 7) {
            cpuid(7, eax, ebx, ecx, edx);
            if (ebx & 0x00000020)
                rval |= AV_CPU_FLAG_AVX2;
            /* TODO: BMI1/2 */
        }
#endif /* HAVE_AVX2 */
#endif /* HAVE_AVX */
#endif /* HAVE_SSE */
    }

    cpuid(0x80000000, max_ext_level, ebx, ecx, edx);

    if (max_ext_level >= 0x80000001) {
        cpuid(0x80000001, eax, ebx, ecx, ext_caps);
        if (ext_caps & (1U << 31))
            rval |= AV_CPU_FLAG_3DNOW;
        if (ext_caps & (1 << 30))
            rval |= AV_CPU_FLAG_3DNOWEXT;
        if (ext_caps & (1 << 23))
            rval |= AV_CPU_FLAG_MMX;
        if (ext_caps & (1 << 22))
            rval |= AV_CPU_FLAG_MMXEXT;

        /* Allow for selectively disabling SSE2 functions on AMD processors
           with SSE2 support but not SSE4a. This includes Athlon64, some
           Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
           than SSE2 often enough to utilize this special-case flag.
           AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
           so that SSE2 is used unless explicitly disabled by checking
           AV_CPU_FLAG_SSE2SLOW. */
        if (!strncmp(vendor.c, "AuthenticAMD", 12) &&
            rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) {
            rval |= AV_CPU_FLAG_SSE2SLOW;
        }

        /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
         * used unless the OS has AVX support. */
        if (rval & AV_CPU_FLAG_AVX) {
            if (ecx & 0x00000800)
                rval |= AV_CPU_FLAG_XOP;
            if (ecx & 0x00010000)
                rval |= AV_CPU_FLAG_FMA4;
        }
    }

    if (!strncmp(vendor.c, "GenuineIntel", 12)) {
        if (family == 6 && (model == 9 || model == 13 || model == 14)) {
            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
             * 6/14 (core1 "yonah") theoretically support sse2, but it's
             * usually slower than mmx, so let's just pretend they don't.
             * AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is
             * enabled so that SSE2 is not used unless explicitly enabled
             * by checking AV_CPU_FLAG_SSE2SLOW. The same situation
             * applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW. */
            if (rval & AV_CPU_FLAG_SSE2)
                rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2;
            if (rval & AV_CPU_FLAG_SSE3)
                rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3;
        }
        /* The Atom processor has SSSE3 support, which is useful in many cases,
         * but sometimes the SSSE3 version is slower than the SSE2 equivalent
         * on the Atom, but is generally faster on other processors supporting
         * SSSE3. This flag allows for selectively disabling certain SSSE3
         * functions on the Atom. */
        if (family == 6 && model == 28)
            rval |= AV_CPU_FLAG_ATOM;
    }

#endif /* cpuid */

    return rval;
}
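
The bit tests above follow the standard CPUID layout (leaf 1: EDX carries MMX/SSE/SSE2, ECX carries SSE3 and later). A standalone sketch using GCC's <cpuid.h>, independent of the FFmpeg build system, probing the same bits:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;                    /* CPUID leaf 1 unsupported */
    printf("MMX:%d SSE2:%d SSE4.2:%d\n",
           !!(edx & (1 << 23)),      /* same bit as AV_CPU_FLAG_MMX   */
           !!(edx & (1 << 26)),      /* same bit as AV_CPU_FLAG_SSE2  */
           !!(ecx & (1 << 20)));     /* same bit as AV_CPU_FLAG_SSE42 */
    return 0;
}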
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/cpu.h (new file, 75 lines)
@@ -0,0 +1,75 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_CPU_H
#define AVUTIL_X86_CPU_H

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"

#define AV_CPU_FLAG_AMD3DNOW    AV_CPU_FLAG_3DNOW
#define AV_CPU_FLAG_AMD3DNOWEXT AV_CPU_FLAG_3DNOWEXT

#define X86_AMD3DNOW(flags)         CPUEXT(flags, AMD3DNOW)
#define X86_AMD3DNOWEXT(flags)      CPUEXT(flags, AMD3DNOWEXT)
#define X86_MMX(flags)              CPUEXT(flags, MMX)
#define X86_MMXEXT(flags)           CPUEXT(flags, MMXEXT)
#define X86_SSE(flags)              CPUEXT(flags, SSE)
#define X86_SSE2(flags)             CPUEXT(flags, SSE2)
#define X86_SSE3(flags)             CPUEXT(flags, SSE3)
#define X86_SSSE3(flags)            CPUEXT(flags, SSSE3)
#define X86_SSE4(flags)             CPUEXT(flags, SSE4)
#define X86_SSE42(flags)            CPUEXT(flags, SSE42)
#define X86_AVX(flags)              CPUEXT(flags, AVX)
#define X86_FMA4(flags)             CPUEXT(flags, FMA4)
#define X86_AVX2(flags)             CPUEXT(flags, AVX2)

#define EXTERNAL_AMD3DNOW(flags)    CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT)
#define EXTERNAL_MMX(flags)         CPUEXT_SUFFIX(flags, _EXTERNAL, MMX)
#define EXTERNAL_MMXEXT(flags)      CPUEXT_SUFFIX(flags, _EXTERNAL, MMXEXT)
#define EXTERNAL_SSE(flags)         CPUEXT_SUFFIX(flags, _EXTERNAL, SSE)
#define EXTERNAL_SSE2(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE3(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSSE3(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSE4(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, SSE4)
#define EXTERNAL_SSE42(flags)       CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42)
#define EXTERNAL_AVX(flags)         CPUEXT_SUFFIX(flags, _EXTERNAL, AVX)
#define EXTERNAL_FMA4(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4)
#define EXTERNAL_AVX2(flags)        CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)

#define INLINE_AMD3DNOW(flags)      CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags)   CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
#define INLINE_MMX(flags)           CPUEXT_SUFFIX(flags, _INLINE, MMX)
#define INLINE_MMXEXT(flags)        CPUEXT_SUFFIX(flags, _INLINE, MMXEXT)
#define INLINE_SSE(flags)           CPUEXT_SUFFIX(flags, _INLINE, SSE)
#define INLINE_SSE2(flags)          CPUEXT_SUFFIX(flags, _INLINE, SSE2)
#define INLINE_SSE3(flags)          CPUEXT_SUFFIX(flags, _INLINE, SSE3)
#define INLINE_SSSE3(flags)         CPUEXT_SUFFIX(flags, _INLINE, SSSE3)
#define INLINE_SSE4(flags)          CPUEXT_SUFFIX(flags, _INLINE, SSE4)
#define INLINE_SSE42(flags)         CPUEXT_SUFFIX(flags, _INLINE, SSE42)
#define INLINE_AVX(flags)           CPUEXT_SUFFIX(flags, _INLINE, AVX)
#define INLINE_FMA4(flags)          CPUEXT_SUFFIX(flags, _INLINE, FMA4)
#define INLINE_AVX2(flags)          CPUEXT_SUFFIX(flags, _INLINE, AVX2)

void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);
void ff_cpu_xgetbv(int op, int *eax, int *edx);
int  ff_cpu_cpuid_test(void);

#endif /* AVUTIL_X86_CPU_H */
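
The X86_*/EXTERNAL_*/INLINE_* guards above are how init code typically branches on detected flags. A hedged sketch (function names invented for the example):

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

/* dsp_fn_mmx / dsp_fn_sse2 are placeholders for real implementations. */
static void select_dsp(void (**dsp)(void),
                       void (*dsp_fn_mmx)(void), void (*dsp_fn_sse2)(void))
{
    int cpu_flags = av_get_cpu_flags();

    if (X86_MMX(cpu_flags))        /* flag set, any implementation type */
        *dsp = dsp_fn_mmx;
    if (EXTERNAL_SSE2(cpu_flags))  /* flag set and yasm code available  */
        *dsp = dsp_fn_sse2;
}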
@@ -0,0 +1,91 @@
;*****************************************************************************
;* Copyright (C) 2005-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
    push rbx
    push  r4
    push  r3
    push  r2
    push  r1
    mov  eax, r0d
    xor  ecx, ecx
    cpuid
    pop   r4
    mov [r4], eax
    pop   r4
    mov [r4], ebx
    pop   r4
    mov [r4], ecx
    pop   r4
    mov [r4], edx
    pop  rbx
    RET

;-----------------------------------------------------------------------------
; void ff_cpu_xgetbv(int op, int *eax, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
    push  r2
    push  r1
    mov  ecx, r0d
    xgetbv
    pop   r4
    mov [r4], eax
    pop   r4
    mov [r4], edx
    RET

%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int ff_cpu_cpuid_test(void)
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
    pushfd
    push    ebx
    push    ebp
    push    esi
    push    edi
    pushfd
    pop     eax
    mov     ebx, eax
    xor     eax, 0x200000
    push    eax
    popfd
    pushfd
    pop     eax
    xor     eax, ebx
    pop     edi
    pop     esi
    pop     ebp
    pop     ebx
    popfd
    ret
%endif
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/emms.asm (new file, 30 lines)
@@ -0,0 +1,30 @@
;*****************************************************************************
;* Copyright (C) 2013 Martin Storsjo
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void avpriv_emms_yasm(void)
;-----------------------------------------------------------------------------
cvisible emms_yasm, 0, 0
    emms
    RET
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/emms.h (new file, 47 lines)
@@ -0,0 +1,47 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_EMMS_H
#define AVUTIL_X86_EMMS_H

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"

void avpriv_emms_yasm(void);

#if HAVE_MMX_INLINE
#   define emms_c emms_c
/**
 * Empty MMX state.
 * This must be called between any DSP function and float/double code,
 * for example: sin(); dsp->idct_put(); emms_c(); cos()
 */
static av_always_inline void emms_c(void)
{
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX)
        __asm__ volatile ("emms" ::: "memory");
}
#elif HAVE_MMX && HAVE_MM_EMPTY
#   include <mmintrin.h>
#   define emms_c _mm_empty
#elif HAVE_MMX_EXTERNAL
#   define emms_c avpriv_emms_yasm
#endif /* HAVE_MMX_INLINE */

#endif /* AVUTIL_X86_EMMS_H */
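
To make the comment above concrete, a small sketch of the calling pattern (idct_put stands in for any MMX-based DSP routine; both names are illustrative):

#include <math.h>
#include <stdint.h>
#include "libavutil/x86/emms.h"

void render(int16_t *block, void (*idct_put)(int16_t *), double phase)
{
    idct_put(block);          /* MMX code leaves the x87 tag word dirty */
    emms_c();                 /* restore FPU state (no-op without MMX)  */
    double gain = sin(phase); /* float/double math is safe again */
    (void)gain;
}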
@@ -0,0 +1,290 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    mova       xmm0, xmm2
%endif
    shufps     xmm0, xmm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xmm0, 1
%endif
%endif
    lea    lenq, [lend*4-2*mmsize]
.loop:
    mulps    m1, m0, [srcq+lenq       ]
    mulps    m2, m0, [srcq+lenq+mmsize]
    addps    m1, m1, [dstq+lenq       ]
    addps    m2, m2, [dstq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub    lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    movlhps      xmm2, xmm2
%if cpuflag(avx)
    vinsertf128  ymm2, ymm2, xmm2, 1
%endif
    SWAP 0, 2
%else
    movlhps      xmm0, xmm0
%if cpuflag(avx)
    vinsertf128  ymm0, ymm0, xmm0, 1
%endif
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    mova   [dstq+lenq       ], m1
    mova   [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova [src1q + lenq], m2
    mova [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET
@@ -0,0 +1,156 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
#include "asm.h"

void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                        int len);
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
                        int len);

void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
                               int len);
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
                               int len);

void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
                               int len);

void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
                                double mul, int len);
void ff_vector_dmul_scalar_avx(double *dst, const double *src,
                               double mul, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_butterflies_float_sse(float *src0, float *src1, int len);

#if HAVE_6REGS && HAVE_INLINE_ASM
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl 1b                          \n"
        "femms                          \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}

static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl 1b                          \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */

av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_6REGS && HAVE_INLINE_ASM
    if (INLINE_AMD3DNOWEXT(cpu_flags)) {
        fdsp->vector_fmul_window = vector_fmul_window_3dnowext;
    }
    if (INLINE_SSE(cpu_flags)) {
        fdsp->vector_fmul_window = vector_fmul_window_sse;
    }
#endif
    if (EXTERNAL_SSE(cpu_flags)) {
        fdsp->vector_fmul         = ff_vector_fmul_sse;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_sse;
        fdsp->vector_fmul_scalar  = ff_vector_fmul_scalar_sse;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_sse;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
        fdsp->scalarproduct_float = ff_scalarproduct_float_sse;
        fdsp->butterflies_float   = ff_butterflies_float_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        fdsp->vector_fmul         = ff_vector_fmul_avx;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_avx;
        fdsp->vector_dmul_scalar  = ff_vector_dmul_scalar_avx;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_avx;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
    }
}
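
A hedged usage sketch of the dispatch above: callers reach the asm through AVFloatDSPContext after avpriv_float_dsp_init(), which invokes ff_float_dsp_init_x86() internally. The alignment and length-multiple requirements below are my reading of float_dsp.h, not something this diff states.

#include "libavutil/float_dsp.h"
#include "libavutil/mem.h"

void demo(void)
{
    AVFloatDSPContext fdsp;
    avpriv_float_dsp_init(&fdsp, 0);   /* second arg: bit-exact mode flag */

    /* av_malloc returns suitably aligned memory; len 64 is a multiple
     * of the SIMD width required by the asm versions (assumed). */
    float *a   = av_malloc(64 * sizeof(*a));
    float *b   = av_malloc(64 * sizeof(*b));
    float *dst = av_malloc(64 * sizeof(*dst));

    fdsp.vector_fmul(dst, a, b, 64);   /* SSE or AVX version, per CPU */

    av_free(a); av_free(b); av_free(dst);
}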
@@ -0,0 +1,97 @@
/*
 * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_INTREADWRITE_H
#define AVUTIL_X86_INTREADWRITE_H

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"

#if HAVE_MMX

#if !HAVE_FAST_64BIT && defined(__MMX__)

#define AV_COPY64 AV_COPY64
static av_always_inline void AV_COPY64(void *d, const void *s)
{
    __asm__("movq   %1, %%mm0  \n\t"
            "movq   %%mm0, %0  \n\t"
            : "=m"(*(uint64_t*)d)
            : "m" (*(const uint64_t*)s)
            : "mm0");
}

#define AV_SWAP64 AV_SWAP64
static av_always_inline void AV_SWAP64(void *a, void *b)
{
    __asm__("movq   %1, %%mm0  \n\t"
            "movq   %0, %%mm1  \n\t"
            "movq   %%mm0, %0  \n\t"
            "movq   %%mm1, %1  \n\t"
            : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
            ::"mm0", "mm1");
}

#define AV_ZERO64 AV_ZERO64
static av_always_inline void AV_ZERO64(void *d)
{
    __asm__("pxor %%mm0, %%mm0  \n\t"
            "movq %%mm0, %0     \n\t"
            : "=m"(*(uint64_t*)d)
            :: "mm0");
}

#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */

#ifdef __SSE__

#define AV_COPY128 AV_COPY128
static av_always_inline void AV_COPY128(void *d, const void *s)
{
    struct v {uint64_t v[2];};

    __asm__("movaps   %1, %%xmm0  \n\t"
            "movaps   %%xmm0, %0  \n\t"
            : "=m"(*(struct v*)d)
            : "m" (*(const struct v*)s)
            : "xmm0");
}

#endif /* __SSE__ */

#ifdef __SSE2__

#define AV_ZERO128 AV_ZERO128
static av_always_inline void AV_ZERO128(void *d)
{
    struct v {uint64_t v[2];};

    __asm__("pxor %%xmm0, %%xmm0  \n\t"
            "movdqa   %%xmm0, %0  \n\t"
            : "=m"(*(struct v*)d)
            :: "xmm0");
}

#endif /* __SSE2__ */

#endif /* HAVE_MMX */

#endif /* AVUTIL_X86_INTREADWRITE_H */
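
Illustrative caller (mine, not from the diff); note AV_COPY128 above uses movaps, so both pointers are assumed 16-byte aligned:

#include <stdint.h>
#include "libavutil/intreadwrite.h"  /* pulls in the overrides above */

/* Copy one 16-byte pixel row without going through the integer registers. */
void copy_row16(uint8_t *dst, const uint8_t *src)
{
    AV_COPY128(dst, src);
}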
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/lls.asm (new file, 235 lines)
@@ -0,0 +1,235 @@
;******************************************************************************
;* linear least squares model
;*
;* Copyright (c) 2013 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

%define MAX_VARS 32
%define MAX_VARS_ALIGN (MAX_VARS+4)
%define COVAR_STRIDE MAX_VARS_ALIGN*8
%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]

struc LLSModel2
    .covariance:  resq MAX_VARS_ALIGN*MAX_VARS_ALIGN
    .coeff:       resq MAX_VARS*MAX_VARS
    .variance:    resq MAX_VARS
    .indep_count: resd 1
endstruc

%macro ADDPD_MEM 2
%if cpuflag(avx)
    vaddpd %2, %2, %1
%else
    addpd  %2, %1
%endif
    mova   %1, %2
%endmacro

INIT_XMM sse2
%define movdqa movaps
cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
%define covarq ctxq
    mov     id, [ctxq + LLSModel2.indep_count]
    lea   varq, [varq + iq*8]
    neg     iq
    mov covar2q, covarq
.loopi:
    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
    mova    m1, [varq + iq*8]
    mova    m3, [varq + iq*8 + 16]
    pshufd  m4, m1, q1010
    pshufd  m5, m1, q3232
    pshufd  m6, m3, q1010
    pshufd  m7, m3, q3232
    mulpd   m0, m1, m4
    mulpd   m1, m1, m5
    lea covarq, [covar2q + 16]
    ADDPD_MEM COVAR(-2,0), m0
    ADDPD_MEM COVAR(-2,1), m1
    lea     jq, [iq + 2]
    cmp     jd, -2
    jg .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(0,0), m0
    ADDPD_MEM COVAR(0,1), m1
    ADDPD_MEM COVAR(0,2), m2
    ADDPD_MEM COVAR(0,3), m3
    mova    m3, [varq + jq*8 + 16]
    mulpd   m0, m4, m3
    mulpd   m1, m5, m3
    mulpd   m2, m6, m3
    mulpd   m3, m3, m7
    ADDPD_MEM COVAR(2,0), m0
    ADDPD_MEM COVAR(2,1), m1
    ADDPD_MEM COVAR(2,2), m2
    ADDPD_MEM COVAR(2,3), m3
    mova    m3, [varq + jq*8 + 32]
    add covarq, 32
    add     jq, 4
    cmp     jd, -2
    jle .loop4x4
.skip4x4:
    test    jd, jd
    jg .skip2x4
    mulpd   m4, m3
    mulpd   m5, m3
    mulpd   m6, m3
    mulpd   m7, m3
    ADDPD_MEM COVAR(0,0), m4
    ADDPD_MEM COVAR(0,1), m5
    ADDPD_MEM COVAR(0,2), m6
    ADDPD_MEM COVAR(0,3), m7
.skip2x4:
    add     iq, 4
    add covar2q, 4*COVAR_STRIDE+32
    cmp     id, -2
    jle .loopi
    test    id, id
    jg .ret
    mov     jq, iq
%define covarq covar2q
.loop2x1:
    movsd   m0, [varq + iq*8]
    movlhps m0, m0
    mulpd   m0, [varq + jq*8]
    ADDPD_MEM COVAR(0,0), m0
    inc     iq
    add covarq, COVAR_STRIDE
    test    id, id
    jle .loop2x1
.ret:
    REP_RET

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
%define covarq ctxq
    mov  countd, [ctxq + LLSModel2.indep_count]
    lea count2d, [countq-2]
    xor      id, id
.loopi:
    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
    mova    ymm1, [varq + iq*8]
    vbroadcastsd ymm4, [varq + iq*8]
    vbroadcastsd ymm5, [varq + iq*8 + 8]
    vbroadcastsd ymm6, [varq + iq*8 + 16]
    vbroadcastsd ymm7, [varq + iq*8 + 24]
    vextractf128 xmm3, ymm1, 1
    vmulpd  ymm0, ymm1, ymm4
    vmulpd  ymm1, ymm1, ymm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(iq  ,0), ymm0
    ADDPD_MEM COVAR(iq  ,1), ymm1
    ADDPD_MEM COVAR(iq+2,2), xmm2
    ADDPD_MEM COVAR(iq+2,3), xmm3
    lea     jd, [iq + 4]
    cmp     jd, count2d
    jg .skip4x4
.loop4x4:
    ; Compute all 16 pairwise products of a 4x4 block
    mova    ymm3, [varq + jq*8]
    vmulpd  ymm0, ymm3, ymm4
    vmulpd  ymm1, ymm3, ymm5
    vmulpd  ymm2, ymm3, ymm6
    vmulpd  ymm3, ymm3, ymm7
    ADDPD_MEM COVAR(jq,0), ymm0
    ADDPD_MEM COVAR(jq,1), ymm1
    ADDPD_MEM COVAR(jq,2), ymm2
    ADDPD_MEM COVAR(jq,3), ymm3
    add     jd, 4
    cmp     jd, count2d
    jle .loop4x4
.skip4x4:
    cmp     jd, countd
    jg .skip2x4
    mova    xmm3, [varq + jq*8]
    vmulpd  xmm0, xmm3, xmm4
    vmulpd  xmm1, xmm3, xmm5
    vmulpd  xmm2, xmm3, xmm6
    vmulpd  xmm3, xmm3, xmm7
    ADDPD_MEM COVAR(jq,0), xmm0
    ADDPD_MEM COVAR(jq,1), xmm1
    ADDPD_MEM COVAR(jq,2), xmm2
    ADDPD_MEM COVAR(jq,3), xmm3
.skip2x4:
    add     id, 4
    add covarq, 4*COVAR_STRIDE
    cmp     id, count2d
    jle .loopi
    cmp     id, countd
    jg .ret
    mov     jd, id
.loop2x1:
    vmovddup xmm0, [varq + iq*8]
    vmulpd   xmm0, [varq + jq*8]
    ADDPD_MEM COVAR(jq,0), xmm0
    inc     id
    add covarq, COVAR_STRIDE
    cmp     id, countd
    jle .loop2x1
.ret:
    REP_RET
%endif

INIT_XMM sse2
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
    ; This function is often called on the same buffer as update_lls, but with
    ; an offset. They can't both be aligned.
    ; Load halves rather than movu to avoid store-forwarding stalls, since the
    ; input was initialized immediately prior to this function using scalar math.
%define coefsq ctxq
    mov     id, orderd
    imul    orderd, MAX_VARS
    lea     coefsq, [ctxq + LLSModel2.coeff + orderq*8]
    movsd   m0, [varq]
    movhpd  m0, [varq + 8]
    mulpd   m0, [coefsq]
    lea     coefsq, [coefsq + iq*8]
    lea     varq, [varq + iq*8]
    neg     iq
    add     iq, 2
.loop:
    movsd   m1, [varq + iq*8]
    movhpd  m1, [varq + iq*8 + 8]
    mulpd   m1, [coefsq + iq*8]
    addpd   m0, m1
    add     iq, 2
    jl .loop
    jg .skip1
    movsd   m1, [varq + iq*8]
    mulsd   m1, [coefsq + iq*8]
    addpd   m0, m1
.skip1:
    movhlps m1, m0
    addsd   m0, m1
%if ARCH_X86_32
    movsd   r0m, m0
    fld     qword r0m
%endif
    RET
@@ -0,0 +1,41 @@
/*
 * linear least squares model
 *
 * Copyright (c) 2013 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/lls2.h"
#include "libavutil/x86/cpu.h"

void ff_update_lls_sse2(LLSModel2 *m, double *var);
void ff_update_lls_avx(LLSModel2 *m, double *var);
double ff_evaluate_lls_sse2(LLSModel2 *m, double *var, int order);

av_cold void ff_init_lls_x86(LLSModel2 *m)
{
    int cpu_flags = av_get_cpu_flags();
    if (EXTERNAL_SSE2(cpu_flags)) {
        m->update_lls = ff_update_lls_sse2;
        if (m->indep_count >= 4)
            m->evaluate_lls = ff_evaluate_lls_sse2;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        m->update_lls = ff_update_lls_avx;
    }
}
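
A hedged sketch of how the model is driven once ff_init_lls_x86() has filled in the function pointers. In FFmpeg proper this runs via the generic lls2 init; calling it directly keeps the sketch short. The var[] layout (dependent value followed by the regressors) is an assumption about lls2.h, not something this diff shows.

#include <string.h>
#include "libavutil/lls2.h"
#include "libavutil/x86/cpu.h"

double fit_and_predict(double samples[][9], int nb_samples)
{
    LLSModel2 m;
    memset(&m, 0, sizeof(m));
    m.indep_count = 8;
    ff_init_lls_x86(&m);              /* may install SSE2/AVX versions */
    if (!m.update_lls)                /* no SIMD; real code falls back to C */
        return 0.0;

    for (int i = 0; i < nb_samples; i++)
        m.update_lls(&m, samples[i]); /* accumulate covariances */

    /* after solving for coefficients elsewhere, evaluate one sample: */
    return m.evaluate_lls(&m, samples[0] + 1, 8);
}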
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/timer.h (new file, 44 lines)
@@ -0,0 +1,44 @@
/*
 * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_X86_TIMER_H
#define AVUTIL_X86_TIMER_H

#include <stdint.h>

#if HAVE_INLINE_ASM

#define AV_READ_TIME read_time

static inline uint64_t read_time(void)
{
    uint32_t a, d;
    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
    return ((uint64_t)d << 32) + a;
}

#elif HAVE_RDTSC

#include <intrin.h>
#define AV_READ_TIME __rdtsc

#endif /* HAVE_INLINE_ASM */

#endif /* AVUTIL_X86_TIMER_H */
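
Typical use of AV_READ_TIME (a sketch; TSC wrap-around, core migration, and frequency scaling are ignored here, as they are in FFmpeg's own START/STOP_TIMER macros):

#include <stdio.h>
#include <inttypes.h>
#include "libavutil/x86/timer.h"

extern void work_under_test(void);   /* placeholder for the measured code */

void time_once(void)
{
    uint64_t t0 = AV_READ_TIME();
    work_under_test();
    printf("%" PRIu64 " cycles\n", AV_READ_TIME() - t0);
}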
@@ -0,0 +1,73 @@
/*
 * check XMM registers for clobbers on Win64
 * Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

#include "libavutil/bswap.h"

#define storexmmregs(mem)               \
    __asm__ volatile(                   \
        "movups %%xmm6 , 0x00(%0)\n\t"  \
        "movups %%xmm7 , 0x10(%0)\n\t"  \
        "movups %%xmm8 , 0x20(%0)\n\t"  \
        "movups %%xmm9 , 0x30(%0)\n\t"  \
        "movups %%xmm10, 0x40(%0)\n\t"  \
        "movups %%xmm11, 0x50(%0)\n\t"  \
        "movups %%xmm12, 0x60(%0)\n\t"  \
        "movups %%xmm13, 0x70(%0)\n\t"  \
        "movups %%xmm14, 0x80(%0)\n\t"  \
        "movups %%xmm15, 0x90(%0)\n\t"  \
        :: "r"(mem) : "memory")

#define testxmmclobbers(func, ctx, ...)                         \
    uint64_t xmm[2][10][2];                                     \
    int ret;                                                    \
    storexmmregs(xmm[0]);                                       \
    ret = __real_ ## func(ctx, __VA_ARGS__);                    \
    storexmmregs(xmm[1]);                                       \
    if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) {               \
        int i;                                                  \
        av_log(ctx, AV_LOG_ERROR,                               \
               "XMM REGS CLOBBERED IN %s!\n", #func);           \
        for (i = 0; i < 10; i ++)                               \
            if (xmm[0][i][0] != xmm[1][i][0] ||                 \
                xmm[0][i][1] != xmm[1][i][1]) {                 \
                av_log(ctx, AV_LOG_ERROR,                       \
                       "xmm%-2d = %016"PRIx64"%016"PRIx64"\n",  \
                       6 + i, av_bswap64(xmm[0][i][0]),         \
                       av_bswap64(xmm[0][i][1]));               \
                av_log(ctx, AV_LOG_ERROR,                       \
                       "    -> %016"PRIx64"%016"PRIx64"\n",     \
                       av_bswap64(xmm[1][i][0]),                \
                       av_bswap64(xmm[1][i][1]));               \
            }                                                   \
        abort();                                                \
    }                                                           \
    return ret

#define wrap(func)       \
    int __real_ ## func; \
    int __wrap_ ## func; \
    int __wrap_ ## func
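
The wrap()/testxmmclobbers() pair is built for the linker's --wrap=symbol option: calls to the wrapped function are redirected to __wrap_func, which snapshots xmm6-xmm15 around the real call. A hedged sketch of a wrapper (the target function and its signature are illustrative):

/* Build with: -Wl,--wrap=avcodec_open2 */
wrap(avcodec_open2(AVCodecContext *avctx,
                   AVCodec *codec,
                   AVDictionary **options))
{
    testxmmclobbers(avcodec_open2, avctx, codec, options);
}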
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/x86inc.asm (new file, 1480 lines; diff suppressed because it is too large)
contrib/sdk/sources/ffmpeg/ffmpeg-2.1/libavutil/x86/x86util.asm (new file, 680 lines; diff truncated below)
@@ -0,0 +1,680 @@
;*****************************************************************************
;* x86util.asm
;*****************************************************************************
;* Copyright (C) 2008-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%define private_prefix ff
%define public_prefix avpriv
%define cpuflags_mmxext cpuflags_mmx2

%include "libavutil/x86/x86inc.asm"

%macro SBUTTERFLY 4
%if avx_enabled == 0
    mova      m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
%else
    punpckh%1 m%4, m%2, m%3
    punpckl%1 m%2, m%3
%endif
    SWAP %3, %4
%endmacro
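
; Worked illustration (MMX-sized registers for brevity): SBUTTERFLY wd, 0, 1, 2
; with m0 = a3 a2 a1 a0 and m1 = b3 b2 b1 b0 leaves
;     m0 = b1 a1 b0 a0   (interleaved low halves)
;     m1 = b3 a3 b2 a2   (interleaved high halves)
; m2 is only used as scratch on the non-AVX path.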

%macro SBUTTERFLY2 4
    punpckl%1 m%4, m%2, m%3
    punpckh%1 m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro

%macro SBUTTERFLYPS 3
    unpcklps m%3, m%1, m%2
    unpckhps m%1, m%1, m%2
    SWAP %1, %3, %2
%endmacro

%macro TRANSPOSE4x4B 5
    SBUTTERFLY bw, %1, %2, %5
    SBUTTERFLY bw, %3, %4, %5
    SBUTTERFLY wd, %1, %3, %5
    SBUTTERFLY wd, %2, %4, %5
    SWAP %2, %3
%endmacro

%macro TRANSPOSE4x4W 5
    SBUTTERFLY wd, %1, %2, %5
    SBUTTERFLY wd, %3, %4, %5
    SBUTTERFLY dq, %1, %3, %5
    SBUTTERFLY dq, %2, %4, %5
    SWAP %2, %3
%endmacro
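
; Note: the 4x4 word transpose above is two rounds of butterflies: the wd
; round interleaves words within row pairs, the dq round interleaves those
; results as dwords, and the final SWAP restores column order in m%1..m%4.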

%macro TRANSPOSE2x4x4W 5
    SBUTTERFLY wd,  %1, %2, %5
    SBUTTERFLY wd,  %3, %4, %5
    SBUTTERFLY dq,  %1, %3, %5
    SBUTTERFLY dq,  %2, %4, %5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
%endmacro

%macro TRANSPOSE4x4D 5
    SBUTTERFLY dq,  %1, %2, %5
    SBUTTERFLY dq,  %3, %4, %5
    SBUTTERFLY qdq, %1, %3, %5
    SBUTTERFLY qdq, %2, %4, %5
    SWAP %2, %3
%endmacro

; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
%macro TRANSPOSE4x4PS 5
    SBUTTERFLYPS %1, %2, %5
    SBUTTERFLYPS %3, %4, %5
    movlhps m%5, m%1, m%3
    movhlps m%3, m%1
    SWAP %5, %1
    movlhps m%5, m%2, m%4
    movhlps m%4, m%2
    SWAP %5, %2, %3
%endmacro

%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
    SBUTTERFLY wd,  %1, %2, %9
    SBUTTERFLY wd,  %3, %4, %9
    SBUTTERFLY wd,  %5, %6, %9
    SBUTTERFLY wd,  %7, %8, %9
    SBUTTERFLY dq,  %1, %3, %9
    SBUTTERFLY dq,  %2, %4, %9
    SBUTTERFLY dq,  %5, %7, %9
    SBUTTERFLY dq,  %6, %8, %9
    SBUTTERFLY qdq, %1, %5, %9
    SBUTTERFLY qdq, %2, %6, %9
    SBUTTERFLY qdq, %3, %7, %9
    SBUTTERFLY qdq, %4, %8, %9
    SWAP %2, %5
    SWAP %4, %7
%else
; in:  m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
    movdqa %9, m%7
%endif
    SBUTTERFLY wd,  %1, %2, %7
    movdqa %10, m%2
    movdqa m%7, %9
    SBUTTERFLY wd,  %3, %4, %2
    SBUTTERFLY wd,  %5, %6, %2
    SBUTTERFLY wd,  %7, %8, %2
    SBUTTERFLY dq,  %1, %3, %2
    movdqa %9, m%3
    movdqa m%2, %10
    SBUTTERFLY dq,  %2, %4, %3
    SBUTTERFLY dq,  %5, %7, %3
    SBUTTERFLY dq,  %6, %8, %3
    SBUTTERFLY qdq, %1, %5, %3
    SBUTTERFLY qdq, %2, %6, %3
    movdqa %10, m%2
    movdqa m%3, %9
    SBUTTERFLY qdq, %3, %7, %2
    SBUTTERFLY qdq, %4, %8, %2
    SWAP %2, %5
    SWAP %4, %7
%if %0<11
    movdqa m%5, %10
%endif
%endif
%endmacro

; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
%macro PABSW 2
%if cpuflag(ssse3)
    pabsw   %1, %2
%elif cpuflag(mmxext)
    pxor    %1, %1
    psubw   %1, %2
    pmaxsw  %1, %2
%else
    pxor    %1, %1
    pcmpgtw %1, %2
    pxor    %2, %1
    psubw   %2, %1
    SWAP    %1, %2
%endif
%endmacro
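
; The fallback paths build |x| from two's-complement identities:
;   mmxext: |x| = max(x, 0 - x)                        (psubw + pmaxsw)
;   mmx:    with m = (0 > x) from pcmpgtw, |x| = (x ^ m) - m, i.e.
;           negative words are bit-flipped, then incremented.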

%macro PSIGNW_MMX 2
    pxor  %1, %2
    psubw %1, %2
%endmacro

%macro PSIGNW_SSSE3 2
    psignw %1, %2
%endmacro

%macro ABS1 2
%if cpuflag(ssse3)
    pabsw   %1, %1
%elif cpuflag(mmxext) ; a, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%else ; a, tmp
    pxor    %2, %2
    pcmpgtw %2, %1
    pxor    %1, %2
    psubw   %1, %2
%endif
%endmacro

%macro ABS2 4
%if cpuflag(ssse3)
    pabsw   %1, %1
    pabsw   %2, %2
%elif cpuflag(mmxext) ; a, b, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%else ; a, b, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    pcmpgtw %3, %1
    pcmpgtw %4, %2
    pxor    %1, %3
    pxor    %2, %4
    psubw   %1, %3
    psubw   %2, %4
%endif
%endmacro

%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
%if cpuflag(ssse3)
    pabsb   %1, %1
%else
    pxor    %2, %2
    psubb   %2, %1
    pminub  %1, %2
%endif
%endmacro

%macro ABSB2 4 ; src1, src2, tmp1, tmp2 (tmp1/2 unused for SSSE3)
%if cpuflag(ssse3)
    pabsb   %1, %1
    pabsb   %2, %2
%else
    pxor    %3, %3
    pxor    %4, %4
    psubb   %3, %1
    psubb   %4, %2
    pminub  %1, %3
    pminub  %2, %4
%endif
%endmacro

%macro ABSD2_MMX 4
    pxor    %3, %3
    pxor    %4, %4
    pcmpgtd %3, %1
    pcmpgtd %4, %2
    pxor    %1, %3
    pxor    %2, %4
    psubd   %1, %3
    psubd   %2, %4
%endmacro

%macro ABS4 6
    ABS2 %1, %2, %5, %6
    ABS2 %3, %4, %5, %6
%endmacro

%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
    movd      %1, [%2-3]
    pshufb    %1, %3
%else
    movd      %1, [%2-3] ;to avoid crossing a cacheline
    punpcklbw %1, %1
    SPLATW    %1, %1, 3
%endif
%endmacro

%macro SPLATB_REG 3
%if cpuflag(ssse3)
    movd      %1, %2d
    pshufb    %1, %3
%else
    movd      %1, %2d
    punpcklbw %1, %1
    SPLATW    %1, %1, 0
%endif
%endmacro

%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
    palignr %1, %2, %3, %4
%else
    palignr %1, %2, %3
%endif
%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
    %define %%dst %1
%if %0==5
%ifnidn %1, %2
    mova    %%dst, %2
%endif
    %rotate 1
%endif
%ifnidn %4, %2
    mova    %4, %2
%endif
%if mmsize==8
    psllq   %%dst, (8-%3)*8
    psrlq   %4, %3*8
%else
    pslldq  %%dst, 16-%3
    psrldq  %4, %3
%endif
    por     %%dst, %4
%endif
%endmacro
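
; Emulation identity used above: the concatenation src1:src2 shifted right
; by %3 bytes equals (src1 << (mmsize-%3) bytes) | (src2 >> %3 bytes),
; hence the psllq/psrlq pair for MMX and pslldq/psrldq for XMM registers.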

%macro PAVGB 2
%if cpuflag(mmxext)
    pavgb   %1, %2
%elif cpuflag(3dnow)
    pavgusb %1, %2
%endif
%endmacro

%macro PSHUFLW 1+
%if mmsize == 8
    pshufw %1
%else
    pshuflw %1
%endif
%endmacro

%macro PSWAPD 2
%if cpuflag(mmxext)
    pshufw    %1, %2, q1032
%elif cpuflag(3dnowext)
    pswapd    %1, %2
%elif cpuflag(3dnow)
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endif
%endmacro

%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand   m%3, m%5, m%4 ; src .. y6 .. y4
    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova   m%1, %5
    pand   m%3, m%1, m%4 ; src .. y6 .. y4
    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw  m%2, 8        ; dst .. y7 .. y5
    psrlw  m%4, 8        ; src .. y7 .. y5
%endmacro

%macro SUMSUB_BA 3-4
%if %0==3
    padd%1  m%2, m%3
    padd%1  m%3, m%3
    psub%1  m%3, m%2
%else
%if avx_enabled == 0
    mova    m%4, m%2
    padd%1  m%2, m%3
    psub%1  m%3, m%4
%else
    padd%1  m%4, m%2, m%3
    psub%1  m%3, m%2
    SWAP    %2, %4
%endif
%endif
%endmacro
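
; The 3-argument form needs no scratch register:
;     m%2 = a + b
;     m%3 = 2*b - (a + b) = b - a
; so the pair ends up holding the butterfly (sum, difference) in place.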

%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1  m%2, m%3
    padd%1  m%4, m%5
    padd%1  m%3, m%3
    padd%1  m%5, m%5
    psub%1  m%3, m%2
    psub%1  m%5, m%4
%endif
%endmacro

%macro SUMSUB2_AB 4
%ifnum %3
    psub%1  m%4, m%2, m%3
    psub%1  m%4, m%3
    padd%1  m%2, m%2
    padd%1  m%2, m%3
%else
    mova    m%4, m%2
    padd%1  m%2, m%2
    padd%1  m%2, %3
    psub%1  m%4, %3
    psub%1  m%4, %3
%endif
%endmacro

%macro SUMSUB2_BA 4
%if avx_enabled == 0
    mova    m%4, m%2
    padd%1  m%2, m%3
    padd%1  m%2, m%3
    psub%1  m%3, m%4
    psub%1  m%3, m%4
%else
    padd%1  m%4, m%2, m%3
    padd%1  m%4, m%3
    psub%1  m%3, m%2
    psub%1  m%3, m%2
    SWAP    %2, %4
%endif
%endmacro

%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1  m%5, m%2, 1  ; %3: %3>>1
    psra%1  m%4, m%3, 1  ; %2: %2>>1
    padd%1  m%4, m%2     ; %3: %3>>1+%2
    psub%1  m%5, m%3     ; %2: %2>>1-%3
    SWAP     %2, %5
    SWAP     %3, %4
%else
    mova    %5, m%2
    mova    %4, m%3
    psra%1  m%3, 1  ; %3: %3>>1
    psra%1  m%2, 1  ; %2: %2>>1
    padd%1  m%3, %5 ; %3: %3>>1+%2
    psub%1  m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro

%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova     [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro

%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
    SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro


%macro LOAD_DIFF 5
%ifidn %3, none
    movh      %1, %4
    movh      %2, %5
    punpcklbw %1, %2
    punpcklbw %2, %2
    psubw     %1, %2
%else
    movh      %1, %4
    punpcklbw %1, %3
    movh      %2, %5
    punpcklbw %2, %3
    psubw     %1, %2
%endif
%endmacro

%macro STORE_DCT 6
    movq   [%5+%6+ 0], m%1
    movq   [%5+%6+ 8], m%2
    movq   [%5+%6+16], m%3
    movq   [%5+%6+24], m%4
    movhps [%5+%6+32], m%1
    movhps [%5+%6+40], m%2
    movhps [%5+%6+48], m%3
    movhps [%5+%6+56], m%4
%endmacro

%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro DIFFx2 6-7
    movh       %3, %5
    punpcklbw  %3, %4
    psraw      %1, 6
    paddsw     %1, %3
    movh       %3, %6
    punpcklbw  %3, %4
    psraw      %2, 6
    paddsw     %2, %3
    packuswb   %2, %1
%endmacro

%macro STORE_DIFF 4
    movh       %2, %4
    punpcklbw  %2, %3
    psraw      %1, 6
    paddsw     %1, %2
    packuswb   %1, %1
    movh       %4, %1
%endmacro

%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movh       %3, [%7]
    movh       %4, [%7+%8]
    psraw      %1, %6
    psraw      %2, %6
    punpcklbw  %3, %5
    punpcklbw  %4, %5
    paddw      %3, %1
    paddw      %4, %2
    packuswb   %3, %5
    packuswb   %4, %5
    movh     [%7], %3
    movh  [%7+%8], %4
%endmacro

%macro PMINUB 3 ; dst, src, ignored
%if cpuflag(mmxext)
    pminub   %1, %2
%else ; dst, src, tmp
    mova     %3, %1
    psubusb  %3, %2
    psubb    %1, %3
%endif
%endmacro
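
; The pre-mmxext path exploits unsigned saturation: psubusb yields
; t = max(a - b, 0), and a - t = min(a, b) for unsigned bytes.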

%macro SPLATW 2-3 0
%if mmsize == 16
    pshuflw    %1, %2, (%3)*0x55
    punpcklqdq %1, %1
%elif cpuflag(mmxext)
    pshufw     %1, %2, (%3)*0x55
%else
%ifnidn %1, %2
    mova       %1, %2
%endif
%if %3 & 2
    punpckhwd  %1, %1
%else
    punpcklwd  %1, %1
%endif
%if %3 & 1
    punpckhwd  %1, %1
%else
    punpcklwd  %1, %1
%endif
%endif
%endmacro
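
; The shuffle immediate (%3)*0x55 replicates the 2-bit word index into all
; four fields of the control byte, e.g. %3 == 2 gives 0xAA = 10101010b,
; which selects word 2 for every output word.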

%macro SPLATD 1
%if mmsize == 8
    punpckldq  %1, %1
%elif cpuflag(sse2)
    pshufd  %1, %1, 0
%elif cpuflag(sse)
    shufps  %1, %1, 0
%endif
%endmacro

%macro CLIPW 3 ;(dst, min, max)
    pmaxsw %1, %2
    pminsw %1, %3
%endmacro

%macro PMINSD_MMX 3 ; dst, src, tmp
    mova      %3, %2
    pcmpgtd   %3, %1
    pxor      %1, %2
    pand      %1, %3
    pxor      %1, %2
%endmacro
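
; Branchless signed minimum: with mask m = (src > dst) from pcmpgtd,
;     dst = ((dst ^ src) & m) ^ src
; evaluates to dst where src > dst and to src everywhere else.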

%macro PMAXSD_MMX 3 ; dst, src, tmp
    mova      %3, %1
    pcmpgtd   %3, %2
    pand      %1, %3
    pandn     %3, %2
    por       %1, %3
%endmacro

%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
    PMINSD_MMX %1, %3, %4
    PMAXSD_MMX %1, %2, %4
%endmacro

%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
    cvtdq2ps  %1, %1
    minps     %1, %3
    maxps     %1, %2
    cvtps2dq  %1, %1
%endmacro

%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
    pminsd  %1, %3
    pmaxsd  %1, %2
%endmacro

%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
%if cpuflag(avx)
    vbroadcastss %1, %2
%else ; sse
    movss        %1, %2
    shufps       %1, %1, 0
%endif
%endmacro

%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
%if cpuflag(avx) && mmsize == 32
    vbroadcastsd %1, %2
%elif cpuflag(sse3)
    movddup      %1, %2
%else ; sse2
    movsd        %1, %2
    movlhps      %1, %1
%endif
%endmacro

%macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1>=0x80
            db %1, %1
        %else
            db %1*2
            db %1*2+1
        %endif
        %rotate 1
    %endrep
%endmacro
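
; Each word index expands to the byte pair (2*i, 2*i+1); an index >= 0x80 is
; stored as two bytes with the high bit set, which pshufb turns into zeroed
; output bytes. E.g. SHUFFLE_MASK_W 0, 0, 0, 0, 0x80, 0x80, 0x80, 0x80
; builds a mask that broadcasts word 0 and clears the upper four words.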

%macro PMOVSXWD 2; dst, src
%if cpuflag(sse4)
    pmovsxwd  %1, %2
%else
%ifnidn %1, %2
    mova      %1, %2
%endif
    punpcklwd %1, %1
    psrad     %1, 16
%endif
%endmacro

; Wrapper for non-FMA version of fmaddps
%macro FMULADD_PS 5
    %if cpuflag(fma3) || cpuflag(fma4)
        fmaddps %1, %2, %3, %4
    %elifidn %1, %4
        mulps   %5, %2, %3
        addps   %1, %4, %5
    %else
        mulps   %1, %2, %3
        addps   %1, %4
    %endif
%endmacro
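
; E.g. FMULADD_PS m0, m1, m2, m0, m3 computes m0 = m1*m2 + m0; the scratch
; register %5 is needed only on the non-FMA path where the destination
; aliases the addend (relying on x86inc's 3-operand emulation on plain SSE).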