/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
   VMX/VSX ISA is a good match for vector float SIMD operations.
   However, scalar float operations in vector (XMM) registers require
   the POWER8 VSX ISA (2.07) level. There are differences in the data
   format and placement of float scalars in the vector register, which
   require extra steps to match SSE scalar float semantics on POWER.

   Note that X86_64's MXCSR and PowerISA's FPSCR/VSCR registers differ
   substantially. It is recommended to use the portable <fenv.h>
   interface instead of accessing the MXCSR directly.

   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
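
/* Illustrative example (not part of the original header): code written
   against the x86 SSE intrinsics can typically be rebuilt for powerpc64le
   by defining NO_WARN_X86_INTRINSICS (e.g. clang -mcpu=power8
   -DNO_WARN_X86_INTRINSICS file.c) and by replacing direct MXCSR accesses
   with the portable <fenv.h> interface. A minimal sketch: */
#if 0
#include <xmmintrin.h>

static float sum4(const float *__p) {
  /* 16-byte aligned load; element access uses the GCC/clang vector
     extension that is available for the PowerPC __m128 typedef. */
  __m128 __v = _mm_load_ps(__p);
  return __v[0] + __v[1] + __v[2] + __v[3];
}
#endif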

#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_

#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* Define four value permute mask */
#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
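
/* Illustrative note (not part of the original header): _MM_SHUFFLE packs
   four 2-bit element selectors into one byte, most significant selector
   first; for example _MM_SHUFFLE(3, 2, 1, 0) == 0xE4, the identity selector
   for _mm_shuffle_ps and _mm_shuffle_pi16. */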

#include <altivec.h>

/* Avoid collisions between altivec.h and strict adherence to C++ and
   C11 standards. This should eventually be done inside altivec.h itself,
   but only after testing a full distro build. */
#if defined(__STRICT_ANSI__) && \
    (defined(__cplusplus) || \
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
#undef vector
#undef pixel
#undef bool
#endif

/* We need type definitions from the MMX header file. */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free (). */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef vector float __m128 __attribute__((__may_alias__));

/* Unaligned version of the same type. */
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));

/* Internal data types for implementing the intrinsics. */
typedef vector float __v4sf;

/* Create an undefined vector. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_undefined_ps(void) {
  __m128 __Y = __Y;
  return __Y;
}

/* Create a vector of zeros. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_ps(void) {
  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
}

/* Load four SPFP values from P. The address must be 16-byte aligned. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ps(float const *__P) {
  return ((__m128)vec_ld(0, (__v4sf *)__P));
}

/* Load four SPFP values from P. The address need not be 16-byte aligned. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadu_ps(float const *__P) {
  return (vec_vsx_ld(0, __P));
}

/* Load four SPFP values in reverse order. The address must be aligned. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadr_ps(float const *__P) {
  __v4sf __tmp;
  __m128 __result;
  static const __vector unsigned char __permute_vector = {
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};

  __tmp = vec_ld(0, (__v4sf *)__P);
  __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
  return __result;
}

/* Create a vector with all four elements equal to F. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_ps(float __F) {
  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ps1(float __F) {
  return _mm_set1_ps(__F);
}

/* Create the vector [Z Y X W]. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
                                      __artificial__))
_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
}

/* Create the vector [W X Y Z]. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
}

/* Store four SPFP values. The address must be 16-byte aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ps(float *__P, __m128 __A) {
  vec_st((__v4sf)__A, 0, (__v4sf *)__P);
}

/* Store four SPFP values. The address need not be 16-byte aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeu_ps(float *__P, __m128 __A) {
  *(__m128_u *)__P = __A;
}

/* Store four SPFP values in reverse order. The address must be aligned. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storer_ps(float *__P, __m128 __A) {
  __v4sf __tmp;
  static const __vector unsigned char __permute_vector = {
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};

  __tmp = (__m128)vec_perm(__A, __A, __permute_vector);

  _mm_store_ps(__P, __tmp);
}

/* Store the lower SPFP value across four words. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store1_ps(float *__P, __m128 __A) {
  __v4sf __va = vec_splat((__v4sf)__A, 0);
  _mm_store_ps(__P, __va);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ps1(float *__P, __m128 __A) {
  _mm_store1_ps(__P, __A);
}

/* Create a vector with element 0 as F and the rest zero. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_ss(float __F) {
  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
}

/* Sets the low SPFP value of A from the low value of B. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_move_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};

  return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
}

/* Create a vector with element 0 as *P and the rest zero. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_load_ss(float const *__P) {
  return _mm_set_ss(*__P);
}

/* Stores the lower SPFP value. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_store_ss(float *__P, __m128 __A) {
  *__P = ((__v4sf)__A)[0];
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A. */

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results. So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  __a = vec_splat(__A, 0);
  __b = vec_splat(__B, 0);
  __c = __a + __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel(__A, __c, __mask));
#else
  __A[0] = __A[0] + __B[0];
  return (__A);
#endif
}
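
/* Illustrative example (not part of the original header): the _ss forms
   operate on element 0 only and pass elements 1-3 through from the first
   operand. A minimal sketch: */
#if 0
static void add_ss_example(void) {
  __m128 __a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);     /* elements {1,2,3,4} */
  __m128 __b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); /* elements {10,20,30,40} */
  __m128 __c = _mm_add_ss(__a, __b);                   /* elements {11,2,3,4} */
  (void)__c;
}
#endif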

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results. So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  __a = vec_splat(__A, 0);
  __b = vec_splat(__B, 0);
  __c = __a - __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel(__A, __c, __mask));
#else
  __A[0] = __A[0] - __B[0];
  return (__A);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results. So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  __a = vec_splat(__A, 0);
  __b = vec_splat(__B, 0);
  __c = __a * __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel(__A, __c, __mask));
#else
  __A[0] = __A[0] * __B[0];
  return (__A);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_ss(__m128 __A, __m128 __B) {
#ifdef _ARCH_PWR7
  __m128 __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
     results. So to ensure we don't generate spurious exceptions
     (from the upper float values) we splat the lower float
     before we do the operation. */
  __a = vec_splat(__A, 0);
  __b = vec_splat(__B, 0);
  __c = __a / __b;
  /* Then we merge the lower float result with the original upper
     float elements from __A. */
  return (vec_sel(__A, __c, __mask));
#else
  __A[0] = __A[0] / __B[0];
  return (__A);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_ss(__m128 __A) {
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  __a = vec_splat(__A, 0);
  __c = vec_sqrt(__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel(__A, __c, __mask));
}

/* Perform the respective operation on the four SPFP values in A and B. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_div_ps(__m128 __A, __m128 __B) {
  return (__m128)((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sqrt_ps(__m128 __A) {
  return (vec_sqrt((__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rcp_ps(__m128 __A) {
  return (vec_re((__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rsqrt_ps(__m128 __A) {
  return (vec_rsqrte(__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rcp_ss(__m128 __A) {
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  __a = vec_splat(__A, 0);
  __c = _mm_rcp_ps(__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel(__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_rsqrt_ss(__m128 __A) {
  __m128 __a, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  __a = vec_splat(__A, 0);
  __c = vec_rsqrte(__a);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel(__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_ss(__m128 __A, __m128 __B) {
  __v4sf __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = vec_min(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_ss(__m128 __A, __m128 __B) {
  __v4sf __a, __b, __c;
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  /* PowerISA VSX does not allow partial (for just the lower float)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper float values) we splat the lower float
   * before we do the operation. */
  __a = vec_splat(__A, 0);
  __b = vec_splat(__B, 0);
  __c = vec_max(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return (vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_ps(__m128 __A, __m128 __B) {
  __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
  return vec_sel(__B, __A, __m);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_ps(__m128 __A, __m128 __B) {
  __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
  return vec_sel(__B, __A, __m);
}

/* Perform logical bit-wise operations on 128-bit values. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
  // return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
}

/* Perform a comparison on the four SPFP values of A and B. For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_ps(__m128 __A, __m128 __B) {
  __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
  return ((__m128)vec_nor(__temp, __temp));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_ps(__m128 __A, __m128 __B) {
  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_ps(__m128 __A, __m128 __B) {
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};

  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
  return ((__m128)vec_and(__c, __d));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_ps(__m128 __A, __m128 __B) {
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};

  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
  return ((__m128)vec_or(__c, __d));
}
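
/* Illustrative note (not part of the original header): _mm_cmpunord_ps
   reports an element as unordered when either input's absolute value, viewed
   as an unsigned integer, exceeds 0x7f800000 (the Inf bit pattern), which is
   exactly the set of NaN encodings. For example,
   _mm_cmpunord_ps({NAN, 1, 2, 3}, {0, 0, 0, NAN}) yields the element mask
   {~0u, 0, 0, ~0u}. */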

/* Perform a comparison on the lower SPFP values of A and B. If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros. The upper three SPFP values are passed through from A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpeq(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmplt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmple_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmple(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpgt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpge_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpge(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpneq_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpeq(__a, __b);
  __c = vec_nor(__c, __c);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpge(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnle_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmpgt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpngt_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmple(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpnge_ss(__m128 __A, __m128 __B) {
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
  __v4sf __a, __b, __c;
  /* PowerISA VMX does not allow partial (for just element 0)
   * results. So to ensure we don't generate spurious exceptions
   * (from the upper elements) we splat the lower float
   * before we do the operation. */
  __a = vec_splat((__v4sf)__A, 0);
  __b = vec_splat((__v4sf)__B, 0);
  __c = (__v4sf)vec_cmplt(__a, __b);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpord_ss(__m128 __A, __m128 __B) {
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};

  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
  __c = vec_and(__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpunord_ss(__m128 __A, __m128 __B) {
  __vector unsigned int __a, __b;
  __vector unsigned int __c, __d;
  static const __vector unsigned int __float_exp_mask = {
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};

  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
  __c = vec_or(__c, __d);
  /* Then we merge the lower float result with the original upper
   * float elements from __A. */
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comieq_ss(__m128 __A, __m128 __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comilt_ss(__m128 __A, __m128 __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comile_ss(__m128 __A, __m128 __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comigt_ss(__m128 __A, __m128 __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comige_ss(__m128 __A, __m128 __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_comineq_ss(__m128 __A, __m128 __B) {
  return (__A[0] != __B[0]);
}

/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_sd et al. should be OK, as is.
 */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomieq_ss(__m128 __A, __m128 __B) {
  return (__A[0] == __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomilt_ss(__m128 __A, __m128 __B) {
  return (__A[0] < __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomile_ss(__m128 __A, __m128 __B) {
  return (__A[0] <= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomigt_ss(__m128 __A, __m128 __B) {
  return (__A[0] > __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomige_ss(__m128 __A, __m128 __B) {
  return (__A[0] >= __B[0]);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_ucomineq_ss(__m128 __A, __m128 __B) {
  return (__A[0] != __B[0]);
}

extern __inline float
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_f32(__m128 __A) {
  return ((__v4sf)__A)[0];
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si32(__m128 __A) {
  int __res;
#ifdef _ARCH_PWR8
  double __dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctiw %2,%2;\n"
      "mfvsrd %1,%x2;\n"
      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
      :);
#else
  __res = __builtin_rint(__A[0]);
#endif
  return __res;
}
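
/* Illustrative example (not part of the original header): the conversion
   honours the current rounding mode, which can be set portably through
   <fenv.h>. A minimal sketch: */
#if 0
#include <fenv.h>

static void cvt_rounding_example(void) {
  fesetround(FE_TONEAREST);
  int __a = _mm_cvtss_si32(_mm_set_ss(1.5f)); /* 2: round-to-nearest-even */
  fesetround(FE_TOWARDZERO);
  int __b = _mm_cvtss_si32(_mm_set_ss(1.5f)); /* 1: truncation, matching _mm_cvttss_si32 */
  (void)__a;
  (void)__b;
}
#endif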

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_ss2si(__m128 __A) {
  return _mm_cvtss_si32(__A);
}

/* Convert the lower SPFP value to a 64-bit integer according to the
   current rounding mode. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64(__m128 __A) {
  long long __res;
#if defined(_ARCH_PWR8) && defined(__powerpc64__)
  double __dtmp;
  __asm__(
#ifdef __LITTLE_ENDIAN__
      "xxsldwi %x0,%x0,%x0,3;\n"
#endif
      "xscvspdp %x2,%x0;\n"
      "fctid %2,%2;\n"
      "mfvsrd %1,%x2;\n"
      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
      :);
#else
  __res = __builtin_llrint(__A[0]);
#endif
  return __res;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtss_si64x(__m128 __A) {
  return _mm_cvtss_si64((__v4sf)__A);
}

/* Constants for use with _mm_prefetch. */
enum _mm_hint {
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor. The selector I specifies the type of prefetch operation. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_prefetch(const void *__P, enum _mm_hint __I) {
  /* Current PowerPC ignores the hint parameter. */
  __builtin_prefetch(__P);
}

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode. Return the integers in packed form. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi32(__m128 __A) {
  __v4sf __temp, __rounded;
  __vector unsigned long long __result;

  /* Splat two lower SPFP values to both halves. */
  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
  __rounded = vec_rint(__temp);
  __result = (__vector unsigned long long)vec_cts(__rounded, 0);

  return (__m64)((__vector long long)__result)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_ps2pi(__m128 __A) {
  return _mm_cvtps_pi32(__A);
}

/* Truncate the lower SPFP value to a 32-bit integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si32(__m128 __A) {
  /* Extract the lower float element. */
  float __temp = __A[0];
  /* Truncate to a 32-bit integer and return. */
  return __temp;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtt_ss2si(__m128 __A) {
  return _mm_cvttss_si32(__A);
}

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64(__m128 __A) {
  /* Extract the lower float element. */
  float __temp = __A[0];
  /* Truncate to a 64-bit integer and return. */
  return __temp;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttss_si64x(__m128 __A) {
  /* Extract the lower float element. */
  float __temp = __A[0];
  /* Truncate to a 64-bit integer and return. */
  return __temp;
}

/* Truncate the two lower SPFP values to 32-bit integers. Return the
   integers in packed form. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvttps_pi32(__m128 __A) {
  __v4sf __temp;
  __vector unsigned long long __result;

  /* Splat two lower SPFP values to both halves. */
  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
  __result = (__vector unsigned long long)vec_cts(__temp, 0);

  return (__m64)((__vector long long)__result)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtt_ps2pi(__m128 __A) {
  return _mm_cvttps_pi32(__A);
}

/* Convert B to a SPFP value and insert it as element zero in A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_ss(__m128 __A, int __B) {
  float __temp = __B;
  __A[0] = __temp;

  return __A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_si2ss(__m128 __A, int __B) {
  return _mm_cvtsi32_ss(__A, __B);
}

/* Convert B to a SPFP value and insert it as element zero in A. */
/* Intel intrinsic. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_ss(__m128 __A, long long __B) {
  float __temp = __B;
  __A[0] = __temp;

  return __A;
}

/* Microsoft intrinsic. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_ss(__m128 __A, long long __B) {
  return _mm_cvtsi64_ss(__A, __B);
}

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
  __vector signed int __vm1;
  __vector float __vf1;

  __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
  __vf1 = (__vector float)vec_ctf(__vm1, 0);

  return ((__m128)(__vector unsigned long long){
      ((__vector unsigned long long)__vf1)[0],
      ((__vector unsigned long long)__A)[1]});
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
  return _mm_cvtpi32_ps(__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi16_ps(__m64 __A) {
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;

  __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
  __vi4 = vec_vupklsh(__vs8);
  __vf1 = (__vector float)vec_ctf(__vi4, 0);

  return (__m128)__vf1;
}

/* Convert the four unsigned 16-bit values in A to SPFP form. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpu16_ps(__m64 __A) {
  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;

  __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
  __vi4 = (__vector unsigned int)vec_mergel
#ifdef __LITTLE_ENDIAN__
      (__vs8, __zero);
#else
      (__zero, __vs8);
#endif
  __vf1 = (__vector float)vec_ctf(__vi4, 0);

  return (__m128)__vf1;
}

/* Convert the low four signed 8-bit values in A to SPFP form. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi8_ps(__m64 __A) {
  __vector signed char __vc16;
  __vector signed short __vs8;
  __vector signed int __vi4;
  __vector float __vf1;

  __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
  __vs8 = vec_vupkhsb(__vc16);
  __vi4 = vec_vupkhsh(__vs8);
  __vf1 = (__vector float)vec_ctf(__vi4, 0);

  return (__m128)__vf1;
}

/* Convert the low four unsigned 8-bit values in A to SPFP form. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpu8_ps(__m64 __A) {
  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
  __vector unsigned char __vc16;
  __vector unsigned short __vs8;
  __vector unsigned int __vi4;
  __vector float __vf1;

  __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
#ifdef __LITTLE_ENDIAN__
  __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
  __vi4 =
      (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
#else
  __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
  __vi4 =
      (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
#endif
  __vf1 = (__vector float)vec_ctf(__vi4, 0);

  return (__m128)__vf1;
}

/* Convert the four signed 32-bit values in A and B to SPFP form. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
  __vector signed int __vi4;
  __vector float __vf4;

  __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
  __vf4 = (__vector float)vec_ctf(__vi4, 0);
  return (__m128)__vf4;
}

/* Convert the four SPFP values in A to four signed 16-bit integers. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi16(__m128 __A) {
  __v4sf __rounded;
  __vector signed int __temp;
  __vector unsigned long long __result;

  __rounded = vec_rint(__A);
  __temp = vec_cts(__rounded, 0);
  __result = (__vector unsigned long long)vec_pack(__temp, __temp);

  return (__m64)((__vector long long)__result)[0];
}

/* Convert the four SPFP values in A to four signed 8-bit integers. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtps_pi8(__m128 __A) {
  __v4sf __rounded;
  __vector signed int __tmp_i;
  static const __vector signed int __zero = {0, 0, 0, 0};
  __vector signed short __tmp_s;
  __vector signed char __res_v;

  __rounded = vec_rint(__A);
  __tmp_i = vec_cts(__rounded, 0);
  __tmp_s = vec_pack(__tmp_i, __zero);
  __res_v = vec_pack(__tmp_s, __tmp_s);
  return (__m64)((__vector long long)__res_v)[0];
}

/* Selects four specific SPFP values from A and B based on MASK. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __vector unsigned int __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
}
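
/* Illustrative example (not part of the original header): the two low result
   elements come from A and the two high elements from B; e.g. with
   a = {1,2,3,4} and b = {5,6,7,8},
   _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 3, 0, 1)) yields {2, 1, 8, 7}. */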

/* Selects and interleaves the upper two SPFP values from A and B. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_ps(__m128 __A, __m128 __B) {
  return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_ps(__m128 __A, __m128 __B) {
  return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadh_pi(__m128 __A, __m64 const *__P) {
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a[1] = __p[1];

  return (__m128)__a;
}

/* Stores the upper two SPFP values of A into P. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storeh_pi(__m64 *__P, __m128 __A) {
  __vector unsigned long long __a = (__vector unsigned long long)__A;

  *__P = __a[1];
}

/* Moves the upper two values of B into the lower two values of A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movehl_ps(__m128 __A, __m128 __B) {
  return (__m128)vec_mergel((__vector unsigned long long)__B,
                            (__vector unsigned long long)__A);
}

/* Moves the lower two values of B into the upper two values of A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movelh_ps(__m128 __A, __m128 __B) {
  return (__m128)vec_mergeh((__vector unsigned long long)__A,
                            (__vector unsigned long long)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A. */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loadl_pi(__m128 __A, __m64 const *__P) {
  __vector unsigned long long __a = (__vector unsigned long long)__A;
  __vector unsigned long long __p = vec_splats(*__P);
  __a[0] = __p[0];

  return (__m128)__a;
}

/* Stores the lower two SPFP values of A into P. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_storel_pi(__m64 *__P, __m128 __A) {
  __vector unsigned long long __a = (__vector unsigned long long)__A;

  *__P = __a[0];
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */

/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_ps(__m128 __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__vector unsigned int)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned int __perm_mask = {
#ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
  };

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
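
/* Illustrative example (not part of the original header): the mask gathers
   the four sign bits with element 0 in bit 0; e.g.
   _mm_movemask_ps({-1.0f, 2.0f, -3.0f, 4.0f}) == 0x5 (binary 0101). */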
|
|
|
|
/* Create a vector with all four elements equal to *P. */
|
|
extern __inline __m128
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_load1_ps(float const *__P) {
|
|
return _mm_set1_ps(*__P);
|
|
}
|
|
|
|
extern __inline __m128
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_load_ps1(float const *__P) {
|
|
return _mm_load1_ps(__P);
|
|
}
|
|
|
|
/* Extracts one of the four words of A. The selector N must be immediate. */
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_extract_pi16(__m64 const __A, int const __N) {
|
|
unsigned int __shiftr = __N & 3;
|
|
#ifdef __BIG_ENDIAN__
|
|
__shiftr = 3 - __shiftr;
|
|
#endif
|
|
|
|
return ((__A >> (__shiftr * 16)) & 0xffff);
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pextrw(__m64 const __A, int const __N) {
|
|
return _mm_extract_pi16(__A, __N);
|
|
}
|
|
|
|
/* Inserts word D into one of four words of A. The selector N must be
|
|
immediate. */
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
|
|
const int __shiftl = (__N & 3) * 16;
|
|
const __m64 __shiftD = (const __m64)__D << __shiftl;
|
|
const __m64 __mask = 0xffffUL << __shiftl;
|
|
__m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
|
|
|
|
return __result;
|
|
}
|
|
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pinsrw(__m64 const __A, int const __D, int const __N) {
|
|
return _mm_insert_pi16(__A, __D, __N);
|
|
}
|
|
|
|
/* Compute the element-wise maximum of signed 16-bit values. */
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
|
|
_mm_max_pi16(__m64 __A, __m64 __B) {
|
|
#if _ARCH_PWR8
|
|
__vector signed short __a, __b, __r;
|
|
__vector __bool short __c;
|
|
|
|
__a = (__vector signed short)vec_splats(__A);
|
|
__b = (__vector signed short)vec_splats(__B);
|
|
__c = (__vector __bool short)vec_cmpgt(__a, __b);
|
|
__r = vec_sel(__b, __a, __c);
|
|
return (__m64)((__vector long long)__r)[0];
|
|
#else
|
|
__m64_union __m1, __m2, __res;
|
|
|
|
__m1.as_m64 = __A;
|
|
__m2.as_m64 = __B;
|
|
|
|
__res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
|
|
: __m2.as_short[0];
|
|
__res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
|
|
: __m2.as_short[1];
|
|
__res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
|
|
: __m2.as_short[2];
|
|
__res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
|
|
: __m2.as_short[3];
|
|
|
|
return (__m64)__res.as_m64;
|
|
#endif
|
|
}
|
|
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pmaxsw(__m64 __A, __m64 __B) {
|
|
return _mm_max_pi16(__A, __B);
|
|
}
|
|
|
|
/* Compute the element-wise maximum of unsigned 8-bit values. */
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_max_pu8(__m64 __A, __m64 __B) {
|
|
#if _ARCH_PWR8
|
|
__vector unsigned char __a, __b, __r;
|
|
__vector __bool char __c;
|
|
|
|
__a = (__vector unsigned char)vec_splats(__A);
|
|
__b = (__vector unsigned char)vec_splats(__B);
|
|
__c = (__vector __bool char)vec_cmpgt(__a, __b);
|
|
__r = vec_sel(__b, __a, __c);
|
|
return (__m64)((__vector long long)__r)[0];
|
|
#else
|
|
__m64_union __m1, __m2, __res;
|
|
long __i;
|
|
|
|
__m1.as_m64 = __A;
|
|
__m2.as_m64 = __B;
|
|
|
|
for (__i = 0; __i < 8; __i++)
|
|
__res.as_char[__i] =
|
|
((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
|
|
? __m1.as_char[__i]
|
|
: __m2.as_char[__i];
|
|
|
|
return (__m64)__res.as_m64;
|
|
#endif
|
|
}
|
|
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pmaxub(__m64 __A, __m64 __B) {
|
|
return _mm_max_pu8(__A, __B);
|
|
}
|
|
|
|
/* Compute the element-wise minimum of signed 16-bit values. */
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_min_pi16(__m64 __A, __m64 __B) {
|
|
#if _ARCH_PWR8
|
|
__vector signed short __a, __b, __r;
|
|
__vector __bool short __c;
|
|
|
|
__a = (__vector signed short)vec_splats(__A);
|
|
__b = (__vector signed short)vec_splats(__B);
|
|
__c = (__vector __bool short)vec_cmplt(__a, __b);
|
|
__r = vec_sel(__b, __a, __c);
|
|
return (__m64)((__vector long long)__r)[0];
|
|
#else
|
|
__m64_union __m1, __m2, __res;
|
|
|
|
__m1.as_m64 = __A;
|
|
__m2.as_m64 = __B;
|
|
|
|
__res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
|
|
: __m2.as_short[0];
|
|
__res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
|
|
: __m2.as_short[1];
|
|
__res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
|
|
: __m2.as_short[2];
|
|
__res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
|
|
: __m2.as_short[3];
|
|
|
|
return (__m64)__res.as_m64;
|
|
#endif
|
|
}
|
|
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pminsw(__m64 __A, __m64 __B) {
|
|
return _mm_min_pi16(__A, __B);
|
|
}
|
|
|
|
/* Compute the element-wise minimum of unsigned 8-bit values. */
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_min_pu8(__m64 __A, __m64 __B) {
|
|
#if _ARCH_PWR8
|
|
__vector unsigned char __a, __b, __r;
|
|
__vector __bool char __c;
|
|
|
|
__a = (__vector unsigned char)vec_splats(__A);
|
|
__b = (__vector unsigned char)vec_splats(__B);
|
|
__c = (__vector __bool char)vec_cmplt(__a, __b);
|
|
__r = vec_sel(__b, __a, __c);
|
|
return (__m64)((__vector long long)__r)[0];
|
|
#else
|
|
__m64_union __m1, __m2, __res;
|
|
long __i;
|
|
|
|
__m1.as_m64 = __A;
|
|
__m2.as_m64 = __B;
|
|
|
|
for (__i = 0; __i < 8; __i++)
|
|
__res.as_char[__i] =
|
|
((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
|
|
? __m1.as_char[__i]
|
|
: __m2.as_char[__i];
|
|
|
|
return (__m64)__res.as_m64;
|
|
#endif
|
|
}
|
|
|
|
extern __inline __m64
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pminub(__m64 __A, __m64 __B) {
|
|
return _mm_min_pu8(__A, __B);
|
|
}
|
|
|
|
/* Create an 8-bit mask of the signs of 8-bit values. */
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_mm_movemask_pi8(__m64 __A) {
|
|
#ifdef __powerpc64__
|
|
unsigned long long __p =
|
|
#ifdef __LITTLE_ENDIAN__
|
|
0x0008101820283038UL; // permute control for sign bits
|
|
#else
|
|
0x3830282018100800UL; // permute control for sign bits
|
|
#endif
|
|
return __builtin_bpermd(__p, __A);
|
|
#else
|
|
#ifdef __LITTLE_ENDIAN__
|
|
unsigned int __mask = 0x20283038UL;
|
|
unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
|
|
unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
|
|
#else
|
|
unsigned int __mask = 0x38302820UL;
|
|
unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
|
|
unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
|
|
#endif
|
|
return (__r2 << 4) | __r1;
|
|
#endif
|
|
}
|
|
|
|
extern __inline int
|
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
_m_pmovmskb(__m64 __A) {
|
|
return _mm_movemask_pi8(__A);
|
|
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16(__m64 __A, __m64 __B) {
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector unsigned short)vec_splats(__A);
  __b = (__vector unsigned short)vec_splats(__B);

  __w0 = vec_vmuleuh(__a, __b);
  __w1 = vec_vmulouh(__a, __b);
  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw(__m64 __A, __m64 __B) {
  return _mm_mulhi_pu16(__A, __B);
}
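
/* High-half multiply sketch with arbitrary constants: each 16-bit product is
   formed in 32 bits and only its upper halfword is returned.

     __m64 __x = _mm_set_pi16((short)0x8000, 2, 1000, (short)0xFFFF);
     __m64 __y = _mm_set_pi16(2, 3, 70, (short)0xFFFF);
     __m64 __h = _mm_mulhi_pu16(__x, __y);
     // 0x8000*2 = 0x00010000 -> 0x0001, 2*3 = 6 -> 0x0000,
     // 1000*70 = 0x00011170 -> 0x0001, 0xFFFF*0xFFFF = 0xFFFE0001 -> 0xFFFE.
*/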

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16(__m64 __A, int const __N) {
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
  };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;

#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  __p = vec_splats(__t.as_m64);
  __a = vec_splats(__A);
  __r = vec_perm(__a, __a, (__vector unsigned char)__p);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw(__m64 __A, int const __N) {
  return _mm_shuffle_pi16(__A, __N);
}
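
/* Selector sketch: _MM_SHUFFLE (defined near the top of this header) packs
   four 2-bit source indices, highest destination element first.  With
   arbitrary constants:

     __m64 __x = _mm_set_pi16(33, 22, 11, 0);
     __m64 __r = _mm_shuffle_pi16(__x, _MM_SHUFFLE(0, 1, 2, 3));
     // __r holds the elements of __x in reverse order,
     // i.e. _mm_set_pi16(0, 11, 22, 33).
*/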

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored. */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64 *)__P;

  __tmp = *__p;
  __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
  *__p = __tmp;
}

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq(__m64 __A, __m64 __N, char *__P) {
  _mm_maskmove_si64(__A, __N, __P);
}
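
/* Masked-store sketch with arbitrary constants: only the bytes of A whose
   selector byte has its high bit (0x80) set replace the corresponding bytes
   already in memory; the rest are left untouched.

     __m64 __dst = 0; // current contents at the destination
     __m64 __data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __sel = _mm_set_pi8((char)0x80, 0, 0, 0, 0, 0, 0, (char)0x80);
     _mm_maskmove_si64(__data, __sel, (char *)&__dst);
     // __dst now equals _mm_set_pi8(8, 0, 0, 0, 0, 0, 0, 1).
*/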

/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8(__m64 __A, __m64 __B) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__A);
  __b = (__vector unsigned char)vec_splats(__B);
  __c = vec_avg(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb(__m64 __A, __m64 __B) {
  return _mm_avg_pu8(__A, __B);
}
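
/* Rounding sketch: vec_avg computes (a + b + 1) >> 1 per element without
   losing the intermediate carry, which matches the SSE PAVGB rounding.

     __m64 __r = _mm_avg_pu8(_mm_set1_pi8((char)250), _mm_set1_pi8((char)253));
     // Every byte of __r is 252 == (250 + 253 + 1) >> 1; a naive 8-bit
     // (a + b) / 2 would have overflowed.
*/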

/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16(__m64 __A, __m64 __B) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__A);
  __b = (__vector unsigned short)vec_splats(__B);
  __c = vec_avg(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw(__m64 __A, __m64 __B) {
  return _mm_avg_pu16(__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared. */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8(__m64 __A, __m64 __B) {
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero = {0, 0, 0, 0};
  __m64_union __result = {0};

  __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
  __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
  __vmin = vec_min(__a, __b);
  __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
  /* Sum four groups of bytes into integers. */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
  /* Sum across four integers with integer result. */
  __vsum = vec_sums(__vsum, (__vector signed int)__zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits. */
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw(__m64 __A, __m64 __B) {
  return _mm_sad_pu8(__A, __B);
}
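
/* SAD sketch with arbitrary constants: the eight absolute byte differences
   are summed into the low 16-bit element; the other elements are zero.

     __m64 __x = _mm_set_pi8(9, 8, 7, 6, 5, 4, 3, 2);
     __m64 __y = _mm_set_pi8(2, 3, 4, 5, 6, 7, 8, 9);
     __m64 __s = _mm_sad_pu8(__x, __y);
     // 7+5+3+1+1+3+5+7 = 32, so __s == _mm_set_pi16(0, 0, 0, 32).
*/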

/* Stores the data in A to the address P without polluting the caches. */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi(__m64 *__P, __m64 __A) {
  /* Use the data cache block touch for store transient. */
  __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned. */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps(float *__P, __m128 __A) {
  /* Use the data cache block touch for store transient. */
  __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory");
  _mm_store_ps(__P, __A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store. */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence(void) {
  /* Generate a light weight sync. */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
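
/* Ordering sketch: the release fence (lwsync on POWER) provides the
   store-store ordering _mm_sfence promises.  A hypothetical producer
   (__buffer, __payload and __ready are illustrative names only):

     __buffer[0] = __payload; // data store
     _mm_sfence();            // make prior stores visible first
     __ready = 1;             // publication store
*/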

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it. */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause(void) {
  /* There is no exact match with this construct, but the following is
     close to the desired effect. */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't
     know what PRI this thread is running at we: 1) save the current
     PRI from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, and 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution. */
  unsigned long __PPR;

  __asm__ volatile(" mfppr %0;"
                   " or 31,31,31;"
                   " isync;"
                   " lwsync;"
                   " isync;"
                   " mtppr %0;"
                   : "=r"(__PPR)
                   :
                   : "memory");
#else
  /* For older processors, where we may not even have Program Priority
     controls, we can only depend on Heavy Weight Sync. */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}
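
/* A typical spin-wait sketch in the spirit of the x86 usage this emulates;
   __flag is a hypothetical shared variable, not part of this header:

     extern volatile int __flag;
     while (!__flag)
       _mm_pause(); // back off politely while waiting
*/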

/* Transpose the 4x4 matrix composed of row[0-3]. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
  do {                                                                         \
    __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);         \
    __v4sf __t0 = vec_vmrghw(__r0, __r1);                                      \
    __v4sf __t1 = vec_vmrghw(__r2, __r3);                                      \
    __v4sf __t2 = vec_vmrglw(__r0, __r1);                                      \
    __v4sf __t3 = vec_vmrglw(__r2, __r3);                                      \
    (row0) = (__v4sf)vec_mergeh((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row1) = (__v4sf)vec_mergel((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row2) = (__v4sf)vec_mergeh((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
    (row3) = (__v4sf)vec_mergel((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
  } while (0)
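
/* Transpose usage sketch with arbitrary row values:

     __m128 __row0 = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
     __m128 __row1 = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
     __m128 __row2 = _mm_set_ps(11.0f, 10.0f, 9.0f, 8.0f);
     __m128 __row3 = _mm_set_ps(15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS(__row0, __row1, __row2, __row3);
     // __row0 now holds elements (0, 4, 8, 12), __row1 (1, 5, 9, 13), and so
     // on: the columns of the original 4x4 matrix become the rows.
*/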

/* For backward source compatibility. */
//# include <emmintrin.h>

#else
#include_next <xmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* XMMINTRIN_H_ */