135 lines
4.6 KiB
C
135 lines
4.6 KiB
C
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
|
|
*
|
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
* See https://llvm.org/LICENSE.txt for license information.
|
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
*
|
|
*===-----------------------------------------------------------------------===
|
|
*/
|
|
|
|
#if !defined X86GPRINTRIN_H_
|
|
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
|
|
#endif
|
|
|
|
#ifndef BMI2INTRIN_H_
|
|
#define BMI2INTRIN_H_
|
|
|
|
/* Zero the high bits of __X starting at the bit index given by __Y.

   Only the low 8 bits of __Y are used as the index, following the x86 BZHI
   definition.  An index of 0 yields 0 and an index of 32 or more returns __X
   unchanged.  A mask is used instead of a shift-left/shift-right pair because
   shifting a 32-bit value by 32 or more is undefined behaviour in C.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  const unsigned int __n = __Y & 0xFFU;

  if (__n >= 32) {
    return __X;
  }
  /* __n is in [0, 31] here, so the shift is always well defined.  */
  return __X & ((1U << __n) - 1U);
}
|
|
|
|
/* Unsigned multiply of __X by __Y carried out in unsigned long long.
   Bits 63:32 of the product are stored through __P and the low bits are
   returned (converted to unsigned int).  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  const unsigned long long __wide = (unsigned long long)__X * __Y;
  const unsigned int __lo = (unsigned int)__wide;

  *__P = (unsigned int)(__wide >> 32);
  return __lo;
}
|
|
|
|
#ifdef __PPC64__
|
|
/* Zero the high bits of __X starting at the bit index given by __Y.

   Only the low 8 bits of __Y are used as the index, following the x86 BZHI
   definition.  An index of 0 yields 0 and an index of 64 or more returns __X
   unchanged.  A mask is used instead of a shift-left/shift-right pair because
   shifting a 64-bit value by 64 or more is undefined behaviour in C.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  const unsigned long long __n = __Y & 0xFFULL;

  if (__n >= 64) {
    return __X;
  }
  /* __n is in [0, 63] here, so the shift is always well defined.  */
  return __X & ((1ULL << __n) - 1ULL);
}
|
|
|
|
/* __int128 requires base 64-bit. */
|
|
/* Unsigned multiply of __X by __Y carried out in unsigned __int128.
   Bits 127:64 of the product are stored through __P and the low bits are
   returned (converted to unsigned long long).  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  const unsigned __int128 __wide = (unsigned __int128)__X * __Y;
  const unsigned long long __lo = (unsigned long long)__wide;

  *__P = (unsigned long long)(__wide >> 64);
  return __lo;
}
|
|
|
|
#ifdef _ARCH_PWR7
|
|
/* popcount and bpermd require power7 minimum. */
|
|
/* Deposit the low-order bits of __X into the bit positions selected by the
   set bits of __M; every other result bit is zero.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __top_bit = 0x8000000000000000UL;
  unsigned long __deposited = 0x0UL;
  unsigned long __remaining = __M;
  unsigned long __shift;

  /* 64 minus the number of mask bits is the base left shift that lines the
     highest source bit to be consumed up with the most significant end of
     the word.  */
  __shift = 64 - __builtin_popcountl(__M);

  /* One iteration per '1' bit of the mask, walking from the most significant
     set bit down and clearing each mask bit once it has been handled.  */
  while (__remaining != 0) {
    const unsigned long __lead = __builtin_clzl(__remaining);
    const unsigned long __select = __top_bit >> __lead;
    const unsigned long __aligned = __X << (__shift - __lead);

    __remaining ^= __select;
    __deposited |= (__aligned & __select);
    ++__shift;
  }
  return __deposited;
}
|
|
|
|
/* Gather the bits of __X selected by the set bits of __M and pack them
   contiguously into the low-order bits of the result.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __top_bit = 0x8000000000000000UL;
  unsigned long __remaining = __M;
  unsigned long __lead;
  unsigned long __extracted;

  /* When the mask is a compile-time constant selecting at most 8 bits, the
     bit permute builtin (bpermd) can do the whole extraction.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* Initial bit permute control.
       NOTE(review): 0x40 in every byte is presumably an out-of-range index
       that selects a zero bit -- confirm against the bpermd definition.  */
    unsigned long __control = 0x4040404040404040UL;
    long __n;

    /* With a constant mask the popcount is constant too, so this loop can
       be evaluated at compile time, leaving a constant permute vector.  */
    for (__n = 0; __n < __builtin_popcountl(__M); __n++) {
      __lead = __builtin_clzl(__remaining);
      __control = (__control << 8) | __lead;
      __remaining ^= (__top_bit >> __lead);
    }
    __extracted = __builtin_bpermd(__control, __X);
  } else {
    unsigned long __shift = 64 - __builtin_popcountl(__M);

    __extracted = 0;
    /* A for loop would also work here, but combined with -funroll-loops it
       can expand to a lot of code.  The while loop avoids unrolling, and
       the compiler commons the xor that clears the mask bit with the
       (remaining != 0) test, giving a more compact loop setup and body.  */
    while (__remaining != 0) {
      unsigned long __picked;

      __lead = __builtin_clzl(__remaining);
      __picked = __X & (__top_bit >> __lead);
      __remaining ^= (__top_bit >> __lead);
      __extracted |= (__picked >> (__shift - __lead));
      ++__shift;
    }
  }
  return __extracted;
}
|
|
|
|
/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7. */
|
|
/* 32-bit bit deposit: forwards both operands to _pdep_u64 and converts the
   64-bit result to unsigned int.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pdep_u64(__X, __Y);

  return (unsigned int)__wide;
}
|
|
|
|
/* 32-bit bit extract: forwards both operands to _pext_u64 and converts the
   64-bit result to unsigned int.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pext_u64(__X, __Y);

  return (unsigned int)__wide;
}
|
|
#endif /* _ARCH_PWR7 */
|
|
#endif /* __PPC64__ */
|
|
|
|
#endif /* BMI2INTRIN_H_ */
|