Merge pull request #156 from birth-software/delete-libc-files

Delete unneeded libc files
Commit 9dae856734, authored by David on 2024-04-20 08:44:07 -06:00 and committed via GitHub.
7903 changed files with 1 addition and 2737025 deletions


@@ -653,7 +653,7 @@ pub fn compileCSourceFile(context: *const Context, arguments: []const []const u8
const libc_include_dirs: []const []const u8 = switch (@import("builtin").os.tag) {
    .macos => &.{
        "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/c++/v1",
        try context.pathFromCompiler("lib/include"),
        "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/15.0.0/include",
        "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include",
    },
    .linux => switch (@import("builtin").abi) {


@@ -1,121 +0,0 @@
/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CUDA_BUILTIN_VARS_H
#define __CUDA_BUILTIN_VARS_H
// Forward declares from vector_types.h.
struct uint3;
struct dim3;
// The file implements built-in CUDA variables using __declspec(property).
// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
// All read accesses of built-in variable fields get converted into calls to a
// getter function which in turn calls the appropriate builtin to fetch the
// value.
//
// Example:
// int x = threadIdx.x;
// IR output:
// %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
// PTX output:
// mov.u32 %r2, %tid.x;
#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC) \
__declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD; \
static inline __attribute__((always_inline)) \
__attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) { \
return INTRINSIC; \
}
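To make the mechanism concrete, here is roughly what one invocation of the macro above expands to for threadIdx's x field (an editor's sketch of the expansion, not text from the deleted file):
// Approximate expansion of __CUDA_DEVICE_BUILTIN(x, __nvvm_read_ptx_sreg_tid_x()):
__declspec(property(get = __fetch_builtin_x)) unsigned int x;
static inline __attribute__((always_inline))
__attribute__((device)) unsigned int __fetch_builtin_x(void) {
  return __nvvm_read_ptx_sreg_tid_x();
}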
#if __cplusplus >= 201103L
#define __DELETE =delete
#else
#define __DELETE
#endif
// Make sure nobody can create instances of the special variable types. nvcc
// also disallows taking the address of special variables, so we disable the address-of
// operator as well.
#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \
__attribute__((device)) TypeName() __DELETE; \
__attribute__((device)) TypeName(const TypeName &) __DELETE; \
__attribute__((device)) void operator=(const TypeName &) const __DELETE; \
__attribute__((device)) TypeName *operator&() const __DELETE
struct __cuda_builtin_threadIdx_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
// threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
};
struct __cuda_builtin_blockIdx_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
// blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
};
struct __cuda_builtin_blockDim_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
// blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
};
struct __cuda_builtin_gridDim_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
// gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). This function is defined after we pull in vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
};
#define __CUDA_BUILTIN_VAR \
extern const __attribute__((device)) __attribute__((weak))
__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
// warpSize should translate to read of %WARP_SZ but there's currently no
// builtin to do so. According to PTX v4.2 docs 'to date, all target
// architectures have a WARP_SZ value of 32'.
__attribute__((device)) const int warpSize = 32;
#undef __CUDA_DEVICE_BUILTIN
#undef __CUDA_BUILTIN_VAR
#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
#undef __DELETE
#endif /* __CUDA_BUILTIN_VARS_H */
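For context, here is a minimal kernel exercising the variables this file declares (an illustrative sketch; write_global_ids is a hypothetical name):
// Each read of threadIdx.x etc. below goes through the __fetch_builtin_*
// getters defined above and compiles down to a PTX special-register read.
__global__ void write_global_ids(unsigned int *out) {
  unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
  out[gid] = gid;
}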


@@ -1,512 +0,0 @@
/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_CMATH_H__
#define __CLANG_CUDA_CMATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
#ifndef __OPENMP_NVPTX__
#include <limits>
#endif
// CUDA lets us use various std math functions on the device side. This file
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
//
// Specifically, the forward-declares header declares __device__ overloads for
// these functions in the global namespace, then pulls them into namespace std
// with 'using' statements. Then this file implements those functions, after
// their implementations have been pulled in.
//
// It's important that we declare the functions in the global namespace and pull
// them into namespace std with using statements, as opposed to simply declaring
// these functions in namespace std, because our device functions need to
// overload the standard library functions, which may be declared in the global
// namespace or in std, depending on the degree of conformance of the stdlib
// implementation. Declaring in the global namespace and pulling into namespace
// std covers all of the known knowns.
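Schematically, with sin as the example, the two headers cooperate like this (a simplified sketch of the pattern just described, not code from either file):
// Step 1 (forward-declares header): declare in the global namespace ...
__device__ float sin(float __x);
// ... then pull the name into std.
namespace std { using ::sin; }
// Step 2 (this header): supply the definition for the global-namespace name.
// __DEVICE__ float sin(float __x) { return ::sinf(__x); }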
#ifdef __OPENMP_NVPTX__
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
#endif
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
__DEVICE__ float exp(float __x) { return ::expf(__x); }
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
// For inscrutable reasons, the CUDA headers define these functions for us on
// Windows.
#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#if defined(__OPENMP_NVPTX__)
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
#pragma omp end declare variant
#endif
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
// For inscrutable reasons, __finite(), the double-precision version of
// __finitef, does not exist when compiling for MacOS. __isfinited is available
// everywhere and is just as good.
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
#if defined(__OPENMP_NVPTX__)
#pragma omp end declare variant
#endif
#endif
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ float ldexp(float __arg, int __exp) {
return ::ldexpf(__arg, __exp);
}
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
__DEVICE__ float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
__DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
// There was a redefinition error for this overload in CUDA mode.
// We restrict it to OpenMP mode for now, that is where it is actually needed
// anyway.
#ifdef __OPENMP_NVPTX__
__DEVICE__ float remquo(float __n, float __d, int *__q) {
return ::remquof(__n, __d, __q);
}
#endif
// Notably missing above is nexttoward. We omit it because
// libdevice doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
#ifndef __OPENMP_NVPTX__
// Now we've defined everything we promised we'd define in
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
// fix up our math functions.
//
// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
// only sin(float) and sin(double), which means that e.g. sin(0) is
// ambiguous.
//
// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
// std. These are defined in the CUDA headers in the global namespace,
// independent of everything else we've done here.
// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
// we go ahead and unconditionally define functions that are only available when
// compiling for C++11 to match the behavior of the CUDA headers.
template<bool __B, class __T = void>
struct __clang_cuda_enable_if {};
template <class __T> struct __clang_cuda_enable_if<true, __T> {
typedef __T type;
};
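The helper above is a pre-C++11 stand-in for std::enable_if. It is what lets the integral overloads below participate only for integer types, so a call such as sin(0), otherwise ambiguous between sin(float) and sin(double), resolves cleanly (illustrative sketch; demo is a hypothetical name):
// sin(0): int converts equally well to float and double, so only the
// enable_if-gated sin(int) -> double overload makes the call unambiguous.
__global__ void demo(double *out) { *out = sin(0); }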
// Defines an overload of __fn that accepts one integral argument, calls
// __fn((double)x), and returns __retty.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
template <typename __T> \
__DEVICE__ \
typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
__retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
// Defines an overload of __fn that accepts two arithmetic arguments, calls
// __fn((double)x, (double)y), and returns a double.
//
// Note this is different from OVERLOAD_1, which generates an overload that
// accepts only *integral* arguments.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ typename __clang_cuda_enable_if< \
std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
// Overloads for functions that don't match the patterns expected by
// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return std::fma((double)__x, (double)__y, (double)__z);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
frexp(__T __x, int *__exp) {
return std::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
ldexp(__T __x, int __exp) {
return std::ldexp((double)__x, __exp);
}
template <typename __T1, typename __T2>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return std::remquo((double)__x, (double)__y, __quo);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbln(__T __x, long __exp) {
return std::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbn(__T __x, int __exp) {
return std::scalbn((double)__x, __exp);
}
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
// Pull the new overloads we defined above into namespace std.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
// Well this is fun: We need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are different
// than its std::isinf and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif
// Finally, pull the "foobarf" functions that CUDA defines in its headers into
// namespace std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#endif // __OPENMP_NVPTX__
#undef __DEVICE__
#endif
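Putting this header's pieces together, device code can use the std:: spellings uniformly across float, double, and integral arguments; a small hedged example (cmath_demo is a hypothetical name):
__global__ void cmath_demo(double *out) {
  out[0] = std::sqrt(2.0f); // float overload, forwards to ::sqrtf
  out[1] = std::sqrt(2.0);  // double overload
  out[2] = std::sqrt(4);    // integral overload from this header, returns double
}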


@@ -1,285 +0,0 @@
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
#define __CLANG_CUDA_COMPLEX_BUILTINS
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
// libgcc functions that clang assumes are available when compiling c99 complex
// operations. (These implementations come from libc++, and have been modified
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
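For context, these are the runtime functions clang expects when lowering C99 _Complex arithmetic; a hedged sketch of device code that ends up in them (mul_div is a hypothetical name, and the exact lowering depends on optimization and fast-math-style flags):
__device__ float _Complex mul_div(float _Complex a, float _Complex b) {
  float _Complex p = a * b; // the NaN slow path may call __mulsc3
  return p / b;             // division is lowered to __divsc3
}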
#pragma push_macro("__DEVICE__")
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp declare target
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
#else
#define __DEVICE__ __device__ inline
#endif
// To make the algorithms available for C and C++ in CUDA and OpenMP we select
// different but equivalent function versions. TODO: For OpenMP we currently
// select the native builtins as the overload support for templates is lacking.
#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
#define _ISNANd std::isnan
#define _ISNANf std::isnan
#define _ISINFd std::isinf
#define _ISINFf std::isinf
#define _ISFINITEd std::isfinite
#define _ISFINITEf std::isfinite
#define _COPYSIGNd std::copysign
#define _COPYSIGNf std::copysign
#define _SCALBNd std::scalbn
#define _SCALBNf std::scalbn
#define _ABSd std::abs
#define _ABSf std::abs
#define _LOGBd std::logb
#define _LOGBf std::logb
// Rather than pulling in std::max from <algorithm> every time, use the available ::max.
#define _fmaxd max
#define _fmaxf max
#else
#ifdef __AMDGCN__
#define _ISNANd __ocml_isnan_f64
#define _ISNANf __ocml_isnan_f32
#define _ISINFd __ocml_isinf_f64
#define _ISINFf __ocml_isinf_f32
#define _ISFINITEd __ocml_isfinite_f64
#define _ISFINITEf __ocml_isfinite_f32
#define _COPYSIGNd __ocml_copysign_f64
#define _COPYSIGNf __ocml_copysign_f32
#define _SCALBNd __ocml_scalbn_f64
#define _SCALBNf __ocml_scalbn_f32
#define _ABSd __ocml_fabs_f64
#define _ABSf __ocml_fabs_f32
#define _LOGBd __ocml_logb_f64
#define _LOGBf __ocml_logb_f32
#define _fmaxd __ocml_fmax_f64
#define _fmaxf __ocml_fmax_f32
#else
#define _ISNANd __nv_isnand
#define _ISNANf __nv_isnanf
#define _ISINFd __nv_isinfd
#define _ISINFf __nv_isinff
#define _ISFINITEd __nv_isfinited
#define _ISFINITEf __nv_finitef
#define _COPYSIGNd __nv_copysign
#define _COPYSIGNf __nv_copysignf
#define _SCALBNd __nv_scalbn
#define _SCALBNf __nv_scalbnf
#define _ABSd __nv_fabs
#define _ABSf __nv_fabsf
#define _LOGBd __nv_logb
#define _LOGBf __nv_logbf
#define _fmaxd __nv_fmax
#define _fmaxf __nv_fmaxf
#endif
#endif
#if defined(__cplusplus)
extern "C" {
#endif
__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
double __d) {
double __ac = __a * __c;
double __bd = __b * __d;
double __ad = __a * __d;
double __bc = __b * __c;
double _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
int __recalc = 0;
if (_ISINFd(__a) || _ISINFd(__b)) {
__a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a);
__b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b);
if (_ISNANd(__c))
__c = _COPYSIGNd(0, __c);
if (_ISNANd(__d))
__d = _COPYSIGNd(0, __d);
__recalc = 1;
}
if (_ISINFd(__c) || _ISINFd(__d)) {
__c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c);
__d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d);
if (_ISNANd(__a))
__a = _COPYSIGNd(0, __a);
if (_ISNANd(__b))
__b = _COPYSIGNd(0, __b);
__recalc = 1;
}
if (!__recalc &&
(_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) {
if (_ISNANd(__a))
__a = _COPYSIGNd(0, __a);
if (_ISNANd(__b))
__b = _COPYSIGNd(0, __b);
if (_ISNANd(__c))
__c = _COPYSIGNd(0, __c);
if (_ISNANd(__d))
__d = _COPYSIGNd(0, __d);
__recalc = 1;
}
if (__recalc) {
// Can't use std::numeric_limits<double>::infinity() -- that doesn't have
// a device overload (and isn't constexpr before C++11, naturally).
__real__(z) = __builtin_huge_val() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_val() * (__a * __d + __b * __c);
}
}
return z;
}
__DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) {
float __ac = __a * __c;
float __bd = __b * __d;
float __ad = __a * __d;
float __bc = __b * __c;
float _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
int __recalc = 0;
if (_ISINFf(__a) || _ISINFf(__b)) {
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
if (_ISNANf(__c))
__c = _COPYSIGNf(0, __c);
if (_ISNANf(__d))
__d = _COPYSIGNf(0, __d);
__recalc = 1;
}
if (_ISINFf(__c) || _ISINFf(__d)) {
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
if (_ISNANf(__a))
__a = _COPYSIGNf(0, __a);
if (_ISNANf(__b))
__b = _COPYSIGNf(0, __b);
__recalc = 1;
}
if (!__recalc &&
(_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) {
if (_ISNANf(__a))
__a = _COPYSIGNf(0, __a);
if (_ISNANf(__b))
__b = _COPYSIGNf(0, __b);
if (_ISNANf(__c))
__c = _COPYSIGNf(0, __c);
if (_ISNANf(__d))
__d = _COPYSIGNf(0, __d);
__recalc = 1;
}
if (__recalc) {
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
__DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
double __d) {
int __ilogbw = 0;
// Can't use std::max, because that's defined in <algorithm>, and we don't
// want to pull that in for every compile. The CUDA headers define
// ::max(float, float) and ::max(double, double), which is sufficient for us.
double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
if (_ISFINITEd(__logbw)) {
__ilogbw = (int)__logbw;
__c = _SCALBNd(__c, -__ilogbw);
__d = _SCALBNd(__d, -__ilogbw);
}
double __denom = __c * __c + __d * __d;
double _Complex z;
__real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw);
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) {
__real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a;
__imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b;
} else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) &&
_ISFINITEd(__d)) {
__a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a);
__b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b);
__real__(z) = __builtin_huge_val() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d);
} else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) &&
_ISFINITEd(__b)) {
__c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c);
__d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d);
__real__(z) = 0.0 * (__a * __c + __b * __d);
__imag__(z) = 0.0 * (__b * __c - __a * __d);
}
}
return z;
}
__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
int __ilogbw = 0;
float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
if (_ISFINITEf(__logbw)) {
__ilogbw = (int)__logbw;
__c = _SCALBNf(__c, -__ilogbw);
__d = _SCALBNf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float _Complex z;
__real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) {
__real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a;
__imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b;
} else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) &&
_ISFINITEf(__d)) {
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) &&
_ISFINITEf(__b)) {
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
__real__(z) = 0 * (__a * __c + __b * __d);
__imag__(z) = 0 * (__b * __c - __a * __d);
}
}
return z;
}
#if defined(__cplusplus)
} // extern "C"
#endif
#undef _ISNANd
#undef _ISNANf
#undef _ISINFd
#undef _ISINFf
#undef _COPYSIGNd
#undef _COPYSIGNf
#undef _ISFINITEd
#undef _ISFINITEf
#undef _SCALBNd
#undef _SCALBNf
#undef _ABSd
#undef _ABSf
#undef _LOGBd
#undef _LOGBf
#undef _fmaxd
#undef _fmaxf
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp end declare target
#endif
#pragma pop_macro("__DEVICE__")
#endif // __CLANG_CUDA_COMPLEX_BUILTINS

File diff suppressed because it is too large.


@@ -1,709 +0,0 @@
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// sm_30 intrinsics: __shfl_{up,down,xor}.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
__Type) \
inline __device__ int __FnName(int __val, __Type __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __val, __Type __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __val, __Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ long __FnName(long __val, __Type __offset, \
int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
int);
#pragma pop_macro("__MAKE_SHUFFLES")
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask, __Type) \
inline __device__ int __FnName(unsigned int __mask, int __val, \
__Type __offset, int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(unsigned int __mask, float __val, \
__Type __offset, int __width = warpSize) { \
return __FloatIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __mask, \
unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
__Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>( \
::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned int __mask, unsigned long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(unsigned int __mask, double __val, \
__Type __offset, int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__mask, __tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
__nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
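A classic use of the sync shuffles generated above is a warp-wide reduction; a minimal sketch (warp_sum is a hypothetical name, not part of the deleted header):
// Each iteration adds in the value held by the lane `offset` positions down;
// after log2(warpSize) steps, lane 0 holds the sum of all 32 lanes.
__device__ int warp_sum(int val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}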
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
return __nvvm_bar_warp_sync(mask);
}
inline __device__ void __barrier_sync(unsigned int id) {
__nvvm_barrier_sync(id);
}
inline __device__ void __barrier_sync_count(unsigned int id,
unsigned int count) {
__nvvm_barrier_sync_cnt(id, count);
}
inline __device__ int __all_sync(unsigned int mask, int pred) {
return __nvvm_vote_all_sync(mask, pred);
}
inline __device__ int __any_sync(unsigned int mask, int pred) {
return __nvvm_vote_any_sync(mask, pred);
}
inline __device__ int __uni_sync(unsigned int mask, int pred) {
return __nvvm_vote_uni_sync(mask, pred);
}
inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
return __nvvm_vote_ballot_sync(mask, pred);
}
inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
return __nvvm_vote_ballot(1);
#else
unsigned int mask;
asm volatile("activemask.b32 %0;" : "=r"(mask));
return mask;
#endif
}
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
return __nvvm_fns(mask, base, offset);
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
unsigned int value) {
return __nvvm_match_any_sync_i32(mask, value);
}
inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
return __nvvm_match_any_sync_i64(mask, value);
}
inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
return __nvvm_match_all_sync_i32p(mask, value, pred);
}
inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
inline __device__ char2 __ldg(const char2 *ptr) {
typedef char c2 __attribute__((ext_vector_type(2)));
// We can assume that ptr is aligned at least to char2's alignment, but the
// load will assume that ptr is aligned to c2's alignment. This is only
// safe if alignof(c2) <= alignof(char2).
c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
char2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
typedef char c4 __attribute__((ext_vector_type(4)));
c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
char4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
typedef short s2 __attribute__((ext_vector_type(2)));
s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
short2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
typedef short s4 __attribute__((ext_vector_type(4)));
s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
short4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
typedef int i2 __attribute__((ext_vector_type(2)));
i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
int2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
typedef int i4 __attribute__((ext_vector_type(4)));
i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
int4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
typedef long long ll2 __attribute__((ext_vector_type(2)));
ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
longlong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
uchar2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
uchar4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
typedef unsigned short us2 __attribute__((ext_vector_type(2)));
us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
ushort2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
typedef unsigned short us4 __attribute__((ext_vector_type(4)));
us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
ushort4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
uint2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
uint4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
ulonglong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float2 __ldg(const float2 *ptr) {
typedef float f2 __attribute__((ext_vector_type(2)));
f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
float2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
typedef float f4 __attribute__((ext_vector_type(4)));
f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
float4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
typedef double d2 __attribute__((ext_vector_type(2)));
d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
double2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.clamp.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned ret;
asm("shf.r.clamp.b32 %0, %1, %2, %3;"
: "=r"(ret)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return ret;
}
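In the spirit of the TODO above, a plain-C reference for the wrap variant (an unofficial sketch; funnelshift_l_ref is a hypothetical name, and the clamp variants would clamp the shift to 32 instead of masking it):
// shf.l.wrap.b32 semantics: left-shift the 64-bit value high32:low32 by
// (shiftWidth & 31) and return the upper 32 bits.
__device__ unsigned funnelshift_l_ref(unsigned low32, unsigned high32,
                                      unsigned shiftWidth) {
  unsigned long long wide = ((unsigned long long)high32 << 32) | low32;
  return (unsigned)((wide << (shiftWidth & 31)) >> 32);
}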
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_add(__mask, __value);
}
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_umin(__mask, __value);
}
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_umax(__mask, __value);
}
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
return __nvvm_redux_sync_min(__mask, __value);
}
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
return __nvvm_redux_sync_max(__mask, __value);
}
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
return __nvvm_redux_sync_or(__mask, __value);
}
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_and(__mask, __value);
}
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_xor(__mask, __value);
}
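The sm_80 redux intrinsics above collapse a warp-wide reduction into a single instruction; for contrast with the shuffle loop sketched earlier (warp_sum_sm80 is a hypothetical name):
// Every lane named in the mask receives the same reduced result.
__device__ unsigned warp_sum_sm80(unsigned v) {
  return __reduce_add_sync(0xffffffffu, v);
}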
__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
const void *__src,
unsigned __src_size) {
__nvvm_cp_async_ca_shared_global_4(
(void __attribute__((address_space(3))) *)__dst,
(const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
const void *__src,
unsigned __src_size) {
__nvvm_cp_async_ca_shared_global_8(
(void __attribute__((address_space(3))) *)__dst,
(const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
const void *__src,
unsigned __src_size) {
__nvvm_cp_async_ca_shared_global_16(
(void __attribute__((address_space(3))) *)__dst,
(const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void *
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
// TODO: it appears to provide the compiler with some sort of hint. We do not
// know what exactly it is supposed to do. However, CUDA headers suggest that
// just passing through __ptr should not affect correctness. They do so on
// pre-sm80 GPUs where this builtin is not available.
return (void*)__ptr;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
__device__ inline unsigned __isCtaShared(const void *ptr) {
return __isShared(ptr);
}
__device__ inline unsigned __isClusterShared(const void *__ptr) {
return __nvvm_isspacep_shared_cluster(__ptr);
}
__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
unsigned __rank) {
return __nvvm_mapa((void *)__ptr, __rank);
}
__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
return __nvvm_getctarank((void *)__ptr);
}
__device__ inline uint2
__cluster_map_shared_multicast(const void *__ptr,
unsigned int __cluster_cta_mask) {
return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
__cluster_cta_mask);
}
__device__ inline unsigned __clusterDimIsSpecified() {
return __nvvm_is_explicit_cluster();
}
__device__ inline dim3 __clusterDim() {
return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
__nvvm_read_ptx_sreg_cluster_nctaid_y(),
__nvvm_read_ptx_sreg_cluster_nctaid_z());
}
__device__ inline dim3 __clusterRelativeBlockIdx() {
return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
__nvvm_read_ptx_sreg_cluster_ctaid_y(),
__nvvm_read_ptx_sreg_cluster_ctaid_z());
}
__device__ inline dim3 __clusterGridDimInClusters() {
return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
__nvvm_read_ptx_sreg_nclusterid_y(),
__nvvm_read_ptx_sreg_nclusterid_z());
}
__device__ inline dim3 __clusterIdx() {
return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
__nvvm_read_ptx_sreg_clusterid_y(),
__nvvm_read_ptx_sreg_clusterid_z());
}
__device__ inline unsigned __clusterRelativeBlockRank() {
return __nvvm_read_ptx_sreg_cluster_ctarank();
}
__device__ inline unsigned __clusterSizeInBlocks() {
return __nvvm_read_ptx_sreg_cluster_nctarank();
}
__device__ inline void __cluster_barrier_arrive() {
__nvvm_barrier_cluster_arrive();
}
__device__ inline void __cluster_barrier_arrive_relaxed() {
__nvvm_barrier_cluster_arrive_relaxed();
}
__device__ inline void __cluster_barrier_wait() {
__nvvm_barrier_cluster_wait();
}
__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }
__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
float2 __ret;
__asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
: "=f"(__ret.x), "=f"(__ret.y)
: "l"(__ptr), "f"(__val.x), "f"(__val.y));
return __ret;
}
__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
float2 __ret;
__asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
: "=f"(__ret.x), "=f"(__ret.y)
: "l"(__ptr), "f"(__val.x), "f"(__val.y));
return __ret;
}
__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
float2 __ret;
__asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
: "=f"(__ret.x), "=f"(__ret.y)
: "l"(__ptr), "f"(__val.x), "f"(__val.y));
return __ret;
}
__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
float4 __ret;
__asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
: "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
: "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
return __ret;
}
__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
float4 __ret;
__asm__(
"atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
: "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
: "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
return __ret;
}
__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
float4 __ret;
__asm__(
"atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
: "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
: "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
:);
return __ret;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
#endif // CUDA_VERSION >= 11000
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)


@@ -1,468 +0,0 @@
/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#if defined(__cplusplus)
extern "C" {
#endif
#if defined(__OPENMP_NVPTX__)
#define __DEVICE__
#pragma omp begin assumes ext_spmd_amenable no_openmp
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
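The declarations below correspond one-to-one to functions in NVIDIA's libdevice bitcode; wrapper headers elsewhere in clang forward standard math names to them, roughly like this hedged sketch (cos_wrapper is a hypothetical name):
// Forward a standard-looking name to the libdevice entry point declared below.
__DEVICE__ double cos_wrapper(double __a) { return __nv_cos(__a); }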
__DEVICE__ int __nv_abs(int __a);
__DEVICE__ double __nv_acos(double __a);
__DEVICE__ float __nv_acosf(float __a);
__DEVICE__ double __nv_acosh(double __a);
__DEVICE__ float __nv_acoshf(float __a);
__DEVICE__ double __nv_asin(double __a);
__DEVICE__ float __nv_asinf(float __a);
__DEVICE__ double __nv_asinh(double __a);
__DEVICE__ float __nv_asinhf(float __a);
__DEVICE__ double __nv_atan2(double __a, double __b);
__DEVICE__ float __nv_atan2f(float __a, float __b);
__DEVICE__ double __nv_atan(double __a);
__DEVICE__ float __nv_atanf(float __a);
__DEVICE__ double __nv_atanh(double __a);
__DEVICE__ float __nv_atanhf(float __a);
__DEVICE__ int __nv_brev(int __a);
__DEVICE__ long long __nv_brevll(long long __a);
__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c);
__DEVICE__ double __nv_cbrt(double __a);
__DEVICE__ float __nv_cbrtf(float __a);
__DEVICE__ double __nv_ceil(double __a);
__DEVICE__ float __nv_ceilf(float __a);
__DEVICE__ int __nv_clz(int __a);
__DEVICE__ int __nv_clzll(long long __a);
__DEVICE__ double __nv_copysign(double __a, double __b);
__DEVICE__ float __nv_copysignf(float __a, float __b);
__DEVICE__ double __nv_cos(double __a);
__DEVICE__ float __nv_cosf(float __a);
__DEVICE__ double __nv_cosh(double __a);
__DEVICE__ float __nv_coshf(float __a);
__DEVICE__ double __nv_cospi(double __a);
__DEVICE__ float __nv_cospif(float __a);
__DEVICE__ double __nv_cyl_bessel_i0(double __a);
__DEVICE__ float __nv_cyl_bessel_i0f(float __a);
__DEVICE__ double __nv_cyl_bessel_i1(double __a);
__DEVICE__ float __nv_cyl_bessel_i1f(float __a);
__DEVICE__ double __nv_dadd_rd(double __a, double __b);
__DEVICE__ double __nv_dadd_rn(double __a, double __b);
__DEVICE__ double __nv_dadd_ru(double __a, double __b);
__DEVICE__ double __nv_dadd_rz(double __a, double __b);
__DEVICE__ double __nv_ddiv_rd(double __a, double __b);
__DEVICE__ double __nv_ddiv_rn(double __a, double __b);
__DEVICE__ double __nv_ddiv_ru(double __a, double __b);
__DEVICE__ double __nv_ddiv_rz(double __a, double __b);
__DEVICE__ double __nv_dmul_rd(double __a, double __b);
__DEVICE__ double __nv_dmul_rn(double __a, double __b);
__DEVICE__ double __nv_dmul_ru(double __a, double __b);
__DEVICE__ double __nv_dmul_rz(double __a, double __b);
__DEVICE__ float __nv_double2float_rd(double __a);
__DEVICE__ float __nv_double2float_rn(double __a);
__DEVICE__ float __nv_double2float_ru(double __a);
__DEVICE__ float __nv_double2float_rz(double __a);
__DEVICE__ int __nv_double2hiint(double __a);
__DEVICE__ int __nv_double2int_rd(double __a);
__DEVICE__ int __nv_double2int_rn(double __a);
__DEVICE__ int __nv_double2int_ru(double __a);
__DEVICE__ int __nv_double2int_rz(double __a);
__DEVICE__ long long __nv_double2ll_rd(double __a);
__DEVICE__ long long __nv_double2ll_rn(double __a);
__DEVICE__ long long __nv_double2ll_ru(double __a);
__DEVICE__ long long __nv_double2ll_rz(double __a);
__DEVICE__ int __nv_double2loint(double __a);
__DEVICE__ unsigned int __nv_double2uint_rd(double __a);
__DEVICE__ unsigned int __nv_double2uint_rn(double __a);
__DEVICE__ unsigned int __nv_double2uint_ru(double __a);
__DEVICE__ unsigned int __nv_double2uint_rz(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rd(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rn(double __a);
__DEVICE__ unsigned long long __nv_double2ull_ru(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rz(double __a);
__DEVICE__ unsigned long long __nv_double_as_longlong(double __a);
__DEVICE__ double __nv_drcp_rd(double __a);
__DEVICE__ double __nv_drcp_rn(double __a);
__DEVICE__ double __nv_drcp_ru(double __a);
__DEVICE__ double __nv_drcp_rz(double __a);
__DEVICE__ double __nv_dsqrt_rd(double __a);
__DEVICE__ double __nv_dsqrt_rn(double __a);
__DEVICE__ double __nv_dsqrt_ru(double __a);
__DEVICE__ double __nv_dsqrt_rz(double __a);
__DEVICE__ double __nv_dsub_rd(double __a, double __b);
__DEVICE__ double __nv_dsub_rn(double __a, double __b);
__DEVICE__ double __nv_dsub_ru(double __a, double __b);
__DEVICE__ double __nv_dsub_rz(double __a, double __b);
__DEVICE__ double __nv_erfc(double __a);
__DEVICE__ float __nv_erfcf(float __a);
__DEVICE__ double __nv_erfcinv(double __a);
__DEVICE__ float __nv_erfcinvf(float __a);
__DEVICE__ double __nv_erfcx(double __a);
__DEVICE__ float __nv_erfcxf(float __a);
__DEVICE__ double __nv_erf(double __a);
__DEVICE__ float __nv_erff(float __a);
__DEVICE__ double __nv_erfinv(double __a);
__DEVICE__ float __nv_erfinvf(float __a);
__DEVICE__ double __nv_exp10(double __a);
__DEVICE__ float __nv_exp10f(float __a);
__DEVICE__ double __nv_exp2(double __a);
__DEVICE__ float __nv_exp2f(float __a);
__DEVICE__ double __nv_exp(double __a);
__DEVICE__ float __nv_expf(float __a);
__DEVICE__ double __nv_expm1(double __a);
__DEVICE__ float __nv_expm1f(float __a);
__DEVICE__ double __nv_fabs(double __a);
__DEVICE__ float __nv_fabsf(float __a);
__DEVICE__ float __nv_fadd_rd(float __a, float __b);
__DEVICE__ float __nv_fadd_rn(float __a, float __b);
__DEVICE__ float __nv_fadd_ru(float __a, float __b);
__DEVICE__ float __nv_fadd_rz(float __a, float __b);
__DEVICE__ float __nv_fast_cosf(float __a);
__DEVICE__ float __nv_fast_exp10f(float __a);
__DEVICE__ float __nv_fast_expf(float __a);
__DEVICE__ float __nv_fast_fdividef(float __a, float __b);
__DEVICE__ float __nv_fast_log10f(float __a);
__DEVICE__ float __nv_fast_log2f(float __a);
__DEVICE__ float __nv_fast_logf(float __a);
__DEVICE__ float __nv_fast_powf(float __a, float __b);
__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c);
__DEVICE__ float __nv_fast_sinf(float __a);
__DEVICE__ float __nv_fast_tanf(float __a);
__DEVICE__ double __nv_fdim(double __a, double __b);
__DEVICE__ float __nv_fdimf(float __a, float __b);
__DEVICE__ float __nv_fdiv_rd(float __a, float __b);
__DEVICE__ float __nv_fdiv_rn(float __a, float __b);
__DEVICE__ float __nv_fdiv_ru(float __a, float __b);
__DEVICE__ float __nv_fdiv_rz(float __a, float __b);
__DEVICE__ int __nv_ffs(int __a);
__DEVICE__ int __nv_ffsll(long long __a);
__DEVICE__ int __nv_finitef(float __a);
__DEVICE__ unsigned short __nv_float2half_rn(float __a);
__DEVICE__ int __nv_float2int_rd(float __a);
__DEVICE__ int __nv_float2int_rn(float __a);
__DEVICE__ int __nv_float2int_ru(float __a);
__DEVICE__ int __nv_float2int_rz(float __a);
__DEVICE__ long long __nv_float2ll_rd(float __a);
__DEVICE__ long long __nv_float2ll_rn(float __a);
__DEVICE__ long long __nv_float2ll_ru(float __a);
__DEVICE__ long long __nv_float2ll_rz(float __a);
__DEVICE__ unsigned int __nv_float2uint_rd(float __a);
__DEVICE__ unsigned int __nv_float2uint_rn(float __a);
__DEVICE__ unsigned int __nv_float2uint_ru(float __a);
__DEVICE__ unsigned int __nv_float2uint_rz(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rd(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rn(float __a);
__DEVICE__ unsigned long long __nv_float2ull_ru(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rz(float __a);
__DEVICE__ int __nv_float_as_int(float __a);
__DEVICE__ unsigned int __nv_float_as_uint(float __a);
__DEVICE__ double __nv_floor(double __a);
__DEVICE__ float __nv_floorf(float __a);
__DEVICE__ double __nv_fma(double __a, double __b, double __c);
__DEVICE__ float __nv_fmaf(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c);
__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c);
__DEVICE__ double __nv_fmax(double __a, double __b);
__DEVICE__ float __nv_fmaxf(float __a, float __b);
__DEVICE__ double __nv_fmin(double __a, double __b);
__DEVICE__ float __nv_fminf(float __a, float __b);
__DEVICE__ double __nv_fmod(double __a, double __b);
__DEVICE__ float __nv_fmodf(float __a, float __b);
__DEVICE__ float __nv_fmul_rd(float __a, float __b);
__DEVICE__ float __nv_fmul_rn(float __a, float __b);
__DEVICE__ float __nv_fmul_ru(float __a, float __b);
__DEVICE__ float __nv_fmul_rz(float __a, float __b);
__DEVICE__ float __nv_frcp_rd(float __a);
__DEVICE__ float __nv_frcp_rn(float __a);
__DEVICE__ float __nv_frcp_ru(float __a);
__DEVICE__ float __nv_frcp_rz(float __a);
__DEVICE__ double __nv_frexp(double __a, int *__b);
__DEVICE__ float __nv_frexpf(float __a, int *__b);
__DEVICE__ float __nv_frsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_rd(float __a);
__DEVICE__ float __nv_fsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_ru(float __a);
__DEVICE__ float __nv_fsqrt_rz(float __a);
__DEVICE__ float __nv_fsub_rd(float __a, float __b);
__DEVICE__ float __nv_fsub_rn(float __a, float __b);
__DEVICE__ float __nv_fsub_ru(float __a, float __b);
__DEVICE__ float __nv_fsub_rz(float __a, float __b);
__DEVICE__ int __nv_hadd(int __a, int __b);
__DEVICE__ float __nv_half2float(unsigned short __h);
__DEVICE__ double __nv_hiloint2double(int __a, int __b);
__DEVICE__ double __nv_hypot(double __a, double __b);
__DEVICE__ float __nv_hypotf(float __a, float __b);
__DEVICE__ int __nv_ilogb(double __a);
__DEVICE__ int __nv_ilogbf(float __a);
__DEVICE__ double __nv_int2double_rn(int __a);
__DEVICE__ float __nv_int2float_rd(int __a);
__DEVICE__ float __nv_int2float_rn(int __a);
__DEVICE__ float __nv_int2float_ru(int __a);
__DEVICE__ float __nv_int2float_rz(int __a);
__DEVICE__ float __nv_int_as_float(int __a);
__DEVICE__ int __nv_isfinited(double __a);
__DEVICE__ int __nv_isinfd(double __a);
__DEVICE__ int __nv_isinff(float __a);
__DEVICE__ int __nv_isnand(double __a);
__DEVICE__ int __nv_isnanf(float __a);
__DEVICE__ double __nv_j0(double __a);
__DEVICE__ float __nv_j0f(float __a);
__DEVICE__ double __nv_j1(double __a);
__DEVICE__ float __nv_j1f(float __a);
__DEVICE__ float __nv_jnf(int __a, float __b);
__DEVICE__ double __nv_jn(int __a, double __b);
__DEVICE__ double __nv_ldexp(double __a, int __b);
__DEVICE__ float __nv_ldexpf(float __a, int __b);
__DEVICE__ double __nv_lgamma(double __a);
__DEVICE__ float __nv_lgammaf(float __a);
__DEVICE__ double __nv_ll2double_rd(long long __a);
__DEVICE__ double __nv_ll2double_rn(long long __a);
__DEVICE__ double __nv_ll2double_ru(long long __a);
__DEVICE__ double __nv_ll2double_rz(long long __a);
__DEVICE__ float __nv_ll2float_rd(long long __a);
__DEVICE__ float __nv_ll2float_rn(long long __a);
__DEVICE__ float __nv_ll2float_ru(long long __a);
__DEVICE__ float __nv_ll2float_rz(long long __a);
__DEVICE__ long long __nv_llabs(long long __a);
__DEVICE__ long long __nv_llmax(long long __a, long long __b);
__DEVICE__ long long __nv_llmin(long long __a, long long __b);
__DEVICE__ long long __nv_llrint(double __a);
__DEVICE__ long long __nv_llrintf(float __a);
__DEVICE__ long long __nv_llround(double __a);
__DEVICE__ long long __nv_llroundf(float __a);
__DEVICE__ double __nv_log10(double __a);
__DEVICE__ float __nv_log10f(float __a);
__DEVICE__ double __nv_log1p(double __a);
__DEVICE__ float __nv_log1pf(float __a);
__DEVICE__ double __nv_log2(double __a);
__DEVICE__ float __nv_log2f(float __a);
__DEVICE__ double __nv_logb(double __a);
__DEVICE__ float __nv_logbf(float __a);
__DEVICE__ double __nv_log(double __a);
__DEVICE__ float __nv_logf(float __a);
__DEVICE__ double __nv_longlong_as_double(long long __a);
__DEVICE__ int __nv_max(int __a, int __b);
__DEVICE__ int __nv_min(int __a, int __b);
__DEVICE__ double __nv_modf(double __a, double *__b);
__DEVICE__ float __nv_modff(float __a, float *__b);
__DEVICE__ int __nv_mul24(int __a, int __b);
__DEVICE__ long long __nv_mul64hi(long long __a, long long __b);
__DEVICE__ int __nv_mulhi(int __a, int __b);
__DEVICE__ double __nv_nan(const signed char *__a);
__DEVICE__ float __nv_nanf(const signed char *__a);
__DEVICE__ double __nv_nearbyint(double __a);
__DEVICE__ float __nv_nearbyintf(float __a);
__DEVICE__ double __nv_nextafter(double __a, double __b);
__DEVICE__ float __nv_nextafterf(float __a, float __b);
__DEVICE__ double __nv_norm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_norm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__DEVICE__ double __nv_normcdf(double __a);
__DEVICE__ float __nv_normcdff(float __a);
__DEVICE__ double __nv_normcdfinv(double __a);
__DEVICE__ float __nv_normcdfinvf(float __a);
__DEVICE__ float __nv_normf(int __a, const float *__b);
__DEVICE__ double __nv_norm(int __a, const double *__b);
__DEVICE__ int __nv_popc(int __a);
__DEVICE__ int __nv_popcll(long long __a);
__DEVICE__ double __nv_pow(double __a, double __b);
__DEVICE__ float __nv_powf(float __a, float __b);
__DEVICE__ double __nv_powi(double __a, int __b);
__DEVICE__ float __nv_powif(float __a, int __b);
__DEVICE__ double __nv_rcbrt(double __a);
__DEVICE__ float __nv_rcbrtf(float __a);
__DEVICE__ double __nv_rcp64h(double __a);
__DEVICE__ double __nv_remainder(double __a, double __b);
__DEVICE__ float __nv_remainderf(float __a, float __b);
__DEVICE__ double __nv_remquo(double __a, double __b, int *__c);
__DEVICE__ float __nv_remquof(float __a, float __b, int *__c);
__DEVICE__ int __nv_rhadd(int __a, int __b);
__DEVICE__ double __nv_rhypot(double __a, double __b);
__DEVICE__ float __nv_rhypotf(float __a, float __b);
__DEVICE__ double __nv_rint(double __a);
__DEVICE__ float __nv_rintf(float __a);
__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__DEVICE__ float __nv_rnormf(int __a, const float *__b);
__DEVICE__ double __nv_rnorm(int __a, const double *__b);
__DEVICE__ double __nv_round(double __a);
__DEVICE__ float __nv_roundf(float __a);
__DEVICE__ double __nv_rsqrt(double __a);
__DEVICE__ float __nv_rsqrtf(float __a);
__DEVICE__ int __nv_sad(int __a, int __b, int __c);
__DEVICE__ float __nv_saturatef(float __a);
__DEVICE__ double __nv_scalbn(double __a, int __b);
__DEVICE__ float __nv_scalbnf(float __a, int __b);
__DEVICE__ int __nv_signbitd(double __a);
__DEVICE__ int __nv_signbitf(float __a);
__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c);
__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c);
__DEVICE__ double __nv_sin(double __a);
__DEVICE__ float __nv_sinf(float __a);
__DEVICE__ double __nv_sinh(double __a);
__DEVICE__ float __nv_sinhf(float __a);
__DEVICE__ double __nv_sinpi(double __a);
__DEVICE__ float __nv_sinpif(float __a);
__DEVICE__ double __nv_sqrt(double __a);
__DEVICE__ float __nv_sqrtf(float __a);
__DEVICE__ double __nv_tan(double __a);
__DEVICE__ float __nv_tanf(float __a);
__DEVICE__ double __nv_tanh(double __a);
__DEVICE__ float __nv_tanhf(float __a);
__DEVICE__ double __nv_tgamma(double __a);
__DEVICE__ float __nv_tgammaf(float __a);
__DEVICE__ double __nv_trunc(double __a);
__DEVICE__ float __nv_truncf(float __a);
__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__DEVICE__ double __nv_uint2double_rn(unsigned int __i);
__DEVICE__ float __nv_uint2float_rd(unsigned int __a);
__DEVICE__ float __nv_uint2float_rn(unsigned int __a);
__DEVICE__ float __nv_uint2float_ru(unsigned int __a);
__DEVICE__ float __nv_uint2float_rz(unsigned int __a);
__DEVICE__ float __nv_uint_as_float(unsigned int __a);
__DEVICE__ double __nv_ull2double_rd(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rn(unsigned long long __a);
__DEVICE__ double __nv_ull2double_ru(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rz(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rd(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rn(unsigned long long __a);
__DEVICE__ float __nv_ull2float_ru(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rz(unsigned long long __a);
__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a,
unsigned long long __b);
__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a,
unsigned long long __b);
__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a,
unsigned long long __b);
__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
unsigned int __c);
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
__DEVICE__ int __nv_vabs2(int __a);
__DEVICE__ int __nv_vabs4(int __a);
__DEVICE__ int __nv_vabsdiffs2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffs4(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu4(int __a, int __b);
__DEVICE__ int __nv_vabsss2(int __a);
__DEVICE__ int __nv_vabsss4(int __a);
__DEVICE__ int __nv_vadd2(int __a, int __b);
__DEVICE__ int __nv_vadd4(int __a, int __b);
__DEVICE__ int __nv_vaddss2(int __a, int __b);
__DEVICE__ int __nv_vaddss4(int __a, int __b);
__DEVICE__ int __nv_vaddus2(int __a, int __b);
__DEVICE__ int __nv_vaddus4(int __a, int __b);
__DEVICE__ int __nv_vavgs2(int __a, int __b);
__DEVICE__ int __nv_vavgs4(int __a, int __b);
__DEVICE__ int __nv_vavgu2(int __a, int __b);
__DEVICE__ int __nv_vavgu4(int __a, int __b);
__DEVICE__ int __nv_vcmpeq2(int __a, int __b);
__DEVICE__ int __nv_vcmpeq4(int __a, int __b);
__DEVICE__ int __nv_vcmpges2(int __a, int __b);
__DEVICE__ int __nv_vcmpges4(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu4(int __a, int __b);
__DEVICE__ int __nv_vcmpgts2(int __a, int __b);
__DEVICE__ int __nv_vcmpgts4(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu4(int __a, int __b);
__DEVICE__ int __nv_vcmples2(int __a, int __b);
__DEVICE__ int __nv_vcmples4(int __a, int __b);
__DEVICE__ int __nv_vcmpleu2(int __a, int __b);
__DEVICE__ int __nv_vcmpleu4(int __a, int __b);
__DEVICE__ int __nv_vcmplts2(int __a, int __b);
__DEVICE__ int __nv_vcmplts4(int __a, int __b);
__DEVICE__ int __nv_vcmpltu2(int __a, int __b);
__DEVICE__ int __nv_vcmpltu4(int __a, int __b);
__DEVICE__ int __nv_vcmpne2(int __a, int __b);
__DEVICE__ int __nv_vcmpne4(int __a, int __b);
__DEVICE__ int __nv_vhaddu2(int __a, int __b);
__DEVICE__ int __nv_vhaddu4(int __a, int __b);
__DEVICE__ int __nv_vmaxs2(int __a, int __b);
__DEVICE__ int __nv_vmaxs4(int __a, int __b);
__DEVICE__ int __nv_vmaxu2(int __a, int __b);
__DEVICE__ int __nv_vmaxu4(int __a, int __b);
__DEVICE__ int __nv_vmins2(int __a, int __b);
__DEVICE__ int __nv_vmins4(int __a, int __b);
__DEVICE__ int __nv_vminu2(int __a, int __b);
__DEVICE__ int __nv_vminu4(int __a, int __b);
__DEVICE__ int __nv_vneg2(int __a);
__DEVICE__ int __nv_vneg4(int __a);
__DEVICE__ int __nv_vnegss2(int __a);
__DEVICE__ int __nv_vnegss4(int __a);
__DEVICE__ int __nv_vsads2(int __a, int __b);
__DEVICE__ int __nv_vsads4(int __a, int __b);
__DEVICE__ int __nv_vsadu2(int __a, int __b);
__DEVICE__ int __nv_vsadu4(int __a, int __b);
__DEVICE__ int __nv_vseteq2(int __a, int __b);
__DEVICE__ int __nv_vseteq4(int __a, int __b);
__DEVICE__ int __nv_vsetges2(int __a, int __b);
__DEVICE__ int __nv_vsetges4(int __a, int __b);
__DEVICE__ int __nv_vsetgeu2(int __a, int __b);
__DEVICE__ int __nv_vsetgeu4(int __a, int __b);
__DEVICE__ int __nv_vsetgts2(int __a, int __b);
__DEVICE__ int __nv_vsetgts4(int __a, int __b);
__DEVICE__ int __nv_vsetgtu2(int __a, int __b);
__DEVICE__ int __nv_vsetgtu4(int __a, int __b);
__DEVICE__ int __nv_vsetles2(int __a, int __b);
__DEVICE__ int __nv_vsetles4(int __a, int __b);
__DEVICE__ int __nv_vsetleu2(int __a, int __b);
__DEVICE__ int __nv_vsetleu4(int __a, int __b);
__DEVICE__ int __nv_vsetlts2(int __a, int __b);
__DEVICE__ int __nv_vsetlts4(int __a, int __b);
__DEVICE__ int __nv_vsetltu2(int __a, int __b);
__DEVICE__ int __nv_vsetltu4(int __a, int __b);
__DEVICE__ int __nv_vsetne2(int __a, int __b);
__DEVICE__ int __nv_vsetne4(int __a, int __b);
__DEVICE__ int __nv_vsub2(int __a, int __b);
__DEVICE__ int __nv_vsub4(int __a, int __b);
__DEVICE__ int __nv_vsubss2(int __a, int __b);
__DEVICE__ int __nv_vsubss4(int __a, int __b);
__DEVICE__ int __nv_vsubus2(int __a, int __b);
__DEVICE__ int __nv_vsubus4(int __a, int __b);
#endif // CUDA_VERSION
__DEVICE__ double __nv_y0(double __a);
__DEVICE__ float __nv_y0f(float __a);
__DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);
#if defined(__OPENMP_NVPTX__)
#pragma omp end assumes ext_spmd_amenable no_openmp
#endif
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__

View File

@ -1,348 +0,0 @@
/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_MATH_H__
#define __CLANG_CUDA_MATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif
// __DEVICE__ is a helper macro with a common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions, and __forceinline__ helps inline these wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#if defined(__cplusplus)
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#endif
#else
#define __DEVICE__ static __device__ __forceinline__
#endif
// Specialized version of __DEVICE__ for functions with void return type. Needed
// because the OpenMP overlay requires constexpr functions here but prior to
// C++14 void-returning functions could not be constexpr.
#pragma push_macro("__DEVICE_VOID__")
#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE_VOID__ __DEVICE__
#endif
// libdevice provides fast low-precision and slow full-precision implementations
// for some functions. Which one gets selected depends on
// __CLANG_CUDA_APPROX_TRANSCENDENTALS__, which gets defined by clang when
// -ffast-math or -fcuda-approx-transcendentals is in effect.
#pragma push_macro("__FAST_OR_SLOW")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __FAST_OR_SLOW(fast, slow) fast
#else
#define __FAST_OR_SLOW(fast, slow) slow
#endif
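// For example, with approximate transcendentals enabled,
//   __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a)
// expands to __nv_fast_cosf(__a); otherwise it expands to __nv_cosf(__a).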
__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
__DEVICE__ double copysign(double __a, double __b) {
return __nv_copysign(__a, __b);
}
__DEVICE__ float copysignf(float __a, float __b) {
return __nv_copysignf(__a, __b);
}
__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
__DEVICE__ float cosf(float __a) {
return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
}
__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
__DEVICE__ float fdividef(float __a, float __b) {
#if __FAST_MATH__ && !__CUDA_PREC_DIV
return __nv_fast_fdividef(__a, __b);
#else
return __a / __b;
#endif
}
__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
__DEVICE__ double fma(double __a, double __b, double __c) {
return __nv_fma(__a, __b, __c);
}
__DEVICE__ float fmaf(float __a, float __b, float __c) {
return __nv_fmaf(__a, __b, __c);
}
__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long labs(long __a) { return __nv_llabs(__a); }
#else
__DEVICE__ long labs(long __a) { return __nv_abs(__a); }
#endif
__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
__DEVICE__ long long llmax(long long __a, long long __b) {
return __nv_llmax(__a, __b);
}
__DEVICE__ long long llmin(long long __a, long long __b) {
return __nv_llmin(__a, __b);
}
__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double log(double __a) { return __nv_log(__a); }
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
__DEVICE__ float log2f(float __a) {
return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
}
__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
__DEVICE__ float logf(float __a) {
return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long lrint(double __a) { return llrint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
__DEVICE__ long lround(double __a) { return llround(__a); }
__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
#else
__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
__DEVICE__ long lround(double __a) { return round(__a); }
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
#endif
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); }
__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); }
__DEVICE__ double nextafter(double __a, double __b) {
return __nv_nextafter(__a, __b);
}
__DEVICE__ float nextafterf(float __a, float __b) {
return __nv_nextafterf(__a, __b);
}
__DEVICE__ double norm(int __dim, const double *__t) {
return __nv_norm(__dim, __t);
}
__DEVICE__ double norm3d(double __a, double __b, double __c) {
return __nv_norm3d(__a, __b, __c);
}
__DEVICE__ float norm3df(float __a, float __b, float __c) {
return __nv_norm3df(__a, __b, __c);
}
__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
return __nv_norm4d(__a, __b, __c, __d);
}
__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
return __nv_norm4df(__a, __b, __c, __d);
}
__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
__DEVICE__ float normf(int __dim, const float *__t) {
return __nv_normf(__dim, __t);
}
__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
__DEVICE__ double remainder(double __a, double __b) {
return __nv_remainder(__a, __b);
}
__DEVICE__ float remainderf(float __a, float __b) {
return __nv_remainderf(__a, __b);
}
__DEVICE__ double remquo(double __a, double __b, int *__c) {
return __nv_remquo(__a, __b, __c);
}
__DEVICE__ float remquof(float __a, float __b, int *__c) {
return __nv_remquof(__a, __b, __c);
}
__DEVICE__ double rhypot(double __a, double __b) {
return __nv_rhypot(__a, __b);
}
__DEVICE__ float rhypotf(float __a, float __b) {
return __nv_rhypotf(__a, __b);
}
// __nv_rint* in libdevice is buggy and produces incorrect results.
__DEVICE__ double rint(double __a) { return __builtin_rint(__a); }
__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); }
__DEVICE__ double rnorm(int __a, const double *__b) {
return __nv_rnorm(__a, __b);
}
__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
return __nv_rnorm3d(__a, __b, __c);
}
__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
return __nv_rnorm3df(__a, __b, __c);
}
__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
return __nv_rnorm4d(__a, __b, __c, __d);
}
__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
return __nv_rnorm4df(__a, __b, __c, __d);
}
__DEVICE__ float rnormf(int __dim, const float *__t) {
return __nv_rnormf(__dim, __t);
}
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
__DEVICE__ double scalbln(double __a, long __b) {
if (__b > INT_MAX)
return __a > 0 ? HUGE_VAL : -HUGE_VAL;
if (__b < INT_MIN)
return __a > 0 ? 0.0 : -0.0;
return scalbn(__a, (int)__b);
}
__DEVICE__ float scalblnf(float __a, long __b) {
if (__b > INT_MAX)
return __a > 0 ? HUGE_VALF : -HUGE_VALF;
if (__b < INT_MIN)
return __a > 0 ? 0.f : -0.f;
return scalbnf(__a, (int)__b);
}
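// On targets where long is wider than int, the clamping above means that e.g.
// scalbln(1.0, (long)INT_MAX + 1) overflows to HUGE_VAL while
// scalblnf(1.0f, (long)INT_MIN - 1) flushes to 0.f.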
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
return __nv_sincos(__a, __s, __c);
}
__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
}
__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
return __nv_sincospi(__a, __s, __c);
}
__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
return __nv_sincospif(__a, __s, __c);
}
__DEVICE__ float sinf(float __a) {
return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
}
__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
__DEVICE__ unsigned long long ullmax(unsigned long long __a,
unsigned long long __b) {
return __nv_ullmax(__a, __b);
}
__DEVICE__ unsigned long long ullmin(unsigned long long __a,
unsigned long long __b) {
return __nv_ullmin(__a, __b);
}
__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
return __nv_umax(__a, __b);
}
__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
return __nv_umin(__a, __b);
}
__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__DEVICE_VOID__")
#pragma pop_macro("__FAST_OR_SLOW")
#endif // __CLANG_CUDA_MATH_H__

View File

@ -1,284 +0,0 @@
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#if !defined(__CUDA__) && !__HIP__
#error "This file is for CUDA/HIP compilation only."
#endif
// This file forward-declares some math functions we (or the CUDA headers)
// will define later. We need to do this, and do it before cmath is included,
// because the standard library may have constexpr math functions. In the
// absence of a prior __device__ decl, those constexpr functions may become
// implicitly host+device. host+device functions can't be overloaded, so that
// would preclude the use of our own __device__ overloads for these functions.
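// A sketch of the failure mode this avoids (hypothetical declarations): if
// the standard library is seen first with
//   constexpr float cbrt(float);   // implicitly host+device under CUDA
// then our later device-only overload
//   __device__ float cbrt(float);  // would clash instead of overloading
// could no longer be provided.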
#pragma push_macro("__DEVICE__")
#define __DEVICE__ \
static __inline__ __attribute__((always_inline)) __attribute__((device))
__DEVICE__ long abs(long);
__DEVICE__ long long abs(long long);
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);
__DEVICE__ int abs(int);
__DEVICE__ double acos(double);
__DEVICE__ float acos(float);
__DEVICE__ double acosh(double);
__DEVICE__ float acosh(float);
__DEVICE__ double asin(double);
__DEVICE__ float asin(float);
__DEVICE__ double asinh(double);
__DEVICE__ float asinh(float);
__DEVICE__ double atan2(double, double);
__DEVICE__ float atan2(float, float);
__DEVICE__ double atan(double);
__DEVICE__ float atan(float);
__DEVICE__ double atanh(double);
__DEVICE__ float atanh(float);
__DEVICE__ double cbrt(double);
__DEVICE__ float cbrt(float);
__DEVICE__ double ceil(double);
__DEVICE__ float ceil(float);
__DEVICE__ double copysign(double, double);
__DEVICE__ float copysign(float, float);
__DEVICE__ double cos(double);
__DEVICE__ float cos(float);
__DEVICE__ double cosh(double);
__DEVICE__ float cosh(float);
__DEVICE__ double erfc(double);
__DEVICE__ float erfc(float);
__DEVICE__ double erf(double);
__DEVICE__ float erf(float);
__DEVICE__ double exp2(double);
__DEVICE__ float exp2(float);
__DEVICE__ double exp(double);
__DEVICE__ float exp(float);
__DEVICE__ double expm1(double);
__DEVICE__ float expm1(float);
__DEVICE__ double fabs(double);
__DEVICE__ float fabs(float);
__DEVICE__ double fdim(double, double);
__DEVICE__ float fdim(float, float);
__DEVICE__ double floor(double);
__DEVICE__ float floor(float);
__DEVICE__ double fma(double, double, double);
__DEVICE__ float fma(float, float, float);
__DEVICE__ double fmax(double, double);
__DEVICE__ float fmax(float, float);
__DEVICE__ double fmin(double, double);
__DEVICE__ float fmin(float, float);
__DEVICE__ double fmod(double, double);
__DEVICE__ float fmod(float, float);
__DEVICE__ int fpclassify(double);
__DEVICE__ int fpclassify(float);
__DEVICE__ double frexp(double, int *);
__DEVICE__ float frexp(float, int *);
__DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
#ifdef _MSC_VER
__DEVICE__ bool isfinite(long double);
#endif
__DEVICE__ bool isfinite(double);
__DEVICE__ bool isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isinf(long double);
#endif
__DEVICE__ bool isinf(double);
__DEVICE__ bool isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isnan(long double);
#endif
__DEVICE__ bool isnan(double);
__DEVICE__ bool isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
__DEVICE__ bool isunordered(float, float);
__DEVICE__ long labs(long);
__DEVICE__ double ldexp(double, int);
__DEVICE__ float ldexp(float, int);
__DEVICE__ double lgamma(double);
__DEVICE__ float lgamma(float);
__DEVICE__ long long llabs(long long);
__DEVICE__ long long llrint(double);
__DEVICE__ long long llrint(float);
__DEVICE__ double log10(double);
__DEVICE__ float log10(float);
__DEVICE__ double log1p(double);
__DEVICE__ float log1p(float);
__DEVICE__ double log2(double);
__DEVICE__ float log2(float);
__DEVICE__ double logb(double);
__DEVICE__ float logb(float);
__DEVICE__ double log(double);
__DEVICE__ float log(float);
__DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
__DEVICE__ float nanf(const char *);
__DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
__DEVICE__ float pow(float, int);
__DEVICE__ double remainder(double, double);
__DEVICE__ float remainder(float, float);
__DEVICE__ double remquo(double, double, int *);
__DEVICE__ float remquo(float, float, int *);
__DEVICE__ double rint(double);
__DEVICE__ float rint(float);
__DEVICE__ double round(double);
__DEVICE__ float round(float);
__DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
#ifdef _MSC_VER
__DEVICE__ bool signbit(long double);
#endif
__DEVICE__ bool signbit(double);
__DEVICE__ bool signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
__DEVICE__ float sinh(float);
__DEVICE__ double sqrt(double);
__DEVICE__ float sqrt(float);
__DEVICE__ double tan(double);
__DEVICE__ float tan(float);
__DEVICE__ double tanh(double);
__DEVICE__ float tanh(float);
__DEVICE__ double tgamma(double);
__DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
// Notably missing above is nexttoward, which we don't define on
// the device side because libdevice doesn't give us an implementation, and we
// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
using ::abs;
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isinf;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnan;
using ::isnormal;
using ::isunordered;
using ::labs;
using ::ldexp;
using ::lgamma;
using ::llabs;
using ::llrint;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::llround;
using ::modf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#pragma pop_macro("__DEVICE__")
#endif

View File

@ -1,503 +0,0 @@
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/*
* WARNING: This header is intended to be directly -include'd by
* the compiler and is not supposed to be included by users.
*
* CUDA headers are implemented in a way that currently makes it
 * impossible for user code to #include them directly when compiling with
 * Clang. They present a different view of CUDA-supplied functions
 * depending on where in NVCC's compilation pipeline the headers are
 * included. Neither of these modes provides function definitions with
 * correct attributes, so we use the preprocessor to force the headers
* into a form that Clang can use.
*
* Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
* this file during every CUDA compilation.
*/
#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
#if defined(__CUDA__) && defined(__clang__)
// Include some forward declares that must come before cmath.
#include <__clang_cuda_math_forward_declares.h>
// Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
// enabled depend on it to avoid using __float128, which is unsupported in
// CUDA.
#define __CUDACC__
// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__
// Preserve common macros that will be changed below by us or by CUDA
// headers.
#pragma push_macro("__THROW")
#pragma push_macro("__CUDA_ARCH__")
// WARNING: Preprocessor hacks below are based on specific details of
// CUDA-7.x headers and are not expected to work with any other
// version of CUDA headers.
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000
#error "Unsupported CUDA version!"
#endif
#pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
#if CUDA_VERSION >= 10000
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#endif
// Make largest subset of device functions available during host
// compilation.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 9999
#endif
#include "__clang_cuda_builtin_vars.h"
// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of builtin variables declared in the file.
#define __DEVICE_LAUNCH_PARAMETERS_H__
// {math,device}_functions.h only have declarations of the
// functions. We don't need them as we're going to pull in their
// definitions from .hpp files.
#define __DEVICE_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__
#define __COMMON_FUNCTIONS_H__
// device_functions_decls is replaced by __clang_cuda_device_functions.h
// included below.
#define __DEVICE_FUNCTIONS_DECLS_H__
#undef __CUDACC__
#if CUDA_VERSION < 9000
#define __CUDABE__
#else
#define __CUDACC__
#define __CUDA_LIBDEVICE__
#endif
// Disables definitions of device-side runtime support stubs in
// cuda_device_runtime_api.h
#include "host_defines.h"
#undef __CUDACC__
#include "driver_types.h"
#include "host_config.h"
// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
// functional equivalent of what we need.
#pragma push_macro("nv_weak")
#define nv_weak weak
#undef __CUDABE__
#undef __CUDA_LIBDEVICE__
#define __CUDACC__
#include "cuda_runtime.h"
#pragma pop_macro("nv_weak")
#undef __CUDACC__
#define __CUDABE__
// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
// not have at the moment. Emulate them with a builtin memcpy/memset.
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
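// Note that the alignment argument is dropped; the __builtin_* replacements
// carry no alignment hint.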
#if CUDA_VERSION < 9000
#include "crt/device_runtime.h"
#endif
#include "crt/host_runtime.h"
// device_runtime.h defines __cxa_* macros that will conflict with
// cxxabi.h.
// FIXME: redefine these as __device__ functions.
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
#undef __cxa_vec_delete
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual
// math_functions.hpp expects this host function to be defined on macOS, but it
// ends up not being there because of the games we play here. Just define it
// ourselves; it's simple enough.
#ifdef __APPLE__
inline __host__ double __signbitd(double x) {
return std::signbit(x);
}
#endif
// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
// to provide our own.
#include <__clang_cuda_libdevice_declares.h>
// Wrappers for many device-side standard library functions, incl. math
// functions, became compiler builtins in CUDA-9 and have been removed from the
// CUDA headers. Clang now provides its own implementation of the wrappers.
#if CUDA_VERSION >= 9000
#include <__clang_cuda_device_functions.h>
#include <__clang_cuda_math.h>
#endif
// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
// counterpart does not do it, so we need to make it empty here to keep
// following CUDA includes happy.
#undef __THROW
#define __THROW
// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
// Previous versions used to check whether they are defined or not.
// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
// here to detect the switch.
#if defined(CU_DEVICE_INVALID)
#if !defined(__USE_FAST_MATH__)
#define __USE_FAST_MATH__ 0
#endif
#if !defined(__CUDA_PREC_DIV)
#define __CUDA_PREC_DIV 0
#endif
#endif
// Temporarily poison __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#pragma push_macro("__host__")
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
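// Any use of __host__ in the headers below now expands to the undeclared
// identifier UNEXPECTED_HOST_ATTRIBUTE and fails the build loudly.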
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#if CUDA_VERSION < 9000
#include "device_functions.hpp"
#endif
// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
//
// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __USE_FAST_MATH__ 1
#endif
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("__USE_FAST_MATH__")
#if CUDA_VERSION < 9000
#include "math_functions_dbl_ptx3.hpp"
#endif
#pragma pop_macro("__forceinline__")
// Pull in host-only functions that are only available when neither
// __CUDACC__ nor __CUDABE__ is defined.
#undef __MATH_FUNCTIONS_HPP__
#undef __CUDABE__
#if CUDA_VERSION < 9000
#include "math_functions.hpp"
#endif
// Alas, additional overloads for these functions are hard to get to.
// Considering that we only need these overloads for a few functions,
// we can provide them here.
static inline float rsqrt(float __a) { return rsqrtf(__a); }
static inline float rcbrt(float __a) { return rcbrtf(__a); }
static inline float sinpi(float __a) { return sinpif(__a); }
static inline float cospi(float __a) { return cospif(__a); }
static inline void sincospi(float __a, float *__b, float *__c) {
return sincospif(__a, __b, __c);
}
static inline float erfcinv(float __a) { return erfcinvf(__a); }
static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
static inline float normcdf(float __a) { return normcdff(__a); }
static inline float erfcx(float __a) { return erfcxf(__a); }
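// e.g. a call like rsqrt(2.0f) now resolves to the float overload above
// rather than silently promoting its argument to double.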
#if CUDA_VERSION < 9000
// For some reason the single-argument variant is not always declared by
// the CUDA headers. Alas, device_functions.hpp included below needs it.
static inline __device__ void __brkpt(int __c) { __brkpt(); }
#endif
// Now include *.hpp with definitions of various GPU functions. Alas,
// a lot of things get declared/defined with the __host__ attribute, which
// we don't want, so we have to define it out. We also have to include
// {device,math}_functions.hpp again in order to extract the other
// branch of the #if/else inside.
#define __host__
#undef __CUDABE__
#define __CUDACC__
#if CUDA_VERSION >= 9000
// Some atomic functions became compiler builtins in CUDA-9, so we need their
// declarations.
#include "device_atomic_functions.h"
#endif
#undef __DEVICE_FUNCTIONS_HPP__
#include "device_atomic_functions.hpp"
#if CUDA_VERSION >= 9000
#include "crt/device_functions.hpp"
#include "crt/device_double_functions.hpp"
#else
#include "device_functions.hpp"
#define __CUDABE__
#include "device_double_functions.h"
#undef __CUDABE__
#endif
#include "sm_20_atomic_functions.hpp"
// Predicate functions used in `__builtin_assume` need to have no side effects.
// However, sm_20_intrinsics.hpp defines them with neither the pure nor the
// const attribute. Rename the definitions from sm_20_intrinsics.hpp and
// re-define them as const ones below.
#pragma push_macro("__isGlobal")
#pragma push_macro("__isShared")
#pragma push_macro("__isConstant")
#pragma push_macro("__isLocal")
#define __isGlobal __ignored_cuda___isGlobal
#define __isShared __ignored_cuda___isShared
#define __isConstant __ignored_cuda___isConstant
#define __isLocal __ignored_cuda___isLocal
#include "sm_20_intrinsics.hpp"
#pragma pop_macro("__isGlobal")
#pragma pop_macro("__isShared")
#pragma pop_macro("__isConstant")
#pragma pop_macro("__isLocal")
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__ __attribute__((const))
__DEVICE__ unsigned int __isGlobal(const void *p) {
return __nvvm_isspacep_global(p);
}
__DEVICE__ unsigned int __isShared(const void *p) {
return __nvvm_isspacep_shared(p);
}
__DEVICE__ unsigned int __isConstant(const void *p) {
return __nvvm_isspacep_const(p);
}
__DEVICE__ unsigned int __isLocal(const void *p) {
return __nvvm_isspacep_local(p);
}
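// Usage sketch (hypothetical): the const attribute makes a call such as
//   __builtin_assume(__isShared(__ptr));
// side-effect free, so the optimizer may fold or drop it freely.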
#pragma pop_macro("__DEVICE__")
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
// define them using builtins so that the optimizer can reason about and across
// these instructions. In particular, using intrinsics for ldg gets us the
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
// hardware, seems to generate faster machine code because ptxas can more easily
// reason about our code.
#if CUDA_VERSION >= 8000
#pragma push_macro("__CUDA_ARCH__")
#undef __CUDA_ARCH__
#include "sm_60_atomic_functions.hpp"
#include "sm_61_intrinsics.hpp"
#pragma pop_macro("__CUDA_ARCH__")
#endif
#undef __MATH_FUNCTIONS_HPP__
// math_functions.hpp defines ::signbit as a __host__ __device__ function. This
// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
// math_functions.hpp's ::signbit. It's guarded by #undef signbit, but that's
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
#undef __GNUC__
#define signbit __ignored_cuda_signbit
// CUDA-9 omits device-side definitions of some math functions if it sees the
// include guard from libstdc++'s math.h wrapper. We have to undo the header
// guard temporarily to get the definitions we need.
#pragma push_macro("_GLIBCXX_MATH_H")
#pragma push_macro("_LIBCPP_VERSION")
#if CUDA_VERSION >= 9000
#undef _GLIBCXX_MATH_H
// We also need to undo another guard that checks for libc++ 3.8+
#ifdef _LIBCPP_VERSION
#define _LIBCPP_VERSION 3700
#endif
#endif
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
#pragma pop_macro("__host__")
// __clang_cuda_texture_intrinsics.h must be included first in order to provide
// the implementation for __nv_tex_surf_handler that CUDA's headers depend on.
// The implementation requires C++11 and only works with CUDA-9 or newer.
#if __cplusplus >= 201103L && CUDA_VERSION >= 9000
// clang-format off
#include <__clang_cuda_texture_intrinsics.h>
// clang-format on
#else
#if CUDA_VERSION >= 9000
// Provide a hint that texture support needs C++11.
template <typename T> struct __nv_tex_needs_cxx11 {
const static bool value = false;
};
template <class T>
__host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr,
cudaTextureObject_t obj,
float x) {
_Static_assert(__nv_tex_needs_cxx11<T>::value,
"Texture support requires C++11");
}
#else
// Textures in CUDA-8 and older are not supported by clang. There's no
// convenient way to intercept texture use in these versions, so we can't
// produce a meaningful error. The source code that attempts to use textures
// will continue to fail as it does now.
#endif // CUDA_VERSION
#endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000
#include "texture_fetch_functions.h"
#include "texture_indirect_functions.h"
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("__THROW")
// Set up compiler macros expected to be seen during compilation.
#undef __CUDABE__
#define __CUDACC__
extern "C" {
// Device-side CUDA system calls.
// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
// We need these declarations and wrappers for device-side
// malloc/free/printf calls to work without relying on the
// -fcuda-disable-target-call-checks option.
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
// __assertfail() used to have a `noreturn` attribute. Unfortunately that
// contributed to triggering the longstanding bug in ptxas when assert was used
// in sufficiently convoluted code. See
// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
__device__ void __assertfail(const char *__message, const char *__file,
unsigned __line, const char *__function,
size_t __charSize);
// In order for the standard assert() macro on Linux to work, we need to
// provide a device-side __assert_fail().
__device__ static inline void __assert_fail(const char *__message,
const char *__file, unsigned __line,
const char *__function) {
__assertfail(__message, __file, __line, __function, sizeof(char));
}
// Clang will convert printf into vprintf, but we still need
// device-side declaration for it.
__device__ int printf(const char *, ...);
} // extern "C"
// We also need device-side std::malloc and std::free.
namespace std {
__device__ static inline void free(void *__ptr) { ::free(__ptr); }
__device__ static inline void *malloc(size_t __size) {
return ::malloc(__size);
}
} // namespace std
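// A usage sketch (hypothetical kernel, not part of the original header)
// exercising the declarations above; printf is rewritten to vprintf by clang,
// and malloc/free resolve to the device-side declarations:
//   __global__ void __sketch_heap_demo(void) {
//     int *__p = static_cast<int *>(malloc(sizeof(int)));
//     *__p = threadIdx.x;
//     printf("thread %d wrote %d\n", threadIdx.x, *__p);
//     free(__p);
//   }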
// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
// come after we've pulled in the definition of uint3 and dim3.
__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
return {x, y, z};
}
__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
return {x, y, z};
}
__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_blockDim_t::operator uint3() const {
return {x, y, z};
}
__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
return {x, y, z};
}
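// With these conversions in place, device code can mix the builtin variables
// with real uint3/dim3 values. An illustrative (hypothetical) helper:
//   __device__ unsigned int __sketch_global_id(void) {
//     uint3 __t = threadIdx; // uses operator uint3() defined above
//     return blockIdx.x * blockDim.x + __t.x;
//   }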
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
#include <__clang_cuda_complex_builtins.h>
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
// incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
// hack, force-include the header (nvcc doesn't include it by default) but
// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
#define uint3 __cuda_builtin_threadIdx_t
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
// CUDA runtime uses this undocumented function to access kernel launch
// configuration. The declaration is in crt/device_functions.h but that file
// includes a lot of other stuff we don't want. Instead, we'll provide our own
// declaration for it here.
#if CUDA_VERSION >= 9020
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
size_t sharedMem = 0,
void *stream = 0);
#endif
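// Assumed lowering sketch (not part of the original header): a launch such as
//   kernel<<<grid, block, shmem, stream>>>(args...);
// is roughly compiled into
//   __cudaPushCallConfiguration(grid, block, shmem, stream);
//   // ...followed by a call to the kernel's host-side stub, which pops the
//   // configuration and performs the actual launch.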
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__

View File

@ -1,742 +0,0 @@
/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*
 * This header provides in-header implementations for NVCC's built-in
* __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
* built-in is unusual as it's actually a set of function overloads that use the
* first string literal argument as one of the overload parameters.
*/
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// __nv_tex_surf_handler() provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...) \
::__cuda_tex::__tex_fetch< \
::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
__VA_ARGS__)
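// For example, a call such as
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// expands to
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);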
#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")
// Put all functions into anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from the files compiled with
// -fgpu-rdc. E.g. a library and an app using it may be built with a different
// version of this header file.
namespace {
// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {
// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use string literals for that, as that
// would require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the hash
// function has a collision for any of the texture operations, the compilation
// will fail due to an attempt to redefine a tag with the same value. If the
// header compiles, then the hash function is good enough for the job.
constexpr int __tex_len(const char *s) {
return (s[0] == 0) ? 0
: (s[1] == 0) ? 1
: (s[2] == 0) ? 2
: (s[3] == 0) ? 3
: (s[4] == 0) ? 4
: (s[5] == 0) ? 5
: (s[6] == 0) ? 6
: (s[7] == 0) ? 7
: (s[8] == 0) ? 8
: (s[9] == 0) ? 9
: (s[10] == 0) ? 10
: (s[11] == 0) ? 11
: (s[12] == 0) ? 12
: (s[13] == 0) ? 13
: (s[14] == 0) ? 14
: (s[15] == 0) ? 15
: (s[16] == 0) ? 16
: (s[17] == 0) ? 17
: (s[18] == 0) ? 18
: (s[19] == 0) ? 19
: (s[20] == 0) ? 20
: (s[21] == 0) ? 21
: (s[22] == 0) ? 22
: (s[23] == 0) ? 23
: (s[24] == 0) ? 24
: (s[25] == 0) ? 25
: (s[26] == 0) ? 26
: (s[27] == 0) ? 27
: (s[28] == 0) ? 28
: (s[29] == 0) ? 29
: (s[30] == 0) ? 30
: (s[31] == 0) ? 31
: 32;
}
constexpr int __tex_hash_map(int c) {
return (c == 49) ? 10
: (c == 50) ? 0
: (c == 51) ? 100
: (c == 52) ? 30
: (c == 67) ? 10
: (c == 68) ? 0
: (c == 69) ? 25
: (c == 72) ? 70
: (c == 77) ? 0
: (c == 96) ? 44
: (c == 99) ? 10
: (c == 100) ? 5
: (c == 101) ? 60
: (c == 102) ? 40
: (c == 103) ? 70
: (c == 104) ? 25
: (c == 112) ? 0
: (c == 114) ? 45
: (c == 117) ? 5
: (c == 118) ? 85
: (c == 120) ? 20
: 225;
}
constexpr int __tex_op_hash(const char *str) {
return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
__tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
// Tag type to identify particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant) \
__Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
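// E.g. __ID("__tex2D_v2") names __Tag<__tex_op_hash("__tex2D_v2")>, and
// __IDV("__tex2Dgather_v2", 1) names
// __Tag<10000 + __tex_op_hash("__tex2Dgather_v2") * 100 + 1>.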
// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4.
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
using __base_t = float;
using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
using __base_t = char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
using __base_t = signed char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
using __base_t = unsigned char;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
using __base_t = short;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
using __base_t = unsigned short;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
using __base_t = int;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
using __base_t = unsigned int;
using __fetch_t = uint4;
};
// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
using __base_t = decltype(__T::x);
using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;
// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
// Construct inline assembly output args.
// Results are stored in a temp var __r.
// The isResident bool is pointed to by __ir.
// Asm args for return values. It's a 4-element vector.
#define __ASM_OUT(__t) \
("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
template <> \
__device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
__rt __r; \
asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
return __r; \
}
// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
__asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to the above, but returns a boolean 'isPresent' value in
// addition to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
__asm_args)
// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, uint4, __args, \
__asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
__asm_args)
// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Same, but for sparse ops. Only available on sm_60+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif
// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
__IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
// Convenience macros which convert the string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
__asm_args)
// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit it, instead of redefining a copy.
#define __IMPL_ALIASI(__op, __opn) \
template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
"s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
(float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
(float __x, int __layer), "tex.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
(float __x, int __layer, float __dPdx, float __dPdy),
"tex.grad.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
(float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
"tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");
__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
" selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y)));
__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
"tex.grad.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
__IMPL_S3S("__itex2DGrad_sparse",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
(float __x, float __y, int __layer), "tex.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
__IMPL_S3S("__itex2DLayered_sparse",
(float __x, float __y, int __layer, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy),
"tex.grad.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
__IMPL_S3S(
"__itex2DLayeredGrad_sparse",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
(float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
__IMPL_S3S("__itex2DLayeredLod_sparse",
(float __x, float __y, int __layer, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
(float __x, float __y, float __level), "tex.level.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
__IMPL_S3S("__itex2DLod_sparse",
(float __x, float __y, float __level, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__level)));
// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr) \
__IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
__IDV("__tex2Dgather_rmnf_v2", variant), \
(float __x, float __y, int __comp), instr, "f32", \
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
__IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
__IDV("__tex2Dgather_v2", variant)); \
__IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
(float __x, float __y, unsigned char *__ir, int __comp), \
"{.reg .pred %%p0;\n\t" instr, "f32", \
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
"selp.u16 %4, 1, 0, %%p0; }", \
("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
// Umbrella dispatcher -- calls into specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
template <class __T>
__device__ static float4 __run(cudaTextureObject_t __obj, float __x,
float __y, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
unsigned char *__ir, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 1:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 2:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 3:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
}
}
};
#endif
// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
"tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");
__IMPL_S3S("__itex3D_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.3d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
__IMPL_S3S("__itex3DGrad_sparse",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.3d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
__IMPL_S3S("__itex3DLod_sparse",
(float __x, float __y, float __z, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
(float __x, float __y, float __z), "tex.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
__IMPL_S3S("__itexCubemap_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.cube.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
(float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
(float __x, float __y, float __z, int __layer, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
"f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
"f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
(float __x, float __y, float __z, int __layer, float __level),
"tex.level.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.cube.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
// Helper class for extracting slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
template <int __NElements = sizeof(__DestT) /
sizeof(typename __TypeInfoT<__DestT>::__base_t)>
__device__ static __DestT __run(__SrcT __v);
template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
template <> __device__ static __DestT __run<2>(__SrcT __v) {
return {__v.x, __v.y};
}
template <> __device__ static __DestT __run<3>(__SrcT __v) {
return {__v.x, __v.y, __v.z};
}
template <> __device__ static __DestT __run<4>(__SrcT __v) {
return {__v.x, __v.y, __v.z, __v.w};
}
};
// These are the top-level function overloads the __nv_tex_surf_handler expands
// to. Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::__run` variant should be used to
// fetch texture data and which `__convert::__run` is needed to convert it into
// the appropriate return type.
// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
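// An illustrative call (hypothetical variables): fetching a float4 texel from
// a 2D texture object,
//   float4 __ret;
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// deduces __T = float4, so __FetchT = float4 and __convert::__run is a plain
// four-component copy.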
#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t at the C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
// and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
cudaTextureObject_t __obj;
asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
return __obj;
}
// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, float4>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeElementType> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__

View File

@ -1,842 +0,0 @@
/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_CMATH_H__
#define __CLANG_HIP_CMATH_H__
#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <limits>
#include <type_traits>
#include <utility>
#endif
#include <limits.h>
#include <stdint.h>
#endif // !defined(__HIPCC_RTC__)
#pragma push_macro("__DEVICE__")
#pragma push_macro("__CONSTEXPR__")
#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#define __CONSTEXPR__ constexpr
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#define __CONSTEXPR__
#endif // __OPENMP_AMDGCN__
// Start with functions that cannot be defined by DEF macros below.
#if defined(__cplusplus)
#if defined __OPENMP_AMDGCN__
__DEVICE__ __CONSTEXPR__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ __CONSTEXPR__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ __CONSTEXPR__ float cos(float __x) { return ::cosf(__x); }
#endif
__DEVICE__ __CONSTEXPR__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ __CONSTEXPR__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ __CONSTEXPR__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ __CONSTEXPR__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ __CONSTEXPR__ float fma(float __x, float __y, float __z) {
return ::fmaf(__x, __y, __z);
}
#if !defined(__HIPCC_RTC__)
// The value returned by fpclassify is platform dependent, therefore it is not
// supported by hipRTC.
__DEVICE__ __CONSTEXPR__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ __CONSTEXPR__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
#endif // !defined(__HIPCC_RTC__)
__DEVICE__ __CONSTEXPR__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
#if defined(__OPENMP_AMDGCN__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
__DEVICE__ __CONSTEXPR__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ __CONSTEXPR__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ __CONSTEXPR__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ __CONSTEXPR__ int isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ __CONSTEXPR__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ __CONSTEXPR__ int isnan(double __x) { return ::__isnan(__x); }
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
__DEVICE__ __CONSTEXPR__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ __CONSTEXPR__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ __CONSTEXPR__ bool isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ __CONSTEXPR__ bool isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ __CONSTEXPR__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ __CONSTEXPR__ bool isnan(double __x) { return ::__isnan(__x); }
#if defined(__OPENMP_AMDGCN__)
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
__DEVICE__ __CONSTEXPR__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isnormal(float __x) {
return __builtin_isnormal(__x);
}
__DEVICE__ __CONSTEXPR__ bool isnormal(double __x) {
return __builtin_isnormal(__x);
}
__DEVICE__ __CONSTEXPR__ bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ __CONSTEXPR__ float modf(float __x, float *__iptr) {
return ::modff(__x, __iptr);
}
__DEVICE__ __CONSTEXPR__ float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
__DEVICE__ __CONSTEXPR__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ __CONSTEXPR__ float remquo(float __x, float __y, int *__quo) {
return ::remquof(__x, __y, __quo);
}
__DEVICE__ __CONSTEXPR__ float scalbln(float __x, long int __n) {
return ::scalblnf(__x, __n);
}
__DEVICE__ __CONSTEXPR__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ __CONSTEXPR__ bool signbit(double __x) { return ::__signbit(__x); }
// Notably missing above is nexttoward. We omit it because
// ocml doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
// Other functions.
__DEVICE__ __CONSTEXPR__ _Float16 fma(_Float16 __x, _Float16 __y,
_Float16 __z) {
return __builtin_fmaf16(__x, __y, __z);
}
__DEVICE__ __CONSTEXPR__ _Float16 pow(_Float16 __base, int __iexp) {
return __ocml_pown_f16(__base, __iexp);
}
#ifndef __OPENMP_AMDGCN__
// BEGIN DEF_FUN and HIP_OVERLOAD
// BEGIN DEF_FUN
#pragma push_macro("__DEF_FUN1")
#pragma push_macro("__DEF_FUN2")
#pragma push_macro("__DEF_FUN2_FI")
// Define cmath functions with float argument and returns __retty.
#define __DEF_FUN1(__retty, __func) \
__DEVICE__ __CONSTEXPR__ __retty __func(float __x) { return __func##f(__x); }
// Define cmath functions with two float arguments and returns __retty.
#define __DEF_FUN2(__retty, __func) \
__DEVICE__ __CONSTEXPR__ __retty __func(float __x, float __y) { \
return __func##f(__x, __y); \
}
// Define cmath functions with a float and an int argument and returns __retty.
#define __DEF_FUN2_FI(__retty, __func) \
__DEVICE__ __CONSTEXPR__ __retty __func(float __x, int __y) { \
return __func##f(__x, __y); \
}
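// For instance, __DEF_FUN1(float, acos) below expands to
//   __DEVICE__ __CONSTEXPR__ float acos(float __x) { return acosf(__x); }
// forwarding the float overload to the corresponding "acosf"-style function.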
__DEF_FUN1(float, acos)
__DEF_FUN1(float, acosh)
__DEF_FUN1(float, asin)
__DEF_FUN1(float, asinh)
__DEF_FUN1(float, atan)
__DEF_FUN2(float, atan2)
__DEF_FUN1(float, atanh)
__DEF_FUN1(float, cbrt)
__DEF_FUN1(float, ceil)
__DEF_FUN2(float, copysign)
__DEF_FUN1(float, cos)
__DEF_FUN1(float, cosh)
__DEF_FUN1(float, erf)
__DEF_FUN1(float, erfc)
__DEF_FUN1(float, exp)
__DEF_FUN1(float, exp2)
__DEF_FUN1(float, expm1)
__DEF_FUN1(float, fabs)
__DEF_FUN2(float, fdim)
__DEF_FUN1(float, floor)
__DEF_FUN2(float, fmax)
__DEF_FUN2(float, fmin)
__DEF_FUN2(float, fmod)
__DEF_FUN2(float, hypot)
__DEF_FUN1(int, ilogb)
__DEF_FUN2_FI(float, ldexp)
__DEF_FUN1(float, lgamma)
__DEF_FUN1(float, log)
__DEF_FUN1(float, log10)
__DEF_FUN1(float, log1p)
__DEF_FUN1(float, log2)
__DEF_FUN1(float, logb)
__DEF_FUN1(long long, llrint)
__DEF_FUN1(long long, llround)
__DEF_FUN1(long, lrint)
__DEF_FUN1(long, lround)
__DEF_FUN1(float, nearbyint)
__DEF_FUN2(float, nextafter)
__DEF_FUN2(float, pow)
__DEF_FUN2(float, remainder)
__DEF_FUN1(float, rint)
__DEF_FUN1(float, round)
__DEF_FUN2_FI(float, scalbn)
__DEF_FUN1(float, sin)
__DEF_FUN1(float, sinh)
__DEF_FUN1(float, sqrt)
__DEF_FUN1(float, tan)
__DEF_FUN1(float, tanh)
__DEF_FUN1(float, tgamma)
__DEF_FUN1(float, trunc)
#pragma pop_macro("__DEF_FUN1")
#pragma pop_macro("__DEF_FUN2")
#pragma pop_macro("__DEF_FUN2_FI")
// END DEF_FUN
// BEGIN HIP_OVERLOAD
#pragma push_macro("__HIP_OVERLOAD1")
#pragma push_macro("__HIP_OVERLOAD2")
// __hip_enable_if::type is a type function which returns __T if __B is true.
template <bool __B, class __T = void> struct __hip_enable_if {};
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
namespace __hip {
template <class _Tp> struct is_integral {
enum { value = 0 };
};
template <> struct is_integral<bool> {
enum { value = 1 };
};
template <> struct is_integral<char> {
enum { value = 1 };
};
template <> struct is_integral<signed char> {
enum { value = 1 };
};
template <> struct is_integral<unsigned char> {
enum { value = 1 };
};
template <> struct is_integral<wchar_t> {
enum { value = 1 };
};
template <> struct is_integral<short> {
enum { value = 1 };
};
template <> struct is_integral<unsigned short> {
enum { value = 1 };
};
template <> struct is_integral<int> {
enum { value = 1 };
};
template <> struct is_integral<unsigned int> {
enum { value = 1 };
};
template <> struct is_integral<long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long> {
enum { value = 1 };
};
template <> struct is_integral<long long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long long> {
enum { value = 1 };
};
// TODO: specialize is_arithmetic<_Float16>.
template <class _Tp> struct is_arithmetic {
enum { value = 0 };
};
template <> struct is_arithmetic<bool> {
enum { value = 1 };
};
template <> struct is_arithmetic<char> {
enum { value = 1 };
};
template <> struct is_arithmetic<signed char> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned char> {
enum { value = 1 };
};
template <> struct is_arithmetic<wchar_t> {
enum { value = 1 };
};
template <> struct is_arithmetic<short> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned short> {
enum { value = 1 };
};
template <> struct is_arithmetic<int> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned int> {
enum { value = 1 };
};
template <> struct is_arithmetic<long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long> {
enum { value = 1 };
};
template <> struct is_arithmetic<long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<float> {
enum { value = 1 };
};
template <> struct is_arithmetic<double> {
enum { value = 1 };
};
struct true_type {
static const __constant__ bool value = true;
};
struct false_type {
static const __constant__ bool value = false;
};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
template <typename __T> typename add_rvalue_reference<__T>::type declval();
// decltype is only available in C++11 and above.
#if __cplusplus >= 201103L
// __hip_promote
template <class _Tp> struct __numeric_type {
static void __test(...);
static _Float16 __test(_Float16);
static float __test(float);
static double __test(char);
static double __test(int);
static double __test(unsigned);
static double __test(long);
static double __test(unsigned long);
static double __test(long long);
static double __test(unsigned long long);
static double __test(double);
// No support for long double, use double instead.
static double __test(long double);
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <> struct __numeric_type<void> { static const bool value = true; };
template <class _A1, class _A2 = void, class _A3 = void,
bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
&&__numeric_type<_A3>::value>
class __promote_imp {
public:
static const bool value = false;
};
template <class _A1, class _A2, class _A3>
class __promote_imp<_A1, _A2, _A3, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
typedef typename __promote_imp<_A3>::type __type3;
public:
typedef decltype(__type1() + __type2() + __type3()) type;
static const bool value = true;
};
template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
public:
typedef decltype(__type1() + __type2()) type;
static const bool value = true;
};
template <class _A1> class __promote_imp<_A1, void, void, true> {
public:
typedef typename __numeric_type<_A1>::type type;
static const bool value = true;
};
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
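// Compile-time sketch of the promotion rules above (illustrative only):
//   static_assert(is_same<__promote<float, int>::type, double>::value, "");
//   static_assert(is_same<__promote<float, float>::type, float>::value, "");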
#endif //__cplusplus >= 201103L
} // namespace __hip
// __HIP_OVERLOAD1 is used to resolve function calls with an integer argument to
// avoid compilation errors due to ambiguity. E.g. floor(5) is resolved with
// floor(double).
#define __HIP_OVERLOAD1(__retty, __fn) \
template <typename __T> \
__DEVICE__ __CONSTEXPR__ \
typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
// or integer arguments to avoid compilation errors due to ambiguity. E.g.
// max(5.0f, 6.0) is resolved with max(double, double).
#if __cplusplus >= 201103L
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ __CONSTEXPR__ typename __hip_enable_if< \
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \
typename __hip::__promote<__T1, __T2>::type>::type \
__fn(__T1 __x, __T2 __y) { \
typedef typename __hip::__promote<__T1, __T2>::type __result_type; \
return __fn((__result_type)__x, (__result_type)__y); \
}
#else
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ __CONSTEXPR__ \
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \
__hip::is_arithmetic<__T2>::value, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
#endif
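// So, for example (illustrative):
//   floor(5);       // __HIP_OVERLOAD1: calls ::floor((double)5)
//   max(5.0f, 6.0); // __HIP_OVERLOAD2: both arguments promoted to double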
__HIP_OVERLOAD1(double, acos)
__HIP_OVERLOAD1(double, acosh)
__HIP_OVERLOAD1(double, asin)
__HIP_OVERLOAD1(double, asinh)
__HIP_OVERLOAD1(double, atan)
__HIP_OVERLOAD2(double, atan2)
__HIP_OVERLOAD1(double, atanh)
__HIP_OVERLOAD1(double, cbrt)
__HIP_OVERLOAD1(double, ceil)
__HIP_OVERLOAD2(double, copysign)
__HIP_OVERLOAD1(double, cos)
__HIP_OVERLOAD1(double, cosh)
__HIP_OVERLOAD1(double, erf)
__HIP_OVERLOAD1(double, erfc)
__HIP_OVERLOAD1(double, exp)
__HIP_OVERLOAD1(double, exp2)
__HIP_OVERLOAD1(double, expm1)
__HIP_OVERLOAD1(double, fabs)
__HIP_OVERLOAD2(double, fdim)
__HIP_OVERLOAD1(double, floor)
__HIP_OVERLOAD2(double, fmax)
__HIP_OVERLOAD2(double, fmin)
__HIP_OVERLOAD2(double, fmod)
#if !defined(__HIPCC_RTC__)
__HIP_OVERLOAD1(int, fpclassify)
#endif // !defined(__HIPCC_RTC__)
__HIP_OVERLOAD2(double, hypot)
__HIP_OVERLOAD1(int, ilogb)
__HIP_OVERLOAD1(bool, isfinite)
__HIP_OVERLOAD2(bool, isgreater)
__HIP_OVERLOAD2(bool, isgreaterequal)
__HIP_OVERLOAD1(bool, isinf)
__HIP_OVERLOAD2(bool, isless)
__HIP_OVERLOAD2(bool, islessequal)
__HIP_OVERLOAD2(bool, islessgreater)
__HIP_OVERLOAD1(bool, isnan)
__HIP_OVERLOAD1(bool, isnormal)
__HIP_OVERLOAD2(bool, isunordered)
__HIP_OVERLOAD1(double, lgamma)
__HIP_OVERLOAD1(double, log)
__HIP_OVERLOAD1(double, log10)
__HIP_OVERLOAD1(double, log1p)
__HIP_OVERLOAD1(double, log2)
__HIP_OVERLOAD1(double, logb)
__HIP_OVERLOAD1(long long, llrint)
__HIP_OVERLOAD1(long long, llround)
__HIP_OVERLOAD1(long, lrint)
__HIP_OVERLOAD1(long, lround)
__HIP_OVERLOAD1(double, nearbyint)
__HIP_OVERLOAD2(double, nextafter)
__HIP_OVERLOAD2(double, pow)
__HIP_OVERLOAD2(double, remainder)
__HIP_OVERLOAD1(double, rint)
__HIP_OVERLOAD1(double, round)
__HIP_OVERLOAD1(bool, signbit)
__HIP_OVERLOAD1(double, sin)
__HIP_OVERLOAD1(double, sinh)
__HIP_OVERLOAD1(double, sqrt)
__HIP_OVERLOAD1(double, tan)
__HIP_OVERLOAD1(double, tanh)
__HIP_OVERLOAD1(double, tgamma)
__HIP_OVERLOAD1(double, trunc)
// Overload these, but don't add them to std; they are not part of cmath.
__HIP_OVERLOAD2(double, max)
__HIP_OVERLOAD2(double, min)
// Additional Overloads that don't quite match HIP_OVERLOAD.
#if __cplusplus >= 201103L
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
typename __hip::__promote<__T1, __T2, __T3>::type>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
}
#else
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return ::fma((double)__x, (double)__y, (double)__z);
}
#endif
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
frexp(__T __x, int *__exp) {
return ::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
ldexp(__T __x, int __exp) {
return ::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
modf(__T __x, double *__exp) {
return ::modf((double)__x, __exp);
}
#if __cplusplus >= 201103L
template <typename __T1, typename __T2>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
typename __hip::__promote<__T1, __T2>::type>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
return ::remquo((__result_type)__x, (__result_type)__y, __quo);
}
#else
template <typename __T1, typename __T2>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return ::remquo((double)__x, (double)__y, __quo);
}
#endif
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbln(__T __x, long int __exp) {
return ::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbn(__T __x, int __exp) {
return ::scalbn((double)__x, __exp);
}
#pragma pop_macro("__HIP_OVERLOAD1")
#pragma pop_macro("__HIP_OVERLOAD2")
// END HIP_OVERLOAD
// END DEF_FUN and HIP_OVERLOAD
#endif // ifndef __OPENMP_AMDGCN__
#endif // defined(__cplusplus)
#ifndef __OPENMP_AMDGCN__
// Define these overloads inside the namespace our standard library uses.
#if !defined(__HIPCC_RTC__)
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _LIBCPP_BEGIN_NAMESPACE_STD
// Pull the new overloads we defined above into namespace std.
// using ::abs; - This may be considered for C++.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::modf;
// using ::nan; - This may be considered for C++.
// using ::nanf; - This may be considered for C++.
// using ::nanl; - This is not yet defined.
using ::nearbyint;
using ::nextafter;
// using ::nexttoward; - Omit this since we do not have a definition.
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
// Well this is fun: We need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are different
// from its std::isinf and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif
// Finally, pull the "foobarf" functions that HIP defines into std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
// using ::nexttowardf; - Omit this since we do not have a definition.
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
} // namespace std
#endif // _LIBCPP_END_NAMESPACE_STD
#endif // !defined(__HIPCC_RTC__)
// Define device-side math functions from <ymath.h> on MSVC.
#if !defined(__HIPCC_RTC__)
#if defined(_MSC_VER)
// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
// But, from VS2019, it's only included in `<complex>`. We need to include
// `<ymath.h>` here to ensure the C functions declared there won't be marked as
// `__host__` and `__device__` through the `<complex>` wrapper.
#include <ymath.h>
#if defined(__cplusplus)
extern "C" {
#endif // defined(__cplusplus)
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Cosh(double x,
double y) {
return cosh(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FCosh(float x,
float y) {
return coshf(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _Dtest(double *p) {
return fpclassify(*p);
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _FDtest(float *p) {
return fpclassify(*p);
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Sinh(double x,
double y) {
return sinh(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FSinh(float x,
float y) {
return sinhf(x) * y;
}
#if defined(__cplusplus)
}
#endif // defined(__cplusplus)
#endif // defined(_MSC_VER)
#endif // !defined(__HIPCC_RTC__)
#endif // ifndef __OPENMP_AMDGCN__
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__CONSTEXPR__")
#endif // __CLANG_HIP_CMATH_H__
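
With those using-declarations in place, device code can spell math calls through std:: and still resolve to the HIP device overloads pulled in above. A minimal sketch, assuming a hipcc-style toolchain (the kernel and buffer names are illustrative):

#include <cmath>
#include <hip/hip_runtime.h>

// std::hypot resolves to the device overload that the header above
// pulled into namespace std.
__global__ void norms(float *out, const float *x, const float *y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = std::hypot(x[i], y[i]);
}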

View File

@@ -1,353 +0,0 @@
/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
#if !defined(__HIPCC_RTC__) && __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#ifdef __cplusplus
extern "C" {
#endif
// BEGIN FLOAT
__device__ __attribute__((const)) float __ocml_acos_f32(float);
__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
__device__ __attribute__((const)) float __ocml_asin_f32(float);
__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
__device__ __attribute__((const)) float __ocml_atan_f32(float);
__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
__device__ __attribute__((const)) float __ocml_ceil_f32(float);
__device__ __attribute__((const)) float __ocml_copysign_f32(float, float);
__device__ float __ocml_cos_f32(float);
__device__ float __ocml_native_cos_f32(float);
__device__ __attribute__((pure)) float __ocml_cosh_f32(float);
__device__ float __ocml_cospi_f32(float);
__device__ float __ocml_i0_f32(float);
__device__ float __ocml_i1_f32(float);
__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
__device__ __attribute__((pure)) float __ocml_erf_f32(float);
__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
__device__ __attribute__((pure)) float __ocml_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
__device__ __attribute__((const)) float __ocml_fabs_f32(float);
__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
__device__ __attribute__((const)) float __ocml_floor_f32(float);
__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmod_f32(float, float);
__device__ float __ocml_frexp_f32(float,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
__device__ __attribute__((const)) int __ocml_isinf_f32(float);
__device__ __attribute__((const)) int __ocml_isnan_f32(float);
__device__ float __ocml_j0_f32(float);
__device__ float __ocml_j1_f32(float);
__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
__device__ float __ocml_lgamma_f32(float);
__device__ __attribute__((pure)) float __ocml_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
__device__ __attribute__((pure)) float __ocml_log2_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
__device__ __attribute__((const)) float __ocml_logb_f32(float);
__device__ __attribute__((pure)) float __ocml_log_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
__device__ float __ocml_modf_f32(float,
__attribute__((address_space(5))) float *);
__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
float);
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
__device__ float __ocml_remquo_f32(float, float,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
__device__ __attribute__((const)) float __ocml_rint_f32(float);
__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
float);
__device__ __attribute__((const)) float __ocml_round_f32(float);
__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
__device__ __attribute__((const)) int __ocml_signbit_f32(float);
__device__ float __ocml_sincos_f32(float,
__attribute__((address_space(5))) float *);
__device__ float __ocml_sincospi_f32(float,
__attribute__((address_space(5))) float *);
__device__ float __ocml_sin_f32(float);
__device__ float __ocml_native_sin_f32(float);
__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
__device__ float __ocml_sinpi_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
__device__ float __ocml_tan_f32(float);
__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
__device__ float __ocml_tgamma_f32(float);
__device__ __attribute__((const)) float __ocml_trunc_f32(float);
__device__ float __ocml_y0_f32(float);
__device__ float __ocml_y1_f32(float);
// BEGIN INTRINSICS
__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
// END INTRINSICS
// END FLOAT
// BEGIN DOUBLE
__device__ __attribute__((const)) double __ocml_acos_f64(double);
__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
__device__ __attribute__((const)) double __ocml_asin_f64(double);
__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
__device__ __attribute__((const)) double __ocml_atan_f64(double);
__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
__device__ __attribute__((const)) double __ocml_ceil_f64(double);
__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
__device__ double __ocml_cos_f64(double);
__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
__device__ double __ocml_cospi_f64(double);
__device__ double __ocml_i0_f64(double);
__device__ double __ocml_i1_f64(double);
__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
__device__ __attribute__((pure)) double __ocml_erf_f64(double);
__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
__device__ __attribute__((pure)) double __ocml_exp_f64(double);
__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
__device__ __attribute__((const)) double __ocml_fabs_f64(double);
__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
__device__ __attribute__((const)) double __ocml_floor_f64(double);
__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
__device__ double __ocml_frexp_f64(double,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
__device__ __attribute__((const)) int __ocml_isinf_f64(double);
__device__ __attribute__((const)) int __ocml_isnan_f64(double);
__device__ double __ocml_j0_f64(double);
__device__ double __ocml_j1_f64(double);
__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
__device__ double __ocml_lgamma_f64(double);
__device__ __attribute__((pure)) double __ocml_log10_f64(double);
__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
__device__ __attribute__((pure)) double __ocml_log2_f64(double);
__device__ __attribute__((const)) double __ocml_logb_f64(double);
__device__ __attribute__((pure)) double __ocml_log_f64(double);
__device__ double __ocml_modf_f64(double,
__attribute__((address_space(5))) double *);
__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
double);
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
__device__ double __ocml_remquo_f64(double, double,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
__device__ __attribute__((const)) double __ocml_rint_f64(double);
__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
double, double);
__device__ __attribute__((const)) double __ocml_round_f64(double);
__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
__device__ __attribute__((const)) int __ocml_signbit_f64(double);
__device__ double __ocml_sincos_f64(double,
__attribute__((address_space(5))) double *);
__device__ double
__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
__device__ double __ocml_sin_f64(double);
__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
__device__ double __ocml_sinpi_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
__device__ double __ocml_tan_f64(double);
__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
__device__ double __ocml_tgamma_f64(double);
__device__ __attribute__((const)) double __ocml_trunc_f64(double);
__device__ double __ocml_y0_f64(double);
__device__ double __ocml_y1_f64(double);
// BEGIN INTRINSICS
__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
double);
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__device__ _Float16 __ocml_sin_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));
// We need to match C99's bool and get an i1 in the IR.
#ifdef __cplusplus
typedef bool __ockl_bool;
#else
typedef _Bool __ockl_bool;
#endif
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
float c, __ockl_bool s);
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__device__ __attribute__((const))
__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 560
#define __DEPRECATED_SINCE_HIP_560(X) __attribute__((deprecated(X)))
#else
#define __DEPRECATED_SINCE_HIP_560(X)
#endif
// Deprecated; should be removed when ROCm releases using it are no longer
// relevant.
__DEPRECATED_SINCE_HIP_560("use ((_Float16)1.0) / ")
__device__ inline _Float16 __llvm_amdgcn_rcp_f16(_Float16 x) {
return ((_Float16)1.0f) / x;
}
__DEPRECATED_SINCE_HIP_560("use ((__2f16)1.0) / ")
__device__ inline __2f16
__llvm_amdgcn_rcp_2f16(__2f16 __x)
{
return ((__2f16)1.0f) / __x;
}
#undef __DEPRECATED_SINCE_HIP_560
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__device__ __2f16 __ocml_sin_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
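
These declarations are the raw device-library surface; the user-facing math wrappers are thin forwarders on top of them. A hedged sketch of that pattern (the wrapper name here is hypothetical; the real wrappers live in __clang_hip_math.h):

// Hypothetical forwarding wrapper, shown only to illustrate the pattern.
__device__ static inline float hip_hypotf_sketch(float __x, float __y) {
  return __ocml_hypot_f32(__x, __y); // const-qualified device library call
}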

File diff suppressed because it is too large

View File

@@ -1,151 +0,0 @@
/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/*
* WARNING: This header is intended to be directly -include'd by
* the compiler and is not supposed to be included by users.
*
*/
#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
#define __CLANG_HIP_RUNTIME_WRAPPER_H__
#if __HIP__
#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL
#endif
#ifdef __cplusplus
extern "C" {
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_pure_virtual(void) {
__builtin_trap();
}
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_deleted_virtual(void) {
__builtin_trap();
}
}
#endif //__cplusplus
#if !defined(__HIPCC_RTC__)
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#else
typedef __SIZE_TYPE__ size_t;
// Define macros which are needed to declare HIP device APIs without the
// standard C/C++ headers. This is for readability, so that these APIs can be
// written the same way as in the non-hipRTC use case. These macros need to be
// popped so that they do not pollute users' namespace.
#pragma push_macro("NULL")
#pragma push_macro("uint32_t")
#pragma push_macro("uint64_t")
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#define NULL (void *)0
#define uint32_t __UINT32_TYPE__
#define uint64_t __UINT64_TYPE__
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INT_MAX__
#endif // __HIPCC_RTC__
typedef __SIZE_TYPE__ __hip_size_t;
#ifdef __cplusplus
extern "C" {
#endif //__cplusplus
#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
#if __has_feature(address_sanitizer)
extern "C" __device__ unsigned long long __asan_malloc_impl(unsigned long long __size, unsigned long long __pc);
extern "C" __device__ void __asan_free_impl(unsigned long long __addr, unsigned long long __pc);
__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
return (void *)__asan_malloc_impl(__size, __pc);
}
__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
__asan_free_impl((unsigned long long)__ptr, __pc);
}
#else
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return (void *) __ockl_dm_alloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__ockl_dm_dealloc((unsigned long long)__ptr);
}
#endif // __has_feature(address_sanitizer)
#else // HIP version check
#if __HIP_ENABLE_DEVICE_MALLOC__
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return __hip_malloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__hip_free(__ptr);
}
#else
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
__builtin_trap();
return (void *)0;
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__builtin_trap();
}
#endif
#endif // HIP version check
#ifdef __cplusplus
} // extern "C"
#endif //__cplusplus
#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
#include <__clang_hip_stdlib.h>
#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
#else
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_hip_cmath.h>
#include <__clang_cuda_complex_builtins.h>
#include <algorithm>
#include <complex>
#include <new>
#endif // __HIPCC_RTC__
#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
#if defined(__HIPCC_RTC__)
#pragma pop_macro("NULL")
#pragma pop_macro("uint32_t")
#pragma pop_macro("uint64_t")
#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
#endif // __HIPCC_RTC__
#endif // __HIP__
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
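
Once this wrapper is in effect, kernels can call malloc and free directly and get the device-heap implementations selected above. A minimal sketch, assuming HIP >= 4.5 so the __ockl_dm_* path is taken (the kernel name is illustrative):

__global__ void scratch_demo(int n) {
  int *buf = (int *)malloc(n * sizeof(int)); // device-heap allocation
  if (buf != nullptr) {
    buf[0] = n;
    free(buf); // return the block to the device heap
  }
}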

View File

@@ -1,43 +0,0 @@
/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_STDLIB_H__
#define __CLANG_HIP_STDLIB_H__
#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__cplusplus)
#include <limits.h>
#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#endif
__DEVICE__
int abs(int __x) {
int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long labs(long __x) {
long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long long llabs(long long __x) {
long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
#endif // !defined(__cplusplus)
#endif // __CLANG_HIP_STDLIB_H__
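
The abs family above relies on a branchless sign-mask idiom: the arithmetic right shift produces 0 for non-negative inputs and -1 (all bits set) for negative ones, so (__x ^ __sgn) - __sgn flips the bits and adds one exactly when the input is negative. A host-side sketch of the same idiom, assuming the usual arithmetic-shift behavior for signed integers:

#include <assert.h>
#include <limits.h>

static int abs_branchless(int x) {
  int sgn = x >> (sizeof(int) * CHAR_BIT - 1); // 0 or -1
  return (x ^ sgn) - sgn; // two's-complement negation when negative
}

int main(void) {
  assert(abs_branchless(-5) == 5); // sgn = -1: (-5 ^ -1) - (-1) = 4 + 1
  assert(abs_branchless(7) == 7);  // sgn = 0: value passes through
  return 0;
}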

View File

@@ -1,27 +0,0 @@
/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_MAX_ALIGN_T_DEFINED
#define __CLANG_MAX_ALIGN_T_DEFINED
#if defined(_MSC_VER)
typedef double max_align_t;
#elif defined(__APPLE__)
typedef long double max_align_t;
#else
// Define 'max_align_t' to match the GCC definition.
typedef struct {
long long __clang_max_align_nonce1
__attribute__((__aligned__(__alignof__(long long))));
long double __clang_max_align_nonce2
__attribute__((__aligned__(__alignof__(long double))));
} max_align_t;
#endif
#endif
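
The point of max_align_t is that malloc'd storage is guaranteed to be aligned at least as strictly as this type requires. A quick hosted check (illustrative):

#include <stddef.h>
#include <stdio.h>

int main(void) {
  printf("alignof(max_align_t) = %zu\n", _Alignof(max_align_t));
  return 0;
}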

View File

@@ -1,140 +0,0 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_AES_H
#define __WMMINTRIN_AES_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
/// Performs a single round of AES encryption, transforming the state value
/// from the first source operand using a 128-bit round key value contained
/// in the second source operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}
/// Performs the final round of AES encryption, transforming the state value
/// from the first source operand using a 128-bit round key value contained
/// in the second source operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}
/// Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}
/// Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}
/// Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}
/// Generates a round key for AES encryption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
#undef __DEFAULT_FN_ATTRS
#endif /* __WMMINTRIN_AES_H */
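
Composed, these per-round intrinsics implement the usual AES-128 flow: an initial AddRoundKey, nine _mm_aesenc_si128 rounds, and a final _mm_aesenclast_si128. A hedged sketch that assumes the eleven round keys were expanded beforehand (compile with -maes):

#include <wmmintrin.h>

// Encrypt one 128-bit block with pre-expanded AES-128 round keys rk[0..10].
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11]) {
  block = _mm_xor_si128(block, rk[0]);        // initial AddRoundKey
  for (int i = 1; i < 10; ++i)
    block = _mm_aesenc_si128(block, rk[i]);   // rounds 1 through 9
  return _mm_aesenclast_si128(block, rk[10]); // final round, no MixColumns
}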

View File

@@ -1,48 +0,0 @@
/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_PCLMUL_H
#define __WMMINTRIN_PCLMUL_H
/// Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param Y
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param I
/// An immediate value specifying which 64-bit values to select from the
/// operands. Bit 0 is used to select a value from operand \a X, and bit
/// 4 is used to select a value from operand \a Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(X, Y, I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (char)(I)))
#endif /* __WMMINTRIN_PCLMUL_H */
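
Since the immediate selects one 64-bit half of each operand, a full 128 x 128 carry-less product takes four invocations with immediates 0x00, 0x01, 0x10, and 0x11. A minimal sketch of the low-half product (compile with -mpclmul):

#include <wmmintrin.h>

// Carry-less multiply of bits[63:0] of x by bits[63:0] of y.
static __m128i clmul_lo_lo(__m128i x, __m128i y) {
  return _mm_clmulepi64_si128(x, y, 0x00); // imm bits 0 and 4 both zero
}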

View File

@@ -1,227 +0,0 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ADXINTRIN_H
#define __ADXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif
#if defined(__cplusplus)
extern "C" {
#endif
/* Intrinsics that are available only if __ADX__ is defined. */
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 32-bit unsigned addend.
/// \param __y
/// A 32-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char
__attribute__((__always_inline__, __nodebug__, __target__("adx")))
_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 64-bit unsigned addend.
/// \param __y
/// A 64-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char
__attribute__((__always_inline__, __nodebug__, __target__("adx")))
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
/* Intrinsics that are also available if __ADX__ is undefined. */
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 32-bit unsigned addend.
/// \param __y
/// A 32-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 64-bit unsigned addend.
/// \param __y
/// A 64-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarry_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// The 32-bit unsigned minuend.
/// \param __y
/// The 32-bit unsigned subtrahend.
/// \param __p
/// Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// The 64-bit unsigned minuend.
/// \param __y
/// The 64-bit unsigned subtrahend.
/// \param __p
/// Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_subborrow_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
}
#endif
#if defined(__cplusplus)
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __ADXINTRIN_H */
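
Chaining each carry-out into the next carry-in is the intended use of these intrinsics: it yields arbitrary-width addition. A sketch of a 128-bit add built from 32-bit limbs (least significant limb first):

#include <immintrin.h>

// out = a + b over four 32-bit limbs; returns the final carry-out.
static unsigned char add128(const unsigned int a[4], const unsigned int b[4],
                            unsigned int out[4]) {
  unsigned char c = 0;
  c = _addcarry_u32(c, a[0], b[0], &out[0]);
  c = _addcarry_u32(c, a[1], b[1], &out[1]);
  c = _addcarry_u32(c, a[2], b[2], &out[2]);
  return _addcarry_u32(c, a[3], b[3], &out[3]);
}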

File diff suppressed because it is too large

View File

@@ -1,183 +0,0 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a x are extracted. If the length is zero but the
/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
(char)(len), (char)(idx)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
/// Specifies the index of the least significant bit at [13:8] and the
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length \a len and by the index \a idx specifying the
/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a x with the specified bitfields replaced by the
/// lower bits of source operand \a y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a __y with length
/// specified by bits [69:64]. These are inserted into the destination at the
/// index specified by bits [77:72]; all other bits are ignored. If bits
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
/// index and length is greater than 64, the result is undefined. If the
/// length and index are both zero, bits [63:0] of parameter \a __y are
/// inserted into parameter \a __x. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a __x with the specified bitfields replaced by the
/// lower bits of source operand \a __y. The upper 64 bits of the return
/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
/// \param __a
/// The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(double *__p, __m128d __a)
{
__builtin_ia32_movntsd(__p, (__v2df)__a);
}
/// Stores a 32-bit single-precision floating-point value in a 32-bit
/// memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
/// \param __a
/// The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(float *__p, __m128 __a)
{
__builtin_ia32_movntss(__p, (__v4sf)__a);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __AMMINTRIN_H */
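
A short sketch of the immediate form above: pulling an 8-bit field that starts at bit 4 out of the low quadword (compile with -msse4a):

#include <ammintrin.h>

// Extract bits [11:4] of the low 64 bits of x (len = 8, idx = 4).
static __m128i extract_field(__m128i x) {
  return _mm_extracti_si64(x, 8, 4);
}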

View File

@@ -1,169 +0,0 @@
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __AMX_COMPLEXINTRIN_H
#define __AMX_COMPLEXINTRIN_H
#ifdef __x86_64__
#define __DEFAULT_FN_ATTRS_COMPLEX \
__attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The imaginary part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the real part
/// of the \a a element is multiplied with the imaginary part of the
/// corresponding \a b elements. The two accumulated results are added, and
/// then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b elements. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}
#endif // __x86_64__
#endif // __AMX_COMPLEXINTRIN_H
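
A full complex matrix product therefore needs both variants, accumulating into two separate tiles. A hedged sketch with the C++-style tile API above, assuming the tiles were already configured and loaded:

// dst_re and dst_im accumulate the real and imaginary parts, respectively.
static void complex_tile_product(__tile1024i *dst_re, __tile1024i *dst_im,
                                 __tile1024i a, __tile1024i b) {
  __tile_cmmrlfp16ps(dst_re, a, b); // real parts
  __tile_cmmimfp16ps(dst_im, a, b); // imaginary parts
}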

View File

@@ -1,58 +0,0 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
/// FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
/// FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
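/* Hedged usage sketch, not part of the original header: with tiles 0..2
 * already configured via _tile_loadconfig(), accumulate an FP16 dot-product
 * into tile 0. The tile numbers, shapes, and the __example_* name are
 * illustrative assumptions. */
static __inline__ void __attribute__((__always_inline__, __nodebug__,
                                      __target__("amx-fp16")))
__example_dpfp16ps(void *dst, const void *a, const void *b,
                   __SIZE_TYPE__ stride) {
  _tile_loadd(1, a, stride); /* load the FP16 source tiles */
  _tile_loadd(2, b, stride);
  _tile_dpfp16ps(0, 1, 2);   /* tile0.fp32 += a.fp16 pairs * b.fp16 pairs */
  _tile_stored(0, dst, stride);
}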
#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */

View File

@@ -1,524 +0,0 @@
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "__config". The tile configuration includes the tile type palette, the
/// number of bytes per row, and the number of rows. If the specified
/// palette_id is zero, that signifies the init state for both the tile
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configuration will result in a #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
__builtin_ia32_tile_loadconfig(__config);
}
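/* Hedged sketch, not part of the original header: the 64-byte configuration
 * follows the documented layout (byte 0: palette, byte 1: start_row, bytes
 * 16..47: bytes-per-row for tiles 0..15, bytes 48..63: rows for tiles
 * 0..15). The struct and the single 16x64 tile below are illustrative
 * assumptions; LDTILECFG requires the operand to be 64-byte aligned. */
typedef struct {
  unsigned char palette_id; /* 1 selects the standard palette */
  unsigned char start_row;
  unsigned char reserved[14];
  unsigned short colsb[16]; /* bytes per row, tiles 0..15 */
  unsigned char rows[16];   /* rows, tiles 0..15 */
} __attribute__((__aligned__(64))) __example_tilecfg;

static __inline__ void __DEFAULT_FN_ATTRS_TILE __example_config_tile0(void) {
  __example_tilecfg cfg = {0};
  cfg.palette_id = 1;
  cfg.colsb[0] = 64; /* tile 0: 16 rows x 64 bytes */
  cfg.rows[0] = 16;
  _tile_loadconfig(&cfg);
}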
/// Stores the current tile configuration to a 64-byte memory location
/// specified by "__config". The tile configuration includes the tile type
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
///    A pointer to the 512-bit configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
__builtin_ia32_tile_storeconfig(__config);
}
/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
__builtin_ia32_tilerelease();
}
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) \
__builtin_ia32_tdpbssd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
__builtin_ia32_tdpbsud((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
__builtin_ia32_tdpbusd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
__builtin_ia32_tdpbuud((dst), (src0), (src1))
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
/// The AMX tile register size is configurable; the maximum size is
/// 16x64 = 1024 bytes. Since there is no 2D type in LLVM IR, we use a vector
/// type to represent the 2D tile, fixed at the maximum AMX tile register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloadd64_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddt164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
return __builtin_ia32_tilestored64_internal(m, n, base,
(__SIZE_TYPE__)(stride), tile);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}
/// This struct packs the shape and tile data together for the user. We
/// suggest initializing the struct as early as possible, because the
/// compiler depends on the shape information to do the configuration.
/// Constant shape values are preferred for compiler optimization.
typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;
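/* Hedged usage sketch, not part of the original header; the constant 16x64
 * shape is an illustrative assumption:
 *
 *   __tile1024i t = {16, 64}; // row = 16, col = 64 bytes, tile data zeroed
 */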
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
__tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
///    The destination tile to be zeroed. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
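/* Hedged end-to-end sketch, not part of the original header: C += A * B for
 * signed 8-bit inputs with 32-bit accumulation, using the __tile wrapper API
 * above. Shapes and the __example_* names are illustrative assumptions. */
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __example_dot_i8(void *c, const void *a, const void *b,
                                        __SIZE_TYPE__ stride) {
  __tile1024i ta = {16, 64}; /* 16 rows x 64 int8 elements */
  __tile1024i tb = {16, 64};
  __tile1024i tc = {16, 64}; /* 16 rows x 16 int32 accumulators */
  __tile_zero(&tc);
  __tile_loadd(&ta, a, stride);
  __tile_loadd(&tb, b, stride);
  __tile_dpbssd(&tc, ta, tb); /* signed x signed dot-product accumulate */
  __tile_stored(c, stride, tc);
}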
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP16
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */

View File

@@ -1,35 +0,0 @@
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the Windows platform. */
#ifndef _MSC_VER
#include_next <arm64intr.h>
#else
#ifndef __ARM64INTR_H
#define __ARM64INTR_H
typedef enum
{
_ARM64_BARRIER_SY = 0xF,
_ARM64_BARRIER_ST = 0xE,
_ARM64_BARRIER_LD = 0xD,
_ARM64_BARRIER_ISH = 0xB,
_ARM64_BARRIER_ISHST = 0xA,
_ARM64_BARRIER_ISHLD = 0x9,
_ARM64_BARRIER_NSH = 0x7,
_ARM64_BARRIER_NSHST = 0x6,
_ARM64_BARRIER_NSHLD = 0x5,
_ARM64_BARRIER_OSH = 0x3,
_ARM64_BARRIER_OSHST = 0x2,
_ARM64_BARRIER_OSHLD = 0x1
} _ARM64INTR_BARRIER_TYPE;
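/* Hedged usage sketch, not part of the original header: these values
 * parameterize the MSVC ARM64 barrier intrinsics declared in <intrin.h>,
 * e.g. a full-system data memory barrier:
 *
 *   __dmb(_ARM64_BARRIER_SY);
 */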
#endif /* __ARM64INTR_H */
#endif /* _MSC_VER */

View File

@@ -1,782 +0,0 @@
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H
#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif
#include <stdint.h>
#if defined(__cplusplus)
extern "C" {
#endif
/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 8.3 Memory barriers */
#if !__has_builtin(__dmb)
#define __dmb(i) __builtin_arm_dmb(i)
#endif
#if !__has_builtin(__dsb)
#define __dsb(i) __builtin_arm_dsb(i)
#endif
#if !__has_builtin(__isb)
#define __isb(i) __builtin_arm_isb(i)
#endif
/* 8.4 Hints */
#if !__has_builtin(__wfi)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
__builtin_arm_wfi();
}
#endif
#if !__has_builtin(__wfe)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
__builtin_arm_wfe();
}
#endif
#if !__has_builtin(__sev)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
__builtin_arm_sev();
}
#endif
#if !__has_builtin(__sevl)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
__builtin_arm_sevl();
}
#endif
#if !__has_builtin(__yield)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
__builtin_arm_yield();
}
#endif
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif
/* 8.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
uint32_t v;
do
v = __builtin_arm_ldrex(__p);
while (__builtin_arm_strex(__x, __p));
return v;
}
/* 8.6 Memory prefetch intrinsics */
/* 8.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif
/* 8.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif
/* 8.7 NOP */
#if !defined(_MSC_VER) || !defined(__aarch64__)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
__builtin_arm_nop();
}
#endif
/* 9 DATA-PROCESSING INTRINSICS */
/* 9.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
__y %= 32;
if (__y == 0)
return __x;
return (__x >> __y) | (__x << (32 - __y));
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
__y %= 64;
if (__y == 0)
return __x;
return (__x >> __y) | (__x << (64 - __y));
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
return __ror(__x, __y);
#else
return __rorll(__x, __y);
#endif
}
/* CLZ */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
return __builtin_arm_clz(__t);
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_arm_clz(__t);
#else
return __builtin_arm_clz64(__t);
#endif
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
return __builtin_arm_clz64(__t);
}
/* CLS */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
return __builtin_arm_cls(__t);
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_arm_cls(__t);
#else
return __builtin_arm_cls64(__t);
#endif
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
return __builtin_arm_cls64(__t);
}
/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
return __builtin_bswap32(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_bswap32(__t);
#else
return __builtin_bswap64(__t);
#endif
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
return __builtin_bswap64(__t);
}
/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
return __ror(__rev(__t), 16);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __rev16(__t);
#else
return __rev16ll(__t);
#endif
}
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
return (int16_t)__builtin_bswap16((uint16_t)__t);
}
/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
return __builtin_arm_rbit(__t);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
__builtin_arm_rbit(__t >> 32);
#else
return __builtin_arm_rbit64(__t);
#endif
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __rbit(__t);
#else
return __rbitll(__t);
#endif
}
/*
* 9.3 16-bit multiplications
*/
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
return __builtin_arm_smulwt(__a, __b);
}
#endif
/*
* 9.4 Saturating intrinsics
*
* FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
* intrinsics are implemented and the flag is enabled.
*/
/* 9.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
/* 9.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
return __builtin_arm_qsub(__t, __v);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
return __builtin_arm_qadd(__t, __t);
}
#endif
/* 9.4.3 Accumulating multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
/* 9.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
/* 9.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
return __builtin_arm_uxtb16(__a);
}
#endif
/* 9.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_sel(__a, __b);
}
#endif
/* 9.5.7 Parallel 8-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usub8(__a, __b);
}
#endif
/* 9.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
return __builtin_arm_usada8(__a, __b, __c);
}
#endif
/* 9.5.9 Parallel 16-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_usub16(__a, __b);
}
#endif
/* 9.5.10 Parallel 16-bit multiplications */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smusdx(__a, __b);
}
#endif
/* 9.7 CRC32 intrinsics */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
(defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32b(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32h(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32w(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32d(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32cb(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32ch(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32cw(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32cd(__a, __b);
}
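/* Hedged sketch, not part of the original header: byte-wise CRC-32 over a
 * buffer, folding each byte into the running value with __crc32b above. The
 * __example_* name and parameters are illustrative assumptions. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__example_crc32_buf(uint32_t __crc, const uint8_t *__p, uint32_t __n) {
  while (__n--)
    __crc = __crc32b(__crc, *__p++);
  return __crc;
}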
#endif
/* Armv8.3-A JavaScript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
return __builtin_arm_jcvt(__a);
}
#endif
/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
return __builtin_arm_rint32zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
return __builtin_arm_rint32z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
return __builtin_arm_rint64zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
return __builtin_arm_rint64z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
return __builtin_arm_rint32xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
return __builtin_arm_rint32x(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
return __builtin_arm_rint64xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
return __builtin_arm_rint64x(__a);
}
#endif
/* Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
uint64_t val[8];
} data512_t;
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
data512_t __value;
__builtin_arm_ld64b(__addr, __value.val);
return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
__builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
return __builtin_arm_st64bv0(__addr, __value.val);
}
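/* Hedged sketch, not part of the original header: copy one 64-byte block as
 * a single-copy-atomic LD64B/ST64B pair. The __example_* name is an
 * illustrative assumption; both addresses must be 64-byte aligned and of a
 * memory type the implementation supports for these instructions. */
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__example_copy64(void *__dst, const void *__src) {
  data512_t __v = __arm_ld64b(__src);
  __arm_st64b(__dst, __v);
}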
#endif
/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
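/* Hedged usage sketch, not part of the original header: read the AArch64
 * virtual counter; the register name string is an illustrative choice
 * following the ACLE system-register naming:
 *
 *   uint64_t ticks = __arm_rsr64("CNTVCT_EL0");
 */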
/* Memory Tagging Extensions (MTE) Intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
/* Memory Operations Intrinsics */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
__builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
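/* Hedged sketch, not part of the original header: randomly tag a 16-byte
 * granule and return the tagged pointer. The __example_* name and the
 * target("mte") attribute are illustrative assumptions; MTE must be enabled
 * for the address range. */
static __inline__ void *__attribute__((__always_inline__, __nodebug__, target("mte")))
__example_mte_tag(void *__p) {
  void *__tagged = __arm_mte_create_random_tag(__p, 0); /* no excluded tags */
  __arm_mte_set_tag(__tagged); /* store the tag for this 16-byte granule */
  return __tagged;
}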
#endif
/* Transactional Memory Extension (TME) Intrinsics */
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME
#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u
#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()
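/* Hedged sketch, not part of the original header: start a transaction and
 * report whether a retry is worthwhile. __tstart() returns 0 once the
 * transaction is executing, otherwise a failure status; the __example_*
 * name and return convention are illustrative assumptions. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("tme")))
__example_txn_once(void) {
  uint64_t __status = __tstart();
  if (__status == 0) {
    /* ... transactional region ... */
    __tcommit();
    return 1; /* committed */
  }
  return (__status & _TMFAILURE_RTRY) ? 0 : -1; /* 0: a retry may succeed */
}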
#endif /* __ARM_FEATURE_TME */
/* Armv8.5-A Random number generation intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
return __builtin_arm_rndrrs(__p);
}
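/* Hedged sketch, not part of the original header: both intrinsics return 0
 * on success and nonzero when no random value was available. The
 * __example_* name is an illustrative assumption. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("rand")))
__example_rndr64(void) {
  uint64_t __v;
  while (__rndr(&__v) != 0) /* retry until the RNG returns a value */
    ;
  return __v;
}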
#endif
#if defined(__cplusplus)
}
#endif
#endif /* __ARM_ACLE_H */

View File

@@ -1,20 +0,0 @@
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_BF16_H
#define __ARM_BF16_H
typedef __bf16 bfloat16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#undef __ai
#endif

View File

@@ -1,410 +0,0 @@
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_CDE_H
#define __ARM_CDE_H
#if !__ARM_FEATURE_CDE
#error "CDE support not enabled"
#endif
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
uint32_t __arm_cx1(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
uint64_t __arm_cx1d(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
uint32_t __arm_cx2(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
uint32_t __arm_vcx1_u32(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
uint64_t __arm_vcx1d_u64(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
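/* Hedged usage sketch, not part of the original header: issue a CX1 custom
 * instruction on coprocessor 0 with an immediate operand. Both constants
 * are illustrative assumptions, and the coprocessor must be configured for
 * CDE by the platform:
 *
 *   uint32_t r = __arm_cx1(0, 1234);
 */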
#if __ARM_FEATURE_MVE
typedef uint16_t mve_pred16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
#endif /* __ARM_FEATURE_MVE */
#if __ARM_FEATURE_MVE & 2
typedef __fp16 float16_t;
typedef float float32_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
#endif /* __ARM_FEATURE_MVE & 2 */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* __ARM_CDE_H */
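
The polymorphic __arm_vcx* entry points above all funnel through the same trick: the object-style macros reinterpret every vector operand to uint8x16_t and forward to the matching _impl overload, so one spelling serves every element type. A minimal usage sketch, assuming a Cortex-M target with CDE wired to coprocessor 0 and built with something like -march=armv8.1-m.main+mve+cdecp0; what a VCX2 instruction actually computes is device-defined, so the immediates below are purely illustrative:

#include <arm_cde.h>

uint8x16_t cde_demo(uint8x16_t acc, int16x8_t n)
{
    /* Plain form: the overload aliased to __builtin_arm_cde_vcx2q_u8_s16
       maps an int16x8_t operand and a 6-bit immediate to uint8x16_t. */
    acc = __arm_vcx2q_u8(0, n, 0x7u);
    /* Accumulating form: the macro expands to
       __arm_vcx2qa_impl(0, acc, __arm_vreinterpretq_u8(n), 0x3fu),
       which is why any MVE vector type is accepted for n. */
    return __arm_vcx2qa(0, acc, n, 0x3fu);
}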

View File

@ -1,217 +0,0 @@
//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __ARM_CMSE_H
#define __ARM_CMSE_H
#if (__ARM_FEATURE_CMSE & 0x1)
#include <stddef.h>
#include <stdint.h>
#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
#define CMSE_AU_NONSECURE 2 /* checks if permissions have secure field unset */
#define CMSE_MPU_UNPRIV 4 /* sets T flag on TT instruction */
#define CMSE_MPU_READ 8 /* checks if read_ok field is set */
#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)
#define cmse_check_pointed_object(p, f) \
cmse_check_address_range((p), sizeof(*(p)), (f))
#if defined(__cplusplus)
extern "C" {
#endif
typedef union {
struct cmse_address_info {
#ifdef __ARM_BIG_ENDIAN
/* __ARM_BIG_ENDIAN */
#if (__ARM_CMSE_SECURE_MODE)
unsigned idau_region : 8;
unsigned idau_region_valid : 1;
unsigned secure : 1;
unsigned nonsecure_readwrite_ok : 1;
unsigned nonsecure_read_ok : 1;
#else
unsigned : 12;
#endif
unsigned readwrite_ok : 1;
unsigned read_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region_valid : 1;
#else
unsigned : 1;
#endif
unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region : 8;
#else
unsigned : 8;
#endif
unsigned mpu_region : 8;
#else /* __ARM_LITTLE_ENDIAN */
unsigned mpu_region : 8;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region : 8;
#else
unsigned : 8;
#endif
unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region_valid : 1;
#else
unsigned : 1;
#endif
unsigned read_ok : 1;
unsigned readwrite_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned nonsecure_read_ok : 1;
unsigned nonsecure_readwrite_ok : 1;
unsigned secure : 1;
unsigned idau_region_valid : 1;
unsigned idau_region : 8;
#else
unsigned : 12;
#endif
#endif /*__ARM_LITTLE_ENDIAN */
} flags;
unsigned value;
} cmse_address_info_t;
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TT(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TT(__p);
return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTT(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TTT(__p);
return __u;
}
#if __ARM_CMSE_SECURE_MODE
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTA(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TTA(__p);
return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTAT(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TTAT(__p);
return __u;
}
#endif
#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))
#if __ARM_CMSE_SECURE_MODE
#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
#endif
static void *__attribute__((__always_inline__))
cmse_check_address_range(void *__pb, size_t __s, int __flags) {
uintptr_t __begin = (uintptr_t)__pb;
uintptr_t __end = __begin + __s - 1;
if (__end < __begin)
return NULL; /* wrap around check */
/* Check whether the range crosses a 32-byte aligned address */
const int __single_check = (__begin ^ __end) < 0x20u;
/* execute the right variant of the TT instructions */
void *__pe = (void *)__end;
cmse_address_info_t __permb, __perme;
switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
case 0:
__permb = cmse_TT(__pb);
__perme = __single_check ? __permb : cmse_TT(__pe);
break;
case CMSE_MPU_UNPRIV:
__permb = cmse_TTT(__pb);
__perme = __single_check ? __permb : cmse_TTT(__pe);
break;
#if __ARM_CMSE_SECURE_MODE
case CMSE_MPU_NONSECURE:
__permb = cmse_TTA(__pb);
__perme = __single_check ? __permb : cmse_TTA(__pe);
break;
case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
__permb = cmse_TTAT(__pb);
__perme = __single_check ? __permb : cmse_TTAT(__pe);
break;
#endif
/* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
default:
return NULL;
}
/* check that the range does not cross MPU, SAU, or IDAU region boundaries */
if (__permb.value != __perme.value)
return NULL;
#if !(__ARM_CMSE_SECURE_MODE)
/* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
if (__flags & CMSE_AU_NONSECURE)
return NULL;
#endif
/* check the permission on the range */
switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
#if (__ARM_CMSE_SECURE_MODE)
case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;
case CMSE_MPU_READ | CMSE_AU_NONSECURE:
return __permb.flags.nonsecure_read_ok ? __pb : NULL;
case CMSE_AU_NONSECURE:
return __permb.flags.secure ? NULL : __pb;
#endif
case CMSE_MPU_READ | CMSE_MPU_READWRITE:
case CMSE_MPU_READWRITE:
return __permb.flags.readwrite_ok ? __pb : NULL;
case CMSE_MPU_READ:
return __permb.flags.read_ok ? __pb : NULL;
default:
return NULL;
}
}
#if __ARM_CMSE_SECURE_MODE
/* Returns non-zero when the current entry function was called from
   non-secure state: such calls leave bit 0 of the return address clear. */
static int __attribute__((__always_inline__, __nodebug__))
cmse_nonsecure_caller(void) {
return !((uintptr_t)__builtin_return_address(0) & 1);
}
#define cmse_nsfptr_create(p) \
__builtin_bit_cast(__typeof__(p), \
(__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))
#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)
#endif /* __ARM_CMSE_SECURE_MODE */
void __attribute__((__noreturn__)) cmse_abort(void);
#if defined(__cplusplus)
}
#endif
#endif /* (__ARM_FEATURE_CMSE & 0x1) */
#endif /* __ARM_CMSE_H */
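
Together, the TT wrappers and cmse_check_address_range above give the canonical validate-before-use pattern for pointers crossing the security boundary. A hedged sketch of a secure-side entry point, assuming a TrustZone-M image built with -mcmse; secure_copy_in and secure_scratch are hypothetical names:

#include <arm_cmse.h>
#include <stddef.h>

static unsigned char secure_scratch[256];

/* Callable from the non-secure world; rejects any buffer the non-secure
   caller could not itself read and write. */
int __attribute__((cmse_nonsecure_entry))
secure_copy_in(unsigned char *buf, size_t len)
{
    if (len > sizeof secure_scratch)
        return -1;
    /* With CMSE_NONSECURE set, the range check runs the TTA variant and
       returns NULL if any byte is secure or lacks the permissions. */
    if (cmse_check_address_range(buf, len,
                                 CMSE_NONSECURE | CMSE_MPU_READWRITE) == NULL)
        return -1;
    for (size_t i = 0; i < len; ++i)
        secure_scratch[i] = buf[i];
    return 0;
}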

View File

@ -1,596 +0,0 @@
/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_FP16_H
#define __ARM_FP16_H
#include <stdint.h>
typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#if defined(__aarch64__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
__ret; \
})
#define vabsh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
__ret; \
})
#define vaddh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
__ret; \
})
#define vcageh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
__ret; \
})
#define vcagth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
__ret; \
})
#define vcaleh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
__ret; \
})
#define vcalth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
__ret; \
})
#define vceqh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
__ret; \
})
#define vceqzh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
__ret; \
})
#define vcgeh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
__ret; \
})
#define vcgezh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
__ret; \
})
#define vcgth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
__ret; \
})
#define vcgtzh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
__ret; \
})
#define vcleh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
__ret; \
})
#define vclezh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
__ret; \
})
#define vclth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
__ret; \
})
#define vcltzh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
__ret; \
})
#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
__ret; \
})
#define vcvth_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
__ret; \
})
#define vcvth_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
__ret; \
})
#define vcvth_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
__ret; \
})
#define vcvth_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
__ret; \
})
#define vcvth_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
__ret; \
})
#define vcvth_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
__ret; \
})
#define vcvtah_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
__ret; \
})
#define vcvtah_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
__ret; \
})
#define vcvtah_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
__ret; \
})
#define vcvtah_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
__ret; \
})
#define vcvtah_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
__ret; \
})
#define vcvtah_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
__ret; \
})
#define vcvth_f16_u16(__p0) __extension__ ({ \
float16_t __ret; \
uint16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_u16(__s0); \
__ret; \
})
#define vcvth_f16_s16(__p0) __extension__ ({ \
float16_t __ret; \
int16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_s16(__s0); \
__ret; \
})
#define vcvth_f16_u32(__p0) __extension__ ({ \
float16_t __ret; \
uint32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_u32(__s0); \
__ret; \
})
#define vcvth_f16_s32(__p0) __extension__ ({ \
float16_t __ret; \
int32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_s32(__s0); \
__ret; \
})
#define vcvth_f16_u64(__p0) __extension__ ({ \
float16_t __ret; \
uint64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_u64(__s0); \
__ret; \
})
#define vcvth_f16_s64(__p0) __extension__ ({ \
float16_t __ret; \
int64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_s64(__s0); \
__ret; \
})
#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
float16_t __ret; \
uint32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
float16_t __ret; \
int32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
float16_t __ret; \
uint64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
float16_t __ret; \
int64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
uint16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
int16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
__ret; \
})
#define vcvtmh_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
__ret; \
})
#define vcvtmh_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
__ret; \
})
#define vcvtmh_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
__ret; \
})
#define vcvtmh_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
__ret; \
})
#define vcvtmh_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
__ret; \
})
#define vcvtmh_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
__ret; \
})
#define vcvtnh_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
__ret; \
})
#define vcvtnh_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
__ret; \
})
#define vcvtnh_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
__ret; \
})
#define vcvtnh_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
__ret; \
})
#define vcvtnh_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
__ret; \
})
#define vcvtnh_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
__ret; \
})
#define vcvtph_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
__ret; \
})
#define vcvtph_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
__ret; \
})
#define vcvtph_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
__ret; \
})
#define vcvtph_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
__ret; \
})
#define vcvtph_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
__ret; \
})
#define vcvtph_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
__ret; \
})
#define vdivh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
__ret; \
})
#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
float16_t __s2 = __p2; \
__ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
__ret; \
})
#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
float16_t __s2 = __p2; \
__ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
__ret; \
})
#define vmaxh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
__ret; \
})
#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
__ret; \
})
#define vminh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
__ret; \
})
#define vminnmh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
__ret; \
})
#define vmulh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
__ret; \
})
#define vmulxh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
__ret; \
})
#define vnegh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
__ret; \
})
#define vrecpeh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
__ret; \
})
#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
__ret; \
})
#define vrecpxh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
__ret; \
})
#define vrndh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
__ret; \
})
#define vrndah_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
__ret; \
})
#define vrndih_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
__ret; \
})
#define vrndmh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
__ret; \
})
#define vrndnh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
__ret; \
})
#define vrndph_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
__ret; \
})
#define vrndxh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
__ret; \
})
#define vrsqrteh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
__ret; \
})
#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
__ret; \
})
#define vsqrth_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
__ret; \
})
#define vsubh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
__ret; \
})
#endif
#undef __ai
#endif /* __ARM_FP16_H */
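
Every definition above is a thin statement-expression wrapper over a __builtin_neon_* scalar builtin, so each maps to a single instruction. A minimal sketch, assuming an AArch64 target with half-precision arithmetic (e.g. -march=armv8.2-a+fp16); horner_h and to_i16 are hypothetical helpers:

#include <arm_fp16.h>

/* a*x + b entirely in half precision: vfmah_f16(b, a, x) computes
   b + a * x with a single rounding (FMADD). */
float16_t horner_h(float16_t a, float16_t x, float16_t b)
{
    return vfmah_f16(b, a, x);
}

/* Convert to int16_t, rounding to nearest with ties to even. */
int16_t to_i16(float16_t h)
{
    return vcvtnh_s16_f16(h);
}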

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,182 +0,0 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_NEON_SVE_BRIDGE_H
#define __ARM_NEON_SVE_BRIDGE_H
#include <arm_neon.h>
#include <arm_sve.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio \
static __inline__ \
__attribute__((__always_inline__, __nodebug__, __overloadable__))
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq(svint8_t, int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq(svint16_t, int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq(svint32_t, int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq(svint64_t, int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq(svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq(svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq(svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq(svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq(svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq(svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq(svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq(svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq(svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq(int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq(int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq(int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq(int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq(uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq(uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq(uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq(uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq(float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq(float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq(float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq_s8(int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq_s16(int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq_s32(int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq_s64(int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq_u8(uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq_u16(uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq_u32(uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq_u64(uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq_f16(float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
#undef __ai
#undef __aio
#ifdef __cplusplus
} // extern "C"
#endif
#endif //__ARM_NEON_SVE_BRIDGE_H
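
The bridge above comes in three families: svset_neonq writes a NEON value into the low 128 bits of an existing SVE vector, svget_neonq reads those bits back, and svdup_neonq replicates the NEON value across every 128-bit granule. A small round-trip sketch, assuming an AArch64 target with both NEON and SVE (e.g. -march=armv8.2-a+sve); add_via_sve is a hypothetical wrapper:

#include <arm_neon_sve_bridge.h>

int32x4_t add_via_sve(int32x4_t a, int32x4_t b)
{
    /* Widen both NEON values into SVE registers, add under an
       all-true predicate, then extract quadword 0 of the result. */
    svint32_t va = svdup_neonq_s32(a);
    svint32_t vb = svdup_neonq_s32(b);
    svint32_t sum = svadd_s32_x(svptrue_b32(), va, vb);
    return svget_neonq_s32(sum);
}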

View File

@ -1,642 +0,0 @@
/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics ------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_SME_H
#define __ARM_SME_H
#if !defined(__LITTLE_ENDIAN__)
#error "Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h"
#endif
#include <arm_sve.h>
/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio static __inline__ __attribute__((__always_inline__, __nodebug__, __overloadable__))
#ifdef __cplusplus
extern "C" {
#endif
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m), arm_streaming, arm_shared_za))
void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m), arm_streaming, arm_shared_za))
void svaddha_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m), arm_streaming, arm_shared_za))
void svaddva_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m), arm_streaming, arm_shared_za))
void svaddva_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsb), arm_streaming_compatible, arm_preserves_za))
uint64_t svcntsb(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsd), arm_streaming_compatible, arm_preserves_za))
uint64_t svcntsd(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsh), arm_streaming_compatible, arm_preserves_za))
uint64_t svcntsh(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsw), arm_streaming_compatible, arm_preserves_za))
uint64_t svcntsw(void);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za128), arm_streaming, arm_shared_za))
void svld1_hor_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za16), arm_streaming, arm_shared_za))
void svld1_hor_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za32), arm_streaming, arm_shared_za))
void svld1_hor_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za64), arm_streaming, arm_shared_za))
void svld1_hor_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za8), arm_streaming, arm_shared_za))
void svld1_hor_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za128), arm_streaming, arm_shared_za))
void svld1_hor_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za16), arm_streaming, arm_shared_za))
void svld1_hor_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za32), arm_streaming, arm_shared_za))
void svld1_hor_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za64), arm_streaming, arm_shared_za))
void svld1_hor_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za8), arm_streaming, arm_shared_za))
void svld1_hor_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za128), arm_streaming, arm_shared_za))
void svld1_ver_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za16), arm_streaming, arm_shared_za))
void svld1_ver_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za32), arm_streaming, arm_shared_za))
void svld1_ver_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za64), arm_streaming, arm_shared_za))
void svld1_ver_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za8), arm_streaming, arm_shared_za))
void svld1_ver_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za128), arm_streaming, arm_shared_za))
void svld1_ver_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za16), arm_streaming, arm_shared_za))
void svld1_ver_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za32), arm_streaming, arm_shared_za))
void svld1_ver_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za64), arm_streaming, arm_shared_za))
void svld1_ver_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za8), arm_streaming, arm_shared_za))
void svld1_ver_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
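A hedged sketch of how the slice loads above compose with the svcnts* counters: filling every horizontal slice of the 8-bit ZA tile 0. The function name, the row-major source layout, and the draft attribute spelling are assumptions; the intrinsic signatures match the declarations in this header.

__attribute__((arm_streaming, arm_shared_za))
void load_za8_tile(const uint8_t *src, uint64_t stride) {
    svbool_t pg = svptrue_b8();   /* all-true predicate from arm_sve.h */
    uint64_t slices = svcntsb();  /* streaming VL in bytes == slices per ZA8 tile */
    for (uint32_t row = 0; row < slices; ++row)
        svld1_hor_za8(/*tile*/ 0, /*slice_base*/ row, /*slice_offset*/ 0,
                      pg, src + (uint64_t)row * stride);
}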
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m), arm_streaming, arm_shared_za))
void svmopa_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m), arm_streaming, arm_shared_za))
void svmopa_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m), arm_streaming, arm_shared_za))
void svmopa_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m), arm_streaming, arm_shared_za))
void svmopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m), arm_streaming, arm_shared_za))
void svmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m), arm_streaming, arm_shared_za))
void svmops_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m), arm_streaming, arm_shared_za))
void svmops_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m), arm_streaming, arm_shared_za))
void svmops_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m), arm_streaming, arm_shared_za))
void svmops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m), arm_streaming, arm_shared_za))
void svmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_hor_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_hor_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_hor_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_hor_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_hor_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_hor_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_hor_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_hor_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_hor_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_hor_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_hor_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_hor_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_hor_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_hor_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_hor_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_hor_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_hor_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_hor_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_hor_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_hor_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_hor_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_hor_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_hor_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_hor_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_ver_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_ver_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_ver_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_ver_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_ver_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_ver_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_ver_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_ver_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_ver_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_ver_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_ver_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_ver_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_ver_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_ver_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_ver_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_ver_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_ver_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_ver_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_ver_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_ver_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_ver_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_ver_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_ver_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_ver_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za128), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za16), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za32), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za64), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za8), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za128), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za16), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za32), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za64), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za8), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_hor_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za128), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za16), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za32), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za64), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za8), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za128), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za16), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za32), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za64), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za8), arm_streaming, arm_shared_za, arm_preserves_za))
void svst1_ver_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *);
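The stores mirror the loads; a similarly hedged sketch writing every vertical slice of 32-bit ZA tile 1 back to memory (names and layout again assumed):

__attribute__((arm_streaming, arm_shared_za))
void store_za32_tile(uint32_t *dst, uint64_t stride) {
    svbool_t pg = svptrue_b32();
    uint64_t slices = svcntsw();  /* streaming VL in words == slices per ZA32 tile */
    for (uint32_t col = 0; col < slices; ++col)
        svst1_ver_za32(/*tile*/ 1, /*slice_base*/ col, /*slice_offset*/ 0,
                       pg, dst + (uint64_t)col * stride);
}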
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m), arm_streaming, arm_shared_za))
void svsumopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m), arm_streaming, arm_shared_za))
void svsumops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m), arm_streaming, arm_shared_za))
void svusmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m), arm_streaming, arm_shared_za))
void svusmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za32_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za32_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za32_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za64_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za64_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za64_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za8_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za8_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za32_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za32_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za32_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za64_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za64_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za64_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za8_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za8_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_mask_za), arm_streaming_compatible, arm_shared_za))
void svzero_mask_za(uint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za), arm_streaming_compatible, arm_shared_za))
void svzero_za(void);
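A hedged sketch of the accumulate workflow these declarations imply: clear ZA, then fold one outer product into a 32-bit tile. Only the two intrinsic calls are taken from this header; the wrapper is illustrative.

__attribute__((arm_streaming, arm_shared_za))
void outer_product_step(svfloat32_t a, svfloat32_t b) {
    svzero_za();                         /* zero the entire ZA array */
    svbool_t pg = svptrue_b32();
    svmopa_za32_f32_m(0, pg, pg, a, b);  /* ZA32 tile 0 += outer(a, b) */
}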
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m), arm_streaming, arm_shared_za))
void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m), arm_streaming, arm_shared_za))
void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m), arm_streaming, arm_shared_za))
void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m), arm_streaming, arm_shared_za))
void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m), arm_streaming, arm_shared_za))
void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m), arm_streaming, arm_shared_za))
void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m), arm_streaming, arm_shared_za))
void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m), arm_streaming, arm_shared_za))
void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m), arm_streaming, arm_shared_za))
void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m), arm_streaming, arm_shared_za))
void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m), arm_streaming, arm_shared_za))
void svmops_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m), arm_streaming, arm_shared_za))
void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m), arm_streaming, arm_shared_za))
void svmops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m), arm_streaming, arm_shared_za))
void svmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_hor_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_hor_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_hor_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_hor_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_hor_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_hor_za128_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_hor_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_hor_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_hor_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_hor_za128_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_hor_za128_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_hor_za128_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_hor_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_hor_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_hor_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_hor_za16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_hor_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_hor_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_hor_za32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_hor_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_hor_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_hor_za64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_hor_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_hor_za8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_ver_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_ver_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_ver_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_ver_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_ver_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_ver_za128_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_ver_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_ver_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_ver_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_ver_za128_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_ver_za128_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_ver_za128_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint16_t svread_ver_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svbfloat16_t svread_ver_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat16_t svread_ver_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint16_t svread_ver_za16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint32_t svread_ver_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat32_t svread_ver_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint32_t svread_ver_za32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint64_t svread_ver_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svfloat64_t svread_ver_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint64_t svread_ver_za64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svuint8_t svread_ver_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
svint8_t svread_ver_za8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m), arm_streaming, arm_shared_za))
void svsumopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m), arm_streaming, arm_shared_za))
void svsumops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m), arm_streaming, arm_shared_za))
void svusmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m), arm_streaming, arm_shared_za))
void svusmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m), arm_streaming, arm_shared_za))
void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m), arm_streaming, arm_shared_za))
void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m), arm_streaming, arm_shared_za))
void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m), arm_streaming, arm_shared_za))
void svwrite_hor_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m), arm_streaming, arm_shared_za))
void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m), arm_streaming, arm_shared_za))
void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m), arm_streaming, arm_shared_za))
void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m), arm_streaming, arm_shared_za))
void svwrite_ver_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m), arm_streaming, arm_shared_za))
void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m), arm_streaming, arm_shared_za))
void svmops_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m), arm_streaming, arm_shared_za))
void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m), arm_streaming, arm_shared_za))
void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m), arm_streaming, arm_shared_za))
void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m), arm_streaming, arm_shared_za))
void svaddha_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m), arm_streaming, arm_shared_za))
void svaddva_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m), arm_streaming, arm_shared_za))
void svaddva_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m), arm_streaming, arm_shared_za))
void svmopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m), arm_streaming, arm_shared_za))
void svmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m), arm_streaming, arm_shared_za))
void svmops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m), arm_streaming, arm_shared_za))
void svmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m), arm_streaming, arm_shared_za))
void svsumopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m), arm_streaming, arm_shared_za))
void svsumops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m), arm_streaming, arm_shared_za))
void svusmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m), arm_streaming, arm_shared_za))
void svusmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m), arm_streaming, arm_shared_za))
void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m), arm_streaming, arm_shared_za))
void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m), arm_streaming, arm_shared_za))
void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m), arm_streaming, arm_shared_za))
void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m), arm_streaming, arm_shared_za))
void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m), arm_streaming, arm_shared_za))
void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m), arm_streaming, arm_shared_za))
void svmops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m), arm_streaming, arm_shared_za))
void svmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m), arm_streaming, arm_shared_za))
void svsumopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m), arm_streaming, arm_shared_za))
void svsumops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m), arm_streaming, arm_shared_za))
void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m), arm_streaming, arm_shared_za))
void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svldr_vnum_za), arm_streaming_compatible, arm_shared_za))
void svldr_vnum_za(uint32_t, uint64_t, void const *);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svstr_vnum_za), arm_streaming_compatible, arm_shared_za, arm_preserves_za))
void svstr_vnum_za(uint32_t, uint64_t, void *);
#ifdef __cplusplus
} // extern "C"
#endif
#undef __ai
#endif /* __ARM_SME_H */
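The declarations above close out arm_sme.h; each intrinsic takes a ZA tile number, a slice index, governing predicates, and SVE vector operands. A minimal, hedged usage sketch follows; the attribute spelling, the svptrue_b64 helper (from <arm_sve.h>), and a flag such as -march=armv9-a+sme-i64 are assumptions about an SME-enabled toolchain of this vintage, not guarantees from the deleted header:

/* Hedged sketch: accumulate a vector horizontally into 64-bit ZA tile 0. */
__attribute__((arm_streaming, arm_shared_za))
void accumulate_rows(svuint64_t values) {
    svbool_t all = svptrue_b64();            /* enable every 64-bit lane (assumed from <arm_sve.h>) */
    svaddha_za64_u64_m(0, all, all, values); /* add values into the rows of ZA tile 0 */
}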

File diff suppressed because it is too large


@@ -1,31 +0,0 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the Windows platform. */
#ifndef _MSC_VER
#include_next <armintr.h>
#else
#ifndef __ARMINTR_H
#define __ARMINTR_H
typedef enum
{
_ARM_BARRIER_SY = 0xF,
_ARM_BARRIER_ST = 0xE,
_ARM_BARRIER_ISH = 0xB,
_ARM_BARRIER_ISHST = 0xA,
_ARM_BARRIER_NSH = 0x7,
_ARM_BARRIER_NSHST = 0x6,
_ARM_BARRIER_OSH = 0x3,
_ARM_BARRIER_OSHST = 0x2
} _ARMINTR_BARRIER_TYPE;
#endif /* __ARMINTR_H */
#endif /* _MSC_VER */
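These enumerators are the hint arguments for MSVC's ARM barrier intrinsics. A hedged sketch of typical use, assuming MSVC targeting 32-bit ARM, where __dmb is provided by <intrin.h>:

#include <intrin.h>

void publish_flag(volatile int *ready)
{
    __dmb(_ARM_BARRIER_ISH); /* order prior stores before the flag (inner-shareable domain) */
    *ready = 1;
}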

File diff suppressed because it is too large


@@ -1,282 +0,0 @@
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
__min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __A
/// A bfloat value.
/// \returns A float value whose sign and exponent fields are unchanged, and
/// whose fraction field is zero-extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
return __builtin_ia32_cvtsbf162ss_32(__A);
}
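/* Illustrative sketch, not part of the original header: the widening that
 * _mm_cvtsbh_ss performs is pure bit placement: a bfloat16 pattern in the
 * top 16 bits of a 32-bit word is already the float value. Scalar form: */
static inline float sketch_bf16_to_float(unsigned short bits) {
  union { unsigned int u; float f; } cvt; /* type pun via union, well-defined in C */
  cvt.u = (unsigned int)bits << 16;       /* sign/exponent kept; fraction zero-padded */
  return cvt.f;
}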
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
(__v16sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 512-bit vector of [32 x bfloat].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)_mm512_setzero_si512());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \returns A 256-bit vector of [16 x bfloat] that comes from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)_mm256_undefined_si256(),
(__mmask16)-1);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] that comes from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)__W,
(__mmask16)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] that comes from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)_mm256_setzero_si256(),
(__mmask16)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [16 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
(__v32bf) __A,
(__v32bf) __B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 512-bit vector of [16 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 512-bit vector of [16 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)_mm512_setzero_si512());
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] that comes from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] that comes from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 16-bit mask.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] that comes from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
(__m512i)__S, (__mmask16)__U,
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512
#endif
#endif
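The conversion and dot-product intrinsics above combine into mixed-precision accumulation: round float pairs to bfloat16, then multiply-accumulate pairwise back into float lanes. A hedged sketch, assuming AVX512BF16 hardware and compilation with -mavx512f -mavx512bf16:

#include <immintrin.h>

/* acc[i] += x[2i]*y[2i] + x[2i+1]*y[2i+1], multiplied in bf16, summed in f32. */
__m512 bf16_dot_step(__m512 acc, __m512 x_hi, __m512 x_lo,
                     __m512 y_hi, __m512 y_lo)
{
    __m512bh x = _mm512_cvtne2ps_pbh(x_hi, x_lo); /* low 256 bits from x_lo */
    __m512bh y = _mm512_cvtne2ps_pbh(y_hi, y_lo); /* low 256 bits from y_lo */
    return _mm512_dpbf16_ps(acc, x, y);
}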


@@ -1,83 +0,0 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi) _mm512_popcnt_epi16(__B),
(__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
(__v64qi) _mm512_popcnt_epi8(__B),
(__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
(__v64qi) __B,
__U);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif
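A typical use of the byte-granularity popcount above is a whole-vector bit count, pairing it with a SAD-based horizontal sum. A hedged sketch, assuming AVX512BITALG plus AVX512BW (compile with e.g. -mavx512bitalg -mavx512bw):

#include <immintrin.h>
#include <stdint.h>

uint64_t count_set_bits_64B(const void *p) /* p must point at 64 readable bytes */
{
    __m512i v      = _mm512_loadu_si512(p);
    __m512i counts = _mm512_popcnt_epi8(v);                           /* popcount per byte */
    __m512i sums   = _mm512_sad_epu8(counts, _mm512_setzero_si512()); /* sum each 8-byte group into a qword */
    return (uint64_t)_mm512_reduce_add_epi64(sums);                   /* grand total */
}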

File diff suppressed because it is too large


@@ -1,123 +0,0 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) _mm512_set1_epi64((long long) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) _mm512_set1_epi32((int) __A);
}
#undef __DEFAULT_FN_ATTRS
#endif
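The lzcnt intrinsics above make per-lane integer log2 a two-instruction affair. A hedged sketch, assuming AVX512CD (-mavx512cd):

#include <immintrin.h>

__m512i floor_log2_epi32(__m512i x) /* every lane must be nonzero */
{
    /* floor(log2(x)) == 31 - lzcnt(x) for 32-bit lanes */
    return _mm512_sub_epi32(_mm512_set1_epi32(31), _mm512_lzcnt_epi32(x));
}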

File diff suppressed because it is too large


@@ -1,271 +0,0 @@
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H
/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_pd(S, M, A) \
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_pd(M, A) \
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_ps(S, M, A) \
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_ps(M, A) \
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_pd(S, M, A) \
_mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_pd(M, A) \
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_ps(S, M, A) \
_mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_ps(M, A) \
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rsqrt28_ss(S, M, A, B) \
_mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_ss(M, A, B) \
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rsqrt28_sd(S, M, A, B) \
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_sd(M, A, B) \
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_pd(S, M, A) \
_mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_pd(M, A) \
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_ps(S, M, A) \
_mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_ps(M, A) \
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rcp28_ss(S, M, A, B) \
_mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rcp28_ss(M, A, B) \
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rcp28_sd(S, M, A, B) \
_mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rcp28_sd(M, A, B) \
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#endif /* __AVX512ERINTRIN_H */
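The exp2a23/rsqrt28/rcp28 families above are the AVX512ER approximations, with relative error bounded near 2^-28, often accurate enough to skip a Newton-Raphson refinement. A hedged sketch, assuming the Xeon Phi-only AVX512ER target (-mavx512er):

#include <immintrin.h>

/* Reciprocal square roots for 16 packed squared norms in one instruction. */
__m512 inv_length(__m512 squared_norms)
{
    return _mm512_rsqrt28_ps(squared_norms); /* max relative error about 2^-28 */
}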

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,68 +0,0 @@
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAINTRIN_H
#define __IFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
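The 52-bit multiply-adds above target multi-precision arithmetic: big integers are held as radix-2^52 limbs in 64-bit lanes, and the low and high product halves accumulate separately until a final carry pass. A hedged sketch, assuming AVX512IFMA (-mavx512ifma):

#include <immintrin.h>

/* One limb product step; the limbs in a and b must be below 2^52. */
void madd52_step(__m512i a, __m512i b, __m512i *acc_lo, __m512i *acc_hi)
{
    *acc_lo = _mm512_madd52lo_epu64(*acc_lo, a, b); /* accumulate low 52 bits of a*b */
    *acc_hi = _mm512_madd52hi_epu64(*acc_hi, a, b); /* accumulate high 52 bits of a*b */
}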


@@ -1,105 +0,0 @@
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAVLINTRIN_H
#define __IFMAVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
#define _mm_madd52hi_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52hi_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
#define _mm_madd52lo_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52lo_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif


@@ -1,97 +0,0 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512PFINTRIN_H
#define __AVX512PFINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (void const *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16) -1, \
(__v16si)(__m512i)(index), (void const *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (void *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#undef __DEFAULT_FN_ATTRS
#endif
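The pattern these prefetch macros serve: while gathering iteration i, prefetch the cache lines that iteration i+1 will touch. A hedged sketch, assuming the Xeon Phi-only AVX512PF target (-mavx512pf); _MM_HINT_T0 comes in with <immintrin.h>:

#include <immintrin.h>

void prefetch_next_gather(const float *base, __m512i next_indices)
{
    /* Scale 4 = sizeof(float); pulls the 16 addressed cache lines toward L1. */
    _mm512_prefetch_i32gather_ps(next_indices, base, 4, _MM_HINT_T0);
}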


@@ -1,357 +0,0 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VBMI2INTRIN_H
#define __AVX512VBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_si512(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
{
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
{
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) _mm512_setzero_si512(),
__U);
}
#define _mm512_shldi_epi64(A, B, I) \
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512()))
#define _mm512_shldi_epi32(A, B, I) \
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512()))
#define _mm512_shldi_epi16(A, B, I) \
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shrdi_epi64(A, B, I) \
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512()))
#define _mm512_shrdi_epi32(A, B, I) \
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512()))
#define _mm512_shrdi_epi16(A, B, I) \
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
(__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
(__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
(__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
(__v32hi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
(__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
return (__m512i) __builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
(__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i) __builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
(__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
(__v32hi)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
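The shldi/shrdi/shldv/shrdv families above are funnel shifts: each lane conceptually concatenates two elements and extracts a shifted window. A hedged sketch, assuming AVX512VBMI2 (-mavx512vbmi2):

#include <immintrin.h>

__m512i funnel_left_13(__m512i a, __m512i b)
{
    /* Per 64-bit lane: (a << 13) | (b >> 51), the top half of (a:b) << 13. */
    return _mm512_shldi_epi64(a, b, 13);
}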


@@ -1,105 +0,0 @@
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIINTRIN_H
#define __VBMIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
{
return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__I);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
__m512i __Y)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
(__v64qi)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
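The byte-granularity permutes above give a full 64-entry table lookup per output byte, which is what distinguishes VBMI from the 16-byte pshufb. A hedged sketch, assuming AVX512VBMI (-mavx512vbmi):

#include <immintrin.h>

__m512i table_lookup_64(__m512i indices, __m512i table)
{
    /* out[i] = table[indices[i] & 63] for each of the 64 byte lanes */
    return _mm512_permutexvar_epi8(indices, table);
}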


@@ -1,188 +0,0 @@
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIVLINTRIN_H
#define __VBMIVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
{
return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
(__v16qi)__I,
(__v16qi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__I);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
{
return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
(__v32qi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__I);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)__W);
}
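/* Editor's note (not part of the original header): in _mm_permutexvar_epi8 the
 * first argument holds the byte indices and the second the data, which is why
 * the builtin calls above pass __B before __A. A scalar model with a
 * hypothetical name; the 128-bit form uses only the low 4 index bits: */
static inline void permutexvar_epi8_ref(const unsigned char idx[16],
                                        const unsigned char src[16],
                                        unsigned char dst[16])
{
  for (int j = 0; j != 16; ++j)
    dst[j] = src[idx[j] & 0x0F];  /* low 4 bits pick one of 16 source bytes */
}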
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
__m128i __Y)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
(__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
__m256i __Y)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
(__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
(__v32qi)_mm256_setzero_si256());
}
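/* Editor's sketch (not part of the original header): per Intel's
 * VPMULTISHIFTQB pseudocode, the first operand supplies eight 6-bit bit
 * offsets per 64-bit lane and the second the data; each destination byte is
 * 8 bits of the data qword starting at its offset, with wraparound. A scalar
 * model of one lane, with hypothetical names: */
static inline unsigned long long multishift_rotr64_ref(unsigned long long v,
                                                       unsigned c)
{
  c &= 63;
  return c ? (v >> c) | (v << (64 - c)) : v;  /* rotate right, no UB at c==0 */
}
static inline unsigned long long multishift_qb_ref(unsigned long long ctrl,
                                                   unsigned long long data)
{
  unsigned long long dst = 0;
  for (int j = 0; j != 8; ++j) {
    unsigned off = (unsigned)((ctrl >> (8 * j)) & 63);    /* per-byte offset */
    dst |= (multishift_rotr64_ref(data, off) & 0xFFull) << (8 * j);
  }
  return dst;
}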
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@@ -1,515 +0,0 @@
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
(__v4sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)_mm_setzero_si128());
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
(__v8sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)_mm256_setzero_si256());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8bf)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion of __A.
#define _mm256_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8bf)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
(__v8bf)__A,
(__v8bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 128-bit vector of [4 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 128-bit vector of [4 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)_mm_setzero_si128());
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
(__v16bf)__A,
(__v16bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 256-bit vector of [8 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 256-bit vector of [8 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)_mm256_setzero_si256());
}
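/* Editor's sketch (not part of the original header): a scalar model of one
 * 32-bit lane of the VDPBF16PS dot product above, ignoring the instruction's
 * treatment of denormals and NaNs. Each bf16 value widens exactly to a float
 * by a 16-bit left shift of its bit pattern. The name is hypothetical. */
static inline float dpbf16_lane_ref(float acc, const unsigned short a[2],
                                    const unsigned short b[2])
{
  for (int k = 0; k != 2; ++k) {
    union { unsigned int u; float f; } x = { (unsigned int)a[k] << 16 };
    union { unsigned int u; float f; } y = { (unsigned int)b[k] << 16 };
    acc += x.f * y.f;  /* one bf16 pair product accumulated per iteration */
  }
  return acc;
}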
/// Convert One Single float Data to One BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A float value.
/// \returns A bf16 value whose sign and exponent fields are unchanged and whose
/// fraction field is rounded to 7 bits (round to nearest even).
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
return (__bf16)__R[0];
}
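/* Editor's sketch (not part of the original header): a round-to-nearest-even
 * scalar model of the float -> bf16 conversion performed above (NaN quieting
 * is ignored). The bf16 bit pattern is the rounded high half of the binary32
 * representation. The helper name is hypothetical. */
static inline unsigned short float_to_bf16_bits_ref(float f)
{
  union { float f; unsigned int u; } v = { f };
  unsigned int lsb = (v.u >> 16) & 1u;                  /* ties go to even */
  return (unsigned short)((v.u + 0x7FFFu + lsb) >> 16);
}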
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that results from the conversion of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
return _mm_castsi128_ps(
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that results from the conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}
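/* Editor's note (not part of the original header): widening bf16 -> float is
 * exact, which is why the intrinsics above implement it as a sign-preserving
 * 16-bit left shift. A scalar model with a hypothetical name: */
static inline float bf16_bits_to_float_ref(unsigned short b)
{
  union { unsigned int u; float f; } v = { (unsigned int)b << 16 };
  return v.f;
}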
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that results from the conversion of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// An 8-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that results from the conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 4-bit mask. Elements are copied from __S when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that results from the conversion of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 256-bit vector of [8 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// An 8-bit mask. Elements are copied from __S when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that results from the conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
16));
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
#endif

View File

@@ -1,145 +0,0 @@
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLBITALGINTRIN_H
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
(__v16hi) _mm256_popcnt_epi16(__B),
(__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
(__v8hi) _mm_popcnt_epi16(__B),
(__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
(__v32qi) _mm256_popcnt_epi8(__B),
(__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
(__v16qi) _mm_popcnt_epi8(__B),
(__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
__U,
__B);
}
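/* Editor's sketch (not part of the original header): a typical use of the
 * per-byte popcount above is the Hamming weight of a 16-byte block, summing
 * the byte counts with a SAD against zero. The helper name is hypothetical;
 * the SAD and extract intrinsics are SSE2 baseline. */
static __inline__ int __DEFAULT_FN_ATTRS128
_hamming_weight16_ref(__m128i __V)
{
  __m128i __cnt = _mm_popcnt_epi8(__V);                     /* per-byte counts */
  __m128i __sums = _mm_sad_epu8(__cnt, _mm_setzero_si128()); /* two 64-bit sums */
  return _mm_cvtsi128_si32(__sums) + _mm_extract_epi16(__sums, 4);
}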
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
(__v32qi) __B,
__U);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
{
return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
__A,
__B);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
(__v16qi) __B,
__U);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
{
return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
__A,
__B);
}
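/* Editor's sketch (not part of the original header): a scalar model of one
 * 64-bit lane of the bit shuffle above -- output mask bit j is the data bit
 * whose index is the low 6 bits of control byte j. The name is hypothetical. */
static inline unsigned char bitshuffle_qw_ref(unsigned long long data,
                                              unsigned long long ctrl)
{
  unsigned char m = 0;
  for (int j = 0; j != 8; ++j) {
    unsigned idx = (unsigned)((ctrl >> (8 * j)) & 63);  /* 6-bit bit index */
    m |= (unsigned char)(((data >> idx) & 1u) << j);
  }
  return m;
}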
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

File diff suppressed because it is too large

View File

@@ -1,225 +0,0 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLCDINTRIN_H
#define __AVX512VLCDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmb_epi64 (__mmask8 __A)
{
return (__m128i) _mm_set1_epi64x((long long) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) _mm256_set1_epi64x((long long)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) _mm_set1_epi32((int)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) _mm256_set1_epi32((int)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
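/* Editor's sketch (not part of the original header): VPCONFLICTD gives each
 * element a bitset of lower-indexed elements holding the same value, the
 * standard building block for detecting colliding indices before a scatter.
 * A scalar model of the 4-element form, with a hypothetical name: */
static inline void conflict_epi32_ref(const unsigned int src[4],
                                      unsigned int dst[4])
{
  for (int i = 0; i != 4; ++i) {
    unsigned int bits = 0;
    for (int j = 0; j != i; ++j)
      if (src[j] == src[i])
        bits |= 1u << j;  /* bit j set: element j equals element i */
    dst[i] = bits;
  }
}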
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
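/* Editor's note (not part of the original header): the lzcnt intrinsics above
 * count leading zero bits per element, and a zero element yields the full
 * element width. A 32-bit scalar model with a hypothetical name: */
static inline unsigned int lzcnt32_ref(unsigned int x)
{
  unsigned int n = 0;
  for (unsigned int bit = 0x80000000u; bit != 0 && !(x & bit); bit >>= 1)
    ++n;
  return n;  /* 32 when x == 0 */
}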
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __AVX512VLCDINTRIN_H */

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,689 +0,0 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVBMI2INTRIN_H
#define __AVX512VLVBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) _mm_setzero_si128(),
__U);
}
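/* Editor's sketch (not part of the original header): scalar models of the
 * compress/expand pattern above. Compress packs the mask-selected elements
 * contiguously toward low indices; expand is the inverse, scattering a dense
 * stream into the selected slots. Unselected destination elements keep the
 * merge source (mask) or become zero (maskz). Names are hypothetical. */
static inline void compress_epi16_ref(const unsigned short src[8],
                                      unsigned char mask,
                                      unsigned short dst[8])
{
  int k = 0;
  for (int j = 0; j != 8; ++j)
    if (mask & (1u << j))
      dst[k++] = src[j];  /* selected elements become contiguous */
}
static inline void expand_epi16_ref(const unsigned short src[8],
                                    unsigned char mask,
                                    unsigned short dst[8])
{
  int k = 0;
  for (int j = 0; j != 8; ++j)
    if (mask & (1u << j))
      dst[j] = src[k++];  /* dense source fills the selected slots */
}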
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) _mm256_setzero_si256(),
__U);
}
#define _mm256_shldi_epi64(A, B, I) \
((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256()))
#define _mm_shldi_epi64(A, B, I) \
((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S)))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128()))
#define _mm256_shldi_epi32(A, B, I) \
((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256()))
#define _mm_shldi_epi32(A, B, I) \
((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S)))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128()))
#define _mm256_shldi_epi16(A, B, I) \
((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256()))
#define _mm_shldi_epi16(A, B, I) \
((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128()))
#define _mm256_shrdi_epi64(A, B, I) \
((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256()))
#define _mm_shrdi_epi64(A, B, I) \
((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S)))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128()))
#define _mm256_shrdi_epi32(A, B, I) \
((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256()))
#define _mm_shrdi_epi32(A, B, I) \
((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S)))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128()))
#define _mm256_shrdi_epi16(A, B, I) \
((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256()))
#define _mm_shrdi_epi16(A, B, I) \
((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128()))
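/* Editor's sketch (not part of the original header): the shldi/shrdi macros
 * above are lane-wise funnel shifts -- the two operands are conceptually
 * concatenated and an element-wide window is taken; the count is reduced
 * modulo the element width. 64-bit scalar models, hypothetical names: */
static inline unsigned long long shld64_ref(unsigned long long a,
                                            unsigned long long b, unsigned c)
{
  c &= 63;
  return c ? (a << c) | (b >> (64 - c)) : a;  /* high half of (a:b) << c */
}
static inline unsigned long long shrd64_ref(unsigned long long a,
                                            unsigned long long b, unsigned c)
{
  c &= 63;
  return c ? (b >> c) | (a << (64 - c)) : b;  /* low half of (a:b) >> c */
}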
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shldv_epi64(__A, __B, __C),
(__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shldv_epi64(__A, __B, __C),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shldv_epi64(__A, __B, __C),
(__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shldv_epi64(__A, __B, __C),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shldv_epi32(__A, __B, __C),
(__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shldv_epi32(__A, __B, __C),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shldv_epi32(__A, __B, __C),
(__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shldv_epi32(__A, __B, __C),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shldv_epi16(__A, __B, __C),
(__v16hi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shldv_epi16(__A, __B, __C),
(__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shldv_epi16(__A, __B, __C),
(__v8hi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shldv_epi16(__A, __B, __C),
(__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shrdv_epi64(__A, __B, __C),
(__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shrdv_epi64(__A, __B, __C),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shrdv_epi64(__A, __B, __C),
(__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shrdv_epi64(__A, __B, __C),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shrdv_epi32(__A, __B, __C),
(__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shrdv_epi32(__A, __B, __C),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shrdv_epi32(__A, __B, __C),
(__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shrdv_epi32(__A, __B, __C),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
(__v16hi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
(__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shrdv_epi16(__A, __B, __C),
(__v8hi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shrdv_epi16(__A, __B, __C),
(__v8hi)_mm_setzero_si128());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@@ -1,304 +0,0 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVNNIINTRIN_H
#define __AVX512VLVNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
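/* Usage sketch: accumulating a signed 16-bit dot product with the VNNI
 * macros above. The helper name is illustrative; assumes a CPU and compiler
 * flags providing avx512vnni and avx512vl. */
#include <immintrin.h>
static inline __m128i dot16_accumulate(__m128i acc, __m128i a, __m128i b) {
  /* Each dword lane j becomes acc[j] + a[2j]*b[2j] + a[2j+1]*b[2j+1]. */
  return _mm_dpwssd_epi32(acc, a, b);
}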
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
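/* Usage sketch for the masked forms above: lanes whose mask bit is 0 keep
 * the old accumulator value (merge masking) or become 0 (zero masking).
 * Helper name is illustrative; assumes avx512vnni and avx512vl. */
#include <immintrin.h>
static inline __m256i dot16_merge(__m256i acc, __mmask8 keep,
                                  __m256i a, __m256i b) {
  return _mm256_mask_dpwssd_epi32(acc, keep, a, b);
}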
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -1,121 +0,0 @@
/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VLVP2INTERSECT_H
#define _AVX512VLVP2INTERSECT_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
__min_vector_width__(256)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
/// \param __b
/// A 256-bit vector of [8 x i32].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x i64].
/// \param __b
/// A 256-bit vector of [4 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32].
/// \param __b
/// A 128-bit vector of [4 x i32].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x i64].
/// \param __b
/// A 128-bit vector of [2 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
}
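/* Usage sketch: counting how many dword lanes of a occur anywhere in b.
 * Helper name is illustrative; assumes avx512vp2intersect and avx512vl. */
#include <immintrin.h>
static inline int count_lanes_of_a_in_b(__m128i a, __m128i b) {
  __mmask8 in_a, in_b;
  _mm_2intersect_epi32(a, b, &in_a, &in_b);
  return __builtin_popcount((unsigned)in_a & 0xFu); /* low 4 bits are valid */
}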
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -1,115 +0,0 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VNNIINTRIN_H
#define __AVX512VNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
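/* Usage sketch: one VPDPBUSD step of an int8 GEMM inner loop, accumulating
 * 64 u8*s8 products (4 per dword lane). Helper name is illustrative;
 * assumes avx512vnni. */
#include <immintrin.h>
static inline __m512i u8s8_dot_step(__m512i acc, __m512i a_u8, __m512i b_s8) {
  return _mm512_dpbusd_epi32(acc, a_u8, b_s8);
}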
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1,77 +0,0 @@
/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vp2intersect.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VP2INTERSECT_H
#define _AVX512VP2INTERSECT_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \
__min_vector_width__(512)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 512-bit vector of [16 x i32].
/// \param __b
/// A 512-bit vector of [16 x i32].
/// \param __m0
/// A pointer to a 16-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to a 16-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
__builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 512-bit vector of [8 x i64].
/// \param __b
/// A 512-bit vector of [8 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
}
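/* Usage sketch: intersecting two 16-element i32 sets. Bit j of the first
 * mask is set iff __a.dword[j] occurs somewhere in __b. Helper name is
 * illustrative; assumes avx512vp2intersect. */
#include <immintrin.h>
static inline __mmask16 lanes_of_a_in_b_512(__m512i a, __m512i b) {
  __mmask16 m0, m1;
  _mm512_2intersect_epi32(a, b, &m0, &m1);
  return m0;
}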
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1,54 +0,0 @@
/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQINTRIN_H
#define __AVX512VPOPCNTDQINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_selectq_512(
(__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
}
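/* Usage sketch: total population count of a 64-byte block, combining the
 * per-lane VPOPCNTQ above with a horizontal add. Helper name is
 * illustrative; assumes avx512vpopcntdq (and avx512f for the reduction). */
#include <immintrin.h>
static inline long long popcount_512bits(__m512i v) {
  return _mm512_reduce_add_epi64(_mm512_popcnt_epi64(v));
}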
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1,91 +0,0 @@
/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQVLINTRIN_H
#define __AVX512VPOPCNTDQVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi32(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi64(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi32(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
}
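/* Usage sketch: per-lane population count on the low two dword lanes only,
 * zeroing the rest via the maskz form. Helper name is illustrative;
 * assumes avx512vpopcntdq and avx512vl. */
#include <immintrin.h>
static inline __m128i popcnt_low_two_dwords(__m128i v) {
  return _mm_maskz_popcnt_epi32(0x3, v); /* lanes 2 and 3 become 0 */
}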
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -1,177 +0,0 @@
/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(256)))
// These intrinsics must use VEX encoding.
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \returns
/// A 128-bit vector of [2 x i64] containing the results.
/// \param __X
/// A 128-bit vector of [2 x i64]
/// \param __Y
/// A 128-bit vector of [2 x i64]
/// \param __Z
/// A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \returns
/// A 256-bit vector of [4 x i64] containing the results.
/// \param __X
/// A 256-bit vector of [4 x i64]
/// \param __Y
/// A 256-bit vector of [4 x i64]
/// \param __Z
/// A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \returns
/// A 128-bit vector of [2 x i64] containing the results.
/// \param __X
/// A 128-bit vector of [2 x i64]
/// \param __Y
/// A 128-bit vector of [2 x i64]
/// \param __Z
/// A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \returns
/// A 256-bit vector of [4 x i64] containing the results.
/// \param __X
/// A 256-bit vector of [4 x i64]
/// \param __Y
/// A 256-bit vector of [4 x i64]
/// \param __Z
/// A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
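/* Usage sketch: one limb step of the usual 52-bit multi-precision
 * multiply-accumulate idiom: the low 52 product bits go to the current limb
 * accumulator, the high 52 bits to the next one. Helper name is
 * illustrative; assumes avxifma. */
#include <immintrin.h>
static inline void madd52_step(__m256i *lo_acc, __m256i *hi_acc,
                               __m256i y, __m256i z) {
  *lo_acc = _mm256_madd52lo_avx_epu64(*lo_acc, y, z);
  *hi_acc = _mm256_madd52hi_avx_epu64(*hi_acc, y, z);
}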
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXIFMAINTRIN_H

File diff suppressed because it is too large

View File

@ -1,484 +0,0 @@
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifdef __SSE2__
#ifndef __AVXNECONVERTINTRIN_H
#define __AVXNECONVERTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
__min_vector_width__(256)))
/// Convert the scalar BF16 (16-bit) floating-point element stored at memory
/// location \a __A to a single-precision (32-bit) floating-point element,
/// broadcast it to the packed single-precision (32-bit) floating-point
/// elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void *__A) {
return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
}
/// Convert the scalar BF16 (16-bit) floating-point element stored at memory
/// location \a __A to a single-precision (32-bit) floating-point element,
/// broadcast it to the packed single-precision (32-bit) floating-point
/// elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void *__A) {
return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
}
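/* Usage sketch: splatting one bf16 scalar from memory (for example a
 * per-channel scale factor) across all eight float lanes. Helper name is
 * illustrative; assumes avxneconvert. */
#include <immintrin.h>
static inline __m256 splat_bf16_scale(const void *scale) {
  return _mm256_bcstnebf16_ps(scale);
}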
/// Convert the scalar half-precision (16-bit) floating-point element stored
/// at memory location \a __A to a single-precision (32-bit) floating-point
/// element, broadcast it to the packed single-precision (32-bit)
/// floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void *__A) {
return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
}
/// Convert the scalar half-precision (16-bit) floating-point element stored
/// at memory location \a __A to a single-precision (32-bit) floating-point
/// element, broadcast it to the packed single-precision (32-bit)
/// floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void *__A) {
return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
}
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneebf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh *__A) {
return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh *__A) {
return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneeph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h *__A) {
return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneeph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h *__A) {
return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneobf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh *__A) {
return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh *__A) {
return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneoph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h *__A) {
return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneoph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h *__A) {
return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
}
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneps_avx_pbh(__m128 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns
/// A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A) {
return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
}
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneps_avx_pbh(__m256 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns
/// A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A) {
return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
}
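/* Usage sketch: widening the even-indexed bf16 elements of a 256-bit block
 * to floats and narrowing them back, pairing the two conversions above.
 * Helper name is illustrative; assumes avxneconvert. */
#include <immintrin.h>
static inline __m128bh roundtrip_even_bf16(const __m256bh *p) {
  __m256 even = _mm256_cvtneebf16_ps(p); /* elements 0, 2, 4, ... as fp32 */
  return _mm256_cvtneps_avx_pbh(even);   /* back to 8 x bf16 */
}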
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXNECONVERTINTRIN_H
#endif // __SSE2__

View File

@ -1,473 +0,0 @@
/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
__min_vector_width__(256)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
/// A 128-bit vector of [4 x unsigned int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
/// A 256-bit vector of [8 x unsigned int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
/// A 128-bit vector of [4 x unsigned int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
/// A 256-bit vector of [8 x unsigned int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
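/* Usage sketch: one mixed-signedness 16-bit dot-product step (signed A,
 * unsigned B). Helper name is illustrative; assumes avxvnniint16. */
#include <immintrin.h>
static inline __m256i s16u16_dot_step(__m256i acc, __m256i a_s16,
                                      __m256i b_u16) {
  return _mm256_dpwsud_epi32(acc, a_s16, b_u16);
}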
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINT16INTRIN_H

View File

@ -1,471 +0,0 @@
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
__min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
__min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x unsigned int].
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x unsigned int].
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINT8INTRIN_H
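
A similar sketch for the byte-granularity forms: each _mm_dpbuud_epi32 call sums sixteen u8*u8 products into four dword lanes. The helper name dot_u8 is hypothetical; the sketch assumes -mavxvnniint8 and n a multiple of 16.

#include <immintrin.h>
#include <stdint.h>

static uint32_t dot_u8(const uint8_t *a, const uint8_t *b, int n) {
    __m128i acc = _mm_setzero_si128();
    for (int i = 0; i < n; i += 16) {
        __m128i va = _mm_loadu_si128((const __m128i *)(a + i));
        __m128i vb = _mm_loadu_si128((const __m128i *)(b + i));
        acc = _mm_dpbuud_epi32(acc, va, vb);  /* 4 groups of 4 products each */
    }
    uint32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, acc);
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}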

View File

@ -1,225 +0,0 @@
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINTRIN_H
#define __AVXVNNIINTRIN_H
/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
/* Intrinsics with _avx_ prefix are for compatibility with MSVC. */
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINTRIN_H
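
The mixed-signedness form is the one quantized inference kernels typically use: unsigned 8-bit activations against signed 8-bit weights. A minimal sketch, assuming -mavxvnni and n a multiple of 32 (dot_u8s8 is a hypothetical helper name):

#include <immintrin.h>
#include <stdint.h>

static int32_t dot_u8s8(const uint8_t *a, const int8_t *b, int n) {
    __m256i acc = _mm256_setzero_si256();
    for (int i = 0; i < n; i += 32) {
        __m256i va = _mm256_loadu_si256((const __m256i *)(a + i));
        __m256i vb = _mm256_loadu_si256((const __m256i *)(b + i));
        acc = _mm256_dpbusd_avx_epi32(acc, va, vb);  /* 32 u8*s8 products */
    }
    int32_t lanes[8], sum = 0;
    _mm256_storeu_si256((__m256i *)lanes, acc);
    for (int j = 0; j < 8; j++) sum += lanes[j];
    return sum;
}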

View File

@ -1,255 +0,0 @@
/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <bmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __BMI2INTRIN_H
#define __BMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
/// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits
/// starting at bit number \a __Y.
///
/// \code{.operation}
/// i := __Y[7:0]
/// result := __X
/// IF i < 32
/// result[31:i] := 0
/// FI
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c BZHI instruction.
///
/// \param __X
/// The 32-bit source value to copy.
/// \param __Y
/// The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 32-bit value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bzhi_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_bzhi_si(__X, __Y);
}
/// Deposit (scatter) low-order bits from the unsigned 32-bit integer \a __X
/// into the 32-bit result, according to the mask in the unsigned 32-bit
/// integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 31
/// IF __Y[m] == 1
/// result[m] := __X[i]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PDEP instruction.
///
/// \param __X
/// The 32-bit source value to copy.
/// \param __Y
/// The 32-bit mask specifying where to deposit source bits.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pdep_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_pdep_si(__X, __Y);
}
/// Extract (gather) bits from the unsigned 32-bit integer \a __X into the
/// low-order bits of the 32-bit result, according to the mask in the
/// unsigned 32-bit integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 31
/// IF __Y[m] == 1
/// result[i] := __X[m]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PEXT instruction.
///
/// \param __X
/// The 32-bit source value to copy.
/// \param __Y
/// The 32-bit mask specifying which source bits to extract.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pext_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_pext_si(__X, __Y);
}
/// Multiplies the unsigned 32-bit integers \a __X and \a __Y to form a
/// 64-bit product. Stores the upper 32 bits of the product in the
/// memory at \a __P and returns the lower 32 bits.
///
/// \code{.operation}
/// Store32(__P, (__X * __Y)[63:32])
/// result := (__X * __Y)[31:0]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c MULX instruction.
///
/// \param __X
/// An unsigned 32-bit multiplicand.
/// \param __Y
/// An unsigned 32-bit multiplicand.
/// \param __P
/// A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
{
unsigned long long __res = (unsigned long long) __X * __Y;
*__P = (unsigned int)(__res >> 32);
return (unsigned int)__res;
}
#ifdef __x86_64__
/// Copies the unsigned 64-bit integer \a __X and zeroes the upper bits
/// starting at bit number \a __Y.
///
/// \code{.operation}
/// i := __Y[7:0]
/// result := __X
/// IF i < 64
/// result[63:i] := 0
/// FI
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c BZHI instruction.
///
/// \param __X
/// The 64-bit source value to copy.
/// \param __Y
/// The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 64-bit value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bzhi_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_bzhi_di(__X, __Y);
}
/// Deposit (scatter) low-order bits from the unsigned 64-bit integer \a __X
/// into the 64-bit result, according to the mask in the unsigned 64-bit
/// integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 63
/// IF __Y[m] == 1
/// result[m] := __X[i]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PDEP instruction.
///
/// \param __X
/// The 64-bit source value to copy.
/// \param __Y
/// The 64-bit mask specifying where to deposit source bits.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pdep_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_pdep_di(__X, __Y);
}
/// Extract (gather) bits from the unsigned 64-bit integer \a __X into the
/// low-order bits of the 64-bit result, according to the mask in the
/// unsigned 64-bit integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 63
/// IF __Y[m] == 1
/// result[i] := __X[m]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PEXT instruction.
///
/// \param __X
/// The 64-bit source value to copy.
/// \param __Y
/// The 64-bit mask specifying which source bits to extract.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pext_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_pext_di(__X, __Y);
}
/// Multiplies the unsigned 64-bit integers \a __X and \a __Y to form a
/// 128-bit product. Stores the upper 64 bits of the product to the
/// memory addressed by \a __P and returns the lower 64 bits.
///
/// \code{.operation}
/// Store64(__P, (__X * __Y)[127:64])
/// result := (__X * __Y)[63:0]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c MULX instruction.
///
/// \param __X
/// An unsigned 64-bit multiplicand.
/// \param __Y
/// An unsigned 64-bit multiplicand.
/// \param __P
/// A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
unsigned long long *__P)
{
unsigned __int128 __res = (unsigned __int128) __X * __Y;
*__P = (unsigned long long) (__res >> 64);
return (unsigned long long) __res;
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __BMI2INTRIN_H */
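
PDEP and PEXT are exact inverses over a fixed mask, which makes 2-D Morton (Z-order) codes a one-liner: scatter x into the even bit positions, y into the odd ones, and gather them back the same way. A sketch assuming -mbmi2 (the morton2d names are hypothetical):

#include <immintrin.h>
#include <stdint.h>

/* Interleave the low 16 bits of x and y into a 32-bit Morton code. */
static uint32_t morton2d(uint32_t x, uint32_t y) {
    return _pdep_u32(x, 0x55555555u) | _pdep_u32(y, 0xAAAAAAAAu);
}

/* Recover the coordinates by extracting the even and odd bits. */
static uint32_t morton2d_x(uint32_t code) { return _pext_u32(code, 0x55555555u); }
static uint32_t morton2d_y(uint32_t code) { return _pext_u32(code, 0xAAAAAAAAu); }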

View File

@ -1,431 +0,0 @@
/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __BMIINTRIN_H
#define __BMIINTRIN_H
/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
instruction behaves as BSF on non-BMI targets, there is code that expects
to use it as a potentially faster version of BSF. */
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define _tzcnt_u16(a) (__tzcnt_u16((a)))
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
/// bits in the operand.
static __inline__ unsigned short __RELAXED_FN_ATTRS
__tzcnt_u16(unsigned short __X)
{
return __builtin_ia32_tzcnt_u16(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see _mm_tzcnt_32
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
return __builtin_ia32_tzcnt_u32(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns A 32-bit integer containing the number of trailing zero bits in
/// the operand.
/// \see __tzcnt_u32
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
return (int)__builtin_ia32_tzcnt_u32(__X);
}
#define _tzcnt_u32(a) (__tzcnt_u32((a)))
#ifdef __x86_64__
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see _mm_tzcnt_64
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
return __builtin_ia32_tzcnt_u64(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns A 64-bit integer containing the number of trailing zero bits in
/// the operand.
/// \see __tzcnt_u64
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
return (long long)__builtin_ia32_tzcnt_u64(__X);
}
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
#endif /* __x86_64__ */
#undef __RELAXED_FN_ATTRS
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__BMI__)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
#define _andn_u32(a, b) (__andn_u32((a), (b)))
/* _bextr_u32 != __bextr_u32 */
#define _blsi_u32(a) (__blsi_u32((a)))
#define _blsmsk_u32(a) (__blsmsk_u32((a)))
#define _blsr_u32(a) (__blsr_u32((a)))
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned integer containing one of the operands.
/// \param __Y
/// An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__andn_u32(unsigned int __X, unsigned int __Y)
{
return ~__X & __Y;
}
/* AMD-specified, double-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
/// specify the index of the least significant bit. Bits [15:8] specify the
/// number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see _bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_bextr_u32(__X, __Y);
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify the index of the least significant
/// bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
/// An unsigned integer used to specify the number of bits to be extracted.
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/* Intel-specified, single-leading-underscore version of BEXTR2 */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
/// specify the index of the least significant bit. Bits [15:8] specify the
/// number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr2_u32(unsigned int __X, unsigned int __Y) {
return __builtin_ia32_bextr_u32(__X, __Y);
}
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
/// the source operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsi_u32(unsigned int __X)
{
return __X & -__X;
}
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsmsk_u32(unsigned int __X)
{
return __X ^ (__X - 1);
}
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
/// operand.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsr_u32(unsigned int __X)
{
return __X & (__X - 1);
}
#ifdef __x86_64__
#define _andn_u64(a, b) (__andn_u64((a), (b)))
/* _bextr_u64 != __bextr_u64 */
#define _blsi_u64(a) (__blsi_u64((a)))
#define _blsmsk_u64(a) (__blsmsk_u64((a)))
#define _blsr_u64(a) (__blsr_u64((a)))
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ANDN </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing one of the operands.
/// \param __Y
/// An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
return ~__X & __Y;
}
/* AMD-specified, double-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
/// the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see _bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_bextr_u64(__X, __Y);
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify the index of the least significant
/// bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
/// An unsigned integer used to specify the number of bits to be extracted.
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/* Intel-specified, single-leading-underscore version of BEXTR2 */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
/// the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
return __builtin_ia32_bextr_u64(__X, __Y);
}
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSI </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// bits from the source operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsi_u64(unsigned long long __X)
{
return __X & -__X;
}
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
/// \returns An unsigned 64-bit integer containing the newly created mask.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsmsk_u64(unsigned long long __X)
{
return __X ^ (__X - 1);
}
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> BLSR </c> instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing the operand to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// source operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsr_u64(unsigned long long __X)
{
return __X & (__X - 1);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
|| defined(__BMI__) */
#endif /* __BMIINTRIN_H */
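
BLSR pairs naturally with TZCNT for iterating over set bits: TZCNT yields the index of the lowest set bit and BLSR strips it, so the loop runs once per set bit rather than once per bit position. A sketch assuming -mbmi (for_each_set_bit is a hypothetical helper):

#include <immintrin.h>

static void for_each_set_bit(unsigned int mask, void (*fn)(unsigned int)) {
    while (mask) {
        fn(__tzcnt_u32(mask));     /* index of the lowest set bit */
        mask = __blsr_u32(mask);   /* clear that bit and continue */
    }
}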

View File

@ -1,16 +0,0 @@
/*===---- builtins.h - Standard header for extra builtins -----------------===*\
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
/// Some legacy compilers have builtin definitions in a file named builtins.h.
/// This header file has been added to allow compatibility with code that was
/// written for those compilers. Such code may contain an include line for
/// this file, so an empty file with this name is provided to avoid an error.
#ifndef __BUILTINS_H
#define __BUILTINS_H
#endif /* __BUILTINS_H */

View File

@ -1,66 +0,0 @@
/*===------ cet.h -Control-flow Enforcement Technology feature ------------===
 * Add an x86 feature note with the IBT and/or SHSTK bits to the ELF program
 * property if they are enabled. Otherwise, the contents of this header file
 * are unused. This file is mainly designed for assembly source code that
 * wants to enable CET.
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CET_H
#define __CET_H
#ifdef __ASSEMBLER__
#ifndef __CET__
# define _CET_ENDBR
#endif
#ifdef __CET__
# ifdef __LP64__
# if __CET__ & 0x1
# define _CET_ENDBR endbr64
# else
# define _CET_ENDBR
# endif
# else
# if __CET__ & 0x1
# define _CET_ENDBR endbr32
# else
# define _CET_ENDBR
# endif
# endif
# ifdef __LP64__
# define __PROPERTY_ALIGN 3
# else
# define __PROPERTY_ALIGN 2
# endif
.pushsection ".note.gnu.property", "a"
.p2align __PROPERTY_ALIGN
.long 1f - 0f /* name length. */
.long 4f - 1f /* data length. */
/* NT_GNU_PROPERTY_TYPE_0. */
.long 5 /* note type. */
0:
.asciz "GNU" /* vendor name. */
1:
.p2align __PROPERTY_ALIGN
/* GNU_PROPERTY_X86_FEATURE_1_AND. */
.long 0xc0000002 /* pr_type. */
.long 3f - 2f /* pr_datasz. */
2:
/* GNU_PROPERTY_X86_FEATURE_1_XXX. */
.long __CET__
3:
.p2align __PROPERTY_ALIGN
4:
.popsection
#endif
#endif
#endif
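
A sketch of the intended use, with a hypothetical my_func.S run through the C preprocessor: place _CET_ENDBR at every indirect-branch target, and the macro expands to endbr64/endbr32 only when the unit is built with -fcf-protection (which defines __CET__), staying empty otherwise.

/* my_func.S */
#include <cet.h>
    .text
    .globl  my_func
my_func:
    _CET_ENDBR          /* endbr64 under -fcf-protection, empty otherwise */
    ret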

View File

@ -1,115 +0,0 @@
/*===---- cetintrin.h - CET intrinsic --------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CETINTRIN_H
#define __CETINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))
static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
__builtin_ia32_incsspd((unsigned int)__a);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
__builtin_ia32_incsspq(__a);
}
#endif /* __x86_64__ */
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspq(__a);
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspd(__a);
}
#endif /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
unsigned int t;
return __builtin_ia32_rdsspd(t);
#pragma clang diagnostic pop
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
return __builtin_ia32_rdsspq(__a);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
unsigned long long t;
return __builtin_ia32_rdsspq(t);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
return __builtin_ia32_rdsspq(0);
}
#else /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
return __builtin_ia32_rdsspd(0);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
__builtin_ia32_saveprevssp();
}
static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
__builtin_ia32_rstorssp(__p);
}
static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
__builtin_ia32_wrssd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
__builtin_ia32_wrssq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
__builtin_ia32_wrussd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
__builtin_ia32_wrussq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
__builtin_ia32_setssbsy();
}
static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
__builtin_ia32_clrssbsy(__p);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __CETINTRIN_H */
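
A minimal check of the shadow-stack plumbing, assuming x86-64 and -mshstk: because RDSSP behaves as a no-op when shadow stacks are disabled, _get_ssp() reads back the zero it seeds the destination with, so a zero result means CET shadow stacks are not active.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    unsigned long long ssp = _get_ssp();   /* 0 unless the OS enabled SHSTK */
    printf("shadow stack pointer: %#llx\n", ssp);
    return 0;
}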

View File

@ -1,36 +0,0 @@
/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __CLDEMOTEINTRIN_H
#define __CLDEMOTEINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("cldemote")))
/// Hint to hardware that the cache line that contains \p __P should be demoted
/// from the cache closest to the processor core to a level more distant from
/// the processor core.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_cldemote(const void * __P) {
__builtin_ia32_cldemote(__P);
}
#define _mm_cldemote(p) _cldemote(p)
#undef __DEFAULT_FN_ATTRS
#endif
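
A sketch of the producer/consumer pattern this hint targets, assuming -mcldemote (publish_record is a hypothetical name): after the producer finishes a record, demoting its line toward the shared cache can cut the consumer core's read latency. CLDEMOTE is encoded in the NOP space, so the call is harmless on CPUs without the feature.

#include <x86intrin.h>

static void publish_record(const void *record) {
    _cldemote(record);   /* push the just-written line toward the shared cache */
}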

View File

@ -1,36 +0,0 @@
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CLFLUSHOPTINTRIN_H
#define __CLFLUSHOPTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
/// Invalidates all levels of the cache hierarchy and flushes modified data to
/// memory for the cache line specified by the address \a __m.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CLFLUSHOPT instruction.
///
/// \param __m
/// An address within the cache line to flush and invalidate.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflushopt(void const * __m) {
__builtin_ia32_clflushopt(__m);
}
#undef __DEFAULT_FN_ATTRS
#endif
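
A typical use is flushing a freshly written buffer before handing it to a non-coherent device. A sketch assuming -mclflushopt and a 64-byte line size (flush_buffer is a hypothetical name); the trailing SFENCE orders the weakly ordered flushes:

#include <immintrin.h>

static void flush_buffer(const char *buf, unsigned long len) {
    for (unsigned long off = 0; off < len; off += 64)
        _mm_clflushopt(buf + off);
    _mm_sfence();   /* order the flushes before later stores */
}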

View File

@ -1,38 +0,0 @@
/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CLWBINTRIN_H
#define __CLWBINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))
/// Writes back to memory the cache line (if modified) that contains the
/// linear address specified in \a __p from any level of the cache hierarchy in
/// the cache coherence domain.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> CLWB </c> instruction.
///
/// \param __p
/// A pointer to the memory location used to identify the cache line to be
/// written back.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clwb(void const *__p) {
__builtin_ia32_clwb(__p);
}
#undef __DEFAULT_FN_ATTRS
#endif
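
A hedged sketch of a common CLWB pattern (not from the header; the persistence framing and -mclwb are assumptions):

#include <immintrin.h>

/* Write the dirty line back without forcing eviction, then order the
 * write-back; useful when the line will be re-read soon. */
void persist_counter(unsigned long *counter) {
  ++*counter;
  _mm_clwb(counter);
  _mm_sfence();
}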


@@ -1,38 +0,0 @@
/*===----------------------- clzerointrin.h - CLZERO ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __CLZEROINTRIN_H
#define __CLZEROINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("clzero")))
/// Zeroes out the cache line for the address \a __line. This uses a
/// non-temporal store. Calling \c _mm_sfence() afterward might be needed
/// to enforce ordering.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CLZERO instruction.
///
/// \param __line
/// An address within the cache line to zero out.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clzero (void * __line)
{
__builtin_ia32_clzero ((void *)__line);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __CLZEROINTRIN_H */
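
A minimal hedged sketch (not from the header): zero one cache line, assuming 64-byte lines and a line-aligned pointer, compiled with -mclzero:

#include <x86intrin.h>

void wipe_line(void *line64) {
  _mm_clzero(line64);
  _mm_sfence(); /* the non-temporal store may need explicit ordering */
}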


@@ -1,70 +0,0 @@
/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error \
"Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H
#ifndef __CMPCCXADDINTRIN_H
#define __CMPCCXADDINTRIN_H
#ifdef __x86_64__
typedef enum {
_CMPCCX_O, /* Overflow. */
_CMPCCX_NO, /* No overflow. */
_CMPCCX_B, /* Below. */
_CMPCCX_NB, /* Not below. */
_CMPCCX_Z, /* Zero. */
_CMPCCX_NZ, /* Not zero. */
_CMPCCX_BE, /* Below or equal. */
_CMPCCX_NBE, /* Neither below nor equal. */
_CMPCCX_S, /* Sign. */
_CMPCCX_NS, /* No sign. */
_CMPCCX_P, /* Parity. */
_CMPCCX_NP, /* No parity. */
_CMPCCX_L, /* Less. */
_CMPCCX_NL, /* Not less. */
_CMPCCX_LE, /* Less or equal. */
_CMPCCX_NLE, /* Neither less nor equal. */
} _CMPCCX_ENUM;
/// Compares the value at the memory location \a __A with the value of \a __B.
/// If the specified condition \a __D is met, adds the third operand \a __C to
/// the value at \a __A and writes the sum back to \a __A; otherwise the value
/// at \a __A is unchanged. The return value is the original value at \a __A.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CMPCCXADD instructions.
///
/// \param __A
/// A pointer specifying the memory address.
///
/// \param __B
/// An integer operand.
///
/// \param __C
/// An integer operand.
///
/// \param __D
/// The specified condition.
///
/// \returns An integer which is the original value of the first operand.
#define _cmpccxadd_epi32(__A, __B, __C, __D) \
((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C), \
(int)(__D))))
#define _cmpccxadd_epi64(__A, __B, __C, __D) \
((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \
(long long)(__C), (int)(__D))))
#endif // __x86_64__
#endif // __CMPCCXADDINTRIN_H
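
A hedged sketch of the semantics documented above (the helper name bounded_increment and the -mcmpccxadd flag are assumptions):

#include <x86gprintrin.h>

/* If *counter < limit (condition _CMPCCX_L), atomically add 1 to *counter;
 * in either case return the original value of *counter. */
int bounded_increment(int *counter, int limit) {
  return _cmpccxadd_epi32(counter, limit, 1, _CMPCCX_L);
}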


@@ -1,331 +0,0 @@
/*===---- cpuid.h - X86 cpu model detection --------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CPUID_H
#define __CPUID_H
#if !(__x86_64__ || __i386__)
#error this header is for x86 only
#endif
/* Responses to the identification request with %eax set to 0 */
/* AMD: "AuthenticAMD" */
#define signature_AMD_ebx 0x68747541
#define signature_AMD_edx 0x69746e65
#define signature_AMD_ecx 0x444d4163
/* CENTAUR: "CentaurHauls" */
#define signature_CENTAUR_ebx 0x746e6543
#define signature_CENTAUR_edx 0x48727561
#define signature_CENTAUR_ecx 0x736c7561
/* CYRIX: "CyrixInstead" */
#define signature_CYRIX_ebx 0x69727943
#define signature_CYRIX_edx 0x736e4978
#define signature_CYRIX_ecx 0x64616574
/* HYGON: "HygonGenuine" */
#define signature_HYGON_ebx 0x6f677948
#define signature_HYGON_edx 0x6e65476e
#define signature_HYGON_ecx 0x656e6975
/* INTEL: "GenuineIntel" */
#define signature_INTEL_ebx 0x756e6547
#define signature_INTEL_edx 0x49656e69
#define signature_INTEL_ecx 0x6c65746e
/* TM1: "TransmetaCPU" */
#define signature_TM1_ebx 0x6e617254
#define signature_TM1_edx 0x74656d73
#define signature_TM1_ecx 0x55504361
/* TM2: "GenuineTMx86" */
#define signature_TM2_ebx 0x756e6547
#define signature_TM2_edx 0x54656e69
#define signature_TM2_ecx 0x3638784d
/* NSC: "Geode by NSC" */
#define signature_NSC_ebx 0x646f6547
#define signature_NSC_edx 0x79622065
#define signature_NSC_ecx 0x43534e20
/* NEXGEN: "NexGenDriven" */
#define signature_NEXGEN_ebx 0x4778654e
#define signature_NEXGEN_edx 0x72446e65
#define signature_NEXGEN_ecx 0x6e657669
/* RISE: "RiseRiseRise" */
#define signature_RISE_ebx 0x65736952
#define signature_RISE_edx 0x65736952
#define signature_RISE_ecx 0x65736952
/* SIS: "SiS SiS SiS " */
#define signature_SIS_ebx 0x20536953
#define signature_SIS_edx 0x20536953
#define signature_SIS_ecx 0x20536953
/* UMC: "UMC UMC UMC " */
#define signature_UMC_ebx 0x20434d55
#define signature_UMC_edx 0x20434d55
#define signature_UMC_ecx 0x20434d55
/* VIA: "VIA VIA VIA " */
#define signature_VIA_ebx 0x20414956
#define signature_VIA_edx 0x20414956
#define signature_VIA_ecx 0x20414956
/* VORTEX: "Vortex86 SoC" */
#define signature_VORTEX_ebx 0x74726f56
#define signature_VORTEX_edx 0x36387865
#define signature_VORTEX_ecx 0x436f5320
/* Features in %ecx for leaf 1 */
#define bit_SSE3 0x00000001
#define bit_PCLMULQDQ 0x00000002
#define bit_PCLMUL bit_PCLMULQDQ /* for gcc compat */
#define bit_DTES64 0x00000004
#define bit_MONITOR 0x00000008
#define bit_DSCPL 0x00000010
#define bit_VMX 0x00000020
#define bit_SMX 0x00000040
#define bit_EIST 0x00000080
#define bit_TM2 0x00000100
#define bit_SSSE3 0x00000200
#define bit_CNXTID 0x00000400
#define bit_FMA 0x00001000
#define bit_CMPXCHG16B 0x00002000
#define bit_xTPR 0x00004000
#define bit_PDCM 0x00008000
#define bit_PCID 0x00020000
#define bit_DCA 0x00040000
#define bit_SSE41 0x00080000
#define bit_SSE4_1 bit_SSE41 /* for gcc compat */
#define bit_SSE42 0x00100000
#define bit_SSE4_2 bit_SSE42 /* for gcc compat */
#define bit_x2APIC 0x00200000
#define bit_MOVBE 0x00400000
#define bit_POPCNT 0x00800000
#define bit_TSCDeadline 0x01000000
#define bit_AESNI 0x02000000
#define bit_AES bit_AESNI /* for gcc compat */
#define bit_XSAVE 0x04000000
#define bit_OSXSAVE 0x08000000
#define bit_AVX 0x10000000
#define bit_F16C 0x20000000
#define bit_RDRND 0x40000000
/* Features in %edx for leaf 1 */
#define bit_FPU 0x00000001
#define bit_VME 0x00000002
#define bit_DE 0x00000004
#define bit_PSE 0x00000008
#define bit_TSC 0x00000010
#define bit_MSR 0x00000020
#define bit_PAE 0x00000040
#define bit_MCE 0x00000080
#define bit_CX8 0x00000100
#define bit_CMPXCHG8B bit_CX8 /* for gcc compat */
#define bit_APIC 0x00000200
#define bit_SEP 0x00000800
#define bit_MTRR 0x00001000
#define bit_PGE 0x00002000
#define bit_MCA 0x00004000
#define bit_CMOV 0x00008000
#define bit_PAT 0x00010000
#define bit_PSE36 0x00020000
#define bit_PSN 0x00040000
#define bit_CLFSH 0x00080000
#define bit_DS 0x00200000
#define bit_ACPI 0x00400000
#define bit_MMX 0x00800000
#define bit_FXSR 0x01000000
#define bit_FXSAVE bit_FXSR /* for gcc compat */
#define bit_SSE 0x02000000
#define bit_SSE2 0x04000000
#define bit_SS 0x08000000
#define bit_HTT 0x10000000
#define bit_TM 0x20000000
#define bit_PBE 0x80000000
/* Features in %ebx for leaf 7 sub-leaf 0 */
#define bit_FSGSBASE 0x00000001
#define bit_SGX 0x00000004
#define bit_BMI 0x00000008
#define bit_HLE 0x00000010
#define bit_AVX2 0x00000020
#define bit_SMEP 0x00000080
#define bit_BMI2 0x00000100
#define bit_ENH_MOVSB 0x00000200
#define bit_INVPCID 0x00000400
#define bit_RTM 0x00000800
#define bit_MPX 0x00004000
#define bit_AVX512F 0x00010000
#define bit_AVX512DQ 0x00020000
#define bit_RDSEED 0x00040000
#define bit_ADX 0x00080000
#define bit_AVX512IFMA 0x00200000
#define bit_CLFLUSHOPT 0x00800000
#define bit_CLWB 0x01000000
#define bit_AVX512PF 0x04000000
#define bit_AVX512ER 0x08000000
#define bit_AVX512CD 0x10000000
#define bit_SHA 0x20000000
#define bit_AVX512BW 0x40000000
#define bit_AVX512VL 0x80000000
/* Features in %ecx for leaf 7 sub-leaf 0 */
#define bit_PREFTCHWT1 0x00000001
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_WAITPKG 0x00000020
#define bit_AVX512VBMI2 0x00000040
#define bit_SHSTK 0x00000080
#define bit_GFNI 0x00000100
#define bit_VAES 0x00000200
#define bit_VPCLMULQDQ 0x00000400
#define bit_AVX512VNNI 0x00000800
#define bit_AVX512BITALG 0x00001000
#define bit_AVX512VPOPCNTDQ 0x00004000
#define bit_RDPID 0x00400000
#define bit_CLDEMOTE 0x02000000
#define bit_MOVDIRI 0x08000000
#define bit_MOVDIR64B 0x10000000
#define bit_ENQCMD 0x20000000
/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_UINTR 0x00000020
#define bit_SERIALIZE 0x00004000
#define bit_TSXLDTRK 0x00010000
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_RAOINT 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_CMPCCXADD 0x00000080
#define bit_AMXFP16 0x00200000
#define bit_HRESET 0x00400000
#define bit_AVXIFMA 0x00800000
/* Features in %edx for leaf 7 sub-leaf 1 */
#define bit_AVXVNNIINT8 0x00000010
#define bit_AVXNECONVERT 0x00000020
#define bit_PREFETCHI 0x00004000
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
#define bit_XSAVEC 0x00000002
#define bit_XSAVES 0x00000008
/* Features in %eax for leaf 0x14 sub-leaf 0 */
#define bit_PTWRITE 0x00000010
/* Features in %ecx for leaf 0x80000001 */
#define bit_LAHF_LM 0x00000001
#define bit_ABM 0x00000020
#define bit_LZCNT bit_ABM /* for gcc compat */
#define bit_SSE4a 0x00000040
#define bit_PRFCHW 0x00000100
#define bit_XOP 0x00000800
#define bit_LWP 0x00008000
#define bit_FMA4 0x00010000
#define bit_TBM 0x00200000
#define bit_MWAITX 0x20000000
/* Features in %edx for leaf 0x80000001 */
#define bit_MMXEXT 0x00400000
#define bit_LM 0x20000000
#define bit_3DNOWP 0x40000000
#define bit_3DNOW 0x80000000
/* Features in %ebx for leaf 0x80000008 */
#define bit_CLZERO 0x00000001
#define bit_RDPRU 0x00000010
#define bit_WBNOINVD 0x00000200
#if __i386__
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf))
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
__asm(" xchgq %%rbx,%q1\n" \
" cpuid\n" \
" xchgq %%rbx,%q1" \
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf))
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm(" xchgq %%rbx,%q1\n" \
" cpuid\n" \
" xchgq %%rbx,%q1" \
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
unsigned int *__sig)
{
unsigned int __eax, __ebx, __ecx, __edx;
#if __i386__
int __cpuid_supported;
__asm(" pushfl\n"
" popl %%eax\n"
" movl %%eax,%%ecx\n"
" xorl $0x00200000,%%eax\n"
" pushl %%eax\n"
" popfl\n"
" pushfl\n"
" popl %%eax\n"
" movl $0,%0\n"
" cmpl %%eax,%%ecx\n"
" je 1f\n"
" movl $1,%0\n"
"1:"
: "=r" (__cpuid_supported) : : "eax", "ecx");
if (!__cpuid_supported)
return 0;
#endif
__cpuid(__leaf, __eax, __ebx, __ecx, __edx);
if (__sig)
*__sig = __ebx;
return __eax;
}
static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
unsigned int *__ebx, unsigned int *__ecx,
unsigned int *__edx)
{
unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
if (__max_leaf == 0 || __max_leaf < __leaf)
return 0;
__cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
static __inline int __get_cpuid_count (unsigned int __leaf,
unsigned int __subleaf,
unsigned int *__eax, unsigned int *__ebx,
unsigned int *__ecx, unsigned int *__edx)
{
unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
if (__max_leaf == 0 || __max_leaf < __leaf)
return 0;
__cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
#endif /* __CPUID_H */
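
A minimal usage sketch for the helpers above (not part of the header): query leaf 7 sub-leaf 0 and test a feature bit defined earlier in the file:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    printf("AVX2 %s\n", (ebx & bit_AVX2) ? "supported" : "not supported");
  return 0;
}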


@@ -1,100 +0,0 @@
/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CRC32INTRIN_H
#define __CRC32INTRIN_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("crc32")))
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __CRC32INTRIN_H */
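
A hedged sketch of accumulating CRC-32C over a buffer (the 0xFFFFFFFF initial value and final inversion follow the usual CRC-32C convention, which the header itself does not mandate; compile with -msse4.2):

#include <immintrin.h>
#include <stddef.h>

unsigned int crc32c(const unsigned char *buf, size_t len) {
  unsigned int crc = 0xFFFFFFFFu;
  for (size_t i = 0; i < len; ++i)
    crc = _mm_crc32_u8(crc, buf[i]);
  return crc ^ 0xFFFFFFFFu;
}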


@@ -1,116 +0,0 @@
/*===---- algorithm - CUDA wrapper for <algorithm> -------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
#define __CLANG_CUDA_WRAPPERS_ALGORITHM
// This header defines __device__ overloads of std::min/max.
//
// Ideally we'd declare these functions only if we're <= C++11. In C++14,
// these functions are constexpr, and so are implicitly __host__ __device__.
//
// However, the compiler being in C++14 mode does not imply that the standard
// library supports C++14. There is no macro we can test to check that the
// stdlib has constexpr std::min/max. Thus we have to unconditionally define
// our device overloads.
//
// A host+device function cannot be overloaded, and a constexpr function
// implicitly becomes host+device if there's no explicit host or device
// overload preceding it. So the simple thing to do would be to declare our
// device min/max overloads, and then #include_next <algorithm>. This way our
// device overloads would come first, and so if we have a C++14 stdlib, its
// min/max won't become host+device and conflict with our device overloads.
//
// But that also doesn't work. libstdc++ is evil and declares std::min/max in
// an internal header that is included *before* <algorithm>. Thus by the time
// we're inside of this file, std::min/max may already have been declared, and
// thus we can't prevent them from becoming host+device if they're constexpr.
//
// Therefore we perpetrate the following hack: We mark our __device__ overloads
// with __attribute__((enable_if(true, ""))). This causes the signature of the
// function to change without changing anything else about it. (Except that
// overload resolution will prefer it over the __host__ __device__ version
// rather than considering them equally good).
#include_next <algorithm>
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
#pragma push_macro("_CPP14_CONSTEXPR")
#if __cplusplus >= 201402L
#define _CPP14_CONSTEXPR constexpr
#else
#define _CPP14_CONSTEXPR
#endif
template <class __T, class __Cmp>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__a, __b) ? __b : __a;
}
template <class __T>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
}
template <class __T, class __Cmp>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__b, __a) ? __b : __a;
}
template <class __T>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b) {
return __a < __b ? __a : __b;
}
#pragma pop_macro("_CPP14_CONSTEXPR")
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
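
A hedged CUDA sketch of what the wrapper buys (the kernel is an illustration, not part of the wrapper): with the __device__ overloads above, ordinary std::min/std::max calls compile in device code:

#include <algorithm>

__global__ void clamp_kernel(float *x, float lo, float hi) {
  *x = std::min(std::max(*x, lo), hi); // resolves to the overloads above
}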


@@ -1,9 +0,0 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid a compilation error, we
// temporarily unset __noinline__ when we include the affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/shared_ptr_base.h"
#pragma pop_macro("__noinline__")


@@ -1,90 +0,0 @@
/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_CMATH
#define __CLANG_CUDA_WRAPPERS_CMATH
#include_next <cmath>
#if defined(_LIBCPP_STD_VER)
// libc++ will need long double variants of these functions, but CUDA does not
// provide them. We'll provide their declarations, which should allow the
// headers to parse, but would not allow accidental use of them on a GPU.
__attribute__((device)) long double logb(long double);
__attribute__((device)) long double scalbn(long double, int);
namespace std {
// For __constexpr_fmin/fmax we only need device-side overloads before C++14,
// where they are not constexpr.
#if _LIBCPP_STD_VER < 14
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
return __builtin_fmaxf(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
return __builtin_fmax(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
return __builtin_fmaxl(__x, __y);
}
template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
using __result_type = typename __promote<_Tp, _Up>::type;
return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
}
#endif // _LIBCPP_STD_VER < 14
// For the logb/scalbn templates we must always provide device overloads
// because the libc++ implementation uses __builtin_XXX, which gets translated
// into a libcall that we can't handle on GPU. We need to forward those to the
// CUDA-provided implementations.
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
return ::logb(__x);
}
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
return ::scalbn(__x, __exp);
}
} // namespace std
#endif // _LIBCPP_STD_VER
#endif // include guard


@@ -1,90 +0,0 @@
/*===---- complex - CUDA wrapper for <complex> ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
#define __CLANG_CUDA_WRAPPERS_COMPLEX
// Wrapper around <complex> that forces its functions to be __host__
// __device__.
// First, include host-only headers we think are likely to be included by
// <complex>, so that the pragma below only applies to <complex> itself.
#if __cplusplus >= 201103L
#include <type_traits>
#endif
#include <stdexcept>
#include <cmath>
#include <sstream>
// Next, include our <algorithm> wrapper, to ensure that device overloads of
// std::min/max are available.
#include <algorithm>
#pragma clang force_cuda_host_device begin
// When compiling for device, ask libstdc++ to use its own implementations of
// complex functions, rather than calling builtins (which resolve to library
// functions that don't exist when compiling CUDA device code).
//
// This is a little dicey, because it causes libstdc++ to define a different
// set of overloads on host and device.
//
// // Present only when compiling for host.
// __host__ __device__ complex<float> sin(const complex<float>& x) {
// return __builtin_csinf(x);
// }
//
// // Present when compiling for host and for device.
// template <typename T>
// __host__ __device__ complex<T> sin(const complex<T>& x) {
// return complex<T>(sin(x.real()) * cosh(x.imag()),
// cos(x.real()) * sinh(x.imag()));
// }
//
// This is safe because when compiling for device, all function calls in
// __host__ code to sin() will still resolve to *something*, even if they don't
// resolve to the same function as they resolve to when compiling for host. We
// don't care that they don't resolve to the right function because we won't
// codegen this host code when compiling for device.
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#define _GLIBCXX_USE_C99_COMPLEX 0
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
// Work around a compatibility issue with libstdc++ 11.1.0
// https://bugs.llvm.org/show_bug.cgi?id=50383
#pragma push_macro("__failed_assertion")
#if _GLIBCXX_RELEASE == 11
#define __failed_assertion __cuda_failed_assertion
#endif
#include_next <complex>
#pragma pop_macro("__failed_assertion")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma clang force_cuda_host_device end
#endif // include guard


@@ -1,106 +0,0 @@
/*===---- new - CUDA wrapper for <new> -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_NEW
#define __CLANG_CUDA_WRAPPERS_NEW
#include_next <new>
#if !defined(__device__)
// The header has been included too early from the standard C++ library
// and CUDA-specific macros are not available yet.
// Undo the include guard and try again later.
#undef __CLANG_CUDA_WRAPPERS_NEW
#else
#pragma push_macro("CUDA_NOEXCEPT")
#if __cplusplus >= 201103L
#define CUDA_NOEXCEPT noexcept
#else
#define CUDA_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0) {
size = 1;
}
return ::malloc(size);
}
__device__ inline void *operator new(__SIZE_TYPE__ size,
const std::nothrow_t &) CUDA_NOEXCEPT {
return ::operator new(size);
}
__device__ inline void *operator new[](__SIZE_TYPE__ size) {
return ::operator new(size);
}
__device__ inline void *operator new[](__SIZE_TYPE__ size,
const std::nothrow_t &) {
return ::operator new(size);
}
__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
if (ptr) {
::free(ptr);
}
}
__device__ inline void operator delete(void *ptr,
const std::nothrow_t &) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void *ptr,
const std::nothrow_t &) CUDA_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
__device__ inline void operator delete(void *ptr,
__SIZE_TYPE__ size) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void *ptr,
__SIZE_TYPE__ size) CUDA_NOEXCEPT {
::operator delete(ptr);
}
#endif
// Device overrides for placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
#pragma pop_macro("CUDA_NOEXCEPT")
#endif // __device__
#endif // include guard
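
A hedged CUDA sketch (the kernel is an illustration, not part of the wrapper): with the overrides above, plain new/delete work in device code, forwarding to the device-side malloc/free:

#include <new>

__global__ void alloc_kernel(int *out) {
  int *p = new int(42); // the __device__ operator new above -> ::malloc
  *out = *p;
  delete p;             // the __device__ operator delete above -> ::free
}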

File diff suppressed because it is too large


@@ -1,63 +0,0 @@
/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ENQCMDINTRIN_H
#define __ENQCMDINTRIN_H
/* Define the default attributes for the functions in this file */
#define _DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))
/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to the memory pointed to
/// by \a __dst. This intrinsic may only be used in user mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMD </c> instruction.
///
/// \param __dst
/// Pointer to the destination of the enqueue store.
/// \param __src
/// Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
/// returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmd (void *__dst, const void *__src)
{
return __builtin_ia32_enqcmd(__dst, __src);
}
/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to the memory pointed to
/// by \a __dst. This intrinsic may only be used in privileged mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMDS </c> instruction.
///
/// \param __dst
/// Pointer to the destination of the enqueue store.
/// \param __src
/// Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
/// returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmds (void *__dst, const void *__src)
{
return __builtin_ia32_enqcmds(__dst, __src);
}
#undef _DEFAULT_FN_ATTRS
#endif /* __ENQCMDINTRIN_H */
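
A hedged sketch (the portal and descriptor names are assumptions about surrounding driver code; compile with -menqcmd): submit one 64-byte descriptor to a device work queue and report acceptance:

#include <immintrin.h>

int submit_descriptor(void *wq_portal, const void *desc64) {
  /* Per the documentation above: 0 if the command was written, 1 otherwise. */
  return _enqcmd(wq_portal, desc64);
}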


@@ -1,162 +0,0 @@
/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __F16CINTRIN_H
#define __F16CINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
* but that's because icc can emulate these without f16c using a library call.
* Since we don't do that, let's leave these in f16cintrin.h.
*/
/// Converts a 16-bit half-precision float value into a 32-bit float
/// value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 16-bit half-precision float value.
/// \returns The converted 32-bit float value.
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf __r = __builtin_ia32_vcvtph2ps(__v);
return __r[0];
}
/// Converts a 32-bit single-precision float value to a 16-bit
/// half-precision float value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 32-bit single-precision float value to be converted to a 16-bit
/// half-precision float value.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) __extension__ ({ \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); })
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 128-bit vector containing 32-bit float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values. The lower
/// 64 bits are used in the conversion.
/// \returns A 128-bit vector of [4 x float] containing converted float values.
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
/// containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 256-bit vector containing 32-bit single-precision float values to be
/// converted to 16-bit half-precision float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values to be
/// converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
/// single-precision float values.
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __F16CINTRIN_H */
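
A minimal hedged sketch (not from the header): round-trip one float through half precision, using rounding mode 0 (nearest) from the table above; compile with -mf16c:

#include <immintrin.h>

float half_round_trip(float x) {
  unsigned short h = _cvtss_sh(x, 0);
  return _cvtsh_ss(h);
}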


@@ -1,168 +0,0 @@
/*===---- float.h - Characteristics of floating point types ----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_FLOAT_H
#define __CLANG_FLOAT_H
/* If we're on MinGW, fall back to the system's float.h, which might have
* additional definitions provided for Windows.
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
*
* Also fall back on Darwin and AIX to allow additional definitions and
* implementation-defined values.
*/
#if (defined(__APPLE__) || defined(__MINGW32__) || defined(_MSC_VER) || \
defined(_AIX)) && \
__STDC_HOSTED__ && __has_include_next(<float.h>)
* Prior to Apple's 10.7 SDK, the float.h SDK header used to apply an extra level
* of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
* extra indirection.
*/
#ifdef __APPLE__
#define _FLOAT_H_
#endif
# include_next <float.h>
/* Undefine anything that we'll be redefining below. */
# undef FLT_EVAL_METHOD
# undef FLT_ROUNDS
# undef FLT_RADIX
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201103L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
# undef DBL_DIG
# undef LDBL_DIG
# undef FLT_MIN_EXP
# undef DBL_MIN_EXP
# undef LDBL_MIN_EXP
# undef FLT_MIN_10_EXP
# undef DBL_MIN_10_EXP
# undef LDBL_MIN_10_EXP
# undef FLT_MAX_EXP
# undef DBL_MAX_EXP
# undef LDBL_MAX_EXP
# undef FLT_MAX_10_EXP
# undef DBL_MAX_10_EXP
# undef LDBL_MAX_10_EXP
# undef FLT_MAX
# undef DBL_MAX
# undef LDBL_MAX
# undef FLT_EPSILON
# undef DBL_EPSILON
# undef LDBL_EPSILON
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201703L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
# undef FLT_DECIMAL_DIG
# undef DBL_DECIMAL_DIG
# undef LDBL_DECIMAL_DIG
# undef FLT_HAS_SUBNORM
# undef DBL_HAS_SUBNORM
# undef LDBL_HAS_SUBNORM
# endif
#endif
/* Characteristics of floating point types, C99 5.2.4.2.2 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(defined(__cplusplus) && __cplusplus >= 201103L)
#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
#endif
#define FLT_ROUNDS (__builtin_flt_rounds())
#define FLT_RADIX __FLT_RADIX__
#define FLT_MANT_DIG __FLT_MANT_DIG__
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201103L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
#define FLT_DIG __FLT_DIG__
#define DBL_DIG __DBL_DIG__
#define LDBL_DIG __LDBL_DIG__
#define FLT_MIN_EXP __FLT_MIN_EXP__
#define DBL_MIN_EXP __DBL_MIN_EXP__
#define LDBL_MIN_EXP __LDBL_MIN_EXP__
#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
#define FLT_MAX_EXP __FLT_MAX_EXP__
#define DBL_MAX_EXP __DBL_MAX_EXP__
#define LDBL_MAX_EXP __LDBL_MAX_EXP__
#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
#define FLT_MAX __FLT_MAX__
#define DBL_MAX __DBL_MAX__
#define LDBL_MAX __LDBL_MAX__
#define FLT_EPSILON __FLT_EPSILON__
#define DBL_EPSILON __DBL_EPSILON__
#define LDBL_EPSILON __LDBL_EPSILON__
#define FLT_MIN __FLT_MIN__
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
!defined(__STRICT_ANSI__) || \
(defined(__cplusplus) && __cplusplus >= 201703L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
# define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
# define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
# define FLT_HAS_SUBNORM __FLT_HAS_DENORM__
# define DBL_HAS_SUBNORM __DBL_HAS_DENORM__
# define LDBL_HAS_SUBNORM __LDBL_HAS_DENORM__
#endif
#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
# define FLT16_MANT_DIG __FLT16_MANT_DIG__
# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__
# define FLT16_DIG __FLT16_DIG__
# define FLT16_MIN_EXP __FLT16_MIN_EXP__
# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__
# define FLT16_MAX_EXP __FLT16_MAX_EXP__
# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__
# define FLT16_MAX __FLT16_MAX__
# define FLT16_EPSILON __FLT16_EPSILON__
# define FLT16_MIN __FLT16_MIN__
# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__
#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */
#endif /* __CLANG_FLOAT_H */
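
A minimal hedged sketch (not part of the header): print a few of the characteristics defined above:

#include <float.h>
#include <stdio.h>

int main(void) {
  printf("FLT_MANT_DIG=%d FLT_EPSILON=%g\n", FLT_MANT_DIG, FLT_EPSILON);
  printf("DBL_MANT_DIG=%d DBL_MAX=%g\n", DBL_MANT_DIG, DBL_MAX);
  return 0;
}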


@@ -1,218 +0,0 @@
/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __FMA4INTRIN_H
#define __FMA4INTRIN_H
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __FMA4INTRIN_H */
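
A hedged sketch of the maddsub variant (my reading of VFMADDSUBPS: even lanes subtract the addend, odd lanes add it; compile with -mfma4):

#include <x86intrin.h>

__m128 interleaved_addsub(__m128 a, __m128 b, __m128 c) {
  /* result[0] = a0*b0 - c0, result[1] = a1*b1 + c1, and so on. */
  return _mm_maddsub_ps(a, b, c);
}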


@@ -1,780 +0,0 @@
/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __FMAINTRIN_H
#define __FMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
/// Computes a multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
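
A hedged aside (not part of the header; compile with -mfma): the fused form rounds once, so it can differ from a separate multiply and add in the last bits:

#include <immintrin.h>

__m128 fused(__m128 a, __m128 b, __m128 c) {
  return _mm_fmadd_ps(a, b, c);           /* one rounding */
}
__m128 unfused(__m128 a, __m128 b, __m128 c) {
  return _mm_add_ps(_mm_mul_ps(a, b), c); /* two roundings */
}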
/// Computes a multiply-add of 128-bit vectors of [2 x double].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a scalar multiply-add of the single-precision values in the
/// low 32 bits of 128-bit vectors of [4 x float].
/// \code
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
/// Computes a scalar multiply-add of the double-precision values in the
/// low 64 bits of 128-bit vectors of [2 x double].
/// \code
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a scalar multiply-subtract of the single-precision values in
/// the low 32 bits of 128-bit vectors of [4 x float].
/// \code
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a scalar multiply-subtract of the double-precision values in
/// the low 64 bits of 128-bit vectors of [2 x double].
/// \code
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
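/* Editor's sketch (not part of the original header): a classic use of FNMADD
   is one Newton-Raphson step refining a reciprocal estimate, since
   (2 - d*r) = -(d*r) + 2. Illustrative only; requires -mfma. */
#if 0
#include <immintrin.h>
static __m128 rcp_refined_demo(__m128 d)
{
    __m128 r   = _mm_rcp_ps(d);                      /* ~12-bit estimate */
    __m128 two = _mm_set1_ps(2.0f);
    return _mm_mul_ps(r, _mm_fnmadd_ps(d, r, two));  /* r * (2 - d*r) */
}
#endif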
/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a scalar negated multiply-add of the single-precision values in
/// the low 32 bits of 128-bit vectors of [4 x float].
/// \code
/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
}
/// Computes a scalar negated multiply-add of the double-precision values
/// in the low 64 bits of 128-bit vectors of [2 x double].
/// \code
/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
}
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a scalar negated multiply-subtract of the single-precision
/// values in the low 32 bits of 128-bit vectors of [4 x float].
/// \code
/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
}
/// Computes a scalar negated multiply-subtract of the double-precision
/// values in the low 64 bits of 128-bit vectors of [2 x double].
/// \code
/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [4 x float].
/// \code
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [2 x double].
/// \code
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
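/* Editor's sketch (not part of the original header): FMADDSUB maps directly
   onto complex multiplication when one complex number is stored as
   [real, imag] in a [2 x double] vector. Illustrative only; requires -mfma. */
#if 0
#include <immintrin.h>
/* (a.re + i*a.im) * (b.re + i*b.im) */
static __m128d cmul_demo(__m128d a, __m128d b)
{
    __m128d bre = _mm_unpacklo_pd(b, b);                     /* [b.re, b.re] */
    __m128d bim = _mm_unpackhi_pd(b, b);                     /* [b.im, b.im] */
    __m128d t   = _mm_mul_pd(_mm_shuffle_pd(a, a, 1), bim);  /* [a.im*b.im, a.re*b.im] */
    /* low lane: a.re*b.re - a.im*b.im; high lane: a.im*b.re + a.re*b.im */
    return _mm_fmaddsub_pd(a, bre, t);
}
#endif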
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [4 x float].
/// \code
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [2 x double].
/// \code
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a multiply-add of 256-bit vectors of [8 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
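/* Editor's sketch (not part of the original header): the canonical AXPY loop,
   y[i] += a * x[i], one fused multiply-add per 8 float lanes. Assumes n is a
   multiple of 8; illustrative only, requires -mfma -mavx. */
#if 0
#include <immintrin.h>
static void axpy_demo(float a, const float *x, float *y, int n)
{
    __m256 va = _mm256_set1_ps(a);
    for (int i = 0; i < n; i += 8) {
        __m256 vy = _mm256_loadu_ps(y + i);
        vy = _mm256_fmadd_ps(va, _mm256_loadu_ps(x + i), vy);
        _mm256_storeu_ps(y + i, vy);
    }
}
#endif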
/// Computes a multiply-add of 256-bit vectors of [4 x double].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
/// Computes a multiply with alternating add/subtract of 256-bit vectors of
/// [8 x float].
/// \code
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
/// Computes a multiply with alternating add/subtract of 256-bit vectors of
/// [4 x double].
/// \code
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
/// Computes a vector multiply with alternating add/subtract of 256-bit
/// vectors of [8 x float].
/// \code
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
/// Computes a vector multiply with alternating add/subtract of 256-bit
/// vectors of [4 x double].
/// \code
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __FMAINTRIN_H */

View File

@ -1,91 +0,0 @@
/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __FXSRINTRIN_H
#define __FXSRINTRIN_H
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave(void *__p)
{
__builtin_ia32_fxsave(__p);
}
/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor(void *__p)
{
__builtin_ia32_fxrstor(__p);
}
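/* Editor's sketch (not part of the original header): saving and restoring the
   FP/SIMD state around code that may clobber it. The 512-byte buffer must be
   16-byte aligned. Illustrative only; requires -mfxsr. */
#if 0
#include <immintrin.h>
static void fxsr_demo(void)
{
    _Alignas(16) unsigned char state[512];
    _fxsave(state);    /* snapshot x87/MMX/XMM/MXCSR */
    /* ... code that may perturb floating-point state ... */
    _fxrstor(state);   /* restore the snapshot */
}
#endif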
#ifdef __x86_64__
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave64(void *__p)
{
__builtin_ia32_fxsave64(__p);
}
/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor64(void *__p)
{
__builtin_ia32_fxrstor64(__p);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1,194 +0,0 @@
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)))
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
(__v16qi) __B);
}
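/* Editor's sketch (not part of the original header): byte-wise multiplication
   in GF(2^8) with the AES reduction polynomial (x^8 + x^4 + x^3 + x + 1).
   Illustrative only; requires -mgfni. */
#if 0
#include <immintrin.h>
static __m128i gfmul_demo(__m128i a, __m128i b)
{
    return _mm_gf2p8mul_epi8(a, b); /* 16 independent GF(2^8) products */
}
#endif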
#ifdef __AVXINTRIN_H
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)))
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)))
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
(__v32qi) __B);
}
#endif /* __AVXINTRIN_H */
#ifdef __AVX512BWINTRIN_H
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)))
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)))
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
(__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
(__v64qi) __S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
__U, __A, __B);
}
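/* Editor's sketch (not part of the original header): the zero-masked form
   clears result bytes whose mask bit is 0 instead of blending them from a
   source vector. Illustrative only; requires -mgfni -mavx512bw. */
#if 0
#include <immintrin.h>
static __m512i gfmul_maskz_demo(__mmask64 m, __m512i a, __m512i b)
{
    return _mm512_maskz_gf2p8mul_epi8(m, a, b);
}
#endif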
#endif /* __AVX512BWINTRIN_H */
#ifdef __AVX512VLBWINTRIN_H
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I)
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128(__U,
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
(__v16qi) __S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
__U, __A, __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256(__U,
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
(__v32qi) __S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
__U, __A, __B);
}
#endif /* __AVX512VLBWINTRIN_H */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_Y
#undef __DEFAULT_FN_ATTRS_Z
#undef __DEFAULT_FN_ATTRS_Z_MASK
#undef __DEFAULT_FN_ATTRS_VL128
#undef __DEFAULT_FN_ATTRS_VL256
#endif /* __GFNIINTRIN_H */

View File

@ -1,298 +0,0 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_
#define _HEXAGON_CIRC_BREV_INTRINSICS_H_ 1
#include <hexagon_protos.h>
#include <stdint.h>
/* Circular Load */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_D(Word64 dst, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_D(dest,ptr,incr,bufsize,K) \
{ ptr = (int64_t *) HEXAGON_circ_ldd (ptr, &(dest), ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_W(Word32 dst, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_W(dest,ptr,incr,bufsize,K) \
{ ptr = (int *) HEXAGON_circ_ldw (ptr, &(dest), (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_H(Word16 dst, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_H(dest,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_ldh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_UH(UWord16 dst, UWord16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_UH(dest,ptr,incr,bufsize,K) \
{ ptr = (uint16_t *) HEXAGON_circ_lduh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_B(Word8 dst, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_B(dest,ptr,incr,bufsize,K) \
{ ptr = (int8_t *) HEXAGON_circ_ldb (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_UB(dest,ptr,incr,bufsize,K) \
{ ptr = (uint8_t *) HEXAGON_circ_ldub (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
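/* Editor's sketch (not part of the original header): draining a circular
   buffer of 32-bit words with the load-and-update macro above. The buffer
   length (64 words) and K value are illustrative assumptions. */
#if 0
static int circ_sum_demo(int *buf)
{
    int *p = buf, v, sum = 0, i;
    for (i = 0; i < 128; i++) {
        /* Load *p into v, then advance p by one word, wrapping inside the
           64-word circular buffer. */
        Q6_circ_load_update_W(v, p, 1, 64, 0);
        sum += v;
    }
    return sum;
}
#endif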
/* Circular Store */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_D(Word64 *src, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_D(src,ptr,incr,bufsize,K) \
{ ptr = (int64_t *) HEXAGON_circ_std (ptr, src, ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_W(Word32 *src, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_W(src,ptr,incr,bufsize,K) \
{ ptr = (int *) HEXAGON_circ_stw (ptr, src, (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_HL(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_HL(src,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_sth (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_HH(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_HH(src,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_sthhi (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_B(Word8 *src, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_B(src,ptr,incr,bufsize,K) \
{ ptr = (int8_t *) HEXAGON_circ_stb (ptr, src, ((((K)-2)<<24)|(bufsize)), incr); }
/* Bit Reverse Load */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_D(Word64 dst, Word64 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_D(dest,ptr,log2bufsize) \
{ ptr = (int64_t *) HEXAGON_brev_ldd (ptr, &(dest), (1<<(16-((log2bufsize) + 3)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_W(Word32 dst, Word32 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_W(dest,ptr,log2bufsize) \
{ ptr = (int *) HEXAGON_brev_ldw (ptr, &(dest), (1<<(16-((log2bufsize) + 2)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_H(Word16 dst, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_H(dest,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_ldh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_UH(UWord16 dst, UWord16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_UH(dest,ptr,log2bufsize) \
{ ptr = (uint16_t *) HEXAGON_brev_lduh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_B(Word8 dst, Word8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_B(dest,ptr,log2bufsize) \
{ ptr = (int8_t *) HEXAGON_brev_ldb (ptr, &(dest), (1<<(16-((log2bufsize))))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_UB(dest,ptr,log2bufsize) \
{ ptr = (uint8_t *) HEXAGON_brev_ldub (ptr, &(dest), (1<<(16-((log2bufsize))))); }
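/* Editor's sketch (not part of the original header): reading a table in
   bit-reversed order, e.g. to reorder FFT inputs. The table length
   (2^8 words) is an illustrative assumption. */
#if 0
static void bitrev_copy_demo(int *table, int *out)
{
    int *p = table, v, i;
    for (i = 0; i < 256; i++) {
        Q6_bitrev_load_update_W(v, p, 8); /* load *p, advance bit-reversed */
        out[i] = v;
    }
}
#endif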
/* Bit Reverse Store */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_D(Word64 *src, Word64 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_D(src,ptr,log2bufsize) \
{ ptr = (int64_t *) HEXAGON_brev_std (ptr, src, (1<<(16-((log2bufsize) + 3)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_W(Word32 *src, Word32 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_W(src,ptr,log2bufsize) \
{ ptr = (int *) HEXAGON_brev_stw (ptr, src, (1<<(16-((log2bufsize) + 2)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_HL(Word16 *src, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_HL(src,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_sth (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_HH(Word16 *src, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_HH(src,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_sthhi (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_B(Word8 *src, Word8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_B(src,ptr,log2bufsize) \
{ ptr = (int8_t *) HEXAGON_brev_stb (ptr, src, (1<<(16-((log2bufsize))))); }
#define HEXAGON_circ_ldd __builtin_circ_ldd
#define HEXAGON_circ_ldw __builtin_circ_ldw
#define HEXAGON_circ_ldh __builtin_circ_ldh
#define HEXAGON_circ_lduh __builtin_circ_lduh
#define HEXAGON_circ_ldb __builtin_circ_ldb
#define HEXAGON_circ_ldub __builtin_circ_ldub
#define HEXAGON_circ_std __builtin_circ_std
#define HEXAGON_circ_stw __builtin_circ_stw
#define HEXAGON_circ_sth __builtin_circ_sth
#define HEXAGON_circ_sthhi __builtin_circ_sthhi
#define HEXAGON_circ_stb __builtin_circ_stb
#define HEXAGON_brev_ldd __builtin_brev_ldd
#define HEXAGON_brev_ldw __builtin_brev_ldw
#define HEXAGON_brev_ldh __builtin_brev_ldh
#define HEXAGON_brev_lduh __builtin_brev_lduh
#define HEXAGON_brev_ldb __builtin_brev_ldb
#define HEXAGON_brev_ldub __builtin_brev_ldub
#define HEXAGON_brev_std __builtin_brev_std
#define HEXAGON_brev_stw __builtin_brev_stw
#define HEXAGON_brev_sth __builtin_brev_sth
#define HEXAGON_brev_sthhi __builtin_brev_sthhi
#define HEXAGON_brev_stb __builtin_brev_stb
#ifdef __HVX__
/* ==========================================================================
Assembly Syntax: if (Qt) vmem(Rt+#0) = Vs
C Intrinsic Prototype: void Q6_vmaskedstoreq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstoreq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstoreq)
/* ==========================================================================
Assembly Syntax: if (!Qt) vmem(Rt+#0) = Vs
C Intrinsic Prototype: void Q6_vmaskedstorenq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorenq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorenq)
/* ==========================================================================
Assembly Syntax: if (Qt) vmem(Rt+#0):nt = Vs
C Intrinsic Prototype: void Q6_vmaskedstorentq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorentq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentq)
/* ==========================================================================
Assembly Syntax: if (!Qt) vmem(Rt+#0):nt = Vs
C Intrinsic Prototype: void Q6_vmaskedstorentnq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorentnq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentnq)
#endif
#endif /* #ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_ */
#ifdef __NOT_DEFINED__
/*** comment block template ***/
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: ReturnType Intrinsic(ParamType Rs, ParamType Rt)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#endif /*** __NOT_DEFINED__ ***/

File diff suppressed because it is too large

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff