first commit

2026-03-24 07:02:07 +00:00 · 2022-05-23 00:16:32 +04:00
commit d660f2a4ca
24786 changed files with 4428337 additions and 0 deletions
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimd.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimd.c
@@ -0,0 +1,25 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    const float32x4_t ones = vdupq_n_f32(1.0f);
+    const float32x4_t twos = vdupq_n_f32(2.0f);
+    /* single precision: max-number, min-number and rounding, summed into the exit code */
+    int total = (int)vgetq_lane_f32(vmaxnmq_f32(ones, twos), 0);
+    total += (int)vgetq_lane_f32(vminnmq_f32(ones, twos), 0);
+    total += (int)vgetq_lane_f32(vrndq_f32(ones), 0);
+#ifdef __aarch64__
+    {
+        const float64x2_t ones_d = vdupq_n_f64(1.0);
+        const float64x2_t twos_d = vdupq_n_f64(2.0);
+        /* double precision variants of the same three operations */
+        total += (int)vgetq_lane_f64(vmaxnmq_f64(ones_d, twos_d), 0);
+        total += (int)vgetq_lane_f64(vminnmq_f64(ones_d, twos_d), 0);
+        total += (int)vgetq_lane_f64(vrndq_f64(ones_d), 0);
+    }
+#endif
+    return total;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimddp.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimddp.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    const uint8x16_t lhs = vdupq_n_u8((unsigned char)1), rhs = vdupq_n_u8((unsigned char)2);
+    const uint32x4_t acc = vdupq_n_u32(3);  /* accumulator operand of the dot product */
+    int sum = (int)vgetq_lane_u32(vdotq_u32(acc, lhs, rhs), 0);
+#ifdef __aarch64__
+    sum += (int)vgetq_lane_u32(vdotq_laneq_u32(acc, lhs, rhs, 0), 0);  /* by-lane form, AArch64 only */
+#endif
+    return sum;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimdfhm.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimdfhm.c
@@ -0,0 +1,17 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    float16x8_t vhp  = vdupq_n_f16((float16_t)1);
+    float16x4_t vlhp = vdup_n_f16((float16_t)1);
+    float32x4_t vf   = vdupq_n_f32(1.0f);
+    float32x2_t vlf  = vdup_n_f32(1.0f);
+    /* ACLE names the FP16 widening multiply-add/subtract intrinsics with an _f16 suffix */
+    int ret  = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
+        ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);
+
+    return ret;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimdhp.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_asimdhp.c
@@ -0,0 +1,14 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    /* half-precision absolute difference on the 8-lane and 4-lane vector types */
+    const float16x8_t quad = vdupq_n_f16((float16_t)-1);
+    const float16x4_t dbl  = vdup_n_f16((float16_t)-1);
+    const int from_quad = (int)vgetq_lane_f16(vabdq_f16(quad, quad), 0);
+    const int from_dbl  = (int)vget_lane_f16(vabd_f16(dbl, dbl), 0);
+    return from_quad + from_dbl;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __AVX__
+        #error "HOST/ARCH doesn't support AVX"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m256 last = _mm256_loadu_ps((const float*)argv[argc-1]), first = _mm256_loadu_ps((const float*)argv[1]);
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(_mm256_add_ps(last, first)));  /* low lane of the AVX sum */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx2.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx2.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __AVX2__
+        #error "HOST/ARCH doesn't support AVX2"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m256i src = _mm256_loadu_si256((const __m256i*)argv[argc-1]);
+    return _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_abs_epi16(src)));  /* AVX2: 16-bit absolute value */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_clx.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_clx.c
@@ -0,0 +1,22 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __AVX512VNNI__
+        #error "HOST/ARCH doesn't support CascadeLake AVX512 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i input = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
+    /* VNNI: byte dot product of a zero vector with the input, accumulated onto the input */
+    const __m512i dot = _mm512_dpbusd_epi32(input, _mm512_setzero_si512(), input);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(dot));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_cnl.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_cnl.c
@@ -0,0 +1,24 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__AVX512VBMI__) || !defined(__AVX512IFMA__)
+        #error "HOST/ARCH doesn't support CannonLake AVX512 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i input = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
+    /* IFMA */
+    const __m512i ifma = _mm512_madd52hi_epu64(input, input, _mm512_setzero_si512());
+    /* VBMI */
+    const __m512i vbmi = _mm512_permutex2var_epi8(ifma, _mm512_setzero_si512(), ifma);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(vbmi));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_icl.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_icl.c
@@ -0,0 +1,26 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__AVX512VBMI2__) || !defined(__AVX512BITALG__) || !defined(__AVX512VPOPCNTDQ__)
+        #error "HOST/ARCH doesn't support IceLake AVX512 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i input = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
+    /* VBMI2 */
+    const __m512i shifted = _mm512_shrdv_epi64(input, input, _mm512_setzero_si512());
+    /* BITALG */
+    const __m512i byte_counts = _mm512_popcnt_epi8(shifted);
+    /* VPOPCNTDQ */
+    const __m512i qword_counts = _mm512_popcnt_epi64(byte_counts);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(qword_counts));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_knl.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_knl.c
@@ -0,0 +1,25 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__AVX512ER__) || !defined(__AVX512PF__)
+        #error "HOST/ARCH doesn't support Knights Landing AVX512 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    int base[128] = {0};  /* zeroed: the prefetch below never writes it, yet base[0] is returned */
+    __m512d ad = _mm512_loadu_pd((const __m512d*)argv[argc-1]);
+    /* ER */
+    __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(ad));
+    /* PF */
+    _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
+    return base[0];
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_knm.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_knm.c
@@ -0,0 +1,30 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__AVX5124FMAPS__) || !defined(__AVX5124VNNIW__) || !defined(__AVX512VPOPCNTDQ__)
+        #error "HOST/ARCH doesn't support Knights Mill AVX512 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i ints = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
+    const __m512 floats = _mm512_loadu_ps((const __m512*)argv[argc-2]);
+
+    /* 4FMAPS */
+    const __m512 fma4 = _mm512_4fmadd_ps(floats, floats, floats, floats, floats, NULL);
+    /* 4VNNIW */
+    const __m512i vnni4 = _mm512_4dpwssd_epi32(ints, ints, ints, ints, ints, NULL);
+    /* VPOPCNTDQ */
+    const __m512i counts = _mm512_popcnt_epi64(vnni4);
+
+    /* combine the float and integer results so both feed the return value */
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_add_epi32(counts, _mm512_castps_si512(fma4))));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_skx.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512_skx.c
@@ -0,0 +1,26 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__AVX512VL__) || !defined(__AVX512BW__) || !defined(__AVX512DQ__)
+        #error "HOST/ARCH doesn't support SkyLake AVX512 features"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i abs32 = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
+    /* VL */
+    const __m256i upper_abs64 = _mm256_abs_epi64(_mm512_extracti64x4_epi64(abs32, 1));
+    /* DQ */
+    const __m512i widened = _mm512_broadcast_i32x8(upper_abs64);
+    /* BW */
+    const __m512i abs16 = _mm512_abs_epi16(widened);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(abs16));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512cd.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512cd.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __AVX512CD__
+        #error "HOST/ARCH doesn't support AVX512CD"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i src = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_lzcnt_epi32(src)));  /* CD: leading-zero count */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512f.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_avx512f.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __AVX512F__
+        #error "HOST/ARCH doesn't support AVX512F"
+    #endif
+#endif
+
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m512i src = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
+    return _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_abs_epi32(src)));  /* F: 32-bit absolute value */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_f16c.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_f16c.c
@@ -0,0 +1,22 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __F16C__
+        #error "HOST/ARCH doesn't support F16C"
+    #endif
+#endif
+
+#include <emmintrin.h>
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m128 narrow = _mm_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-1]));     /* half -> float, 128-bit result */
+    const __m256 wide   = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-2]));  /* half -> float, 256-bit result */
+    return (int)(_mm_cvtss_f32(narrow) + _mm_cvtss_f32(_mm256_castps256_ps128(wide)));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_fma3.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_fma3.c
@@ -0,0 +1,22 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__FMA__) && !defined(__AVX2__)
+        #error "HOST/ARCH doesn't support FMA3"
+    #endif
+#endif
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+
+int main(int argc, char **argv)
+{
+    const __m256 x = _mm256_loadu_ps((const float*)argv[argc-1]);
+    const __m256 fused = _mm256_fmadd_ps(x, x, x);  /* FMA3 multiply-add with x as every operand */
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(fused));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_fma4.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_fma4.c
@@ -0,0 +1,13 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+    #include <ammintrin.h>
+#else
+    #include <x86intrin.h>
+#endif
+
+int main(int argc, char **argv)
+{
+    const __m256 x = _mm256_loadu_ps((const float*)argv[argc-1]);
+    const __m256 fused = _mm256_macc_ps(x, x, x);  /* FMA4 multiply-accumulate with x as every operand */
+    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(fused));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_neon.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_neon.c
@@ -0,0 +1,15 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    const float32x4_t ones = vdupq_n_f32(1.0f), twos = vdupq_n_f32(2.0f);
+    int product = (int)vgetq_lane_f32(vmulq_f32(ones, twos), 0);  /* single-precision multiply */
+#ifdef __aarch64__
+    const float64x2_t ones_d = vdupq_n_f64(1.0), twos_d = vdupq_n_f64(2.0);
+    product += (int)vgetq_lane_f64(vmulq_f64(ones_d, twos_d), 0);  /* double-precision multiply */
+#endif
+    return product;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_neon_fp16.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_neon_fp16.c
@@ -0,0 +1,11 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    const short zero_bits[8] = {0};  /* eight zeroed 16-bit slots used as the load source */
+    const float16x4_t halves = (float16x4_t)vld1_s16((const short*)zero_bits);
+    return (int)vgetq_lane_f32(vcvt_f32_f16(halves), 0);
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_neon_vfpv4.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -0,0 +1,19 @@
+#ifdef _MSC_VER
+    #include <Intrin.h>
+#endif
+#include <arm_neon.h>
+
+int main(void)
+{
+    /* fused multiply-add on single-precision lanes: addend 1, factors 2 and 3 */
+    const float32x4_t addend = vdupq_n_f32(1.0f);
+    const float32x4_t lhs = vdupq_n_f32(2.0f), rhs = vdupq_n_f32(3.0f);
+    int total = (int)vgetq_lane_f32(vfmaq_f32(addend, lhs, rhs), 0);
+#ifdef __aarch64__
+    /* same operation on double-precision lanes */
+    const float64x2_t addend_d = vdupq_n_f64(1.0);
+    const float64x2_t lhs_d = vdupq_n_f64(2.0), rhs_d = vdupq_n_f64(3.0);
+    total += (int)vgetq_lane_f64(vfmaq_f64(addend_d, lhs_d, rhs_d), 0);
+#endif
+    return total;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_popcnt.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_popcnt.c
@@ -0,0 +1,32 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #if !defined(__SSE4_2__) && !defined(__POPCNT__)
+        #error "HOST/ARCH doesn't support POPCNT"
+    #endif
+#endif
+
+#ifdef _MSC_VER
+    #include <nmmintrin.h>
+#else
+    #include <popcntintrin.h>
+#endif
+
+int main(int argc, char **argv)
+{
+    // Operands are read from argv to make sure popcnt instructions are
+    // actually generated and get tested against the assembler.
+    unsigned long long wide = *((unsigned long long*)argv[argc-1]);
+    unsigned int narrow = *((unsigned int*)argv[argc-2]);
+
+#if defined(_M_X64) || defined(__x86_64__)
+    wide = _mm_popcnt_u64(wide);  // the 64-bit form is only probed on x86-64 targets
+#endif
+    narrow = _mm_popcnt_u32(narrow);
+    return (int)wide + narrow;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __SSE__
+        #error "HOST/ARCH doesn't support SSE"
+    #endif
+#endif
+
+#include <xmmintrin.h>
+
+int main(void)
+{
+    const __m128 zero = _mm_setzero_ps();
+    return (int)_mm_cvtss_f32(_mm_add_ps(zero, zero));  /* SSE: packed single-precision add */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse2.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse2.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __SSE2__
+        #error "HOST/ARCH doesn't support SSE2"
+    #endif
+#endif
+
+#include <emmintrin.h>
+
+int main(void)
+{
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_cvtsi128_si32(_mm_add_epi16(zero, zero));  /* SSE2: packed 16-bit integer add */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse3.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse3.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __SSE3__
+        #error "HOST/ARCH doesn't support SSE3"
+    #endif
+#endif
+
+#include <pmmintrin.h>
+
+int main(void)
+{
+    const __m128 zero = _mm_setzero_ps();
+    return (int)_mm_cvtss_f32(_mm_hadd_ps(zero, zero));  /* SSE3: horizontal single-precision add */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse41.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse41.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __SSE4_1__
+        #error "HOST/ARCH doesn't support SSE41"
+    #endif
+#endif
+
+#include <smmintrin.h>
+
+int main(void)
+{
+    /* SSE4.1: packed floor of a zero vector; the low lane becomes the exit code */
+    return (int)_mm_cvtss_f32(_mm_floor_ps(_mm_setzero_ps()));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse42.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_sse42.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __SSE4_2__
+        #error "HOST/ARCH doesn't support SSE42"
+    #endif
+#endif
+
+#include <nmmintrin.h>
+
+int main(void)
+{
+    /* 64-bit greater-than compare (pcmpgtq) is SSE4.2, so the probe exercises the feature it names */
+    return _mm_cvtsi128_si32(_mm_cmpgt_epi64(_mm_setzero_si128(), _mm_setzero_si128()));
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_ssse3.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_ssse3.c
@@ -0,0 +1,20 @@
+#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
+    /*
+     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
+     * whether or not the build options for those features are specified.
+     * Therefore, we must test #definitions of CPU features when option native/host
+     * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
+     * the test will be broken and leads to enable all possible features.
+     */
+    #ifndef __SSSE3__
+        #error "HOST/ARCH doesn't support SSSE3"
+    #endif
+#endif
+
+#include <tmmintrin.h>
+
+int main(void)
+{
+    const __m128i zero = _mm_setzero_si128();
+    return (int)_mm_cvtsi128_si32(_mm_hadd_epi16(zero, zero));  /* SSSE3: horizontal 16-bit add */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_vsx.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_vsx.c
@@ -0,0 +1,21 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+    #define vsx_ld  vec_vsx_ld
+    #define vsx_st  vec_vsx_st
+#else
+    #define vsx_ld  vec_xl
+    #define vsx_st  vec_xst
+#endif
+
+int main(void)
+{
+    unsigned int src[4] = {0, 0, 0, 0}, dst[4];
+    /* round-trip four words through a vector value using the vsx_ld / vsx_st macros */
+    __vector unsigned int loaded = vsx_ld(0, src);
+    vsx_st(loaded, 0, dst);
+    return dst[0];
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_vsx2.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_vsx2.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned long long v_uint64x2;
+
+int main(void)
+{
+    v_uint64x2 zeros = (v_uint64x2){0, 0};
+    v_uint64x2 equal_mask = (v_uint64x2)vec_cmpeq(zeros, zeros);  /* compare on 64-bit lanes */
+    return (int)vec_extract(equal_mask, 0);
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_vsx3.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_vsx3.c
@@ -0,0 +1,13 @@
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+typedef __vector unsigned int v_uint32x4;
+
+int main(void)
+{
+    v_uint32x4 zeros = (v_uint32x4){0, 0, 0, 0};
+    v_uint32x4 abs_diff = vec_absd(zeros, zeros);  /* absolute difference of unsigned words */
+    return (int)vec_extract(abs_diff, 0);
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/cpu_xop.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/cpu_xop.c
@@ -0,0 +1,12 @@
+#include <immintrin.h>
+#ifdef _MSC_VER
+    #include <ammintrin.h>
+#else
+    #include <x86intrin.h>
+#endif
+
+int main(void)
+{
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_cvtsi128_si32(_mm_comge_epu32(zero, zero));  /* XOP: unsigned 32-bit >= compare */
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/extra_avx512bw_mask.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/extra_avx512bw_mask.c
@@ -0,0 +1,18 @@
+#include <immintrin.h>
+/**
+ * Test BW mask operations due to:
+ *  - MSVC has supported it since vs2019 see,
+ *    https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
+ *  - Clang >= v8.0
+ *  - GCC >= v7.1
+ */
+int main(void)
+{
+    const __mmask64 equal = _mm512_cmpeq_epi8_mask(_mm512_set1_epi8((char)1), _mm512_set1_epi8((char)1));
+    const __mmask64 ored = _kor_mask64(equal, equal);                      /* mask OR */
+    const __mmask64 xored = _kxor_mask64(ored, ored);                      /* mask XOR */
+    const __mmask64 round_trip = _cvtu64_mask64(_cvtmask64_u64(xored));    /* mask <-> integer conversion */
+    const __mmask64 unpacked_d = _mm512_kunpackd(round_trip, round_trip);  /* unpack 32-bit halves */
+    const __mmask64 unpacked_w = (__mmask64)_mm512_kunpackw((__mmask32)unpacked_d, (__mmask32)unpacked_d);
+    return (int)_cvtmask64_u64(unpacked_w);
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/extra_avx512dq_mask.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/extra_avx512dq_mask.c
@@ -0,0 +1,16 @@
+#include <immintrin.h>
+/**
+ * Test DQ mask operations due to:
+ *  - MSVC has supported it since vs2019 see,
+ *    https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
+ *  - Clang >= v8.0
+ *  - GCC >= v7.1
+ */
+int main(void)
+{
+    const __mmask8 equal = _mm512_cmpeq_epi64_mask(_mm512_set1_epi64(1), _mm512_set1_epi64(1));
+    const __mmask8 ored = _kor_mask8(equal, equal);                     /* mask OR */
+    const __mmask8 xored = _kxor_mask8(ored, ored);                     /* mask XOR */
+    const __mmask8 round_trip = _cvtu32_mask8(_cvtmask8_u32(xored));    /* mask <-> integer conversion */
+    return (int)_cvtmask8_u32(round_trip);
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/extra_avx512f_reduce.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/extra_avx512f_reduce.c
@@ -0,0 +1,41 @@
+#include <immintrin.h>
+/**
+ * The following intrinsics don't have direct native support but compilers
+ * tend to emulate them.
+ * They're usually supported by gcc >= 7.1, clang >= 4 and icc >= 19
+ */
+int main(void)
+{
+    const __m512  ones_f32 = _mm512_set1_ps(1.0f);
+    const __m512d ones_f64 = _mm512_set1_pd(1.0);
+    const __m512i ones_int = _mm512_set1_epi64(1);
+    /* reduce-add */
+    float  acc_f32 = _mm512_reduce_add_ps(ones_f32);
+    double acc_f64 = _mm512_reduce_add_pd(ones_f64);
+    int    acc_int = (int)_mm512_reduce_add_epi64(ones_int);
+    acc_int += (int)_mm512_reduce_add_epi32(ones_int);
+    /* reduce-mul */
+    acc_f32 += _mm512_reduce_mul_ps(ones_f32);
+    acc_f64 += _mm512_reduce_mul_pd(ones_f64);
+    acc_int += (int)_mm512_reduce_mul_epi64(ones_int);
+    acc_int += (int)_mm512_reduce_mul_epi32(ones_int);
+    /* reduce-min */
+    acc_f32 += _mm512_reduce_min_ps(ones_f32);
+    acc_f64 += _mm512_reduce_min_pd(ones_f64);
+    acc_int += (int)_mm512_reduce_min_epi32(ones_int);
+    acc_int += (int)_mm512_reduce_min_epu32(ones_int);
+    acc_int += (int)_mm512_reduce_min_epi64(ones_int);
+    /* reduce-max */
+    acc_f32 += _mm512_reduce_max_ps(ones_f32);
+    acc_f64 += _mm512_reduce_max_pd(ones_f64);
+    acc_int += (int)_mm512_reduce_max_epi32(ones_int);
+    acc_int += (int)_mm512_reduce_max_epu32(ones_int);
+    acc_int += (int)_mm512_reduce_max_epi64(ones_int);
+    /* reduce-and */
+    acc_int += (int)_mm512_reduce_and_epi32(ones_int);
+    acc_int += (int)_mm512_reduce_and_epi64(ones_int);
+    /* reduce-or */
+    acc_int += (int)_mm512_reduce_or_epi32(ones_int);
+    acc_int += (int)_mm512_reduce_or_epi64(ones_int);
+    return (int)acc_f32 + (int)acc_f64 + acc_int;
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/extra_vsx_asm.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/extra_vsx_asm.c
@@ -0,0 +1,36 @@
+/**
+ * Testing ASM VSX register number fixer '%x<n>'
+ *
+ * Old versions of Clang don't support %x<n> in the inline asm template,
+ * which fixes register number when using any of the register constraints wa, wd, wf.
+ *
+ * xref:
+ * - https://bugs.llvm.org/show_bug.cgi?id=31837
+ * - https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
+ */
+#ifndef __VSX__
+    #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+    #define vsx_ld  vec_vsx_ld
+    #define vsx_st  vec_vsx_st
+#else
+    #define vsx_ld  vec_xl
+    #define vsx_st  vec_xst
+#endif
+
+int main(void)
+{
+    float z4[] = {0, 0, 0, 0};
+    signed int zout[] = {0, 0, 0, 0};
+
+    __vector float vz4 = vsx_ld(0, z4);
+    __vector signed int asm_ret = vsx_ld(0, zout);
+    /* float -> signed word conversion: the integer vector is the output, so the asm result is used */
+    __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (asm_ret) : "wa" (vz4));
+
+    vsx_st(asm_ret, 0, zout);
+    return zout[0];
+}
--- a/.venv/Lib/site-packages/numpy/distutils/checks/test_flags.c
+++ b/.venv/Lib/site-packages/numpy/distutils/checks/test_flags.c
@@ -0,0 +1 @@
+int test_flags;  /* NOTE(review): presumably a placeholder object for compiler-flag tests (inferred from the file name) - confirm */