From 118980e02e59ff31871df59dce257075394f3533 Mon Sep 17 00:00:00 2001
From: Aaditya Dhruv
Date: Sun, 25 Jan 2026 15:10:37 -0600
Subject: wip

---
 include/cglm/simd/x86.h | 365 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 365 insertions(+)
 create mode 100644 include/cglm/simd/x86.h

diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h
new file mode 100644
index 0000000..2410d0f
--- /dev/null
+++ b/include/cglm/simd/x86.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_simd_x86_h
+#define cglm_simd_x86_h
+#include "intrin.h"
+#ifdef CGLM_SIMD_x86
+
+#ifdef CGLM_ALL_UNALIGNED
+# define glmm_load(p) _mm_loadu_ps(p)
+# define glmm_store(p, a) _mm_storeu_ps(p, a)
+#else
+# define glmm_load(p) _mm_load_ps(p)
+# define glmm_store(p, a) _mm_store_ps(p, a)
+#endif
+
+#define glmm_128 __m128
+
+#ifdef __AVX__
+# define glmm_shuff1(xmm, z, y, x, w) \
+    _mm_permute_ps((xmm), _MM_SHUFFLE(z, y, x, w))
+#else
+# if !defined(CGLM_NO_INT_DOMAIN) && defined(__SSE2__)
+# define glmm_shuff1(xmm, z, y, x, w) \
+    _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), \
+                                       _MM_SHUFFLE(z, y, x, w)))
+# else
+# define glmm_shuff1(xmm, z, y, x, w) \
+    _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
+# endif
+#endif
+
+#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
+
+#ifdef __AVX__
+# define glmm_set1(x) _mm_broadcast_ss(&x)
+# define glmm_set1_ptr(x) _mm_broadcast_ss(x)
+# define glmm_set1_rval(x) _mm_set1_ps(x)
+# ifdef __AVX2__
+# define glmm_splat_x(x) _mm_broadcastss_ps(x)
+# else
+# define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0))
+# endif
+# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1))
+# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2))
+# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3))
+#else
+# define glmm_set1(x) _mm_set1_ps(x)
+# define glmm_set1_ptr(x) _mm_set1_ps(*x)
+# define glmm_set1_rval(x) _mm_set1_ps(x)
+
+# define glmm_splat_x(x) glmm_splat(x, 0)
+# define glmm_splat_y(x) glmm_splat(x, 1)
+# define glmm_splat_z(x) glmm_splat(x, 2)
+# define glmm_splat_w(x) glmm_splat(x, 3)
+#endif
+
+#ifdef __AVX__
+# ifdef CGLM_ALL_UNALIGNED
+# define glmm_load256(p) _mm256_loadu_ps(p)
+# define glmm_store256(p, a) _mm256_storeu_ps(p, a)
+# else
+# define glmm_load256(p) _mm256_load_ps(p)
+# define glmm_store256(p, a) _mm256_store_ps(p, a)
+# endif
+#endif
+
+/* Note that `0x80000000` corresponds to `INT_MIN` for a 32-bit int. */
+
+#if defined(__SSE2__)
+# define GLMM_NEGZEROf ((int)0x80000000) /* 0x80000000 ---> -0.0f */
+# define GLMM_POSZEROf ((int)0x00000000) /* 0x00000000 ---> +0.0f */
+#else
+# ifdef CGLM_FAST_MATH
+   union { int i; float f; } static GLMM_NEGZEROf_TU = { .i = (int)0x80000000 };
+#  define GLMM_NEGZEROf GLMM_NEGZEROf_TU.f
+#  define GLMM_POSZEROf 0.0f
+# else
+#  define GLMM_NEGZEROf -0.0f
+#  define GLMM_POSZEROf 0.0f
+# endif
+#endif
+
+#if defined(__SSE2__)
+# define GLMM__SIGNMASKf(X, Y, Z, W) \
+    _mm_castsi128_ps(_mm_set_epi32(X, Y, Z, W))
+    /* _mm_set_ps(X, Y, Z, W); */
+#else
+# define GLMM__SIGNMASKf(X, Y, Z, W) _mm_set_ps(X, Y, Z, W)
+#endif
+
+#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(GLMM_POSZEROf, GLMM_NEGZEROf, GLMM_POSZEROf, GLMM_NEGZEROf)
+#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, GLMM_POSZEROf, GLMM_NEGZEROf, GLMM_POSZEROf)
+#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, GLMM_POSZEROf, GLMM_POSZEROf, GLMM_NEGZEROf)
+
+/* fast math prevents -0.0f from working */
+#if defined(__SSE2__)
+# define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(GLMM_NEGZEROf)) /* _mm_set1_ps(-0.0f) */
+#else
+# define glmm_float32x4_SIGNMASK_NEG glmm_set1(GLMM_NEGZEROf)
+#endif
+
+#define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf))
+
+static inline
+__m128
+glmm_abs(__m128 x) {
+  return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x);
+}
+
+static inline __m128 glmm_min(__m128 a, __m128 b) { return _mm_min_ps(a, b); }
+static inline __m128 glmm_max(__m128 a, __m128 b) { return _mm_max_ps(a, b); }
+
+static inline
+__m128
+glmm_vhadd(__m128 v) {
+  __m128 x0;
+  x0 = _mm_add_ps(v, glmm_shuff1(v, 0, 1, 2, 3));
+  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
+  return x0;
+}
+
+static inline
+__m128
+glmm_vhadds(__m128 v) {
+#if defined(__SSE3__)
+  __m128 shuf, sums;
+  shuf = _mm_movehdup_ps(v);
+  sums = _mm_add_ps(v, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return sums;
+#else
+  __m128 shuf, sums;
+  shuf = glmm_shuff1(v, 2, 3, 0, 1);
+  sums = _mm_add_ps(v, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return sums;
+#endif
+}
+
+static inline
+float
+glmm_hadd(__m128 v) {
+  return _mm_cvtss_f32(glmm_vhadds(v));
+}
+
+static inline
+__m128
+glmm_vhmin(__m128 v) {
+  __m128 x0, x1, x2;
+  x0 = _mm_movehl_ps(v, v);  /* [2, 3, 2, 3] */
+  x1 = _mm_min_ps(x0, v);    /* [0|2, 1|3, 2|2, 3|3] */
+  x2 = glmm_splat(x1, 1);    /* [1|3, 1|3, 1|3, 1|3] */
+  return _mm_min_ss(x1, x2);
+}
+
+static inline
+float
+glmm_hmin(__m128 v) {
+  return _mm_cvtss_f32(glmm_vhmin(v));
+}
+
+static inline
+__m128
+glmm_vhmax(__m128 v) {
+  __m128 x0, x1, x2;
+  x0 = _mm_movehl_ps(v, v);  /* [2, 3, 2, 3] */
+  x1 = _mm_max_ps(x0, v);    /* [0|2, 1|3, 2|2, 3|3] */
+  x2 = glmm_splat(x1, 1);    /* [1|3, 1|3, 1|3, 1|3] */
+  return _mm_max_ss(x1, x2);
+}
+
+static inline
+float
+glmm_hmax(__m128 v) {
+  return _mm_cvtss_f32(glmm_vhmax(v));
+}
+
+static inline
+__m128
+glmm_vdots(__m128 a, __m128 b) {
+#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
+  return _mm_dp_ps(a, b, 0xFF);
+#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
+  __m128 x0, x1;
+  x0 = _mm_mul_ps(a, b);
+  x1 = _mm_hadd_ps(x0, x0);
+  return _mm_hadd_ps(x1, x1);
+#else
+  return glmm_vhadds(_mm_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m128
+glmm_vdot(__m128 a, __m128 b) {
+#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
+  return _mm_dp_ps(a, b, 0xFF);
+#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
+  __m128 x0, x1;
+  x0 = _mm_mul_ps(a, b);
+  x1 = _mm_hadd_ps(x0, x0);
+  return _mm_hadd_ps(x1, x1);
+#else
+  __m128 x0;
+  x0 = _mm_mul_ps(a, b);
+  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
+  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
+#endif
+}
+
+static inline
+float
+glmm_dot(__m128 a, __m128 b) {
+  return _mm_cvtss_f32(glmm_vdots(a, b));
+}
+
+static inline
+float
+glmm_norm(__m128 a) {
+  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a))));
+}
+
+static inline
+float
+glmm_norm2(__m128 a) {
+  return _mm_cvtss_f32(glmm_vhadds(_mm_mul_ps(a, a)));
+}
+
+static inline
+float
+glmm_norm_one(__m128 a) {
+  return _mm_cvtss_f32(glmm_vhadds(glmm_abs(a)));
+}
+
+static inline
+float
+glmm_norm_inf(__m128 a) {
+  return _mm_cvtss_f32(glmm_vhmax(glmm_abs(a)));
+}
+
+#if defined(__SSE2__)
+static inline
+__m128
+glmm_load3(float v[3]) {
+  __m128i xy;
+  __m128  z;
+
+  xy = _mm_loadl_epi64(CGLM_CASTPTR_ASSUME_ALIGNED(v, const __m128i));
+  z  = _mm_load_ss(&v[2]);
+
+  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
+}
+
+static inline
+void
+glmm_store3(float v[3], __m128 vx) {
+  _mm_storel_pi(CGLM_CASTPTR_ASSUME_ALIGNED(v, __m64), vx);
+  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
+}
+#endif
+
+static inline
+__m128
+glmm_div(__m128 a, __m128 b) {
+  return _mm_div_ps(a, b);
+}
+
+/* enable FMA macro for MSVC? */
+#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
+# define __FMA__ 1
+#endif
+
+static inline
+__m128
+glmm_fmadd(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+  return _mm_fmadd_ps(a, b, c);
+#else
+  return _mm_add_ps(c, _mm_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m128
+glmm_fnmadd(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+  return _mm_fnmadd_ps(a, b, c);
+#else
+  return _mm_sub_ps(c, _mm_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m128
+glmm_fmsub(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+  return _mm_fmsub_ps(a, b, c);
+#else
+  return _mm_sub_ps(_mm_mul_ps(a, b), c);
+#endif
+}
+
+static inline
+__m128
+glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+  return _mm_fnmsub_ps(a, b, c);
+#else
+  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c),
+                    glmm_float32x4_SIGNMASK_NEG);
+#endif
+}
+
+#if defined(__AVX__)
+static inline
+__m256
+glmm256_fmadd(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+  return _mm256_fmadd_ps(a, b, c);
+#else
+  return _mm256_add_ps(c, _mm256_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m256
+glmm256_fnmadd(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+  return _mm256_fnmadd_ps(a, b, c);
+#else
+  return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m256
+glmm256_fmsub(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+  return _mm256_fmsub_ps(a, b, c);
+#else
+  return _mm256_sub_ps(_mm256_mul_ps(a, b), c);
+#endif
+}
+
+static inline
+__m256
+glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+  return _mm256_fnmsub_ps(a, b, c);
+#else
+  return _mm256_xor_ps(_mm256_add_ps(_mm256_mul_ps(a, b), c),
+                       glmm_float32x8_SIGNMASK_NEG);
+#endif
+}
+#endif
+
+#endif
+#endif /* cglm_simd_x86_h */
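
Below is a minimal usage sketch of the helpers this patch adds, for quick reference. It assumes an SSE2-capable x86 build with cglm installed, so that <cglm/cglm.h> pulls in cglm/simd/x86.h and defines CGLM_ALIGN (for example, something like `cc -msse2 example.c`); the file name and variable names are illustrative only.

/* example.c -- hypothetical usage of the glmm_* helpers (not part of the patch) */
#include <stdio.h>
#include <cglm/cglm.h>   /* brings in cglm/simd/x86.h on x86 SSE builds */

int main(void) {
  /* glmm_load/glmm_store map to aligned loads/stores unless
     CGLM_ALL_UNALIGNED is defined, so keep the arrays 16-byte aligned */
  CGLM_ALIGN(16) float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  CGLM_ALIGN(16) float b[4] = {4.0f, 3.0f, 2.0f, 1.0f};
  CGLM_ALIGN(16) float r[4];

  __m128 va = glmm_load(a);
  __m128 vb = glmm_load(b);

  /* horizontal reductions */
  printf("dot  = %f\n", glmm_dot(va, vb)); /* 1*4 + 2*3 + 3*2 + 4*1 = 20 */
  printf("norm = %f\n", glmm_norm(va));    /* sqrt(30) ~= 5.477 */
  printf("hadd = %f\n", glmm_hadd(va));    /* 1 + 2 + 3 + 4 = 10 */
  printf("hmax = %f\n", glmm_hmax(va));    /* 4 */

  /* r = a * b + a, fused when __FMA__ is available, mul+add otherwise */
  glmm_store(r, glmm_fmadd(va, vb, va));
  printf("fmadd[0] = %f\n", r[0]);         /* 1*4 + 1 = 5 */
  return 0;
}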
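
The non-FMA fallbacks rely on simple sign identities: fnmadd(a, b, c) = c - a*b, and fnmsub(a, b, c) = -(a*b) - c, which is exactly a*b + c with its sign bit flipped; that is why glmm_fnmsub XORs the a*b + c result with the GLMM_NEGZEROf (0x80000000) sign mask. A small scalar sanity check of those identities, independent of cglm, might look like this (the helper name flip_sign is made up for the example):

/* scalar reference for the sign identities behind the non-FMA fallbacks */
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* flip the IEEE-754 sign bit, mirroring _mm_xor_ps(x, signmask) on one lane */
static float flip_sign(float x) {
  uint32_t u;
  memcpy(&u, &x, sizeof u);
  u ^= 0x80000000u;                      /* same bit pattern as GLMM_NEGZEROf */
  memcpy(&x, &u, sizeof x);
  return x;
}

int main(void) {
  float a = 1.5f, b = -2.0f, c = 4.0f;   /* a*b = -3, all values exact */

  assert(a * b + c ==  1.0f);            /* fmadd:  a*b + c    */
  assert(a * b - c == -7.0f);            /* fmsub:  a*b - c    */
  assert(c - a * b ==  7.0f);            /* fnmadd: -(a*b) + c */

  /* fnmsub: -(a*b) - c equals (a*b + c) with the sign bit flipped */
  assert(flip_sign(a * b + c) == -(a * b) - c);
  return 0;
}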