/****************************************************************************
 * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#pragma once

// Everything between "#if 0" and the matching "#endif" is removed by the
// preprocessor.  The declarations below are therefore never compiled; they
// exist only as a readable description of the SIMD wrapper interface.
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples.  The SIMD256 and SIMD16 implementations will
    // use different base types with this same naming.
    using Float   = __m256;     // Packed single-precision float vector
    using Double  = __m256d;    // Packed double-precision float vector
    using Integer = __m256i;    // Packed integer vector (mutable element widths)
    using Mask    = uint8_t;    // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float add_ps(Float a, Float b);              // return a + b
    static Float div_ps(Float a, Float b);              // return a / b
    static Float fmadd_ps(Float a, Float b, Float c);   // return (a * b) + c
    static Float fmsub_ps(Float a, Float b, Float c);   // return (a * b) - c
    static Float max_ps(Float a, Float b);              // return (a > b) ? a : b
    static Float min_ps(Float a, Float b);              // return (a < b) ? a : b
    static Float mul_ps(Float a, Float b);              // return a * b
    static Float rcp_ps(Float a);                       // return 1.0f / a
    static Float rsqrt_ps(Float a);                     // return 1.0f / sqrt(a)
    static Float sub_ps(Float a, Float b);              // return a - b

    // Rounding control for round_ps.  The first five values select the
    // rounding direction, RAISE_EXC / NO_EXC select exception behavior, and
    // the remaining names are direction | exception combinations.
    enum class RoundMode
    {
        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
        TO_NEG_INF      = 0x01, // Round to negative infinity
        TO_POS_INF      = 0x02, // Round to positive infinity
        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register

        RAISE_EXC       = 0x00, // Raise exception on overflow
        NO_EXC          = 0x08, // Suppress exceptions

        NINT            = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR           = static_cast<int>(TO_NEG_INF)     | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)     | static_cast<int>(NO_EXC),
        CEIL            = static_cast<int>(TO_POS_INF)     | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)     | static_cast<int>(NO_EXC),
        TRUNC           = static_cast<int>(TO_ZERO)        | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)        | static_cast<int>(NO_EXC),
        RINT            = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(RAISE_EXC),
        NEARBYINT       = static_cast<int>(CUR_DIRECTION)  | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen on the RMT template parameter.  See the documentation
    // for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float round_ps(Float a);                     // return round(a)

    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer abs_epi32(Integer a);                // return absolute_value(a) (int32)
    static Integer add_epi32(Integer a, Integer b);     // return a + b (int32)
    static Integer add_epi8(Integer a, Integer b);      // return a + b (int8)
    static Integer adds_epu8(Integer a, Integer b);     // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer max_epi32(Integer a, Integer b);     // return (a > b) ? a : b (int32)
    static Integer max_epu32(Integer a, Integer b);     // return (a > b) ? a : b (uint32)
    static Integer min_epi32(Integer a, Integer b);     // return (a < b) ? a : b (int32)
    static Integer min_epu32(Integer a, Integer b);     // return (a < b) ? a : b (uint32)
    static Integer mul_epi32(Integer a, Integer b);     // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    static Integer mullo_epi32(Integer a, Integer b);
    static Integer sub_epi32(Integer a, Integer b);     // return a - b (int32)
    static Integer sub_epi64(Integer a, Integer b);     // return a - b (int64)
    static Integer subs_epu8(Integer a, Integer b);     // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float   and_ps(Float a, Float b);            // return a & b       (float treated as int)
    static Integer and_si(Integer a, Integer b);        // return a & b       (int)
    static Float   andnot_ps(Float a, Float b);         // return (~a) & b    (float treated as int)
    static Integer andnot_si(Integer a, Integer b);     // return (~a) & b    (int)
    static Float   or_ps(Float a, Float b);             // return a | b       (float treated as int)
    static Integer or_si(Integer a, Integer b);         // return a | b       (int)
    static Float   xor_ps(Float a, Float b);            // return a ^ b       (float treated as int)
    static Integer xor_si(Integer a, Integer b);        // return a ^ b       (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Integer slli_epi32(Integer a);               // return a << ImmT
    static Integer sllv_epi32(Integer a, Integer b);    // return a << b
    template <int ImmT>
    static Integer srai_epi32(Integer a);               // return a >> ImmT   (int32)
    template <int ImmT>
    static Integer srli_epi32(Integer a);               // return a >> ImmT   (uint32)
    template <int ImmT>                                 // for each 128-bit lane:
    static Integer srli_si(Integer a);                  //  return a >> (ImmT*8) (uint)
    template <int ImmT>
    static Float srlisi_ps(Float a);                    // same as srli_si, but with Float cast to int
    static Integer srlv_epi32(Integer a, Integer b);    // return a >> b      (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //-----------------------------------------------------------------------
    static Float   castpd_ps(Double a);                 // return *(Float*)(&a)
    static Integer castps_si(Float a);                  // return *(Integer*)(&a)
    static Double  castsi_pd(Integer a);                // return *(Double*)(&a)
    static Double  castps_pd(Float a);                  // return *(Double*)(&a)
    static Float   castsi_ps(Integer a);                // return *(Float*)(&a)
    static Float   cvtepi32_ps(Integer a);              // return (float)a    (int32 --> float)
    static Integer cvtepu8_epi16(Integer a);            // return (int16)a    (uint8 --> int16)
    static Integer cvtepu8_epi32(Integer a);            // return (int32)a    (uint8 --> int32)
    static Integer cvtepu16_epi32(Integer a);           // return (int32)a    (uint16 --> int32)
    static Integer cvtepu16_epi64(Integer a);           // return (int64)a    (uint16 --> int64)
    static Integer cvtepu32_epi64(Integer a);           // return (int64)a    (uint32 --> int64)
    static Integer cvtps_epi32(Float a);                // return (int32)a    (float --> int32)
    static Integer cvttps_epi32(Float a);               // return (int32)a    (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //      - ordered comparisons are always false if either operand is NaN
    //      - unordered comparisons are always true if either operand is NaN
    //      - signaling comparisons raise an exception if either operand is NaN
    //      - non-signaling comparisons will never raise an exception
    //
    // Ordered:   return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (nonsignaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (nonsignaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template <CompareType CmpTypeT>
    static Float cmp_ps(Float a, Float b);              // return a (CmpTypeT) b (see above)

    // Shorthands for cmp_ps with a fixed comparison.
    // NOTE(review): the exact CompareType each shorthand maps to (ordered vs.
    // unordered, signaling vs. non-signaling) is not stated in this file --
    // confirm against the implementation.
    static Float cmpgt_ps(Float a, Float b);            // return cmp_ps<greater-than>(a, b)
    static Float cmple_ps(Float a, Float b);            // return cmp_ps<less-than-or-equal>(a, b)
    static Float cmplt_ps(Float a, Float b);            // return cmp_ps<less-than>(a, b)
    static Float cmpneq_ps(Float a, Float b);           // return cmp_ps<not-equal>(a, b)
    static Float cmpeq_ps(Float a, Float b);            // return cmp_ps<equal>(a, b)
    static Float cmpge_ps(Float a, Float b);            // return cmp_ps<greater-than-or-equal>(a, b)

    static Integer cmpeq_epi8(Integer a, Integer b);    // return a == b (int8)
    static Integer cmpeq_epi16(Integer a, Integer b);   // return a == b (int16)
    static Integer cmpeq_epi32(Integer a, Integer b);   // return a == b (int32)
    static Integer cmpeq_epi64(Integer a, Integer b);   // return a == b (int64)
    static Integer cmpgt_epi8(Integer a, Integer b);    // return a > b (int8)
    static Integer cmpgt_epi16(Integer a, Integer b);   // return a > b (int16)
    static Integer cmpgt_epi32(Integer a, Integer b);   // return a > b (int32)
    static Integer cmpgt_epi64(Integer a, Integer b);   // return a > b (int64)
    static Integer cmplt_epi32(Integer a, Integer b);   // return a < b (int32)
    static bool testz_ps(Float a, Float b);             // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool testz_si(Integer a, Integer b);         // return all_lanes_zero(a & b) ? 1 : 0 (int)

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //-----------------------------------------------------------------------
    template <int ImmT>
    static Float   blend_ps(Float a, Float b);                      // return ImmT ? b : a  (float)
    static Integer blendv_epi32(Integer a, Integer b, Float mask);  // return mask ? b : a  (int)
    static Float   blendv_ps(Float a, Float b, Float mask);         // return mask ? b : a  (float)
    static Float   broadcast_ss(float const* p);                    // return *p (all elements in vector get same value)
    static Integer packs_epi16(Integer a, Integer b);               // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer packs_epi32(Integer a, Integer b);               // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer packus_epi16(Integer a, Integer b);              // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer packus_epi32(Integer a, Integer b);              // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer permute_epi32(Integer a, Integer swiz);          // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float   permute_ps(Float a, Integer swiz);               // return a[swiz[i]] for each 32-bit lane i (float)

    // NOTE(review): the shuffle_* template parameter is written as <int ImmT>
    // to match the other immediate-taking declarations in this file -- confirm
    // against the implementation.
    template <int ImmT>
    static Integer shuffle_epi32(Integer a, Integer b);
    template <int ImmT>
    static Integer shuffle_epi64(Integer a, Integer b);
    static Integer shuffle_epi8(Integer a, Integer b);
    template <int ImmT>
    static Double  shuffle_pd(Double a, Double b);
    template <int ImmT>
    static Float   shuffle_ps(Float a, Float b);

    static Integer unpackhi_epi16(Integer a, Integer b);
    static Integer unpackhi_epi32(Integer a, Integer b);
    static Integer unpackhi_epi64(Integer a, Integer b);
    static Integer unpackhi_epi8(Integer a, Integer b);
    static Double  unpackhi_pd(Double a, Double b);
    static Float   unpackhi_ps(Float a, Float b);
    static Integer unpacklo_epi16(Integer a, Integer b);
    static Integer unpacklo_epi32(Integer a, Integer b);
    static Integer unpacklo_epi64(Integer a, Integer b);
    static Integer unpacklo_epi8(Integer a, Integer b);
    static Double  unpacklo_pd(Double a, Double b);
    static Float   unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------
    enum class ScaleFactor
    {
        SF_1,   // No scaling
        SF_2,   // Scale offset by 2
        SF_4,   // Scale offset by 4
        SF_8,   // Scale offset by 8
    };

    template <ScaleFactor ScaleT>
    static Float   i32gather_ps(float const* p, Integer idx);   // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float   load1_ps(float const* p);                    // return *p (broadcast 1 value to all elements)
    static Float   load_ps(float const* p);                     // return *p (loads SIMD width elements from memory)
    static Integer load_si(Integer const* p);                   // return *p
    static Float   loadu_ps(float const* p);                    // return *p (same as load_ps but allows for unaligned mem)
    static Integer loadu_si(Integer const* p);                  // return *p (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    template <ScaleFactor ScaleT>
    static Float   mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void    maskstore_ps(float* p, Integer mask, Float src);
    static int     movemask_epi8(Integer a);
    static int     movemask_pd(Double a);
    static int     movemask_ps(Float a);
    static Integer set1_epi32(int i);                           // return i (all elements are same value)
    static Integer set1_epi8(char i);                           // return i (all elements are same value)
    static Float   set1_ps(float f);                            // return f (all elements are same value)
    static Float   setzero_ps();                                // return 0 (float)
    static Integer setzero_si();                                // return 0 (integer)
    static void    store_ps(float* p, Float a);                 // *p = a (stores all elements contiguously in memory)
    static void    store_si(Integer* p, Integer a);             // *p = a
    static void    stream_ps(float* p, Float a);                // *p = a (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //=======================================================================

    // NOTE(review): the template parameter of the extract / insert /
    // permute2f128 declarations is written as <int ImmT> to match the other
    // immediate-taking declarations in this file -- confirm against the
    // implementation.
    static Float   broadcast_ps(__m128 const* p);
    template <int ImmT>
    static __m128d extractf128_pd(Double a);
    template <int ImmT>
    static __m128  extractf128_ps(Float a);
    template <int ImmT>
    static __m128i extractf128_si(Integer a);
    template <int ImmT>
    static Double  insertf128_pd(Double a, __m128d b);
    template <int ImmT>
    static Float   insertf128_ps(Float a, __m128 b);
    template <int ImmT>
    static Integer insertf128_si(Integer a, __m128i b);
    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
    template <int ImmT>
    static Double  permute2f128_pd(Double a, Double b);
    template <int ImmT>
    static Float   permute2f128_ps(Float a, Float b);
    template <int ImmT>
    static Integer permute2f128_si(Integer a, Integer b);
    static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void    storeu2_si(__m128i* phi, __m128i* plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================
    // (no declarations are listed for this section in this file)

    //=======================================================================
    // Extended Utility Functions (common to SIMD256 and SIMD16)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Extended Types
    //-----------------------------------------------------------------------

    // Vec4, an SOA SIMD set of 4-dimensional vectors
    //
    // The same four SIMD registers can be reached as an indexable Float
    // array (v), as an Integer array (vi), or by component name (s.x .. s.w).
    union Vec4
    {
        Vec4() = default;

        // Set all four components to the same SIMD value.
        Vec4(Float in)
        {
            s.x = in;
            s.y = in;
            s.z = in;
            s.w = in;
        }

        // Set each component from its own SIMD value.
        Vec4(Float x, Float y, Float z, Float w)
        {
            s.x = x;
            s.y = y;
            s.z = z;
            s.w = w;
        }

        Float   v[4];
        Integer vi[4];
        struct
        {
            Float x;
            Float y;
            Float z;
            Float w;
        } s;

        // Component access by index; i is not range-checked.
        Float&       operator[] (const int i)       { return v[i]; }
        Float const& operator[] (const int i) const { return v[i]; }
    };

    //-----------------------------------------------------------------------
    // Extended Functions
    //-----------------------------------------------------------------------
    static void  vec4_set1_ps(Vec4& r, const float* p);                     // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
    static void  vec4_set1_vps(Vec4& r, Float s);                           // r[0] = s, r[1] = s, ...
    static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1);               // return dp3(v0, v1)
    static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1);               // return dp4(v0, v1)
    static Float vec4_rcp_length_ps(const Vec4& v);                         // return 1.0f / sqrt(dp4(v, v))
    static void  vec4_normalize_ps(Vec4& r, const Vec4& v);                 // r = v * rcp_length(v)
    static void  vec4_mul_ps(Vec4& r, const Vec4& v, Float s);              // r = v * set1_vps(s)
    static void  vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1);      // r = v0 * v1
    static void  vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1);      // r = v0 + v1
    static void  vec4_min_ps(Vec4& r, const Vec4& v0, Float s);             // r = (v0 < s) ? v0 : s
    static void  vec4_max_ps(Vec4& r, const Vec4& v0, Float s);             // r = (v0 > s) ? v0 : s

    // Matrix4x4 * Vector4
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
    static void mat4x4_vec4_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);

    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
    static void mat3x3_vec3_w0_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
    static void mat4x4_vec3_w1_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);

    // Matrix4x3 * Vector3 - Position vector where w = 1.
    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
    //   result.s.w = 1
    static void mat4x3_vec3_w1_multiply(
        Vec4& result,
        const float* pMatrix,
        const Vec4& v);
};
#endif // #if 0