/**************************************************************************** * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. ****************************************************************************/ #ifndef __SWR_INTRIN_H__ #define __SWR_INTRIN_H__ #include "os.h" #if !defined(SIMD_ARCH) #define SIMD_ARCH KNOB_ARCH #endif #include "simdlib_types.hpp" typedef SIMDImpl::SIMD128Impl::Float simd4scalar; typedef SIMDImpl::SIMD128Impl::Double simd4scalard; typedef SIMDImpl::SIMD128Impl::Integer simd4scalari; typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector; typedef SIMDImpl::SIMD128Impl::Mask simd4mask; typedef SIMDImpl::SIMD256Impl::Float simd8scalar; typedef SIMDImpl::SIMD256Impl::Double simd8scalard; typedef SIMDImpl::SIMD256Impl::Integer simd8scalari; typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector; typedef SIMDImpl::SIMD256Impl::Mask simd8mask; typedef SIMDImpl::SIMD512Impl::Float simd16scalar; typedef SIMDImpl::SIMD512Impl::Double simd16scalard; typedef SIMDImpl::SIMD512Impl::Integer simd16scalari; typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector; typedef SIMDImpl::SIMD512Impl::Mask simd16mask; #if KNOB_SIMD_WIDTH == 8 typedef simd8scalar simdscalar; typedef simd8scalard simdscalard; typedef simd8scalari simdscalari; typedef simd8vector simdvector; typedef simd8mask simdmask; #else #error Unsupported vector width #endif INLINE UINT pdep_u32(UINT a, UINT mask) { #if KNOB_ARCH >= KNOB_ARCH_AVX2 return _pdep_u32(a, mask); #else UINT result = 0; // copied from http://wm.ite.pl/articles/pdep-soft-emu.html // using bsf instead of funky loop DWORD maskIndex; while (_BitScanForward(&maskIndex, mask)) { // 1. isolate lowest set bit of mask const UINT lowest = 1 << maskIndex; // 2. populate LSB from src const UINT LSB = (UINT)((int)(a << 31) >> 31); // 3. copy bit from mask result |= LSB & lowest; // 4. clear lowest bit mask &= ~lowest; // 5. prepare for next iteration a >>= 1; } return result; #endif } INLINE UINT pext_u32(UINT a, UINT mask) { #if KNOB_ARCH >= KNOB_ARCH_AVX2 return _pext_u32(a, mask); #else UINT result = 0; DWORD maskIndex; uint32_t currentBit = 0; while (_BitScanForward(&maskIndex, mask)) { // 1. isolate lowest set bit of mask const UINT lowest = 1 << maskIndex; // 2. copy bit from mask result |= ((a & lowest) > 0) << currentBit++; // 3. clear lowest bit mask &= ~lowest; } return result; #endif } #endif //__SWR_INTRIN_H__