/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_misc.cpp
 *
 * @brief Implementation for miscellaneous builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }
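
    //////////////////////////////////////////////////////////////////////////
    /// Illustrative usage (a sketch with hypothetical names, not part of the
    /// builder API): the GEP/LOAD wrappers in this file are thin passthroughs
    /// to IRBuilder that additionally assert the address is not a 64-bit GFX
    /// pointer (those must be translated through BuilderGfxMem). A caller
    /// might write something like:
    ///     Value* pMember = GEP(pState, {0, 1});        // constant indices
    ///     Value* value   = LOAD(pMember, "memberVal"); // asserted CPU-side load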

    LoadInst* Builder::LOAD(
        Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name,
                            Type*                                  Ty,
                            JIT_MEM_CLIENT                         usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value*                               basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine&                   name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst* Builder::STORE(Value*                                 val,
                              Value*                                 basePtr,
                              const std::initializer_list<uint32_t>& indices,
                              Type*                                  Ty,
                              JIT_MEM_CLIENT                         usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst* Builder::STOREV(Value*                               val,
                               Value*                               basePtr,
                               const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

    Value* Builder::MEM_ADD(Value*                                 i32Incr,
                            Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value*         vSrc,
                             Value*         pBase,
                             Value*         vIndices,
                             Value*         vMask,
                             uint8_t        scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);
        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value*         vSrc,
                             Value*         pBase,
                             Value*         vIndices,
                             Value*         vMask,
                             uint8_t        scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);
        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }
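
    //////////////////////////////////////////////////////////////////////////
    /// Note (summary of existing behavior, not new functionality): the
    /// GATHERPS/GATHERDD wrappers assert the base is not a GFX address and
    /// forward to VGATHERPS/VGATHERDD with the scale folded into an immediate.
    /// The Gather4* helpers below always pass scale = 1 because their index
    /// vectors are already byte offsets.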

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
    }

    void Builder::Gather4(const SWR_FORMAT format,
                          Value*           pSrcBase,
                          Value*           byteOffsets,
                          Value*           mask,
                          Value*           vGatherComponents[],
                          bool             bPackedOutput,
                          JIT_MEM_CLIENT   usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }
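
    //////////////////////////////////////////////////////////////////////////
    /// Note on the two paths below (summary of existing behavior): for 16bpc
    /// formats, two dword gathers fetch the packed xy and zw halves, which
    /// Shuffle16bpcGather4 then transposes into one SIMD register per
    /// component; for 32bpc formats, one gather is issued per enabled
    /// component, stepping the base pointer by 4 bytes between gathers.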

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            JIT_MEM_CLIENT         usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch
            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }
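
    //////////////////////////////////////////////////////////////////////////
    /// Note (summary of existing behavior): GATHER4DD mirrors GATHER4PS for
    /// integer formats and adds an 8bpc path, where a single dword gather
    /// already holds all four byte components per lane and Shuffle8bpcGather4
    /// splits them out.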

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            JIT_MEM_CLIENT         usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch
            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }
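
    //////////////////////////////////////////////////////////////////////////
    /// Reading the shuffle masks below (summary of existing behavior): PSHUFB
    /// operates on byte indices within each 128-bit lane, with -1 zeroing a
    /// byte, so a mask like {0, 1, 4, 5, ...} pulls the low 16-bit halves of
    /// each dword together; VPERMD then permutes whole dwords across the
    /// 256-bit register to pack one component per 128-bit lane.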

    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW = BITCAST(VPERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fixed for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result, else use the second (zw) gather
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
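
    //////////////////////////////////////////////////////////////////////////
    /// Note (summary of existing behavior): the 8bpc variant below works from
    /// a single gather result, since each gathered dword already contains all
    /// four byte components. The packed path packs with PSHUFB + VPERMD like
    /// the 16bpc case; the unpacked path applies one byte-select PSHUFB mask
    /// per component to zero-extend it into the low byte of each 32-bit lane.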

    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY = BITCAST(VPERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW = BITCAST(VPERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a fill vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }
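
    //////////////////////////////////////////////////////////////////////////
    /// Control-flow summary of the scatter emulation below (existing
    /// behavior): the current block branches on an all-lanes-done test either
    /// into "Scatter_Loop", which peels one set mask bit per iteration (CTTZ),
    /// stores that lane's element at its byte offset, clears the bit and
    /// loops, or straight to the post-loop block once the mask is exhausted
    /// (CTTZ returns 32).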

    //////////////////////////////////////////////////////////////////////////
    /// @brief emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);

        /* Scatter algorithm

           while(Index = BitScanForward(mask))
                srcElem = srcVector[Index]
                offsetElem = offsetVector[Index]
                *(pDst + offsetElem) = srcElem
                Update mask (&= ~(1<<Index)

        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function*   pFunc  = pCurBB->getParent();
        Type*       pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }
} // namespace SwrJit