diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp index c5e8a89e1..fa1bc8db0 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -18,25 +18,24 @@ subject to the following restrictions: b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue), - m_fill(context, device, queue), m_radixSorter(context, device, queue), m_rootNodeIndex(context, queue), - - m_numNodesPerLevelGpu(context, queue), - m_firstIndexOffsetPerLevelGpu(context, queue), + m_maxDistanceFromRoot(context, queue), m_internalNodeAabbs(context, queue), m_internalNodeLeafIndexRanges(context, queue), m_internalNodeChildNodes(context, queue), m_internalNodeParentNodes(context, queue), - m_maxCommonPrefix(context, queue), m_commonPrefixes(context, queue), - m_leftInternalNodePointers(context, queue), - m_rightInternalNodePointers(context, queue), - m_internalNodeLeftChildNodes(context, queue), - m_internalNodeRightChildNodes(context, queue), + m_commonPrefixLengths(context, queue), + m_childNodeCount(context, queue), + m_distanceFromRoot(context, queue), + m_TEMP_leftLowerPrefix(context, queue), + m_TEMP_rightLowerPrefix(context, queue), + m_TEMP_leftSharedPrefixLength(context, queue), + m_TEMP_rightSharedPrefixLength(context, queue), m_leafNodeParentNodes(context, queue), m_mortonCodesAndAabbIndicies(context, queue), @@ -46,8 +45,8 @@ b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id m_largeAabbs(context, queue) { m_rootNodeIndex.resize(1); - m_maxCommonPrefix.resize(1); - + m_maxDistanceFromRoot.resize(1); + // const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; @@ -64,21 +63,16 @@ b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); - m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_constructBinaryTreeKernel); - m_determineInternalNodeAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "determineInternalNodeAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_determineInternalNodeAabbsKernel); - - m_computePrefixAndInitPointersKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computePrefixAndInitPointers", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_computePrefixAndInitPointersKernel); - m_correctDuplicatePrefixesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "correctDuplicatePrefixes", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_correctDuplicatePrefixesKernel); + m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_computeAdjacentPairCommonPrefixKernel); m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); - m_convertChildNodeFormatKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "convertChildNodeFormat", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_convertChildNodeFormatKernel); + m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findDistanceFromRootKernel); + m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); b3Assert(m_plbvhCalculateOverlappingPairsKernel); @@ -96,14 +90,11 @@ b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() clReleaseKernel(m_findAllNodesMergedAabbKernel); clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); - clReleaseKernel(m_constructBinaryTreeKernel); - clReleaseKernel(m_determineInternalNodeAabbsKernel); - - clReleaseKernel(m_computePrefixAndInitPointersKernel); - clReleaseKernel(m_correctDuplicatePrefixesKernel); + clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel); clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel); clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel); - clReleaseKernel(m_convertChildNodeFormatKernel); + clReleaseKernel(m_findDistanceFromRootKernel); + clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); clReleaseKernel(m_plbvhRayTraverseKernel); @@ -186,18 +177,19 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray& worldSpaceAab m_internalNodeParentNodes.resize(numInternalNodes); m_commonPrefixes.resize(numInternalNodes); - m_leftInternalNodePointers.resize(numInternalNodes); - m_rightInternalNodePointers.resize(numInternalNodes); - m_internalNodeLeftChildNodes.resize(numInternalNodes); - m_internalNodeRightChildNodes.resize(numInternalNodes); + m_commonPrefixLengths.resize(numInternalNodes); + m_childNodeCount.resize(numInternalNodes); + m_distanceFromRoot.resize(numInternalNodes); + m_TEMP_leftLowerPrefix.resize(numInternalNodes); + m_TEMP_rightLowerPrefix.resize(numInternalNodes); + m_TEMP_leftSharedPrefixLength.resize(numInternalNodes); + m_TEMP_rightSharedPrefixLength.resize(numInternalNodes); m_leafNodeParentNodes.resize(numLeaves); m_mortonCodesAndAabbIndicies.resize(numLeaves); m_mergedAabb.resize(numLeaves); } - - //Find the merged AABB of all small AABBs; this is used to define the size of //each cell in the virtual grid(2^10 cells in each dimension). { @@ -255,18 +247,7 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray& worldSpaceAab clFinish(m_queue); } - //Optional; only element at m_internalNodeParentNodes[0], the root node, needs to be set here - //as the parent indices of other nodes are overwritten during m_constructBinaryTreeKernel - { - B3_PROFILE("Reset parent node indices"); - - m_fill.execute( m_internalNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_internalNodeParentNodes.size() ); - m_fill.execute( m_leafNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_leafNodeParentNodes.size() ); - clFinish(m_queue); - } - // - //constructSimpleBinaryTree(); constructRadixBinaryTree(); } @@ -424,148 +405,11 @@ void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) - } - numLevels = mostSignificantBit + 1; - - //If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level - if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++; - } - - //Determine number of internal nodes per level, use prefix sum to get offsets of each level, and send to GPU - { - B3_PROFILE("Determine number of nodes per level"); - - m_numNodesPerLevelCpu.resize(numLevels); - - //The last level contains the leaf nodes; number of leaves is already known - if(numLevels - 1 >= 0) m_numNodesPerLevelCpu[numLevels - 1] = numLeaves; - - //Calculate number of nodes in each level; - //start from the second to last level(level right next to leaf nodes) and move towards the root(level 0) - int remainder = 0; - for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex) - { - int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves - int numNodesCurrentLevel = numNodesPreviousLevel / 2; - - remainder += numNodesPreviousLevel % 2; - if(remainder == 2) - { - numNodesCurrentLevel++; - remainder = 0; - } - - m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel; - } - - //Prefix sum to calculate the first index offset of each level - { - m_firstIndexOffsetPerLevelCpu = m_numNodesPerLevelCpu; - - //Perform inclusive scan - for(int i = 1; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) - m_firstIndexOffsetPerLevelCpu[i] += m_firstIndexOffsetPerLevelCpu[i - 1]; - - //Convert inclusive scan to exclusive scan to get the offsets - //This is equivalent to shifting each element in m_firstIndexOffsetPerLevelCpu[] by 1 to the right, - //and setting the first element to 0 - for(int i = 0; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) - m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i]; - } - - //Copy to GPU - m_numNodesPerLevelGpu.copyFromHost(m_numNodesPerLevelCpu, false); - m_firstIndexOffsetPerLevelGpu.copyFromHost(m_firstIndexOffsetPerLevelCpu, false); - clFinish(m_queue); - } - - //Construct binary tree; find the children of each internal node, and assign parent nodes - { - B3_PROFILE("Construct binary tree"); - - const int ROOT_NODE_INDEX = 0x80000000; //Default root index is 0, most significant bit is set to indicate internal node - m_rootNodeIndex.copyFromHostPointer(&ROOT_NODE_INDEX, 1); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_constructBinaryTreeKernel, "m_constructBinaryTreeKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLevels); - launcher.setConst(numInternalNodes); - - launcher.launch1D(numInternalNodes); - clFinish(m_queue); - } - - //For each internal node, check children to get its AABB; start from the - //last level, which contains the leaves, and move towards the root - { - B3_PROFILE("Set AABBs"); - - //Due to the arrangement of internal nodes, each internal node corresponds - //to a contiguous range of leaf node indices. This characteristic can be used - //to optimize calculateOverlappingPairs(); checking if - //(m_internalNodeLeafIndexRanges[].y < leafNodeIndex) can be used to ensure that - //each pair is processed only once. - { - B3_PROFILE("Reset internal node index ranges"); - - b3Int2 invalidIndexRange; - invalidIndexRange.x = -1; //x == min - invalidIndexRange.y = -2; //y == max - - m_fill.execute( m_internalNodeLeafIndexRanges, invalidIndexRange, m_internalNodeLeafIndexRanges.size() ); - clFinish(m_queue); - } - - int lastInternalLevelIndex = numLevels - 2; //Last level is leaf node level - for(int level = lastInternalLevelIndex; level >= 0; --level) - { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLevels); - launcher.setConst(numInternalNodes); - launcher.setConst(level); - - launcher.launch1D(numLeaves); - } - clFinish(m_queue); - } -} +// remove +#include +int isLeafNode(int index) { return (index >> 31 == 0); } +int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); } +int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); } void b3GpuParallelLinearBvh::constructRadixBinaryTree() { @@ -576,17 +420,16 @@ void b3GpuParallelLinearBvh::constructRadixBinaryTree() //For each internal node, compute common prefix and set pointers to left and right internal nodes { - B3_PROFILE("m_computePrefixAndInitPointersKernel"); + B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); b3BufferInfoCL bufferInfo[] = { b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_leftInternalNodePointers.getBufferCL() ), - b3BufferInfoCL( m_rightInternalNodePointers.getBufferCL() ) + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) }; - b3LauncherCL launcher(m_queue, m_computePrefixAndInitPointersKernel, "m_computePrefixAndInitPointersKernel"); + b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst(numInternalNodes); @@ -594,96 +437,185 @@ void b3GpuParallelLinearBvh::constructRadixBinaryTree() clFinish(m_queue); } - //Increase the common prefixes so that there are no adjacent duplicates for each internal node { - B3_PROFILE("m_correctDuplicatePrefixesKernel"); - - int reset = 0; - m_maxCommonPrefix.copyFromHostPointer(&reset, 1); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_maxCommonPrefix.getBufferCL() ), - }; - - b3LauncherCL launcher(m_queue, m_correctDuplicatePrefixesKernel, "m_correctDuplicatePrefixesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numInternalNodes); - - launcher.launch1D(numInternalNodes); + static b3AlignedObjectArray prefixLengths; + m_commonPrefixLengths.copyToHost(prefixLengths); clFinish(m_queue); + + for(int i = 1; i < prefixLengths.size(); ++i) + if( prefixLengths[i - 1] == prefixLengths[i] ) + for(;;) printf("duplicate prefix[%d]: %d\n", i, prefixLengths[i]); } - //For each leaf node, find parent nodes and assign child node indices { - B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); - - b3BufferInfoCL bufferInfo[] = + //For each leaf node, find parent nodes and assign child node indices { - b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeftChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeRightChildNodes.getBufferCL() ) - }; + B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } - b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLeaves); - - launcher.launch1D(numLeaves); - clFinish(m_queue); - } - - //For each internal node, find parent nodes and assign child node indices - { - B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); - - int maxCommonPrefix = -1; - m_maxCommonPrefix.copyToHostPointer(&maxCommonPrefix, 1); - - //-1 so that the root sets its AABB - for(int processedCommonPrefix = maxCommonPrefix; processedCommonPrefix >= -1; --processedCommonPrefix) + //For each internal node, find parent nodes and assign child node indices { + B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); + b3BufferInfoCL bufferInfo[] = { b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeftChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeRightChildNodes.getBufferCL() ), - b3BufferInfoCL( m_leftInternalNodePointers.getBufferCL() ), - b3BufferInfoCL( m_rightInternalNodePointers.getBufferCL() ), - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_TEMP_leftLowerPrefix.getBufferCL() ), + b3BufferInfoCL( m_TEMP_rightLowerPrefix.getBufferCL() ), + b3BufferInfoCL( m_TEMP_leftSharedPrefixLength.getBufferCL() ), + b3BufferInfoCL( m_TEMP_rightSharedPrefixLength.getBufferCL() ) }; b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(processedCommonPrefix); launcher.setConst(numInternalNodes); launcher.launch1D(numInternalNodes); + clFinish(m_queue); } - clFinish(m_queue); - } - - { - B3_PROFILE("m_convertChildNodeFormatKernel"); - - b3BufferInfoCL bufferInfo[] = + if(0) { - b3BufferInfoCL( m_internalNodeLeftChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeRightChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) - }; + static b3AlignedObjectArray mortonCodesAndAabbIndices; + + static b3AlignedObjectArray child; + static b3AlignedObjectArray commonPrefixes; + static b3AlignedObjectArray commonPrefixLengths; + static b3AlignedObjectArray tempLeftLowerPrefixIndex; + static b3AlignedObjectArray tempRightLowerPrefixIndex; + static b3AlignedObjectArray tempLeftLowerPrefixSPL; + static b3AlignedObjectArray tempRightLowerPrefixSPL; + static b3AlignedObjectArray internalParent; + + m_mortonCodesAndAabbIndicies.copyToHost(mortonCodesAndAabbIndices); + + m_internalNodeChildNodes.copyToHost(child); + m_commonPrefixes.copyToHost(commonPrefixes); + m_commonPrefixLengths.copyToHost(commonPrefixLengths); + m_TEMP_leftLowerPrefix.copyToHost(tempLeftLowerPrefixIndex); + m_TEMP_rightLowerPrefix.copyToHost(tempRightLowerPrefixIndex); + m_TEMP_leftSharedPrefixLength.copyToHost(tempLeftLowerPrefixSPL); + m_TEMP_rightSharedPrefixLength.copyToHost(tempRightLowerPrefixSPL); + m_internalNodeParentNodes.copyToHost(internalParent); + + int rootNode = -1; + m_rootNodeIndex.copyToHostPointer(&rootNode, 1); + clFinish(m_queue); + + printf( "rootNode: %d\n", getIndexWithInternalNodeMarkerRemoved(rootNode) ); + + for(int i = 0; i < numInternalNodes; ++i) + { + b3Int2 childNodes = child[i]; + + printf("childNodes[%d]:", i); + printf( " %d", getIndexWithInternalNodeMarkerRemoved(childNodes.x) ); + if( !isLeafNode(childNodes.x) ) printf("i"); + printf( ", %d", getIndexWithInternalNodeMarkerRemoved(childNodes.y) ); + if( !isLeafNode(childNodes.y) ) printf("i"); + printf(" (lr: %d, %d)", tempLeftLowerPrefixIndex[i], tempRightLowerPrefixIndex[i]); + printf(" (spl: %d, %d)", tempLeftLowerPrefixSPL[i], tempRightLowerPrefixSPL[i]); + printf(" (prefix: %d)", commonPrefixLengths[i]); + printf(" (par: %d)", internalParent[i]); + printf("\n"); + } + printf("\n"); + + for(int i = 0; i < numInternalNodes; ++i) + { + int hi = static_cast(commonPrefixes[i] >> 32); + int lo = static_cast(commonPrefixes[i]); + + printf("commonPrefix[%d]: %x, %d, len %d \n", i, hi, lo, commonPrefixLengths[i]); + } + printf("\n"); + + for(int i = 0; i < numLeaves; ++i) + { + printf("z-curve[%d]: %x \n", i, mortonCodesAndAabbIndices[i].m_key); + } + printf("\n"); + + + std::cout << std::endl; + for(;;); + } - b3LauncherCL launcher(m_queue, m_convertChildNodeFormatKernel, "m_convertChildNodeFormatKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numInternalNodes); + //Find the number of nodes seperating each internal node and the root node + //so that the AABBs can be set using the next kernel + { + B3_PROFILE("m_findDistanceFromRootKernel"); - launcher.launch1D(numInternalNodes); - clFinish(m_queue); + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Starting from the nodes nearest to the leaf nodes, recursively move up + //the tree to set the AABBs of each internal node; each internal node + //checks its children and merges their AABBs + { + B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); + + int maxDistanceFromRoot = -1; + { + B3_PROFILE("copy maxDistanceFromRoot to CPU"); + m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1); + clFinish(m_queue); + } + + for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxDistanceFromRoot); + launcher.setConst(distanceFromRoot); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + } + + clFinish(m_queue); + } + } } diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index 559786cbb..4d5467481 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -22,10 +22,11 @@ subject to the following restrictions: #include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" #include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" -#define B3_PLBVH_ROOT_NODE_MARKER -1 //Syncronize with parallelLinearBvh.cl +#define b3Int64 cl_long ///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame ///@remarks @@ -36,16 +37,13 @@ subject to the following restrictions: ///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: /// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) /// - [fully parallel] Sort morton codes -/// - [somewhat parallel] Build radix binary tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) /// - [somewhat parallel] Set internal node AABBs ///@par ///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. -///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree. -/// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling. -/// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the -/// Z-curve anyways, and this method should be simpler and faster during construction. -/// - Rather than traveling upwards towards the root from the leaf nodes, as in the paper, -/// each internal node checks its child nodes to get its AABB. +///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. +///Instead of searching for the child nodes of each internal node, we search for the parent node of each node. +///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs. class b3GpuParallelLinearBvh { cl_command_queue m_queue; @@ -56,58 +54,49 @@ class b3GpuParallelLinearBvh cl_kernel m_findAllNodesMergedAabbKernel; cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; - //Simple binary tree construction kernels - cl_kernel m_constructBinaryTreeKernel; - cl_kernel m_determineInternalNodeAabbsKernel; - - //Radix binary tree construction kernels - cl_kernel m_computePrefixAndInitPointersKernel; - cl_kernel m_correctDuplicatePrefixesKernel; + //Binary radix tree construction kernels + cl_kernel m_computeAdjacentPairCommonPrefixKernel; cl_kernel m_buildBinaryRadixTreeLeafNodesKernel; cl_kernel m_buildBinaryRadixTreeInternalNodesKernel; - cl_kernel m_convertChildNodeFormatKernel; + cl_kernel m_findDistanceFromRootKernel; + cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; //Traversal kernels cl_kernel m_plbvhCalculateOverlappingPairsKernel; cl_kernel m_plbvhRayTraverseKernel; - cl_kernel m_plbvhLargeAabbAabbTestKernel; cl_kernel m_plbvhLargeAabbRayTestKernel; - - b3FillCL m_fill; + b3RadixSort32CL m_radixSorter; - // + //1 element b3OpenCLArray m_rootNodeIndex; + b3OpenCLArray m_maxDistanceFromRoot; - //1 element per level in the tree - b3AlignedObjectArray m_numNodesPerLevelCpu; //Level 0(m_numNodesPerLevelCpu[0]) is the root, last level contains the leaf nodes - b3AlignedObjectArray m_firstIndexOffsetPerLevelCpu; //Contains the index/offset of the first node in that level - b3OpenCLArray m_numNodesPerLevelGpu; - b3OpenCLArray m_firstIndexOffsetPerLevelGpu; - - //1 element per internal node (number_of_internal_nodes = number_of_leaves - 1) + //1 element per internal node (number_of_internal_nodes == number_of_leaves - 1) b3OpenCLArray m_internalNodeAabbs; b3OpenCLArray m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index b3OpenCLArray m_internalNodeChildNodes; //x == left child, y == right child b3OpenCLArray m_internalNodeParentNodes; - //1 element per internal node; for radix binary tree construction - b3OpenCLArray m_maxCommonPrefix; - b3OpenCLArray m_commonPrefixes; - b3OpenCLArray m_leftInternalNodePointers; //Linked list - b3OpenCLArray m_rightInternalNodePointers; //Linked list - b3OpenCLArray m_internalNodeLeftChildNodes; - b3OpenCLArray m_internalNodeRightChildNodes; + //1 element per internal node; for binary radix tree construction + b3OpenCLArray m_commonPrefixes; + b3OpenCLArray m_commonPrefixLengths; + b3OpenCLArray m_childNodeCount; + b3OpenCLArray m_distanceFromRoot; + b3OpenCLArray m_TEMP_leftLowerPrefix; + b3OpenCLArray m_TEMP_rightLowerPrefix; + b3OpenCLArray m_TEMP_leftSharedPrefixLength; + b3OpenCLArray m_TEMP_rightSharedPrefixLength; //1 element per leaf node (leaf nodes only include small AABBs) b3OpenCLArray m_leafNodeParentNodes; - b3OpenCLArray m_mortonCodesAndAabbIndicies; //m_key = morton code, m_value == aabb index - b3OpenCLArray m_mergedAabb; + b3OpenCLArray m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index + b3OpenCLArray m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes b3OpenCLArray m_leafNodeAabbs; //Contains only small AABBs - //1 element per large AABB - b3OpenCLArray m_largeAabbs; //Not stored in the BVH + //1 element per large AABB, which is not stored in the BVH + b3OpenCLArray m_largeAabbs; public: b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); @@ -131,8 +120,6 @@ public: b3OpenCLArray& out_numRayRigidPairs, b3OpenCLArray& out_rayRigidPairs); private: - void constructSimpleBinaryTree(); - void constructRadixBinaryTree(); }; diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl index 3bcd4f351..2c17f8623 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -45,34 +45,29 @@ unsigned int interleaveBits(unsigned int x) //........ ........ ......12 3456789A //x //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits - //........ ....1234 56789A12 3456789A //x |= (x << 10) - //........ ....1111 1....... ...11111 //0x 00 0F 80 1F - //........ ....1234 5....... ...6789A //x = ( x | (x << 10) ) & 0x000F801F; + //......12 3456789A ......12 3456789A //x ^ (x << 16) + //11111111 ........ ........ 11111111 //0x FF 00 00 FF + //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF; - //.......1 23451234 5.....67 89A6789A //x |= (x << 5) - //.......1 1.....11 1.....11 .....111 //0x 01 83 83 07 - //.......1 2.....34 5.....67 .....89A //x = ( x | (x << 5) ) & 0x01838307; + //......12 ........ 3456789A 3456789A //x ^ (x << 8) + //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F + //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F; - //....12.1 2..34534 5..67.67 ..89A89A //x |= (x << 3) - //....1... 1..1...1 1..1...1 ..1...11 //0x 08 91 91 23 - //....1... 2..3...4 5..6...7 ..8...9A //x = ( x | (x << 3) ) & 0x08919123; + //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4) + //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3 + //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3; - //...11..2 2.33..4N 5.66..77 .88..9NA //x |= (x << 1) ( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} ) - //....1..1 ..1...1. 1..1..1. .1...1.1 //0x 09 22 92 45 - //....1..2 ..3...4. 5..6..7. .8...9.A //x = ( x | (x << 1) ) & 0x09229245; - - //...11.22 .33..445 5.66.77. 88..99AA //x |= (x << 1) - //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 34 92 29 - //....1..2 ..3..4.. 5..6..7. .8..9..A //x = ( x | (x << 1) ) & 0x09349229; + //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2) + //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49 + //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249; //........ ........ ......11 11111111 //0x000003FF x &= 0x000003FF; //Clear all bits above bit 10 - x = ( x | (x << 10) ) & 0x000F801F; - x = ( x | (x << 5) ) & 0x01838307; - x = ( x | (x << 3) ) & 0x08919123; - x = ( x | (x << 1) ) & 0x09229245; - x = ( x | (x << 1) ) & 0x09349229; + x = (x ^ (x << 16)) & 0xFF0000FF; + x = (x ^ (x << 8)) & 0x0300F00F; + x = (x ^ (x << 4)) & 0x030C30C3; + x = (x ^ (x << 2)) & 0x09249249; return x; } @@ -160,147 +155,11 @@ __kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabb //The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes. //If it is set, then the index is for an internal node; otherwise, it is a leaf node. -//In both cases, the bit should be cleared to access the index. +//In both cases, the bit should be cleared to access the actual node index. int isLeafNode(int index) { return (index >> 31 == 0); } int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); } int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); } -__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel, - __global int* numNodesPerLevel, - __global int2* out_internalNodeChildIndices, - __global int* out_internalNodeParentNodes, - __global int* out_leafNodeParentNodes, - int numLevels, int numInternalNodes) -{ - int internalNodeIndex = get_global_id(0); - if(internalNodeIndex >= numInternalNodes) return; - - //Find the level that this node is in, using linear search(could replace with binary search) - int level = 0; - int numInternalLevels = numLevels - 1; //All levels except the last are internal nodes - for(; level < numInternalLevels; ++level) - { - if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break; - } - - //Check lower levels to find child nodes - //Left child is always in the next level, but the same does not apply to the right child - int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level]; - int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1]; //Should never be out of bounds(see for loop above) - - int leftChildLevel = level + 1; - int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2; - - int rightChildLevel = level + 1; - int rightChildIndex = leftChildIndex + 1; - - //Under certain conditions, the right child index as calculated above is invalid; need to find the correct index - // - //First condition: must be at least 2 levels apart from the leaf node level; - //if the current level is right next to the leaf node level, then the right child - //will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access) - // - //Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid - // - //Third condition: must be the last node in its level - if( level < numLevels - 2 - && numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] - && indexInLevel == numNodesPerLevel[level] - 1 ) - { - //Check lower levels until we find a node without a parent - for(; rightChildLevel < numLevels - 1; ++rightChildLevel) - { - int rightChildNextLevel = rightChildLevel + 1; - - //If this branch is taken, it means that the last node in rightChildNextLevel has no parent - if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] ) - { - //Set the node to the last node in rightChildNextLevel - rightChildLevel = rightChildNextLevel; - rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1; - break; - } - } - } - - int isLeftChildLeaf = (leftChildLevel >= numLevels - 1); - int isRightChildLeaf = (rightChildLevel >= numLevels - 1); - - //If left/right child is a leaf node, the index needs to be corrected - //the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array, - //with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays - { - int leafNodeLevel = numLevels - 1; - leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex; - rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex; - } - - //Set the negative sign bit if the node is internal - int2 childIndices; - childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex); - childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex); - out_internalNodeChildIndices[internalNodeIndex] = childIndices; - - //Assign parent node index to children - __global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes; - out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex; - - __global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes; - out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex; -} - -__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel, - __global int* numNodesPerLevel, - __global int2* internalNodeChildIndices, - __global SortDataCL* mortonCodesAndAabbIndices, - __global b3AabbCL* leafNodeAabbs, - __global int2* out_internalNodeLeafIndexRanges, - __global b3AabbCL* out_internalNodeAabbs, - int numLevels, int numInternalNodes, int level) -{ - int i = get_global_id(0); - if(i >= numInternalNodes) return; - - //For each node in a level, check its child nodes to determine its AABB - { - int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level] - - int numNodesInLevel = numNodesPerLevel[level]; - if(indexInLevel < numNodesInLevel) - { - int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level]; - int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal]; - - int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x); - int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y); - - int isLeftChildLeaf = isLeafNode(childIndicies.x); - int isRightChildLeaf = isLeafNode(childIndicies.y); - - //left/RightChildLeafIndex == Rigid body indicies - int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; - int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; - - b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex]; - b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex]; - - // - b3AabbCL internalNodeAabb; - internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); - internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); - out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb; - - //For index range, x == min and y == max; left child always has lower index - int2 leafIndexRange; - leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x; - leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y; - - out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange; - } - } -} - - //From sap.cl #define NEW_PAIR_MARKER -1 @@ -567,84 +426,65 @@ __kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global } - -#define B3_PLBVH_LINKED_LIST_INVALID_NODE -1 - -int longestCommonPrefix(int i, int j) { return clz(i ^ j); } - -__kernel void computePrefixAndInitPointers(__global SortDataCL* mortonCodesAndAabbIndices, - __global int* out_commonPrefixes, - __global int* out_leftInternalNodePointers, - __global int* out_rightInternalNodePointers, - int numInternalNodes) -{ - int internalNodeIndex = get_global_id(0); - if (internalNodeIndex >= numInternalNodes) return; - - //Compute common prefix - { - //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index, - //and the number of internal nodes is always numLeafNodes - 1 - int leftLeafMortonCode = mortonCodesAndAabbIndices[internalNodeIndex].m_key; - int rightLeafMortonCode = mortonCodesAndAabbIndices[internalNodeIndex + 1].m_key; - - out_commonPrefixes[internalNodeIndex] = longestCommonPrefix(leftLeafMortonCode, rightLeafMortonCode); - } - - //Assign neighbor pointers of this node - { - int leftInternalIndex = internalNodeIndex - 1; - int rightInternalIndex = internalNodeIndex + 1; - - out_leftInternalNodePointers[internalNodeIndex] = (leftInternalIndex >= 0) ? leftInternalIndex : B3_PLBVH_LINKED_LIST_INVALID_NODE; - out_rightInternalNodePointers[internalNodeIndex] = (rightInternalIndex < numInternalNodes) ? rightInternalIndex : B3_PLBVH_LINKED_LIST_INVALID_NODE; - } -} - -__kernel void correctDuplicatePrefixes(__global int* commonPrefixes, __global int* out_maxCommonPrefix, int numInternalNodes) -{ - int internalNodeIndex = get_global_id(0); - if (internalNodeIndex >= numInternalNodes) return; - - int commonPrefix = commonPrefixes[internalNodeIndex]; - - //Linear search to find the size of the subtree - int firstSubTreeIndex = internalNodeIndex; - int lastSubTreeIndex = internalNodeIndex; - - while(firstSubTreeIndex - 1 >= 0 && commonPrefix == commonPrefixes[firstSubTreeIndex - 1]) --firstSubTreeIndex; - while(lastSubTreeIndex + 1 < numInternalNodes && commonPrefix == commonPrefixes[lastSubTreeIndex + 1]) ++lastSubTreeIndex; - - //Fix duplicate common prefixes by incrementing them so that a subtree is formed. - //Recursively divide the tree until the position of the split is this node's index. - //Every time this node is not the split node, increment the common prefix. - int isCurrentSplitNode = false; - int correctedCommonPrefix = commonPrefix; - - while(!isCurrentSplitNode) - { - int numInternalNodesInSubTree = lastSubTreeIndex - firstSubTreeIndex + 1; - int splitNodeIndex = firstSubTreeIndex + numInternalNodesInSubTree / 2; - - if(internalNodeIndex > splitNodeIndex) firstSubTreeIndex = splitNodeIndex + 1; - else if(internalNodeIndex < splitNodeIndex) lastSubTreeIndex = splitNodeIndex - 1; - //else if(internalNodeIndex == splitNodeIndex) break; - - isCurrentSplitNode = (internalNodeIndex == splitNodeIndex); - if(!isCurrentSplitNode) correctedCommonPrefix++; - } - - commonPrefixes[internalNodeIndex] = correctedCommonPrefix; - atomic_max(out_maxCommonPrefix, correctedCommonPrefix); -} - //Set so that it is always greater than the actual common prefixes, and never selected as a parent node. //If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve. //Duplicates common prefixes increase the highest common prefix by N, where 2^N is the number of duplicate nodes. #define B3_PLBVH_INVALID_COMMON_PREFIX 128 -__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixes, __global int* out_leftChildNodes, - __global int* out_rightChildNodes, int numLeafNodes) +#define B3_PLBVH_ROOT_NODE_MARKER -1 + +#define b3Int64 long + +int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); } +b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) +{ + //This function only needs to return (i & j) in order for the algorithm to work, + //but it may help with debugging to mask out the lower bits. + + b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j); + + b3Int64 sharedBits = i & j; + b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0 + + return sharedBits & bitmask; +} +int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB) +{ + return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) ); +} + +__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices, + __global b3Int64* out_commonPrefixes, + __global int* out_commonPrefixLengths, + int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if (internalNodeIndex >= numInternalNodes) return; + + //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index, + //and the number of internal nodes is always numLeafNodes - 1 + int leftLeafIndex = internalNodeIndex; + int rightLeafIndex = internalNodeIndex + 1; + + int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key; + int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key; + + //Binary radix tree construction algorithm does not work if there are duplicate morton codes. + //Append the index of each leaf node to each morton code so that there are no duplicates. + //The algorithm also requires that the morton codes are sorted in ascending order; this requirement + //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true. + // + //upsample(a, b) == ( ((b3Int64)a) << 32) | b + b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex); + b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex); + + out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); + out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); +} + + +__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes, + __global int2* out_childNodes, int numLeafNodes) { int leafNodeIndex = get_global_id(0); if (leafNodeIndex >= numLeafNodes) return; @@ -654,8 +494,8 @@ __kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixes, __glob int leftSplitIndex = leafNodeIndex - 1; int rightSplitIndex = leafNodeIndex; - int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixes[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; - int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixes[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; //Parent node is the highest adjacent common prefix that is lower than the node's common prefix //Leaf nodes are considered as having the highest common prefix @@ -667,72 +507,223 @@ __kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixes, __glob if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true; int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex; + out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex; - //If the left node is the parent, then this node is its right child and vice versa - __global int* out_childNode = (isLeftHigherCommonPrefix) ? out_rightChildNodes : out_leftChildNodes; + int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child int isLeaf = 1; - out_childNode[parentNodeIndex] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex); + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex); } -__kernel void buildBinaryRadixTreeInternalNodes(__global int* commonPrefixes, __global SortDataCL* mortonCodesAndAabbIndices, - __global int* leftChildNodes, __global int* rightChildNodes, - __global int* leftNeighborPointers, __global int* rightNeighborPointers, - __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs, - __global int* out_rootNodeIndex, - int processedCommonPrefix, int numInternalNodes) +__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths, + __global int2* out_childNodes, + __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex, + __global int* TEMP_out_leftLowerPrefix, __global int* TEMP_out_rightLowerPrefix, + __global int* TEMP_spl_left, __global int* TEMP_spl_right, + int numInternalNodes) { - int internalNodeIndex = get_global_id(0); - if (internalNodeIndex >= numInternalNodes) return; + int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(internalNodeIndex >= numInternalNodes) return; - int commonPrefix = commonPrefixes[internalNodeIndex]; - if (commonPrefix == processedCommonPrefix) + b3Int64 nodePrefix = commonPrefixes[internalNodeIndex]; + int nodePrefixLength = commonPrefixLengths[internalNodeIndex]; + +//#define USE_LINEAR_SEARCH +#ifdef USE_LINEAR_SEARCH + int leftIndex = -1; + int rightIndex = -1; + + for(int i = internalNodeIndex - 1; i >= 0; --i) { - //Check neighbors and compare the common prefix to select the parent node - int leftNodeIndex = leftNeighborPointers[internalNodeIndex]; - int rightNodeIndex = rightNeighborPointers[internalNodeIndex]; - - int leftCommonPrefix = (leftNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) ? commonPrefixes[leftNodeIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; - int rightCommonPrefix = (rightNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) ? commonPrefixes[rightNodeIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; - - //Parent node is the highest common prefix that is lower than the node's common prefix - //Since the nodes with lower common prefixes are removed, that condition does not have to be tested for, - //and we only need to pick the node with the higher prefix. - int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix); - - // - if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false; - else if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true; - - int isRootNode = false; - if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX && rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isRootNode = true; - - int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftNodeIndex : rightNodeIndex; - - //If the left node is the parent, then this node is its right child and vice versa - __global int* out_childNode = (isLeftHigherCommonPrefix) ? rightChildNodes : leftChildNodes; - - int isLeaf = 0; - if(!isRootNode) out_childNode[parentNodeIndex] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); - - if(isRootNode) *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); - - //Remove this node from the linked list, - //so that the left and right nodes point at each other instead of this node - if(leftNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) rightNeighborPointers[leftNodeIndex] = rightNodeIndex; - if(rightNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) leftNeighborPointers[rightNodeIndex] = leftNodeIndex; - - //For debug - leftNeighborPointers[internalNodeIndex] = -2; - rightNeighborPointers[internalNodeIndex] = -2; + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + leftIndex = i; + break; + } } - //Processing occurs from highest common prefix to lowest common prefix - //Nodes in the previously processed level have their children set, so we merge their child AABBs here - if (commonPrefix == processedCommonPrefix + 1) + for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i) { - int leftChildIndex = leftChildNodes[internalNodeIndex]; - int rightChildIndex = rightChildNodes[internalNodeIndex]; + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + rightIndex = i; + break; + } + } + +#else //Use binary search + + //Find nearest element to left with a lower common prefix + int leftIndex = -1; + { + int lower = 0; + int upper = internalNodeIndex - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int right = mid + 1; + if(right < internalNodeIndex) + { + b3Int64 rightPrefix = commonPrefixes[right]; + int rightPrefixLength = commonPrefixLengths[right]; + + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + lower = right; + leftIndex = right; + } + else + { + leftIndex = mid; + break; + } + } + else + { + leftIndex = mid; + break; + } + } + else upper = mid - 1; + } + } + + //Find nearest element to right with a lower common prefix + int rightIndex = -1; + { + int lower = internalNodeIndex + 1; + int upper = numInternalNodes - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int left = mid - 1; + if(left > internalNodeIndex) + { + b3Int64 leftPrefix = commonPrefixes[left]; + int leftPrefixLength = commonPrefixLengths[left]; + + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + upper = left; + rightIndex = left; + } + else + { + rightIndex = mid; + break; + } + } + else + { + rightIndex = mid; + break; + } + } + else lower = mid + 1; + } + } +#endif + + TEMP_out_leftLowerPrefix[internalNodeIndex] = leftIndex; + TEMP_out_rightLowerPrefix[internalNodeIndex] = rightIndex; + TEMP_spl_left[internalNodeIndex] = (leftIndex != -1) ? getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[leftIndex], commonPrefixLengths[leftIndex]) : -1; + TEMP_spl_right[internalNodeIndex] = (rightIndex != -1) ? getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[rightIndex], commonPrefixLengths[rightIndex]) : -1; + + //Select parent + { + int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength); + + if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false; + else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true; + + int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex; + + int isRootNode = (leftIndex == -1 && rightIndex == -1); + out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER; + + int isLeaf = 0; + if(!isRootNode) + { + int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } + else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } +} + +__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes, + __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes) +{ + if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0); + + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + // + int distanceFromRoot = 0; + { + int parentIndex = internalNodeParentNodes[internalNodeIndex]; + while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER) + { + parentIndex = internalNodeParentNodes[parentIndex]; + ++distanceFromRoot; + } + } + out_distanceFromRoot[internalNodeIndex] = distanceFromRoot; + + // + __local int localMaxDistanceFromRoot; + if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + atomic_max(&localMaxDistanceFromRoot, distanceFromRoot); + barrier(CLK_LOCAL_MEM_FENCE); + + if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot); +} + +__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices, + __global int2* childNodes, + __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs, + int maxDistanceFromRoot, int processedDistance, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int distance = distanceFromRoot[internalNodeIndex]; + + if(distance == processedDistance) + { + int leftChildIndex = childNodes[internalNodeIndex].x; + int rightChildIndex = childNodes[internalNodeIndex].y; int isLeftChildLeaf = isLeafNode(leftChildIndex); int isRightChildLeaf = isLeafNode(rightChildIndex); @@ -753,18 +744,3 @@ __kernel void buildBinaryRadixTreeInternalNodes(__global int* commonPrefixes, __ internalNodeAabbs[internalNodeIndex] = mergedAabb; } } - -__kernel void convertChildNodeFormat(__global int* leftChildNodes, __global int* rightChildNodes, - __global int2* out_childNodes, int numInternalNodes) -{ - int internalNodeIndex = get_global_id(0); - if (internalNodeIndex >= numInternalNodes) return; - - int2 childNodesIndices; - childNodesIndices.x = leftChildNodes[internalNodeIndex]; - childNodesIndices.y = rightChildNodes[internalNodeIndex]; - - out_childNodes[internalNodeIndex] = childNodesIndices; -} - - diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h index 1a1437d10..46848b1aa 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -41,34 +41,29 @@ static const char* parallelLinearBvhCL= \ " //........ ........ ......12 3456789A //x\n" " //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" " \n" -" //........ ....1234 56789A12 3456789A //x |= (x << 10)\n" -" //........ ....1111 1....... ...11111 //0x 00 0F 80 1F\n" -" //........ ....1234 5....... ...6789A //x = ( x | (x << 10) ) & 0x000F801F; \n" +" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" +" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" +" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" " \n" -" //.......1 23451234 5.....67 89A6789A //x |= (x << 5)\n" -" //.......1 1.....11 1.....11 .....111 //0x 01 83 83 07\n" -" //.......1 2.....34 5.....67 .....89A //x = ( x | (x << 5) ) & 0x01838307;\n" +" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" +" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" +" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" " \n" -" //....12.1 2..34534 5..67.67 ..89A89A //x |= (x << 3)\n" -" //....1... 1..1...1 1..1...1 ..1...11 //0x 08 91 91 23\n" -" //....1... 2..3...4 5..6...7 ..8...9A //x = ( x | (x << 3) ) & 0x08919123;\n" +" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" +" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" +" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" " \n" -" //...11..2 2.33..4N 5.66..77 .88..9NA //x |= (x << 1) ( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} )\n" -" //....1..1 ..1...1. 1..1..1. .1...1.1 //0x 09 22 92 45\n" -" //....1..2 ..3...4. 5..6..7. .8...9.A //x = ( x | (x << 1) ) & 0x09229245;\n" -" \n" -" //...11.22 .33..445 5.66.77. 88..99AA //x |= (x << 1)\n" -" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 34 92 29\n" -" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = ( x | (x << 1) ) & 0x09349229;\n" +" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" +" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" " \n" " //........ ........ ......11 11111111 //0x000003FF\n" " x &= 0x000003FF; //Clear all bits above bit 10\n" " \n" -" x = ( x | (x << 10) ) & 0x000F801F;\n" -" x = ( x | (x << 5) ) & 0x01838307;\n" -" x = ( x | (x << 3) ) & 0x08919123;\n" -" x = ( x | (x << 1) ) & 0x09229245;\n" -" x = ( x | (x << 1) ) & 0x09349229;\n" +" x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" x = (x ^ (x << 8)) & 0x0300F00F;\n" +" x = (x ^ (x << 4)) & 0x030C30C3;\n" +" x = (x ^ (x << 2)) & 0x09249249;\n" " \n" " return x;\n" "}\n" @@ -150,143 +145,10 @@ static const char* parallelLinearBvhCL= \ "#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" "//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" "//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" -"//In both cases, the bit should be cleared to access the index.\n" +"//In both cases, the bit should be cleared to access the actual node index.\n" "int isLeafNode(int index) { return (index >> 31 == 0); }\n" "int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" "int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" -"__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,\n" -" __global int* numNodesPerLevel,\n" -" __global int2* out_internalNodeChildIndices, \n" -" __global int* out_internalNodeParentNodes, \n" -" __global int* out_leafNodeParentNodes, \n" -" int numLevels, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if(internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" //Find the level that this node is in, using linear search(could replace with binary search)\n" -" int level = 0;\n" -" int numInternalLevels = numLevels - 1; //All levels except the last are internal nodes\n" -" for(; level < numInternalLevels; ++level)\n" -" {\n" -" if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break;\n" -" }\n" -" \n" -" //Check lower levels to find child nodes\n" -" //Left child is always in the next level, but the same does not apply to the right child\n" -" int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level];\n" -" int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1]; //Should never be out of bounds(see for loop above)\n" -" \n" -" int leftChildLevel = level + 1;\n" -" int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2;\n" -" \n" -" int rightChildLevel = level + 1;\n" -" int rightChildIndex = leftChildIndex + 1;\n" -" \n" -" //Under certain conditions, the right child index as calculated above is invalid; need to find the correct index\n" -" //\n" -" //First condition: must be at least 2 levels apart from the leaf node level;\n" -" //if the current level is right next to the leaf node level, then the right child\n" -" //will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access)\n" -" //\n" -" //Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid\n" -" //\n" -" //Third condition: must be the last node in its level\n" -" if( level < numLevels - 2 \n" -" && numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] \n" -" && indexInLevel == numNodesPerLevel[level] - 1 )\n" -" {\n" -" //Check lower levels until we find a node without a parent\n" -" for(; rightChildLevel < numLevels - 1; ++rightChildLevel)\n" -" {\n" -" int rightChildNextLevel = rightChildLevel + 1;\n" -" \n" -" //If this branch is taken, it means that the last node in rightChildNextLevel has no parent\n" -" if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] )\n" -" {\n" -" //Set the node to the last node in rightChildNextLevel\n" -" rightChildLevel = rightChildNextLevel;\n" -" rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1;\n" -" break;\n" -" }\n" -" }\n" -" }\n" -" \n" -" int isLeftChildLeaf = (leftChildLevel >= numLevels - 1);\n" -" int isRightChildLeaf = (rightChildLevel >= numLevels - 1);\n" -" \n" -" //If left/right child is a leaf node, the index needs to be corrected\n" -" //the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array,\n" -" //with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays\n" -" {\n" -" int leafNodeLevel = numLevels - 1;\n" -" leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n" -" rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n" -" }\n" -" \n" -" //Set the negative sign bit if the node is internal\n" -" int2 childIndices;\n" -" childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex);\n" -" childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex);\n" -" out_internalNodeChildIndices[internalNodeIndex] = childIndices;\n" -" \n" -" //Assign parent node index to children\n" -" __global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n" -" out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex;\n" -" \n" -" __global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n" -" out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex;\n" -"}\n" -"__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,\n" -" __global int* numNodesPerLevel, \n" -" __global int2* internalNodeChildIndices,\n" -" __global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global b3AabbCL* leafNodeAabbs, \n" -" __global int2* out_internalNodeLeafIndexRanges,\n" -" __global b3AabbCL* out_internalNodeAabbs, \n" -" int numLevels, int numInternalNodes, int level)\n" -"{\n" -" int i = get_global_id(0);\n" -" if(i >= numInternalNodes) return;\n" -" \n" -" //For each node in a level, check its child nodes to determine its AABB\n" -" {\n" -" int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]\n" -" \n" -" int numNodesInLevel = numNodesPerLevel[level];\n" -" if(indexInLevel < numNodesInLevel)\n" -" {\n" -" int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n" -" int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n" -" \n" -" int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x);\n" -" int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y);\n" -" \n" -" int isLeftChildLeaf = isLeafNode(childIndicies.x);\n" -" int isRightChildLeaf = isLeafNode(childIndicies.y);\n" -" \n" -" //left/RightChildLeafIndex == Rigid body indicies\n" -" int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" -" int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" -" \n" -" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n" -" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n" -" \n" -" //\n" -" b3AabbCL internalNodeAabb;\n" -" internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" -" internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" -" out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n" -" \n" -" //For index range, x == min and y == max; left child always has lower index\n" -" int2 leafIndexRange;\n" -" leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x;\n" -" leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y;\n" -" \n" -" out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange;\n" -" }\n" -" }\n" -"}\n" "//From sap.cl\n" "#define NEW_PAIR_MARKER -1\n" "bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" @@ -539,78 +401,57 @@ static const char* parallelLinearBvhCL= \ " }\n" " }\n" "}\n" -"#define B3_PLBVH_LINKED_LIST_INVALID_NODE -1\n" -"int longestCommonPrefix(int i, int j) { return clz(i ^ j); }\n" -"__kernel void computePrefixAndInitPointers(__global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global int* out_commonPrefixes,\n" -" __global int* out_leftInternalNodePointers, \n" -" __global int* out_rightInternalNodePointers,\n" +"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" +"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" +"//Duplicates common prefixes increase the highest common prefix by N, where 2^N is the number of duplicate nodes.\n" +"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" +"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" +"#define b3Int64 long\n" +"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" +"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" +"{\n" +" //This function only needs to return (i & j) in order for the algorithm to work,\n" +" //but it may help with debugging to mask out the lower bits.\n" +" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" +" b3Int64 sharedBits = i & j;\n" +" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" +" \n" +" return sharedBits & bitmask;\n" +"}\n" +"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" +"{\n" +" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" +"}\n" +"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global b3Int64* out_commonPrefixes,\n" +" __global int* out_commonPrefixLengths,\n" " int numInternalNodes)\n" "{\n" " int internalNodeIndex = get_global_id(0);\n" " if (internalNodeIndex >= numInternalNodes) return;\n" " \n" -" //Compute common prefix\n" -" {\n" -" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" -" //and the number of internal nodes is always numLeafNodes - 1\n" -" int leftLeafMortonCode = mortonCodesAndAabbIndices[internalNodeIndex].m_key;\n" -" int rightLeafMortonCode = mortonCodesAndAabbIndices[internalNodeIndex + 1].m_key;\n" +" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" +" //and the number of internal nodes is always numLeafNodes - 1\n" +" int leftLeafIndex = internalNodeIndex;\n" +" int rightLeafIndex = internalNodeIndex + 1;\n" " \n" -" out_commonPrefixes[internalNodeIndex] = longestCommonPrefix(leftLeafMortonCode, rightLeafMortonCode);\n" -" }\n" +" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" +" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" " \n" -" //Assign neighbor pointers of this node\n" -" {\n" -" int leftInternalIndex = internalNodeIndex - 1;\n" -" int rightInternalIndex = internalNodeIndex + 1;\n" -" \n" -" out_leftInternalNodePointers[internalNodeIndex] = (leftInternalIndex >= 0) ? leftInternalIndex : B3_PLBVH_LINKED_LIST_INVALID_NODE;\n" -" out_rightInternalNodePointers[internalNodeIndex] = (rightInternalIndex < numInternalNodes) ? rightInternalIndex : B3_PLBVH_LINKED_LIST_INVALID_NODE;\n" -" }\n" +" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" +" //Append the index of each leaf node to each morton code so that there are no duplicates.\n" +" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" +" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" +" //\n" +" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" +" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" +" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" +" \n" +" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" "}\n" -"__kernel void correctDuplicatePrefixes(__global int* commonPrefixes, __global int* out_maxCommonPrefix, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if (internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" int commonPrefix = commonPrefixes[internalNodeIndex];\n" -" \n" -" //Linear search to find the size of the subtree\n" -" int firstSubTreeIndex = internalNodeIndex;\n" -" int lastSubTreeIndex = internalNodeIndex;\n" -" \n" -" while(firstSubTreeIndex - 1 >= 0 && commonPrefix == commonPrefixes[firstSubTreeIndex - 1]) --firstSubTreeIndex;\n" -" while(lastSubTreeIndex + 1 < numInternalNodes && commonPrefix == commonPrefixes[lastSubTreeIndex + 1]) ++lastSubTreeIndex;\n" -" \n" -" //Fix duplicate common prefixes by incrementing them so that a subtree is formed.\n" -" //Recursively divide the tree until the position of the split is this node's index.\n" -" //Every time this node is not the split node, increment the common prefix.\n" -" int isCurrentSplitNode = false;\n" -" int correctedCommonPrefix = commonPrefix;\n" -" \n" -" while(!isCurrentSplitNode)\n" -" {\n" -" int numInternalNodesInSubTree = lastSubTreeIndex - firstSubTreeIndex + 1;\n" -" int splitNodeIndex = firstSubTreeIndex + numInternalNodesInSubTree / 2;\n" -" \n" -" if(internalNodeIndex > splitNodeIndex) firstSubTreeIndex = splitNodeIndex + 1;\n" -" else if(internalNodeIndex < splitNodeIndex) lastSubTreeIndex = splitNodeIndex - 1;\n" -" //else if(internalNodeIndex == splitNodeIndex) break;\n" -" \n" -" isCurrentSplitNode = (internalNodeIndex == splitNodeIndex);\n" -" if(!isCurrentSplitNode) correctedCommonPrefix++;\n" -" }\n" -" \n" -" commonPrefixes[internalNodeIndex] = correctedCommonPrefix;\n" -" atomic_max(out_maxCommonPrefix, correctedCommonPrefix);\n" -"}\n" -"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" -"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" -"//Duplicates common prefixes increase the highest common prefix by N, where 2^N is the number of duplicate nodes.\n" -"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" -"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixes, __global int* out_leftChildNodes, \n" -" __global int* out_rightChildNodes, int numLeafNodes)\n" +"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" +" __global int2* out_childNodes, int numLeafNodes)\n" "{\n" " int leafNodeIndex = get_global_id(0);\n" " if (leafNodeIndex >= numLeafNodes) return;\n" @@ -620,8 +461,8 @@ static const char* parallelLinearBvhCL= \ " int leftSplitIndex = leafNodeIndex - 1;\n" " int rightSplitIndex = leafNodeIndex;\n" " \n" -" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixes[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixes[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" " \n" " //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" " //Leaf nodes are considered as having the highest common prefix\n" @@ -633,71 +474,218 @@ static const char* parallelLinearBvhCL= \ " if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" " \n" " int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" +" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" " \n" -" //If the left node is the parent, then this node is its right child and vice versa\n" -" __global int* out_childNode = (isLeftHigherCommonPrefix) ? out_rightChildNodes : out_leftChildNodes;\n" +" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" " \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" " int isLeaf = 1;\n" -" out_childNode[parentNodeIndex] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" "}\n" -"__kernel void buildBinaryRadixTreeInternalNodes(__global int* commonPrefixes, __global SortDataCL* mortonCodesAndAabbIndices,\n" -" __global int* leftChildNodes, __global int* rightChildNodes,\n" -" __global int* leftNeighborPointers, __global int* rightNeighborPointers,\n" -" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" -" __global int* out_rootNodeIndex,\n" -" int processedCommonPrefix, int numInternalNodes)\n" +"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" +" __global int2* out_childNodes,\n" +" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" +" __global int* TEMP_out_leftLowerPrefix, __global int* TEMP_out_rightLowerPrefix,\n" +" __global int* TEMP_spl_left, __global int* TEMP_spl_right,\n" +" int numInternalNodes)\n" "{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if (internalNodeIndex >= numInternalNodes) return;\n" +" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" " \n" -" int commonPrefix = commonPrefixes[internalNodeIndex];\n" -" if (commonPrefix == processedCommonPrefix)\n" +" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" +" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" +" \n" +"//#define USE_LINEAR_SEARCH\n" +"#ifdef USE_LINEAR_SEARCH\n" +" int leftIndex = -1;\n" +" int rightIndex = -1;\n" +" \n" +" for(int i = internalNodeIndex - 1; i >= 0; --i)\n" " {\n" -" //Check neighbors and compare the common prefix to select the parent node\n" -" int leftNodeIndex = leftNeighborPointers[internalNodeIndex];\n" -" int rightNodeIndex = rightNeighborPointers[internalNodeIndex];\n" -" \n" -" int leftCommonPrefix = (leftNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) ? commonPrefixes[leftNodeIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" int rightCommonPrefix = (rightNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) ? commonPrefixes[rightNodeIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" -" \n" -" //Parent node is the highest common prefix that is lower than the node's common prefix\n" -" //Since the nodes with lower common prefixes are removed, that condition does not have to be tested for,\n" -" //and we only need to pick the node with the higher prefix.\n" -" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" -" \n" -" //\n" -" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" -" else if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" -" \n" -" int isRootNode = false;\n" -" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX && rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isRootNode = true;\n" -" \n" -" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftNodeIndex : rightNodeIndex;\n" -" \n" -" //If the left node is the parent, then this node is its right child and vice versa\n" -" __global int* out_childNode = (isLeftHigherCommonPrefix) ? rightChildNodes : leftChildNodes;\n" -" \n" -" int isLeaf = 0;\n" -" if(!isRootNode) out_childNode[parentNodeIndex] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -" \n" -" if(isRootNode) *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" -" \n" -" //Remove this node from the linked list, \n" -" //so that the left and right nodes point at each other instead of this node\n" -" if(leftNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) rightNeighborPointers[leftNodeIndex] = rightNodeIndex;\n" -" if(rightNodeIndex != B3_PLBVH_LINKED_LIST_INVALID_NODE) leftNeighborPointers[rightNodeIndex] = leftNodeIndex;\n" -" \n" -" //For debug\n" -" leftNeighborPointers[internalNodeIndex] = -2;\n" -" rightNeighborPointers[internalNodeIndex] = -2;\n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" leftIndex = i;\n" +" break;\n" +" }\n" " }\n" " \n" -" //Processing occurs from highest common prefix to lowest common prefix\n" -" //Nodes in the previously processed level have their children set, so we merge their child AABBs here\n" -" if (commonPrefix == processedCommonPrefix + 1)\n" +" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" " {\n" -" int leftChildIndex = leftChildNodes[internalNodeIndex];\n" -" int rightChildIndex = rightChildNodes[internalNodeIndex];\n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" rightIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +"#else //Use binary search\n" +" //Find nearest element to left with a lower common prefix\n" +" int leftIndex = -1;\n" +" {\n" +" int lower = 0;\n" +" int upper = internalNodeIndex - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int right = mid + 1;\n" +" if(right < internalNodeIndex)\n" +" {\n" +" b3Int64 rightPrefix = commonPrefixes[right];\n" +" int rightPrefixLength = commonPrefixLengths[right];\n" +" \n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" lower = right;\n" +" leftIndex = right;\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else upper = mid - 1;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" int rightIndex = -1;\n" +" {\n" +" int lower = internalNodeIndex + 1;\n" +" int upper = numInternalNodes - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int left = mid - 1;\n" +" if(left > internalNodeIndex)\n" +" {\n" +" b3Int64 leftPrefix = commonPrefixes[left];\n" +" int leftPrefixLength = commonPrefixLengths[left];\n" +" \n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" upper = left;\n" +" rightIndex = left;\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else lower = mid + 1;\n" +" }\n" +" }\n" +"#endif\n" +" \n" +" TEMP_out_leftLowerPrefix[internalNodeIndex] = leftIndex;\n" +" TEMP_out_rightLowerPrefix[internalNodeIndex] = rightIndex;\n" +" TEMP_spl_left[internalNodeIndex] = (leftIndex != -1) ? getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[leftIndex], commonPrefixLengths[leftIndex]) : -1;\n" +" TEMP_spl_right[internalNodeIndex] = (rightIndex != -1) ? getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[rightIndex], commonPrefixLengths[rightIndex]) : -1;\n" +" \n" +" //Select parent\n" +" {\n" +" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" +" \n" +" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" +" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" +" \n" +" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" +" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" +" \n" +" int isLeaf = 0;\n" +" if(!isRootNode)\n" +" {\n" +" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +"}\n" +"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" +" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" +"{\n" +" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //\n" +" int distanceFromRoot = 0;\n" +" {\n" +" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" +" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" +" {\n" +" parentIndex = internalNodeParentNodes[parentIndex];\n" +" ++distanceFromRoot;\n" +" }\n" +" }\n" +" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" +" \n" +" //\n" +" __local int localMaxDistanceFromRoot;\n" +" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" +"}\n" +"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int2* childNodes,\n" +" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" +" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int distance = distanceFromRoot[internalNodeIndex];\n" +" \n" +" if(distance == processedDistance)\n" +" {\n" +" int leftChildIndex = childNodes[internalNodeIndex].x;\n" +" int rightChildIndex = childNodes[internalNodeIndex].y;\n" " \n" " int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" " int isRightChildLeaf = isLeafNode(rightChildIndex);\n" @@ -718,16 +706,4 @@ static const char* parallelLinearBvhCL= \ " internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" " }\n" "}\n" -"__kernel void convertChildNodeFormat(__global int* leftChildNodes, __global int* rightChildNodes, \n" -" __global int2* out_childNodes, int numInternalNodes)\n" -"{\n" -" int internalNodeIndex = get_global_id(0);\n" -" if (internalNodeIndex >= numInternalNodes) return;\n" -" \n" -" int2 childNodesIndices;\n" -" childNodesIndices.x = leftChildNodes[internalNodeIndex];\n" -" childNodesIndices.y = rightChildNodes[internalNodeIndex];\n" -" \n" -" out_childNodes[internalNodeIndex] = childNodesIndices;\n" -"}\n" ;