diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index 58ec34ab8..f2199a3cf 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -44,10 +44,8 @@ subject to the following restrictions: ///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree. /// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling. /// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the -/// Z-curve anyways, and this method should be simpler and faster during construction. Additionally, it is still possible -/// to improve the quality of the BVH by rearranging the connections between nodes. -/// - Due to the way the tree is constructed, it becomes unnecessary to use atomic_inc to get -/// the AABB for each internal node. Rather than traveling upwards from the leaf nodes, as in the paper, +/// Z-curve anyways, and this method should be simpler and faster during construction. +/// - Rather than traveling upwards towards the root from the leaf nodes, as in the paper, /// each internal node checks its child nodes to get its AABB. class b3GpuParallelLinearBvh { @@ -73,6 +71,7 @@ class b3GpuParallelLinearBvh //1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node b3OpenCLArray m_internalNodeAabbs; + b3OpenCLArray m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index b3OpenCLArray m_internalNodeChildNodes; //x == left child, y == right child b3OpenCLArray m_internalNodeParentNodes; @@ -90,6 +89,7 @@ public: m_numNodesPerLevelGpu(context, queue), m_firstIndexOffsetPerLevelGpu(context, queue), m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), m_internalNodeChildNodes(context, queue), m_internalNodeParentNodes(context, queue), m_leafNodeParentNodes(context, queue), @@ -128,8 +128,6 @@ public: clReleaseProgram(m_parallelLinearBvhProgram); } - - // fix: need to handle/test case with 2 nodes void build(const b3OpenCLArray& worldSpaceAabbs) { @@ -143,6 +141,7 @@ public: // { m_internalNodeAabbs.resize(numInternalNodes); + m_internalNodeLeafIndexRanges.resize(numInternalNodes); m_internalNodeChildNodes.resize(numInternalNodes); m_internalNodeParentNodes.resize(numInternalNodes); @@ -180,17 +179,20 @@ public: //Calculate number of nodes in each level; //start from the second to last level(level right next to leaf nodes) and move towards the root(level 0) - int hasRemainder = 0; + int remainder = 0; for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex) { int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves - - bool allNodesAllocated = ( (numNodesPreviousLevel + hasRemainder) % 2 == 0 ); - - int numNodesCurrentLevel = (allNodesAllocated) ? (numNodesPreviousLevel + hasRemainder) / 2 : numNodesPreviousLevel / 2; - m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel; + int numNodesCurrentLevel = numNodesPreviousLevel / 2; - hasRemainder = static_cast(!allNodesAllocated); + remainder += numNodesPreviousLevel % 2; + if(remainder == 2) + { + numNodesCurrentLevel++; + remainder = 0; + } + + m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel; } //Prefix sum to calculate the first index offset of each level @@ -232,17 +234,22 @@ public: B3_PROFILE("Find AABB of merged nodes"); m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs); //Need to make a copy since the kernel modifies the array - - b3BufferInfoCL bufferInfo[] = + + for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) { - b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] - }; + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] + }; + + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numAabbsNeedingMerge); + + launcher.launch1D(numAabbsNeedingMerge); + } - b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLeaves); - - launcher.launch1D(numLeaves); clFinish(m_queue); } @@ -315,7 +322,8 @@ public: m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false); clFinish(m_queue); - for(int i = 0; i < 256; ++i) printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y); + for(int i = 0; i < numInternalNodes; ++i) + printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y); printf("\n"); } } @@ -325,30 +333,58 @@ public: { B3_PROFILE("Set AABBs"); - b3BufferInfoCL bufferInfo[] = + //Due to the arrangement of internal nodes, each internal node corresponds + //to a contiguous range of leaf node indices. This characteristic can be used + //to optimize calculateOverlappingPairs(); checking if + //(m_internalNodeLeafIndexRanges[].y < leafNodeIndex) can be used to ensure that + //each pair is processed only once. { - b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) - }; + B3_PROFILE("Reset internal node index ranges"); + + b3Int2 invalidIndexRange; + invalidIndexRange.x = -1; //x == min + invalidIndexRange.y = -2; //y == max + + m_fill.execute( m_internalNodeLeafIndexRanges, invalidIndexRange, m_internalNodeLeafIndexRanges.size() ); + clFinish(m_queue); + } - b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLevels); - launcher.setConst(numInternalNodes); - - launcher.launch1D(numLeaves); + int lastInternalLevelIndex = numLevels - 2; //Last level is leaf node level + for(int level = lastInternalLevelIndex; level >= 0; --level) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLevels); + launcher.setConst(numInternalNodes); + launcher.setConst(level); + + launcher.launch1D(numLeaves); + } clFinish(m_queue); if(0) { - b3SapAabb mergedAABB = m_mergedAabb.at(0); - printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z); - printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z); + static b3AlignedObjectArray leafIndexRanges; + m_internalNodeLeafIndexRanges.copyToHost(leafIndexRanges, false); + clFinish(m_queue); + + for(int i = 0; i < numInternalNodes; ++i) + //if(leafIndexRanges[i].x == -1 || leafIndexRanges[i].y == -1) + printf("leafIndexRanges[%d]: %d, %d\n", i, leafIndexRanges[i].x, leafIndexRanges[i].y); + printf("\n"); } + if(0) { static b3AlignedObjectArray rigidAabbs; @@ -363,12 +399,18 @@ public: actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec); actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec); } - printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z); - printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z); - + b3SapAabb rootAabb = m_internalNodeAabbs.at(0); - printf("rootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z); - printf("rootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z); + b3SapAabb mergedAABB = m_mergedAabb.at(0); + + printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z); + printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z); + printf("kernelRootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z); + + printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z); + printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z); + printf("kernelRootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z); + printf("\n"); } } @@ -397,6 +439,7 @@ public: b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), b3BufferInfoCL( out_numPairs.getBufferCL() ), diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h index 007067460..761f4b168 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -77,11 +77,7 @@ public: virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } virtual b3OpenCLArray& getAllAabbsGPU() { return m_aabbsGpu; } - virtual b3AlignedObjectArray& getAllAabbsCPU() - { - b3Assert(0); //CPU version not implemented - return m_aabbsCpu; - } + virtual b3AlignedObjectArray& getAllAabbsCPU() { return m_aabbsCpu; } static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) { diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl index 51c8ce281..dc283ac95 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -80,60 +80,32 @@ unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z) return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; } - -__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs) +//Should replace with an optimized parallel reduction +__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge) { - int aabbIndex = get_global_id(0); - if(aabbIndex >= numAabbs) return; - - //Find the most significant bit(msb) - int mostSignificantBit = 0; - { - int temp = numAabbs; - while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) - } - - int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit ); - int numRemainingAabbs = (1 << mostSignificantBit); - - //Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2 - //For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30] - if(aabbIndex < numberOfAabbsAboveMsbSplit) - { - int otherAabbIndex = numRemainingAabbs + aabbIndex; - - b3AabbCL aabb = out_mergedAabb[aabbIndex]; - b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; - - b3AabbCL mergedAabb; - mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); - mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); - out_mergedAabb[aabbIndex] = mergedAabb; - } - - barrier(CLK_GLOBAL_MEM_FENCE); - + //Each time this kernel is added to the command queue, + //the number of AABBs needing to be merged is halved // - int offset = numRemainingAabbs / 2; - while(offset >= 1) - { - if(aabbIndex < offset) - { - int otherAabbIndex = aabbIndex + offset; + //Example with 159 AABBs: + // numRemainingAabbs == 159 / 2 + 159 % 2 == 80 + // numMergedAabbs == 159 - 80 == 79 + //So, indices [0, 78] are merged with [0 + 80, 78 + 80] + + int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2; + int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs; + + int aabbIndex = get_global_id(0); + if(aabbIndex >= numMergedAabbs) return; + + int otherAabbIndex = aabbIndex + numRemainingAabbs; + + b3AabbCL aabb = out_mergedAabb[aabbIndex]; + b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; - b3AabbCL aabb = out_mergedAabb[aabbIndex]; - b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; - - b3AabbCL mergedAabb; - mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); - mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); - out_mergedAabb[aabbIndex] = mergedAabb; - } - - offset /= 2; - - barrier(CLK_GLOBAL_MEM_FENCE); - } + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); + mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); + out_mergedAabb[aabbIndex] = mergedAabb; } __kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, @@ -254,7 +226,7 @@ __kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel, { int leafNodeLevel = numLevels - 1; leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex; - rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex; + rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex; } //Set the negative sign bit if the node is internal @@ -276,20 +248,19 @@ __kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel, __global int2* internalNodeChildIndices, __global SortDataCL* mortonCodesAndAabbIndices, __global b3AabbCL* leafNodeAabbs, - __global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes) + __global int2* out_internalNodeLeafIndexRanges, + __global b3AabbCL* out_internalNodeAabbs, + int numLevels, int numInternalNodes, int level) { int i = get_global_id(0); if(i >= numInternalNodes) return; - int numInternalLevels = numLevels - 1; - - //Starting from the level next to the leaf nodes, move towards the root(level 0) - for(int level = numInternalLevels - 1; level >= 0; --level) + //For each node in a level, check its child nodes to determine its AABB { int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level] int numNodesInLevel = numNodesPerLevel[level]; - if(i < numNodesInLevel) + if(indexInLevel < numNodesInLevel) { int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level]; int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal]; @@ -300,19 +271,26 @@ __kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel, int isLeftChildLeaf = isLeafNode(childIndicies.x); int isRightChildLeaf = isLeafNode(childIndicies.y); + //left/RightChildLeafIndex == Rigid body indicies int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex]; b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex]; + // b3AabbCL internalNodeAabb; internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb; + + //For index range, x == min and y == max; left child always has lower index + int2 leafIndexRange; + leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x; + leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y; + + out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange; } - - barrier(CLK_GLOBAL_MEM_FENCE); } } @@ -331,7 +309,9 @@ bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2) //From sap.cl __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, - __global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, __global SortDataCL* mortonCodesAndAabbIndices, __global int* out_numPairs, __global int4* out_overlappingPairs, int maxPairs, int numQueryAabbs) @@ -341,7 +321,8 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); if(queryRigidIndex >= numQueryAabbs) return; - queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; + int queryBvhNodeIndex = queryRigidIndex; + queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; // fix queryRigidIndex naming for this branch #else int queryRigidIndex = get_global_id(0); if(queryRigidIndex >= numQueryAabbs) return; @@ -363,7 +344,15 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); - + + //Optimization - if the node is not a leaf, check whether the highest leaf index of that node + //is less than the queried node's index to avoid testing each pair twice. + { + // fix: produces duplicate pairs + // int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y; + // if(highestLeafIndex < queryBvhNodeIndex) continue; + } + //bvhRigidIndex is not used if internal node int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h index fc3273bbc..6f99d51f2 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -75,59 +75,32 @@ static const char* parallelLinearBvhCL= \ "{\n" " return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" "}\n" -"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)\n" +"//Should replace with an optimized parallel reduction\n" +"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" "{\n" -" int aabbIndex = get_global_id(0);\n" -" if(aabbIndex >= numAabbs) return;\n" -" \n" -" //Find the most significant bit(msb)\n" -" int mostSignificantBit = 0;\n" -" {\n" -" int temp = numAabbs;\n" -" while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)\n" -" }\n" -" \n" -" int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );\n" -" int numRemainingAabbs = (1 << mostSignificantBit);\n" -" \n" -" //Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2\n" -" //For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]\n" -" if(aabbIndex < numberOfAabbsAboveMsbSplit)\n" -" {\n" -" int otherAabbIndex = numRemainingAabbs + aabbIndex;\n" -" \n" -" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" -" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" -" \n" -" b3AabbCL mergedAabb;\n" -" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" -" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" -" out_mergedAabb[aabbIndex] = mergedAabb;\n" -" }\n" -" \n" -" barrier(CLK_GLOBAL_MEM_FENCE);\n" -" \n" +" //Each time this kernel is added to the command queue, \n" +" //the number of AABBs needing to be merged is halved\n" " //\n" -" int offset = numRemainingAabbs / 2;\n" -" while(offset >= 1)\n" -" {\n" -" if(aabbIndex < offset)\n" -" {\n" -" int otherAabbIndex = aabbIndex + offset;\n" +" //Example with 159 AABBs:\n" +" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" +" // numMergedAabbs == 159 - 80 == 79\n" +" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" +" \n" +" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" +" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" +" \n" +" int aabbIndex = get_global_id(0);\n" +" if(aabbIndex >= numMergedAabbs) return;\n" +" \n" +" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" +" \n" +" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" " \n" -" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" -" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" -" \n" -" b3AabbCL mergedAabb;\n" -" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" -" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" -" out_mergedAabb[aabbIndex] = mergedAabb;\n" -" }\n" -" \n" -" offset /= 2;\n" -" \n" -" barrier(CLK_GLOBAL_MEM_FENCE);\n" -" }\n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +" out_mergedAabb[aabbIndex] = mergedAabb;\n" "}\n" "__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" " __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" @@ -244,7 +217,7 @@ static const char* parallelLinearBvhCL= \ " {\n" " int leafNodeLevel = numLevels - 1;\n" " leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n" -" rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n" +" rightChildIndex = (isRightChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n" " }\n" " \n" " //Set the negative sign bit if the node is internal\n" @@ -265,20 +238,19 @@ static const char* parallelLinearBvhCL= \ " __global int2* internalNodeChildIndices,\n" " __global SortDataCL* mortonCodesAndAabbIndices,\n" " __global b3AabbCL* leafNodeAabbs, \n" -" __global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)\n" +" __global int2* out_internalNodeLeafIndexRanges,\n" +" __global b3AabbCL* out_internalNodeAabbs, \n" +" int numLevels, int numInternalNodes, int level)\n" "{\n" " int i = get_global_id(0);\n" " if(i >= numInternalNodes) return;\n" " \n" -" int numInternalLevels = numLevels - 1;\n" -" \n" -" //Starting from the level next to the leaf nodes, move towards the root(level 0)\n" -" for(int level = numInternalLevels - 1; level >= 0; --level)\n" +" //For each node in a level, check its child nodes to determine its AABB\n" " {\n" " int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]\n" " \n" " int numNodesInLevel = numNodesPerLevel[level];\n" -" if(i < numNodesInLevel)\n" +" if(indexInLevel < numNodesInLevel)\n" " {\n" " int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n" " int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n" @@ -289,19 +261,26 @@ static const char* parallelLinearBvhCL= \ " int isLeftChildLeaf = isLeafNode(childIndicies.x);\n" " int isRightChildLeaf = isLeafNode(childIndicies.y);\n" " \n" +" //left/RightChildLeafIndex == Rigid body indicies\n" " int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" " int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" " \n" " b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n" " b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n" " \n" +" //\n" " b3AabbCL internalNodeAabb;\n" " internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" " internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" " out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n" +" \n" +" //For index range, x == min and y == max; left child always has lower index\n" +" int2 leafIndexRange;\n" +" leafIndexRange.x = (isLeftChildLeaf) ? leftChildIndex : out_internalNodeLeafIndexRanges[leftChildIndex].x;\n" +" leafIndexRange.y = (isRightChildLeaf) ? rightChildIndex : out_internalNodeLeafIndexRanges[rightChildIndex].y;\n" +" \n" +" out_internalNodeLeafIndexRanges[internalNodeIndexGlobal] = leafIndexRange;\n" " }\n" -" \n" -" barrier(CLK_GLOBAL_MEM_FENCE);\n" " }\n" "}\n" "//From sap.cl\n" @@ -316,7 +295,9 @@ static const char* parallelLinearBvhCL= \ "}\n" "//From sap.cl\n" "__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" -" __global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" " __global SortDataCL* mortonCodesAndAabbIndices,\n" " __global int* out_numPairs, __global int4* out_overlappingPairs, \n" " int maxPairs, int numQueryAabbs)\n" @@ -326,7 +307,8 @@ static const char* parallelLinearBvhCL= \ " int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" " if(queryRigidIndex >= numQueryAabbs) return;\n" " \n" -" queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n" +" int queryBvhNodeIndex = queryRigidIndex;\n" +" queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; // fix queryRigidIndex naming for this branch\n" "#else\n" " int queryRigidIndex = get_global_id(0);\n" " if(queryRigidIndex >= numQueryAabbs) return;\n" @@ -347,7 +329,15 @@ static const char* parallelLinearBvhCL= \ " \n" " int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" " int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" -" \n" +" \n" +" //Optimization - if the node is not a leaf, check whether the highest leaf index of that node\n" +" //is less than the queried node's index to avoid testing each pair twice.\n" +" {\n" +" // fix: produces duplicate pairs\n" +" // int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" +" // if(highestLeafIndex < queryBvhNodeIndex) continue;\n" +" }\n" +" \n" " //bvhRigidIndex is not used if internal node\n" " int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" " \n"