diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
index 0593fc136..a478f9e6f 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
@@ -69,6 +69,9 @@ b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id
 	m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
 	b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
 	
+	m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
+	b3Assert(m_findLeafIndexRangesKernel);
+	
 	m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
 	b3Assert(m_plbvhCalculateOverlappingPairsKernel);
 	m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
@@ -91,6 +94,8 @@ b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
 	clReleaseKernel(m_findDistanceFromRootKernel);
 	clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
 	
+	clReleaseKernel(m_findLeafIndexRangesKernel);
+	
 	clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
 	clReleaseKernel(m_plbvhRayTraverseKernel);
 	clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
@@ -253,7 +258,33 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
 	}
 	
 	//
-	constructRadixBinaryTree();
+	constructBinaryRadixTree();
+	
+	
+	//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
+	//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
+	//The child nodes of each node split their parent's index range into 2 contiguous halves.
+	//
+	//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
+	//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
+	//
+	//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
+	{
+		B3_PROFILE("m_findLeafIndexRangesKernel");
+	
+		b3BufferInfoCL bufferInfo[] = 
+		{
+			b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
+			b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
+		};
+		
+		b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
+		launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+		launcher.setConst(numInternalNodes);
+		
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
 }
 
 void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<int>& out_numPairs, b3OpenCLArray<b3Int4>& out_overlappingPairs)
@@ -410,9 +441,9 @@ void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayIn
 	
 }
 
-void b3GpuParallelLinearBvh::constructRadixBinaryTree()
+void b3GpuParallelLinearBvh::constructBinaryRadixTree()
 {
-	B3_PROFILE("b3GpuParallelLinearBvh::constructRadixBinaryTree()");
+	B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
 	
 	int numLeaves = m_leafNodeAabbs.size();
 	int numInternalNodes = numLeaves - 1;
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
index e54080a71..f1d1049df 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
@@ -63,6 +63,8 @@ class b3GpuParallelLinearBvh
 	cl_kernel m_findDistanceFromRootKernel;
 	cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
 	
+	cl_kernel m_findLeafIndexRangesKernel;
+	
 	//Traversal kernels
 	cl_kernel m_plbvhCalculateOverlappingPairsKernel;
 	cl_kernel m_plbvhRayTraverseKernel;
@@ -117,7 +119,7 @@ public:
 								b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
 								
 private:
-	void constructRadixBinaryTree();
+	void constructBinaryRadixTree();
 };
 
 #endif
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
index b28c5ab56..83f355849 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
@@ -211,12 +211,12 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
 		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false
 		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
 		
-		//Optimization - if the node is not a leaf, check whether the highest leaf index of that node
-		//is less than the queried node's index to avoid testing each pair twice.
+		//Optimization - if the BVH is structured as a binary radix tree, then
+		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).
+		//This can be used to avoid testing each AABB-AABB pair twice.
 		{
-			//	fix: produces duplicate pairs
-		//	int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;
-		//	if(highestLeafIndex < queryBvhNodeIndex) continue;
+			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;
+			if(highestLeafIndex < queryBvhNodeIndex) continue;
 		}
 		
 		//bvhRigidIndex is not used if internal node
@@ -225,7 +225,7 @@ __kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
 		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
 		if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
 		{
-			if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])
+			if(isLeaf)
 			{
 				int4 pair;
 				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
@@ -741,3 +741,32 @@ __kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot,
 		internalNodeAabbs[internalNodeIndex] = mergedAabb;
 	}
 }
+
+__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)
+{
+	int internalNodeIndex = get_global_id(0);
+	if(internalNodeIndex >= numInternalNodes) return;
+	
+	int numLeafNodes = numInternalNodes + 1;
+	
+	int2 childNodes = internalNodeChildNodes[internalNodeIndex];
+	
+	int2 leafIndexRange;	//x == min leaf index, y == max leaf index
+	
+	//Find lowest leaf index covered by this internal node
+	{
+		int lowestIndex = childNodes.x;		//childNodes.x == Left child
+		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;
+		leafIndexRange.x = lowestIndex;
+	}
+	
+	//Find highest leaf index covered by this internal node
+	{
+		int highestIndex = childNodes.y;	//childNodes.y == Right child
+		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;
+		leafIndexRange.y = highestIndex;
+	}
+	
+	//
+	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;
+}
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
index 731423098..40c5e3218 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@@ -196,12 +196,12 @@ static const char* parallelLinearBvhCL= \
 "		int isLeaf = isLeafNode(internalOrLeafNodeIndex);	//Internal node if false\n"
 "		int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
 "		\n"
-"		//Optimization - if the node is not a leaf, check whether the highest leaf index of that node\n"
-"		//is less than the queried node's index to avoid testing each pair twice.\n"
+"		//Optimization - if the BVH is structured as a binary radix tree, then\n"
+"		//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
+"		//This can be used to avoid testing each AABB-AABB pair twice.\n"
 "		{\n"
-"			//	fix: produces duplicate pairs\n"
-"		//	int highestLeafIndex = (isLeaf) ? numQueryAabbs : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
-"		//	if(highestLeafIndex < queryBvhNodeIndex) continue;\n"
+"			int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
+"			if(highestLeafIndex < queryBvhNodeIndex) continue;\n"
 "		}\n"
 "		\n"
 "		//bvhRigidIndex is not used if internal node\n"
@@ -210,7 +210,7 @@ static const char* parallelLinearBvhCL= \
 "		b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
 "		if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
 "		{\n"
-"			if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])\n"
+"			if(isLeaf)\n"
 "			{\n"
 "				int4 pair;\n"
 "				pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
@@ -702,4 +702,32 @@ static const char* parallelLinearBvhCL= \
 "		internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
 "	}\n"
 "}\n"
+"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
+"{\n"
+"	int internalNodeIndex = get_global_id(0);\n"
+"	if(internalNodeIndex >= numInternalNodes) return;\n"
+"	\n"
+"	int numLeafNodes = numInternalNodes + 1;\n"
+"	\n"
+"	int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
+"	\n"
+"	int2 leafIndexRange;	//x == min leaf index, y == max leaf index\n"
+"	\n"
+"	//Find lowest leaf index covered by this internal node\n"
+"	{\n"
+"		int lowestIndex = childNodes.x;		//childNodes.x == Left child\n"
+"		while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
+"		leafIndexRange.x = lowestIndex;\n"
+"	}\n"
+"	\n"
+"	//Find highest leaf index covered by this internal node\n"
+"	{\n"
+"		int highestIndex = childNodes.y;	//childNodes.y == Right child\n"
+"		while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
+"		leafIndexRange.y = highestIndex;\n"
+"	}\n"
+"	\n"
+"	//\n"
+"	out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
+"}\n"
 ;