diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
index 10fbeb79f..58ec34ab8 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
@@ -55,6 +55,7 @@ class b3GpuParallelLinearBvh
 	
 	cl_program m_parallelLinearBvhProgram;
 	
+	cl_kernel m_findAllNodesMergedAabbKernel;
 	cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
 	cl_kernel m_constructBinaryTreeKernel;
 	cl_kernel m_determineInternalNodeAabbsKernel;
@@ -78,6 +79,7 @@ class b3GpuParallelLinearBvh
 	//1 element per leaf node
 	b3OpenCLArray<int> m_leafNodeParentNodes;
 	b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies;		//m_key = morton code, m_value == aabb index
+	b3OpenCLArray<b3SapAabb> m_mergedAabb;
 	
 public:
 	b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
@@ -91,7 +93,8 @@ public:
 		m_internalNodeChildNodes(context, queue),
 		m_internalNodeParentNodes(context, queue),
 		m_leafNodeParentNodes(context, queue),
-		m_mortonCodesAndAabbIndicies(context, queue)
+		m_mortonCodesAndAabbIndicies(context, queue),
+		m_mergedAabb(context, queue)
 	{
 		const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
 		
@@ -101,6 +104,8 @@ public:
 		m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
 		b3Assert(m_parallelLinearBvhProgram);
 		
+		m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
+		b3Assert(m_findAllNodesMergedAabbKernel);
 		m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
 		b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
 		m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros );
@@ -114,6 +119,7 @@ public:
 	
 	virtual ~b3GpuParallelLinearBvh() 
 	{
+		clReleaseKernel(m_findAllNodesMergedAabbKernel);
 		clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
 		clReleaseKernel(m_constructBinaryTreeKernel);
 		clReleaseKernel(m_determineInternalNodeAabbsKernel);
@@ -125,8 +131,7 @@ public:
 
 	//	fix: need to handle/test case with 2 nodes
 	
-	///@param cellsize A virtual grid of size 2^10^3 is used in the process of creating the BVH
-	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, b3Scalar cellSize)
+	void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs)
 	{
 		B3_PROFILE("b3ParallelLinearBvh::build()");
 	
@@ -143,6 +148,7 @@ public:
 	
 			m_leafNodeParentNodes.resize(numLeaves);
 			m_mortonCodesAndAabbIndicies.resize(numLeaves);
+			m_mergedAabb.resize(numLeaves);
 		}
 		
 		//Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) )
@@ -160,7 +166,7 @@ public:
 			//If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level
 			if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++;
 			
-			if(1) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit);
+			if(0) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit);
 		}
 		
 		//Determine number of nodes per level, use prefix sum to get offsets of each level, and send to GPU
@@ -202,7 +208,7 @@ public:
 					m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i];
 			}
 			
-			if(1)
+			if(0)
 			{
 				int numInternalNodes = 0;
 				for(int i = 0; i < numLevels; ++i) 
@@ -225,20 +231,22 @@ public:
 		{
 			B3_PROFILE("Find AABB of merged nodes");
 		
-			/*b3BufferInfoCL bufferInfo[] = 
+			m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs);	//Need to make a copy since the kernel modifies the array
+		
+			b3BufferInfoCL bufferInfo[] = 
 			{
-				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
-				b3BufferInfoCL( m_allNodesMergedAabb.getBufferCL() ),
+				b3BufferInfoCL( m_mergedAabb.getBufferCL() )		//Resulting AABB is stored in m_mergedAabb[0]
 			};
 			
-			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabb, "m_findAllNodesMergedAabb");
+			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
 			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
 			launcher.setConst(numLeaves);
 			
 			launcher.launch1D(numLeaves);
-			clFinish(m_queue);*/
+			clFinish(m_queue);
 		}
 		
+		
 		//Insert the center of the AABBs into a virtual grid,
 		//then convert the discrete grid coordinates into a morton code
 		//For each element in m_mortonCodesAndAabbIndicies, set
@@ -250,12 +258,12 @@ public:
 			b3BufferInfoCL bufferInfo[] = 
 			{
 				b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
+				b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
 				b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
 			};
 			
 			b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
 			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(cellSize);
 			launcher.setConst(numLeaves);
 			
 			launcher.launch1D(numLeaves);
@@ -301,7 +309,7 @@ public:
 			launcher.launch1D(numInternalNodes);
 			clFinish(m_queue);
 			
-			if(1)
+			if(0)
 			{
 				static b3AlignedObjectArray<b3Int2> internalNodeChildNodes;	
 				m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false);
@@ -335,6 +343,12 @@ public:
 			launcher.launch1D(numLeaves);
 			clFinish(m_queue);
 			
+			if(0)
+			{
+				b3SapAabb mergedAABB = m_mergedAabb.at(0);
+				printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z);
+				printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z);
+			}
 			if(0)
 			{
 				static b3AlignedObjectArray<b3SapAabb> rigidAabbs;
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
index 728215b7a..007067460 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h
@@ -56,14 +56,8 @@ public:
 	
 	virtual void calculateOverlappingPairs(int maxPairs)
 	{
-		//Detect overall min/max
-		{
-			//Not implemented
-		}
-		
 		//Reconstruct BVH
-		const b3Scalar CELL_SIZE(0.1); 
-		m_plbvh.build(m_aabbsGpu, CELL_SIZE);
+		m_plbvh.build(m_aabbsGpu);
 		
 		//
 		m_overlappingPairsGpu.resize(maxPairs);
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
index a5127c3aa..51c8ce281 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl
@@ -80,8 +80,8 @@ unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
 	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
 }
 
-/*
-__kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* out_mergedAabb, int numAabbs)
+
+__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)
 {
 	int aabbIndex = get_global_id(0);
 	if(aabbIndex >= numAabbs) return;
@@ -89,11 +89,11 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa
 	//Find the most significant bit(msb)
 	int mostSignificantBit = 0;
 	{
-		int temp = numLeaves;
+		int temp = numAabbs;
 		while(temp >>= 1) mostSignificantBit++;		//Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)
 	}
 	
-	int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );	//	verify
+	int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );
 	int numRemainingAabbs = (1 << mostSignificantBit);
 	
 	//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2
@@ -102,8 +102,8 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa
 	{
 		int otherAabbIndex = numRemainingAabbs + aabbIndex;
 		
-		b3AabbCL aabb = worldSpaceAabbs[aabbIndex];
-		b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex];
+		b3AabbCL aabb = out_mergedAabb[aabbIndex];
+		b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
 		
 		b3AabbCL mergedAabb;
 		mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
@@ -121,8 +121,8 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa
 		{
 			int otherAabbIndex = aabbIndex + offset;
 		
-			b3AabbCL aabb = worldSpaceAabbs[aabbIndex];
-			b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex];
+			b3AabbCL aabb = out_mergedAabb[aabbIndex];
+			b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
 		
 			b3AabbCL mergedAabb;
 			mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
@@ -130,27 +130,29 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa
 			out_mergedAabb[aabbIndex] = mergedAabb;
 		}
 		
-		offset = offset / 2;
+		offset /= 2;
 		
 		barrier(CLK_GLOBAL_MEM_FENCE);
 	}
 }
-*/
 
-__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, 
-												__global SortDataCL* out_mortonCodesAndAabbIndices, 
-												b3Scalar cellSize, int numAabbs)
+__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, 
+												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)
 {
 	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index
 	if(leafNodeIndex >= numAabbs) return;
 	
-	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
+	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];
+	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;
+	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;
 	
-	b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;
+	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
+	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;
+	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;
 	
 	//Quantize into integer coordinates
 	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
-	b3Vector3 gridPosition = center / cellSize;
+	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;
 	
 	int4 discretePosition;
 	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
index a6cabdb99..fc3273bbc 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h
@@ -75,20 +75,77 @@ static const char* parallelLinearBvhCL= \
 "{\n"
 "	return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
 "}\n"
-"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, \n"
-"												__global SortDataCL* out_mortonCodesAndAabbIndices, \n"
-"												b3Scalar cellSize, int numAabbs)\n"
+"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)\n"
+"{\n"
+"	int aabbIndex = get_global_id(0);\n"
+"	if(aabbIndex >= numAabbs) return;\n"
+"	\n"
+"	//Find the most significant bit(msb)\n"
+"	int mostSignificantBit = 0;\n"
+"	{\n"
+"		int temp = numAabbs;\n"
+"		while(temp >>= 1) mostSignificantBit++;		//Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)\n"
+"	}\n"
+"	\n"
+"	int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );\n"
+"	int numRemainingAabbs = (1 << mostSignificantBit);\n"
+"	\n"
+"	//Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2\n"
+"	//For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]\n"
+"	if(aabbIndex < numberOfAabbsAboveMsbSplit)\n"
+"	{\n"
+"		int otherAabbIndex = numRemainingAabbs + aabbIndex;\n"
+"		\n"
+"		b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
+"		b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
+"		\n"
+"		b3AabbCL mergedAabb;\n"
+"		mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
+"		mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
+"		out_mergedAabb[aabbIndex] = mergedAabb;\n"
+"	}\n"
+"	\n"
+"	barrier(CLK_GLOBAL_MEM_FENCE);\n"
+"	\n"
+"	//\n"
+"	int offset = numRemainingAabbs / 2;\n"
+"	while(offset >= 1)\n"
+"	{\n"
+"		if(aabbIndex < offset)\n"
+"		{\n"
+"			int otherAabbIndex = aabbIndex + offset;\n"
+"		\n"
+"			b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
+"			b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
+"		\n"
+"			b3AabbCL mergedAabb;\n"
+"			mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
+"			mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
+"			out_mergedAabb[aabbIndex] = mergedAabb;\n"
+"		}\n"
+"		\n"
+"		offset /= 2;\n"
+"		\n"
+"		barrier(CLK_GLOBAL_MEM_FENCE);\n"
+"	}\n"
+"}\n"
+"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
+"												__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
 "{\n"
 "	int leafNodeIndex = get_global_id(0);	//Leaf node index == AABB index\n"
 "	if(leafNodeIndex >= numAabbs) return;\n"
 "	\n"
-"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
+"	b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
+"	b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
+"	b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
 "	\n"
-"	b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;\n"
+"	b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
+"	b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
+"	b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
 "	\n"
 "	//Quantize into integer coordinates\n"
 "	//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
-"	b3Vector3 gridPosition = center / cellSize;\n"
+"	b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
 "	\n"
 "	int4 discretePosition;\n"
 "	discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
@@ -110,7 +167,7 @@ static const char* parallelLinearBvhCL= \
 "	out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
 "}\n"
 "#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
-"#define B3_PLBVH_ROOT_NODE_MARKER -1	//Used to indicate that the node has no parent \n"
+"#define B3_PLBVH_ROOT_NODE_MARKER -1	//Used to indicate that the (root) node has no parent \n"
 "#define B3_PLBVH_ROOT_NODE_INDEX 0\n"
 "//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.\n"
 "//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).\n"
@@ -264,13 +321,16 @@ static const char* parallelLinearBvhCL= \
 "											__global int* out_numPairs, __global int4* out_overlappingPairs, \n"
 "											int maxPairs, int numQueryAabbs)\n"
 "{\n"
+"#define USE_SPATIALLY_COHERENT_INDICIES		//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve\n"
+"#ifdef USE_SPATIALLY_COHERENT_INDICIES\n"
 "	int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
 "	if(queryRigidIndex >= numQueryAabbs) return;\n"
 "	\n"
 "	queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n"
-"	//int queryRigidIndex = get_global_id(0);\n"
-"	//if(queryRigidIndex >= numQueryAabbs) return;\n"
-"	\n"
+"#else\n"
+"	int queryRigidIndex = get_global_id(0);\n"
+"	if(queryRigidIndex >= numQueryAabbs) return;\n"
+"#endif\n"
 "	b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
 "	\n"
 "	int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"