Accelerate GPU raycaster with PLBVH.

2014-02-23 20:40:58 -08:00
parent e955192971
commit e4fbd5332d
10 changed files with 732 additions and 33 deletions
--- a/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp
+++ b/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp
@@ -8,6 +8,11 @@
 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
+#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h"
+
 #include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"


@@ -20,7 +25,24 @@ struct b3GpuRaycastInternalData
 	cl_context m_context;
 	cl_device_id m_device;
 	cl_command_queue  m_q;
-	cl_kernel	m_raytraceKernel;
+	cl_kernel m_raytraceKernel;
+	cl_kernel m_raytracePairsKernel;
+	cl_kernel m_findRayRigidPairIndexRanges;
+	
+	b3GpuParallelLinearBvh* m_plbvh;
+	b3RadixSort32CL* m_radixSorter;
+	b3FillCL* m_fill;
+	
+	//1 element per ray
+	b3OpenCLArray<b3RayInfo>* m_gpuRays;
+	b3OpenCLArray<b3RayHit>* m_gpuHitResults;
+	b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
+	b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
+	
+	//1 element per (ray index, rigid index) pair
+	b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
+	b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs;
+	
 	int m_test;
 };

@@ -31,7 +53,19 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
 	m_data->m_device = device;
 	m_data->m_q = q;
 	m_data->m_raytraceKernel = 0;
+	m_data->m_raytracePairsKernel = 0;
+	m_data->m_findRayRigidPairIndexRanges = 0;

+	m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
+	m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
+	m_data->m_fill = new b3FillCL(ctx, device, q);
+	
+	m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
+	m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
+	m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
+	m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q);
+	m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q);
+	m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);

 	{
 		cl_int errNum=0;
@@ -39,6 +73,10 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
 		b3Assert(errNum==CL_SUCCESS);
 		m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog);
 		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+		m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
 		clReleaseProgram(prog);
 	}

@@ -48,6 +86,20 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
 b3GpuRaycast::~b3GpuRaycast()
 {
 	clReleaseKernel(m_data->m_raytraceKernel);
+	clReleaseKernel(m_data->m_raytracePairsKernel);
+	clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
+	//Free the BVH builder, radix sorter and fill helper that the constructor allocated with new
+	delete m_data->m_plbvh;
+	delete m_data->m_radixSorter;
+	delete m_data->m_fill;
+	//Free the reusable OpenCL arrays: the per-ray arrays, then the ray-rigid pair count and pair list
+	delete m_data->m_gpuRays;
+	delete m_data->m_gpuHitResults;
+	delete m_data->m_firstRayRigidPairIndexPerRay;
+	delete m_data->m_numRayRigidPairsPerRay;
+	delete m_data->m_gpuNumRayRigidPairs;
+	delete m_data->m_gpuRayRigidPairs;
+	//The members above are raw pointers, so m_data itself is deleted last
 	delete m_data;
 }

@@ -206,27 +258,32 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays,	b3A
 }
 ///todo: add some acceleration structure (AABBs, tree etc)
 void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults,
-		int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
+		int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, 
+		const struct b3GpuNarrowPhaseInternalData* narrowphaseData,	class b3GpuBroadphaseInterface* broadphase)
 {
-	
 	//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);

 	B3_PROFILE("castRaysGPU");
-
-	b3OpenCLArray<b3RayInfo> gpuRays(m_data->m_context,m_data->m_q);
-	b3OpenCLArray<b3RayHit> gpuHitResults(m_data->m_context,m_data->m_q);
-
+	
 	{
 		B3_PROFILE("raycast copyFromHost");
-		gpuRays.copyFromHost(rays);
-
-	
-		gpuHitResults.resize(hitResults.size());
-		gpuHitResults.copyFromHost(hitResults);
+		m_data->m_gpuRays->copyFromHost(rays);
+		m_data->m_gpuHitResults->copyFromHost(hitResults);
+		
 	}
-
-
+	
+	int numRays = hitResults.size();
+	{
+		m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
+		m_data->m_numRayRigidPairsPerRay->resize(numRays);
+		
+		m_data->m_gpuNumRayRigidPairs->resize(1);
+		m_data->m_gpuRayRigidPairs->resize(numRays * 16);
+	}
+	
 	//run kernel
+	const bool USE_BRUTE_FORCE_RAYCAST = false;
+	if(USE_BRUTE_FORCE_RAYCAST)
 	{
 		B3_PROFILE("raycast launch1D");

@@ -234,8 +291,8 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3Align
 		int numRays = rays.size();
 		launcher.setConst(numRays);

-		launcher.setBuffer(gpuRays.getBufferCL());
-		launcher.setBuffer(gpuHitResults.getBufferCL());
+		launcher.setBuffer(m_data->m_gpuRays->getBufferCL());
+		launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL());

 		launcher.setConst(numBodies);
 		launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL());
@@ -246,11 +303,90 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3Align
 		launcher.launch1D(numRays);
 		clFinish(m_data->m_q);
 	}
+	else
+	{
+		//printf("broadphase->getAllAabbsGPU().size(): %d \n", broadphase->getAllAabbsGPU().size());
+		m_data->m_plbvh->build( broadphase->getAllAabbsGPU() );
+
+		m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
+		
+		int numRayRigidPairs = -1;
+		m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
+		if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() )
+		{
+			numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
+			m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
+		}
+		
+		m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs);	//Radix sort needs b3OpenCLArray::size() to be correct
+		
+		//Sort ray-rigid pairs by ray index
+		{
+			B3_PROFILE("sort ray-rigid pairs");
+			m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) );
+		}
+		
+		//detect start,count of each ray pair
+		{
+			B3_PROFILE("detect ray-rigid pair index ranges");
+			
+			{
+				B3_PROFILE("reset ray-rigid pair index ranges");
+				
+				m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays);	//atomic_min used to find first index
+				m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
+				clFinish(m_data->m_q);
+			}
+			
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ),
+				
+				b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() )
+			};
+			
+			b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numRayRigidPairs);
+			
+			launcher.launch1D(numRayRigidPairs);
+			clFinish(m_data->m_q);
+		}
+		
+		{
+			B3_PROFILE("ray-rigid intersection");
+			
+			b3BufferInfoCL bufferInfo[] = 
+			{
+				b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
+				b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ),
+				
+				b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ),
+				b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ),
+				
+				b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() )
+			};
+			
+			b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
+			launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst(numRays);
+			
+			launcher.launch1D(numRays);
+			clFinish(m_data->m_q);
+		}
+	}
+	
+	

 	//copy results
 	{
 		B3_PROFILE("raycast copyToHost");
-		gpuHitResults.copyToHost(hitResults);
+		m_data->m_gpuHitResults->copyToHost(hitResults);
 	}

 }
--- a/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h
+++ b/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h
@@ -23,8 +23,7 @@ public:

 	void castRays(const b3AlignedObjectArray<b3RayInfo>& rays,	b3AlignedObjectArray<b3RayHit>& hitResults,
 		int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
-		const struct b3GpuNarrowPhaseInternalData* narrowphaseData
-		);
+		const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
 	

 		
--- a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl
+++ b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl
@@ -337,3 +337,103 @@ __kernel void rayCastKernel(
 	}

 }
+//For every (ray index, rigid index) pair: records the lowest pair index and the number of pairs of the pair's ray.
+//One work-item per pair. The host (b3GpuRaycast::castRays) pre-fills the first-index array with numRayRigidPairs and the counts with 0.
+__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, 	//.x of each entry is read as the ray index
+											__global int* out_firstRayRigidPairIndexPerRay,	//indexed by ray index, reduced with atomic_min
+											__global int* out_numRayRigidPairsPerRay,	//indexed by ray index, incremented once per pair
+											int numRayRigidPairs)
+{
+	int rayRigidPairIndex = get_global_id(0);
+	if (rayRigidPairIndex >= numRayRigidPairs) return;	//work-items past the end of the pair list do nothing
+	//NOTE(review): rayIndex is not range-checked before it indexes the two output arrays
+	int rayIndex = rayRigidPairs[rayRigidPairIndex].x;
+	//Atomics are used because several pairs (work-items) can carry the same ray index
+	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);
+	atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);
+}
+//Ray test restricted to candidate bodies: one work-item per ray, visiting only that ray's index range inside rayRigidPairs.
+__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, 
+								__global b3RayHit* hitResults, 
+								__global int* firstRayRigidPairIndexPerRay,
+								__global int* numRayRigidPairsPerRay,
+								//body position/orientation and shape data read by the intersection tests below
+								__global Body* bodies,
+								__global Collidable* collidables,
+								__global const b3GpuFace* faces,
+								__global const ConvexPolyhedronCL* convexShapes,
+								//pair list; .y of each entry is read as the body index
+								__global int2* rayRigidPairs,
+								int numRays)
+{
+	int i = get_global_id(0);
+	if (i >= numRays) return;
+	
+	float4 rayFrom = rays[i].m_from;
+	float4 rayTo = rays[i].m_to;
+		
+	hitResults[i].m_hitFraction = 1.f;	//reset; overwritten at the end only when some body was hit
+		
+	float hitFraction = 1.f;
+	float4 hitPoint;
+	float4 hitNormal;
+	int hitBodyIndex = -1;	//-1 means no body has been hit so far
+		
+	//Visit the numRayRigidPairsPerRay[i] pairs that start at firstRayRigidPairIndexPerRay[i]
+	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)
+	{
+		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];
+		int b = rayRigidPairs[rayRigidPairIndex].y;
+		//Skip the body whose index is stored in m_hitResult2 (an input; this kernel never writes it)
+		if (hitResults[i].m_hitResult2 == b) continue;
+		
+		Body body = bodies[b];
+		Collidable rigidCollidable = collidables[body.m_collidableIdx];
+		
+		float4 pos = body.m_pos;
+		float4 orn = body.m_quat;
+		
+		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)
+		{
+			float4 invPos = (float4)(0,0,0,0);
+			float4 invOrn = (float4)(0,0,0,0);
+			float4 rayFromLocal = (float4)(0,0,0,0);
+			float4 rayToLocal = (float4)(0,0,0,0);
+			invOrn = qtInvert(orn);	//ray end points are re-expressed relative to the body (inverse of pos/orn), assuming qtInvert/qtRotate do what their names say
+			invPos = qtRotate(invOrn, -pos);
+			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
+			rayToLocal = qtRotate( invOrn, rayTo) + invPos;
+			rayFromLocal.w = 0.f;
+			rayToLocal.w = 0.f;
+			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;
+			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;
+			//NOTE(review): hitFraction is passed by pointer; whether rayConvex only accepts hits closer than its current value is not visible here - confirm
+			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))	//shapes with zero faces are never tested
+			{
+				hitBodyIndex = b;
+				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);	//computed from the untransformed ray end points
+			}
+		}
+		
+		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)
+		{
+			float radius = rigidCollidable.m_radius;
+			//The sphere test uses the body position and the untransformed ray end points
+			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
+			{
+				hitBodyIndex = b;
+				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
+				hitNormal = (float4) (hitPoint - bodies[b].m_pos);	//left unnormalized here; normalize() is applied when the result is written
+			}
+		}
+	}
+	//Publish the result only if some body was hit; m_hitResult0 receives that body's index
+	if (hitBodyIndex >= 0)
+	{
+		hitResults[i].m_hitFraction = hitFraction;
+		hitResults[i].m_hitPoint = hitPoint;
+		hitResults[i].m_hitNormal = normalize(hitNormal);
+		hitResults[i].m_hitResult0 = hitBodyIndex;
+	}
+	
+}
--- a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h
+++ b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h
@@ -281,4 +281,101 @@ static const char* rayCastKernelCL= \
 "		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
 "	}\n"
 "}\n"
+"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
+"											__global int* out_firstRayRigidPairIndexPerRay,\n"
+"											__global int* out_numRayRigidPairsPerRay,\n"
+"											int numRayRigidPairs)\n"
+"{\n"
+"	int rayRigidPairIndex = get_global_id(0);\n"
+"	if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
+"	\n"
+"	int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
+"	\n"
+"	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
+"	atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
+"}\n"
+"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
+"								__global b3RayHit* hitResults, \n"
+"								__global int* firstRayRigidPairIndexPerRay,\n"
+"								__global int* numRayRigidPairsPerRay,\n"
+"									\n"
+"								__global Body* bodies,\n"
+"								__global Collidable* collidables,\n"
+"								__global const b3GpuFace* faces,\n"
+"								__global const ConvexPolyhedronCL* convexShapes,\n"
+"								\n"
+"								__global int2* rayRigidPairs,\n"
+"								int numRays)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i >= numRays) return;\n"
+"	\n"
+"	float4 rayFrom = rays[i].m_from;\n"
+"	float4 rayTo = rays[i].m_to;\n"
+"		\n"
+"	hitResults[i].m_hitFraction = 1.f;\n"
+"		\n"
+"	float hitFraction = 1.f;\n"
+"	float4 hitPoint;\n"
+"	float4 hitNormal;\n"
+"	int hitBodyIndex = -1;\n"
+"		\n"
+"	//\n"
+"	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
+"	{\n"
+"		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
+"		int b = rayRigidPairs[rayRigidPairIndex].y;\n"
+"		\n"
+"		if (hitResults[i].m_hitResult2 == b) continue;\n"
+"		\n"
+"		Body body = bodies[b];\n"
+"		Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
+"		\n"
+"		float4 pos = body.m_pos;\n"
+"		float4 orn = body.m_quat;\n"
+"		\n"
+"		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
+"		{\n"
+"			float4 invPos = (float4)(0,0,0,0);\n"
+"			float4 invOrn = (float4)(0,0,0,0);\n"
+"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
+"			float4 rayToLocal = (float4)(0,0,0,0);\n"
+"			invOrn = qtInvert(orn);\n"
+"			invPos = qtRotate(invOrn, -pos);\n"
+"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
+"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
+"			rayFromLocal.w = 0.f;\n"
+"			rayToLocal.w = 0.f;\n"
+"			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
+"			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
+"			\n"
+"			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
+"			{\n"
+"				hitBodyIndex = b;\n"
+"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			float radius = rigidCollidable.m_radius;\n"
+"		\n"
+"			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
+"			{\n"
+"				hitBodyIndex = b;\n"
+"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
+"				hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if (hitBodyIndex >= 0)\n"
+"	{\n"
+"		hitResults[i].m_hitFraction = hitFraction;\n"
+"		hitResults[i].m_hitPoint = hitPoint;\n"
+"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
+"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
+"	}\n"
+"	\n"
+"}\n"
 ;