From e83b5dac75c2cfe591cf87ea4aabe4cdf8ee30ed Mon Sep 17 00:00:00 2001
From: "erwin.coumans" <erwin.coumans@08e121b0-ae19-0410-a57b-3be3395fd4fd>
Date: Wed, 3 Aug 2011 20:21:23 +0000
Subject: [PATCH] Fix the DX11 cloth solver for devices with physical
 wavefronts smaller than 32; also fix an out-of-bounds check (there was no
 assert for this previously)

---
 .../DX11/HLSL/SolvePositionsSIMDBatched.hlsl  | 26 ++++++++++++++-----
 .../DX11/btSoftBodySolver_DX11SIMDAware.cpp   |  2 +-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
index a67758ff5..4834dc150 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/HLSL/SolvePositionsSIMDBatched.hlsl
@@ -1,5 +1,7 @@
 MSTRINGIFY(
 
+
+
 cbuffer SolvePositionsFromLinksKernelCB : register( b0 )
 {
 	int startWaveInBatch;
@@ -41,16 +43,20 @@ SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchT
 	const int firstWavefrontInBlock = startWaveInBatch + Gid.x * WAVEFRONT_BLOCK_MULTIPLIER;
 	const int localWavefront = wavefront - firstWavefrontInBlock;
 
+	int batchesWithinWavefront = 0;
+	int verticesUsedByWave = 0;
+	int cond = wavefront < (startWaveInBatch + numWaves);
+
 	// Mask out in case there's a stray "wavefront" at the end that's been forced in through the multiplier	
-	if( wavefront < (startWaveInBatch + numWaves) )
+	if( cond)
 	{
 
 		// Load the batch counts for the wavefronts
 
 		int2 batchesAndVerticesWithinWavefront = g_wavefrontBatchCountsVertexCounts[wavefront];
 
-		int batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
-		int verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
+		batchesWithinWavefront = batchesAndVerticesWithinWavefront.x;
+		verticesUsedByWave = batchesAndVerticesWithinWavefront.y;
 
 		// Load the vertices for the wavefronts
 		for( int vertex = laneInWavefront; vertex < verticesUsedByWave; vertex+=WAVEFRONT_SIZE )
@@ -62,10 +68,13 @@ SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchT
 			vertexInverseMassSharedData[localWavefront*MAX_NUM_VERTICES_PER_WAVE + vertex] = g_verticesInverseMass[vertexAddress];
 		}
 		
+	}
 		// Ensure compiler does not re-order memory operations
-		AllMemoryBarrier();
-
+		//AllMemoryBarrier();
+	AllMemoryBarrierWithGroupSync ();
 		
+	if( cond)
+	{
 		// Loop through the batches performing the solve on each in LDS
 		int baseDataLocationForWave = WAVEFRONT_SIZE * wavefront * MAX_BATCHES_PER_WAVE;	
 
@@ -128,6 +137,11 @@ SolvePositionsFromLinksKernel( uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchT
 		}
 	}
 		
+		
 }
 
-);
\ No newline at end of file
+
+
+
+);
+
diff --git a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
index 8229d1a29..b74c8d248 100644
--- a/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
+++ b/src/BulletMultiThreaded/GpuSoftBodySolvers/DX11/btSoftBodySolver_DX11SIMDAware.cpp
@@ -620,7 +620,7 @@ static void generateBatchesOfWavefronts( btAlignedObjectArray < btAlignedObjectA
 			mapOfVerticesInBatches.resize( batch + 1 );
 			
 			// Resize maps with total number of vertices
-			mapOfVerticesInBatches[batch].resize( numVertices, false );
+			mapOfVerticesInBatches[batch].resize( numVertices+1, false );
 
 			// Insert vertices into this batch too
 			for( int link = 0; link < wavefront.size(); ++link )