Added an initial Smoothed Particle Hydrodynamics (SPH) implementation for CPU and CUDA.
This software is contributed under the ZLib license by Rama Hoetzlein, http://www.rchoetzlein.com. We plan to integrate SPH into the core Bullet library, including interaction with rigid bodies and soft bodies.
Extras/sph/fluids/radixsort_kernel.cu (new file, 577 lines)
@@ -0,0 +1,577 @@
/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */
/* Radixsort project with key/value and arbitrary dataset size support,
 * which demonstrates the use of CUDA in a multi-phase sorting
 * computation.
 * Device code.
 */
#ifndef _RADIXSORT_KERNEL_H_
#define _RADIXSORT_KERNEL_H_

#include <stdio.h>
#include "radixsort.cuh"

#define SYNCIT __syncthreads()

static const int NUM_SMS = 16;
static const int NUM_THREADS_PER_SM = 192;
static const int NUM_THREADS_PER_BLOCK = 64;
//static const int NUM_THREADS = NUM_THREADS_PER_SM * NUM_SMS;
static const int NUM_BLOCKS = (NUM_THREADS_PER_SM / NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int RADIX = 8;                                                  // Number of bits per radix sort pass
static const int RADICES = 1 << RADIX;                                       // Number of radices
static const int RADIXMASK = RADICES - 1;                                    // Mask for each radix sort pass
#if SIXTEEN
static const int RADIXBITS = 16;                                             // Number of bits to sort over
#else
static const int RADIXBITS = 32;                                             // Number of bits to sort over
#endif
static const int RADIXTHREADS = 16;                                          // Number of threads sharing each radix counter
static const int RADIXGROUPS = NUM_THREADS_PER_BLOCK / RADIXTHREADS;         // Number of radix groups per CTA
static const int TOTALRADIXGROUPS = NUM_BLOCKS * RADIXGROUPS;                // Number of radix groups for each radix
static const int SORTRADIXGROUPS = TOTALRADIXGROUPS * RADICES;               // Total radix count
static const int GRFELEMENTS = (NUM_THREADS_PER_BLOCK / RADIXTHREADS) * RADICES;
static const int GRFSIZE = GRFELEMENTS * sizeof(uint);

// Prefix sum variables
static const int PREFIX_NUM_THREADS_PER_SM = NUM_THREADS_PER_SM;
static const int PREFIX_NUM_THREADS_PER_BLOCK = PREFIX_NUM_THREADS_PER_SM;
static const int PREFIX_NUM_BLOCKS = (PREFIX_NUM_THREADS_PER_SM / PREFIX_NUM_THREADS_PER_BLOCK) * NUM_SMS;
static const int PREFIX_BLOCKSIZE = SORTRADIXGROUPS / PREFIX_NUM_BLOCKS;
static const int PREFIX_GRFELEMENTS = PREFIX_BLOCKSIZE + 2 * PREFIX_NUM_THREADS_PER_BLOCK;
static const int PREFIX_GRFSIZE = PREFIX_GRFELEMENTS * sizeof(uint);

// Shuffle variables
static const int SHUFFLE_GRFOFFSET = RADIXGROUPS * RADICES;
static const int SHUFFLE_GRFELEMENTS = SHUFFLE_GRFOFFSET + PREFIX_NUM_BLOCKS;
static const int SHUFFLE_GRFSIZE = SHUFFLE_GRFELEMENTS * sizeof(uint);
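
/*
 * Worked example of the configuration above (illustrative, assuming the default
 * values in this file): NUM_BLOCKS = (192 / 64) * 16 = 48 CTAs, RADIXGROUPS =
 * 64 / 16 = 4 radix groups per CTA, so TOTALRADIXGROUPS = 48 * 4 = 192 groups and
 * SORTRADIXGROUPS = 192 * 256 = 49152 radix counters overall. Each CTA keeps
 * GRFELEMENTS = 4 * 256 = 1024 counters (GRFSIZE = 4 KB) in shared memory. The
 * prefix sum stage uses 16 blocks of 192 threads, each block owning
 * PREFIX_BLOCKSIZE = 49152 / 16 = 3072 counters. The element count handed to the
 * kernels is rounded up to a multiple of TOTALRADIXGROUPS * RADIXTHREADS =
 * 192 * 16 = 3072, which is where the "3072" in the parameter names below comes from.
 */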

#define SDATA( index) CUT_BANK_CHECKER(sdata, index)

// Prefix sum data
uint gRadixSum[TOTALRADIXGROUPS * RADICES];
__device__ uint dRadixSum[TOTALRADIXGROUPS * RADICES];
uint gRadixBlockSum[PREFIX_NUM_BLOCKS];
__device__ uint dRadixBlockSum[PREFIX_NUM_BLOCKS];

extern __shared__ uint sRadixSum[];

////////////////////////////////////////////////////////////////////////////////
//! Perform a radix sum on the list to be sorted. Each SM holds a set of
//! radix counters for each group of RADIXGROUPS threads in the GRF.
//!
//! @param pData input data
//! @param elements total number of elements
//! @param elements_rounded_to_3072 total number of elements rounded up to the
//!        nearest multiple of 3072
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//!        byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixSum(KeyValuePair *pData, uint elements, uint elements_rounded_to_3072, uint shift)
{
    uint pos = threadIdx.x;

    // Zero radix counts
    while (pos < GRFELEMENTS)
    {
        sRadixSum[pos] = 0;
        pos += NUM_THREADS_PER_BLOCK;
    }

    // Sum up data
    // Source addresses computed so that each thread is reading from a block of
    // consecutive addresses so there are no conflicts between threads
    // They then loop over their combined region and the next batch works elsewhere.
    // So threads 0 to 15 work on memory 0 to 320.
    // First reading 0,1,2,3...15 then 16,17,18,19...31 and so on
    // optimising parallel access to shared memory by a thread accessing 16*threadID
    // The next radix group runs from 320 to 640 and the same applies in that region
    uint tmod = threadIdx.x % RADIXTHREADS;
    uint tpos = threadIdx.x / RADIXTHREADS;

    // Take the rounded element list size so that all threads have a certain size dataset to work with
    // and no zero size datasets confusing the issue
    // By using a multiple of 3072 we ensure that all threads have elements
    // to work with until the last phase, at which point we individually test
    uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;

    // Generate range
    // Note that it is possible for both pos and end to be past the end of the element set
    // which will be caught later.
    pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
    uint end = pos + element_fraction;
    pos += tmod;
    //printf("pos: %d\n", pos);
    __syncthreads();

    while (pos < end)
    {
        uint key = 0;

        // Read first data element if we are in the set of elements
        //if( pos < elements )
        //    key = pData[pos].key;
        KeyValuePair kvp;
        // Read first data element, both items at once as the memory will want to coalesce like that anyway
        if (pos < elements)
            kvp = pData[pos];
        else
            kvp.key = 0;
        key = kvp.key;

        // Calculate position of radix counter to increment
        // There are RADICES radices in each pass (256)
        // and hence this many counters for bin grouping
        // Multiply by RADIXGROUPS (4) to spread through memory
        // and into 4 radix groups
        uint p = ((key >> shift) & RADIXMASK) * RADIXGROUPS;

        // Increment radix counters
        // Each radix group has its own set of counters
        // so we add the thread position [0-3], ie the group index.
        // We slow down here and take at least 16 cycles to write to the summation boxes
        // but other groups will only conflict with themselves and so can also be writing
        // 16 cycles here at least avoids retries.
        uint ppos = p + tpos;

        // If we are past the last element we don't want to do anything
        // We do have to check each time, however, to ensure that all
        // threads sync on each sync here.
        if (tmod == 0 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 1 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 2 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 3 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 4 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 5 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 6 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 7 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 8 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 9 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 10 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 11 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 12 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 13 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 14 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;
        if (tmod == 15 && pos < elements)
            sRadixSum[ppos]++;
        SYNCIT;

        pos += RADIXTHREADS;
    }

    __syncthreads();

    __syncthreads();

    // Output radix sums into separate memory regions for each radix group
    // So this memory then is laid out:
    // 0...... 192..... 384 ................ 192*256
    // ie all 256 bins for each radix group
    // in there:
    // 0.............192
    // 0 4 8 12... - block idx * 4
    // And in the block boxes we see the 4 radix groups for that block
    // So 0-192 should contain bin 0 for each radix group, and so on
    uint offset = blockIdx.x * RADIXGROUPS;
    uint row = threadIdx.x / RADIXGROUPS;
    uint column = threadIdx.x % RADIXGROUPS;
    while (row < RADICES)
    {
        dRadixSum[offset + row * TOTALRADIXGROUPS + column] = sRadixSum[row * RADIXGROUPS + column];
        row += NUM_THREADS_PER_BLOCK / RADIXGROUPS;
    }
}
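
/*
 * Illustrative host-side reference (a sketch added for exposition, not part of the
 * original NVIDIA kernels): what one RadixSum pass computes. Radix group g owns the
 * slice [g * fraction, (g + 1) * fraction) of the rounded input, and dRadixSum ends
 * up radix-major, i.e. entry r * TOTALRADIXGROUPS + g holds the number of keys in
 * group g whose byte (key >> shift) & RADIXMASK equals r.
 */
static void RadixSumReference(const KeyValuePair *pData, uint elements,
                              uint elements_rounded_to_3072, uint shift,
                              uint *radixSum /* TOTALRADIXGROUPS * RADICES entries */)
{
    uint fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
    for (int i = 0; i < TOTALRADIXGROUPS * RADICES; i++)
        radixSum[i] = 0;
    for (int g = 0; g < TOTALRADIXGROUPS; g++)
    {
        // Count the byte values of this group's slice, skipping padding past 'elements'.
        for (uint pos = g * fraction; pos < (g + 1) * fraction && pos < elements; pos++)
        {
            uint r = (pData[pos].key >> shift) & RADIXMASK;
            radixSum[r * TOTALRADIXGROUPS + g]++;
        }
    }
}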

////////////////////////////////////////////////////////////////////////////////
//! Performs first part of parallel prefix sum - individual sums of each radix
//! count. By the end of this we have prefix sums on a block level in dRadixSum
//! and totals for blocks in dRadixBlockSum.
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixPrefixSum()
{
    // Read radix groups in offset by one in the GRF so a zero can be inserted at the beginning
    // and the final sum of all radix counts summed here is tacked onto the end for reading by
    // the next stage
    // Each block in this case is the full number of threads per SM (and hence the total number
    // of radix groups), 192. We should then have the total set of offsets for an entire radix
    // group by the end of this stage
    // Device mem addressing

    uint brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
    uint drow = threadIdx.x / TOTALRADIXGROUPS;    // In default parameterisation this is always 0
    uint dcolumn = threadIdx.x % TOTALRADIXGROUPS; // And similarly this is always the same as threadIdx.x
    uint dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn;
    uint end = ((blockIdx.x + 1) * (RADICES / PREFIX_NUM_BLOCKS)) * TOTALRADIXGROUPS;
    // Shared mem addressing
    uint srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    uint scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    uint spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;

    // Read (RADICES / PREFIX_NUM_BLOCKS) radix counts into the GRF alongside each other
    while (dpos < end)
    {
        sRadixSum[spos] = dRadixSum[dpos];
        spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
                (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
        dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
    }
    __syncthreads();

    // Perform preliminary sum on each thread's stretch of data
    // Each thread having a block of 16, with spacers between 0...16 18...33 and so on
    int pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
    end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    uint sum = 0;
    while (pos < end)
    {
        sum += sRadixSum[pos];
        sRadixSum[pos] = sum;
        pos++;
    }
    __syncthreads();

    // Calculate internal offsets by performing a more traditional parallel
    // prefix sum of the topmost member of each thread's work data. Right now,
    // these are stored between the work data for each thread, allowing us to
    // eliminate GRF conflicts as well as hold the offsets needed to complete the sum
    // In other words we have:
    // 0....15 16 17....32 33 34....
    // Where this first stage updates the intermediate values (so 16=15, 33=32 etc)
    int m = (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
    pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) +
          (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    sRadixSum[pos] = sRadixSum[pos - 1];
    __syncthreads();
    // This stage then performs a parallel prefix sum (ie use powers of 2 to propagate in log n stages)
    // to update 17, 34 etc with the totals to that point (so 34 becomes [34] + [17]) and so on.
    while (m < PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1))
    {
        int p = pos - m;
        uint t = ((p > 0) ? sRadixSum[p] : 0);
        __syncthreads();
        sRadixSum[pos] += t;
        __syncthreads();
        m *= 2;
    }
    __syncthreads();

    // Add internal offsets to each thread's work data.
    // So now we take 17 and add it to all values 18 to 33 so all offsets for that block
    // are updated.
    pos = threadIdx.x * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
    end = pos + (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    int p = pos - 1;
    sum = ((p > 0) ? sRadixSum[p] : 0);
    while (pos < end)
    {
        sRadixSum[pos] += sum;
        pos++;
    }
    __syncthreads();

    // Write summed data back out to global memory in the same way as we read it in
    // We now have prefix sum values internal to groups
    brow = blockIdx.x * (RADICES / PREFIX_NUM_BLOCKS);
    drow = threadIdx.x / TOTALRADIXGROUPS;
    dcolumn = threadIdx.x % TOTALRADIXGROUPS;
    srow = threadIdx.x / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    scolumn = threadIdx.x % (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK);
    dpos = (brow + drow) * TOTALRADIXGROUPS + dcolumn + 1;
    spos = srow * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) + scolumn;
    end = ((blockIdx.x + 1) * RADICES / PREFIX_NUM_BLOCKS) * TOTALRADIXGROUPS;
    while (dpos < end)
    {
        dRadixSum[dpos] = sRadixSum[spos];
        dpos += (TOTALRADIXGROUPS / PREFIX_NUM_THREADS_PER_BLOCK) * TOTALRADIXGROUPS;
        spos += (PREFIX_NUM_THREADS_PER_BLOCK / (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK)) *
                (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1);
    }

    // Write last element to summation
    // Storing block sums in a separate array
    if (threadIdx.x == 0) {
        dRadixBlockSum[blockIdx.x] = sRadixSum[PREFIX_NUM_THREADS_PER_BLOCK * (PREFIX_BLOCKSIZE / PREFIX_NUM_THREADS_PER_BLOCK + 1) - 1];
        dRadixSum[blockIdx.x * PREFIX_BLOCKSIZE] = 0;
    }
}
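
/*
 * Worked example of the scan above with toy sizes (illustrative only; the real
 * kernel uses 192 threads with 16 counts each): suppose 2 threads own 4 counts
 * apiece, stored with one spacer slot after each group:
 *   [1 2 0 3 | _]  [2 1 1 0 | _]
 * The serial per-thread scan gives [1 3 3 6 | _] [2 3 4 4 | _]; copying each
 * group's last value into its spacer and running the log-step scan over the
 * spacers turns them into running totals 6 and 10. Adding the spacer that
 * precedes each group to its members (0 for the first group) yields
 *   [1 3 3 6] [8 9 10 10],
 * the inclusive prefix sum of the whole block. Writing it back shifted by one
 * slot and zeroing the block's first entry makes it exclusive, and the block
 * total (10) is what lands in dRadixBlockSum.
 */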

////////////////////////////////////////////////////////////////////////////////
//! Initially perform prefix sum of block totals to obtain final set of offsets.
//! Then make use of radix sums to perform a shuffling of the data into the
//! correct bins.
//!
//! @param pSrc input data
//! @param pDst output data
//! @param elements total number of elements
//! @param shift the shift (0 to 24) that we are using to obtain the correct
//!        byte
////////////////////////////////////////////////////////////////////////////////
__global__ void RadixAddOffsetsAndShuffle(KeyValuePair* pSrc, KeyValuePair* pDst, uint elements, uint elements_rounded_to_3072, int shift)
{
    // Read offsets from previous blocks
    if (threadIdx.x == 0)
        sRadixSum[SHUFFLE_GRFOFFSET] = 0;

    if (threadIdx.x < PREFIX_NUM_BLOCKS - 1)
        sRadixSum[SHUFFLE_GRFOFFSET + threadIdx.x + 1] = dRadixBlockSum[threadIdx.x];
    __syncthreads();

    // Parallel prefix sum over block sums
    int pos = threadIdx.x;
    int n = 1;
    while (n < PREFIX_NUM_BLOCKS)
    {
        int ppos = pos - n;
        uint t0 = ((pos < PREFIX_NUM_BLOCKS) && (ppos >= 0)) ? sRadixSum[SHUFFLE_GRFOFFSET + ppos] : 0;
        __syncthreads();
        if (pos < PREFIX_NUM_BLOCKS)
            sRadixSum[SHUFFLE_GRFOFFSET + pos] += t0;
        __syncthreads();
        n *= 2;
    }

    // Read radix count data and add appropriate block offset
    // for each radix at the memory location for this thread
    // (where the other threads in the block will be reading
    // as well, hence the large stride).
    // There is one counter box per radix group per radix
    // per block (4*256*3)
    // We use 64 threads to read the 4 radix groups set of radices
    // for the block.
    int row = threadIdx.x / RADIXGROUPS;
    int column = threadIdx.x % RADIXGROUPS;
    int spos = row * RADIXGROUPS + column;
    int dpos = row * TOTALRADIXGROUPS + column + blockIdx.x * RADIXGROUPS;
    while (spos < SHUFFLE_GRFOFFSET)
    {
        sRadixSum[spos] = dRadixSum[dpos] + sRadixSum[SHUFFLE_GRFOFFSET + dpos / (TOTALRADIXGROUPS * RADICES / PREFIX_NUM_BLOCKS)];
        spos += NUM_THREADS_PER_BLOCK;
        dpos += (NUM_THREADS_PER_BLOCK / RADIXGROUPS) * TOTALRADIXGROUPS;
    }
    __syncthreads();

    //int pos;
    // Shuffle data
    // Each of the subbins for a block should be filled via the counters, properly interleaved
    // Then, as we now iterate over each data value, we increment the subbins (each thread in the
    // radix group in turn to avoid missed writes due to conflicts) and set locations correctly.
    uint element_fraction = elements_rounded_to_3072 / TOTALRADIXGROUPS;
    int tmod = threadIdx.x % RADIXTHREADS;
    int tpos = threadIdx.x / RADIXTHREADS;

    pos = (blockIdx.x * RADIXGROUPS + tpos) * element_fraction;
    uint end = pos + element_fraction; //(blockIdx.x * RADIXGROUPS + tpos + 1) * element_fraction;
    pos += tmod;

    __syncthreads();

    while (pos < end)
    {
        KeyValuePair kvp;
#if 1 // old load
        // Read first data element, both items at once as the memory will want to coalesce like that anyway
        if (pos < elements)
        {
            kvp = pSrc[pos];
        }
        else
            kvp.key = 0;

#else // casting to float2 to get it to combine loads
        int2 kvpf2;

        // Read first data element, both items at once as the memory will want to coalesce like that anyway
        if (pos < elements)
        {
            // kvp = pSrc[pos];
            kvpf2 = ((int2*)pSrc)[pos];
            // printf("kvp: %f %f kvpf2: %f %f\n", kvp.key, kvp.value, kvpf2.x, kvpf2.y);
        }
        else
            //kvp.key = 0;
            kvpf2.x = 0;

        kvp.key = kvpf2.x;
        kvp.value = kvpf2.y;
#endif

        uint index;

        // Calculate position of radix counter to increment
        uint p = ((kvp.key >> shift) & RADIXMASK) * RADIXGROUPS;

        // Move data, keeping counts updated.
        // Increment radix counters, relying on hexadecathread
        // warp to prevent this code from stepping all over itself.
        uint ppos = p + tpos;
        if (tmod == 0 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 1 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 2 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 3 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 4 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 5 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 6 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 7 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 8 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 9 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 10 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 11 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 12 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 13 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 14 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;
        if (tmod == 15 && pos < elements)
        {
            index = sRadixSum[ppos]++;
            pDst[index] = kvp;
        }
        SYNCIT;

        pos += RADIXTHREADS;
    }

    __syncthreads();
}
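
/*
 * Illustrative host-side driver (a sketch, not the host wrapper that accompanies
 * radixsort.cuh, which may differ): how the three kernels above chain into a full
 * key/value sort, one RADIX-bit pass per iteration, ping-ponging between two
 * buffers. Launches in the same stream are ordered, so no explicit synchronization
 * is needed between the kernels.
 */
static void RadixSortSketch(KeyValuePair *pData0, KeyValuePair *pData1, uint elements)
{
    // Round up so every radix group owns a non-empty, equally sized slice.
    uint elements_rounded_to_3072 = ((elements + 3071) / 3072) * 3072;

    for (int shift = 0; shift < RADIXBITS; shift += RADIX)
    {
        // 1. Per-radix-group histograms of the current byte.
        RadixSum<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, GRFSIZE>>>(
            pData0, elements, elements_rounded_to_3072, shift);
        // 2. Prefix sums of the counters within each prefix block.
        RadixPrefixSum<<<PREFIX_NUM_BLOCKS, PREFIX_NUM_THREADS_PER_BLOCK, PREFIX_GRFSIZE>>>();
        // 3. Prefix sum across blocks, then scatter into the destination buffer.
        RadixAddOffsetsAndShuffle<<<NUM_BLOCKS, NUM_THREADS_PER_BLOCK, SHUFFLE_GRFSIZE>>>(
            pData0, pData1, elements, elements_rounded_to_3072, shift);

        // Ping-pong source and destination for the next pass; with an even number
        // of passes (RADIXBITS / RADIX) the sorted data ends up back in pData0.
        KeyValuePair *tmp = pData0;
        pData0 = pData1;
        pData1 = tmp;
    }
}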

#endif // #ifndef _RADIXSORT_KERNEL_H_