801 lines
22 KiB
C++
801 lines
22 KiB
C++
/*
|
|
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
|
|
|
This software is provided 'as-is', without any express or implied warranty.
|
|
In no event will the authors be held liable for any damages arising from the use of this software.
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
including commercial applications, and to alter it and redistribute it freely,
|
|
subject to the following restrictions:
|
|
|
|
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
|
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
*/
|
|
//Originally written by Takahiro Harada
|
|
|
|
|
|
#include <AdlPrimitives/Scan/PrefixScan.h>
|
|
#include <AdlPrimitives/Sort/RadixSort.h>
|
|
#include <AdlPrimitives/Sort/RadixSort32.h>
|
|
#include <AdlPrimitives/Search/BoundSearch.h>
|
|
#include <AdlPrimitives/Fill/Fill.h>
|
|
#include <AdlPrimitives/Copy/Copy.h>
|
|
|
|
#include <time.h>
|
|
|
|
using namespace adl;
|
|
|
|
#define NUM_TESTS 10
|
|
|
|
int g_nPassed = 0;
|
|
int g_nFailed = 0;
|
|
bool g_testFailed = 0;
|
|
|
|
//#define TEST_INIT bool g_testFailed = 0;
|
|
#define TEST_INIT g_testFailed = 0;
|
|
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
|
|
//#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;ADLASSERT(x);}
|
|
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
|
|
|
|
void memCpyTest( Device* deviceData )
|
|
{
|
|
TEST_INIT;
|
|
int maxSize = 64*1024;
|
|
Buffer<u32> buff( deviceData, maxSize );
|
|
|
|
u32* hostBuff = new u32[maxSize];
|
|
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = getRandom( 1024, maxSize );
|
|
|
|
for(int i=0; i<size; i++) hostBuff[i] = i;
|
|
|
|
buff.write( hostBuff, size );
|
|
|
|
DeviceUtils::waitForCompletion( deviceData );
|
|
for(int i=0; i<size; i++) hostBuff[i] = 0;
|
|
|
|
buff.read( hostBuff, size );
|
|
|
|
DeviceUtils::waitForCompletion( deviceData );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( hostBuff[i] == i );
|
|
}
|
|
|
|
delete [] hostBuff;
|
|
TEST_REPORT( "memCpyTest" );
|
|
}
|
|
|
|
void kernelTest( Device* deviceData )
|
|
{
|
|
TEST_INIT;
|
|
|
|
KernelManager* manager = new KernelManager();
|
|
|
|
Kernel* kernel = manager->query(deviceData, ".\\Kernel", "VectorAddKernel" );
|
|
|
|
{
|
|
int size = 1024;
|
|
Buffer<int> buf0( deviceData, size );
|
|
Buffer<int> buf1( deviceData, size );
|
|
Buffer<float4> cBuf( deviceData, 1, BufferBase::BUFFER_CONST );
|
|
int* hostBuf0 = new int[size];
|
|
int* hostBuf1 = new int[size];
|
|
for(int i=0; i<size; i++) { hostBuf0[i] = i; hostBuf1[i] = 1; }
|
|
buf0.write( hostBuf0, size );
|
|
buf1.write( hostBuf1, size );
|
|
DeviceUtils::waitForCompletion( deviceData );
|
|
|
|
float4 constBuffer;
|
|
constBuffer.x = (float)size;
|
|
constBuffer.y = 2.f;
|
|
constBuffer.z = 0.f;
|
|
constBuffer.w = 0.f;
|
|
{
|
|
Launcher::BufferInfo bInfo[] = { Launcher::BufferInfo( (Buffer<float>*)&buf0 ), Launcher::BufferInfo( (Buffer<float>*)&buf1, true ) };
|
|
|
|
Launcher launcher( deviceData, kernel );
|
|
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
|
|
launcher.setConst( (Buffer<float4>&)cBuf, constBuffer );
|
|
launcher.launch1D( size );
|
|
|
|
buf0.read( hostBuf0, size );
|
|
buf1.read( hostBuf1, size );
|
|
DeviceUtils::waitForCompletion( deviceData );
|
|
}
|
|
|
|
for(int i=0; i<size; i++) { TEST_ASSERT( hostBuf0[i] == i+1+2 ); }
|
|
|
|
delete [] hostBuf0;
|
|
delete [] hostBuf1;
|
|
}
|
|
TEST_REPORT( "kernelTest" );
|
|
}
|
|
|
|
void stopwatchTest( Device* deviceData )
|
|
{
|
|
{
|
|
Stopwatch sw( deviceData );
|
|
|
|
sw.start();
|
|
Sleep(2);
|
|
sw.split();
|
|
Sleep(2);
|
|
sw.stop();
|
|
|
|
float t[2];
|
|
sw.getMs( t, 2 );
|
|
}
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void scanTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<u32> buf0( deviceHost, maxSize );
|
|
HostBuffer<u32> buf1( deviceHost, maxSize );
|
|
Buffer<u32> buf2( deviceGPU, maxSize );
|
|
Buffer<u32> buf3( deviceGPU, maxSize );
|
|
|
|
PrefixScan<type>::Data* data0 = PrefixScan<type>::allocate( deviceGPU, maxSize );
|
|
PrefixScan<TYPE_HOST>::Data* data1 = PrefixScan<TYPE_HOST>::allocate( deviceHost, maxSize );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize );
|
|
|
|
for(int i=0; i<size; i++) buf0[i] = 1;
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
u32 sumHost, sumGPU;
|
|
PrefixScan<TYPE_HOST>::execute( data1, buf0, buf1, size, &sumHost );
|
|
PrefixScan<type>::execute( data0, buf2, buf3, size, &sumGPU );
|
|
|
|
buf3.read( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
TEST_ASSERT( sumHost == sumGPU );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
|
}
|
|
|
|
PrefixScan<TYPE_HOST>::deallocate( data1 );
|
|
PrefixScan<type>::deallocate( data0 );
|
|
|
|
TEST_REPORT( "scanTest" );
|
|
}
|
|
|
|
template<DeviceType type, RadixSortBase::Option SORT_TYPE>
|
|
bool radixSortTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<SortData> buf0( deviceHost, maxSize );
|
|
HostBuffer<SortData> buf1( deviceHost, maxSize );
|
|
Buffer<SortData> buf2( deviceGPU, maxSize );
|
|
|
|
RadixSort<TYPE_HOST>::Data* dataH = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE );
|
|
RadixSort<type>::Data* dataC = RadixSort<type>::allocate( deviceGPU, maxSize, SORT_TYPE );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize-512 );
|
|
size = NEXTMULTIPLEOF( size, 512 );
|
|
|
|
for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,0xff), i );
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
RadixSort<TYPE_HOST>::execute( dataH, buf0, size );
|
|
RadixSort<type>::execute( dataC, buf2, size );
|
|
|
|
buf2.read( buf1.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( buf0[i].m_value == buf1[i].m_value && buf0[i].m_key == buf1[i].m_key );
|
|
}
|
|
|
|
RadixSort<TYPE_HOST>::deallocate( dataH );
|
|
RadixSort<type>::deallocate( dataC );
|
|
|
|
return g_testFailed;
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void radixSortSimpleTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
g_testFailed = radixSortTest<type, RadixSortBase::SORT_SIMPLE>(deviceGPU, deviceHost);
|
|
TEST_REPORT( "radixSortSimpleTest" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void radixSortStandardTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
g_testFailed = radixSortTest<type, RadixSortBase::SORT_STANDARD>(deviceGPU, deviceHost);
|
|
TEST_REPORT( "radixSortStandardTest" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void radixSortAdvancedTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
g_testFailed = radixSortTest<type, RadixSortBase::SORT_ADVANCED>(deviceGPU, deviceHost);
|
|
TEST_REPORT( "radixSortAdvancedTest" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void boundSearchTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
int bucketSize = 256;
|
|
|
|
HostBuffer<SortData> buf0( deviceHost, maxSize );
|
|
HostBuffer<u32> lowerH( deviceHost, maxSize );
|
|
HostBuffer<u32> upperH( deviceHost, maxSize );
|
|
|
|
Buffer<SortData> buf( deviceGPU, maxSize );
|
|
Buffer<u32> lower( deviceGPU, maxSize );
|
|
Buffer<u32> upper( deviceGPU, maxSize );
|
|
|
|
BoundSearch<type>::Data* dataH = BoundSearch<type>::allocate( deviceGPU );
|
|
RadixSort<TYPE_HOST>::Data* dataHSort = RadixSort<TYPE_HOST>::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize );
|
|
for(int i=0; i<size; i++) buf0[i] = SortData( getRandom(0,bucketSize), i );
|
|
RadixSort<TYPE_HOST>::execute( dataHSort, buf0, size );
|
|
buf.write( buf0.m_ptr, size );
|
|
{
|
|
u32* host = new u32[size];
|
|
for(int i=0; i<size; i++) host[i] = -1;
|
|
lower.write( host, size );
|
|
upper.write( host, size );
|
|
}
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
BoundSearch<type>::execute( dataH, buf, size, lower, bucketSize, BoundSearchBase::BOUND_LOWER );
|
|
BoundSearch<type>::execute( dataH, buf, size, upper, bucketSize, BoundSearchBase::BOUND_UPPER );
|
|
|
|
lower.read( lowerH.m_ptr, bucketSize );
|
|
upper.read( upperH.m_ptr, bucketSize );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
/*
|
|
for(u32 i=1; i<(u32)bucketSize; i++)
|
|
{
|
|
for(u32 j=lowerH[i-1]; j<lowerH[i]; j++)
|
|
{
|
|
TEST_ASSERT( buf0[j].m_key < i );
|
|
}
|
|
}
|
|
|
|
for(u32 i=0; i<(u32)bucketSize; i++)
|
|
{
|
|
int jMin = (i==0)?0:upperH[i-1];
|
|
for(u32 j=jMin; j<upperH[i]; j++)
|
|
{
|
|
TEST_ASSERT( buf0[j].m_key <= i );
|
|
}
|
|
}
|
|
*/
|
|
for(u32 i=0; i<(u32)bucketSize; i++)
|
|
{
|
|
for(u32 j=lowerH[i]; j<upperH[i]; j++)
|
|
{
|
|
if ( buf0[j].m_key != i )
|
|
{
|
|
printf("error %d != %d\n",buf0[j].m_key,i);
|
|
}
|
|
TEST_ASSERT( buf0[j].m_key == i );
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
BoundSearch<type>::deallocate( dataH );
|
|
RadixSort<TYPE_HOST>::deallocate( dataHSort );
|
|
|
|
TEST_REPORT( "boundSearchTest" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void fillIntTest( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<int> buf0( deviceHost, maxSize );
|
|
HostBuffer<int> buf1( deviceHost, maxSize );
|
|
Buffer<int> buf2( deviceGPU, maxSize );
|
|
|
|
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
|
|
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize );
|
|
for(int i=0; i<size; i++) buf0[i] = -1;
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
Fill<TYPE_HOST>::execute( data0, buf0, 12, size );
|
|
Fill<type>::execute( data1, buf2, 12, size );
|
|
|
|
buf2.read( buf1.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
|
}
|
|
|
|
Fill<TYPE_HOST>::deallocate( data0 );
|
|
Fill<type>::deallocate( data1 );
|
|
|
|
TEST_REPORT( "fillIntTest" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void fillInt2Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<int2> buf0( deviceHost, maxSize );
|
|
HostBuffer<int2> buf1( deviceHost, maxSize );
|
|
Buffer<int2> buf2( deviceGPU, maxSize );
|
|
|
|
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
|
|
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize );
|
|
for(int i=0; i<size; i++) buf0[i] = make_int2( -1, -1 );
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
Fill<TYPE_HOST>::execute( data0, buf0, make_int2( 12, 12 ), size );
|
|
Fill<type>::execute( data1, buf2, make_int2( 12, 12 ), size );
|
|
|
|
buf2.read( buf1.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
|
}
|
|
|
|
Fill<TYPE_HOST>::deallocate( data0 );
|
|
Fill<type>::deallocate( data1 );
|
|
|
|
TEST_REPORT( "fillInt2Test" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void fillInt4Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<int4> buf0( deviceHost, maxSize );
|
|
HostBuffer<int4> buf1( deviceHost, maxSize );
|
|
Buffer<int4> buf2( deviceGPU, maxSize );
|
|
|
|
Fill<TYPE_HOST>::Data* data0 = Fill<TYPE_HOST>::allocate( deviceHost );
|
|
Fill<type>::Data* data1 = Fill<type>::allocate( deviceGPU );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize );
|
|
for(int i=0; i<size; i++) buf0[i] = make_int4( -1 );
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
Fill<TYPE_HOST>::execute( data0, buf0, make_int4( 12 ), size );
|
|
Fill<type>::execute( data1, buf2, make_int4( 12 ), size );
|
|
|
|
buf2.read( buf1.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( buf1[i] == buf0[i] );
|
|
}
|
|
|
|
Fill<TYPE_HOST>::deallocate( data0 );
|
|
Fill<type>::deallocate( data1 );
|
|
|
|
TEST_REPORT( "fillInt4Test" );
|
|
}
|
|
|
|
|
|
template<DeviceType type, CopyBase::Option OPTION>
|
|
bool CopyF4Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<float4> buf0( deviceHost, maxSize );
|
|
HostBuffer<float4> buf1( deviceHost, maxSize );
|
|
Buffer<float4> buf2( deviceGPU, maxSize );
|
|
Buffer<float4> buf3( deviceGPU, maxSize );
|
|
HostBuffer<float4> devResult( deviceHost, maxSize );
|
|
|
|
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
|
|
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize-4 );
|
|
size = NEXTMULTIPLEOF( size, 4 );
|
|
float r = 10000.f;
|
|
for(int i=0; i<size; i++) buf0[i] = make_float4( getRandom( -r, r ), getRandom( -r, r ), getRandom( -r, r ), getRandom( -r, r ) );
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size, OPTION );
|
|
Copy<type>::execute( data1, buf3, buf2, size, OPTION );
|
|
|
|
buf3.read( devResult.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++)
|
|
{
|
|
TEST_ASSERT( buf1[i] == devResult[i] );
|
|
TEST_ASSERT( buf0[i] == devResult[i] );
|
|
}
|
|
}
|
|
|
|
Copy<TYPE_HOST>::deallocate( data0 );
|
|
Copy<type>::deallocate( data1 );
|
|
|
|
return g_testFailed;
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void Copy1F4Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_1>( deviceGPU, deviceHost );
|
|
TEST_REPORT( "Copy1F4Test" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void Copy2F4Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_2>( deviceGPU, deviceHost );
|
|
TEST_REPORT( "Copy2F4Test" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void Copy4F4Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
g_testFailed = CopyF4Test<type, CopyBase::PER_WI_4>( deviceGPU, deviceHost );
|
|
TEST_REPORT( "Copy4F4Test" );
|
|
}
|
|
|
|
|
|
template<DeviceType type>
|
|
void CopyF1Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<float> buf0( deviceHost, maxSize );
|
|
HostBuffer<float> buf1( deviceHost, maxSize );
|
|
Buffer<float> buf2( deviceGPU, maxSize );
|
|
Buffer<float> buf3( deviceGPU, maxSize );
|
|
HostBuffer<float> devResult( deviceHost, maxSize );
|
|
|
|
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
|
|
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize-4 );
|
|
size = NEXTMULTIPLEOF( size, 4 );
|
|
float r = 10000.f;
|
|
for(int i=0; i<size; i++) buf0[i] = getRandom( -r, r );
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size );
|
|
Copy<type>::execute( data1, buf3, buf2, size );
|
|
|
|
buf3.read( devResult.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++)
|
|
{
|
|
TEST_ASSERT( buf1[i] == devResult[i] );
|
|
TEST_ASSERT( buf0[i] == devResult[i] );
|
|
}
|
|
}
|
|
|
|
Copy<TYPE_HOST>::deallocate( data0 );
|
|
Copy<type>::deallocate( data1 );
|
|
|
|
TEST_REPORT( "CopyF1Test" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void CopyF2Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<float2> buf0( deviceHost, maxSize );
|
|
HostBuffer<float2> buf1( deviceHost, maxSize );
|
|
Buffer<float2> buf2( deviceGPU, maxSize );
|
|
Buffer<float2> buf3( deviceGPU, maxSize );
|
|
HostBuffer<float2> devResult( deviceHost, maxSize );
|
|
|
|
Copy<TYPE_HOST>::Data* data0 = Copy<TYPE_HOST>::allocate( deviceHost );
|
|
Copy<type>::Data* data1 = Copy<type>::allocate( deviceGPU );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize-4 );
|
|
size = NEXTMULTIPLEOF( size, 4 );
|
|
float r = 10000.f;
|
|
for(int i=0; i<size; i++) buf0[i] = make_float2( getRandom( -r, r ), getRandom( -r, r ) );
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
Copy<TYPE_HOST>::execute( data0, buf1, buf0, size );
|
|
Copy<type>::execute( data1, buf3, buf2, size );
|
|
|
|
buf3.read( devResult.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
for(int i=0; i<size; i++)
|
|
{
|
|
TEST_ASSERT( buf1[i] == devResult[i] );
|
|
TEST_ASSERT( buf0[i] == devResult[i] );
|
|
}
|
|
}
|
|
|
|
Copy<TYPE_HOST>::deallocate( data0 );
|
|
Copy<type>::deallocate( data1 );
|
|
|
|
TEST_REPORT( "CopyF2Test" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void radixSort32Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
HostBuffer<u32> buf0( deviceHost, maxSize );
|
|
HostBuffer<u32> buf1( deviceHost, maxSize );
|
|
Buffer<u32> buf2( deviceGPU, maxSize );
|
|
|
|
RadixSort32<TYPE_HOST>::Data* dataH = RadixSort32<TYPE_HOST>::allocate( deviceHost, maxSize );
|
|
RadixSort32<type>::Data* dataC = RadixSort32<type>::allocate( deviceGPU, maxSize );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize-512 );
|
|
size = NEXTMULTIPLEOF( size, 512 );
|
|
|
|
for(int i=0; i<size; i++) buf0[i] = getRandom(0u,0xffffffffu);
|
|
buf2.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
RadixSort32<TYPE_HOST>::execute( dataH, buf0, size, 32 );
|
|
RadixSort32<type>::execute( dataC, buf2, size, 32 );
|
|
|
|
buf2.read( buf1.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
// for(int i=0; i<size-1; i++) TEST_ASSERT( buf1[i] <= buf1[i+1] );
|
|
for(int i=0; i<size; i++) TEST_ASSERT( buf0[i] == buf1[i] );
|
|
}
|
|
|
|
RadixSort32<TYPE_HOST>::deallocate( dataH );
|
|
RadixSort32<type>::deallocate( dataC );
|
|
|
|
TEST_REPORT( "RadixSort32Test" );
|
|
}
|
|
|
|
template<DeviceType type>
|
|
void radixSortKeyValue32Test( Device* deviceGPU, Device* deviceHost )
|
|
{
|
|
TEST_INIT;
|
|
ADLASSERT( type == deviceGPU->m_type );
|
|
|
|
int maxSize = 1024*256;
|
|
|
|
// Host buffers
|
|
HostBuffer<u32> buf0( deviceHost, maxSize ); // Buffer for keys in host and will be sorted by host.
|
|
HostBuffer<u32> buf1( deviceHost, maxSize ); // Buffer for keys in host and will be saved by device after sorting in device.
|
|
HostBuffer<u32> buf2( deviceHost, maxSize ); // Buffer for values in host. This buffer is paired with buf0.
|
|
HostBuffer<u32> buf3( deviceHost, maxSize ); // Buffer for values in host and will be saved by device after sorting. It is paired with buf1.
|
|
|
|
// Device buffers
|
|
Buffer<u32> buf4( deviceGPU, maxSize ); // Buffer for input keys for device.
|
|
Buffer<u32> buf5( deviceGPU, maxSize ); // Buffer for output keys from device and will be sorted by device. This key data will be saved to buf1 to be compared with a result(buf0) from host.
|
|
Buffer<u32> buf6( deviceGPU, maxSize ); // Buffer for input values in device.
|
|
Buffer<u32> buf7( deviceGPU, maxSize ); // Buffer for output values in device.
|
|
|
|
RadixSort32<TYPE_HOST>::Data* dataH = RadixSort32<TYPE_HOST>::allocate( deviceHost, maxSize );
|
|
RadixSort32<type>::Data* dataC = RadixSort32<type>::allocate( deviceGPU, maxSize );
|
|
|
|
int dx = maxSize/NUM_TESTS;
|
|
|
|
for(int iter=0; iter<NUM_TESTS; iter++)
|
|
{
|
|
int size = min2( 128+dx*iter, maxSize-512 );
|
|
size = NEXTMULTIPLEOF( size, 512 );
|
|
|
|
// keys
|
|
seedRandom((int)time(NULL)/2);
|
|
for(int i=0; i<size; i++) buf0[i] = getRandom(0u,0xffffffffu);
|
|
buf4.write( buf0.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
// values
|
|
seedRandom((int)time(NULL)/2);
|
|
for(int i=0; i<size; i++) buf2[i] = getRandom(0u,0xffffffffu);
|
|
buf6.write( buf2.m_ptr, size );
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
RadixSort32<TYPE_HOST>::execute( dataH, buf0, buf2, size, 32 );
|
|
RadixSort32<type>::execute( dataC, buf4, buf5, buf6, buf7, size, 32 );
|
|
buf5.read( buf1.m_ptr, size );
|
|
buf7.read( buf3.m_ptr, size );
|
|
|
|
DeviceUtils::waitForCompletion( deviceGPU );
|
|
|
|
for(int i=0; i<size; i++)
|
|
{
|
|
// Comparing keys. One is done by Host and the other is done by Device.
|
|
TEST_ASSERT( buf0[i] == buf1[i] );
|
|
|
|
// Comparing values. One is done by Host and the other is done by Device.
|
|
TEST_ASSERT( buf2[i] == buf3[i] );
|
|
}
|
|
}
|
|
|
|
RadixSort32<TYPE_HOST>::deallocate( dataH );
|
|
RadixSort32<type>::deallocate( dataC );
|
|
|
|
TEST_REPORT( "RadixSortKeyValue32Test" );
|
|
}
|
|
|
|
#if defined(ADL_ENABLE_DX11)
|
|
#define RUN_GPU( func ) func(ddcl); func(dddx);
|
|
#define RUN_GPU_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost ); func<TYPE_DX11>( dddx, ddhost );
|
|
#define RUN_CL_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost );
|
|
#else
|
|
#define RUN_GPU( func ) func(ddcl);
|
|
#define RUN_GPU_TEMPLATE( func ) func<TYPE_CL>( ddcl, ddhost );
|
|
#endif
|
|
#define RUN_ALL( func ) RUN_GPU( func ); func(ddhost);
|
|
|
|
void runAllTest()
|
|
{
|
|
g_nPassed = 0;
|
|
g_nFailed = 0;
|
|
|
|
|
|
Device* ddcl;
|
|
Device* ddhost;
|
|
#if defined(ADL_ENABLE_DX11)
|
|
Device* dddx;
|
|
#endif
|
|
|
|
{
|
|
DeviceUtils::Config cfg;
|
|
|
|
// Choose AMD or NVidia
|
|
#ifdef CL_PLATFORM_AMD
|
|
cfg.m_vendor = adl::DeviceUtils::Config::VD_AMD;
|
|
#endif
|
|
|
|
#ifdef CL_PLATFORM_INTEL
|
|
cfg.m_vendor = adl::DeviceUtils::Config::VD_INTEL;
|
|
cfg.m_type = DeviceUtils::Config::DEVICE_CPU;
|
|
#endif
|
|
|
|
|
|
#ifdef CL_PLATFORM_NVIDIA
|
|
cfg.m_vendor = adl::DeviceUtils::Config::VD_NV;
|
|
#endif
|
|
|
|
|
|
ddcl = DeviceUtils::allocate( TYPE_CL, cfg );
|
|
ddhost = DeviceUtils::allocate( TYPE_HOST, cfg );
|
|
// cfg.m_type = DeviceUtils::Config::DEVICE_GPU;
|
|
#if defined(ADL_ENABLE_DX11)
|
|
dddx = DeviceUtils::allocate( TYPE_DX11, cfg );
|
|
#endif
|
|
}
|
|
|
|
{
|
|
char name[128];
|
|
ddcl->getDeviceName( name );
|
|
printf("CL: %s\n", name);
|
|
#ifdef ADL_ENABLE_DX11
|
|
dddx->getDeviceName( name );
|
|
printf("DX11: %s\n", name);
|
|
#endif
|
|
}
|
|
|
|
RUN_GPU_TEMPLATE( radixSort32Test );
|
|
RUN_GPU_TEMPLATE( radixSortKeyValue32Test );
|
|
|
|
if (1)
|
|
{
|
|
RUN_GPU_TEMPLATE( CopyF1Test );
|
|
RUN_GPU_TEMPLATE( CopyF2Test );
|
|
|
|
boundSearchTest<TYPE_HOST>( ddhost, ddhost );
|
|
// fillTest<TYPE_HOST>( ddhost, ddhost );
|
|
// fillTest<TYPE_CL>( ddcl, ddhost );
|
|
|
|
|
|
|
|
|
|
RUN_GPU_TEMPLATE( boundSearchTest );
|
|
|
|
RUN_GPU_TEMPLATE( fillIntTest );
|
|
RUN_GPU_TEMPLATE( fillInt2Test );
|
|
RUN_GPU_TEMPLATE( fillInt4Test );
|
|
|
|
RUN_ALL( stopwatchTest );
|
|
RUN_ALL( memCpyTest );
|
|
// RUN_GPU( kernelTest );
|
|
RUN_GPU_TEMPLATE( scanTest );
|
|
RUN_GPU_TEMPLATE( radixSortSimpleTest );
|
|
|
|
RUN_GPU_TEMPLATE( radixSortStandardTest );
|
|
|
|
RUN_GPU_TEMPLATE( radixSort32Test );
|
|
|
|
// RUN_GPU_TEMPLATE( boundSearchTest );
|
|
RUN_GPU_TEMPLATE( Copy1F4Test );
|
|
RUN_GPU_TEMPLATE( Copy2F4Test );
|
|
RUN_GPU_TEMPLATE( Copy4F4Test );
|
|
}
|
|
|
|
DeviceUtils::deallocate( ddcl );
|
|
DeviceUtils::deallocate( ddhost );
|
|
#if defined(ADL_ENABLE_DX11)
|
|
DeviceUtils::deallocate( dddx );
|
|
#endif
|
|
|
|
printf("=========\n%d Passed\n%d Failed\n", g_nPassed, g_nFailed);
|
|
|
|
|
|
} |