// NOTE(review): unit-test driver for the ADL primitives (see "using namespace adl" below).
//
// What the visible text establishes:
//   - TEST_INIT clears g_testFailed; TEST_ASSERT(x) sets g_testFailed when x is false;
//     TEST_REPORT(name) prints "[X]" (failed) or "[O]" (passed) followed by the name and
//     increments g_nFailed or g_nPassed.
//   - The *Test functions take Device pointers. Most allocate one Data object on deviceHost
//     and one on deviceGPU, execute the same primitive through both, read the device buffer
//     back into a host buffer and call DeviceUtils::waitForCompletion( deviceGPU ).
//     scanTest additionally asserts sumHost == sumGPU.
//   - radixSortTest and CopyF4Test return g_testFailed; their thin wrappers
//     (radixSortSimpleTest, Copy1F4Test, ...) store that value back and call TEST_REPORT.
//   - runAllTest resets the counters, allocates TYPE_CL and TYPE_HOST devices (plus TYPE_DX11
//     when ADL_ENABLE_DX11 is defined), prints the device names, runs the tests through the
//     RUN_* macros, deallocates the devices and prints the passed/failed totals.
//
// NOTE(review): this copy of the file looks damaged -- confirm against the original
// distribution before changing any code:
//   - line breaks appear to have been collapsed, so each "//" comment now swallows the rest
//     of its physical line;
//   - every "<...>" span appears to have been stripped: the #include directives have no
//     targets, "template" has no parameter list, and loops such as "for(int iter=0; iter"
//     run straight into unrelated later code (memCpyTest is fused with the middle of the
//     kernel test; the per-element comparison loops are not visible);
//   - the "/*" inside boundSearchTest has no visible closing marker.
//   Presumably the file does not compile in this form.
/* Copyright (c) 2012 Advanced Micro Devices, Inc. This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. */ //Originally written by Takahiro Harada #include #include #include #include #include #include #include using namespace adl; #define NUM_TESTS 10 int g_nPassed = 0; int g_nFailed = 0; bool g_testFailed = 0; //#define TEST_INIT bool g_testFailed = 0; #define TEST_INIT g_testFailed = 0; #define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;} //#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;ADLASSERT(x);} #define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; void memCpyTest( Device* deviceData ) { TEST_INIT; int maxSize = 64*1024; Buffer buff( deviceData, maxSize ); u32* hostBuff = new u32[maxSize]; for(int iter=0; iterquery(deviceData, ".\\Kernel", "VectorAddKernel" ); { int size = 1024; Buffer buf0( deviceData, size ); Buffer buf1( deviceData, size ); Buffer cBuf( deviceData, 1, BufferBase::BUFFER_CONST ); int* hostBuf0 = new int[size]; int* hostBuf1 = new int[size]; for(int i=0; i*)&buf0 ), Launcher::BufferInfo( (Buffer*)&buf1, true ) }; Launcher launcher( deviceData, kernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) 
); launcher.setConst( (Buffer&)cBuf, constBuffer ); launcher.launch1D( size ); buf0.read( hostBuf0, size ); buf1.read( hostBuf1, size ); DeviceUtils::waitForCompletion( deviceData ); } for(int i=0; i void scanTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Buffer buf3( deviceGPU, maxSize ); PrefixScan::Data* data0 = PrefixScan::allocate( deviceGPU, maxSize ); PrefixScan::Data* data1 = PrefixScan::allocate( deviceHost, maxSize ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data1, buf0, buf1, size, &sumHost ); PrefixScan::execute( data0, buf2, buf3, size, &sumGPU ); buf3.read( buf0.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); TEST_ASSERT( sumHost == sumGPU ); for(int i=0; i::deallocate( data1 ); PrefixScan::deallocate( data0 ); TEST_REPORT( "scanTest" ); } template bool radixSortTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); RadixSort::Data* dataH = RadixSort::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE ); RadixSort::Data* dataC = RadixSort::allocate( deviceGPU, maxSize, SORT_TYPE ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( dataH, buf0, size ); RadixSort::execute( dataC, buf2, size ); buf2.read( buf1.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( dataH ); RadixSort::deallocate( dataC ); return g_testFailed; } template void radixSortSimpleTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; g_testFailed = radixSortTest(deviceGPU, deviceHost); TEST_REPORT( "radixSortSimpleTest" ); } template void radixSortStandardTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; g_testFailed = 
radixSortTest(deviceGPU, deviceHost); TEST_REPORT( "radixSortStandardTest" ); } template void radixSortAdvancedTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; g_testFailed = radixSortTest(deviceGPU, deviceHost); TEST_REPORT( "radixSortAdvancedTest" ); } template void boundSearchTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; int bucketSize = 256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer lowerH( deviceHost, maxSize ); HostBuffer upperH( deviceHost, maxSize ); Buffer buf( deviceGPU, maxSize ); Buffer lower( deviceGPU, maxSize ); Buffer upper( deviceGPU, maxSize ); BoundSearch::Data* dataH = BoundSearch::allocate( deviceGPU ); RadixSort::Data* dataHSort = RadixSort::allocate( deviceHost, maxSize, RadixSortBase::SORT_SIMPLE ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( dataHSort, buf0, size ); buf.write( buf0.m_ptr, size ); { u32* host = new u32[size]; for(int i=0; i::execute( dataH, buf, size, lower, bucketSize, BoundSearchBase::BOUND_LOWER ); BoundSearch::execute( dataH, buf, size, upper, bucketSize, BoundSearchBase::BOUND_UPPER ); lower.read( lowerH.m_ptr, bucketSize ); upper.read( upperH.m_ptr, bucketSize ); DeviceUtils::waitForCompletion( deviceGPU ); /* for(u32 i=1; i<(u32)bucketSize; i++) { for(u32 j=lowerH[i-1]; j::deallocate( dataH ); RadixSort::deallocate( dataHSort ); TEST_REPORT( "boundSearchTest" ); } template void fillIntTest( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Fill::Data* data0 = Fill::allocate( deviceHost ); Fill::Data* data1 = Fill::allocate( deviceGPU ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data0, buf0, 12, size ); Fill::execute( data1, buf2, 12, size ); buf2.read( buf1.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU 
); for(int i=0; i::deallocate( data0 ); Fill::deallocate( data1 ); TEST_REPORT( "fillIntTest" ); } template void fillInt2Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Fill::Data* data0 = Fill::allocate( deviceHost ); Fill::Data* data1 = Fill::allocate( deviceGPU ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data0, buf0, make_int2( 12, 12 ), size ); Fill::execute( data1, buf2, make_int2( 12, 12 ), size ); buf2.read( buf1.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( data0 ); Fill::deallocate( data1 ); TEST_REPORT( "fillInt2Test" ); } template void fillInt4Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Fill::Data* data0 = Fill::allocate( deviceHost ); Fill::Data* data1 = Fill::allocate( deviceGPU ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data0, buf0, make_int4( 12 ), size ); Fill::execute( data1, buf2, make_int4( 12 ), size ); buf2.read( buf1.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( data0 ); Fill::deallocate( data1 ); TEST_REPORT( "fillInt4Test" ); } template bool CopyF4Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Buffer buf3( deviceGPU, maxSize ); HostBuffer devResult( deviceHost, maxSize ); Copy::Data* data0 = Copy::allocate( deviceHost ); Copy::Data* data1 = Copy::allocate( deviceGPU ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data0, buf1, buf0, size, OPTION 
); Copy::execute( data1, buf3, buf2, size, OPTION ); buf3.read( devResult.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( data0 ); Copy::deallocate( data1 ); return g_testFailed; } template void Copy1F4Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; g_testFailed = CopyF4Test( deviceGPU, deviceHost ); TEST_REPORT( "Copy1F4Test" ); } template void Copy2F4Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; g_testFailed = CopyF4Test( deviceGPU, deviceHost ); TEST_REPORT( "Copy2F4Test" ); } template void Copy4F4Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; g_testFailed = CopyF4Test( deviceGPU, deviceHost ); TEST_REPORT( "Copy4F4Test" ); } template void CopyF1Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Buffer buf3( deviceGPU, maxSize ); HostBuffer devResult( deviceHost, maxSize ); Copy::Data* data0 = Copy::allocate( deviceHost ); Copy::Data* data1 = Copy::allocate( deviceGPU ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data0, buf1, buf0, size ); Copy::execute( data1, buf3, buf2, size ); buf3.read( devResult.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( data0 ); Copy::deallocate( data1 ); TEST_REPORT( "CopyF1Test" ); } template void CopyF2Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); Buffer buf3( deviceGPU, maxSize ); HostBuffer devResult( deviceHost, maxSize ); Copy::Data* data0 = Copy::allocate( deviceHost ); Copy::Data* data1 = Copy::allocate( deviceGPU ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( data0, buf1, buf0, size ); Copy::execute( data1, 
buf3, buf2, size ); buf3.read( devResult.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( data0 ); Copy::deallocate( data1 ); TEST_REPORT( "CopyF2Test" ); } template void radixSort32Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; HostBuffer buf0( deviceHost, maxSize ); HostBuffer buf1( deviceHost, maxSize ); Buffer buf2( deviceGPU, maxSize ); RadixSort32::Data* dataH = RadixSort32::allocate( deviceHost, maxSize ); RadixSort32::Data* dataC = RadixSort32::allocate( deviceGPU, maxSize ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( dataH, buf0, size, 32 ); RadixSort32::execute( dataC, buf2, size, 32 ); buf2.read( buf1.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); // for(int i=0; i::deallocate( dataH ); RadixSort32::deallocate( dataC ); TEST_REPORT( "RadixSort32Test" ); } template void radixSortKeyValue32Test( Device* deviceGPU, Device* deviceHost ) { TEST_INIT; ADLASSERT( type == deviceGPU->m_type ); int maxSize = 1024*256; // Host buffers HostBuffer buf0( deviceHost, maxSize ); // Buffer for keys in host and will be sorted by host. HostBuffer buf1( deviceHost, maxSize ); // Buffer for keys in host and will be saved by device after sorting in device. HostBuffer buf2( deviceHost, maxSize ); // Buffer for values in host. This buffer is paired with buf0. HostBuffer buf3( deviceHost, maxSize ); // Buffer for values in host and will be saved by device after sorting. It is paired with buf1. // Device buffers Buffer buf4( deviceGPU, maxSize ); // Buffer for input keys for device. Buffer buf5( deviceGPU, maxSize ); // Buffer for output keys from device and will be sorted by device. This key data will be saved to buf1 to be compared with a result(buf0) from host. Buffer buf6( deviceGPU, maxSize ); // Buffer for input values in device. Buffer buf7( deviceGPU, maxSize ); // Buffer for output values in device. 
RadixSort32::Data* dataH = RadixSort32::allocate( deviceHost, maxSize ); RadixSort32::Data* dataC = RadixSort32::allocate( deviceGPU, maxSize ); int dx = maxSize/NUM_TESTS; for(int iter=0; iter::execute( dataH, buf0, buf2, size, 32 ); RadixSort32::execute( dataC, buf4, buf5, buf6, buf7, size, 32 ); buf5.read( buf1.m_ptr, size ); buf7.read( buf3.m_ptr, size ); DeviceUtils::waitForCompletion( deviceGPU ); for(int i=0; i::deallocate( dataH ); RadixSort32::deallocate( dataC ); TEST_REPORT( "RadixSortKeyValue32Test" ); } #if defined(ADL_ENABLE_DX11) #define RUN_GPU( func ) func(ddcl); func(dddx); #define RUN_GPU_TEMPLATE( func ) func( ddcl, ddhost ); func( dddx, ddhost ); #define RUN_CL_TEMPLATE( func ) func( ddcl, ddhost ); #else #define RUN_GPU( func ) func(ddcl); #define RUN_GPU_TEMPLATE( func ) func( ddcl, ddhost ); #endif #define RUN_ALL( func ) RUN_GPU( func ); func(ddhost); void runAllTest() { g_nPassed = 0; g_nFailed = 0; Device* ddcl; Device* ddhost; #if defined(ADL_ENABLE_DX11) Device* dddx; #endif { DeviceUtils::Config cfg; // Choose AMD or NVidia #ifdef CL_PLATFORM_AMD cfg.m_vendor = adl::DeviceUtils::Config::VD_AMD; #endif #ifdef CL_PLATFORM_INTEL cfg.m_vendor = adl::DeviceUtils::Config::VD_INTEL; cfg.m_type = DeviceUtils::Config::DEVICE_CPU; #endif #ifdef CL_PLATFORM_NVIDIA cfg.m_vendor = adl::DeviceUtils::Config::VD_NV; #endif ddcl = DeviceUtils::allocate( TYPE_CL, cfg ); ddhost = DeviceUtils::allocate( TYPE_HOST, cfg ); // cfg.m_type = DeviceUtils::Config::DEVICE_GPU; #if defined(ADL_ENABLE_DX11) dddx = DeviceUtils::allocate( TYPE_DX11, cfg ); #endif } { char name[128]; ddcl->getDeviceName( name ); printf("CL: %s\n", name); #ifdef ADL_ENABLE_DX11 dddx->getDeviceName( name ); printf("DX11: %s\n", name); #endif } RUN_GPU_TEMPLATE( radixSort32Test ); RUN_GPU_TEMPLATE( radixSortKeyValue32Test ); if (1) { RUN_GPU_TEMPLATE( CopyF1Test ); RUN_GPU_TEMPLATE( CopyF2Test ); boundSearchTest( ddhost, ddhost ); // fillTest( ddhost, ddhost ); // fillTest( ddcl, ddhost 
); RUN_GPU_TEMPLATE( boundSearchTest ); RUN_GPU_TEMPLATE( fillIntTest ); RUN_GPU_TEMPLATE( fillInt2Test ); RUN_GPU_TEMPLATE( fillInt4Test ); RUN_ALL( stopwatchTest ); RUN_ALL( memCpyTest ); // RUN_GPU( kernelTest ); RUN_GPU_TEMPLATE( scanTest ); RUN_GPU_TEMPLATE( radixSortSimpleTest ); RUN_GPU_TEMPLATE( radixSortStandardTest ); RUN_GPU_TEMPLATE( radixSort32Test ); // RUN_GPU_TEMPLATE( boundSearchTest ); RUN_GPU_TEMPLATE( Copy1F4Test ); RUN_GPU_TEMPLATE( Copy2F4Test ); RUN_GPU_TEMPLATE( Copy4F4Test ); } DeviceUtils::deallocate( ddcl ); DeviceUtils::deallocate( ddhost ); #if defined(ADL_ENABLE_DX11) DeviceUtils::deallocate( dddx ); #endif printf("=========\n%d Passed\n%d Failed\n", g_nPassed, g_nFailed); }