/***************************************************************************
 * CUDA GPU benchmarking routines: peak multiply-add arithmetic throughput,
 * host-device transfer bandwidth and latency, on-GPU global memory
 * bandwidth, kernel launch and thread pool dispatch latencies, and
 * out-of-core trajectory I/O rates (GPUDirect Storage / cuFile).
 ***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>

#include "Inform.h"
#include "WKFThreads.h"
#include "WKFUtils.h"
#include "CUDAKernels.h"
#include "Measure.h"

// enable the restrict qualifier on device pointers when desired
#if 1
#define RESTRICT __restrict__
#else
#define RESTRICT
#endif

// optional GPUDirect Storage (GDS) support via the cuFile API, with the
// js plugin reader included directly in source form
#if defined(VMDUSECUDAGDS)
#include </usr/local/gds-beta-0.7.1/lib/cufile.h>

#define VMDJSPLUGININCLUDESRC 1
#include "/home/johns/plugins/molfile_plugin/src/jsplugin.c"
#endif

// report CUDA errors at the point of use and bail out of the calling function
#define CUERR { cudaError_t err; \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  return -1; }}

//
// Multiply-add arithmetic throughput benchmark
//
#define FMADD16 \
  tmp0 = tmp0*tmp4+tmp7; \
  tmp1 = tmp1*tmp5+tmp0; \
  tmp2 = tmp2*tmp6+tmp1; \
  tmp3 = tmp3*tmp7+tmp2; \
  tmp4 = tmp4*tmp0+tmp3; \
  tmp5 = tmp5*tmp1+tmp4; \
  tmp6 = tmp6*tmp2+tmp5; \
  tmp7 = tmp7*tmp3+tmp6; \
  tmp8 = tmp8*tmp12+tmp15; \
  tmp9 = tmp9*tmp13+tmp8; \
  tmp10 = tmp10*tmp14+tmp9; \
  tmp11 = tmp11*tmp15+tmp10; \
  tmp12 = tmp12*tmp8+tmp11; \
  tmp13 = tmp13*tmp9+tmp12; \
  tmp14 = tmp14*tmp10+tmp13; \
  tmp15 = tmp15*tmp11+tmp14;
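// Each FMADD16 expansion performs 16 single-precision multiply-adds split
// across two interleaved 8-register dependency chains, giving the compiler
// enough independent work to keep the FMA units busy.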

#define GRIDSIZEX        6144  // number of 1-D thread blocks
#define BLOCKSIZEX         64  // number of threads per 1-D block
#define GLOOPS           2000  // iteration count (all threads)
#define FMADD16COUNT       32  // 32 reps
#define FLOPSPERFMADD16    32  // 16 MULs and 16 ADDs

#define FLOPSPERLOOP (FMADD16COUNT * FLOPSPERFMADD16)
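// Per kernel launch the arithmetic work is:
//   GLOOPS * FMADD16COUNT * FLOPSPERFMADD16 = 2000 * 32 * 32
//   = 2,048,000 FLOPs per thread,
// times BLOCKSIZEX * GRIDSIZEX = 64 * 6144 = 393,216 threads,
// i.e. roughly 8.05e11 FLOPs per launch; GFLOPS = FLOPs * 1e-9 / runtime.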

__global__ static void madd_kernel(float *doutput) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  float tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
  float tmp8,tmp9,tmp10,tmp11,tmp12,tmp13,tmp14,tmp15;
  tmp0=tmp1=tmp2=tmp3=tmp4=tmp5=tmp6=tmp7=0.0f;
  tmp8=tmp9=tmp10=tmp11=tmp12=tmp13=tmp14=tmp15=0.0f;

  tmp15=tmp7 = blockIdx.x * 0.001f;
  tmp1 = blockIdx.y * 0.001f;

  int loop;
  for (loop=0; loop<GLOOPS; loop++) {
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
    FMADD16
  }

  doutput[tid] = tmp0+tmp1+tmp2+tmp3+tmp4+tmp5+tmp6+tmp7
                +tmp8+tmp9+tmp10+tmp11+tmp12+tmp13+tmp14+tmp15;
}
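// The sum stored to doutput[] above prevents the compiler from eliminating
// the register arithmetic as dead code, and gives each thread a unique store.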

static int cudamaddgflops(int cudadev, double *gflops, int testloops) {
  float *doutput = NULL;
  dim3 Bsz, Gsz;
  wkf_timerhandle timer;
  int i;

  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError();
    if (rc != cudaErrorSetOnActiveProcess)
      return -1;
#else
    cudaGetLastError();  // clear the error so later calls aren't affected
#endif
  }

  // setup 1-D thread block and grid dimensions
  Bsz.x = BLOCKSIZEX;
  Bsz.y = 1;
  Bsz.z = 1;
  Gsz.x = GRIDSIZEX;
  Gsz.y = 1;
  Gsz.z = 1;

  // allocate the per-thread output array
  cudaMalloc((void**)&doutput, BLOCKSIZEX * GRIDSIZEX * sizeof(float));
  CUERR

  // warm-up run, not timed
  madd_kernel<<<Gsz, Bsz>>>(doutput);
  cudaDeviceSynchronize();

  // timed benchmark runs
  timer=wkf_timer_create();
  wkf_timer_start(timer);
  for (i=0; i<testloops; i++) {
    madd_kernel<<<Gsz, Bsz>>>(doutput);
  }
  cudaDeviceSynchronize();
  CUERR
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gflop = ((double) GLOOPS) * ((double) FLOPSPERLOOP) *
                 ((double) BLOCKSIZEX) * ((double) GRIDSIZEX) * (1.0e-9) * testloops;

  *gflops = gflop / runtime;

  cudaFree(doutput);
  CUERR

  wkf_timer_destroy(timer);

  return 0;
}

typedef struct {
  int deviceid;
  int testloops;
  double gflops;
} maddthrparms;

static void * cudamaddthread(void *voidparms) {
  maddthrparms *parms = (maddthrparms *) voidparms;
  cudamaddgflops(parms->deviceid, &parms->gflops, parms->testloops);
  return NULL;
}

int vmd_cuda_madd_gflops(int numdevs, int *devlist, double *gflops,
                         int testloops) {
  maddthrparms *parms;
  wkf_thread_t * threads;
  int i;

  // allocate array of threads
  threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);

  // allocate and initialize array of thread parameters
  parms = (maddthrparms *) malloc(numdevs * sizeof(maddthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;

    parms[i].testloops = testloops;
    parms[i].gflops = 0.0;
  }

#if defined(VMDTHREADS)
  // spawn child threads, one per device, to do the work
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudamaddthread, &parms[i]);
  }

  // join the threads after the work is done
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  // single-threaded code path
  cudamaddthread((void *) &parms[0]);
#endif

  // collect per-device results
  for (i=0; i<numdevs; i++) {
    gflops[i] = parms[i].gflops;
  }

  // clean up and return
  free(parms);
  free(threads);

  return 0;
}
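// Example (hypothetical caller): query per-device arithmetic rates for the
// first two CUDA devices, with 10 timing loops each:
//   double gf[2];
//   int devs[2] = {0, 1};
//   if (vmd_cuda_madd_gflops(2, devs, gf, 10) == 0)
//     printf("GPU0: %.1f GFLOPS  GPU1: %.1f GFLOPS\n", gf[0], gf[1]);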

//
// Host <-> device copy bandwidth and latency benchmark, using both pageable
// and pinned (page-locked) host buffers.
//
#define BWITER       500     // iterations for bandwidth measurements
#define LATENCYITER  50000   // iterations for latency measurements
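// Bandwidth is reported in MB/sec: (BWITER * memsz bytes) / elapsed seconds
// / 2^20.  Latency is reported in microseconds per 1-byte copy:
// elapsed seconds * 1e6 / LATENCYITER.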

static int cudabusbw(int cudadev,
                     double *hdmbsec, double *hdlatusec,
                     double *phdmbsec, double *phdlatusec,
                     double *dhmbsec, double *dhlatusec,
                     double *pdhmbsec, double *pdhlatusec) {
  float *hdata = NULL;   // pageable host buffer
  float *phdata = NULL;  // pinned (page-locked) host buffer
  float *ddata = NULL;   // device buffer
  int i;
  double runtime;
  wkf_timerhandle timer;
  int memsz = 1024 * 1024 * sizeof(float);

  *hdmbsec = 0.0;
  *hdlatusec = 0.0;
  *dhmbsec = 0.0;
  *dhlatusec = 0.0;
  *phdmbsec = 0.0;
  *phdlatusec = 0.0;
  *pdhmbsec = 0.0;
  *pdhlatusec = 0.0;

  // attach to the requested device
  cudaError_t rc;
  rc = cudaSetDevice(cudadev);
  if (rc != cudaSuccess) {
#if CUDART_VERSION >= 2010
    rc = cudaGetLastError();
    if (rc != cudaErrorSetOnActiveProcess)
      return -1;
#else
    cudaGetLastError();  // clear the error so later calls aren't affected
#endif
  }

  // allocate pageable host buffer
  hdata = (float *) malloc(memsz);

  // allocate pinned host buffer
  cudaMallocHost((void**) &phdata, memsz);
  CUERR

  // allocate device buffer
  cudaMalloc((void**) &ddata, memsz);
  CUERR

  timer=wkf_timer_create();

  //
  // host-to-device transfers from the pageable host buffer
  //
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, hdata, memsz, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *hdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // 1-byte copies to measure latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, hdata, 1, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *hdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  //
  // host-to-device transfers from the pinned host buffer
  //
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(ddata, phdata, memsz, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *phdmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // 1-byte copies to measure latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(ddata, phdata, 1, cudaMemcpyHostToDevice);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *phdlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  //
  // device-to-host transfers into the pageable host buffer
  //
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(hdata, ddata, memsz, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *dhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // 1-byte copies to measure latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(hdata, ddata, 1, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *dhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  //
  // device-to-host transfers into the pinned host buffer
  //
  wkf_timer_start(timer);
  for (i=0; i<BWITER; i++) {
    cudaMemcpy(phdata, ddata, memsz, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *pdhmbsec = ((double) BWITER) * ((double) memsz) / runtime / (1024.0 * 1024.0);

  // 1-byte copies to measure latency
  wkf_timer_start(timer);
  for (i=0; i<LATENCYITER; i++) {
    cudaMemcpy(phdata, ddata, 1, cudaMemcpyDeviceToHost);
  }
  wkf_timer_stop(timer);
  CUERR
  runtime = wkf_timer_time(timer);
  *pdhlatusec = runtime * 1.0e6 / ((double) LATENCYITER);

  // cleanup
  cudaFree(ddata);
  CUERR
  cudaFreeHost(phdata);
  CUERR
  free(hdata);

  wkf_timer_destroy(timer);

  return 0;
}

typedef struct {
  int deviceid;
  double hdmbsec;
  double hdlatusec;
  double phdmbsec;
  double phdlatusec;
  double dhmbsec;
  double dhlatusec;
  double pdhmbsec;
  double pdhlatusec;
} busbwthrparms;

static void * cudabusbwthread(void *voidparms) {
  busbwthrparms *parms = (busbwthrparms *) voidparms;
  cudabusbw(parms->deviceid,
            &parms->hdmbsec, &parms->hdlatusec,
            &parms->phdmbsec, &parms->phdlatusec,
            &parms->dhmbsec, &parms->dhlatusec,
            &parms->pdhmbsec, &parms->pdhlatusec);
  return NULL;
}

int vmd_cuda_bus_bw(int numdevs, int *devlist,
                    double *hdmbsec, double *hdlatusec,
                    double *phdmbsec, double *phdlatusec,
                    double *dhmbsec, double *dhlatusec,
                    double *pdhmbsec, double *pdhlatusec) {
  busbwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  // allocate array of threads
  threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);

  // allocate and initialize array of thread parameters
  parms = (busbwthrparms *) malloc(numdevs * sizeof(busbwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].hdmbsec = 0.0;
    parms[i].hdlatusec = 0.0;
    parms[i].phdmbsec = 0.0;
    parms[i].phdlatusec = 0.0;
    parms[i].dhmbsec = 0.0;
    parms[i].dhlatusec = 0.0;
    parms[i].pdhmbsec = 0.0;
    parms[i].pdhlatusec = 0.0;
  }

#if defined(VMDTHREADS)
  // spawn child threads, one per device, to do the work
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudabusbwthread, &parms[i]);
  }

  // join the threads after the work is done
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  // single-threaded code path
  cudabusbwthread((void *) &parms[0]);
#endif

  // collect per-device results
  for (i=0; i<numdevs; i++) {
    hdmbsec[i] = parms[i].hdmbsec;
    hdlatusec[i] = parms[i].hdlatusec;
    phdmbsec[i] = parms[i].phdmbsec;
    phdlatusec[i] = parms[i].phdlatusec;
    dhmbsec[i] = parms[i].dhmbsec;
    dhlatusec[i] = parms[i].dhlatusec;
    pdhmbsec[i] = parms[i].pdhmbsec;
    pdhlatusec[i] = parms[i].pdhlatusec;
  }

  // clean up and return
  free(parms);
  free(threads);

  return 0;
}

//
// On-GPU global memory bandwidth benchmark
//
template <class T>
__global__ void gpuglobmemcpybw(T *dest, const T *src) {
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = src[idx];
}

template <class T>
__global__ void gpuglobmemsetbw(T *dest, const T val) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  dest[idx] = val;
}

typedef float4 datatype;
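// float4 elements give 16-byte loads/stores per thread, which these
// streaming kernels need in order to approach peak global memory bandwidth.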
00537
00538 static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
00539 int i;
00540 int len = 1 << 22;
00541 int loops = 500;
00542 datatype *src, *dest;
00543 datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
00544
00545
00546 float memsettime = 0.0f;
00547 float memcpytime = 0.0f;
00548 *gpumemsetgbsec = 0.0;
00549 *gpumemcpygbsec = 0.0;
00550
00551
00552 cudaError_t rc;
00553 rc = cudaSetDevice(cudadev);
00554 if (rc != cudaSuccess) {
00555 #if CUDART_VERSION >= 2010
00556 rc = cudaGetLastError();
00557 if (rc != cudaErrorSetOnActiveProcess)
00558 return -1;
00559 #else
00560 cudaGetLastError();
00561
00562 #endif
00563 }
00564
00565 cudaMalloc((void **) &src, sizeof(datatype)*len);
00566 CUERR
00567 cudaMalloc((void **) &dest, sizeof(datatype)*len);
00568 CUERR
00569
00570 dim3 BSz(256, 1, 1);
00571 dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1);
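  // len is 2^22 float4 elements (64MB per buffer); the 1-D grid assigns one
  // element per thread, i.e. 2^22 / 256 = 16384 blocks of 256 threads.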

  // initialize both buffers and warm up the kernels before timing
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
  CUERR
  gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  CUERR
  gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  CUERR

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);

  // time the memset kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
  }
  CUERR
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memsettime, start, end);
  CUERR

  // time the memcpy kernel
  cudaEventRecord(start, 0);
  for (i=0; i<loops; i++) {
    gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
  }
  cudaEventRecord(end, 0);
  CUERR
  cudaEventSynchronize(start);
  CUERR
  cudaEventSynchronize(end);
  CUERR
  cudaEventElapsedTime(&memcpytime, start, end);
  CUERR

  cudaEventDestroy(start);
  CUERR
  cudaEventDestroy(end);
  CUERR
  *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
  *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
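  // cudaEventElapsedTime() reports milliseconds, so (MB per launch) /
  // (msec per launch) comes out in units of roughly GB/sec (1 MB/ms ~= 1 GB/sec);
  // the memcpy figure counts both the read and the write traffic (2x).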
  cudaFree(dest);
  cudaFree(src);
  CUERR

  return 0;
}

typedef struct {
  int deviceid;
  double memsetgbsec;
  double memcpygbsec;
} globmembwthrparms;

static void * cudaglobmembwthread(void *voidparms) {
  globmembwthrparms *parms = (globmembwthrparms *) voidparms;
  cudaglobmembw(parms->deviceid, &parms->memsetgbsec, &parms->memcpygbsec);
  return NULL;
}

int vmd_cuda_globmem_bw(int numdevs, int *devlist,
                        double *memsetgbsec, double *memcpygbsec) {
  globmembwthrparms *parms;
  wkf_thread_t * threads;
  int i;

  // allocate array of threads
  threads = (wkf_thread_t *) calloc(numdevs * sizeof(wkf_thread_t), 1);

  // allocate and initialize array of thread parameters
  parms = (globmembwthrparms *) malloc(numdevs * sizeof(globmembwthrparms));
  for (i=0; i<numdevs; i++) {
    if (devlist != NULL)
      parms[i].deviceid = devlist[i];
    else
      parms[i].deviceid = i;
    parms[i].memsetgbsec = 0.0;
    parms[i].memcpygbsec = 0.0;
  }

#if defined(VMDTHREADS)
  // spawn child threads, one per device, to do the work
  for (i=0; i<numdevs; i++) {
    wkf_thread_create(&threads[i], cudaglobmembwthread, &parms[i]);
  }

  // join the threads after the work is done
  for (i=0; i<numdevs; i++) {
    wkf_thread_join(threads[i], NULL);
  }
#else
  // single-threaded code path
  cudaglobmembwthread((void *) &parms[0]);
#endif

  // collect per-device results
  for (i=0; i<numdevs; i++) {
    memsetgbsec[i] = parms[i].memsetgbsec;
    memcpygbsec[i] = parms[i].memcpygbsec;
  }

  // clean up and return
  free(parms);
  free(threads);

  return 0;
}

//
// Thread pool and CUDA kernel launch latency benchmarks
//
static void * vmddevpoollatencythread(void *voidparms) {
  // empty worker used to measure bare pool dispatch latency
  return NULL;
}

static void * vmddevpooltilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  void *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

  // consume tiles from the scheduler, doing no work per tile
  wkf_tasktile_t tile;
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    // do nothing but eat work units...
  }

  return NULL;
}

// minimal kernel that writes one value per thread when given a buffer
__global__ static void nopkernel(float * ddata) {
  unsigned int xindex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yindex = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int outaddr = gridDim.x * blockDim.x * yindex + xindex;

  if (ddata != NULL)
    ddata[outaddr] = outaddr;
}

// empty kernel used to measure launch latency alone
__global__ static void voidkernel(void) {
  return;
}

static void * vmddevpoolcudatilelatencythread(void *voidparms) {
  int threadid=-1;
  int tilesize=1;
  float *parms=NULL;
  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);

#if 0
  // scale tile size by device performance
  tilesize=4;
  wkf_threadpool_worker_devscaletile(voidparms, &tilesize);
#endif

  // launch a no-op kernel for each tile received from the scheduler
  // (grid z dimension must be at least 1 for a valid launch)
  wkf_tasktile_t tile;
  dim3 Gsz(1,1,1);
  dim3 Bsz(8,8,1);
  while (wkf_threadpool_next_tile(voidparms, tilesize, &tile) != WKF_SCHED_DONE) {
    nopkernel<<<Gsz, Bsz, 0>>>(parms);
  }

  // wait for all GPU kernels to complete
  cudaDeviceSynchronize();

  return NULL;
}

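// vmd_cuda_devpool_latency() reports, in seconds per operation:
//   kernlaunchlatency -- raw CUDA launch latency of an empty kernel
//   barlatency        -- thread pool barrier latency
//   cyclelatency      -- full scheduler reset + empty worker launch cycle
//   tilelatency       -- scheduler cycle where workers consume tiles
//   kernellatency     -- scheduler cycle where workers launch no-op kernels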
int vmd_cuda_devpool_latency(wkf_threadpool_t *devpool, int tilesize,
                             double *kernlaunchlatency,
                             double *barlatency,
                             double *cyclelatency,
                             double *tilelatency,
                             double *kernellatency) {
  int i;
  wkf_tasktile_t tile;
  wkf_timerhandle timer;
  int loopcount;

  timer=wkf_timer_create();

  // measure back-to-back launch latency of an empty kernel
  loopcount = 15000;
  dim3 VGsz(1,1,1);
  dim3 VBsz(8,8,1);
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    voidkernel<<<VGsz, VBsz, 0>>>();
  }
  cudaDeviceSynchronize();
  wkf_timer_stop(timer);
  *kernlaunchlatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure thread pool barrier latency
  loopcount = 15000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    wkf_threadpool_wait(devpool);
  }
  wkf_timer_stop(timer);
  *barlatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure full empty-worker launch cycles
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=0;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoollatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *cyclelatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure launch cycles where workers consume tiles but do no work
  loopcount = 5000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpooltilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *tilelatency = wkf_timer_time(timer) / ((double) loopcount);

  // measure launch cycles where workers launch a no-op CUDA kernel per tile
  loopcount = 2000;
  wkf_timer_start(timer);
  for (i=0; i<loopcount; i++) {
    tile.start=0;
    tile.end=tilesize;
    wkf_threadpool_sched_dynamic(devpool, &tile);
    wkf_threadpool_launch(devpool, vmddevpoolcudatilelatencythread, NULL, 1);
  }
  wkf_timer_stop(timer);
  *kernellatency = wkf_timer_time(timer) / ((double) loopcount);

  wkf_timer_destroy(timer);

#if 1
  vmd_cuda_measure_latencies(devpool);
#endif

  return 0;
}


// per-device latency measurement results
typedef struct {
  int deviceid;
  int testloops;
  double kernlatency;   // NULL kernel launch latency
  double bcopylatency;  // 1-byte copy round-trip latency
  double kbseqlatency;  // (currently unused)
} latthrparms;

static void * vmddevpoolcudalatencythread(void *voidparms) {
  int threadid=-1;
  latthrparms *parms=NULL;

  wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  if (parms->deviceid == threadid) {
    wkf_timerhandle timer;
    timer=wkf_timer_create();
    printf("Thread/device %d running...\n", threadid);
    cudaStream_t devstream;
    cudaStreamCreate(&devstream);

    char *hostbuf = (char *) calloc(1, 65536 * sizeof(char));
    char *gpubuf = NULL;
    cudaMalloc((void**)&gpubuf, 65536 * sizeof(char));

    dim3 Gsz(1,1,1);
    dim3 Bsz(8,8,1);

    // measure back-to-back NULL kernel launch latency
    wkf_timer_start(timer);
    int i;
    for (i=0; i<parms->testloops; i++) {
      nopkernel<<<Gsz, Bsz, 0, devstream>>>(NULL);
    }
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    parms->kernlatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    // measure round-trip latency of 1-byte host-device-host copies
    wkf_timer_start(timer);
    for (i=0; i<parms->testloops; i++) {
      cudaMemcpyAsync(gpubuf, hostbuf, 1, cudaMemcpyHostToDevice, devstream);
      cudaMemcpyAsync(hostbuf, gpubuf, 1, cudaMemcpyDeviceToHost, devstream);
    }
    cudaStreamSynchronize(devstream);
    wkf_timer_stop(timer);
    parms->bcopylatency = 1000000 * wkf_timer_time(timer) / ((double) parms->testloops);

    printf("NULL kernel launch latency (usec): %.2f\n", parms->kernlatency);
    printf("1-byte copy round-trip latency (usec): %.2f\n", parms->bcopylatency);

    cudaStreamDestroy(devstream);
    cudaFree(gpubuf);
    free(hostbuf);
    wkf_timer_destroy(timer);
  }

  return NULL;
}


// launch a latency measurement worker on each device in the pool
int vmd_cuda_measure_latencies(wkf_threadpool_t *devpool) {
  latthrparms thrparms;
  int workers = wkf_threadpool_get_workercount(devpool);
  int i;
  printf("vmd_cuda_measure_latencies()...\n");
  for (i=0; i<workers; i++) {
    memset(&thrparms, 0, sizeof(thrparms));
    thrparms.deviceid = i;
    thrparms.testloops = 2500;
    wkf_threadpool_launch(devpool, vmddevpoolcudalatencythread, &thrparms, 1);
  }

  return 0;
}

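//
// Out-of-core GPU trajectory I/O benchmark (GPUDirect Storage).
// Behavior is controlled through the environment variables used below:
//   VMDGDSUSEHOST        -- read through host file descriptors instead of cuFile
//   VMDGDSHOSTNOPIN      -- use unpinned (pageable) host I/O buffers
//   VMDGDSHOSTBUFFEREDIO -- buffered I/O instead of direct I/O on the host path
//   VMDGDSMULTIFRAME     -- number of frames to read per I/O call
//   VMDGDSFILESTRATEGY   -- file-to-thread assignment strategy (0-3)
//   VMDGDSVERBOSE        -- per-read diagnostic output
//   VMDGDSTHREADSPERGPU  -- worker threads per GPU for the benchmark pool
//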
#if defined(VMDUSECUDAGDS)
typedef struct {
  int nfiles;
  const char **trjfileset;
  jshandle **jshandles;
  CUfileHandle_t *cfh;
  int devcount;
  int natoms;
  const AtomSel *sel;
  int first;
  int last;
  int step;
} gpuoocbenchthreadparms;

#define VMDGDSMAXFRAMEBUF 8
static void * gpu_ooc_bench_thread(void *voidparms) {
  int threadid, numthreads;
  gpuoocbenchthreadparms *parms = NULL;
  wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
  wkf_threadpool_worker_getid(voidparms, &threadid, &numthreads);

  // copy in per-thread parameters
  int nfiles = parms->nfiles;
  int natoms = parms->natoms;
  const AtomSel *sel = parms->sel;
  int first = parms->first;
  int last = parms->last;
  int step = parms->step;

  int usecufile = 1;
  fio_fd *hostfds = NULL;
  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN")) {
    pinhostiobuffer=0;
  }

  if (getenv("VMDGDSUSEHOST")) {
    usecufile=0;
    hostfds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

    int hostusedirectio = 1;
    if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
      hostusedirectio = 0;

    int openmode = FIO_READ;
    if (hostusedirectio)
      openmode |= FIO_DIRECT;

    int i;
    for (i=0; i<nfiles; i++) {
      if (fio_open(parms->trjfileset[i], openmode, &hostfds[i]) < 0) {
        if (hostusedirectio) {
          printf("Thr[%d] direct I/O unavailable or can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        } else {
          printf("Thr[%d] can't open file '%s'\n",
                 threadid, parms->trjfileset[i]);
        }
        return NULL;
      }
    }
  }

  if (hostfds && usecufile) {
    printf("Inconsistent cufile/hostfds state, aborting!\n");
    return NULL;
  }

  // per-frame buffer size, padded out to the direct I/O block size
  long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
  long sz = 3L*sizeof(float)*natoms + blocksz;

  // pad the allocation to an even multiple of the block size
  size_t blockpadsz = (sz + (blocksz - 1)) & (~(blocksz - 1));
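  // Rounding the per-frame size up to the next multiple of blocksz keeps
  // every frame buffer (and thus every read offset) block-aligned, as
  // required for O_DIRECT / cuFile transfers:
  //   blockpadsz = ceil(sz / blocksz) * blocksz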

  int framecount = (last - first + 1) / step;
  int framesperfile = framecount / nfiles;

  if (threadid == 0) {
    printf("Thr[%2d] %d frames total, natoms: %d selected: %d\n",
           threadid, framecount, natoms, sel->selected);
    printf("Thr[%2d] %d frames/file\n", threadid, framesperfile);
  }

  cudaError_t crc;
  cudaStream_t oocstream;
  float *devptr=NULL;
  float *hostptr=NULL;
  float *hostptr_unaligned=NULL;

  float *crdx1=NULL, *crdy1=NULL, *crdz1=NULL;
  float *crdx2=NULL, *crdy2=NULL, *crdz2=NULL;
  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;

  // kernel launch dimensions, one thread per atom
  dim3 IOBsz = dim3(256, 1, 1);
  dim3 IOGsz = dim3((natoms + IOBsz.x - 1) / IOBsz.x, 1, 1);

  if (parms->devcount > 0) {
    long gpuallocsz = (VMDGDSMAXFRAMEBUF+1) * blockpadsz;

    if (threadid == 0) {
      printf("Thr[%2d] Allocating GPU timestep I/O buf: %ld \n",
             threadid, gpuallocsz);
    }
    crc = cudaMalloc((void**) &devptr, gpuallocsz);

    if (hostfds != NULL) {
      if (pinhostiobuffer) {
        crc = cudaMallocHost((void**) &hostptr, gpuallocsz);
      } else {
        hostptr = (float *) alloc_aligned_ptr(gpuallocsz, 4096,
                                              (void**) &hostptr_unaligned);
        if (!hostptr) {
          printf("Thr[%d]: Failed allocation!\n", threadid);
          return NULL;
        }
      }
    }

    long crdsz = sel->selected * sizeof(float);

    // allocate separate coordinate arrays for the selected atoms
    crc = cudaMalloc((void**) &crdx1, crdsz);
    crc = cudaMalloc((void**) &crdy1, crdsz);
    crc = cudaMalloc((void**) &crdz1, crdsz);
    crc = cudaMalloc((void**) &crdx2, crdsz);
    crc = cudaMalloc((void**) &crdy2, crdsz);
    crc = cudaMalloc((void**) &crdz2, crdsz);
    if (crc != cudaSuccess) {
      printf("Thr[%2d], Failed to allocate GPU buffer!\n", threadid);
      return NULL;
    }

    cudaStreamCreate(&oocstream);

#if defined(VMDUSECUDAGDS)
    // register the GPU I/O buffer with cuFile for GDS transfers
    cuFileBufRegister(devptr, gpuallocsz, 0);
#endif
  }

  int verbose = (getenv("VMDGDSVERBOSE") != NULL) ? 1 : 0;

  int filestrategy = 0;
  if (getenv("VMDGDSFILESTRATEGY")) {
    filestrategy = atoi(getenv("VMDGDSFILESTRATEGY"));
  }
  if (threadid == 0) {
    printf("Thr[%2d] file strategy set to: %d\n", threadid, filestrategy);
  }

  wkf_tasktile_t tile;
  while (wkf_threadlaunch_next_tile(voidparms, VMDGDSMAXFRAMEBUF * 1, &tile) != WKF_SCHED_DONE) {
    // each tile is a block of frame indices for this worker to read
    int idx;
    int threadspergpu;
    if (parms->devcount > 0)
      threadspergpu = numthreads / parms->devcount;
    else
      threadspergpu = 1;

    for (idx=tile.start; idx<tile.end; idx++) {
      int myfileidx, fileframeidx;

      // map the global frame index onto a file and a frame within that file
      switch (filestrategy) {
        case 1:
          myfileidx = (idx / multiframeio) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 2:
          myfileidx = (idx / (multiframeio * threadspergpu)) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 3:
          myfileidx = (threadid / 4) % nfiles;
          fileframeidx = idx % framesperfile;
          break;

        case 0:
        default:
          myfileidx = (threadid / threadspergpu) % nfiles;
          fileframeidx = idx % framesperfile;
          break;
      }

      // look up the file offset and length of this timestep's coordinate block
      long startoffset, foffset, readlen;
      read_js_timestep_index_offsets(parms->jshandles[myfileidx],
                                     natoms, fileframeidx, 0, natoms, NULL,
                                     &startoffset, &foffset, &readlen);
      if (multiframeio) {
        // compute the span from this frame to the last frame in the batch,
        // so that several frames are read with a single I/O call
        long multistartoffset, multifoffset, multireadlen;
        read_js_timestep_index_offsets(parms->jshandles[myfileidx], natoms,
                                       fileframeidx+multiframeio-1,
                                       0, natoms, NULL,
                                       &multistartoffset, &multifoffset,
                                       &multireadlen);

        multireadlen = (multifoffset + multireadlen) - foffset;

        readlen = multireadlen;
        idx+=multiframeio-1;  // skip the frames covered by this batched read
      }

      // issue the read either through cuFile (GDS) or the host I/O path
      long ret=0;
      if (usecufile) {
        ret = cuFileRead(parms->cfh[myfileidx], (char *) devptr, readlen, foffset, 0);
      } else if (hostfds) {
        foffset=0;
        ret=fio_fseek(hostfds[myfileidx], foffset, FIO_SEEK_SET);
        if (ret<0) { printf("fio_fseek() error!\n"); return NULL; }
        ret=fio_fread(hostptr, readlen, 1, hostfds[myfileidx]);
        if (ret<0) { printf("fio_fread() error!\n"); return NULL; }
        cudaMemcpy(devptr, hostptr, readlen, cudaMemcpyHostToDevice);
      } else {
        printf("Inconsistent cufile/hostfds state, aborting!\n");
        return NULL;
      }

      // check for read errors
      if (ret < 0) {
        printf("Thr[%2d] Error: cuFileRead(): %ld\n", threadid, ret);
        return NULL;
      }

      if (verbose) {
        printf("Thr[%2d]F[%d][tile: %d to %d] frame: %d cuFile len: %ld off: %ld\n",
               threadid, myfileidx, tile.start, tile.end, idx,
               readlen, foffset);
      }
    }
  }

  // release per-thread GPU and host buffers
  cudaFree(crdx1);
  cudaFree(crdy1);
  cudaFree(crdz1);
  cudaFree(crdx2);
  cudaFree(crdy2);
  cudaFree(crdz2);

#if defined(VMDUSECUDAGDS)
  if (usecufile) {
    cuFileBufDeregister(devptr);
  }
#endif

  if (hostfds != NULL) {
    int i;
    for (i=0; i<nfiles; i++) {
      fio_fclose(hostfds[i]);
    }
    free(hostfds);
  }

  if (hostptr != NULL) {
    if (pinhostiobuffer) {
      cudaFreeHost(hostptr);
    } else {
      free(hostptr_unaligned);
    }
  }

  return NULL;
}

#endif


int gpu_ooc_bench(wkf_threadpool_t *devpool,
                  int nfiles, const char **trjfileset, const AtomSel *sel,
                  int first, int last, int step) {
  printf("gpu_ooc_bench()\n");
  wkf_threadpool_t *bigpool = NULL;

#if defined(VMDUSECUDAGDS)
  int devcount;
  cudaError_t crc = cudaGetDeviceCount(&devcount);
  printf("gpu_ooc_bench) GPU device count: %d\n", devcount);
  if (devcount==0)
    printf("gpu_ooc_bench) No GPU devices, continuing with host only...\n");

  CUfileHandle_t * cfh = (CUfileHandle_t *) calloc(1, nfiles * sizeof(CUfileHandle_t));
  CUfileDescr_t * cfhdesc = (CUfileDescr_t *) calloc(1, nfiles * sizeof(CUfileDescr_t));
  memset(&cfh[0], 0, nfiles * sizeof(CUfileHandle_t));
  memset(&cfhdesc[0], 0, nfiles * sizeof(CUfileDescr_t));

  int natomschk = 0;
  jshandle **jshandles = (jshandle **) calloc(1, nfiles * sizeof(jshandle *));
  fio_fd *directio_fds = (fio_fd *) calloc(1, nfiles * sizeof(fio_fd));

  // open all trajectory files and register them with cuFile
  int i;
  for (i=0; i<nfiles; i++) {
    const char *filename = trjfileset[i];
    printf("gpu_ooc_bench) File[%d] GDS setup, opening '%s'\n", i, filename);
    jshandles[i] = (jshandle *) open_js_read(filename, "js", &natomschk);
    if (!jshandles[i]) {
      printf("gpu_ooc_bench) File[%d] open_js_read failed for file %s\n", i, filename);
      return -1;
    }

#if vmdplugin_ABIVERSION > 17
    long blocksz = MOLFILE_DIRECTIO_MIN_BLOCK_SIZE;
    int filepgalignsz = 1;
    read_js_timestep_pagealign_size(jshandles[i], &filepgalignsz);
    if (filepgalignsz != blocksz) {
      printf("gpu_ooc_bench) File[%d] Plugin-returned page alignment size mismatch!\n", i);
    } else {
      printf("gpu_ooc_bench) File[%d] Page alignment size: %d\n", i, filepgalignsz);
    }
#endif

    read_js_timestep_index_offsets(jshandles[i], natomschk, 0, 0, 0,
                                   &directio_fds[i], NULL, NULL, NULL);

    cfhdesc[i].handle.fd = directio_fds[i];
    cfhdesc[i].type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileError_t cferr = cuFileHandleRegister(&cfh[i], &cfhdesc[i]);

    if (cferr.err != CU_FILE_SUCCESS) {
      printf("gpu_ooc_bench) File[%d] cuFileImportExternalFile on fd %d failed!\n",
             i, cfhdesc[i].handle.fd);
      return -1;
    }
  }

  // fill in the thread parameters shared by all workers
  gpuoocbenchthreadparms parms;
  memset(&parms, 0, sizeof(parms));
  parms.devcount = devcount;
  parms.nfiles = nfiles;
  parms.trjfileset = trjfileset;
  parms.jshandles = jshandles;
  parms.cfh = cfh;
  parms.natoms = sel->num_atoms;
  parms.sel = sel;
  parms.first = first;
  parms.last = last;
  parms.step = step;

  int framecount = nfiles * (last / step);

  // create timer
  wkf_timerhandle timer;
  timer=wkf_timer_create();

  // setup the work tile covering all frames
  wkf_tasktile_t tile;
  tile.start=0;
  tile.end=framecount - 1;

  printf("gpu_ooc_bench) tile start: %d end: %d\n", tile.start, tile.end);

  int gdsthreadspergpu = 1;
  if (getenv("VMDGDSTHREADSPERGPU") != NULL)
    gdsthreadspergpu = atoi(getenv("VMDGDSTHREADSPERGPU"));

  printf("gpu_ooc_bench) gdsthreadspergpu: %d\n", gdsthreadspergpu);

  if (gdsthreadspergpu > 1) {
    // create a larger worker pool with several threads per GPU
    int workercount = devcount * gdsthreadspergpu;

    int *devlist = new int[workercount];
    int k;
    for (k=0; k<workercount; k++) {
      devlist[k] = k / gdsthreadspergpu;  // round-robin threads onto GPUs
    }

    msgInfo << "Creating Multi-worker ("
            << gdsthreadspergpu << " per-GPU) CUDA device pool..." << sendmsg;
    bigpool=wkf_threadpool_create(workercount, devlist);
    delete [] devlist;

    // associate each worker thread with a specific GPU
    if (getenv("VMDCUDAVERBOSE") != NULL)
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, (void*)"VMD CUDA Dev Init", 1);
    else
      wkf_threadpool_launch(bigpool, vmd_cuda_devpool_setdeviceonly, NULL, 1);

    // clear device memory on each of the GPUs
    wkf_threadpool_launch(bigpool, vmd_cuda_devpool_clear_device_mem, NULL, 1);

    // use the larger worker pool for the rest of the benchmark
    devpool = bigpool;
  }

  // apply CPU affinity to the worker threads
  wkf_threadpool_launch(devpool, vmd_cuda_affinitize_threads, NULL, 1);

  wkf_threadpool_sched_dynamic(devpool, &tile);
  wkf_timer_start(timer);
  wkf_threadpool_launch(devpool, gpu_ooc_bench_thread, &parms, 1);
  wkf_timer_stop(timer);

  double runtime = wkf_timer_time(timer);
  double gbytes = sel->num_atoms * 12L * (tile.end+1) / (1024.0 * 1024.0 * 1024.0);

  printf("gpu_ooc_bench) natoms: %d, fsz: %ld, tsz: %ld\n",
         sel->num_atoms, sel->num_atoms * 12L,
         sel->num_atoms * 12L * (tile.end+1));

  int pinhostiobuffer = 1;
  if (getenv("VMDGDSHOSTNOPIN"))
    pinhostiobuffer=0;

  int hostusedirectio = 1;
  if (getenv("VMDGDSHOSTBUFFEREDIO") != NULL)
    hostusedirectio = 0;

  int usecufile=1;
  if (getenv("VMDGDSUSEHOST"))
    usecufile=0;

  if (usecufile) {
    printf("OOC I/O via GDS + cuFile\n");
  } else {
    printf("OOC I/O via host, %s APIs, %s memory buffers\n",
           (hostusedirectio) ? "Direct I/O" : "Buffered I/O",
           (pinhostiobuffer) ? "pinned" : "unpinned");
  }

  int multiframeio = 0;
  if (getenv("VMDGDSMULTIFRAME"))
    multiframeio = atoi(getenv("VMDGDSMULTIFRAME"));
  if (multiframeio > VMDGDSMAXFRAMEBUF)
    multiframeio = VMDGDSMAXFRAMEBUF;
  if (multiframeio) {
    printf("GDS multi-frame read opt: %d frames per call, %ld bytes\n",
           multiframeio,
           multiframeio * sel->num_atoms * 12L);
  }

  printf("OOC runtime: %.1f, %.2fGB/sec\n", runtime, gbytes/runtime);

  // deregister and close all of the trajectory files
  for (i=0; i<nfiles; i++) {
#if defined(VMDUSECUDAGDS)
    cuFileHandleDeregister(cfh[i]);
#endif
    close_js_read(jshandles[i]);
  }
#endif

#if defined(VMDUSECUDAGDS)
  if (cfh != NULL)
    free(cfh);

  if (cfhdesc != NULL)
    free(cfhdesc);

  if (jshandles != NULL)
    free(jshandles);

  if (directio_fds != NULL)
    free(directio_fds);
#endif

  // if we created an oversized worker pool, destroy it
  if (bigpool != NULL)
    wkf_threadpool_destroy(bigpool);

  return 0;
}