#define MASK_WIDTH 5
typedef struct {
    unsigned int width;
    unsigned int height;
    unsigned int pitch;
    float* elements;
} Matrix;

// Matrix allocation func.
Matrix AllocateMatrix(int height, int width, int init) { ... }

// global var. -> declare outside any kernel function
__constant__ float Mc[MASK_WIDTH][MASK_WIDTH];

Matrix M = AllocateMatrix(MASK_WIDTH, MASK_WIDTH, 1);
// initialize M elems, allocate N/P, initialize N/P, copy N/P to N_d/P_d
cudaMemcpyToSymbol(Mc, M.elements, MASK_WIDTH * MASK_WIDTH * sizeof(float));
// launch kernel
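For reference, a minimal sketch (not in the original notes) of a 2D convolution kernel that consumes the constant-memory mask Mc declared above; the kernel name and the row-major N/P layout are assumptions, and out-of-bounds input pixels are treated as 0:

__global__ void convolution_2D_kernel(float* N, float* P, int height, int width) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < height && col < width) {
        float Pvalue = 0;
        int n = MASK_WIDTH / 2;
        for (int y = 0; y < MASK_WIDTH; y++) {
            for (int x = 0; x < MASK_WIDTH; x++) {
                int in_row = row - n + y;
                int in_col = col - n + x;
                // ghost cells outside the input contribute 0
                if (in_row >= 0 && in_row < height && in_col >= 0 && in_col < width) {
                    Pvalue += Mc[y][x] * N[in_row * width + in_col];
                }
            }
        }
        P[row * width + col] = Pvalue;
    }
}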
2. Tiled 1D
1. Load halo + internal elem. into SMEM
__global__ void convolution_1D_tiled_kernel(float* N, float* P, int Mask_Width, int Width) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // shared-memory array size must be a compile-time constant, so use MASK_WIDTH here
    __shared__ float N_ds[TILE_SIZE + MASK_WIDTH - 1];
    int n = Mask_Width / 2;

    // left halo: the last n threads of the block load the tail of the previous tile
    int halo_idx_left = (blockIdx.x - 1) * blockDim.x + threadIdx.x;
    if (threadIdx.x >= blockDim.x - n) {
        N_ds[threadIdx.x - (blockDim.x - n)] = (halo_idx_left < 0) ? 0 : N[halo_idx_left];
    }

    // internal elements (assumes Width is a multiple of the block size)
    N_ds[n + threadIdx.x] = N[blockIdx.x * blockDim.x + threadIdx.x];

    // right halo: the first n threads of the block load the head of the next tile
    int halo_idx_right = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
    if (threadIdx.x < n) {
        N_ds[n + blockDim.x + threadIdx.x] = (halo_idx_right >= Width) ? 0 : N[halo_idx_right];
    }
    __syncthreads();

    float Pvalue = 0;
    for (int j = 0; j < MASK_WIDTH; j++) {
        Pvalue += N_ds[threadIdx.x + j] * M[j]; // M[] is the 1D mask in constant memory
    }
    P[i] = Pvalue;
}
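A hypothetical launch for this kernel, assuming blockDim.x == TILE_SIZE so each thread produces one output element (the N_d/P_d device pointer names are assumptions):

dim3 dimBlock(TILE_SIZE, 1, 1);
dim3 dimGrid((Width + TILE_SIZE - 1) / TILE_SIZE, 1, 1);
convolution_1D_tiled_kernel<<<dimGrid, dimBlock>>>(N_d, P_d, MASK_WIDTH, Width);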
// size not needed - each thread operates on one elem.
__global__ void kernel(float* dst, float* src) {
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < SIZE) {
        dst[i] = 0.0F;
        for (int j = 0; j < REPEAT; j++) {
            dst[i] += src[i];
        }
    }
}
int main(void) {
    // allocate pinned (page-locked) host memory - required for async copy overlap
    cudaMallocHost((void**)&pSource, TOTALSIZE * sizeof(float));
    cudaMallocHost((void**)&pResult, TOTALSIZE * sizeof(float));

    // create stream objects
    cudaStream_t aStream[STREAMSIZE];
    for (i = 0; i < STREAMSIZE; i++) {
        cudaStreamCreate(&aStream[i]);
    }

    // CUDA: copy from host to device, one chunk per stream
    for (i = 0; i < STREAMSIZE; i++) {
        int offset = TOTALSIZE / STREAMSIZE * i;
        cudaMemcpyAsync(pSourceDev + offset, pSource + offset,
                        TOTALSIZE / STREAMSIZE * sizeof(float),
                        cudaMemcpyHostToDevice, aStream[i]);
    }

    // CUDA: launch the kernel, one chunk per stream
    dim3 dimGrid(GRIDSIZE / STREAMSIZE, 1, 1);
    dim3 dimBlock(BLOCKSIZE, 1, 1);
    for (i = 0; i < STREAMSIZE; i++) {
        int offset = TOTALSIZE / STREAMSIZE * i;
        kernel<<<dimGrid, dimBlock, 0, aStream[i]>>>(pResultDev + offset, pSourceDev + offset);
    }

    // CUDA: copy from device to host
    for (i = 0; i < STREAMSIZE; i++) {
        int offset = TOTALSIZE / STREAMSIZE * i;
        cudaMemcpyAsync(pResult + offset, pResultDev + offset,
                        TOTALSIZE / STREAMSIZE * sizeof(float),
                        cudaMemcpyDeviceToHost, aStream[i]);
    }

    // wait for each stream to finish, then destroy it
    for (i = 0; i < STREAMSIZE; i++) {
        cudaStreamSynchronize(aStream[i]);
        cudaStreamDestroy(aStream[i]);
    }
}
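Side note (a sketch, not in the original notes): copy/compute overlap also depends on the device having a copy engine, which can be checked with the real cudaGetDeviceProperties API:

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
if (prop.asyncEngineCount == 0) {
    printf("device has no copy engine - memcpy/kernel overlap will not happen\n");
}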
4. MISC
1. Elapsed Time: CUDA Event API Usage
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
VectorAdd<<<gridDim, gridBlock>>>(dev_A, dev_B, dev_R, size);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // wait until the stop event has actually been recorded

cudaEventElapsedTime(&time, start, stop); // elapsed time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("elapsed time = %f msec\n", time);
17. Graph Structure
1. Sequential BFS
void BFS_sequential(int source, int* edges, int* dest, int* label) {
    int frontier[2][MAX_FRONTIER_SIZE];
    int* c_frontier = frontier[0];
    int* p_frontier = frontier[1];
    int c_frontier_tail = 0;
    int p_frontier_tail = 0;

    // insert source into the p_frontier array, and add 1 to p_frontier_tail
    insert_frontier(source, p_frontier, &p_frontier_tail);
    label[source] = 0;
    int c_vertex; // current vertex

    while (p_frontier_tail > 0) {
        for (int f = 0; f < p_frontier_tail; f++) {
            c_vertex = p_frontier[f]; // visit each vertex in the previous frontier
            for (int i = edges[c_vertex]; i < edges[c_vertex + 1]; i++) {
                if (label[dest[i]] == -1) { // not yet visited
                    insert_frontier(dest[i], c_frontier, &c_frontier_tail);
                    label[dest[i]] = label[c_vertex] + 1;
                }
            }
        }
        // swap c_frontier and p_frontier
        int* temp = c_frontier;
        c_frontier = p_frontier;
        p_frontier = temp;
        p_frontier_tail = c_frontier_tail;
        c_frontier_tail = 0;
    }
}
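The insert_frontier helper is used but never defined in these notes; a minimal sketch of what it is assumed to do:

// append a vertex at the tail of a frontier queue and advance the tail
void insert_frontier(int vertex, int* frontier, int* frontier_tail) {
    frontier[*frontier_tail] = vertex;
    (*frontier_tail)++;
}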
2. Parallel BFS
void BFS_host(unsigned int source, unsigned int* edges, unsigned int* dest, unsigned int* label) {
    ...
    // cudaMalloc edges_d, dest_d, label_d, and visited_d
    // cudaMemcpy edges, dest, and label to device global memory
    // cudaMalloc frontier_d, c_frontier_tail_d, p_frontier_tail_d
    unsigned int* c_frontier_d = &frontier_d[0];
    unsigned int* p_frontier_d = &frontier_d[MAX_FRONTIER_SIZE];
    // launch a simple kernel to initialize:
    //   *c_frontier_tail_d = 0;
    //   p_frontier_d[0] = source;
    //   *p_frontier_tail_d = 1;
    //   label_d[source] = 0;
    p_frontier_tail = 1;

    while (p_frontier_tail > 0) {
        int num_blocks = ceil(p_frontier_tail / float(BLOCK_SIZE));
        BFS_Bqueue_kernel<<<num_blocks, BLOCK_SIZE>>>(p_frontier_d, p_frontier_tail_d,
                c_frontier_d, c_frontier_tail_d, edges_d, dest_d, label_d, visited_d);
        // cudaMemcpy to read the *c_frontier_tail_d value back to host
        // assign it to p_frontier_tail for the while-loop condition test
        unsigned int* temp = c_frontier_d; // swap c_frontier_d and p_frontier_d
        c_frontier_d = p_frontier_d;
        p_frontier_d = temp;
        // launch a simple kernel to set *p_frontier_tail_d = *c_frontier_tail_d;
        // and *c_frontier_tail_d = 0;
    }
}

__global__ void BFS_Bqueue_kernel(unsigned int* p_frontier, unsigned int* p_frontier_tail,
        unsigned int* c_frontier, unsigned int* c_frontier_tail, unsigned int* edges,
        unsigned int* dest, unsigned int* label, unsigned int* visited) {
    __shared__ unsigned int c_frontier_s[BLOCK_QUEUE_SIZE];
    __shared__ unsigned int c_frontier_tail_s, our_c_frontier_tail;
    if (threadIdx.x == 0) { c_frontier_tail_s = 0; }
    __syncthreads();

    const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < *p_frontier_tail) {
        const unsigned int c_vertex = p_frontier[gid];
        for (unsigned int i = edges[c_vertex]; i < edges[c_vertex + 1]; i++) {
            // atomicExch marks the vertex visited and tells us if another thread beat us to it
            const unsigned int was_visited = atomicExch(&(visited[dest[i]]), 1);
            if (!was_visited) {
                label[dest[i]] = label[c_vertex] + 1;
                const unsigned int my_tail = atomicAdd(&c_frontier_tail_s, 1);
                // check if # of neighboring vertices exceeds BLOCK_QUEUE_SIZE
                if (my_tail < BLOCK_QUEUE_SIZE) {
                    c_frontier_s[my_tail] = dest[i];
                } else { // block queue full: overflow directly into the global queue
                    c_frontier_tail_s = BLOCK_QUEUE_SIZE;
                    const unsigned int my_global_tail = atomicAdd(c_frontier_tail, 1);
                    c_frontier[my_global_tail] = dest[i];
                }
            }
        }
    }
    __syncthreads();
    // we need to reference the global tail multiple times - keep it in a smem var.
    if (threadIdx.x == 0) {
        our_c_frontier_tail = atomicAdd(c_frontier_tail, c_frontier_tail_s);
    }
    __syncthreads();
    // vertex searching order doesn't matter - just append to the global memory tail;
    // vertices that exceeded BLOCK_QUEUE_SIZE are already in global mem.
    for (unsigned int i = threadIdx.x; i < c_frontier_tail_s; i += blockDim.x) {
        c_frontier[our_c_frontier_tail + i] = c_frontier_s[i];
    }
}
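The two "simple kernels" referenced in the host-code comments are not shown; hypothetical one-thread sketches (names and signatures are assumptions):

__global__ void init_frontier_kernel(unsigned int source, unsigned int* p_frontier,
        unsigned int* p_frontier_tail, unsigned int* c_frontier_tail, unsigned int* label) {
    // launched as <<<1, 1>>>: set up the first BFS level
    *c_frontier_tail = 0;
    p_frontier[0] = source;
    *p_frontier_tail = 1;
    label[source] = 0;
}

__global__ void swap_tails_kernel(unsigned int* p_frontier_tail, unsigned int* c_frontier_tail) {
    // launched as <<<1, 1>>> between BFS levels
    *p_frontier_tail = *c_frontier_tail;
    *c_frontier_tail = 0;
}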
Shortcomings: 1) no memory coalescing (adjacent threads traverse different rows, so their accesses to data[] are far apart), 2) control-flow divergence (rows have different numbers of nonzeros, so threads in a warp exit the loop at different times)
__global__ void SpMV_CSR(int num_rows, float* data, int* col_idx, int* row_ptr, float* x, float* y) {
    // each thread processes one row
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows) {
        float dot = 0;
        int row_start = row_ptr[row];
        int row_end = row_ptr[row + 1];
        for (int elem = row_start; elem < row_end; elem++) {
            dot += data[elem] * x[col_idx[elem]];
        }
        y[row] = dot;
    }
}
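To make the three arrays concrete, a small illustrative example (made up for these notes) of a 4x4 sparse matrix in CSR:

// [ 3 0 1 0 ]      data    = { 3, 1, 2, 4, 1, 1 }   // nonzero values, row by row
// [ 0 0 0 0 ]  ->  col_idx = { 0, 2, 1, 2, 0, 3 }   // column index of each value
// [ 0 2 4 0 ]      row_ptr = { 0, 2, 2, 4, 6 }      // start of each row in data[]
// [ 1 0 0 1 ]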
3. ELL (PACK) Format
data[], col_idx[] both padded to the length of the longest row → solves control-flow divergence, and row_ptr[] is not needed! (see the kernel sketch below)
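A minimal sketch (not in the original notes) of an ELL SpMV kernel, assuming the padded data[]/col_idx[] are stored column-major with num_rows * max_nnz_per_row entries; that layout also coalesces memory accesses, since adjacent threads read adjacent addresses:

__global__ void SpMV_ELL(int num_rows, int max_nnz_per_row,
                         float* data, int* col_idx, float* x, float* y) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows) {
        float dot = 0;
        for (int p = 0; p < max_nnz_per_row; p++) {
            // column-major: element p of every row is contiguous across rows;
            // padded entries hold 0 and so contribute nothing to the sum
            dot += data[p * num_rows + row] * x[col_idx[p * num_rows + row]];
        }
        y[row] = dot;
    }
}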