
14. Reduction

1. Sum Reduction w/ Divergence

  • Because the reduced sum ends up at index 0 of shared memory, only threadIdx.x==0 needs to write the SMEM result back to global memory.
    __global__ void kernel(unsigned* pData, unsigned* pAns){
      __shared__ unsigned dataShared[2*BLOCKSIZE];   // match pData's type; each block reduces 2*BLOCKSIZE elements
      unsigned int i = 2*blockDim.x*blockIdx.x + threadIdx.x;
      dataShared[threadIdx.x] = pData[i];
      dataShared[threadIdx.x+BLOCKSIZE] = pData[i+BLOCKSIZE]; 
      __syncthreads(); 
    
      for (register unsigned s=1; s<=BLOCKSIZE; s*=2){
          // divergent: only threads whose index is a multiple of s are active
          if (threadIdx.x%s == 0){ dataShared[2*threadIdx.x] += dataShared[2*threadIdx.x+s]; }
          __syncthreads(); 
      }
      if (threadIdx.x==0){ pAns[blockIdx.x] = dataShared[0]; }   // one partial sum per block
    }
    

2. Sum Reduction w/o Divergence

    __global__ void kernel (unsigned* pData, unsigned* pAns){
      __shared__ unsigned dataShared[2*BLOCKSIZE]; 
      unsigned int i = 2*blockDim.x*blockIdx.x + threadIdx.x;
      dataShared[threadIdx.x] = pData[i];
      dataShared[threadIdx.x+BLOCKSIZE] = pData[i+BLOCKSIZE]; 
      __syncthreads(); 
    
      for (register unsigned s=BLOCKSIZE; s>0; s/=2){
          // active threads stay contiguous, so warps don't diverge (until s drops below the warp size)
          if (threadIdx.x<s){ dataShared[threadIdx.x] += dataShared[threadIdx.x+s]; }
          __syncthreads(); 
      }
      if (threadIdx.x==0){ pAns[blockIdx.x] = dataShared[0]; }   // one partial sum per block
    }
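Both kernels leave one partial sum per block in pAns, so the host still has to combine them. A minimal host-side sketch, assuming hypothetical names (pDataDev, pAnsDev, pAns, NUM_BLOCKS) and a CPU-side final accumulation:

```cpp
// Hypothetical launch: each block reduces 2*BLOCKSIZE inputs into pAnsDev[blockIdx.x].
kernel<<<NUM_BLOCKS, BLOCKSIZE>>>(pDataDev, pAnsDev);
cudaMemcpy(pAns, pAnsDev, NUM_BLOCKS*sizeof(unsigned), cudaMemcpyDeviceToHost);

unsigned long long total = 0;
for (int b = 0; b < NUM_BLOCKS; ++b){ total += pAns[b]; }   // combine per-block partial sums on the CPU
```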
    

15. Scan (Prefix Sum)

1. Kogge-Stone Parallel Scan

for (int s=1; s<BLOCKSIZE; s*=2){
    float temp = 0.0f; 
    if (threadIdx.x>=s){ temp = T[threadIdx.x] + T[threadIdx.x-s]; }
    __syncthreads(); 
    if (threadIdx.x>=s){ T[threadIdx.x] = temp; }   // only the threads that computed a new value write back
    __syncthreads(); 
}
if (gid<n) { output[gid] = T[threadIdx.x]; }
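The loop above assumes a shared array T already loaded with this block's section. For reference, a minimal complete per-block kernel around it; the names input/output/n and the assumption blockDim.x == BLOCKSIZE are mine:

```cpp
__global__ void koggestone_scan(float* input, float* output, int n){
    __shared__ float T[BLOCKSIZE];
    int gid = blockIdx.x*blockDim.x + threadIdx.x;
    T[threadIdx.x] = (gid < n) ? input[gid] : 0.0f;   // one element per thread
    __syncthreads();

    for (int s=1; s<BLOCKSIZE; s*=2){
        float temp = 0.0f;
        if (threadIdx.x >= s){ temp = T[threadIdx.x] + T[threadIdx.x-s]; }
        __syncthreads();
        if (threadIdx.x >= s){ T[threadIdx.x] = temp; }
        __syncthreads();
    }
    if (gid < n){ output[gid] = T[threadIdx.x]; }   // inclusive prefix sums within this section
}
```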

2. Kogge-Stone (Double Buffering ver.)

for (int s=1; s<BLOCKSIZE; s*=2){
    if (threadIdx.x>=s) { dst[threadIdx.x] = src[threadIdx.x]+src[threadIdx.x-s]; }
    else { dst[threadIdx.x] = src[threadIdx.x]; }
    __syncthreads(); 

    float* tmp=src; src=dst; dst=tmp;
}
if (gid<n){ output[gid] = src[threadIdx.x]; }
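Here src and dst are meant to be two shared-memory buffers whose roles swap every step. A minimal complete sketch under the same assumptions as above (buffer names buf0/buf1 are mine):

```cpp
__global__ void koggestone_scan_db(float* input, float* output, int n){
    __shared__ float buf0[BLOCKSIZE], buf1[BLOCKSIZE];
    float* src = buf0;                                 // "source" buffer for the current step
    float* dst = buf1;                                 // "destination" buffer for the current step
    int gid = blockIdx.x*blockDim.x + threadIdx.x;
    src[threadIdx.x] = (gid < n) ? input[gid] : 0.0f;
    __syncthreads();

    for (int s=1; s<BLOCKSIZE; s*=2){
        if (threadIdx.x >= s){ dst[threadIdx.x] = src[threadIdx.x] + src[threadIdx.x-s]; }
        else                 { dst[threadIdx.x] = src[threadIdx.x]; }
        __syncthreads();
        float* tmp = src; src = dst; dst = tmp;        // swap buffer roles every step
    }
    if (gid < n){ output[gid] = src[threadIdx.x]; }    // result sits in src after the final swap
}
```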

3. Brent-Kung Parallel Scan

  • int idx = (threadIdx.x+1) * 2 * s - 1;

// up-sweep (reduction) phase
int s = 1;
while (s < BLOCKSIZE){
    int idx = (threadIdx.x+1) * 2 * s - 1;
    if (idx < BLOCKSIZE){ T[idx] += T[idx-s]; }
    s *= 2;
    __syncthreads();
}

// down-sweep (distribution) phase
s = BLOCKSIZE/2;
while (s > 0){
    int idx = (threadIdx.x+1) * 2 * s - 1;
    if (idx+s < BLOCKSIZE && idx < BLOCKSIZE){ T[idx+s] += T[idx]; }
    s /= 2;
    __syncthreads();
}



# 13. Convolution 

## 1. Boundary Condition Handling & Constant Cache
1. 1D CONV w/ Boundary Condition Handling 
```cpp
__global__ void CONV_1D (float* N, float* M, float* P, int Mask_Width, int Width){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    float Pvalue = 0; 
    int N_start = i - Mask_Width/2; 

    for (int j = 0; j<Mask_Width; j++){
        if (N_start+j>=0 && N_start+j<Width) { 
            Pvalue += N[N_start+j]*M[j]; 
        }
    }
    P[i] = Pvalue; 
}
```

2. Usage of Constant Memory
```cpp
#define MASK_WIDTH 5

typedef struct{
    unsigned int width;
    unsigned int height;
    unsigned int pitch;
    float* elements;
} Matrix;

// Matrix allocation func.
Matrix AllocateMatrix(int height, int width, int init){…}

// global var. -> declare outside any kernel function
__constant__ float Mc[MASK_WIDTH][MASK_WIDTH];

Matrix M = AllocateMatrix(MASK_WIDTH, MASK_WIDTH, 1);
// initialize M elements, allocate N/P, initialize N/P, copy N/P to N_d/P_d

cudaMemcpyToSymbol(Mc, M.elements, MASK_WIDTH*MASK_WIDTH*sizeof(float));
// launch kernel
```
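With the mask in constant memory, the kernel no longer needs an M pointer argument. A minimal sketch of that variant, assuming a 1D constant mask Md[MASK_WIDTH] (my own name) uploaded with cudaMemcpyToSymbol just like Mc above:

```cpp
// Hypothetical 1D constant mask (the 2D Mc above is for the tiled 2D kernel later).
__constant__ float Md[MASK_WIDTH];

__global__ void CONV_1D_const(float* N, float* P, int Width){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    float Pvalue = 0;
    int N_start = i - MASK_WIDTH/2;

    for (int j = 0; j < MASK_WIDTH; j++){
        if (N_start+j >= 0 && N_start+j < Width){
            Pvalue += N[N_start+j]*Md[j];   // mask reads are served by the constant cache
        }
    }
    if (i < Width){ P[i] = Pvalue; }
}
```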


## 2. Tiled 1D
### 1. Load halo + internal elem. into SMEM
```cpp
__global__ void convolution_1D_tiled_kernel(float* N, float* P, int Mask_Width, int Width){
    int i = blockIdx.x*blockDim.x+threadIdx.x; 
    __shared__ float N_ds[TILE_SIZE + MASK_WIDTH - 1];   // shared array size must be a compile-time constant
    int n = Mask_Width/2;

    // left halo 
    int halo_idx_left = (blockIdx.x-1)*blockDim.x+threadIdx.x; 
    if (threadIdx.x >= (blockDim.x-n)){
        N_ds[threadIdx.x + n - blockDim.x] = (halo_idx_left<0) ? 0 : N[halo_idx_left]; 
    }
    // internal 
    N_ds[n+threadIdx.x] = N[blockIdx.x*blockDim.x+threadIdx.x];

    // right halo 
    int halo_idx_right = (blockIdx.x+1)*blockDim.x+threadIdx.x; 
    if (threadIdx.x < n){
        N_ds[threadIdx.x + n + blockDim.x] = (halo_idx_right>=Width) ? 0 : N[halo_idx_right]; 
    }
    __syncthreads(); 

    float Pvalue = 0; 
    for (int j=0; j<MASK_WIDTH; j++){ Pvalue += N_ds[threadIdx.x + j]*M[j]; }   // M is the mask in constant memory
    P[i] = Pvalue;
}
```
  • Practice!
    __global__ void convolution_1D_tiled_kernel(float* N, float* P, int Mask_Width, int Width){
      int i = blockIdx.x*blockDim.x+threadIdx.x; 
      __shared__ float N_ds[??]; 
      int n = Mask_Width/2;
    
      // left halo 
      int halo_idx_left = ??
      if (??){
          N_ds[??] = (??) ? 0 : N[??]; 
      }
      // internal 
      N_ds[??] = N[??];
    
      // right halo 
      int halo_idx_right = ??
      if (??){
          N_ds[??] = (??) ? 0 : N[??]; 
      }
      __syncthreads(); 
    
      float Pvalue = 0; 
      for (int j=0; j<MASK_WIDTH; j++){ Pvalue += N_ds[??]*M[??]; }
      P[??] = Pvalue;
    }
    

2. Data Reuse in SMEM & General Caching

  • 🔥 Everything is gid based!!
    __global__ void convolution_1D_tiled_kernel (float* N, float* P, int Mask_Width, int Width){
      int i = blockIdx.x*blockDim.x+threadIdx.x; 
      int n = Mask_Width/2; 
      __shared__ float N_ds[TILE_SIZE]; 
        
      N_ds[threadIdx.x]=N[i]; __syncthreads(); 
    
      int This_tile_start_point = blockIdx.x*blockDim.x; 
      int Next_tile_start_point = (blockIdx.x+1)*blockDim.x; 
      int N_start_point = i - n;
    
      float Pvalue = 0; 
      for (int j=0; j<MASK_WIDTH; j++){
          int N_index = N_start_point + j; 
          if (N_index >= This_tile_start_point && N_index < Next_tile_start_point){
              Pvalue += N_ds[threadIdx.x+j-n]*M[j]; 
          } else if (N_index >= 0 && N_index < Width) {   // ghost cells outside the array contribute 0
              Pvalue += N[N_index]*M[j]; 
          }
      }
      P[i] = Pvalue; 
    }
    
  • Practice!
    __global__ void convolution_1D_tiled_kernel (float* N, float* P, int Mask_Width, int Width){
      int i = blockIdx.x*blockDim.x+threadIdx.x; 
      int n = Mask_Width/2; 
      __shared__ float N_ds[??]; 
        
      N_ds[??]=N[??]; __syncthreads(); 
    
      int This_tile_start_point = blockIdx.x*blockDim.x; 
      int Next_tile_start_point = (blockIdx.x+1)*blockDim.x; 
      int N_start_point = ??
    
      float Pvalue = 0; 
      for (int j=0; j<MASK_WIDTH; j++){
          int N_index = ??; 
          if (N_index >= ?? && N_index < ??){
              Pvalue += N_ds[??]*M[??]; 
          } else {
              Pvalue += N[??]*M[??]; 
          }
      }
      P[??] = Pvalue; 
    }
    

3. Tiled 2D

  • It's easier to think of row_o and col_o as gid-based output coordinates.
  • N_ds is tid-based shared memory!
    1. Shifting from output coordinates to input coordinates
      __global__ void convolution(Matrix N, Matrix P){
        __shared__ float N_ds[BLOCK_SIZE][BLOCK_SIZE]; 
        int ty = threadIdx.y; int tx = threadIdx.x; 
        int row_o = blockIdx.y*TILE_SIZE+ty; 
        int col_o = blockIdx.x*TILE_SIZE+tx;
        int row_i = row_o - KERNEL_SIZE/2; 
        int col_i = col_o - KERNEL_SIZE/2; 
        float output = 0.0f; 
      }
      
    2. Loading input matrix to the shared memory
      if (row_i>=0 && row_i<N.height && col_i>=0 && col_i<N.width){
        N_ds[ty][tx] = N.elements[row_i * N.width + col_i];
      } else{
        N_ds[ty][tx] = 0;
      }
      __syncthreads();
      
    3. Do convolution!
      if (ty<TILE_SIZE && tx<TILE_SIZE){
        for (int i=0; i<KERNEL_SIZE; i++){
         for (int j=0; j<KERNEL_SIZE; j++){
             output += Mc[i][j]*N_ds[i+ty][j+tx]; 
         }
        }
        if (row_o<P.height && col_o<P.width){ P.elements[row_o*P.width+col_o] = output; }
      }
      
    4. Setting Block Size
      #define BLOCK_SIZE (TILE_SIZE + MASK_WIDTH -1)
      dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1); 
      dim3 dimGrid(ceil(N.width/float(TILE_SIZE)), ceil(N.height/float(TILE_SIZE)), 1); 
      convolution<<<dimGrid, dimBlock>>>(N_d, P_d);
      
  • Practice!
    __global__ void convolution(Matrix N, Matrix P){
      // Shifting from  output coordinates to input coordinates
      __shared__ float N_ds[??][??]; 
      int ty = threadIdx.y; int tx = threadIdx.x; 
      int row_o = ??; 
      int col_o = ??;
      int row_i = ??; 
      int col_i = ??; 
      float output = 0.0f; 
    
      // Loading input matrix to the shared memory 
      if (??){
          N_ds[??][??] = N.elements[??];
      } else{
          N_ds[??][??] = 0;
      }
      __syncthreads();
    
      // Do convolution! 
      if (ty<?? && tx<??){
          for (i=0; i<KERNEL_SIZE; i++){
              for (j=0; j<KERNEL_SIZE; j++){
                  output += Mc[i][j]*N_ds[??][??]; 
              }
          }
          if (??<P.height && ??<P.width){ P.elements[??] = output; }
      }
    }
    // Setting block size
    #define BLOCK_SIZE (??)
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE, 1); 
    dim3 dimGrid(??, ??, 1); 
    convolution<<<dimGrid, dimBlock>>>(N_d, P_d)
    

12. Histogram w/ Atomic Ops.

1. Atomic Ops.

  1. atomicCAS
    int atomicCAS(int* addr, int expected, int newVal){
     oldVal = read(*addr);
     *addr = (oldVal==expected) ? newVal : oldVal;
     return oldVal;   // the read-compare-write sequence executes as one atomic operation
    }
    
  2. atomicCAS Usage
    oldVal = *addr; 
    readback = atomicCAS(addr, oldVal, newVal); 
    if (oldVal==readback)  success
    else  fail, another T did atomicCAS 
    
  3. atomicAdd
    __device__ inline void atomicAdd (float* addr, float value){
     int oldVal, newVal, readback; 
     oldVal = __float_as_int(*addr);
     newVal = __float_as_int(__int_as_float(oldVal)+value); 
     while ((readback=atomicCAS((int*)addr, oldVal, newVal)) != oldVal){
         oldVal = readback;                                        // another thread won the CAS; retry with its value
         newVal = __float_as_int(__int_as_float(oldVal)+value);
     }
    }
    

2. Example: Count

  1. Non-atomic version
    • the final count ≠ total # of threads (updates are lost to the race), t=4msec
       __global__ void kernel(unsigned long long int* pCount){
       (*pCount) = (*pCount) + 1; 
       }
      
  2. Atomic version
    __global__ void kernel(unsigned long long int* pCount){
     atomicAdd(pCount, 1ULL); 
    }
    
  3. Shared memory version
    __global__ void kernel(int* pCount){
     __shared__ int nCountShared; 
     if (threadIdx.x==0){ nCountShared=0; } __syncthreads(); 
     atomicAdd(&nCountShared, 1); __syncthreads(); 
     if (threadIdx.x==0){ atomicAdd(pCount, nCountShared); }
    }
    

3. Example: Histogram Count

  1. CPU version
    • sum over all hist bins = image size, t=4340usec
      void kernel (unsigned int* hist, unsigned int* img, unsigned int size){
       while (size--){
         unsigned int pixelVal = *img++;
         hist[pixelVal] += 1;
       }
      }
      
  2. GPU version
    • sum over all hist bins = image size, t=1.61msec
      __global__ void kernel(unsigned int* hist, unsigned int* img, unsigned int size){
       unsigned int i = blockIdx.x*blockDim.x+threadIdx.x; 
       unsigned int pixelVal = img[i];
       atomicAdd(&hist[pixelVal], 1);
      }
      
  3. SMEM version
    • sum over all hist bins = image size
      __global__ void kernel(unsigned int* hist, unsigned int* img, unsigned int size){
       __shared__ int histShared[NUMHIST]; 
       if (threadIdx.x<NUMHIST) { histShared[threadIdx.x]=0; } __syncthreads();

       unsigned int i = blockIdx.x*blockDim.x+threadIdx.x; 
       unsigned pixelVal = img[i]; 
       atomicAdd(&histShared[pixelVal], 1); __syncthreads();

       if (threadIdx.x<NUMHIST) { atomicAdd(&hist[threadIdx.x], histShared[threadIdx.x]); }
      }

  • Practice!
    __global__ void kernel(unsigned int* hist, unsigned int* img, unsigned int size){
     __shared__ int histShared[NUMHIST]; 
     if (threadIdx.x<NUMHIST) { histShared[threadIdx.x]=0; } __syncthreads();
    
     unsigned i= blockIdx.x*blockDim.x+threadIdx.x; 
     unsigned pixelVal = img[i]; 
     atomicAdd(&histShared[pixelVal], 1); __syncthreads(); 
    
     if (threadIdx.x<NUMHIST) { atomicAdd(&hist[threadIdx.x],histShared[threadIdx.x]); }
    }
    

18. Sparse Matrix Multiplication

1. Sequential SpMV

for (int row=0; row<num_rows; row++){
    float dot = 0; 
    int row_start = row_ptr[row]; 
    int row_end = row_ptr[row+1]; 
    for (int elem=row_start; elem<row_end; elem++){
        dot += data[elem] * x[col_idx[elem]];
    }
    y[row] += dot;
}
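To make the CSR layout concrete, here is a toy 4x4 example of my own (not from the lecture), laid out exactly as the loop above expects:

```cpp
// Toy 4x4 matrix (rows: [3 0 1 0], [0 0 0 0], [0 2 4 1], [1 0 0 1]) in CSR form:
float data[]    = {3, 1,    2, 4, 1,  1, 1};   // nonzero values, row by row
int   col_idx[] = {0, 2,    1, 2, 3,  0, 3};   // column of each nonzero
int   row_ptr[] = {0, 2, 2, 5, 7};             // start of each row in data[]; row 1 is empty
```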

2. Parallel SpMV

  • Shortcomings: 1) no memory coalescing, 2) control flow divergence
```cpp
__global__ void SpMV_CSR(int num_rows, float* data, int* col_idx, int* row_ptr, float* x, float* y){
    // each thread processes one row
    int row = blockIdx.x*blockDim.x+threadIdx.x; 
    if (row < num_rows){
        float dot = 0; 
        int row_start = row_ptr[row]; 
        int row_end = row_ptr[row+1]; 
        for (int elem=row_start; elem<row_end; elem++){
            dot += data[elem]*x[col_idx[elem]]; 
        }
        y[row] = dot; 
    }
}
```


## 3. ELL (PACK) Format
- data[], col_idx[] both padded → Solves control flow divergence, row_ptr[] not needed!
- transposed → Solves coalescing
- Shortcoming: excessive # of padded elements
```cpp
__global__ void SpMV_ELL(int num_rows, float* data, int* col_idx, int num_elem, float* x, float* y){
    int row = blockIdx.x*blockDim.x+threadIdx.x; 
    
    if (row<num_rows){
        float dot = 0; 
        for (unsigned int i=0; i<num_elem; i++){
            // column-major (transposed) layout: consecutive threads read consecutive addresses
            dot += data[row+i*num_rows]*x[col_idx[row+i*num_rows]];
        }
        y[row] = dot; 
    }
}
```
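Continuing the same toy 4x4 matrix from the CSR note above (again my own illustration), padded to 3 nonzeros per row and stored transposed so that data[row + i*num_rows] is coalesced:

```cpp
// Same toy matrix in ELL: max 3 nonzeros/row, zero-padded, stored "transposed"
// so that data[row + i*num_rows] walks down a column of the padded table.
float data[]    = {3,0,2,1,  1,0,4,1,  0,0,1,0};   // slot 0 of every row, then slot 1, then slot 2
int   col_idx[] = {0,0,1,0,  2,0,2,3,  0,0,3,0};   // padded entries point at column 0 with value 0
int   num_elem  = 3;                               // padded row length
```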

4. Coordinate Format

  • Elements in COO format can be stored in arbitrary order.
    1. Sequential SpMV (COO)
      for (int i=0; i<num_elem; i++){
        y[row_idx[i]] += data[i]*x[col_idx[i]];
      }
      
    2. Parallel SpMV (COO)
  • Each thread takes care of one elem.
    __global__ void SpMV_COO(float* data, int* col_idx, int* row_idx, int num_elem, float* x, float* y){
      int i = blockDim.x*blockIdx.x+threadIdx.x; 
      if (i<num_elem){
          float dot; 
          dot = data[i]*x[col_idx[i]]; 
          atomicAdd(&y[row_idx[i]], dot);
      }
    }
    

17. Graph Structure

1. Sequential BFS

void BFS_sequential (int source, int* edges, int* dest, int *label){

    int frontier[2][MAX_FRONTIER_SIZE]; 
    int* c_frontier = frontier[0]; int* p_frontier = frontier[1];
    int c_frontier_tail = 0; int p_frontier_tail = 0; 

    // insert source to p_frontier array, and add 1 to p_frontier_tail
    insert_frontier(source, p_frontier, &p_frontier_tail); 
    label[source] = 0;

    int c_vertex; // current vertex
    while (p_frontier_tail>0){
        for (int f=0; f<p_frontier_tail; f++){
            c_vertex = p_frontier[f]; // source of this loop 
            for (int i=edges[c_vertex]; i<edges[c_vertex+1]; i++){
                if (label[dest[i]]==-1){
                    insert_frontier(dest[i], c_frontier, &c_frontier_tail); 
                    label[dest[i]] = label[c_vertex] + 1;
                }
            }
        }
        // swap c_frontier and p_frontier
        int* temp = c_frontier; c_frontier = p_frontier; p_frontier = temp; 
        p_frontier_tail = c_frontier_tail; c_frontier_tail = 0; 
    }
}
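edges[] and dest[] form a CSR-style adjacency list: dest[edges[v]] .. dest[edges[v+1]-1] are the neighbors of vertex v. A toy example of my own:

```cpp
// 4 vertices, 4 directed edges: 0->1, 0->2, 1->3, 2->3
int edges[] = {0, 2, 3, 4, 4};   // edges[v]..edges[v+1]-1 index v's outgoing edges (vertex 3 has none)
int dest[]  = {1, 2, 3, 3};      // destination of each edge
int label[] = {-1, -1, -1, -1};  // -1 marks "not visited yet"; BFS from 0 gives {0, 1, 1, 2}
```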

2. Parallel BFS

void BFS_host(unsigned int source, unsigned int* edges, unsigned int* dest, unsigned int* label){
    ... 
    // cudaMalloc edges_d, dest_d, label_d, and visited_d
    // cudaMemcpy edges, dest, and label to device global memory 
    // cudaMalloc frontier_d, c_frontier_tail_d, p_frontier_tail_d 
    unsigned int* c_frontier_d = &frontier_d[0]; 
    unsigned int* p_frontier_d = &frontier_d[MAX_FRONTIER_SIZE]; 

    // launch a simple kernel to initialize the source vertex (hypothetical sketches of these helpers follow the BFS kernel below):
    // *c_frontier_tail_d = 0; 
    // p_frontier_d[0] = source; 
    // *p_frontier_tail_d = 1; 
    // label_d[source] = 0;

    p_frontier_tail = 1; 
    while(p_frontier_tail>0){
        int num_blocks = ceil(p_frontier_tail/float(BLOCK_SIZE)); 
        BFS_Bqueue_kernel<<<num_blocks, BLOCK_SIZE>>>(p_frontier_d, p_frontier_tail_d, c_frontier_d, c_frontier_tail_d, edges_d, dest_d, label_d, visited_d); 
        // cudaMemcpy to read the *c_frontier_tail value back to host 
        // assign int to p_frontier_tail for the while-loop condition test
        unsigned int* temp = c_frontier_d; c_frontier_d=p_frontier_d; p_frontier_d=temp; 
        // launch a simple kernel to set *p_frontier_tail_d = *c_frontier_tail_d; *c_frontier_tail_d=0; 
    }
}
__global__ void BFS_Bqueue_kernel(unsigned int* p_frontier, unsigned int*p_frontier_tail, unsigned int* c_frontier, unsigned int* c_frontier_tail, unsigned int* edges, unsigned int* dest, unsigned int* label, unsigned int* visited){
    __shared__ unsigned int c_frontier_s[BLOCK_QUEUE_SIZE]; 
    __shared__ unsigned int c_frontier_tail_s, our_c_frontier_tail; 

    if (threadIdx.x ==0){ c_frontier_tail_s=0; } __syncthreads(); 
    const unsigned gid = blockIdx.x*blockDim.x+threadIdx.x; 

    if (gid < *p_frontier_tail){
        const unsigned int c_vertex = p_frontier[gid]; 

        for (unsigned int i = edges[c_vertex]; i<edges[c_vertex+1]; i++){
            const unsigned int was_visited = atomicExch(&(visited[dest[i]]), 1); 

            if (!was_visited){
                label[dest[i]] = label[c_vertex]+1; 
                const unsigned int my_tail = atomicAdd(&c_frontier_tail_s, 1); 

                // check if # of neighboring vertex exceeds BLOCK_QUEUE_SIZE
                if (my_tail<BLOCK_QUEUE_SIZE){
                    c_frontier_s[my_tail] = dest[i]; 
                } else {
                    c_frontier_tail_s = BLOCK_QUEUE_SIZE; 
                    const unsigned int my_global_tail = atomicAdd(c_frontier_tail, 1);
                    c_frontier[my_global_tail] = dest[i]; 
                }
            }
        }
    }
    // every thread in the block must reach these barriers, so they sit outside the gid check;
    // thread 0 reserves the block's slot in the global queue and keeps the base offset in SMEM
    __syncthreads(); 
    if (threadIdx.x==0){ 
        our_c_frontier_tail = atomicAdd(c_frontier_tail, c_frontier_tail_s); 
    }
    __syncthreads(); 
    // vertex ordering doesn't matter - just append the block's private queue to the global tail
    // (vertices that overflowed BLOCK_QUEUE_SIZE were already written to global memory above)
    for (unsigned int i=threadIdx.x; i<c_frontier_tail_s; i+=blockDim.x){
        c_frontier[our_c_frontier_tail+i]=c_frontier_s[i];
    }

}
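BFS_host above only describes its tiny helper kernels in comments; the sketches below show one plausible shape for them (names and signatures are my own, each launched with <<<1,1>>>):

```cpp
// Hypothetical helper kernels matching the comments in BFS_host.
__global__ void init_bfs(unsigned int source, unsigned int* p_frontier,
                         unsigned int* p_frontier_tail, unsigned int* c_frontier_tail,
                         unsigned int* label){
    *c_frontier_tail = 0;
    p_frontier[0] = source;
    *p_frontier_tail = 1;
    label[source] = 0;
}

__global__ void swap_tails(unsigned int* p_frontier_tail, unsigned int* c_frontier_tail){
    *p_frontier_tail = *c_frontier_tail;   // next iteration consumes what this one produced
    *c_frontier_tail = 0;
}
```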

16. Streaming and AsyncCopy

1. Streaming Operation

  1. Host version - t=588533usec
    void kernel (float* dst, float* src, float val, int size){
     while (size--){
         *dst = 0.0F; 
         for (register int j=0; j<REPEAT; ++j){ *dst += *src; }
         dst++; src++; 
     }
    }
    
  2. Device version - t=108413usec (winner!)
    // size not needed - each thread operates on one elem. 
    __global__ void kernel (float* dst, float* src){
     unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; 
    
     if (i<SIZE){
         dst[i] = 0.0F; 
         for (register int j=0; j<REPEAT; j++) { dst[i] += src[i]; }
     }
    }
    

2. Asynchronous Copy

  1. Synchronous Streaming
    • cudaMemcpy (void* dst, const void* src, size_t count, enum kind)
  2. Asynchronous Copy
    • cudaMemcpyAsync (void* dst, const void* src, size_t count, enum kind, cudaStream_t stream=0)
    • Pinned-memory
    • cudaMallocHost(), cudaFreeHost()
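A minimal sketch tying these together: pinned host memory plus cudaMemcpyAsync on the default stream (buffer names and SIZE are assumptions):

```cpp
float* pHost = NULL;                                   // pinned (page-locked) host buffer
cudaMallocHost((void**)&pHost, SIZE*sizeof(float));    // required for truly asynchronous copies

float* pDev = NULL;
cudaMalloc((void**)&pDev, SIZE*sizeof(float));

// returns immediately; the copy overlaps with host work until we synchronize
cudaMemcpyAsync(pDev, pHost, SIZE*sizeof(float), cudaMemcpyHostToDevice, 0);
cudaDeviceSynchronize();

cudaFree(pDev);
cudaFreeHost(pHost);
```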

3. CUDA Streams

1. CUDA Streams - Asynchronous Ver.

  • Because the stream must be passed as the last launch argument, the third launch parameter (dynamic shared memory size) has to be written out explicitly as 0: kernel<<<dimGrid, dimBlock, 0, stream>>>(pResultDev, pSourceDev)
  • Don't forget to synchronize the stream after the kernel launch.
    int main(void){
      // allocate pinned-memory 
      float* pSource = NULL; float* pResult = NULL; 
      cudaMallocHost((void**)&pSource, TOTALSIZE*sizeof(float)); 
      cudaMallocHost((void**)&pResult, TOTALSIZE*sizeof(float)); 
    
      // create stream object
      cudaStream_t stream; 
      cudaStreamCreate(&stream);
      ... 
      // CUDA: copy from host to device
      cudaMemcpyAsync(pSourceDev, pSource, TOTALSIZE*sizeof(float), cudaMemcpyHostToDevice, stream);
    
      // CUDA: launch the kernel 
      ... 
      kernel<<<dimGrid, dimBlock, 0, stream>>>(pResultDev, pSourceDev);
        
      // sync to make sure operations complete
      cudaStreamSynchronize(stream);
    
      // deallocate pinned-memory 
      cudaFreeHost(pSource); cudaFreeHost(pResult); 
        
      // destroy stream 
      cudaStreamDestroy(stream); 
    }
    

2. CUDA Streams - Multiple Streams

  • cudaStreamCreate(&stream), cudaStreamSynchronize(stream), cudaStreamDestroy(stream)
    int main(void){
      // allocate pinned-memory 
      cudaMallocHost((void**)&pSource, TOTALSIZE*sizeof(float));
      cudaMallocHost((void**)&pResult, TOTALSIZE*sizeof(float));
    
      // create stream object 
      cudaStream_t aStream[STREAMSIZE]; 
      for (i=0; i<STREAMSIZE; i++) { cudaStreamCreate(&aStream[i]); }
        
      // CUDA: copy from host to device
      for (i=0; i<STREAMSIZE; i++){
          int offset = TOTALSIZE/STREAMSIZE*i; 
          cudaMemcpyAsync(pSourceDev+offset, pSource+offset, TOTALSIZE/STREAMSIZE*sizeof(float), cudaMemcpyHostToDevice, aStream[i]);
      }
        
      // CUDA: launch the kernel
      dim3 dimGrid(GRIDSIZE/STREAMSIZE, 1, 1); 
      dim3 dimBlock(BLOCKSIZE, 1, 1); 
      for (i=0; i<STREAMSIZE; i++){
          int offset = TOTALSIZE/STREAMSIZE*i;
          kernel<<<dimGrid, dimBlock, 0, aStream[i]>>>(pResultDev+offset, pSourceDev+offset);
      }
        
      // CUDA: copy from device to host, then clean up each stream 
      for (i=0; i<STREAMSIZE; i++){
          int offset = TOTALSIZE/STREAMSIZE*i; 
          cudaMemcpyAsync(pResult+offset, pResultDev+offset, TOTALSIZE/STREAMSIZE*sizeof(float), cudaMemcpyDeviceToHost, aStream[i]);
          cudaStreamSynchronize(aStream[i]); 
          cudaStreamDestroy(aStream[i]); 
      }
    }
    

4. MISC

1. Elapsed Time: CUDA Event API Usage

cudaEvent_t start, stop; 
float time; 

cudaEventCreate(&start); cudaEventCreate(&stop); 
cudaEventRecord(start, 0); 
VectorAdd<<<gridDim, gridBlock>>>(dev_A, dev_B, dev_R, size); 
cudaEventRecord(stop, 0); 

cudaEventSynchronize(stop); 
cudaEventElapsedTime(&time, start, stop); 
cudaEventDestroy(start); cudaEventDestroy(stop); 
printf("elapsed time = %f msec\n", time)