__global__ voidreduce0(int *g_idata, int *g_odata){ extern __shared__ int sdata[];
// each thread loads one element from global to shared mem unsignedint tid = threadIdx.x; unsignedint i = blockIdx.x*blockDim.x + threadIdx.x; sdata[tid] = g_idata[i]; __syncthreads();
// do reduction in shared mem for(unsignedint s=1; s < blockDim.x; s *= 2) { if (tid % (2*s) == 0) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }
// write result for this block to global mem if (tid == 0) g_odata[blockIdx.x] = sdata[0]; }
for (unsignedint s=1; s < blockDim.x; s *= 2) { int index = 2 * s * tid; if (index < blockDim.x) { sdata[index] += sdata[index + s]; } __syncthreads(); }
for (unsignedint s=blockDim.x/2; s>0; s>>=1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }
Sequential addressing中,红字凸显的部分说明有一半的线程在第一循环迭代的时候就处于空闲状态。为了改进,我们采用first add during load技术。简单地说,我们采用上述过程一半的block,在刚进入线程的时候就进行一次operation,具体实现如下:
1 2 3 4 5 6
// perform first level of reduction, // reading from global memory, writing to shared memory unsignedint tid = threadIdx.x; unsignedint i = blockIdx.x*(blockDim.x*2) + threadIdx.x; sdata[tid] = g_idata[i] + g_idata[i+blockDim.x]; __syncthreads();