cuda => 병렬 감소 (예 : 배열을 합하는 방법)

비고

병렬 축소 알고리즘은 일반적으로 요소 배열을 결합하여 단일 결과를 생성하는 알고리즘을 나타냅니다. 이 범주에 속하는 일반적인 문제는 다음과 같습니다.

배열의 모든 요소 합계
배열에서 최대 값을 찾는 것

일반적으로 병렬 감소는 모든 이진 연상 연산자 , 즉 (A*B)*C = A*(B*C) 적용될 수 있습니다. 이러한 연산자 *를 사용하면 병렬 감소 알고리즘이 반복적으로 배열 인수를 쌍으로 그룹화합니다. 각 쌍은 다른 단계와 병렬로 계산되어 한 단계로 전체 배열 크기를 절반으로 줄입니다. 이 과정은 하나의 요소가 존재할 때까지 반복됩니다.

연산자가 연관성 외에도 교환 가능 하다면 (즉, A*B = B*A ) 알고리즘은 다른 패턴으로 쌍을 이룰 수 있습니다. 이론적으로 볼 때 아무런 차이가 없지만 실제로는 더 나은 메모리 액세스 패턴을 제공합니다.

모든 연관 연산자가 교환 가능하지는 않습니다. 예를 들어 행렬 곱셈을 취하십시오.

교환 연산자에 대한 단일 블록 병렬 감소

CUDA의 병렬 축소에 대한 가장 간단한 접근법은 단일 블록을 할당하여 작업을 수행하는 것입니다.

static const int arraySize = 10000;
static const int blockSize = 1024;

__global__ void sumCommSingleBlock(const int *a, int *out) {
    int idx = threadIdx.x;
    int sum = 0;
    for (int i = idx; i < arraySize; i += blockSize)
        sum += a[i];
    __shared__ int r[blockSize];
    r[idx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform
        if (idx<size)
            r[idx] += r[idx+size];
        __syncthreads();
    }
    if (idx == 0)
        *out = r[0];
}

...

sumCommSingleBlock<<<1, blockSize>>>(dev_a, dev_out);

데이터 크기가 너무 크지 않은 경우 (소수의 요소 만) 대부분 가능합니다. 이것은 대개 감축이 더 큰 CUDA 프로그램의 일부일 때 발생합니다. 입력이 처음부터 blockSize 와 일치하면 첫 번째 for 루프를 완전히 제거 할 수 있습니다.

첫 번째 단계에서 스레드보다 많은 요소가있는 경우 완전히 독립적으로 추가합니다. 문제가 blockSize 로 줄어들 blockSize 실제 병렬 축소가 트리거됩니다. 곱셈, 최소, 최대 등의 다른 교환 가능 연상 연산자에도 동일한 코드를 적용 할 수 있습니다.

알고리즘은 예를 들어 워프 레벨 병렬 감소를 사용하여 더 빠르게 만들 수 있습니다.

비 교환 연산자에 대한 단일 블록 병렬 감소

비 교환 연산자에 대한 병렬 감소는 교환 가능 버전에 비해 다소 복잡합니다. 이 예제에서 우리는 여전히 단순성을 위해 정수에 덧셈을 사용합니다. 예를 들어 행렬 곱셈과 같이 대체 할 수 있습니다. 참고로, 0은 곱셈의 중성 요소, 즉 항등 행렬로 대체해야합니다.

static const int arraySize = 1000000;
static const int blockSize = 1024;

__global__ void sumNoncommSingleBlock(const int *gArr, int *out) {
    int thIdx = threadIdx.x;
    __shared__ int shArr[blockSize*2];
    __shared__ int offset;
    shArr[thIdx] = thIdx<arraySize ? gArr[thIdx] : 0;
    if (thIdx == 0)
        offset = blockSize;
    __syncthreads();
    while (offset < arraySize) { //uniform
        shArr[thIdx + blockSize] = thIdx+offset<arraySize ? gArr[thIdx+offset] : 0;
        __syncthreads();
        if (thIdx == 0)
            offset += blockSize;
        int sum = shArr[2*thIdx] + shArr[2*thIdx+1];
        __syncthreads();
        shArr[thIdx] = sum;
    }
    __syncthreads();
    for (int stride = 1; stride<blockSize; stride*=2) { //uniform
        int arrIdx = thIdx*stride*2;
        if (arrIdx+stride<blockSize)
            shArr[arrIdx] += shArr[arrIdx+stride];
        __syncthreads();
    }   
    if (thIdx == 0)
        *out = shArr[0];
}

...

sumNoncommSingleBlock<<<1, blockSize>>>(dev_a, dev_out);

첫 번째 while 루프는 쓰레드보다 많은 입력 요소가있는 한 실행합니다. 각 반복에서 하나의 축소가 수행되고 결과는 shArr 배열의 첫 번째 절반으로 압축됩니다. 후반부는 새로운 데이터로 채워집니다.

gArr 에서 모든 데이터가로드되면 두 번째 루프가 실행됩니다. 이제 더 이상 결과를 압축하지 않습니다 (추가 __syncthreads() ). 각 단계에서 스레드 n은 2*n 번째 활성 요소에 액세스하여이를 2*n+1 번째 요소로 더합니다.

워프 레벨 감소 (warp-level reduction)와 공유 메모리 뱅크 충돌을 제거하는 등의 간단한 예제를 더욱 최적화 할 수있는 많은 방법이 있습니다.

교환 연산자에 대한 다중 블록 병렬 감소

CUDA의 병렬 감소에 대한 멀티 블록 접근 방식은 블록이 통신이 제한적이기 때문에 단일 블록 접근 방식에 비해 추가적인 어려움이 있습니다. 아이디어는 각 블록이 입력 배열의 일부를 계산하도록 한 다음 모든 부분 결과를 병합하는 마지막 블록 하나를 갖도록하는 것입니다. 그렇게하기 위해 두 개의 커널을 시작할 수 있으며 암시 적으로 그리드 전체의 동기화 지점을 만듭니다.

static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24; //this number is hardware-dependent; usually #SM*2 is a good number.

__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    const int gridSize = blockSize*gridDim.x;
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridSize)
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform
        if (thIdx<size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
}

__host__ int sumArray(int* arr) {
    int* dev_arr;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);

    int out;
    int* dev_out;
    cudaMalloc((void**)&dev_out, sizeof(int)*gridSize);
    
    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out);
    //dev_out now holds the partial result
    sumCommMultiBlock<<<1, blockSize>>>(dev_out, gridSize, dev_out);
    //dev_out[0] now holds the final result
    cudaDeviceSynchronize();
    
    cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_arr);
    cudaFree(dev_out);
    return out;
}

하나는 전체 점유시 GPU의 모든 멀티 프로세서를 포화시킬만큼 충분한 블록을 시작하는 것이 이상적입니다. 이 수를 초과하여, 특히 배열에 요소가있는만큼 스레드를 시작하는 것은 비생산적입니다. 이렇게하면 원시 컴퓨팅 성능이 더 이상 향상되지 않지만 매우 효율적인 첫 번째 루프를 사용하지 못하게됩니다.

마지막 블럭 가드 의 도움으로 단일 커널을 사용하여 동일한 결과를 얻을 수도 있습니다 :

static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24;

__device__ bool lastBlock(int* counter) {
    __threadfence(); //ensure that partial result is visible by all blocks
    int last = 0;
    if (threadIdx.x == 0)
        last = atomicAdd(counter, 1);
    return __syncthreads_or(last == gridDim.x-1);
}    

__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut, int* lastBlockCounter) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    const int gridSize = blockSize*gridDim.x;
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridSize)
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform
        if (thIdx<size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
    if (lastBlock(lastBlockCounter)) {
        shArr[thIdx] = thIdx<gridSize ? gOut[thIdx] : 0;
        __syncthreads();
        for (int size = blockSize/2; size>0; size/=2) { //uniform
            if (thIdx<size)
                shArr[thIdx] += shArr[thIdx+size];
            __syncthreads();
        }
        if (thIdx == 0)
            gOut[0] = shArr[0];            
    }
}

__host__ int sumArray(int* arr) {
    int* dev_arr;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);

    int out;
    int* dev_out;
    cudaMalloc((void**)&dev_out, sizeof(int)*gridSize);
    
    int* dev_lastBlockCounter;
    cudaMalloc((void**)&dev_lastBlockCounter, sizeof(int));
    cudaMemset(dev_lastBlockCounter, 0, sizeof(int));
    
    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out, dev_lastBlockCounter);
    cudaDeviceSynchronize();
    
    cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_arr);
    cudaFree(dev_out);
    return out;
}

예를 들어, 워프 레벨 병렬 축소를 사용하여 커널을 더 빠르게 만들 수 있습니다.

비가 전 연산자에 대한 다중 블록 병렬 감소

병렬 감소에 대한 다중 블록 접근법은 단일 블록 접근법과 매우 유사합니다. 전역 입력 배열은 섹션으로 분할되어야하며 각 섹션은 단일 블록으로 축소됩니다. 각 블록의 부분 결과가 얻어지면 최종 블록 하나가 최종 결과를 얻기 위해 줄입니다.

sumNoncommSingleBlock 에 대한 자세한 내용은 단일 블록 축소 예제에서 설명합니다.
lastBlock 은 마지막에 도달 한 블록 만 허용합니다. 이를 피하려면 커널을 두 개의 개별 호출로 분리 할 수 있습니다.

static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24; //this number is hardware-dependent; usually #SM*2 is a good number.

__device__ bool lastBlock(int* counter) {
    __threadfence(); //ensure that partial result is visible by all blocks
    int last = 0;
    if (threadIdx.x == 0)
        last = atomicAdd(counter, 1);
    return __syncthreads_or(last == gridDim.x-1);
}

__device__ void sumNoncommSingleBlock(const int* gArr, int arraySize, int* out) {
    int thIdx = threadIdx.x;
    __shared__ int shArr[blockSize*2];
    __shared__ int offset;
    shArr[thIdx] = thIdx<arraySize ? gArr[thIdx] : 0;
    if (thIdx == 0)
        offset = blockSize;
    __syncthreads();
    while (offset < arraySize) { //uniform
        shArr[thIdx + blockSize] = thIdx+offset<arraySize ? gArr[thIdx+offset] : 0;
        __syncthreads();
        if (thIdx == 0)
            offset += blockSize;
        int sum = shArr[2*thIdx] + shArr[2*thIdx+1];
        __syncthreads();
        shArr[thIdx] = sum;
    }
    __syncthreads();
    for (int stride = 1; stride<blockSize; stride*=2) { //uniform
        int arrIdx = thIdx*stride*2;
        if (arrIdx+stride<blockSize)
            shArr[arrIdx] += shArr[arrIdx+stride];
        __syncthreads();
    }   
    if (thIdx == 0)
        *out = shArr[0];
}

__global__ void sumNoncommMultiBlock(const int* gArr, int* out, int* lastBlockCounter) {
    int arraySizePerBlock = wholeArraySize/gridSize;
    const int* gArrForBlock = gArr+blockIdx.x*arraySizePerBlock;
    int arraySize = arraySizePerBlock;
    if (blockIdx.x == gridSize-1)
        arraySize = wholeArraySize - blockIdx.x*arraySizePerBlock;
    sumNoncommSingleBlock(gArrForBlock, arraySize, &out[blockIdx.x]);
    if (lastBlock(lastBlockCounter))
        sumNoncommSingleBlock(out, gridSize, out);    
}

교환 연산자에 대한 단일 워프 병렬 감소

때로는 커다란 CUDA 커널의 일부로 축소가 아주 작은 규모로 수행되어야합니다. 예를 들어, 입력 데이터가 정확히 32 개의 요소, 즉 워프에있는 스레드의 수를 가지고 있다고 가정합니다. 이러한 시나리오에서 감소를 수행하기 위해 하나의 워프가 할당 될 수 있습니다. 워프가 완벽한 동기화에서 실행되면 블록 수준의 감소와 비교할 때 많은 __syncthreads() 명령어를 제거 할 수 있습니다.

static const int warpSize = 32;

__device__ int sumCommSingleWarp(volatile int* shArr) {
    int idx = threadIdx.x % warpSize; //the lane index in the warp
    if (idx<16) shArr[idx] += shArr[idx+16];
    if (idx<8) shArr[idx] += shArr[idx+8];
    if (idx<4) shArr[idx] += shArr[idx+4];
    if (idx<2) shArr[idx] += shArr[idx+2];
    if (idx==0) shArr[idx] += shArr[idx+1];
    return shArr[0];
}

shArr 은 공유 메모리의 배열 인 것이 바람직합니다. 값은 워프의 모든 스레드에 대해 동일해야합니다. sumCommSingleWarp 가 복수의 워프에 의해 불려 sumCommSingleWarp , shArr 는 각각의 워프 내에서 shArr 해야합니다.

인수 shArr 은 배열에 대한 작업이 지시 된대로 실제로 수행되도록 volatile 로 표시됩니다. 그렇지 않으면 shArr[idx] 대한 반복 할당이 레지스터에 대한 할당으로 최적화 될 수 있으며 최종 할당 만 shArr 에 대한 실제 저장소가 shArr . 이 경우 즉시 할당이 다른 스레드에 표시되지 않아 잘못된 결과가 발생합니다. const가 아닌 매개 변수를 const 매개 변수로 전달할 때와 마찬가지로 정상적인 비 휘발성 배열을 휘발성 배열의 인수로 전달할 수 있습니다.

축소 후 shArr[1..31] 의 내용을 신경 쓰지 않으면 코드를 더 간단하게 만들 수 있습니다.

static const int warpSize = 32;

__device__ int sumCommSingleWarp(volatile int* shArr) {
    int idx = threadIdx.x % warpSize; //the lane index in the warp
    if (idx<16) {
      shArr[idx] += shArr[idx+16];
      shArr[idx] += shArr[idx+8];
      shArr[idx] += shArr[idx+4];
      shArr[idx] += shArr[idx+2];
      shArr[idx] += shArr[idx+1];
    }
    return shArr[0];
}

이 설정에서 우리는 많은 if 조건을 제거했습니다. 여분의 스레드는 불필요한 추가 작업을 수행하지만 더 이상 생성하지 않은 내용은 신경 쓰지 않습니다. 워프는 SIMD 모드에서 실행되기 때문에 아무런 작업도 수행하지 않아 시간을 절약 할 수 있습니다. 반면에 조건을 평가하는 데는 상대적으로 많은 시간이 걸리는데, 이는 if 문의 본문이 너무 작기 때문입니다. shArr[32..47] 가 0으로 채워 shArr[32..47] 초기 if 문도 제거 될 수 있습니다.

워프 레벨 감소는 블록 레벨 감소를 높이기 위해 사용될 수 있습니다.

__global__ void sumCommSingleBlockWithWarps(const int *a, int *out) {
    int idx = threadIdx.x;
    int sum = 0;
    for (int i = idx; i < arraySize; i += blockSize)
        sum += a[i];
    __shared__ int r[blockSize];
    r[idx] = sum;
    sumCommSingleWarp(&r[idx & ~(warpSize-1)]);
    __syncthreads();
    if (idx<warpSize) { //first warp only
        r[idx] = idx*warpSize<blockSize ? r[idx*warpSize] : 0;
        sumCommSingleWarp(r);
        if (idx == 0)
            *out = r[0];
    }
}

&r[idx & ~(warpSize-1)] 인수는 기본적으로 r + warpIdx*32 입니다. 이렇게하면 r 배열을 32 개 요소의 청크로 효과적으로 분할 할 수 있으며 각 청크는 별도의 워프에 할당됩니다.

비 순응 연산자에 대한 단일 워프 병렬 감소

static const int warpSize = 32;

__device__ int sumNoncommSingleWarp(volatile int* shArr) {
    int idx = threadIdx.x % warpSize; //the lane index in the warp
    if (idx%2 == 0) shArr[idx] += shArr[idx+1];
    if (idx%4 == 0) shArr[idx] += shArr[idx+2];
    if (idx%8 == 0) shArr[idx] += shArr[idx+4];
    if (idx%16 == 0) shArr[idx] += shArr[idx+8];
    if (idx == 0) shArr[idx] += shArr[idx+16];
    return shArr[0];
}

shArr[1..31] 의 최종 내용을 신경 쓰지 않고 shArr[32..47] 을 0으로 shArr[32..47] 수 shArr[32..47] 위 코드를 단순화 할 수 있습니다.

static const int warpSize = 32;

__device__ int sumNoncommSingleWarpPadded(volatile int* shArr) {
    //shArr[32..47] == 0
    int idx = threadIdx.x % warpSize; //the lane index in the warp
    shArr[idx] += shArr[idx+1];
    shArr[idx] += shArr[idx+2];
    shArr[idx] += shArr[idx+4];
    shArr[idx] += shArr[idx+8];
    shArr[idx] += shArr[idx+16];
    return shArr[0];
}

이 설정에서 지침의 절반을 구성하는 모든 if 조건을 제거했습니다. 여분의 스레드는 불필요한 추가 작업을 수행하여 결과를 궁극적으로 최종 결과에 영향을주지 않는 shArr 셀에 저장합니다. 워프는 SIMD 모드에서 실행되기 때문에 아무런 작업도 수행하지 않아 시간을 절약 할 수 있습니다.

레지스터 만 사용하는 단일 워프 병렬 감소

일반적으로 전역 또는 공유 배열에서 축소가 수행됩니다. 그러나 축소가 더 작은 CUDA 커널의 일부로 수행되면 단일 워프로 수행 될 수 있습니다. Keppler 이상의 아키텍처 (CC> = 3.0)에서는 워프 셔플 기능을 사용하여 공유 메모리를 전혀 사용하지 않을 수 있습니다.

예를 들어 워프의 각 스레드가 단일 입력 데이터 값을 보유한다고 가정합니다. 모든 쓰레드는 합쳐서 32 가지 요소를 가지고있다.

__device__ int sumSingleWarpReg(int value) {
    value += __shfl_down(value, 1);
    value += __shfl_down(value, 2);
    value += __shfl_down(value, 4);
    value += __shfl_down(value, 8);
    value += __shfl_down(value, 16);
    return __shfl(value,0);
}

이 버전은 교환 가능 및 비 교환 운영자 모두에게 적용됩니다.

Modified text is an extract of the original Stack Overflow Documentation

아래 라이선스 CC BY-SA 3.0

와 제휴하지 않음 Stack Overflow

cuda
병렬 감소 (예 : 배열을 합하는 방법)

수색…

비고