CUDA use OpenGL Texture R16 and RGBA EqualizeHistogram

cuda和OpenGL纹理结合，并进行直方图计算针对于单通道16位图像。结合方式在CUDA_equalizeHistogram函数中。其他的为CUDA核函数。

先上R16的处理，并详细解释：

单通道16位：

#define HISTOGRAM_LENGTH 65536  // 2^16 表示16位深度

定义直方图长度为65536，对应16位像素值的范围（0-65535）。

__global__ void computeHistogram_16(cudaSurfaceObject_t surface, int width, int height, unsigned int* histogram) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        unsigned short pixel;
        surf2Dread(&pixel, surface, x * sizeof(unsigned short), y);
        atomicAdd(&histogram[pixel], 1);
    }
}

__global__ void computeHistogram_16(...)：声明一个CUDA核函数，用于计算图像的直方图。
int x = blockIdx.x * blockDim.x + threadIdx.x：计算当前线程在X方向的全局索引。
int y = blockIdx.y * blockDim.y + threadIdx.y：计算当前线程在Y方向的全局索引。
if (x < width && y < height)：检查线程是否在图像的有效范围内。
unsigned short pixel：声明一个变量存储像素值。
surf2Dread(&pixel, surface, x * sizeof(unsigned short), y)：从表面读取像素值。
atomicAdd(&histogram[pixel], 1)：对直方图对应像素值位置执行原子加法操作。

__global__ void exclusiveScan_16(unsigned int* input, unsigned int* output, int len) {
    __shared__ unsigned int temp[1024]; // 分块处理
    int tid = threadIdx.x;
    int bid = blockIdx.x * 1024;

    if (bid + tid < len) {
        temp[tid] = input[bid + tid];
    } else {
        temp[tid] = 0;
    }
    __syncthreads();

    for (int stride = 1; stride < 1024; stride *= 2) {
        unsigned int val = 0;
        if (tid >= stride) {
            val = temp[tid - stride];
        }
        __syncthreads();
        temp[tid] = temp[tid] + val;
        __syncthreads();
    }

    if (bid + tid < len) {
        output[bid + tid] = temp[tid];
    }
}

__global__ void exclusiveScan_16(...)：声明一个CUDA核函数，用于计算前缀和（exclusive scan）。
__shared__ unsigned int temp[1024]：声明一个共享内存数组，用于存储分块数据。
int tid = threadIdx.x：计算当前线程在块内的局部索引。
int bid = blockIdx.x * 1024：计算当前块在全局数据中的起始位置。
if (bid + tid < len)：检查索引是否在有效范围内。
temp[tid] = input[bid + tid]：将全局数据加载到共享内存中。
else { temp[tid] = 0; }：如果索引超出范围，设置共享内存元素为0。
__syncthreads()：同步所有线程，确保共享内存已被完全填充。
for (int stride = 1; stride < 1024; stride *= 2)：循环进行扫描操作，步长每次翻倍。
unsigned int val = 0：声明一个变量用于存储中间值。
if (tid >= stride) { val = temp[tid - stride]; }：根据步长从共享内存中读取值。
__syncthreads()：同步所有线程，确保所有线程都读取了值。
temp[tid] = temp[tid] + val：更新共享内存中的值。
__syncthreads()：同步所有线程，确保所有线程都完成了更新。
if (bid + tid < len) { output[bid + tid] = temp[tid]; }：将结果写回到全局内存中。

__global__ void addScannedBlocks(unsigned int* output, int len) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x * 2;
    if (tid < len && blockIdx.x > 0) {
        unsigned int add_val = output[blockIdx.x * blockDim.x * 2 - 1];
        output[tid] += add_val;
    }
}

__global__ void addScannedBlocks(...)：声明一个CUDA核函数，用于合并分块扫描结果。
int tid = threadIdx.x + blockIdx.x * blockDim.x * 2：计算当前线程在全局数据中的索引。
if (tid < len && blockIdx.x > 0)：检查索引是否在有效范围内，并且不是第一个块。
unsigned int add_val = output[blockIdx.x * blockDim.x * 2 - 1]：获取前一个块的扫描结果。
output[tid] += add_val：将前一个块的结果加到当前块的结果中。

__global__ void computeCDF_16(unsigned int* histogram, float* cdf, int size, unsigned int total) {
    int tid = threadIdx.x;
    if (tid < size) {
        cdf[tid] = (float)histogram[tid] / (float)total;
    }
}

__global__ void computeCDF_16(...)：声明一个CUDA核函数，用于计算累积分布函数（CDF）。
int tid = threadIdx.x：计算当前线程的索引。
if (tid < size)：检查索引是否在有效范围内。
cdf[tid] = (float)histogram[tid] / (float)total：计算CDF值并存储在cdf数组中。

__global__ void equalize_16(cudaSurfaceObject_t surface, int width, int height, float* cdf) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        unsigned short pixel;
        surf2Dread(&pixel, surface, x * sizeof(unsigned short), y);

        unsigned short newValue = static_cast<unsigned short>(65535 * cdf[pixel]);
        surf2Dwrite(newValue, surface, x * sizeof(unsigned short), y);
    }
}

__global__ void equalize_16(...)：声明一个CUDA核函数，用于对图像进行直方图均衡化。
int x = blockIdx.x * blockDim.x + threadIdx.x：计算当前线程在X方向的全局索引。
int y = blockIdx.y * blockDim.y + threadIdx.y：计算当前线程在Y方向的全局索引。
if (x < width && y < height)：检查线程是否在图像的有效范围内。
unsigned short pixel：声明一个变量存储像素值。
surf2Dread(&pixel, surface, x * sizeof(unsigned short), y)：从表面读取像素值。
unsigned short newValue = static_cast<unsigned short>(65535 * cdf[pixel])：根据CDF计算新的像素值。
surf2Dwrite(newValue, surface, x * sizeof(unsigned short), y)：将新的像素值写回表面。

void equalizeHistogram_16(cudaArray_t cuArray, int width, int height) {
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    cudaSurfaceObject_t surfObj;
    cudaCreateSurfaceObject(&surfObj, &resDesc);

    unsigned int* d_histogram;
    cudaMalloc(&d_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int));
    cudaMemset(d_histogram, 0, HISTOGRAM_LENGTH * sizeof(unsigned int));

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
    computeHistogram_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_histogram);

    unsigned int* d_scan_histogram;
    cudaMalloc(&d_scan_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int));
    dim3 scanBlockSize(1024);
    dim3 scanGridSize((HISTOGRAM_LENGTH + scanBlockSize.x - 1) / scanBlockSize.x);
    exclusiveScan_16 <<<scanGridSize, scanBlockSize>>> (d_histogram, d_scan_histogram, HISTOGRAM_LENGTH);
    addScannedBlocks <<<scanGridSize, scanBlockSize>>> (d_scan_histogram, HISTOGRAM_LENGTH);

    unsigned

 int h_total;
    cudaMemcpy(&h_total, &d_scan_histogram[HISTOGRAM_LENGTH - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost);

    float* d_cdf;
    cudaMalloc(&d_cdf, HISTOGRAM_LENGTH * sizeof(float));
    computeCDF_16 <<<1, HISTOGRAM_LENGTH>>> (d_scan_histogram, d_cdf, HISTOGRAM_LENGTH, h_total);

    equalize_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_cdf);

    cudaDeviceSynchronize();

    cudaDestroySurfaceObject(surfObj);
    cudaFree(d_histogram);
    cudaFree(d_scan_histogram);
    cudaFree(d_cdf);
}

void equalizeHistogram_16(...)：定义一个主机函数，用于执行直方图均衡化的整个过程。
cudaResourceDesc resDesc：声明并初始化CUDA资源描述符。
memset(&resDesc, 0, sizeof(resDesc))：将资源描述符清零。
resDesc.resType = cudaResourceTypeArray：设置资源类型为CUDA数组。
resDesc.res.array.array = cuArray：将资源描述符的数组指针设置为输入的CUDA数组。
cudaSurfaceObject_t surfObj：声明CUDA表面对象。
cudaCreateSurfaceObject(&surfObj, &resDesc)：创建CUDA表面对象。
unsigned int* d_histogram：声明设备端直方图数组指针。
cudaMalloc(&d_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int))：分配设备端直方图数组内存。
cudaMemset(d_histogram, 0, HISTOGRAM_LENGTH * sizeof(unsigned int))：将设备端直方图数组清零。
dim3 blockSize(16, 16)：定义每个块的线程布局。
dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y)：计算网格大小，以覆盖整个图像。
computeHistogram_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_histogram)：启动计算直方图的CUDA核函数。
unsigned int* d_scan_histogram：声明设备端扫描直方图数组指针。
cudaMalloc(&d_scan_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int))：分配设备端扫描直方图数组内存。
dim3 scanBlockSize(1024)：定义扫描核函数的每个块的线程数。
dim3 scanGridSize((HISTOGRAM_LENGTH + scanBlockSize.x - 1) / scanBlockSize.x)：计算扫描核函数的网格大小。
exclusiveScan_16 <<<scanGridSize, scanBlockSize>>> (d_histogram, d_scan_histogram, HISTOGRAM_LENGTH)：启动计算前缀和的CUDA核函数。
addScannedBlocks <<<scanGridSize, scanBlockSize>>> (d_scan_histogram, HISTOGRAM_LENGTH)：启动合并扫描结果的CUDA核函数。
unsigned int h_total：声明一个变量用于存储总和。
cudaMemcpy(&h_total, &d_scan_histogram[HISTOGRAM_LENGTH - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)：将设备端的总和复制到主机端。
float* d_cdf：声明设备端CDF数组指针。
cudaMalloc(&d_cdf, HISTOGRAM_LENGTH * sizeof(float))：分配设备端CDF数组内存。
computeCDF_16 <<<1, HISTOGRAM_LENGTH>>> (d_scan_histogram, d_cdf, HISTOGRAM_LENGTH, h_total)：启动计算CDF的CUDA核函数。
equalize_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_cdf)：启动直方图均衡化的CUDA核函数。
cudaDeviceSynchronize()：同步设备，确保所有CUDA核函数完成。
cudaDestroySurfaceObject(surfObj)：销毁CUDA表面对象。
cudaFree(d_histogram)：释放设备端直方图数组内存。
cudaFree(d_scan_histogram)：释放设备端扫描直方图数组内存。
cudaFree(d_cdf)：释放设备端CDF数组内存。

bool CUDA_equalizeHistogram_16(int htexture, int width, int height, int target) {
    cudaGraphicsResource_t cudaResources;

    // 在 CUDA 中注册纹理
    cudaError_t err = cudaGraphicsGLRegisterImage(&cudaResources, htexture, target, cudaGraphicsRegisterFlagsWriteDiscard);
    if (err != cudaSuccess) {
        return false;
    }

    // 在 CUDA 中锁定资源，获得操作纹理的指针
    cudaArray_t cuArray;
    cudaGraphicsMapResources(1, &cudaResources, 0);
    cudaGraphicsSubResourceGetMappedArray(&cuArray, cudaResources, 0, 0);

    equalizeHistogram_16(cuArray, width, height);

    // 解除资源锁定，使 OpenGL 可以利用得到的纹理对象进行纹理贴图操作
    cudaGraphicsUnmapResources(1, &cudaResources, 0);

    // 释放 CUDA 资源
    cudaGraphicsUnregisterResource(cudaResources);

    return true;
}

bool CUDA_equalizeHistogram_16(...)：定义一个主机函数，用于通过CUDA和OpenGL接口进行图像直方图均衡化。
cudaGraphicsResource_t cudaResources：声明CUDA图形资源。
cudaError_t err = cudaGraphicsGLRegisterImage(&cudaResources, htexture, target, cudaGraphicsRegisterFlagsWriteDiscard)：注册OpenGL纹理为CUDA图形资源。
if (err != cudaSuccess) { return false; }：检查注册是否成功，如果失败返回false。
cudaArray_t cuArray：声明CUDA数组。
cudaGraphicsMapResources(1, &cudaResources, 0)：锁定CUDA图形资源，获得操作指针。
cudaGraphicsSubResourceGetMappedArray(&cuArray, cudaResources, 0, 0)：获取映射的CUDA数组。
equalizeHistogram_16(cuArray, width, height)：调用直方图均衡化函数。
cudaGraphicsUnmapResources(1, &cudaResources, 0)：解除资源锁定，使OpenGL可以使用该纹理对象进行操作。
cudaGraphicsUnregisterResource(cudaResources)：释放CUDA图形资源。
return true：返回true表示操作成功。

通过上述代码解释，每一行代码的作用和逻辑应当清楚明了。这个过程包括计算图像直方图、执行前缀和扫描、计算CDF并对图像进行直方图均衡化。

RGBA处理：

#define HISTOGRAM_LENGTH 256

__global__ void computeHistogram_rgba(cudaSurfaceObject_t surface, int width, int height, unsigned int* histogram) {
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < width && y < height) {
		uchar4 pixel;
		surf2Dread(&pixel, surface, x * sizeof(uchar4), y);
		atomicAdd(&histogram[pixel.x], 1);
	}
}

__global__ void computeCDF_rgba(unsigned int* histogram, float* cdf) {
	__shared__ unsigned int temp[HISTOGRAM_LENGTH];
	int tid = threadIdx.x;

	if (tid < HISTOGRAM_LENGTH) {
		temp[tid] = histogram[tid];
	}

	__syncthreads();

	for (int stride = 1; stride < HISTOGRAM_LENGTH; stride *= 2) {
		unsigned int val = 0;
		if (tid >= stride) {
			val = temp[tid - stride];
		}
		__syncthreads();
		if (tid >= stride) {
			temp[tid] += val;
		}
		__syncthreads();
	}

	if (tid < HISTOGRAM_LENGTH) {
		cdf[tid] = temp[tid] / (float)(temp[HISTOGRAM_LENGTH - 1]);
	}
}

__global__ void equalize_rgba(cudaSurfaceObject_t surface, int width, int height, float* cdf) {
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;

	if (x < width && y < height) {
		uchar4 pixel;
		surf2Dread(&pixel, surface, x * sizeof(uchar4), y);

		unsigned char newValue = static_cast<unsigned char>(255 * cdf[pixel.x]);
		pixel.x = newValue;
		pixel.y = newValue;
		pixel.z = newValue;

		surf2Dwrite(pixel, surface, x * sizeof(uchar4), y);
	}
}

void equalizeHistogram_rgba(cudaArray_t cuArray, int width, int height)
{
	cudaResourceDesc resDesc;
	memset(&resDesc, 0, sizeof(resDesc));
	resDesc.resType = cudaResourceTypeArray;
	resDesc.res.array.array = cuArray;

	cudaSurfaceObject_t surfObj;
	cudaCreateSurfaceObject(&surfObj, &resDesc);

	unsigned int* d_histogram;
	cudaMalloc(&d_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int));
	cudaMemset(d_histogram, 0, HISTOGRAM_LENGTH * sizeof(unsigned int));

	dim3 blockSize(16, 16);
	//dim3 blockSize(32, 32);
	dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
	computeHistogram_rgba << <gridSize, blockSize >> > (surfObj, width, height, d_histogram);

	float* d_cdf;
	cudaMalloc(&d_cdf, HISTOGRAM_LENGTH * sizeof(float));
	computeCDF_rgba << <1, HISTOGRAM_LENGTH >> > (d_histogram, d_cdf);

	equalize_rgba << <gridSize, blockSize >> > (surfObj, width, height, d_cdf);

	cudaDeviceSynchronize();

	cudaDestroySurfaceObject(surfObj);
	cudaFree(d_histogram);
	cudaFree(d_cdf);
}



bool CUDA_equalizeHistogram_rgba(int htexture, int width, int height, int target)
{

	cudaGraphicsResource_t cudaResources;

	// 在 CUDA 中注册纹理
	cudaError_t err = cudaGraphicsGLRegisterImage(&cudaResources, htexture, target, cudaGraphicsRegisterFlagsWriteDiscard);
	if (err != cudaSuccess)
	{
		return false;
	}

	// 在 CUDA 中锁定资源，获得操作纹理的指针
	cudaArray_t cuArray;
	cudaGraphicsMapResources(1, &cudaResources, 0);
	cudaGraphicsSubResourceGetMappedArray(&cuArray, cudaResources, 0, 0);

	equalizeHistogram_rgba(cuArray, width, height);

	// 解除资源锁定，使 OpenGL 可以利用得到的纹理对象进行纹理贴图操作
	cudaGraphicsUnmapResources(1, &cudaResources, 0);

	// 释放 CUDA 资源
	cudaGraphicsUnregisterResource(cudaResources);

	return true;
}

「真诚赞赏，手留余香」

CUDA use OpenGL Texture R16 and RGBA EqualizeHistogram

CUDA 使用 OpenGL纹理结合直方图均衡

单通道16位：

RGBA处理：

CATALOG

FEATURED TAGS

FRIENDS