cuda和OpenGL纹理结合,并进行直方图计算 针对于单通道16位图像。结合方式在CUDA_equalizeHistogram函数中。 其他的为CUDA核函数。
先上R16的处理,并详细解释:
单通道16位:
#define HISTOGRAM_LENGTH 65536 // 2^16 表示16位深度
定义直方图长度为65536,对应16位像素值的范围(0-65535)。
__global__ void computeHistogram_16(cudaSurfaceObject_t surface, int width, int height, unsigned int* histogram) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < width && y < height) {
unsigned short pixel;
surf2Dread(&pixel, surface, x * sizeof(unsigned short), y);
atomicAdd(&histogram[pixel], 1);
}
}
__global__ void computeHistogram_16(...)
:声明一个CUDA核函数,用于计算图像的直方图。int x = blockIdx.x * blockDim.x + threadIdx.x
:计算当前线程在X方向的全局索引。int y = blockIdx.y * blockDim.y + threadIdx.y
:计算当前线程在Y方向的全局索引。if (x < width && y < height)
:检查线程是否在图像的有效范围内。unsigned short pixel
:声明一个变量存储像素值。surf2Dread(&pixel, surface, x * sizeof(unsigned short), y)
:从表面读取像素值。atomicAdd(&histogram[pixel], 1)
:对直方图对应像素值位置执行原子加法操作。
__global__ void exclusiveScan_16(unsigned int* input, unsigned int* output, int len) {
__shared__ unsigned int temp[1024]; // 分块处理
int tid = threadIdx.x;
int bid = blockIdx.x * 1024;
if (bid + tid < len) {
temp[tid] = input[bid + tid];
} else {
temp[tid] = 0;
}
__syncthreads();
for (int stride = 1; stride < 1024; stride *= 2) {
unsigned int val = 0;
if (tid >= stride) {
val = temp[tid - stride];
}
__syncthreads();
temp[tid] = temp[tid] + val;
__syncthreads();
}
if (bid + tid < len) {
output[bid + tid] = temp[tid];
}
}
__global__ void exclusiveScan_16(...)
:声明一个CUDA核函数,用于计算前缀和(exclusive scan)。__shared__ unsigned int temp[1024]
:声明一个共享内存数组,用于存储分块数据。int tid = threadIdx.x
:计算当前线程在块内的局部索引。int bid = blockIdx.x * 1024
:计算当前块在全局数据中的起始位置。if (bid + tid < len)
:检查索引是否在有效范围内。temp[tid] = input[bid + tid]
:将全局数据加载到共享内存中。else { temp[tid] = 0; }
:如果索引超出范围,设置共享内存元素为0。__syncthreads()
:同步所有线程,确保共享内存已被完全填充。for (int stride = 1; stride < 1024; stride *= 2)
:循环进行扫描操作,步长每次翻倍。unsigned int val = 0
:声明一个变量用于存储中间值。if (tid >= stride) { val = temp[tid - stride]; }
:根据步长从共享内存中读取值。__syncthreads()
:同步所有线程,确保所有线程都读取了值。temp[tid] = temp[tid] + val
:更新共享内存中的值。__syncthreads()
:同步所有线程,确保所有线程都完成了更新。if (bid + tid < len) { output[bid + tid] = temp[tid]; }
:将结果写回到全局内存中。
__global__ void addScannedBlocks(unsigned int* output, int len) {
int tid = threadIdx.x + blockIdx.x * blockDim.x * 2;
if (tid < len && blockIdx.x > 0) {
unsigned int add_val = output[blockIdx.x * blockDim.x * 2 - 1];
output[tid] += add_val;
}
}
__global__ void addScannedBlocks(...)
:声明一个CUDA核函数,用于合并分块扫描结果。int tid = threadIdx.x + blockIdx.x * blockDim.x * 2
:计算当前线程在全局数据中的索引。if (tid < len && blockIdx.x > 0)
:检查索引是否在有效范围内,并且不是第一个块。unsigned int add_val = output[blockIdx.x * blockDim.x * 2 - 1]
:获取前一个块的扫描结果。output[tid] += add_val
:将前一个块的结果加到当前块的结果中。
__global__ void computeCDF_16(unsigned int* histogram, float* cdf, int size, unsigned int total) {
int tid = threadIdx.x;
if (tid < size) {
cdf[tid] = (float)histogram[tid] / (float)total;
}
}
__global__ void computeCDF_16(...)
:声明一个CUDA核函数,用于计算累积分布函数(CDF)。int tid = threadIdx.x
:计算当前线程的索引。if (tid < size)
:检查索引是否在有效范围内。cdf[tid] = (float)histogram[tid] / (float)total
:计算CDF值并存储在cdf
数组中。
__global__ void equalize_16(cudaSurfaceObject_t surface, int width, int height, float* cdf) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < width && y < height) {
unsigned short pixel;
surf2Dread(&pixel, surface, x * sizeof(unsigned short), y);
unsigned short newValue = static_cast<unsigned short>(65535 * cdf[pixel]);
surf2Dwrite(newValue, surface, x * sizeof(unsigned short), y);
}
}
__global__ void equalize_16(...)
:声明一个CUDA核函数,用于对图像进行直方图均衡化。int x = blockIdx.x * blockDim.x + threadIdx.x
:计算当前线程在X方向的全局索引。int y = blockIdx.y * blockDim.y + threadIdx.y
:计算当前线程在Y方向的全局索引。if (x < width && y < height)
:检查线程是否在图像的有效范围内。unsigned short pixel
:声明一个变量存储像素值。surf2Dread(&pixel, surface, x * sizeof(unsigned short), y)
:从表面读取像素值。unsigned short newValue = static_cast<unsigned short>(65535 * cdf[pixel])
:根据CDF计算新的像素值。surf2Dwrite(newValue, surface, x * sizeof(unsigned short), y)
:将新的像素值写回表面。
void equalizeHistogram_16(cudaArray_t cuArray, int width, int height) {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
cudaSurfaceObject_t surfObj;
cudaCreateSurfaceObject(&surfObj, &resDesc);
unsigned int* d_histogram;
cudaMalloc(&d_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int));
cudaMemset(d_histogram, 0, HISTOGRAM_LENGTH * sizeof(unsigned int));
dim3 blockSize(16, 16);
dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
computeHistogram_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_histogram);
unsigned int* d_scan_histogram;
cudaMalloc(&d_scan_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int));
dim3 scanBlockSize(1024);
dim3 scanGridSize((HISTOGRAM_LENGTH + scanBlockSize.x - 1) / scanBlockSize.x);
exclusiveScan_16 <<<scanGridSize, scanBlockSize>>> (d_histogram, d_scan_histogram, HISTOGRAM_LENGTH);
addScannedBlocks <<<scanGridSize, scanBlockSize>>> (d_scan_histogram, HISTOGRAM_LENGTH);
unsigned
int h_total;
cudaMemcpy(&h_total, &d_scan_histogram[HISTOGRAM_LENGTH - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost);
float* d_cdf;
cudaMalloc(&d_cdf, HISTOGRAM_LENGTH * sizeof(float));
computeCDF_16 <<<1, HISTOGRAM_LENGTH>>> (d_scan_histogram, d_cdf, HISTOGRAM_LENGTH, h_total);
equalize_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_cdf);
cudaDeviceSynchronize();
cudaDestroySurfaceObject(surfObj);
cudaFree(d_histogram);
cudaFree(d_scan_histogram);
cudaFree(d_cdf);
}
void equalizeHistogram_16(...)
:定义一个主机函数,用于执行直方图均衡化的整个过程。cudaResourceDesc resDesc
:声明并初始化CUDA资源描述符。memset(&resDesc, 0, sizeof(resDesc))
:将资源描述符清零。resDesc.resType = cudaResourceTypeArray
:设置资源类型为CUDA数组。resDesc.res.array.array = cuArray
:将资源描述符的数组指针设置为输入的CUDA数组。cudaSurfaceObject_t surfObj
:声明CUDA表面对象。cudaCreateSurfaceObject(&surfObj, &resDesc)
:创建CUDA表面对象。unsigned int* d_histogram
:声明设备端直方图数组指针。cudaMalloc(&d_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int))
:分配设备端直方图数组内存。cudaMemset(d_histogram, 0, HISTOGRAM_LENGTH * sizeof(unsigned int))
:将设备端直方图数组清零。dim3 blockSize(16, 16)
:定义每个块的线程布局。dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y)
:计算网格大小,以覆盖整个图像。computeHistogram_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_histogram)
:启动计算直方图的CUDA核函数。unsigned int* d_scan_histogram
:声明设备端扫描直方图数组指针。cudaMalloc(&d_scan_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int))
:分配设备端扫描直方图数组内存。dim3 scanBlockSize(1024)
:定义扫描核函数的每个块的线程数。dim3 scanGridSize((HISTOGRAM_LENGTH + scanBlockSize.x - 1) / scanBlockSize.x)
:计算扫描核函数的网格大小。exclusiveScan_16 <<<scanGridSize, scanBlockSize>>> (d_histogram, d_scan_histogram, HISTOGRAM_LENGTH)
:启动计算前缀和的CUDA核函数。addScannedBlocks <<<scanGridSize, scanBlockSize>>> (d_scan_histogram, HISTOGRAM_LENGTH)
:启动合并扫描结果的CUDA核函数。unsigned int h_total
:声明一个变量用于存储总和。cudaMemcpy(&h_total, &d_scan_histogram[HISTOGRAM_LENGTH - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)
:将设备端的总和复制到主机端。float* d_cdf
:声明设备端CDF数组指针。cudaMalloc(&d_cdf, HISTOGRAM_LENGTH * sizeof(float))
:分配设备端CDF数组内存。computeCDF_16 <<<1, HISTOGRAM_LENGTH>>> (d_scan_histogram, d_cdf, HISTOGRAM_LENGTH, h_total)
:启动计算CDF的CUDA核函数。equalize_16 <<<gridSize, blockSize>>> (surfObj, width, height, d_cdf)
:启动直方图均衡化的CUDA核函数。cudaDeviceSynchronize()
:同步设备,确保所有CUDA核函数完成。cudaDestroySurfaceObject(surfObj)
:销毁CUDA表面对象。cudaFree(d_histogram)
:释放设备端直方图数组内存。cudaFree(d_scan_histogram)
:释放设备端扫描直方图数组内存。cudaFree(d_cdf)
:释放设备端CDF数组内存。
bool CUDA_equalizeHistogram_16(int htexture, int width, int height, int target) {
cudaGraphicsResource_t cudaResources;
// 在 CUDA 中注册纹理
cudaError_t err = cudaGraphicsGLRegisterImage(&cudaResources, htexture, target, cudaGraphicsRegisterFlagsWriteDiscard);
if (err != cudaSuccess) {
return false;
}
// 在 CUDA 中锁定资源,获得操作纹理的指针
cudaArray_t cuArray;
cudaGraphicsMapResources(1, &cudaResources, 0);
cudaGraphicsSubResourceGetMappedArray(&cuArray, cudaResources, 0, 0);
equalizeHistogram_16(cuArray, width, height);
// 解除资源锁定,使 OpenGL 可以利用得到的纹理对象进行纹理贴图操作
cudaGraphicsUnmapResources(1, &cudaResources, 0);
// 释放 CUDA 资源
cudaGraphicsUnregisterResource(cudaResources);
return true;
}
bool CUDA_equalizeHistogram_16(...)
:定义一个主机函数,用于通过CUDA和OpenGL接口进行图像直方图均衡化。cudaGraphicsResource_t cudaResources
:声明CUDA图形资源。cudaError_t err = cudaGraphicsGLRegisterImage(&cudaResources, htexture, target, cudaGraphicsRegisterFlagsWriteDiscard)
:注册OpenGL纹理为CUDA图形资源。if (err != cudaSuccess) { return false; }
:检查注册是否成功,如果失败返回false
。cudaArray_t cuArray
:声明CUDA数组。cudaGraphicsMapResources(1, &cudaResources, 0)
:锁定CUDA图形资源,获得操作指针。cudaGraphicsSubResourceGetMappedArray(&cuArray, cudaResources, 0, 0)
:获取映射的CUDA数组。equalizeHistogram_16(cuArray, width, height)
:调用直方图均衡化函数。cudaGraphicsUnmapResources(1, &cudaResources, 0)
:解除资源锁定,使OpenGL可以使用该纹理对象进行操作。cudaGraphicsUnregisterResource(cudaResources)
:释放CUDA图形资源。return true
:返回true
表示操作成功。
通过上述代码解释,每一行代码的作用和逻辑应当清楚明了。这个过程包括计算图像直方图、执行前缀和扫描、计算CDF并对图像进行直方图均衡化。
RGBA处理:
#define HISTOGRAM_LENGTH 256
__global__ void computeHistogram_rgba(cudaSurfaceObject_t surface, int width, int height, unsigned int* histogram) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < width && y < height) {
uchar4 pixel;
surf2Dread(&pixel, surface, x * sizeof(uchar4), y);
atomicAdd(&histogram[pixel.x], 1);
}
}
__global__ void computeCDF_rgba(unsigned int* histogram, float* cdf) {
__shared__ unsigned int temp[HISTOGRAM_LENGTH];
int tid = threadIdx.x;
if (tid < HISTOGRAM_LENGTH) {
temp[tid] = histogram[tid];
}
__syncthreads();
for (int stride = 1; stride < HISTOGRAM_LENGTH; stride *= 2) {
unsigned int val = 0;
if (tid >= stride) {
val = temp[tid - stride];
}
__syncthreads();
if (tid >= stride) {
temp[tid] += val;
}
__syncthreads();
}
if (tid < HISTOGRAM_LENGTH) {
cdf[tid] = temp[tid] / (float)(temp[HISTOGRAM_LENGTH - 1]);
}
}
__global__ void equalize_rgba(cudaSurfaceObject_t surface, int width, int height, float* cdf) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < width && y < height) {
uchar4 pixel;
surf2Dread(&pixel, surface, x * sizeof(uchar4), y);
unsigned char newValue = static_cast<unsigned char>(255 * cdf[pixel.x]);
pixel.x = newValue;
pixel.y = newValue;
pixel.z = newValue;
surf2Dwrite(pixel, surface, x * sizeof(uchar4), y);
}
}
void equalizeHistogram_rgba(cudaArray_t cuArray, int width, int height)
{
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
cudaSurfaceObject_t surfObj;
cudaCreateSurfaceObject(&surfObj, &resDesc);
unsigned int* d_histogram;
cudaMalloc(&d_histogram, HISTOGRAM_LENGTH * sizeof(unsigned int));
cudaMemset(d_histogram, 0, HISTOGRAM_LENGTH * sizeof(unsigned int));
dim3 blockSize(16, 16);
//dim3 blockSize(32, 32);
dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
computeHistogram_rgba << <gridSize, blockSize >> > (surfObj, width, height, d_histogram);
float* d_cdf;
cudaMalloc(&d_cdf, HISTOGRAM_LENGTH * sizeof(float));
computeCDF_rgba << <1, HISTOGRAM_LENGTH >> > (d_histogram, d_cdf);
equalize_rgba << <gridSize, blockSize >> > (surfObj, width, height, d_cdf);
cudaDeviceSynchronize();
cudaDestroySurfaceObject(surfObj);
cudaFree(d_histogram);
cudaFree(d_cdf);
}
bool CUDA_equalizeHistogram_rgba(int htexture, int width, int height, int target)
{
cudaGraphicsResource_t cudaResources;
// 在 CUDA 中注册纹理
cudaError_t err = cudaGraphicsGLRegisterImage(&cudaResources, htexture, target, cudaGraphicsRegisterFlagsWriteDiscard);
if (err != cudaSuccess)
{
return false;
}
// 在 CUDA 中锁定资源,获得操作纹理的指针
cudaArray_t cuArray;
cudaGraphicsMapResources(1, &cudaResources, 0);
cudaGraphicsSubResourceGetMappedArray(&cuArray, cudaResources, 0, 0);
equalizeHistogram_rgba(cuArray, width, height);
// 解除资源锁定,使 OpenGL 可以利用得到的纹理对象进行纹理贴图操作
cudaGraphicsUnmapResources(1, &cudaResources, 0);
// 释放 CUDA 资源
cudaGraphicsUnregisterResource(cudaResources);
return true;
}
「真诚赞赏,手留余香」
真诚赞赏,手留余香
使用微信扫描二维码完成支付