▶ 使用CPU和GPU分别实现散列表
● CPU方法
1 #include <stdio.h> 2 #include <time.h> 3 #include "cuda_runtime.h" 4 #include "D:\Code\CUDA\book\common\book.h" 5 6 #define SIZE (100*1024*1024) 7 #define ELEMENTS (SIZE / sizeof(unsigned int)) 8 #define HASH_ENTRIES (1024) 9 10 11 struct Entry 12 { 13 unsigned int key; 14 void *value; 15 Entry *next; 16 }; 17 18 struct Table 19 { 20 size_t count; 21 Entry **entries; 22 Entry *pool; 23 Entry *firstFree; 24 }; 25 26 27 size_t hash(unsigned int key, size_t count) 28 { 29 return key % count; 30 } 31 32 void initialize_table(Table &table, int entries, int elements) 33 { 34 table.count = entries; 35 table.entries = (Entry**)calloc(entries, sizeof(Entry*)); 36 table.pool = (Entry*)malloc(elements * sizeof(Entry)); 37 table.firstFree = table.pool; 38 } 39 40 void free_table(Table &table) 41 { 42 free(table.entries); 43 free(table.pool); 44 } 45 46 void add_to_table(Table &table, unsigned int key, void *value) 47 { 48 size_t hashValue = hash(key, table.count); 49 Entry *location = table.firstFree++; 50 location->key = key; 51 location->value = value; 52 location->next = table.entries[hashValue];// 插到该分支的头部而不是尾部 53 table.entries[hashValue] = location; 54 } 55 56 void verify_table(const Table &table) 57 { 58 int count = 0; 59 for (size_t i = 0; i<table.count; i++) 60 { 61 Entry *current = table.entries[i]; 62 while (current != NULL) 63 { 64 ++count; 65 if (hash(current->key, table.count) != i) 66 printf("\n\t%d hashed to %ld, but was located at %ld\n", current->key, hash(current->key, table.count), i); 67 current = current->next; 68 } 69 } 70 if (count != ELEMENTS) 71 printf("\n\t%d elements found in hash table. Should be %ld\n", 72 count, ELEMENTS); 73 else 74 printf("\n\tAll %d elements found in hash table.\n", count); 75 } 76 77 int main(void) 78 { 79 unsigned int *buffer =(unsigned int*)big_random_block(SIZE); 80 Table table; 81 clock_t start, stop; 82 83 initialize_table(table, HASH_ENTRIES, ELEMENTS); 84 85 start = clock(); 86 for (int i = 0; i<ELEMENTS; i++) 87 add_to_table(table, buffer[i], (void*)NULL); 88 89 stop = clock(); 90 printf("\n\tBuilding the table: %3.1f ms\n", (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f); 91 92 verify_table(table); 93 free_table(table); 94 free(buffer); 95 getchar(); 96 return 0; 97 }
● GPU方法(用到了前面的原子锁)
1 #include <stdio.h> 2 #include <time.h> 3 #include "cuda_runtime.h" 4 #include "device_launch_parameters.h" 5 #include "cuda.h" 6 #include "D:\Code\CUDA\book\common\book.h" 7 8 #define SIZE (100*1024*1024) 9 #define ELEMENTS (SIZE / sizeof(unsigned int)) 10 #define HASH_ENTRIES (1024) 11 12 struct Lock 13 { 14 int *mutex; 15 Lock(void) 16 { 17 int state = 0; 18 cudaMalloc((void **)&mutex, sizeof(int)); 19 cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice); 20 } 21 ~Lock(void) 22 { 23 cudaFree(mutex); 24 } 25 __device__ void lock(void) 26 { 27 while (atomicCAS(mutex, 0, 1) != 0); 28 } 29 __device__ void unlock(void) 30 { 31 atomicExch(mutex, 0); 32 } 33 }; 34 35 struct Entry 36 { 37 unsigned int key; 38 void *value; 39 Entry *next; 40 }; 41 42 struct Table 43 { 44 size_t count; 45 Entry **entries; 46 Entry *pool; 47 Entry *firstFree; 48 }; 49 50 __device__ __host__ size_t hash(unsigned int key, size_t count) 51 { 52 return key % count; 53 } 54 55 void initialize_table(Table &table, int entries, int elements) 56 { 57 table.count = entries; 58 cudaMalloc((void**)&table.entries, entries * sizeof(Entry*)); 59 cudaMemset(table.entries, 0, entries * sizeof(Entry*)); 60 cudaMalloc((void**)&table.pool, elements * sizeof(Entry)); 61 } 62 63 void free_table(Table &table) 64 { 65 cudaFree(table.entries); 66 cudaFree(table.pool); 67 } 68 69 __global__ void add_to_table(unsigned int *keys, void **values, Table table, Lock *lock) 70 // 锁数组用于锁定散列表中的每一个桶 71 { 72 int tid = threadIdx.x + blockIdx.x * blockDim.x; 73 int stride = blockDim.x * gridDim.x; 74 while (tid < ELEMENTS) 75 { 76 unsigned int key = keys[tid]; 77 size_t hashValue = hash(key, table.count); 78 for (int i = 0; i<32; i++)// 利用循环来分散线程束,使同一线程束中的32个线程在循环的不同次数时进行写入 79 { 80 if ((tid % 32) == i) 81 { 82 Entry *location = &(table.pool[tid]); 83 location->key = key; 84 location->value = values[tid]; 85 lock[hashValue].lock(); 86 location->next = table.entries[hashValue]; 87 table.entries[hashValue] = location; 88 lock[hashValue].unlock(); 89 } 90 } 91 tid += stride; 92 } 93 } 94 95 void copy_table_to_host(const Table &table, Table &hostTable) 96 { 97 hostTable.count = table.count; 98 hostTable.entries = (Entry**)calloc(table.count, sizeof(Entry*)); 99 hostTable.pool = (Entry*)malloc(ELEMENTS * sizeof(Entry)); 100 101 cudaMemcpy(hostTable.entries, table.entries, table.count * sizeof(Entry*), cudaMemcpyDeviceToHost); 102 cudaMemcpy(hostTable.pool, table.pool, ELEMENTS * sizeof(Entry), cudaMemcpyDeviceToHost); 103 104 for (int i = 0; i < table.count; i++) 105 { 106 if (hostTable.entries[i] != NULL) 107 hostTable.entries[i] = (Entry*)((size_t)hostTable.entries[i] - (size_t)table.pool + (size_t)hostTable.pool); 108 // 从从显存到内存的地址线性偏移 x - adressGPU + addressCPU 109 } 110 for (int i = 0; i < ELEMENTS; i++) 111 { 112 if (hostTable.pool[i].next != NULL) 113 hostTable.pool[i].next = (Entry*)((size_t)hostTable.pool[i].next - (size_t)table.pool + (size_t)hostTable.pool); 114 // 同样是做偏移,但是要找到下一个元素的地址 115 } 116 } 117 118 void verify_table(const Table &dev_table) 119 { 120 Table table; 121 copy_table_to_host(dev_table, table); 122 123 int count = 0; 124 for (size_t i = 0; i < table.count; i++) 125 { 126 Entry *current = table.entries[i]; 127 while (current != NULL) 128 { 129 ++count; 130 if (hash(current->key, table.count) != i) 131 printf("%d hashed to %ld, but was located at %ld\n", current->key, hash(current->key, table.count), i); 132 current = current->next; 133 } 134 } 135 if (count != ELEMENTS) 136 printf("%d elements found in hash table. Should be %ld\n", count, ELEMENTS); 137 else 138 printf("All %d elements found in hash table.\n", count); 139 } 140 141 int main(void) 142 { 143 unsigned int *buffer = (unsigned int*)big_random_block(SIZE); 144 145 unsigned int *dev_keys; 146 void **dev_values; 147 cudaMalloc((void**)&dev_keys, SIZE); 148 cudaMalloc((void**)&dev_values, SIZE); 149 cudaMemcpy(dev_keys, buffer, SIZE, cudaMemcpyHostToDevice); 150 151 Table table; 152 initialize_table(table, HASH_ENTRIES, ELEMENTS); 153 154 Lock lock[HASH_ENTRIES];// 准备锁列表 155 Lock *dev_lock; 156 cudaMalloc((void**)&dev_lock, HASH_ENTRIES * sizeof(Lock)); 157 cudaMemcpy(dev_lock, lock, HASH_ENTRIES * sizeof(Lock), cudaMemcpyHostToDevice); 158 159 cudaEvent_t start, stop; 160 cudaEventCreate(&start); 161 cudaEventCreate(&stop); 162 cudaEventRecord(start, 0); 163 164 add_to_table << <60, 256 >> >(dev_keys, dev_values, table, dev_lock); 165 166 cudaEventRecord(stop, 0); 167 cudaEventSynchronize(stop); 168 float elapsedTime; 169 cudaEventElapsedTime(&elapsedTime, start, stop); 170 printf("Time to hash: %3.1f ms\n", elapsedTime); 171 172 verify_table(table); 173 free_table(table); 174 175 cudaEventDestroy(start); 176 cudaEventDestroy(stop); 177 free_table(table); 178 cudaFree(dev_lock); 179 cudaFree(dev_keys); 180 cudaFree(dev_values); 181 free(buffer); 182 getchar(); 183 return 0; 184 }