《GPU高性能编程CUDA实战》附录二散列表

2023年8月25日 728次阅读来源: 算法小白

▶ 使用CPU和GPU分别实现散列表

● CPU方法

 1 #include <stdio.h>
 2 #include <time.h>
 3 #include "cuda_runtime.h"
 4 #include "D:\Code\CUDA\book\common\book.h"
 5 
 6 #define SIZE            (100*1024*1024)
 7 #define ELEMENTS        (SIZE / sizeof(unsigned int))
 8 #define HASH_ENTRIES    (1024)
 9 
10 
11 struct Entry
12 {
13     unsigned int    key;
14     void            *value;
15     Entry           *next;
16 };
17 
18 struct Table
19 {
20     size_t  count;
21     Entry   **entries;
22     Entry   *pool;
23     Entry   *firstFree;
24 };
25 
26 
27 size_t hash(unsigned int key, size_t count)
28 {
29     return key % count;
30 }
31 
32 void initialize_table(Table &table, int entries, int elements)
33 {
34     table.count = entries;
35     table.entries = (Entry**)calloc(entries, sizeof(Entry*));
36     table.pool = (Entry*)malloc(elements * sizeof(Entry));
37     table.firstFree = table.pool;
38 }
39 
40 void free_table(Table &table)
41 {
42     free(table.entries);
43     free(table.pool);
44 }
45 
46 void add_to_table(Table &table, unsigned int key, void *value)
47 {
48     size_t hashValue = hash(key, table.count);
49     Entry *location = table.firstFree++;
50     location->key = key;
51     location->value = value;
52     location->next = table.entries[hashValue];// 插到该分支的头部而不是尾部
53     table.entries[hashValue] = location;
54 }
55 
56 void verify_table(const Table &table)
57 {
58     int count = 0;
59     for (size_t i = 0; i<table.count; i++)
60     {
61         Entry   *current = table.entries[i];
62         while (current != NULL)
63         {
64             ++count;
65             if (hash(current->key, table.count) != i)
66                 printf("\n\t%d hashed to %ld, but was located at %ld\n", current->key, hash(current->key, table.count), i);
67             current = current->next;
68         }
69     }
70     if (count != ELEMENTS)
71         printf("\n\t%d elements found in hash table.  Should be %ld\n",
72             count, ELEMENTS);
73     else
74         printf("\n\tAll %d elements found in hash table.\n", count);
75 }
76 
77 int main(void)
78 {
79     unsigned int *buffer =(unsigned int*)big_random_block(SIZE);
80     Table table;
81     clock_t start, stop;
82 
83     initialize_table(table, HASH_ENTRIES, ELEMENTS);
84     
85     start = clock();
86     for (int i = 0; i<ELEMENTS; i++)
87         add_to_table(table, buffer[i], (void*)NULL);
88     
89     stop = clock();
90     printf("\n\tBuilding the table:  %3.1f ms\n", (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f);
91 
92     verify_table(table);
93     free_table(table);
94     free(buffer);
95     getchar();
96     return 0;
97 }

● GPU方法（用到了前面的原子锁）

  1 #include <stdio.h>
  2 #include <time.h>
  3 #include "cuda_runtime.h"
  4 #include "device_launch_parameters.h"
  5 #include "cuda.h"
  6 #include "D:\Code\CUDA\book\common\book.h"
  7 
  8 #define SIZE            (100*1024*1024)
  9 #define ELEMENTS        (SIZE / sizeof(unsigned int))
 10 #define HASH_ENTRIES    (1024)
 11 
 12 struct Lock
 13 {
 14     int *mutex;
 15     Lock(void)
 16     {
 17         int state = 0;
 18         cudaMalloc((void **)&mutex, sizeof(int));
 19         cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
 20     }
 21     ~Lock(void)
 22     {
 23         cudaFree(mutex);
 24     }
 25     __device__ void lock(void)
 26     {
 27         while (atomicCAS(mutex, 0, 1) != 0);
 28     }
 29     __device__ void unlock(void)
 30     {
 31         atomicExch(mutex, 0);
 32     }
 33 };
 34 
 35 struct Entry
 36 {
 37     unsigned int    key;
 38     void            *value;
 39     Entry           *next;
 40 };
 41 
 42 struct Table
 43 {
 44     size_t  count;
 45     Entry   **entries;
 46     Entry   *pool;
 47     Entry   *firstFree;
 48 };
 49 
 50 __device__ __host__ size_t hash(unsigned int key, size_t count)
 51 {
 52     return key % count;
 53 }
 54 
 55 void initialize_table(Table &table, int entries, int elements)
 56 {
 57     table.count = entries;
 58     cudaMalloc((void**)&table.entries, entries * sizeof(Entry*));
 59     cudaMemset(table.entries, 0, entries * sizeof(Entry*));
 60     cudaMalloc((void**)&table.pool, elements * sizeof(Entry));
 61 }
 62 
 63 void free_table(Table &table)
 64 {
 65     cudaFree(table.entries);
 66     cudaFree(table.pool);
 67 }
 68 
 69 __global__ void add_to_table(unsigned int *keys, void **values, Table table, Lock *lock)
 70 // 锁数组用于锁定散列表中的每一个桶
 71 {
 72     int tid = threadIdx.x + blockIdx.x * blockDim.x;
 73     int stride = blockDim.x * gridDim.x;
 74     while (tid < ELEMENTS)
 75     {
 76         unsigned int key = keys[tid];
 77         size_t hashValue = hash(key, table.count);
 78         for (int i = 0; i<32; i++)// 利用循环来分散线程束，使同一线程束中的32个线程在循环的不同次数时进行写入
 79         {
 80             if ((tid % 32) == i)
 81             {
 82                 Entry *location = &(table.pool[tid]);
 83                 location->key = key;
 84                 location->value = values[tid];
 85                 lock[hashValue].lock();
 86                 location->next = table.entries[hashValue];
 87                 table.entries[hashValue] = location;
 88                 lock[hashValue].unlock();
 89             }
 90         }
 91         tid += stride;
 92     }
 93 }
 94 
 95 void copy_table_to_host(const Table &table, Table &hostTable)
 96 {
 97     hostTable.count = table.count;
 98     hostTable.entries = (Entry**)calloc(table.count, sizeof(Entry*));
 99     hostTable.pool = (Entry*)malloc(ELEMENTS * sizeof(Entry));
100 
101     cudaMemcpy(hostTable.entries, table.entries, table.count * sizeof(Entry*), cudaMemcpyDeviceToHost);
102     cudaMemcpy(hostTable.pool, table.pool, ELEMENTS * sizeof(Entry), cudaMemcpyDeviceToHost);
103 
104     for (int i = 0; i < table.count; i++)
105     {
106         if (hostTable.entries[i] != NULL)
107             hostTable.entries[i] = (Entry*)((size_t)hostTable.entries[i] - (size_t)table.pool + (size_t)hostTable.pool);
108         // 从从显存到内存的地址线性偏移 x - adressGPU + addressCPU
109     }
110     for (int i = 0; i < ELEMENTS; i++)
111     {
112         if (hostTable.pool[i].next != NULL)
113             hostTable.pool[i].next = (Entry*)((size_t)hostTable.pool[i].next - (size_t)table.pool + (size_t)hostTable.pool);
114         // 同样是做偏移，但是要找到下一个元素的地址
115     }
116 }
117 
118 void verify_table(const Table &dev_table)
119 {
120     Table   table;
121     copy_table_to_host(dev_table, table);
122 
123     int count = 0;
124     for (size_t i = 0; i < table.count; i++)
125     {
126         Entry   *current = table.entries[i];
127         while (current != NULL)
128         {
129             ++count;
130             if (hash(current->key, table.count) != i)
131                 printf("%d hashed to %ld, but was located at %ld\n", current->key, hash(current->key, table.count), i);
132             current = current->next;
133         }
134     }
135     if (count != ELEMENTS)
136         printf("%d elements found in hash table.  Should be %ld\n", count, ELEMENTS);
137     else
138         printf("All %d elements found in hash table.\n", count);
139 }
140 
141 int main(void)
142 {
143     unsigned int *buffer = (unsigned int*)big_random_block(SIZE);
144 
145     unsigned int *dev_keys;
146     void         **dev_values;
147     cudaMalloc((void**)&dev_keys, SIZE);
148     cudaMalloc((void**)&dev_values, SIZE);
149     cudaMemcpy(dev_keys, buffer, SIZE, cudaMemcpyHostToDevice);
150 
151     Table table;
152     initialize_table(table, HASH_ENTRIES, ELEMENTS);
153 
154     Lock    lock[HASH_ENTRIES];// 准备锁列表
155     Lock    *dev_lock;
156     cudaMalloc((void**)&dev_lock, HASH_ENTRIES * sizeof(Lock));
157     cudaMemcpy(dev_lock, lock, HASH_ENTRIES * sizeof(Lock), cudaMemcpyHostToDevice);
158 
159     cudaEvent_t     start, stop;
160     cudaEventCreate(&start);
161     cudaEventCreate(&stop);
162     cudaEventRecord(start, 0);
163 
164     add_to_table << <60, 256 >> >(dev_keys, dev_values, table, dev_lock);
165 
166     cudaEventRecord(stop, 0);
167     cudaEventSynchronize(stop);
168     float   elapsedTime;
169     cudaEventElapsedTime(&elapsedTime, start, stop);
170     printf("Time to hash:  %3.1f ms\n", elapsedTime);
171 
172     verify_table(table);
173     free_table(table);
174 
175     cudaEventDestroy(start);
176     cudaEventDestroy(stop);
177     free_table(table);
178     cudaFree(dev_lock);
179     cudaFree(dev_keys);
180     cudaFree(dev_values);
181     free(buffer);
182     getchar();
183     return 0;
184 }

    原文作者：算法小白
    原文地址: https://www.cnblogs.com/cuancuancuanhao/p/7651789.html
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。