Skip to content

Commit

Permalink
Merge branch 'wkg-docs' into 'master'
Browse files Browse the repository at this point in the history
Update Wukong+G docs and fix compile errors

Closes #11

See merge request opensource/wukong!77
  • Loading branch information
realstolz committed Nov 10, 2019
2 parents 6acef0e + 29a6e2f commit c1646d2
Show file tree
Hide file tree
Showing 14 changed files with 106 additions and 56 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ For more details see [Wukong Project](http://ipads.se.sjtu.edu.cn/projects/wukon
* [Installation](docs/INSTALL.md)
* [Tutorials](docs/TUTORIALS.md)
* [Q&A](docs/QA.md)
* [GPU extension](docs/gpu/TUTORIALS.md)


## License
Expand Down
5 changes: 5 additions & 0 deletions core/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,11 @@ void load_config(string fname, int nsrvs)
#ifdef USE_GPU
// each GPU card needs one (dedicated) agent thread
Global::num_threads += Global::num_gpus;
if (Global::num_gpus != 1) {
logstream(LOG_ERROR) << "Wrong config: please config num_gpus with 1 to enable GPU extension."
<< LOG_endl;
exit(-1);
}
#endif

// limited the number of engines
Expand Down
56 changes: 28 additions & 28 deletions core/global.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,46 +31,46 @@ class Global {
// another choice
// e.g., static int &num_threads() { static int _num_threads = 2; return _num_threads; }

static int num_servers;
static int num_threads;
static int num_servers __attribute__((weak));
static int num_threads __attribute__((weak));

static int num_proxies;
static int num_engines;
static int num_proxies __attribute__((weak));
static int num_engines __attribute__((weak));

static string input_folder;
static string input_folder __attribute__((weak));

static int data_port_base;
static int ctrl_port_base;
static int data_port_base __attribute__((weak));
static int ctrl_port_base __attribute__((weak));

static int rdma_buf_size_mb;
static int rdma_rbf_size_mb;
static int rdma_buf_size_mb __attribute__((weak));
static int rdma_rbf_size_mb __attribute__((weak));

static bool use_rdma;
static int rdma_threshold;
static bool use_rdma __attribute__((weak));
static int rdma_threshold __attribute__((weak));

static int mt_threshold;
static int mt_threshold __attribute__((weak));

static bool enable_caching;
static bool enable_workstealing;
static int stealing_pattern;
static bool enable_caching __attribute__((weak));
static bool enable_workstealing __attribute__((weak));
static int stealing_pattern __attribute__((weak));

static bool silent;
static bool silent __attribute__((weak));

static bool enable_planner;
static bool generate_statistics;
static bool enable_planner __attribute__((weak));
static bool generate_statistics __attribute__((weak));

static bool enable_vattr;
static bool enable_vattr __attribute__((weak));

static int memstore_size_gb;
static int est_load_factor;
static int memstore_size_gb __attribute__((weak));
static int est_load_factor __attribute__((weak));

static int num_gpus;
static int gpu_kvcache_size_gb;
static int gpu_rbuf_size_mb;
static int gpu_rdma_buf_size_mb;
static int gpu_key_blk_size_mb;
static int gpu_value_blk_size_mb;
static bool gpu_enable_pipeline;
static int num_gpus __attribute__((weak));
static int gpu_kvcache_size_gb __attribute__((weak));
static int gpu_rbuf_size_mb __attribute__((weak));
static int gpu_rdma_buf_size_mb __attribute__((weak));
static int gpu_key_blk_size_mb __attribute__((weak));
static int gpu_value_blk_size_mb __attribute__((weak));
static bool gpu_enable_pipeline __attribute__((weak));
};


Expand Down
8 changes: 5 additions & 3 deletions core/gpu/gpu_agent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,10 @@ class GPUAgent {
Bundle b(req);
if (adaptor->send(dst_sid, dst_tid, b)) {
// #2 send result buffer
adaptor->send_dev2host(dst_sid, dst_tid, req.result.gpu.rbuf(),
WUKONG_GPU_ELEM_SIZE * req.result.gpu.rbuf_num_elems());
if (req.result.gpu.rbuf_num_elems() > 0) {
adaptor->send_dev2host(dst_sid, dst_tid, req.result.gpu.rbuf(),
WUKONG_GPU_ELEM_SIZE * req.result.gpu.rbuf_num_elems());
}
return true;
}

Expand Down Expand Up @@ -152,7 +154,7 @@ class GPUAgent {
// fork-join or in-place execution
bool need_fork_join(SPARQLQuery &req) {
// always need NOT fork-join when executing on single machine
if (Global::num_serverss == 1) return false;
if (Global::num_servers == 1) return false;

// always need fork-join mode w/o RDMA
if (!Global::use_rdma) return true;
Expand Down
3 changes: 2 additions & 1 deletion core/gpu/gpu_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
// utils
#include "unit.hpp"
#include "gpu.hpp"
#include "global.hpp"

using namespace std;

Expand Down Expand Up @@ -287,7 +288,7 @@ class GPUCache {
// step 4.2 traverse the ext_bucket_list and load
uint64_t passed_buckets = 0;

for (int i = 0; i < rdf_metas[seg].ext_bucket_list.size(); i++) {
for (int i = 0; i < rdf_metas[seg].get_ext_bucket_list_size(); i++) {
ext_bucket_extent_t ext = rdf_metas[seg].ext_bucket_list[i];
/* load from this ext
* inside_off: the offset inside the ext
Expand Down
1 change: 0 additions & 1 deletion core/gpu/gpu_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <boost/unordered_map.hpp>
#include <vector>

#include "config.hpp"
#include "type.hpp"
#include "dgraph.hpp"
#include "query.hpp"
Expand Down
9 changes: 5 additions & 4 deletions core/gpu/gpu_engine_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <vector>
#include <utility>

#include "global.hpp"
#include "assertion.hpp"
#include "query.hpp"

Expand Down Expand Up @@ -126,7 +127,7 @@ class GPUEngineCuda final {
param.query.var2col_start = req.result.var2col(start);

logstream(LOG_DEBUG) << "known_to_unknown: #ext_buckets: "
<< seg_meta.ext_bucket_list.size() << LOG_endl;
<< seg_meta.get_ext_bucket_list_size() << LOG_endl;

ASSERT(gmem->res_inbuf() != gmem->res_outbuf());
ASSERT(nullptr != gmem->res_inbuf());
Expand All @@ -139,7 +140,7 @@ class GPUEngineCuda final {


// prefetch segment of next pattern
if (global_gpu_enable_pipeline && has_next_pattern(req)) {
if (Global::gpu_enable_pipeline && has_next_pattern(req)) {
auto next_seg = pattern_to_segid(req, req.pattern_step + 1);
auto stream2 = stream_pool->get_stream(next_seg.pid);

Expand Down Expand Up @@ -227,7 +228,7 @@ class GPUEngineCuda final {


// preload next predicate
if (global_gpu_enable_pipeline && has_next_pattern(req)) {
if (Global::gpu_enable_pipeline && has_next_pattern(req)) {
auto next_seg = pattern_to_segid(req, req.pattern_step + 1);
auto stream2 = stream_pool->get_stream(next_seg.pid);

Expand Down Expand Up @@ -308,7 +309,7 @@ class GPUEngineCuda final {


// preload next predicate
if (global_gpu_enable_pipeline && has_next_pattern(req)) {
if (Global::gpu_enable_pipeline && has_next_pattern(req)) {
auto next_seg = pattern_to_segid(req, req.pattern_step + 1);
auto stream2 = stream_pool->get_stream(next_seg.pid);

Expand Down
1 change: 1 addition & 0 deletions core/gpu/gpu_hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "store/vertex.hpp"

// utils
#include "global.hpp"
#include "gpu.hpp"
#include "unit.hpp"

Expand Down
2 changes: 1 addition & 1 deletion core/gpu/gpu_mem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@

#ifdef USE_GPU

#include "global.hpp"
#include "rdma.hpp"
#include "type.hpp"

// utils
#include "unit.hpp"
#include "gpu.hpp"
#include "global.hpp"

class GPUMem {
private:
Expand Down
20 changes: 16 additions & 4 deletions core/store/gstore.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,14 @@ class GStore {
// allocate buckets in indirect-header region to segments
// #buckets : #extended buckets = 1 : 0.15
if (seg.num_buckets > 0) {
uint64_t start_off = alloc_ext_buckets(EXT_BUCKET_EXTENT_LEN);
seg.add_ext_buckets(ext_bucket_extent_t(EXT_BUCKET_EXTENT_LEN, start_off));
uint64_t nbuckets = 0;
#ifdef USE_GPU
nbuckets = EXT_BUCKET_EXTENT_LEN(seg.num_buckets);
#else
nbuckets = EXT_BUCKET_EXTENT_LEN;
#endif
uint64_t start_off = alloc_ext_buckets(nbuckets);
seg.add_ext_buckets(ext_bucket_extent_t(nbuckets, start_off));
}
}

Expand Down Expand Up @@ -826,8 +832,14 @@ class GStore {
rdf_seg_meta_t &seg = rdf_seg_meta_map[segid_t(key)];
uint64_t ext_bucket_id = seg.get_ext_bucket();
if (ext_bucket_id == 0) {
uint64_t start_off = alloc_ext_buckets(EXT_BUCKET_EXTENT_LEN);
seg.add_ext_buckets(ext_bucket_extent_t(EXT_BUCKET_EXTENT_LEN, start_off));
uint64_t nbuckets = 0;
#ifdef USE_GPU
nbuckets = EXT_BUCKET_EXTENT_LEN(seg.num_buckets);
#else
nbuckets = EXT_BUCKET_EXTENT_LEN;
#endif
uint64_t start_off = alloc_ext_buckets(nbuckets);
seg.add_ext_buckets(ext_bucket_extent_t(nbuckets, start_off));
ext_bucket_id = seg.get_ext_bucket();
}
pthread_spin_unlock(&seg_ext_locks[seg_ext_lock_id]);
Expand Down
43 changes: 35 additions & 8 deletions core/store/meta.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,12 @@
using namespace std;
using namespace boost::archive;


#ifdef USE_GPU
#define EXT_BUCKET_LIST_CAPACITY 1
#define EXT_BUCKET_EXTENT_LEN(num_buckets) (num_buckets * 15 / 100 + 1)
#else
#define EXT_BUCKET_EXTENT_LEN 256
#endif
#define PREDICATE_NSEGS 2
#ifdef VERSATILE
#define INDEX_NSEGS 4 // index(2) + vid's all preds(2)
Expand Down Expand Up @@ -75,15 +79,39 @@ struct rdf_seg_meta_t {
uint64_t num_keys = 0; // #keys of the segment
uint64_t num_buckets = 0; // allocated main headers (hash space)
uint64_t bucket_start = 0; // start offset of main-header region of gstore
vector<ext_bucket_extent_t> ext_bucket_list;
uint64_t num_edges = 0; // #edges of the segment
uint64_t edge_start = 0; // start offset in the entry region of gstore

int num_key_blks = 0; // #key-blocks needed in gcache
int num_value_blks = 0; // #value-blocks needed in gcache

#ifdef USE_GPU
ext_bucket_extent_t ext_bucket_list[EXT_BUCKET_LIST_CAPACITY];
size_t ext_bucket_list_sz = 0;

rdf_seg_meta_t() {
memset(&ext_bucket_list, 0, sizeof(ext_bucket_list));
}

size_t get_ext_bucket_list_size() const { return ext_bucket_list_sz; }

void add_ext_buckets(const ext_bucket_extent_t &ext) {
assert(ext_bucket_list_sz < EXT_BUCKET_LIST_CAPACITY);
ext_bucket_list[ext_bucket_list_sz++] = ext;
}
#else
vector<ext_bucket_extent_t> ext_bucket_list;

size_t get_ext_bucket_list_size() const { return ext_bucket_list.size(); }

void add_ext_buckets(const ext_bucket_extent_t &ext) {
ext_bucket_list.push_back(ext);
}

#endif

uint64_t get_ext_bucket() {
for (int i = 0; i < ext_bucket_list.size(); ++i) {
for (int i = 0; i < get_ext_bucket_list_size(); ++i) {
ext_bucket_extent_t &ext = ext_bucket_list[i];
if (ext.off < ext.num_ext_buckets) {
return ext.start + ext.off++;
Expand All @@ -92,13 +120,9 @@ struct rdf_seg_meta_t {
return 0;
}

void add_ext_buckets(const ext_bucket_extent_t &ext) {
ext_bucket_list.push_back(ext);
}

inline uint64_t get_total_num_buckets() const {
uint64_t total = num_buckets;
for (int i = 0; i < ext_bucket_list.size(); ++i) {
for (int i = 0; i < get_ext_bucket_list_size(); ++i) {
total += ext_bucket_list[i].num_ext_buckets;
}
return total;
Expand All @@ -109,6 +133,9 @@ struct rdf_seg_meta_t {
ar & num_buckets;
ar & bucket_start;
ar & ext_bucket_list;
#ifdef USE_GPU
ar & ext_bucket_list_sz;
#endif
ar & num_edges;
ar & edge_start;
}
Expand Down
3 changes: 3 additions & 0 deletions core/store/vertex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
#pragma once
#include <sstream>

// definitions of "__host__" and "__device__"
#include <cuda_runtime.h>

// utils
#include "math.hpp"

Expand Down
8 changes: 4 additions & 4 deletions docs/gpu/tutorials.md → docs/gpu/TUTORIALS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export CPATH=${CUDA_HOME}/include:$CPATH

And edit the **CMakeLists.txt** in the `$WUKONG_ROOT` to enable the `USE_GPU` option. Then go to `$WUKONG_ROOT/scripts` and run `./build.sh` to build Wukong+G. Or you can just run `./build.sh -DUSE_GPU=ON` to build with GPU support. Currently, the `USE_GPU` option **conflicts with** `USE_JEMALLOC`, `USE_DYNAMIC_GSTORE` and `USE_VERSATILE`.

Next, we need to install the [kernel module for GPUDirect RDMA](http://www.mellanox.com/page/products_dyn?product_family=295&mtag=gpudirect). Download it from the web and install it to **each server** in the cluster.
Next, we need to install the [kernel module for GPUDirect RDMA](http://www.mellanox.com/page/products_dyn?product_family=295&mtag=gpudirect). Download it from the web and install it to **each machine** in the cluster.

```bash
tar xzf nvidia-peer-memory-1.0-3.tar.gz
Expand Down Expand Up @@ -140,10 +140,10 @@ wukong>
Although Wukong+G can notably speed up query processing, there is still plenty of room for improvement:

- We only tested Wukong+G in an RDMA-capable cluster with NVIDIA Tesla K40m and CUDA 8.0.
- Wukong+G assumes the predicate of triple patterns in a query is known, queries with unknown predicates cannot be handled.
- If a query produces huge intermediate result that exceeds the size of result buffer on GPU (``global_gpu_rbuf_size_mb``), Wukong+G cannot handle it.
- Wukong+G assumes the predicate of each triple pattern in a query is known; queries with unknown predicates cannot be handled by the GPU engine.
- If a query produces a huge intermediate result that exceeds the size of the result buffer on GPU (``global_gpu_rbuf_size_mb``), Wukong+G cannot handle it (it may crash or return wrong results).
- If the triples of a predicate cannot fit into the key-value cache on GPU, Wukong+G cannot handle that predicate.
- If pipeline is enabled, there should be enough GPU memory to accommodate two triple patterns, the current pattern and the next pattern.
- If pipeline is enabled, there should be enough GPU memory to accommodate two triple patterns, the current pattern and the prefetched pattern.



2 changes: 0 additions & 2 deletions utils/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@

#pragma once

#include "global.hpp"

#ifdef USE_GPU
#include <cuda_runtime.h>

Expand Down

0 comments on commit c1646d2

Please sign in to comment.