Skip to content

Commit

Permalink
Merge branch 'wkg-docs' into 'master'
Browse files Browse the repository at this point in the history
Update Wukong+G docs and fix compile errors

Closes #11

See merge request opensource/wukong!77
  • Loading branch information
realstolz committed Nov 10, 2019
2 parents 6acef0e + 29a6e2f commit c1646d2
Show file tree
Hide file tree
Showing 14 changed files with 106 additions and 56 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ For more details see [Wukong Project](http://ipads.se.sjtu.edu.cn/projects/wukon
* [Installation](docs/INSTALL.md)
* [Tutorials](docs/TUTORIALS.md)
* [Q&A](docs/QA.md)
* [GPU extension](docs/gpu/TUTORIALS.md)


## License
Expand Down
5 changes: 5 additions & 0 deletions core/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,11 @@ void load_config(string fname, int nsrvs)
#ifdef USE_GPU
// each GPU card needs one (dedicated) agent thread
Global::num_threads += Global::num_gpus;
if (Global::num_gpus != 1) {
logstream(LOG_ERROR) << "Wrong config: please config num_gpus with 1 to enable GPU extension."
<< LOG_endl;
exit(-1);
}
#endif

// limited the number of engines
Expand Down
56 changes: 28 additions & 28 deletions core/global.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,46 +31,46 @@ class Global {
// another choice
// e.g., static int &num_threads() { static int _num_threads = 2; return _num_threads; }

static int num_servers;
static int num_threads;
static int num_servers __attribute__((weak));
static int num_threads __attribute__((weak));

static int num_proxies;
static int num_engines;
static int num_proxies __attribute__((weak));
static int num_engines __attribute__((weak));

static string input_folder;
static string input_folder __attribute__((weak));

static int data_port_base;
static int ctrl_port_base;
static int data_port_base __attribute__((weak));
static int ctrl_port_base __attribute__((weak));

static int rdma_buf_size_mb;
static int rdma_rbf_size_mb;
static int rdma_buf_size_mb __attribute__((weak));
static int rdma_rbf_size_mb __attribute__((weak));

static bool use_rdma;
static int rdma_threshold;
static bool use_rdma __attribute__((weak));
static int rdma_threshold __attribute__((weak));

static int mt_threshold;
static int mt_threshold __attribute__((weak));

static bool enable_caching;
static bool enable_workstealing;
static int stealing_pattern;
static bool enable_caching __attribute__((weak));
static bool enable_workstealing __attribute__((weak));
static int stealing_pattern __attribute__((weak));

static bool silent;
static bool silent __attribute__((weak));

static bool enable_planner;
static bool generate_statistics;
static bool enable_planner __attribute__((weak));
static bool generate_statistics __attribute__((weak));

static bool enable_vattr;
static bool enable_vattr __attribute__((weak));

static int memstore_size_gb;
static int est_load_factor;
static int memstore_size_gb __attribute__((weak));
static int est_load_factor __attribute__((weak));

static int num_gpus;
static int gpu_kvcache_size_gb;
static int gpu_rbuf_size_mb;
static int gpu_rdma_buf_size_mb;
static int gpu_key_blk_size_mb;
static int gpu_value_blk_size_mb;
static bool gpu_enable_pipeline;
static int num_gpus __attribute__((weak));
static int gpu_kvcache_size_gb __attribute__((weak));
static int gpu_rbuf_size_mb __attribute__((weak));
static int gpu_rdma_buf_size_mb __attribute__((weak));
static int gpu_key_blk_size_mb __attribute__((weak));
static int gpu_value_blk_size_mb __attribute__((weak));
static bool gpu_enable_pipeline __attribute__((weak));
};


Expand Down
8 changes: 5 additions & 3 deletions core/gpu/gpu_agent.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,10 @@ class GPUAgent {
Bundle b(req);
if (adaptor->send(dst_sid, dst_tid, b)) {
// #2 send result buffer
adaptor->send_dev2host(dst_sid, dst_tid, req.result.gpu.rbuf(),
WUKONG_GPU_ELEM_SIZE * req.result.gpu.rbuf_num_elems());
if (req.result.gpu.rbuf_num_elems() > 0) {
adaptor->send_dev2host(dst_sid, dst_tid, req.result.gpu.rbuf(),
WUKONG_GPU_ELEM_SIZE * req.result.gpu.rbuf_num_elems());
}
return true;
}

Expand Down Expand Up @@ -152,7 +154,7 @@ class GPUAgent {
// fork-join or in-place execution
bool need_fork_join(SPARQLQuery &req) {
// always need NOT fork-join when executing on single machine
if (Global::num_serverss == 1) return false;
if (Global::num_servers == 1) return false;

// always need fork-join mode w/o RDMA
if (!Global::use_rdma) return true;
Expand Down
3 changes: 2 additions & 1 deletion core/gpu/gpu_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
// utils
#include "unit.hpp"
#include "gpu.hpp"
#include "global.hpp"

using namespace std;

Expand Down Expand Up @@ -287,7 +288,7 @@ class GPUCache {
// step 4.2 traverse the ext_bucket_list and load
uint64_t passed_buckets = 0;

for (int i = 0; i < rdf_metas[seg].ext_bucket_list.size(); i++) {
for (int i = 0; i < rdf_metas[seg].get_ext_bucket_list_size(); i++) {
ext_bucket_extent_t ext = rdf_metas[seg].ext_bucket_list[i];
/* load from this ext
* inside_off: the offset inside the ext
Expand Down
1 change: 0 additions & 1 deletion core/gpu/gpu_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <boost/unordered_map.hpp>
#include <vector>

#include "config.hpp"
#include "type.hpp"
#include "dgraph.hpp"
#include "query.hpp"
Expand Down
9 changes: 5 additions & 4 deletions core/gpu/gpu_engine_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <vector>
#include <utility>

#include "global.hpp"
#include "assertion.hpp"
#include "query.hpp"

Expand Down Expand Up @@ -126,7 +127,7 @@ class GPUEngineCuda final {
param.query.var2col_start = req.result.var2col(start);

logstream(LOG_DEBUG) << "known_to_unknown: #ext_buckets: "
<< seg_meta.ext_bucket_list.size() << LOG_endl;
<< seg_meta.get_ext_bucket_list_size() << LOG_endl;

ASSERT(gmem->res_inbuf() != gmem->res_outbuf());
ASSERT(nullptr != gmem->res_inbuf());
Expand All @@ -139,7 +140,7 @@ class GPUEngineCuda final {


// prefetch segment of next pattern
if (global_gpu_enable_pipeline && has_next_pattern(req)) {
if (Global::gpu_enable_pipeline && has_next_pattern(req)) {
auto next_seg = pattern_to_segid(req, req.pattern_step + 1);
auto stream2 = stream_pool->get_stream(next_seg.pid);

Expand Down Expand Up @@ -227,7 +228,7 @@ class GPUEngineCuda final {


// preload next predicate
if (global_gpu_enable_pipeline && has_next_pattern(req)) {
if (Global::gpu_enable_pipeline && has_next_pattern(req)) {
auto next_seg = pattern_to_segid(req, req.pattern_step + 1);
auto stream2 = stream_pool->get_stream(next_seg.pid);

Expand Down Expand Up @@ -308,7 +309,7 @@ class GPUEngineCuda final {


// preload next predicate
if (global_gpu_enable_pipeline && has_next_pattern(req)) {
if (Global::gpu_enable_pipeline && has_next_pattern(req)) {
auto next_seg = pattern_to_segid(req, req.pattern_step + 1);
auto stream2 = stream_pool->get_stream(next_seg.pid);

Expand Down
1 change: 1 addition & 0 deletions core/gpu/gpu_hash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "store/vertex.hpp"

// utils
#include "global.hpp"
#include "gpu.hpp"
#include "unit.hpp"

Expand Down
2 changes: 1 addition & 1 deletion core/gpu/gpu_mem.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@

#ifdef USE_GPU

#include "global.hpp"
#include "rdma.hpp"
#include "type.hpp"

// utils
#include "unit.hpp"
#include "gpu.hpp"
#include "global.hpp"

class GPUMem {
private:
Expand Down
20 changes: 16 additions & 4 deletions core/store/gstore.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,14 @@ class GStore {
// allocate buckets in indirect-header region to segments
// #buckets : #extended buckets = 1 : 0.15
if (seg.num_buckets > 0) {
uint64_t start_off = alloc_ext_buckets(EXT_BUCKET_EXTENT_LEN);
seg.add_ext_buckets(ext_bucket_extent_t(EXT_BUCKET_EXTENT_LEN, start_off));
uint64_t nbuckets = 0;
#ifdef USE_GPU
nbuckets = EXT_BUCKET_EXTENT_LEN(seg.num_buckets);
#else
nbuckets = EXT_BUCKET_EXTENT_LEN;
#endif
uint64_t start_off = alloc_ext_buckets(nbuckets);
seg.add_ext_buckets(ext_bucket_extent_t(nbuckets, start_off));
}
}

Expand Down Expand Up @@ -826,8 +832,14 @@ class GStore {
rdf_seg_meta_t &seg = rdf_seg_meta_map[segid_t(key)];
uint64_t ext_bucket_id = seg.get_ext_bucket();
if (ext_bucket_id == 0) {
uint64_t start_off = alloc_ext_buckets(EXT_BUCKET_EXTENT_LEN);
seg.add_ext_buckets(ext_bucket_extent_t(EXT_BUCKET_EXTENT_LEN, start_off));
uint64_t nbuckets = 0;
#ifdef USE_GPU
nbuckets = EXT_BUCKET_EXTENT_LEN(seg.num_buckets);
#else
nbuckets = EXT_BUCKET_EXTENT_LEN;
#endif
uint64_t start_off = alloc_ext_buckets(nbuckets);
seg.add_ext_buckets(ext_bucket_extent_t(nbuckets, start_off));
ext_bucket_id = seg.get_ext_bucket();
}
pthread_spin_unlock(&seg_ext_locks[seg_ext_lock_id]);
Expand Down
43 changes: 35 additions & 8 deletions core/store/meta.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,12 @@
using namespace std;
using namespace boost::archive;


#ifdef USE_GPU
#define EXT_BUCKET_LIST_CAPACITY 1
#define EXT_BUCKET_EXTENT_LEN(num_buckets) (num_buckets * 15 / 100 + 1)
#else
#define EXT_BUCKET_EXTENT_LEN 256
#endif
#define PREDICATE_NSEGS 2
#ifdef VERSATILE
#define INDEX_NSEGS 4 // index(2) + vid's all preds(2)
Expand Down Expand Up @@ -75,15 +79,39 @@ struct rdf_seg_meta_t {
uint64_t num_keys = 0; // #keys of the segment
uint64_t num_buckets = 0; // allocated main headers (hash space)
uint64_t bucket_start = 0; // start offset of main-header region of gstore
vector<ext_bucket_extent_t> ext_bucket_list;
uint64_t num_edges = 0; // #edges of the segment
uint64_t edge_start = 0; // start offset in the entry region of gstore

int num_key_blks = 0; // #key-blocks needed in gcache
int num_value_blks = 0; // #value-blocks needed in gcache

#ifdef USE_GPU
ext_bucket_extent_t ext_bucket_list[EXT_BUCKET_LIST_CAPACITY];
size_t ext_bucket_list_sz = 0;

rdf_seg_meta_t() {
memset(&ext_bucket_list, 0, sizeof(ext_bucket_list));
}

size_t get_ext_bucket_list_size() const { return ext_bucket_list_sz; }

void add_ext_buckets(const ext_bucket_extent_t &ext) {
assert(ext_bucket_list_sz < EXT_BUCKET_LIST_CAPACITY);
ext_bucket_list[ext_bucket_list_sz++] = ext;
}
#else
vector<ext_bucket_extent_t> ext_bucket_list;

size_t get_ext_bucket_list_size() const { return ext_bucket_list.size(); }

void add_ext_buckets(const ext_bucket_extent_t &ext) {
ext_bucket_list.push_back(ext);
}

#endif

uint64_t get_ext_bucket() {
for (int i = 0; i < ext_bucket_list.size(); ++i) {
for (int i = 0; i < get_ext_bucket_list_size(); ++i) {
ext_bucket_extent_t &ext = ext_bucket_list[i];
if (ext.off < ext.num_ext_buckets) {
return ext.start + ext.off++;
Expand All @@ -92,13 +120,9 @@ struct rdf_seg_meta_t {
return 0;
}

void add_ext_buckets(const ext_bucket_extent_t &ext) {
ext_bucket_list.push_back(ext);
}

inline uint64_t get_total_num_buckets() const {
uint64_t total = num_buckets;
for (int i = 0; i < ext_bucket_list.size(); ++i) {
for (int i = 0; i < get_ext_bucket_list_size(); ++i) {
total += ext_bucket_list[i].num_ext_buckets;
}
return total;
Expand All @@ -109,6 +133,9 @@ struct rdf_seg_meta_t {
ar & num_buckets;
ar & bucket_start;
ar & ext_bucket_list;
#ifdef USE_GPU
ar & ext_bucket_list_sz;
#endif
ar & num_edges;
ar & edge_start;
}
Expand Down
3 changes: 3 additions & 0 deletions core/store/vertex.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
#pragma once
#include <sstream>

// definitions of "__host__" and "__device__"
#include <cuda_runtime.h>

// utils
#include "math.hpp"

Expand Down
8 changes: 4 additions & 4 deletions docs/gpu/tutorials.md → docs/gpu/TUTORIALS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export CPATH=${CUDA_HOME}/include:$CPATH

And edit the **CMakeLists.txt** in the `$WUKONG_ROOT` to enable the `USE_GPU` option. Then go to `$WUKONG_ROOT/scripts` and run `./build.sh` to build Wukong+G. Or you can just run `./build.sh -DUSE_GPU=ON` to build with GPU support. Currently, the `USE_GPU` option **conflicts with** `USE_JEMALLOC`, `USE_DYNAMIC_GSTORE` and `USE_VERSATILE`.

Next, we need to install the [kernel module for GPUDirect RDMA](http://www.mellanox.com/page/products_dyn?product_family=295&mtag=gpudirect). Download it from the web and install it to **each server** in the cluster.
Next, we need to install the [kernel module for GPUDirect RDMA](http://www.mellanox.com/page/products_dyn?product_family=295&mtag=gpudirect). Download it from the web and install it to **each machine** in the cluster.

```bash
tar xzf nvidia-peer-memory-1.0-3.tar.gz
Expand Down Expand Up @@ -140,10 +140,10 @@ wukong>
Although Wukong+G can notably speed up query processing, there is still plenty of room for improvement:

- We only tested Wukong+G in an RDMA-capable cluster with NVIDIA Tesla K40m and CUDA 8.0.
- Wukong+G assumes the predicate of triple patterns in a query is known, queries with unknown predicates cannot be handled.
- If a query produces huge intermediate result that exceeds the size of result buffer on GPU (``global_gpu_rbuf_size_mb``), Wukong+G cannot handle it.
- Wukong+G assumes the predicate of each triple pattern in a query is known; queries with unknown predicates cannot be handled by the GPU engine.
- If a query produces a huge intermediate result that exceeds the size of the result buffer on GPU (``global_gpu_rbuf_size_mb``), Wukong+G cannot handle it (it may crash or return wrong results).
- If the triples of a predicate cannot fit into the key-value cache on GPU, Wukong+G cannot handle that predicate.
- If pipeline is enabled, there should be enough GPU memory to accommodate two triple patterns, the current pattern and the next pattern.
- If pipeline is enabled, there should be enough GPU memory to accommodate two triple patterns, the current pattern and the prefetched pattern.



2 changes: 0 additions & 2 deletions utils/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@

#pragma once

#include "global.hpp"

#ifdef USE_GPU
#include <cuda_runtime.h>

Expand Down

0 comments on commit c1646d2

Please sign in to comment.