diff --git a/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_algo_wcc_small_world.gsql b/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_algo_wcc_small_world.gsql deleted file mode 100644 index 34cc48b2..00000000 --- a/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_algo_wcc_small_world.gsql +++ /dev/null @@ -1,76 +0,0 @@ -CREATE QUERY tg_wcc_small_world(STRING v_type, STRING e_type, UINT threshold = 100000, - BOOL to_show_cc_count=FALSE) SYNTAX V1 { - - ## - # This query detects weakly connected components based on the following paper: - # https://www.osti.gov/servlets/purl/1115145 - ## - SumAccum @sum_indegree; - SumAccum @sum_outdegree; - SumAccum @sum_degree_product; - MinAccum @min_cc_id; - OrAccum @or_visited; - MapAccum> @@CC_count_map; - - # 1. initialization - Vertices = {v_type}; - All_Vertices = Vertices; - - # 2. calculate the product of in degree and out degree - # and filter the vertices which have the product no less than the threshold - PivotCandidates = SELECT s - FROM Vertices:s - POST-ACCUM s.@sum_indegree = s.outdegree(e_type), - s.@sum_outdegree = s.outdegree(e_type), - s.@sum_degree_product = s.@sum_indegree * s.@sum_outdegree - HAVING s.@sum_degree_product >= threshold; - - - # 3. while PotentialPoviots set is not empty, select a pivot and find its CC - WHILE PivotCandidates.size() > 0 DO - # select an initial pivot vertex as the vertex in the graph - # that has the largest product of its in degree and out degree - Vertices = SELECT s - FROM PivotCandidates:s - ORDER BY s.@sum_degree_product DESC - LIMIT 1; - Vertices = SELECT s - FROM Vertices:s - POST-ACCUM s.@or_visited = TRUE, - s.@min_cc_id = getvid(s); - # with the chosen pivot we use BFS algorithm to find all elements in its connected component - WHILE Vertices.size() > 0 DO - Vertices = SELECT t - FROM Vertices:s-(e_type:e)-v_type:t - WHERE t.@or_visited == FALSE - ACCUM t.@min_cc_id = s.@min_cc_id - POST-ACCUM t.@or_visited += TRUE; - - END; - # remove the visited vertices from the PivotCandidates set - PivotCandidates = SELECT s - FROM PivotCandidates:s - WHERE s.@or_visited == FALSE; - END; - - # 4. 
take the remaining vertices and pass them all off to coloring - Vertices = SELECT s - FROM All_Vertices:s - WHERE s.@or_visited == FALSE - ACCUM s.@min_cc_id = getvid(s); - - WHILE Vertices.size() > 0 DO - Vertices = SELECT t - FROM Vertices:s-(e_type:e)-v_type:t - WHERE s.@min_cc_id < t.@min_cc_id - ACCUM t.@min_cc_id += s.@min_cc_id; - END; - - IF to_show_cc_count THEN - Vertices = {v_type}; - Vertices = SELECT s - FROM Vertices:s - POST-ACCUM @@CC_count_map += (s.@min_cc_id -> 1); - PRINT @@CC_count_map; - END; - } diff --git a/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_algo_wcc_small_world.yml b/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_algo_wcc_small_world.yml deleted file mode 100644 index b5292aa4..00000000 --- a/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_algo_wcc_small_world.yml +++ /dev/null @@ -1,16 +0,0 @@ ---- -# @ : this file -# {x} : the directory of file x -# &x : any file which matches pattern x -# ** : any depth of arbitrary directories -# * : wildcard character -# [x|y...|z] : file/directory x, y, or z - - algorithm: - name: Weakly Connected Components (Small World) - filename: "tg_wcc_small_world.gsql" - sha_id: ed6ea869749977cc0f3df71225d7325fb81c9767 - description: "This algorithm is optimized for finding connected components in small-world graphs, where the vast majority of the vertices in the graph are weakly connected to form a hub community." - version: lib3.0 - include: true - diff --git a/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_wcc_small_world.yml b/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_wcc_small_world.yml index 85c90372..c3687be6 100644 --- a/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_wcc_small_world.yml +++ b/algorithms/Community/connected_components/weakly_connected_components/small_world/tg_wcc_small_world.yml @@ -12,4 +12,4 @@ sha_id: ed6ea869749977cc0f3df71225d7325fb81c9767 description: "This algorithm is optimized for finding connected components in small-world graphs, where the vast majority of the vertices in the graph are weakly connected to form a hub community." 
version: lib3.0 - include: false + include: true diff --git a/algorithms/GraphML/Embeddings/Node2Vec/CHANGELOG.md b/algorithms/GraphML/Embeddings/Node2Vec/CHANGELOG.md deleted file mode 100644 index dfda85ce..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/CHANGELOG.md +++ /dev/null @@ -1,76 +0,0 @@ - -## lib3.0_211004 Node2Vec Change Logs - -### `tg_weighted_random_walk` - -> [`36713a9`](https://github.com/tigergraph/gsql-graph-algorithms/commit/36713a9882094e177456795cda8173faf2fc8ce2) Merge branch 'tigergraph:master' into master - -### `tg_random_udf` - -> [`36713a9`](https://github.com/tigergraph/gsql-graph-algorithms/commit/36713a9882094e177456795cda8173faf2fc8ce2) Merge branch 'tigergraph:master' into master - -### `tg_weighted_random_walk_batch` - -> [`36713a9`](https://github.com/tigergraph/gsql-graph-algorithms/commit/36713a9882094e177456795cda8173faf2fc8ce2) Merge branch 'tigergraph:master' into master - -### `tg_weighted_random_walk_sub` - -> [`36713a9`](https://github.com/tigergraph/gsql-graph-algorithms/commit/36713a9882094e177456795cda8173faf2fc8ce2) Merge branch 'tigergraph:master' into master - - - -## lib3.0_210903 Node2Vec Change Logs - -### `tg_node2vec_sub` - -> [`4245e43`](https://github.com/tigergraph/gsql-graph-algorithms/commit/4245e43a22b913d135841349a2b0754e7ab8968e) Merge remote-tracking branch 'upstream/master' - -> [`db8be22`](https://github.com/tigergraph/gsql-graph-algorithms/commit/db8be22995734581d3289b84fd9a8bf4c3421298) docs: change style of readme and changelog files - -> [`a921a77`](https://github.com/tigergraph/gsql-graph-algorithms/commit/a921a7756247fa0e55d807a0245ecf102401ab45) Merge branch 'tigergraph:algorithm-folder-restructure' into algorithm-folder-restructure - -> [`ac43583`](https://github.com/tigergraph/gsql-graph-algorithms/commit/ac435831c1e0f8a254f52dfa1390d2e3b48f161f) moved examples to template and updated query names - -> [`6d9036e`](https://github.com/tigergraph/gsql-graph-algorithms/commit/6d9036e833bb3f04a886e5c17dd29752b9e2cd48) Merge pull request #39 from karimsaraipour/algorithm-folder-restructure - -> [`c2880dd`](https://github.com/tigergraph/gsql-graph-algorithms/commit/c2880dd1b6dc23ba028f9b1898e323406c84fa6b) reorganized ML folder. 
Changed README - -> [`ec58568`](https://github.com/tigergraph/gsql-graph-algorithms/commit/ec58568cdd7e608bd7af13d6bce2eaf781c9798f) New schema-free layout - -> [`bb52bc0`](https://github.com/tigergraph/gsql-graph-algorithms/commit/bb52bc0903ffd2684b70b9fb7499f8b3749f0f6b) Merge pull request #36 from tigergraph/LCC-and-Closeness_cent - -> [`e390bc2`](https://github.com/tigergraph/gsql-graph-algorithms/commit/e390bc2300f9deab0bd612a40ba386c5306d2525) Merge pull request #31 from tigergraph/louvain-algorithm - -> [`e0a9758`](https://github.com/tigergraph/gsql-graph-algorithms/commit/e0a9758b608dd68420b69706b9fbd6f492b99000) Merge pull request #37 from tigergraph/monil-shah-patch-1 - -> [`261627d`](https://github.com/tigergraph/gsql-graph-algorithms/commit/261627d240eac286295f50aae21792309d3759ab) Adding node 2 vec - -### `tg_random_walk` - -> [`4245e43`](https://github.com/tigergraph/gsql-graph-algorithms/commit/4245e43a22b913d135841349a2b0754e7ab8968e) Merge remote-tracking branch 'upstream/master' - -> [`6cc5cab`](https://github.com/tigergraph/gsql-graph-algorithms/commit/6cc5cab0086dbf8a33f6008b898ed75700322358) Merge pull request #44 from tigergraph/LCC-and-Closeness_cent - -> [`ec58568`](https://github.com/tigergraph/gsql-graph-algorithms/commit/ec58568cdd7e608bd7af13d6bce2eaf781c9798f) New schema-free layout - -> [`bb52bc0`](https://github.com/tigergraph/gsql-graph-algorithms/commit/bb52bc0903ffd2684b70b9fb7499f8b3749f0f6b) Merge pull request #36 from tigergraph/LCC-and-Closeness_cent - -> [`e390bc2`](https://github.com/tigergraph/gsql-graph-algorithms/commit/e390bc2300f9deab0bd612a40ba386c5306d2525) Merge pull request #31 from tigergraph/louvain-algorithm - -> [`e0a9758`](https://github.com/tigergraph/gsql-graph-algorithms/commit/e0a9758b608dd68420b69706b9fbd6f492b99000) Merge pull request #37 from tigergraph/monil-shah-patch-1 - -> [`261627d`](https://github.com/tigergraph/gsql-graph-algorithms/commit/261627d240eac286295f50aae21792309d3759ab) Adding node 2 vec - -### `tg_random_walk_batch` - -> [`4245e43`](https://github.com/tigergraph/gsql-graph-algorithms/commit/4245e43a22b913d135841349a2b0754e7ab8968e) Merge remote-tracking branch 'upstream/master' - -> [`4245e43`](https://github.com/tigergraph/gsql-graph-algorithms/commit/4245e43a22b913d135841349a2b0754e7ab8968e) Merge remote-tracking branch 'upstream/master' - -### `tg_node2vec` - -> [`4245e43`](https://github.com/tigergraph/gsql-graph-algorithms/commit/4245e43a22b913d135841349a2b0754e7ab8968e) Merge remote-tracking branch 'upstream/master' - -> [`a921a77`](https://github.com/tigergraph/gsql-graph-algorithms/commit/a921a7756247fa0e55d807a0245ecf102401ab45) Merge branch 'tigergraph:algorithm-folder-restructure' into algorithm-folder-restructure - -> [`ac43583`](https://github.com/tigergraph/gsql-graph-algorithms/commit/ac435831c1e0f8a254f52dfa1390d2e3b48f161f) moved examples to template and updated query names diff --git a/algorithms/GraphML/Embeddings/Node2Vec/README.md b/algorithms/GraphML/Embeddings/Node2Vec/README.md deleted file mode 100644 index c17bc396..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# Node2Vec - -Node2Vec is a vertex embedding algorithm proposed in [node2vec: Scalable Feature Learning for Networks](https://arxiv.org/abs/1607.00653?context=cs). TigerGraph splits the computation into two parts: the random walk process and the embedding training process. 
Assuming that you are using version 3.6 or greater of the TigerGraph database, ignore the UDF install instructions. - -## [TigerGraph Node2Vec Documentation](https://docs.tigergraph.com/graph-ml/current/node-embeddings/node2vec) - -## Instructions - -### Random Walk Process Install -There are two different random walk processes to choose from. The first is regular random walks, implemented in `tg_random_walk.gsql`. This is equivalent to setting the `p` and `q` parameters of Node2Vec both to 1, which is also equivalent to the [DeepWalk](https://arxiv.org/pdf/1403.6652.pdf) paper. This version is more performant than `tg_weighted_random_walk.gsql` because it requires less computation. If the graph is large, you may want to batch the random walk process to reduce memory consumption. Use `tg_random_walk_batch.gsql` if this is desired. - -The second option is the weighted random walk, as described in the Node2Vec paper. This is implemented in `tg_weighted_random_walk_sub.gsql` and `tg_weighted_random_walk.gsql`. If your TigerGraph database version is below 3.6, see the UDF installation instructions below. If the graph is large, you may want to batch the random walk process to reduce memory consumption. Use `tg_weighted_random_walk_batch.gsql` with `tg_weighted_random_walk_sub.gsql` if desired. - -**To install the un-weighted random walk:** copy the algorithm from `tg_random_walk.gsql` and install it on the database using the standard query install process. - -**To install the weighted random walk:** copy `tg_weighted_random_walk_sub.gsql` and install it. Then copy and install `tg_weighted_random_walk.gsql`. - -### Node2Vec Embedding Install -Once the random walks have been generated, we can use the output to train the Node2Vec model. To install, make sure the proper UDFs are installed. If you are using a TigerGraph database of version 3.6 or greater, the UDFs are pre-installed. - -**To install the Node2Vec query:** copy the query from `tg_node2vec.gsql` and install it on the database. - -### Preliminary Notes -Vim is the text editor used in this README; any other text editor such as Emacs or Nano will work for the commands listed below. -\ -`<VERSION>` should be replaced with your current TigerGraph version number. - -### UDF installation - -#### Weighted Random Walk UDF install -If you are using `tg_weighted_random_walk_sub.gsql`, then you will need to install `tg_random_udf.cpp`. **The code defined in `tg_random_udf.cpp` should be pasted inside the `UDIMPL` namespace in `ExprFunctions.hpp`.** -```bash -# open file and paste code - -$ vim ~/tigergraph/app/<VERSION>/dev/gdk/gsql/src/QueryUdf/ExprFunctions.hpp -``` - -#### Node2Vec UDF install -`tg_node2vec_sub()` is a UDF that is called in `tg_node2vec.gsql`. \ -**The code defined in `tg_node2vec_sub.cpp` should be pasted inside the `UDIMPL` namespace in `ExprFunctions.hpp`.** -```bash -# open file and paste code - -$ vim ~/tigergraph/app/<VERSION>/dev/gdk/gsql/src/QueryUdf/ExprFunctions.hpp -``` - -##### Getting Word2vec file -There are multiple options to get `word2vec.h`: -1. Download/copy the `word2vec.h` file into the `~/tigergraph/app/<VERSION>/dev/gdk/gsdk/include` directory -2. Create the file yourself and paste in the code from `word2vec.h` (steps shown below)
-```bash -# Go to correct directory -$ cd ~/tigergraph/app/<VERSION>/dev/gdk/gsdk/include/ - -# create file and paste code -$ vim word2vec.h -``` - -##### Including word2vec -The newly created `word2vec.h` needs to be included in the `ExprUtil.hpp` file. -```bash -$ vim ~/tigergraph/app/<VERSION>/dev/gdk/gsql/src/QueryUdf/ExprUtil.hpp -``` -Once inside the text editor, paste the following line of `C++` code under the other include statements: -```c++ -#include "/home/tigergraph/tigergraph/app/<VERSION>/dev/gdk/gsdk/include/word2vec.h" -``` -### Multiple Machines (Cluster) or Single Machine? -If you are working on a single machine, remove the `DISTRIBUTED` GSQL keyword from the header of the `random_walk` query. -```bash -# Change the first header to the second header - -CREATE DISTRIBUTED QUERY random_walk(...) {...} -CREATE QUERY random_walk(...) {...} -``` -After doing this, proceed to the next section (Running Queries). - -If you are on multiple machines or a cluster: -```bash -# For every machine in cluster - -$ gssh -$ PUT ExprFunctions from "/home/tigergraph/tigergraph/app/<VERSION>/dev/gdk/gsql/src/QueryUdf/ExprFunctions.hpp" -``` - -### Running Queries -**The following instructions can be done with GraphStudio or the GSQL terminal.** -1. Install the `random_walk` query -2. Run query `random_walk` with desired parameters. Visit https://docs.tigergraph.com/graph-ml/current/node-embeddings/node2vec for a description of the random walk query parameters. Make sure that TigerGraph has the correct permissions to write to the output directory you specify. -3. (optional) Inspect the output of the random_walk query - ```bash - # For the default filepath parameter - - $ cat ~/path.csv - ``` -4. Run `node2vec_query` with desired parameters. `Dimension` denotes the embedding dimension size. -5.
(optional) Inspect Embeddings - ```bash - # For the default filepath parameter - - $ cat ~/embedding.csv - ``` diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_node2vec.gsql b/algorithms/GraphML/Embeddings/Node2Vec/tg_node2vec.gsql deleted file mode 100644 index b36f5c07..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_node2vec.gsql +++ /dev/null @@ -1,3 +0,0 @@ -CREATE QUERY tg_node2vec(STRING filepath = "/home/tigergraph/path.csv", STRING output_file = "/home/tigergraph/embedding.csv", INT dimension) SYNTAX V1 { - tg_node2vec_sub(dimension, filepath, output_file); -} \ No newline at end of file diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_node2vec_sub.cpp b/algorithms/GraphML/Embeddings/Node2Vec/tg_node2vec_sub.cpp deleted file mode 100644 index f3090e7c..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_node2vec_sub.cpp +++ /dev/null @@ -1,34 +0,0 @@ - // node2vec function: given random walk sequence, this function trains vector using skip-gram model - inline void tg_node2vec_sub(int dimension, string input_file, string output_file){ - Model model(dimension); - model.sample_ = 0; - // model.window = 10; - int n_workers = 4; - std::vector sentences; - - size_t count =0; - const size_t max_sentence_len = 200; - - SentenceP sentence(new Sentence); - std::ifstream in(input_file); - while (true) { - std::string s; - in >> s; - if (s.empty()) break; - ++count; - sentence->tokens_.push_back(std::move(s)); - if (count == max_sentence_len) { - count = 0; - sentences.push_back(std::move(sentence)); - sentence.reset(new Sentence); - } - } - - if (!sentence->tokens_.empty()) - sentences.push_back(std::move(sentence)); - - model.build_vocab(sentences); - model.train(sentences, n_workers); - model.save(output_file); - -} diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_random_udf.cpp b/algorithms/GraphML/Embeddings/Node2Vec/tg_random_udf.cpp deleted file mode 100644 index 20c8af76..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_random_udf.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// random function, generate a random value between 0 and 1 -inline float random(){ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<> dis(0, 1); - return dis(gen); -} - -// generate a int random value given a range -inline int random_range(int start, int end){ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<> dis(start, end); - return dis(gen); - -} -// generate a random value based on probability distribution -// For example: given {0.5,0.3,0.2}, this function will generate {0,1,2} based on its probability -inline int random_distribution(ListAccum p){ - std::vector a; - for (auto it : p.data_){ - a.push_back(it); - } - std::random_device rd; - std::mt19937 gen(rd()); - std::discrete_distribution<> dis(a.begin(), a.end()); - return dis(gen); -} \ No newline at end of file diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_random_walk.gsql b/algorithms/GraphML/Embeddings/Node2Vec/tg_random_walk.gsql deleted file mode 100644 index 53126fb4..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_random_walk.gsql +++ /dev/null @@ -1,30 +0,0 @@ -CREATE QUERY tg_random_walk(INT step = 8, INT path_size = 4, STRING filepath = "/home/tigergraph/path.csv", SET edge_types, INT sample_num) SYNTAX V1 { - - FILE f(filepath); - ListAccum> @recv_seque_list; - ListAccum> @send_seque_list; - - start (ANY) = {ANY}; - start = SELECT s - FROM start:s - POST-ACCUM s.@send_seque_list += [s]; - - WHILE true LIMIT step DO - tmp = 
SELECT t - FROM start:s-(edge_types:e)->:t - SAMPLE sample_num EDGE WHEN s.outdegree() >= 1 - ACCUM t.@recv_seque_list += s.@send_seque_list - POST-ACCUM - t.@send_seque_list.clear(), - FOREACH PATH IN t.@recv_seque_list DO - CASE WHEN PATH.size() == path_size - 1 THEN - f.println(PATH + [t]) - ELSE - t.@send_seque_list += PATH + [t] - END - END, - t.@send_seque_list += [t], - t.@recv_seque_list.clear(); - END; - -} diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_random_walk_batch.gsql b/algorithms/GraphML/Embeddings/Node2Vec/tg_random_walk_batch.gsql deleted file mode 100644 index 82e2e380..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_random_walk_batch.gsql +++ /dev/null @@ -1,34 +0,0 @@ -CREATE QUERY tg_random_walk_batch(INT batches, INT step, INT path_size, STRING filepath = "/home/tigergraph/path.csv", SET edge_types, INT sample_num) SYNTAX V1 { - - FILE f(filepath); - ListAccum> @recv_seque_list; - ListAccum> @send_seque_list; - start (ANY) = {ANY}; - - FOREACH i IN RANGE [0, batches - 1] DO - tmp = SELECT s - FROM start:s - WHERE getvid(s) % batches == i; - start = SELECT s - FROM tmp:s - POST-ACCUM s.@send_seque_list.clear(), - s.@send_seque_list += [s]; - WHILE true LIMIT step DO - tmp = SELECT t - FROM tmp:s-(edge_types:e)->:t - SAMPLE sample_num EDGE WHEN s.outdegree() >= 1 - ACCUM t.@recv_seque_list += s.@send_seque_list - POST-ACCUM - t.@send_seque_list.clear(), - FOREACH PATH IN t.@recv_seque_list DO - CASE WHEN PATH.size() == path_size - 1 THEN - f.println(PATH + [t]) - ELSE - t.@send_seque_list += PATH + [t] - END - END, - t.@send_seque_list += [t], - t.@recv_seque_list.clear(); - END; - END; -} diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk.gsql b/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk.gsql deleted file mode 100644 index be8505f2..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk.gsql +++ /dev/null @@ -1,9 +0,0 @@ -CREATE QUERY tg_weighted_random_walk(INT walk_length = 20, INT num_walks = 4, FLOAT p = 1, FLOAT q = 1, STRING filepath = "/home/tigergraph/path.csv") SYNTAX V1 { - - FILE f (filepath); - Start = {ANY}; - Start = SELECT s - FROM Start:s - POST-ACCUM tg_weighted_random_walk_sub(s, walk_length, num_walks, p, q, f); - -} \ No newline at end of file diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk_batch.gsql b/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk_batch.gsql deleted file mode 100644 index 8c5e848f..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk_batch.gsql +++ /dev/null @@ -1,12 +0,0 @@ -CREATE QUERY tg_weighted_random_walk_batch(INT batches, INT walk_length = 20, INT num_walks = 4, FLOAT p = 1, FLOAT q = 1, STRING filepath = "/home/tigergraph/path.csv") SYNTAX V1 { - - FILE f (filepath); - Start = {ANY}; - FOREACH i IN RANGE [0, batches-1] DO - Start = SELECT s - FROM Start:s - WHERE getvid(s) % batches == i - POST-ACCUM tg_weighted_random_walk_sub(s, walk_length, num_walks, p, q, f); - END; - -} \ No newline at end of file diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk_sub.gsql b/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk_sub.gsql deleted file mode 100644 index 68f6321c..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_weighted_random_walk_sub.gsql +++ /dev/null @@ -1,72 +0,0 @@ -CREATE QUERY tg_weighted_random_walk_sub(VERTEX source, INT length, INT num_walks, FLOAT p, FLOAT q, FILE f) SYNTAX V1 { - - /* This query impletemented 
random walk in Node2vec paper. Link: https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf - Input: source vertex, random walk length, walk times for each vertex, output file, set of valid edge types - Output: random walk sequence - */ - - SetAccum @@pick_set; - ListAccum @@prob_list; - ListAccum @@candidates_list; - ListAccum@@path_list; - FLOAT Prob; - SumAccum @sum_d_tx = 2; - SumAccum@sum_score; - - - FOREACH i in RANGE[0,num_walks-1] DO - // add source vertex into path - @@path_list += source; - Start = {source}; - - WHILE(Start.size()>=0 ) LIMIT length DO - // Calculate transition probability - Start = SELECT s - FROM Start:s-(:e)-:t - POST-ACCUM - IF t.@sum_d_tx ==0 THEN - t.@sum_score = (1/p) //* e.score - ELSE IF t.@sum_d_tx == 1 THEN - t.@sum_score = 1 //t.score - ELSE - t.@sum_score = (1/q) //* e.score - END; - // Reset candidates @d_tx =2, start vertex @d_dx = 0 - // store score and vertex of candidates - candidates = SELECT t - FROM Start:s-(:e)-:t - ACCUM t.@sum_d_tx = 2, s.@sum_d_tx = 0 - POST-ACCUM @@prob_list +=t.@sum_score, - @@candidates_list +=t; - - IF candidates.size() == 0 THEN - @@path_list += source; - continue; - END; - // generate prob value based on weight distribution - Prob = tg_random_distribution(@@prob_list); - //print Prob; - // pick vertex that generated by random function - @@path_list += @@candidates_list.get(Prob); - @@pick_set += @@candidates_list.get(Prob); - - //print @@pick; - // select the vertex that connected with @@pick in candidates and set @d_tx=1 - sel = SELECT s - FROM candidates:s-(:e)-:t - WHERE t == @@candidates_list.get(Prob) - POST-ACCUM s.@sum_d_tx =1; - - // select @@pick as the next start vertex - Start = @@pick_set; - - // clear accumulators to start next iteration - @@pick_set.clear(); - @@prob_list.clear(); - @@candidates_list.clear(); - END; // WHILE - PRINT @@path_list; - f.println(@@path_list); - @@path_list.clear(); - END; // FOREACH -} diff --git a/algorithms/GraphML/Embeddings/Node2Vec/tg_word2vec.h b/algorithms/GraphML/Embeddings/Node2Vec/tg_word2vec.h deleted file mode 100644 index beb7f820..00000000 --- a/algorithms/GraphML/Embeddings/Node2Vec/tg_word2vec.h +++ /dev/null @@ -1,567 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -typedef std::vector Vector; - -struct Word -{ - int32_t index_; - std::string text_; - uint32_t count_; - Word *left_, *right_; - - std::vector codes_; - std::vector points_; - - Word(int32_t index, std::string text, uint32_t count, Word *left = 0, Word *right = 0) : index_(index), text_(text), count_(count), left_(left), right_(right) {} - Word(const Word&) = delete; - const Word& operator = (const Word&) = delete; -}; -typedef std::shared_ptr WordP; - -struct Sentence -{ - std::vector words_; - std::vector tokens_; -}; -typedef std::shared_ptr SentenceP; - -struct Model -{ - std::vector syn0_, syn1_; - std::vector syn0norm_; - - //negative sampling - std::vector syn1neg_; - std::vector unigram_; - - std::unordered_map vocab_; - std::vector words_; - - int layer1_size_; - int window_; - - //subsampling - float sample_; - - int min_count_; //minumum count of word (or phrase) appearance to be put into context - int negative_; - - float alpha_, min_alpha_; - - bool phrase_; //whether consider phrase - float phrase_threshold_; //threshold for the phrase score to be put into context - - Model(int size = 100, int window = 10, float sample = 0.001, int min_count = 0, int negative = 0, float 
alpha = 0.025, float min_alpha = 0.0001) - :layer1_size_(size), window_(window), sample_(sample), min_count_(min_count), negative_(negative) - , alpha_(alpha), min_alpha_(min_alpha) - , phrase_(false), phrase_threshold_(100) - {} - - - bool has(const std::string& w) const { return vocab_.find(w) != vocab_.end(); } - - int build_vocab(std::vector& sentences) { - size_t count = 0; - std::unordered_map vocab; - auto progress = [&count](const char *type, const std::unordered_map& vocab) { - printf("collecting [%s] %lu sentences, %lu distinct %ss, %d %ss\n", type, count, vocab.size(), type, - std::accumulate(vocab.begin(), vocab.end(), 0, [](int x, const std::pair& v) { return x + v.second; }), type); - }; //show current progress - - // count tokens and phrases, and store the count in vocab - for (auto& sentence: sentences) { - ++count; - if (count % 10000 == 0) progress("word", vocab); - - std::string last_token; - for (auto& token: sentence->tokens_) { - vocab[token] += 1; - // add bigram phrases - if (phrase_) { - if(!last_token.empty()) vocab[last_token + "_" + token] += 1; - last_token = token; - } - } - } - progress("word", vocab); - - // if two-word phrases are considered - if (phrase_) { - count = 0; - int total_words = std::accumulate(vocab.begin(), vocab.end(), 0, [](int x, const std::pair& v) { return x + v.second; }); //total count of words - - std::unordered_map phrase_vocab; - - //filter the phrases above phrase_threshold, store the count of phrases and words in phrase_vocab, push the token to phrase_tokens - for (auto& sentence: sentences) { - ++count; - if (count % 10000 == 0) progress("phrase", phrase_vocab); - - std::vector phrase_tokens; - std::string last_token; - uint32_t pa = 0, pb = 0, pab = 0; - for (auto& token: sentence->tokens_) { - pb = vocab[token]; - if (! 
last_token.empty()) { - std::string phrase = last_token + "_" + token; - pab = vocab[phrase]; - float score = 0; - if (pa >= min_count_ && pb >= min_count_ && pab >= min_count_) - score = (pab - min_count_ ) / (float(pa) * pb) * total_words; - if (score > phrase_threshold_) { - phrase_tokens.push_back(phrase); - token.clear(); - phrase_vocab[phrase] += 1; - } - else { - phrase_tokens.push_back(last_token); - phrase_vocab[last_token] += 1; - } - } - last_token = token; - pa = pb; - } - - if (!last_token.empty()) { - phrase_tokens.push_back(last_token); - phrase_vocab[last_token] += 1; - } - sentence->tokens_.swap(phrase_tokens); - } - progress("phrase", phrase_vocab); - - printf("using phrases\n"); - vocab.swap(phrase_vocab); - } - - int n_words = vocab.size(); - if (n_words <= 1) return -1; - - words_.reserve(n_words); - auto comp = [](Word *w1, Word *w2) { return w1->count_ > w2->count_; }; - - // populate vocab_ with word (str), [word, count](*Word) - for (auto& p: vocab) { - uint32_t count = p.second; - if (count <= min_count_) continue; - - auto r = vocab_.emplace(p.first, WordP(new Word{0, p.first, count})); - words_.push_back((r.first->second.get())); - } - std::sort(words_.begin(), words_.end(), comp); //sort by count - - int index = 0; - for (auto& w: words_) w->index_ = index++; //assign index to all words - - printf("collected %lu distinct words with min_count=%d\n", vocab_.size(), min_count_); - - n_words = words_.size(); - - std::vector heap = words_; - std::make_heap(heap.begin(), heap.end(), comp); - - std::vector tmp; - for (int i=0; icount_ + min2->count_, min1, min2})); - - heap.push_back(tmp.back().get()); - std::push_heap(heap.begin(), heap.end(), comp); - } - - int max_depth = 0; - std::list, std::vector>> stack; - stack.push_back(std::make_tuple(heap[0], std::vector(), std::vector())); - count = 0; - while (!stack.empty()) { - auto t = stack.back(); - stack.pop_back(); - - Word *word = std::get<0>(t); - if (word->index_ < n_words) { - word->points_ = std::get<1>(t); - word->codes_ = std::get<2>(t); - max_depth = std::max((int)word->codes_.size(), max_depth); - } - else { - auto points = std::get<1>(t); - points.emplace_back(word->index_ - n_words); - auto codes1 = std::get<2>(t); - auto codes2 = codes1; - codes1.push_back(0); codes2.push_back(1); - stack.emplace_back(std::make_tuple(word->left_, points, codes1)); - stack.emplace_back(std::make_tuple(word->right_, points, codes2)); - } - } - - printf("built huffman tree with maximum node depth %d\n", max_depth); - - syn0_.resize(n_words); - syn1_.resize(n_words); - - std::default_random_engine eng(::time(NULL)); - std::uniform_real_distribution rng(0.0, 1.0); - for (auto& s: syn0_) { - s.resize(layer1_size_); - for (auto& x: s) x = (rng(eng) - 0.5) / layer1_size_; - } - for (auto& s: syn1_) s.resize(layer1_size_); - -#if 0 - //TODO: verify - if (negative_ > 0) { - syn1neg_.resize(n_words); - for (auto& s: syn1neg_) s.resize(layer1_size_); - - unigram_.resize(1e8); - const float power = 0.75; - float sum = std::accumulate(words_.begin(), words_.end(), 0.0, [&power](float x, Word *word) { return x + ::pow(word->count_, power); }); - float d1 = ::pow(words_[0]->count_, power) / sum; - - int i = 0; - for (int a=0; a d1) { - ++i; d1 += ::pow(words_[i]->count_, power) / sum; - } - if (i >= words_.size()) i = words_.size() - 1; - } - } -#endif - - return 0; - } - - int train(std::vector& sentences, int n_workers) { - int total_words = std::accumulate(vocab_.begin(), vocab_.end(), 0, - [](int x, const std::pair& p) { return 
(int)(x + p.second->count_); }); - int current_words = 0; - float alpha0 = alpha_, min_alpha = min_alpha_; - - typedef std::vector Job; - typedef std::unique_ptr JobP; - std::mutex m; - std::condition_variable cond_var; - std::list jobs; - - volatile bool done = false; - auto worker = [&](){ - while (true) { - JobP job; - { - std::unique_lock lock(m); - while (jobs.empty() && !done) - cond_var.wait(lock); - - if (jobs.empty()) break; - - job = std::move(jobs.front()); - jobs.pop_front(); - } - - if (!job) break; - - auto cstart = std::chrono::high_resolution_clock::now(); - float alpha = std::max(min_alpha, float(alpha0 * (1.0 - 1.0 * current_words / total_words))); - int words = 0; - for (auto sentence: *job) { - words += train_sentence(*sentence, alpha); - } - current_words += words; - auto cend = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(cend - cstart).count(); - printf("training alpha: %.4f progress: %.2f%% words per thread sec: %.3fK\n", alpha, current_words * 100.0/total_words, words * 1000.0 / duration); - } - }; - - auto enqueue_job = [&](JobP&& job) { - std::unique_lock lock(m); - jobs.push_back(std::forward(job)); - cond_var.notify_one(); - }; - - std::vector workers; - for(int i=0; i rng(0.0, 1.0); - - for (auto& sentence: sentences) { - if (sentence->tokens_.empty()) - continue; - size_t len = sentence->tokens_.size(); - sentence->words_.reserve(len); - for (size_t i=0; itokens_[i]); - if (it == vocab_.end()) continue; - Word *word = it->second.get(); - // subsampling - if (sample_ > 0) { - float rnd = (sqrt(word->count_ / (sample_ * total_words)) + 1) * (sample_ * total_words) / word->count_; - if (rnd < rng(eng)) continue; - } - sentence->words_.push_back(it->second.get()); - } - - job->push_back(sentence.get()); - if ( job->size() == batch_size) { - enqueue_job(std::move(job)); - job.reset(new Job); - } - } - - if (! job->empty()) - enqueue_job(std::move(job)); - - done = true; - for (int i=0; i words = words_; - std::sort(words.begin(), words.end(), [](Word *w1, Word *w2) { return w1->count_ > w2->count_; }); - - for (auto w: words) { - out << w->text_; - for (auto i: syn0_[w->index_]) out << " " << i; - out << std::endl; - } - - return 0; - } - - int load(const std::string& file) { - std::ifstream in(file); - std::string line; - if (! 
std::getline(in, line)) return -1; - - int n_words = 0, layer1_size = 0; - std::istringstream iss(line); - iss >> n_words >> layer1_size; - - syn0_.clear(); vocab_.clear(); words_.clear(); - syn0_.resize(n_words); - for (int i=0; i> text; - - auto p = vocab_.emplace(text, WordP(new Word{i, text, 0})); - words_.push_back(p.first->second.get()); - syn0_[i].resize(layer1_size); - for(int j=0; j> syn0_[i][j]; - } - } - - layer1_size_ = layer1_size; - printf("%d words loaded\n", n_words); - - syn0norm_ = syn0_; - for (auto& v: syn0norm_) unit(v); - - return 0; - } - - std::vector> most_similar(std::vector positive, std::vector negative, int topn) { - if ((positive.empty() && negative.empty()) || syn0norm_.empty()) return std::vector>{}; - - Vector mean(layer1_size_); - std::vector all_words; - auto add_word = [&mean, &all_words, this](const std::string& w, float weight) { - auto it = vocab_.find(w); - if (it == vocab_.end()) return; - - Word& word = *it->second; - saxpy(mean, weight, syn0norm_[word.index_]); - - all_words.push_back(word.index_); - }; - - for (auto& w: positive) add_word(w, 1.0); - for (auto& w: negative) add_word(w, -1.0); - - unit(mean); - - Vector dists; - std::vector indexes; - int i=0; - - dists.reserve(syn0norm_.size()); - indexes.reserve(syn0norm_.size()); - for (auto &x: syn0norm_) { - dists.push_back(dot(x, mean)); - indexes.push_back(i++); - } - - auto comp = [&dists](int i, int j) { return dists[i] > dists[j]; }; -// std::sort(indexes.begin(), indexes.end(), comp); - - int k = std::min(int(topn+all_words.size()), int(indexes.size())-1); - auto first = indexes.begin(), last = indexes.begin() + k, end = indexes.end(); - std::make_heap(first, last + 1, comp); - std::pop_heap(first, last + 1, comp); - for (auto it = last + 1; it != end; ++it) { - if (! comp(*it, *first)) continue; - *last = *it; - std::pop_heap(first, last + 1, comp); - } - - std::sort_heap(first, last, comp); - - std::vector> results; - for(int i=0, j=0; itext_, dists[indexes[i]])); - if (++j >= topn) break; - } - - return results; - } - -private: - int train_sentence(Sentence& sentence, float alpha) { - const int max_size = 1000; - const float max_exp = 6.0; - const static std::vector table = [&](){ - std::vector x(max_size); - for (size_t i=0; icodes_.empty()) - continue; - int word_index = word->index_; - auto& l1 = syn0_[word_index]; //layer1 weight - - Vector work(layer1_size_); - for (size_t b=0; b= max_exp) - continue; - - int fi = int((f + max_exp) * (max_size / max_exp / 2.0)); - - f = table[fi]; //prediction -// f = sigmoid(f); - float g = (1 - current.codes_[b] - f) * alpha; //codes_ : one-hot label - - saxpy(work, g, l2); - saxpy(l2, g, l1); - -// work += syn1_[idx] * g; -// syn1_[idx] += syn0_[word_index] * g; - } - - //negative sampling -#if 0 - if (negative_ > 0) { - for (int d = 0; d < negative_ + 1; ++d) { - int label = (d == 0? 
1: 0); - int target = 0; - if (d == 0) target = i; - else { - target = unigram_[rand() % unigram_.size()]; - if (target == 0) target = rand() % (vocab_.size() - 1) + 1; - if (target == i) continue; - } - - auto& l2 = syn1neg_[target]; - float f = dot(l1, l2), g = 0; - if (f > max_exp) g = (label - 1) * alpha; - else if (f < -max_exp) g = (label - 0) * alpha; - else { - int fi = int((f + max_exp) * (max_size / max_exp / 2.0)); - g = (label - table[fi]) * alpha; - } - - saxpy(work, g, l2); - saxpy(l2, g, l1); - - } - } -#endif - -// syn0_[word_index] += work; - saxpy(l1, 1.0, work); - } - ++count; - } - return count; - } - - float similarity(const std::string& w1, const std::string& w2) const { - auto it1 = vocab_.find(w1), it2 = vocab_.find(w2); - if (it1 != vocab_.end() && it2 != vocab_.end()) - return dot(syn0_[it1->second->index_], syn0_[it2->second->index_]); - return 0; - } - - static inline float dot(const Vector&x, const Vector& y) { - int m = x.size(); const float *xd = x.data(), *yd = y.data(); - float sum = 0.0; - while (--m >= 0) sum += (*xd++) * (*yd++); - return sum; - } - - static inline void saxpy(Vector& x, float g, const Vector& y) { - int m = x.size(); float *xd = x.data(); const float *yd = y.data(); - while (--m >= 0) (*xd++) += g * (*yd++); - } - - static inline void unit(Vector& x) { - float len = ::sqrt(dot(x, x)); - if (len == 0) return; - - int m = x.size(); float *xd = x.data(); - while (--m >= 0) (*xd++) /= len; - } -};
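
For reference, the deleted `tg_wcc_small_world` query works in two phases: it repeatedly BFS-labels the component of the unvisited vertex with the largest in-degree × out-degree product (the "pivot"), then falls back to ordinary minimum-id label propagation ("coloring") for whatever the pivots never reached. Below is a rough standalone C++ sketch of that strategy under simplifying assumptions (an undirected adjacency-list graph and a plain degree-squared pivot score); it is illustrative only, not the GSQL implementation.

```cpp
// Hypothetical sketch of the two-phase small-world WCC strategy:
// BFS from high-degree pivots first, min-label propagation for the rest.
// Vertex ids, the adjacency list, and the threshold are illustrative.
#include <algorithm>
#include <cstdint>
#include <queue>
#include <vector>

std::vector<int64_t> wcc_small_world(const std::vector<std::vector<int>>& adj,
                                     int64_t threshold) {
    const int n = static_cast<int>(adj.size());
    std::vector<int64_t> cc_id(n, 0);
    std::vector<bool> visited(n, false);

    // Phase 1: vertices whose degree product reaches the threshold are pivot
    // candidates; this undirected sketch uses degree * degree as the product.
    std::vector<int> pivots;
    for (int v = 0; v < n; ++v) {
        int64_t deg = static_cast<int64_t>(adj[v].size());
        if (deg * deg >= threshold) pivots.push_back(v);
    }
    // Largest product first, mirroring ORDER BY ... DESC LIMIT 1 in the query.
    std::sort(pivots.begin(), pivots.end(),
              [&](int a, int b) { return adj[a].size() > adj[b].size(); });
    for (int p : pivots) {
        if (visited[p]) continue;            // already absorbed by an earlier pivot
        std::queue<int> q;
        q.push(p);
        visited[p] = true;
        cc_id[p] = p;
        while (!q.empty()) {                 // BFS over the pivot's whole component
            int s = q.front(); q.pop();
            for (int t : adj[s])
                if (!visited[t]) { visited[t] = true; cc_id[t] = p; q.push(t); }
        }
    }

    // Phase 2: minimum-id label propagation for vertices no pivot reached.
    for (int v = 0; v < n; ++v)
        if (!visited[v]) cc_id[v] = v;
    bool changed = true;
    while (changed) {
        changed = false;
        for (int s = 0; s < n; ++s) {
            if (visited[s]) continue;
            for (int t : adj[s]) {
                if (visited[t]) continue;    // pivot components are already final
                if (cc_id[s] < cc_id[t]) { cc_id[t] = cc_id[s]; changed = true; }
            }
        }
    }
    return cc_id;
}
```

On a small-world graph the first pivot typically absorbs the giant hub component in a single BFS, so the label-propagation phase only has to converge on the small remainder, which is the optimization described in the algorithm's YAML description.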
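
The weighted walk in `tg_weighted_random_walk_sub.gsql` biases each step with the second-order Node2Vec scores: a candidate is scored 1/p when it is the previously visited vertex (d_tx == 0), 1 when it is also a neighbor of that vertex (d_tx == 1), and 1/q otherwise (d_tx == 2), and `tg_random_distribution` then samples an index proportionally to those scores. A minimal standalone C++ sketch of that single step follows; the adjacency-list graph, the `next_step` name, and the unit edge weights are illustrative assumptions, not part of the deleted queries.

```cpp
// Hypothetical sketch of one biased Node2Vec step; only the 1/p, 1, 1/q
// weighting and the discrete-distribution sampling come from the deleted code.
#include <random>
#include <unordered_set>
#include <vector>

int next_step(const std::vector<std::vector<int>>& adj,
              int prev, int cur, double p, double q, std::mt19937& gen) {
    if (adj[cur].empty()) return cur;  // dead end: stay put (the query falls back to the source)
    const std::unordered_set<int> prev_nbrs(adj[prev].begin(), adj[prev].end());
    std::vector<double> weights;
    weights.reserve(adj[cur].size());
    for (int t : adj[cur]) {
        if (t == prev)               weights.push_back(1.0 / p);  // d_tx == 0: step back
        else if (prev_nbrs.count(t)) weights.push_back(1.0);      // d_tx == 1: shared neighbor
        else                         weights.push_back(1.0 / q);  // d_tx == 2: move outward
    }
    // Same idea as tg_random_distribution: pick an index proportional to its weight.
    std::discrete_distribution<int> dis(weights.begin(), weights.end());
    return adj[cur][dis(gen)];
}
```

Repeating this step `walk_length` times per start vertex, `num_walks` times, produces the kind of sequences `tg_weighted_random_walk` writes to `path.csv` for the skip-gram trainer in `tg_node2vec_sub`.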