diff --git a/CMakeLists.txt b/CMakeLists.txt index e6c6293f..dfb930c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,6 +128,11 @@ if (BUILD_EXE) add_dependencies(tests GraphZeppelinVerifyCC) target_link_libraries(tests PRIVATE GraphZeppelinVerifyCC) + add_executable(statistical_sketch_test + tools/sketch_testing.cpp) + add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC) + target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC) + # executable for processing a binary graph stream add_executable(process_stream tools/process_stream.cpp) diff --git a/src/cc_alg_configuration.cpp b/src/cc_alg_configuration.cpp index 93d0ae47..becbb7e5 100644 --- a/src/cc_alg_configuration.cpp +++ b/src/cc_alg_configuration.cpp @@ -14,11 +14,6 @@ CCAlgConfiguration& CCAlgConfiguration::sketches_factor(double factor) { << "Defaulting to 1." << std::endl; _sketches_factor = 1; } - if (_sketches_factor != 1) { - std::cerr << "WARNING: Your graph configuration specifies using a factor " << _sketches_factor - << " of the normal quantity of sketches." << std::endl; - std::cerr << " Is this intentional? If not, set sketches_factor to one!" << std::endl; - } return *this; } diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp index 385f31e2..ed788f45 100644 --- a/src/cc_sketch_alg.cpp +++ b/src/cc_sketch_alg.cpp @@ -91,11 +91,11 @@ void CCSketchAlg::pre_insert(GraphUpdate upd, int /* thr_id */) { auto dst = std::max(edge.src, edge.dst); std::lock_guard sflock(spanning_forest_mtx[src]); if (dsu.merge(src, dst).merged) { - // edge is new connectivity info. Add to spanning forest. + // this edge adds new connectivity information so add to spanning forest spanning_forest[src].insert(dst); - } + } else if (spanning_forest[src].find(dst) != spanning_forest[src].end()) { - // edge is not new connectivity info and is deletion of spanning forest edge. dsu invalid. + // this update deletes one of our spanning forest edges so mark dsu invalid dsu_valid = false; shared_dsu_valid = false; } diff --git a/tools/benchmark/graphcc_bench.cpp b/tools/benchmark/graphcc_bench.cpp index 17375a00..96a15c1e 100644 --- a/tools/benchmark/graphcc_bench.cpp +++ b/tools/benchmark/graphcc_bench.cpp @@ -304,16 +304,23 @@ static void BM_DSU_Find(benchmark::State& state) { constexpr size_t size_of_dsu = 16 * MB; DisjointSetUnion dsu(size_of_dsu); + auto rng = std::default_random_engine{}; + std::vector queries; + for (size_t i = 0; i < 4096; i++) { + queries.push_back((size_of_dsu / 4096) * i); + } + std::shuffle(queries.begin(), queries.end(), rng); + // perform find test for (auto _ : state) { - for (size_t i = 0; i < size_of_dsu; i++) - dsu.find_root(i); + for (auto q : queries) + dsu.find_root(q); } state.counters["Find_Latency"] = - benchmark::Counter(state.iterations() * size_of_dsu, + benchmark::Counter(state.iterations() * queries.size(), benchmark::Counter::kIsRate | benchmark::Counter::kInvert); } -BENCHMARK(BM_DSU_Find)->Iterations(1); +BENCHMARK(BM_DSU_Find); static void BM_DSU_Find_After_Combine(benchmark::State& state) { constexpr size_t size_of_dsu = 16 * MB; @@ -323,52 +330,73 @@ static void BM_DSU_Find_After_Combine(benchmark::State& state) { dsu.merge(i, i+1); } + auto rng = std::default_random_engine{}; + std::vector queries; + for (size_t i = 0; i < 4096; i++) { + queries.push_back((size_of_dsu / 4096) * i); + } + std::shuffle(queries.begin(), queries.end(), rng); + // perform find test for (auto _ : state) { - for (size_t i = 0; i < size_of_dsu; i++) - dsu.find_root(i); + for (auto q : queries) + dsu.find_root(q); } state.counters["Find_Latency"] = - benchmark::Counter(state.iterations() * size_of_dsu, + benchmark::Counter(state.iterations() * queries.size(), benchmark::Counter::kIsRate | benchmark::Counter::kInvert); } -BENCHMARK(BM_DSU_Find_After_Combine)->Iterations(1); +BENCHMARK(BM_DSU_Find_After_Combine); // MT DSU Find Root -static void BM_Parallel_DSU_Find(benchmark::State& state) { +static void BM_MT_DSU_Find(benchmark::State& state) { constexpr size_t size_of_dsu = 16 * MB; DisjointSetUnion_MT dsu(size_of_dsu); + auto rng = std::default_random_engine{}; + std::vector queries; + for (size_t i = 0; i < 4096; i++) { + queries.push_back((size_of_dsu / 4096) * i); + } + std::shuffle(queries.begin(), queries.end(), rng); + // perform find test for (auto _ : state) { - for (size_t i = 0; i < size_of_dsu; i++) - dsu.find_root(i); + for (auto q : queries) + dsu.find_root(q); } state.counters["Find_Latency"] = - benchmark::Counter(state.iterations() * size_of_dsu, + benchmark::Counter(state.iterations() * queries.size(), benchmark::Counter::kIsRate | benchmark::Counter::kInvert); } -BENCHMARK(BM_Parallel_DSU_Find)->Iterations(1); +BENCHMARK(BM_MT_DSU_Find); // MT DSU Find Root -static void BM_Parallel_DSU_Find_After_Combine(benchmark::State& state) { - constexpr size_t size_of_dsu = 16 * MB; +static void BM_MT_DSU_Find_After_Combine(benchmark::State& state) { + constexpr size_t size_of_dsu = MB; DisjointSetUnion_MT dsu(size_of_dsu); // merge everything into same root for (size_t i = 0; i < size_of_dsu - 1; i++) { dsu.merge(i, i+1); } + auto rng = std::default_random_engine{}; + std::vector queries; + for (size_t i = 0; i < 512; i++) { + queries.push_back((size_of_dsu / 512) * i); + } + std::shuffle(queries.begin(), queries.end(), rng); + // perform find test for (auto _ : state) { - for (size_t i = 0; i < size_of_dsu; i++) - dsu.find_root(i); + for (auto q : queries) + dsu.find_root(q); } state.counters["Find_Latency"] = - benchmark::Counter(state.iterations() * size_of_dsu, + benchmark::Counter(state.iterations() * queries.size(), benchmark::Counter::kIsRate | benchmark::Counter::kInvert); } -BENCHMARK(BM_Parallel_DSU_Find_After_Combine)->Iterations(1); +BENCHMARK(BM_MT_DSU_Find_After_Combine); // Benchmark speed of DSU merges when the sequence of merges is adversarial // This means we avoid joining roots wherever possible diff --git a/tools/sketch_testing.cpp b/tools/sketch_testing.cpp new file mode 100644 index 00000000..6d10287f --- /dev/null +++ b/tools/sketch_testing.cpp @@ -0,0 +1,171 @@ +#include +#include +#include +#include + +#include "sketch.h" +#include "cc_alg_configuration.h" + +/* + + The purpose of this file is to test the probability that a sketch column returns a nonzero + That is, for a number of nonzeroes z, how what's the probability of success? + + We model this as a binomial process for the sake of confidence intervals / stddev. + + Originally, this code inserted z random elements into a sketch then queried it. + + As a first speed optimization (that didn't appear to change outcome) (xxHash works well) + We replaced random insertion with sequential inserted. + + As a second speed optimization, we queried the sketch after every update. + That is, instead of O(z^2) insertions per z data points, we perform O(z) insertions per z data points. + This sacrifices independence. Whether or not the z-1th sketch is good is a fantastic predictor for the zth sketch being good. + But, for a given z, the results are still independent. + + For parity with the main code, column seeds are sequential. + + The output of this is intended to be parsed into summary stats by sum_sketch_testing.py +*/ + + +std::random_device dev; + +std::mt19937_64 rng(dev()); +using rand_type = std::mt19937_64::result_type; + + +rand_type gen(rand_type n) +{ + std::uniform_int_distribution dist(0,n-1); + return dist(rng); +} + +rand_type seed = gen(1ll << 62); + +rand_type gen_seed() +{ + //std::uniform_int_distribution dist(0,1ll << 63); + //return dist(rng); + return seed++; +} + + +enum ResultType { + R_GOOD=0, + R_BAD=1, + R_HASHFAIL=2 +}; + +ResultType test_z(rand_type n, rand_type z) +{ + assert(z >= 1); + assert(z <= n*n); + Sketch sketch(n, gen_seed(), 1, 1); + + // Generate z edges and track them + /*std::unordered_set edges; + while (edges.size() < z) + { + edges.insert(gen(n*n)); + } + + for (const auto& r : edges) + { + sketch.update(r); + } + */ + for (rand_type i = 0; i < z; i++) + sketch.update(i); + // Sample the sketches + SketchSample query_ret = sketch.sample(); + SampleResult ret_code = query_ret.result; + + assert(ret_code != ZERO); + + if (ret_code == GOOD) + { + //if (edges.find(res) == edges.end()) + // return R_HASHFAIL; + return R_GOOD; + } + return R_BAD; +} + +std::pair fit_to_binomial(rand_type ngood, rand_type ntrials) +{ + double p = ngood / (1.0 * ntrials); + double variance = ntrials * p * (1-p); + double stddev = sqrt(variance); + return std::pair(p, stddev/ntrials); +} + +std::pair test_nz_pair(rand_type n, rand_type z) +{ + int ntrials = 500; + int results[3] = {0,0,0}; + for (int i = 0; i < ntrials; i++) + results[test_z(n, z)]++; + //std::cout << "GOOD: " << results[0] << std::endl; + //std::cout << "BAD: " << results[1] << std::endl; + //std::cout << "HASHFAIL: " << results[2] << std::endl; + int ngood = results[0]; + // Fit to binomial + return fit_to_binomial(ngood, ntrials); +} + +void test_n_one(rand_type n, rand_type* good, rand_type max_z) +{ + Sketch sketch(n*n, gen_seed(), 1, 1); + for (rand_type i = 0; i < max_z; i++) + { + sketch.update(i); + // Sample the sketches + SketchSample query_ret = sketch.sample(); + SampleResult ret_code = query_ret.result; + //assert(ret_code != ZERO); + if (ret_code == GOOD) + good[i]++; + sketch.reset_sample_state(); + } +} + +void test_n(rand_type n) +{ + int ntrials = 500; + rand_type max_z = 1+(n*n)/4; + // Default init to 0? + rand_type* good = new rand_type[max_z]; + for (int i = 0; i < ntrials; i++) + test_n_one(n, good, max_z); + + double worst_3sigma = 1; + rand_type worst_i = 0; + for (rand_type i = 0; i < max_z; i++) + { + auto pair = fit_to_binomial(good[i], ntrials); + double ans = pair.first; + double stddev = pair.second; + std::cout << i << ": " << ans << " +- " << stddev << std::endl; + if (ans - 3 * stddev < worst_3sigma) + { + worst_i = i; + worst_3sigma = ans-3*stddev; + } + } + auto pair = fit_to_binomial(good[worst_i], ntrials); + double ans = pair.first; + double stddev = pair.second; + std::cout << "WORST" << std::endl; + std::cout << worst_i << ": " << ans << " +- " << stddev << std::endl; + + delete[] good; +} + +int main() +{ + std::cout << CCAlgConfiguration() << std::endl; + rand_type n = 1 << 13; + std::cout << "TESTING: " << n << " TO " << (n*n)/4 << std::endl; + test_n(n); +} diff --git a/tools/sum_sketch_testing.py b/tools/sum_sketch_testing.py new file mode 100644 index 00000000..7a5f3a2b --- /dev/null +++ b/tools/sum_sketch_testing.py @@ -0,0 +1,63 @@ +import sys +import re + +""" +The purpose of this file is to parse the output of sketch_testing.cpp into summary statistics +That is, we can answer questions like "how many data points are 2 stddev above .8" +or "What is the mean of the data" +""" + +prob = r"([0-9]*[.])?[0-9]+" +which = r"[0-9]+" + +pattern = re.compile("(" + which + "): (" + prob + ") \+- (" + prob + ")") + +def parse(filename): + with open(filename) as file: + lines = file.readlines()[:4000000] + stats = [] + for l in lines: + match = pattern.match(l) + if match: + t = (int(match.group(1)), float(match.group(2)), float(match.group(4))) + stats.append(t) + return stats + +def above(stats, target, sigmas): + above = 0 + below = 0 + + + for s in stats: + if (s[1] - sigmas * s[2] > target): + above += 1 + else: + below += 1 + print("BELOW") + + print (above / (above + below)) + + +def mean(stats, sigmas): + summ = 0 + count = 0 + for s in stats: + count += 1 + summ += s[1] - sigmas * s[2] + print(summ/count) + + +stats = parse(sys.argv[1]) + +above(stats, 0.76, 0) +#above(stats, 0.78, 1) +#above(stats, 0.78, 2) + +#mean(stats, 3) + + + + + + +