From 8546f1da03e9ceca8a4be8a5c32a2fb4f80cfe43 Mon Sep 17 00:00:00 2001
From: Daniel DeLayo
Date: Mon, 22 Jan 2024 11:54:49 -0500
Subject: [PATCH 1/4] statistical testing

---
 CMakeLists.txt                                |   5 +
 tools/statistical_testing/sketch_testing.cpp  | 168 ++++++++++++++++++
 .../statistical_testing/sum_sketch_testing.py |  54 +++++++
 3 files changed, 227 insertions(+)
 create mode 100644 tools/statistical_testing/sketch_testing.cpp
 create mode 100644 tools/statistical_testing/sum_sketch_testing.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a66c474d..d775131d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -148,6 +148,11 @@ if (BUILD_EXE)
     test/util/graph_gen.cpp)
   add_dependencies(statistical_test GraphZeppelinVerifyCC)
   target_link_libraries(statistical_test PRIVATE GraphZeppelinVerifyCC)
+
+  add_executable(statistical_sketch_test
+    tools/statistical_testing/sketch_testing.cpp)
+  add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC)
+  target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC)
 
   # executables for experiment/benchmarking
   add_executable(efficient_gen
diff --git a/tools/statistical_testing/sketch_testing.cpp b/tools/statistical_testing/sketch_testing.cpp
new file mode 100644
index 00000000..3329a429
--- /dev/null
+++ b/tools/statistical_testing/sketch_testing.cpp
@@ -0,0 +1,168 @@
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <random>
+
+#include "sketch.h"
+#include "cc_alg_configuration.h"
+
+std::random_device dev;
+std::mt19937_64 rng(dev());
+using rand_type = std::mt19937_64::result_type;
+
+
+// Returns a uniformly distributed value in [0, n-1], drawn from the global rng.
+rand_type gen(rand_type n)
+{
+  std::uniform_int_distribution<rand_type> dist(0, n - 1);
+  return dist(rng);
+}
+
+// Next value returned by gen_seed(); starts at a random value below 2^62.
+rand_type seed = gen(1ll << 62);
+
+// Returns the current seed and post-increments it, so seeds are sequential.
+rand_type gen_seed()
+{
+  //std::uniform_int_distribution<rand_type> dist(0,1ll << 63);
+  //return dist(rng);
+  return seed++;
+}
+
+
+// Outcome of one trial; the values are used as indices into results[3].
+enum ResultType {
+  R_GOOD=0,
+  R_BAD=1,
+  R_HASHFAIL=2
+};
+
+// Runs one trial: updates a fresh Sketch with the indices 0..z-1, samples it
+// once and returns R_GOOD when the sample result is GOOD, R_BAD otherwise.
+// Requires 1 <= z <= n*n (asserted); a ZERO sample result also asserts.
+ResultType test_z(rand_type n, rand_type z)
+{
+  assert(z >= 1);
+  assert(z <= n*n);
+  Sketch sketch(n, gen_seed(), 1, 1);
+
+  // Generate z edges and track them
+  /*std::unordered_set<rand_type> edges;
+  while (edges.size() < z)
+  {
+    edges.insert(gen(n*n));
+  }
+
+  for (const auto& r : edges)
+  {
+    sketch.update(r);
+  }
+  */
+  for (rand_type i = 0; i < z; i++)
+    sketch.update(i);
+  // Sample the sketches
+  SketchSample query_ret = sketch.sample();
+  SampleResult ret_code = query_ret.result;
+
+  assert(ret_code != ZERO);
+
+  if (ret_code == GOOD)
+  {
+    //if (edges.find(res) == edges.end())
+    //  return R_HASHFAIL;
+    return R_GOOD;
+  }
+  return R_BAD;
+}
+
+// Treats ngood successes out of ntrials as a binomial sample and returns
+// (p, sigma): p = ngood/ntrials and sigma = sqrt(ntrials*p*(1-p))/ntrials,
+// i.e. the standard deviation rescaled to the same [0,1] range as p.
+std::pair<double, double> fit_to_binomial(rand_type ngood, rand_type ntrials)
+{
+  double p = ngood / (1.0 * ntrials);
+  double variance = ntrials * p * (1-p);
+  double stddev = std::sqrt(variance);
+  return std::pair<double, double>(p, stddev/ntrials);
+}
+
+// Runs 500 test_z(n, z) trials and fits the number of R_GOOD outcomes with
+// fit_to_binomial.
+std::pair<double, double> test_nz_pair(rand_type n, rand_type z)
+{
+  int ntrials = 500;
+  int results[3] = {0,0,0};
+  for (int i = 0; i < ntrials; i++)
+    results[test_z(n, z)]++;
+  //std::cout << "GOOD: " << results[0] << std::endl;
+  //std::cout << "BAD: " << results[1] << std::endl;
+  //std::cout << "HASHFAIL: " << results[2] << std::endl;
+  int ngood = results[0];
+  // Fit to binomial
+  return fit_to_binomial(ngood, ntrials);
+}
+
+// Runs one pass over a single Sketch: after the i-th update (index i) the
+// sketch is sampled, good[i] is incremented when the result is GOOD, and the
+// sample state is reset. good must point to at least max_z counters.
+void test_n_one(rand_type n, rand_type* good, rand_type max_z)
+{
+  Sketch sketch(n*n, gen_seed(), 1, 1);
+  for (rand_type i = 0; i < max_z; i++)
+  {
+    sketch.update(i);
+    // Sample the sketches
+    SketchSample query_ret = sketch.sample();
+    SampleResult ret_code = query_ret.result;
+    //assert(ret_code != ZERO);
+    if (ret_code == GOOD)
+      good[i]++;
+    sketch.reset_sample_state();
+  }
+}
+
+// Runs 500 passes of test_n_one with max_z = 1 + n*n/4, prints one
+// "i: p +- sigma" line per index, then repeats the line whose p - 3*sigma is
+// lowest under the heading "WORST".
+void test_n(rand_type n)
+{
+  int ntrials = 500;
+  rand_type max_z = 1+(n*n)/4;
+  // The trailing () value-initialises every counter to 0; without it the
+  // array is indeterminate and good[i]++ would accumulate garbage.
+  rand_type* good = new rand_type[max_z]();
+  for (int i = 0; i < ntrials; i++)
+    test_n_one(n, good, max_z);
+
+  double worst_3sigma = 1;
+  rand_type worst_i = 0;
+  for (rand_type i = 0; i < max_z; i++)
+  {
+    auto pair = fit_to_binomial(good[i], ntrials);
+    double ans = pair.first;
+    double stddev = pair.second;
+    // '\n' rather than std::endl: avoids a flush on each of the max_z lines.
+    std::cout << i << ": " << ans << " +- " << stddev << '\n';
+    if (ans - 3 * stddev < worst_3sigma)
+    {
+      worst_i = i;
+      worst_3sigma = ans-3*stddev;
+    }
+  }
+  auto pair = fit_to_binomial(good[worst_i], ntrials);
+  double ans = pair.first;
+  double stddev = pair.second;
+  std::cout << "WORST" << std::endl;
+  std::cout << worst_i << ": " << ans << " +- " << stddev << std::endl;
+
+  delete[] good;
+}
+
+// Prints a default-constructed CCAlgConfiguration, then runs test_n for n = 2^13.
+int main()
+{
+  std::cout << CCAlgConfiguration() << std::endl;
+  rand_type n = 1 << 13;
+  std::cout << "TESTING: " << n << " TO " << (n*n)/4 << std::endl;
+  test_n(n);
+}
diff --git a/tools/statistical_testing/sum_sketch_testing.py b/tools/statistical_testing/sum_sketch_testing.py
new file mode 100644
index 00000000..01052777
--- /dev/null
+++ b/tools/statistical_testing/sum_sketch_testing.py
@@ -0,0 +1,54 @@
+import sys
+import re
+
+prob = r"([0-9]*[.])?[0-9]+"
+which = r"[0-9]+"
+
+pattern = re.compile(r"(" + which + r"): (" + prob + r") \+- (" + prob + r")")  # groups: 1 = index, 2 = p, 4 = stddev
+
+def parse(filename):  # returns [(index, p, stddev), ...] for each line that matches pattern
+    with open(filename) as file:
+        lines = file.readlines()[:4000000]  # only the first 4,000,000 lines are used
+    stats = []
+    for l in lines:
+        match = pattern.match(l)
+        if match:
+            t = (int(match.group(1)), float(match.group(2)), float(match.group(4)))
+            stats.append(t)
+    return stats
+
+def above(stats, target, sigmas):  # prints the fraction of points with p - sigmas*stddev > target
+    above = 0
+    below = 0
+
+
+    for s in stats:
+        if (s[1] - sigmas * s[2] > target):
+            above += 1
+        else:
+            below += 1
+
+    print (above / (above + below))
+
+
+def mean(stats, sigmas):  # prints the mean of p - sigmas*stddev over all points
+    summ = 0
+    count = 0
+    for s in stats:
+        count += 1
+        summ += s[1] - sigmas * s[2]
+    print(summ/count)
+
+
+stats = parse(sys.argv[1])
+
+above(stats, 0.71, 0)
+
+mean(stats, 0)
+
+
+
+
+
+
+
From 
057f91b1ea57d3300863f4eecbee6104d528fee6 Mon Sep 17 00:00:00 2001 From: Daniel DeLayo Date: Mon, 12 Feb 2024 15:42:00 -0500 Subject: [PATCH 2/4] move to tools --- tools/{statistical_testing => }/sum_sketch_testing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) rename tools/{statistical_testing => }/sum_sketch_testing.py (88%) diff --git a/tools/statistical_testing/sum_sketch_testing.py b/tools/sum_sketch_testing.py similarity index 88% rename from tools/statistical_testing/sum_sketch_testing.py rename to tools/sum_sketch_testing.py index 01052777..55b666e7 100644 --- a/tools/statistical_testing/sum_sketch_testing.py +++ b/tools/sum_sketch_testing.py @@ -27,6 +27,7 @@ def above(stats, target, sigmas): above += 1 else: below += 1 + print("BELOW") print (above / (above + below)) @@ -42,9 +43,11 @@ def mean(stats, sigmas): stats = parse(sys.argv[1]) -above(stats, 0.71, 0) +above(stats, 0.76, 0) +#above(stats, 0.78, 1) +#above(stats, 0.78, 2) -mean(stats, 0) +#mean(stats, 3) From d669088bbc76e05bef99a4c7ac99286937b32aa2 Mon Sep 17 00:00:00 2001 From: Daniel DeLayo Date: Thu, 15 Feb 2024 15:01:12 -0500 Subject: [PATCH 3/4] move to tools 2 --- CMakeLists.txt | 2 +- tools/{statistical_testing => }/sketch_testing.cpp | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tools/{statistical_testing => }/sketch_testing.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a326e6e7..3c695baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,7 +152,7 @@ if (BUILD_EXE) target_link_libraries(statistical_test PRIVATE GraphZeppelinVerifyCC) add_executable(statistical_sketch_test - tools/statistical_testing/sketch_testing.cpp) + tools/sketch_testing.cpp) add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC) target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC) diff --git a/tools/statistical_testing/sketch_testing.cpp b/tools/sketch_testing.cpp similarity index 100% rename from 
tools/statistical_testing/sketch_testing.cpp rename to tools/sketch_testing.cpp From d502ae4265146e7d5eaf9396650c550b58e0df97 Mon Sep 17 00:00:00 2001 From: Daniel DeLayo Date: Thu, 22 Feb 2024 11:23:07 -0500 Subject: [PATCH 4/4] documentation --- tools/sketch_testing.cpp | 24 ++++++++++++++++++++++++ tools/sum_sketch_testing.py | 6 ++++++ 2 files changed, 30 insertions(+) diff --git a/tools/sketch_testing.cpp b/tools/sketch_testing.cpp index 3329a429..6d10287f 100644 --- a/tools/sketch_testing.cpp +++ b/tools/sketch_testing.cpp @@ -6,7 +6,31 @@ #include "sketch.h" #include "cc_alg_configuration.h" +/* + + The purpose of this file is to test the probability that a sketch column returns a nonzero. + That is, for a number of nonzeroes z, what's the probability of success? + + We model this as a binomial process for the sake of confidence intervals / stddev. + + Originally, this code inserted z random elements into a sketch then queried it. + + As a first speed optimization (that didn't appear to change outcome) (xxHash works well) + We replaced random insertion with sequential insertion. + + As a second speed optimization, we queried the sketch after every update. + That is, instead of O(z^2) insertions per z data points, we perform O(z) insertions per z data points. + This sacrifices independence. Whether or not the z-1th sketch is good is a fantastic predictor for the zth sketch being good. + But, for a given z, the results are still independent. + + For parity with the main code, column seeds are sequential. 
+ + The output of this is intended to be parsed into summary stats by sum_sketch_testing.py +*/ + + std::random_device dev; + std::mt19937_64 rng(dev()); using rand_type = std::mt19937_64::result_type; diff --git a/tools/sum_sketch_testing.py b/tools/sum_sketch_testing.py index 55b666e7..7a5f3a2b 100644 --- a/tools/sum_sketch_testing.py +++ b/tools/sum_sketch_testing.py @@ -1,6 +1,12 @@ import sys import re +""" +The purpose of this file is to parse the output of sketch_testing.cpp into summary statistics +That is, we can answer questions like "how many data points are 2 stddev above .8" +or "What is the mean of the data" +""" + prob = r"([0-9]*[.])?[0-9]+" which = r"[0-9]+"