Merge pull request #139 from GraphStreamingProject/refactor

statistical testing
GraphStreamingProject · Feb 26, 2024 · b4aa8d5 · b4aa8d5
2 parents c607b58 + 86d01d9
commit b4aa8d5
Show file tree

Hide file tree

Showing 3 changed files with 240 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -128,6 +128,12 @@ if (BUILD_EXE)
   add_dependencies(tests GraphZeppelinVerifyCC)
   target_link_libraries(tests PRIVATE GraphZeppelinVerifyCC)
 
+  # executable for statistically testing the success probability of sketch columns
+  add_executable(statistical_sketch_test
+    tools/sketch_testing.cpp)
+  add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC)
+  target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC)
+
+
   # executable for processing a binary graph stream
   add_executable(process_stream
     tools/process_stream.cpp)

diff --git a/tools/sketch_testing.cpp b/tools/sketch_testing.cpp
@@ -0,0 +1,171 @@
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "sketch.h"
+#include "cc_alg_configuration.h"
+
+/*
+
+  The purpose of this file is to test the probability that a sketch column returns a nonzero.
+  That is, for a given number of nonzeroes z, what is the probability of success?
+
+  We model this as a binomial process for the sake of confidence intervals / stddev.
+  
+  Originally, this code inserted z random elements into a sketch then queried it.
+
+  As a first speed optimization (which didn't appear to change the outcome, since xxHash works well),
+  we replaced random insertion with sequential insertion.
+
+  As a second speed optimization, we queried the sketch after every update. 
+  That is, instead of O(z^2) insertions per z data points, we perform O(z) insertions per z data points.
+  This sacrifices independence. Whether or not the z-1th sketch is good is a fantastic predictor for the zth sketch being good.
+  But, for a given z, the results are still independent.
+
+  For parity with the main code, column seeds are sequential.
+
+  The output of this is intended to be parsed into summary stats by sum_sketch_testing.py
+*/
+
+
+// Entropy source (typically non-deterministic); used once to seed the generator below.
+std::random_device dev;
+
+// Global 64-bit Mersenne Twister used by gen(); seeded from `dev`, so runs are not reproducible.
+std::mt19937_64 rng(dev());
+// Integer type produced by the generator; used throughout this file for sizes, counts and seeds.
+using rand_type = std::mt19937_64::result_type;
+
+
+/**
+ * Draw a uniformly distributed integer from the closed range [0, n-1]
+ * using the global generator `rng`.
+ *
+ * @param n exclusive upper bound of the range
+ * @return a value in [0, n-1]
+ */
+rand_type gen(rand_type n)
+{
+    const rand_type upper = n - 1;
+    std::uniform_int_distribution<rand_type> uniform(0, upper);
+    return uniform(rng);
+}
+
+// First sketch seed, drawn uniformly from [0, 2^62) during static initialization.
+rand_type seed = gen(1ll << 62);
+
+/**
+ * Hand out the next sketch seed.
+ *
+ * Seeds are sequential: each call returns the current value of `seed`
+ * and then advances it by one.
+ */
+rand_type gen_seed()
+{
+    // An earlier variant drew every seed independently instead:
+    //   std::uniform_int_distribution<rand_type> dist(0,1ll << 63);
+    //   return dist(rng);
+    const rand_type current = seed;
+    ++seed;
+    return current;
+}
+
+
+// Outcome of a single sketch query. The numeric values are used directly as
+// indices into the tally array in test_nz_pair, so they must stay 0, 1, 2.
+enum ResultType
+{
+    R_GOOD     = 0,  // the sample call reported GOOD
+    R_BAD      = 1,  // the sample call reported anything else
+    R_HASHFAIL = 2   // only referenced by commented-out code in test_z; never returned at present
+};
+
+/**
+ * Build a fresh sketch, apply z sequential updates (indices 0 .. z-1),
+ * sample it once and classify the outcome.
+ *
+ * An earlier variant inserted z distinct random indices drawn from [0, n*n)
+ * and returned R_HASHFAIL when the sampled index was not among them; that
+ * variant was replaced by the sequential insertion below.
+ *
+ * NOTE(review): the sketch is constructed with `n`, while the assertion and
+ * test_n_one both work with n*n -- confirm which size is intended here.
+ *
+ * @param n size parameter of the test
+ * @param z number of updates to apply; must satisfy 1 <= z <= n*n
+ * @return R_GOOD if the sample result is GOOD, otherwise R_BAD
+ */
+ResultType test_z(rand_type n, rand_type z)
+{
+    assert(z >= 1);
+    assert(z <= n*n);
+    Sketch sketch(n, gen_seed(), 1, 1);
+
+    for (rand_type idx = 0; idx != z; ++idx)
+    {
+        sketch.update(idx);
+    }
+
+    // Query the sketch once.
+    SketchSample sampled = sketch.sample();
+    const SampleResult status = sampled.result;
+
+    // At least one update was applied, so ZERO is not an expected outcome.
+    assert(status != ZERO);
+
+    return (status == GOOD) ? R_GOOD : R_BAD;
+}
+
+/**
+ * Fit an observed success count to a binomial model.
+ *
+ * @param ngood   number of successful trials
+ * @param ntrials total number of trials; must be greater than zero
+ * @return pair (p, s) where p = ngood / ntrials is the estimated success
+ *         probability and s = sqrt(ntrials * p * (1-p)) / ntrials is the
+ *         standard deviation of that proportion
+ */
+std::pair<double, double> fit_to_binomial(rand_type ngood, rand_type ntrials)
+{
+    // With zero trials the division below would silently produce NaN.
+    assert(ntrials > 0);
+    double p = ngood / (1.0 * ntrials);
+    double variance = ntrials * p * (1-p);
+    double stddev = std::sqrt(variance);
+    return std::pair<double, double>(p, stddev/ntrials);
+}
+
+/**
+ * Run 500 independent test_z(n, z) trials and fit the number of R_GOOD
+ * outcomes to a binomial model.
+ *
+ * @return the (probability, standard deviation) pair from fit_to_binomial
+ */
+std::pair<double, double> test_nz_pair(rand_type n, rand_type z)
+{
+    const int ntrials = 500;
+    // One counter per ResultType value: [R_GOOD, R_BAD, R_HASHFAIL].
+    int tally[3] = {0, 0, 0};
+    for (int trial = 0; trial < ntrials; ++trial)
+    {
+        const ResultType outcome = test_z(n, z);
+        ++tally[outcome];
+    }
+    // Only the number of good outcomes enters the fit.
+    return fit_to_binomial(tally[R_GOOD], ntrials);
+}
+
+/**
+ * Run one trial: feed indices 0 .. max_z-1 into a single fresh sketch and
+ * sample it after every update.
+ *
+ * @param n     size parameter; the sketch is constructed with n*n
+ * @param good  array of at least max_z counters; good[i] is incremented when
+ *              the sample taken after update i (i.e. after i+1 updates) is GOOD
+ * @param max_z number of updates to apply
+ */
+void test_n_one(rand_type n, rand_type* good, rand_type max_z)
+{
+  Sketch sketch(n*n, gen_seed(), 1, 1);
+  rand_type inserted = 0;
+  while (inserted < max_z)
+  {
+    sketch.update(inserted);
+    // Query the sketch and record whether this prefix length succeeded.
+    const SampleResult outcome = sketch.sample().result;
+    if (outcome == GOOD)
+      ++good[inserted];
+    // Reset the sample state so the same sketch can be queried again next round.
+    sketch.reset_sample_state();
+    ++inserted;
+  }
+}
+
+/**
+ * Run 500 independent test_n_one trials for size parameter n and print, for
+ * every prefix length, the estimated success probability and its standard
+ * deviation.
+ *
+ * Output (parsed by sum_sketch_testing.py): one line "i: p +- stddev" per
+ * index i, where index i describes the sketch after i+1 sequential updates,
+ * followed by a "WORST" line and the entry with the smallest p - 3*stddev.
+ */
+void test_n(rand_type n)
+{
+  int ntrials = 500;
+  rand_type max_z = 1+(n*n)/4;
+  // The counters are only ever incremented by test_n_one, so they must start
+  // at zero. A plain `new rand_type[max_z]` leaves them indeterminate.
+  std::vector<rand_type> good(max_z, 0);
+  for (int i = 0; i < ntrials; i++)
+    test_n_one(n, good.data(), max_z);
+
+  double worst_3sigma = 1;
+  rand_type worst_i = 0;
+  for (rand_type i = 0; i < max_z; i++)
+  {
+    auto pair = fit_to_binomial(good[i], ntrials);
+    double ans = pair.first;
+    double stddev = pair.second;
+    // '\n' rather than std::endl: same bytes, but no flush on every line.
+    std::cout << i << ": " << ans << " +- " << stddev << '\n';
+    if (ans - 3 * stddev < worst_3sigma)
+    {
+      worst_i = i;
+      worst_3sigma = ans-3*stddev;
+    }
+  }
+  auto pair = fit_to_binomial(good[worst_i], ntrials);
+  double ans = pair.first;
+  double stddev = pair.second;
+  std::cout << "WORST" << std::endl;
+  std::cout << worst_i << ": " << ans << " +- " << stddev << std::endl;
+}
+
+/**
+ * Entry point: print the algorithm configuration, announce the tested range
+ * and run the statistical test for n = 2^13.
+ */
+int main()
+{
+  std::cout << CCAlgConfiguration() << std::endl;
+
+  const rand_type n = rand_type{1} << 13;
+  const rand_type upper = (n*n)/4;
+  std::cout << "TESTING: " << n << " TO " << upper << std::endl;
+
+  test_n(n);
+}
diff --git a/tools/sum_sketch_testing.py b/tools/sum_sketch_testing.py
@@ -0,0 +1,63 @@
+import sys
+import re
+
+"""
+The purpose of this file is to parse the output of sketch_testing.cpp into summary statistics
+That is, we can answer questions like "how many data points are 2 stddev above .8"
+or "What is the mean of the data"
+"""
+
+# Regex fragments for one data line of sketch_testing.cpp output:
+#   "<index>: <probability> +- <stddev>"
+# `prob` contains its own capture group, so in `pattern` the groups are:
+#   1 = index, 2 = probability, 3 = (inner), 4 = stddev, 5 = (inner)
+prob = r"([0-9]*[.])?[0-9]+"
+which = r"[0-9]+"
+
+# The literal part is a raw string: "\+" in a normal string literal is an
+# invalid escape sequence.
+pattern = re.compile("(" + which + "): (" + prob + r") \+- (" + prob + ")")
+
+def parse(filename, max_lines=4000000):
+  """
+  Parse the output of sketch_testing.cpp into a list of
+  (index, probability, stddev) tuples.
+
+  Lines that do not match `pattern` are skipped. Parsing stops after
+  `max_lines` lines, or at the "WORST" trailer: the entry printed after it
+  repeats an earlier data line and would otherwise be counted twice.
+  """
+  stats = []
+  with open(filename) as file:
+    # Iterate lazily instead of loading the whole file with readlines().
+    for lineno, l in enumerate(file):
+      if lineno >= max_lines or l.startswith("WORST"):
+        break
+      match = pattern.match(l)
+      if match:
+        t = (int(match.group(1)), float(match.group(2)), float(match.group(4)))
+        stats.append(t)
+  return stats
+
+def above(stats, target, sigmas):
+  """
+  Print the fraction of data points whose probability, lowered by `sigmas`
+  standard deviations, is strictly greater than `target`.
+
+  Prints "BELOW" once for every data point that is not above the target.
+  Prints nan when `stats` is empty.
+  """
+  n_above = 0
+  n_below = 0
+
+  for s in stats:
+    if (s[1] - sigmas * s[2] > target):
+      n_above += 1
+    else:
+      n_below += 1
+      print("BELOW")
+
+  total = n_above + n_below
+  # No data points: report nan instead of raising ZeroDivisionError.
+  print(n_above / total if total else float("nan"))
+
+
+def mean(stats, sigmas):
+  """
+  Print the mean over all data points of (probability - sigmas * stddev).
+
+  Prints nan when `stats` is empty.
+  """
+  summ = 0
+  count = 0
+  for s in stats:
+    count += 1
+    summ += s[1] - sigmas * s[2]
+  # No data points: report nan instead of raising ZeroDivisionError.
+  print(summ/count if count else float("nan"))
+
+
+# Usage: python sum_sketch_testing.py <file containing sketch_testing.cpp output>
+if len(sys.argv) < 2:
+  # Exit with a readable message instead of a bare IndexError.
+  sys.exit("usage: " + sys.argv[0] + " <sketch_testing output file>")
+
+stats = parse(sys.argv[1])
+
+# Fraction of data points whose success probability is strictly above 0.76.
+above(stats, 0.76, 0)
+#above(stats, 0.78, 1)
+#above(stats, 0.78, 2)
+
+#mean(stats, 3)
+
+
+
+
+
+
+