From 8546f1da03e9ceca8a4be8a5c32a2fb4f80cfe43 Mon Sep 17 00:00:00 2001
From: Daniel DeLayo <dandelayo@gmail.com>
Date: Mon, 22 Jan 2024 11:54:49 -0500
Subject: [PATCH 1/4] statistical testing

---
 CMakeLists.txt                                |   5 +
 tools/statistical_testing/sketch_testing.cpp  | 147 ++++++++++++++++++
 .../statistical_testing/sum_sketch_testing.py |  54 +++++++
 3 files changed, 206 insertions(+)
 create mode 100644 tools/statistical_testing/sketch_testing.cpp
 create mode 100644 tools/statistical_testing/sum_sketch_testing.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a66c474d..d775131d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -148,6 +148,11 @@ if (BUILD_EXE)
     test/util/graph_gen.cpp)
   add_dependencies(statistical_test GraphZeppelinVerifyCC)
   target_link_libraries(statistical_test PRIVATE GraphZeppelinVerifyCC)
+  
+  add_executable(statistical_sketch_test
+    tools/statistical_testing/sketch_testing.cpp)
+  add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC)
+  target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC)
 
   # executables for experiment/benchmarking
   add_executable(efficient_gen
diff --git a/tools/statistical_testing/sketch_testing.cpp b/tools/statistical_testing/sketch_testing.cpp
new file mode 100644
index 00000000..3329a429
--- /dev/null
+++ b/tools/statistical_testing/sketch_testing.cpp
@@ -0,0 +1,147 @@
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <set>
+#include "sketch.h"
+#include "cc_alg_configuration.h"
+
+std::random_device dev;
+std::mt19937_64 rng(dev());
+using rand_type = std::mt19937_64::result_type;
+
+    
+rand_type gen(rand_type n)
+{
+    // Uniform draw from the closed range [0, n-1] using the file-level engine `rng`.
+    return std::uniform_int_distribution<rand_type>(0, n - 1)(rng);
+}
+
+rand_type seed = gen(1ll << 62);  // random starting point in [0, 2^62) for the seed counter
+
+rand_type gen_seed()
+{
+    // Seeds are handed out sequentially: return the current counter value, then advance it.
+    const rand_type current = seed++;
+    return current;
+}
+
+
+enum ResultType {  // outcome of one test_z trial; values double as array indices in test_nz_pair
+    R_GOOD=0,      // the sketch sample result was GOOD
+    R_BAD=1,       // any other sample result
+    R_HASHFAIL=2   // only referenced by commented-out validation code; never returned at present
+};
+
+// Single independent trial: build a fresh sketch, insert the z indices 0..z-1 and sample once.
+// Returns R_GOOD when the sample result is GOOD and R_BAD otherwise; R_HASHFAIL is never
+// returned because the inserted set is no longer tracked (see the reference code below).
+// NOTE(review): the first Sketch constructor argument is n here but n*n in test_n_one, while
+// the assert admits z up to n*n -- confirm which value is intended.
+ResultType test_z(rand_type n, rand_type z)
+{
+    assert(z >= 1);
+    assert(z <= n*n);
+    Sketch sketch(n, gen_seed(), 1, 1);
+
+    /* Earlier variant, kept for reference: insert z distinct random indices and remember
+       them so that a GOOD answer outside the inserted set could be flagged as R_HASHFAIL.
+    std::unordered_set<rand_type> edges;
+    while (edges.size() < z)
+        edges.insert(gen(n*n));
+    for (const auto& r : edges)
+        sketch.update(r);
+    */
+    for (rand_type idx = 0; idx != z; ++idx)
+    {
+        sketch.update(idx);
+    }
+
+    // Sample once. The assert records the expectation that a sketch holding z >= 1
+    // updates does not report ZERO.
+    const SampleResult ret_code = sketch.sample().result;
+    assert(ret_code != ZERO);
+
+    if (ret_code != GOOD)
+        return R_BAD;
+    // (the earlier variant checked here that the sampled index was a member of `edges`)
+    return R_GOOD;
+}
+
+// Binomial fit of ngood successes in ntrials: returns {p, sqrt(ntrials*p*(1-p)) / ntrials}.
+std::pair<double, double> fit_to_binomial(rand_type ngood, rand_type ntrials)
+{
+    const double p = ngood / (1.0 * ntrials);
+    const double stddev = std::sqrt(ntrials * p * (1 - p));  // std-dev of the success count
+    return std::make_pair(p, stddev / ntrials);              // scaled down to the proportion p
+}
+
+// Run 500 independent test_z(n, z) trials and summarise the R_GOOD count through
+// fit_to_binomial, i.e. as {success proportion, its standard deviation}.
+std::pair<double, double> test_nz_pair(rand_type n, rand_type z)
+{
+    const int ntrials = 500;
+    // One counter per ResultType value (R_GOOD, R_BAD, R_HASHFAIL), indexed by the enum.
+    int tally[3] = {};
+    for (int trial = 0; trial != ntrials; ++trial)
+    {
+        ++tally[test_z(n, z)];
+    }
+    return fit_to_binomial(tally[R_GOOD], ntrials);
+}
+
+// One trial of the incremental experiment: feed indices 0..max_z-1 into a single fresh
+// sketch and sample after every update. good[i] is incremented whenever the sample taken
+// right after inserting index i is GOOD; good must point at >= max_z counters.
+void test_n_one(rand_type n, rand_type* good, rand_type max_z)
+{
+  Sketch sketch(n*n, gen_seed(), 1, 1);
+  for (rand_type inserted = 0; inserted < max_z; ++inserted)
+  {
+    sketch.update(inserted);
+    if (sketch.sample().result == GOOD)
+      ++good[inserted];
+    // Presumably required before the same sketch may be sampled again -- confirm in sketch.h.
+    sketch.reset_sample_state();
+  }
+}
+
+// Run 500 incremental trials (test_n_one) for n, print "<i>: <p> +- <stddev>" for every
+// index i in [0, max_z), then repeat the entry with the lowest p - 3*stddev after a "WORST"
+// marker line. The per-index line format is what sum_sketch_testing.py parses.
+void test_n(rand_type n)
+{
+  int ntrials = 500;
+  rand_type max_z = 1+(n*n)/4;
+  // new T[n]() value-initialises to 0; a plain new T[n] leaves the counters indeterminate.
+  rand_type* good = new rand_type[max_z]();
+  for (int i = 0; i < ntrials; i++)
+    test_n_one(n, good, max_z);
+
+  double worst_3sigma = 1;
+  rand_type worst_i = 0;
+  for (rand_type i = 0; i < max_z; i++)
+  {
+    auto pair = fit_to_binomial(good[i], ntrials);
+    double ans = pair.first;
+    double stddev = pair.second;
+    std::cout << i << ": " << ans << " +- " << stddev << '\n';  // '\n': no flush per line
+    if (ans - 3 * stddev < worst_3sigma)
+    {
+      worst_i = i;
+      worst_3sigma = ans-3*stddev;
+    }
+  }
+  auto worst = fit_to_binomial(good[worst_i], ntrials);
+  std::cout << "WORST" << std::endl;
+  std::cout << worst_i << ": " << worst.first << " +- " << worst.second << std::endl;
+  delete[] good;
+}
+
+int main()
+{
+  std::cout << CCAlgConfiguration() << std::endl;  // print a default-constructed configuration
+  const rand_type n = rand_type{1} << 13;          // 8192
+  std::cout << "TESTING: " << n << " TO " << n * n / 4 << std::endl;
+  test_n(n);
+}
diff --git a/tools/statistical_testing/sum_sketch_testing.py b/tools/statistical_testing/sum_sketch_testing.py
new file mode 100644
index 00000000..01052777
--- /dev/null
+++ b/tools/statistical_testing/sum_sketch_testing.py
@@ -0,0 +1,54 @@
+import sys
+import re
+
+prob = r"([0-9]*[.])?[0-9]+"
+which = r"[0-9]+"
+
+pattern = re.compile("(" + which + "): (" + prob + r") \+- (" + prob + ")")  # raw piece: "\+" is an invalid escape in a plain literal
+
+def parse(filename):
+  """Return [(z, p, stddev), ...] for each '<z>: <p> +- <stddev>' line in the first 4000000 lines."""
+  from itertools import islice  # function-scope import keeps the module header untouched
+  stats = []
+  with open(filename) as file:
+    for l in islice(file, 4000000):  # stream the file instead of readlines() on all of it
+      match = pattern.match(l)
+      if match:  # groups 3 and 5 are the optional "digits." sub-groups inside `prob`
+        stats.append((int(match.group(1)), float(match.group(2)), float(match.group(4))))
+  return stats
+
+def above(stats, target, sigmas):
+  """Print the fraction of data points whose lower bound p - sigmas*stddev exceeds target."""
+  # NOTE(review): raises ZeroDivisionError when stats is empty.
+  above = 0
+  below = 0
+  for s in stats:
+    if (s[1] - sigmas * s[2] > target):
+      above += 1
+    else:
+      below += 1
+
+  print (above / (above + below))
+  
+
+def mean(stats, sigmas):
+  """Print the mean of p - sigmas*stddev over all data points (nan when stats is empty)."""
+  summ = 0
+  for s in stats:
+    summ += s[1] - sigmas * s[2]
+  # Avoid ZeroDivisionError on an empty parse; report nan instead.
+  print(summ/len(stats) if stats else float("nan"))
+  
+  
+stats = parse(sys.argv[1])
+
+above(stats, 0.71, 0)
+
+mean(stats, 0)
+
+
+
+
+
+
+

From 057f91b1ea57d3300863f4eecbee6104d528fee6 Mon Sep 17 00:00:00 2001
From: Daniel DeLayo <dandelayo@gmail.com>
Date: Mon, 12 Feb 2024 15:42:00 -0500
Subject: [PATCH 2/4] move to tools

---
 tools/{statistical_testing => }/sum_sketch_testing.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
 rename tools/{statistical_testing => }/sum_sketch_testing.py (88%)

diff --git a/tools/statistical_testing/sum_sketch_testing.py b/tools/sum_sketch_testing.py
similarity index 88%
rename from tools/statistical_testing/sum_sketch_testing.py
rename to tools/sum_sketch_testing.py
index 01052777..55b666e7 100644
--- a/tools/statistical_testing/sum_sketch_testing.py
+++ b/tools/sum_sketch_testing.py
@@ -27,6 +27,7 @@ def above(stats, target, sigmas):
       above += 1
     else:
       below += 1
+      print("BELOW")
 
   print (above / (above + below))
   
@@ -42,9 +43,11 @@ def mean(stats, sigmas):
   
 stats = parse(sys.argv[1])
 
-above(stats, 0.71, 0)
+above(stats, 0.76, 0)
+#above(stats, 0.78, 1)
+#above(stats, 0.78, 2)
 
-mean(stats, 0)
+#mean(stats, 3)
 
 
 

From d669088bbc76e05bef99a4c7ac99286937b32aa2 Mon Sep 17 00:00:00 2001
From: Daniel DeLayo <dandelayo@gmail.com>
Date: Thu, 15 Feb 2024 15:01:12 -0500
Subject: [PATCH 3/4] move to tools 2

---
 CMakeLists.txt                                     | 2 +-
 tools/{statistical_testing => }/sketch_testing.cpp | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename tools/{statistical_testing => }/sketch_testing.cpp (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a326e6e7..3c695baa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,7 +152,7 @@ if (BUILD_EXE)
   target_link_libraries(statistical_test PRIVATE GraphZeppelinVerifyCC)
   
   add_executable(statistical_sketch_test
-    tools/statistical_testing/sketch_testing.cpp)
+    tools/sketch_testing.cpp)
   add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC)
   target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC)
 
diff --git a/tools/statistical_testing/sketch_testing.cpp b/tools/sketch_testing.cpp
similarity index 100%
rename from tools/statistical_testing/sketch_testing.cpp
rename to tools/sketch_testing.cpp

From d502ae4265146e7d5eaf9396650c550b58e0df97 Mon Sep 17 00:00:00 2001
From: Daniel DeLayo <dandelayo@gmail.com>
Date: Thu, 22 Feb 2024 11:23:07 -0500
Subject: [PATCH 4/4] documentation

---
 tools/sketch_testing.cpp    | 24 ++++++++++++++++++++++++
 tools/sum_sketch_testing.py |  6 ++++++
 2 files changed, 30 insertions(+)

diff --git a/tools/sketch_testing.cpp b/tools/sketch_testing.cpp
index 3329a429..6d10287f 100644
--- a/tools/sketch_testing.cpp
+++ b/tools/sketch_testing.cpp
@@ -6,7 +6,31 @@
 #include "sketch.h"
 #include "cc_alg_configuration.h"
 
+/*
+
+  The purpose of this file is to test the probability that a sketch column returns a nonzero.
+  That is, for a number of nonzeroes z, what's the probability of success?
+
+  We model this as a binomial process for the sake of confidence intervals / stddev.
+  
+  Originally, this code inserted z random elements into a sketch then queried it.
+
+  As a first speed optimization (that didn't appear to change the outcome; xxHash works well),
+  we replaced random insertion with sequential insertion.
+
+  As a second speed optimization, we queried the sketch after every update. 
+  That is, instead of O(z^2) insertions per z data points, we perform O(z) insertions per z data points.
+  This sacrifices independence. Whether or not the z-1th sketch is good is a fantastic predictor for the zth sketch being good.
+  But, for a given z, the results are still independent.
+
+  For parity with the main code, column seeds are sequential.
+
+  The output of this is intended to be parsed into summary stats by sum_sketch_testing.py
+*/
+
+
 std::random_device dev;
+
 std::mt19937_64 rng(dev());
 using rand_type = std::mt19937_64::result_type;
 
diff --git a/tools/sum_sketch_testing.py b/tools/sum_sketch_testing.py
index 55b666e7..7a5f3a2b 100644
--- a/tools/sum_sketch_testing.py
+++ b/tools/sum_sketch_testing.py
@@ -1,6 +1,12 @@
 import sys
 import re
 
+"""
+The purpose of this file is to parse the output of sketch_testing.cpp into summary statistics
+That is, we can answer questions like "how many data points are 2 stddev above .8"
+or "What is the mean of the data"
+"""
+
 prob = r"([0-9]*[.])?[0-9]+"
 which = r"[0-9]+"