Skip to content

Commit

Permalink
Merge pull request #139 from GraphStreamingProject/refactor
Browse files Browse the repository at this point in the history
statistical testing
  • Loading branch information
DanielDeLayo authored Feb 26, 2024
2 parents c607b58 + 86d01d9 commit b4aa8d5
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 0 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ if (BUILD_EXE)
add_dependencies(tests GraphZeppelinVerifyCC)
target_link_libraries(tests PRIVATE GraphZeppelinVerifyCC)

# Statistical test harness built from tools/sketch_testing.cpp.
# Linked against GraphZeppelinVerifyCC, the same library the `tests` target links against.
add_executable(statistical_sketch_test
tools/sketch_testing.cpp)
add_dependencies(statistical_sketch_test GraphZeppelinVerifyCC)
target_link_libraries(statistical_sketch_test PRIVATE GraphZeppelinVerifyCC)


# executable for processing a binary graph stream
add_executable(process_stream
tools/process_stream.cpp)
Expand Down
171 changes: 171 additions & 0 deletions tools/sketch_testing.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#include <cassert>
#include <cmath>
#include <iostream>
#include <random>
#include <set>
#include <utility>
#include <vector>

#include "sketch.h"
#include "cc_alg_configuration.h"

/*
The purpose of this file is to test the probability that a sketch column returns a nonzero
That is, for a number of nonzeroes z, what's the probability of success?
We model this as a binomial process for the sake of confidence intervals / stddev.
Originally, this code inserted z random elements into a sketch then queried it.
As a first speed optimization (that didn't appear to change outcome) (xxHash works well)
We replaced random insertion with sequential insertion.
As a second speed optimization, we queried the sketch after every update.
That is, instead of O(z^2) insertions per z data points, we perform O(z) insertions per z data points.
This sacrifices independence. Whether or not the z-1th sketch is good is a fantastic predictor for the zth sketch being good.
But, for a given z, the results are still independent.
For parity with the main code, column seeds are sequential.
The output of this is intended to be parsed into summary stats by sum_sketch_testing.py
*/


// Seed source for the global generator below.
std::random_device dev;

// Integer type produced by the 64-bit Mersenne Twister; used throughout this file
// for sizes, indices, seeds and counters.
using rand_type = std::mt19937_64::result_type;

// Global generator shared by every helper in this file, seeded once from `dev`.
std::mt19937_64 rng{dev()};


// Draws one value uniformly from the closed range [0, n-1] using the global `rng`.
// A new distribution object is built on every call.
// NOTE(review): callers are expected to pass n >= 1; n == 0 makes `n - 1` wrap
// (rand_type is unsigned) instead of describing an empty range.
rand_type gen(rand_type n)
{
  return std::uniform_int_distribution<rand_type>(0, n - 1)(rng);
}

// Next seed handed out by gen_seed(); starts at a random value below 2^62.
rand_type seed = gen(static_cast<rand_type>(1) << 62);

// Hands out sketch seeds sequentially: returns the current value of the global
// `seed` and advances it by one. An earlier variant that drew a fresh random seed
// on every call was replaced by this counter.
rand_type gen_seed()
{
  const rand_type issued = seed;
  seed = issued + 1;
  return issued;
}


// Outcome of a single sketch query. The numeric values are used directly as
// array indices when tallying results, so they must stay 0, 1, 2.
enum ResultType {
  R_GOOD = 0,      // the sample succeeded
  R_BAD = 1,       // the sample did not succeed
  R_HASHFAIL = 2   // reserved: a sample that is not one of the inserted elements
};

// Builds a fresh sketch with a new seed, inserts the z indices 0..z-1 and
// samples it once.
//
// Returns R_GOOD when the sample result is GOOD and R_BAD otherwise.
// R_HASHFAIL is never returned: the inserted elements are no longer tracked
// (they used to be drawn at random and remembered in a set), so a returned
// sample cannot be checked against them.
//
// Preconditions (checked with assert): 1 <= z <= n*n.
// NOTE(review): this constructs Sketch(n, ...) whereas test_n_one constructs
// Sketch(n*n, ...); confirm which first argument is intended here.
ResultType test_z(rand_type n, rand_type z)
{
  assert(z >= 1);
  assert(z <= n*n);
  Sketch sketch(n, gen_seed(), 1, 1);

  // Sequential indices stand in for randomly chosen ones.
  rand_type next = 0;
  while (next < z)
  {
    sketch.update(next);
    ++next;
  }

  // Query the sketch once; at least one element was inserted, so ZERO is unexpected.
  const SampleResult ret_code = sketch.sample().result;
  assert(ret_code != ZERO);

  return (ret_code == GOOD) ? R_GOOD : R_BAD;
}

// Summarises `ngood` successes out of `ntrials` trials as a binomial estimate.
//
// Returns the pair (p, sigma) where
//   p     = ngood / ntrials                       (observed success probability)
//   sigma = sqrt(ntrials * p * (1 - p)) / ntrials (binomial standard deviation,
//                                                  rescaled to a proportion)
//
// `ntrials` is expected to be nonzero; with ntrials == 0 the divisions produce
// non-finite values.
std::pair<double, double> fit_to_binomial(rand_type ngood, rand_type ntrials)
{
  const double p = ngood / (1.0 * ntrials);
  const double variance = ntrials * p * (1 - p);
  // std::sqrt from <cmath>, rather than relying on a transitively visible ::sqrt.
  const double stddev = std::sqrt(variance);
  return std::pair<double, double>(p, stddev / ntrials);
}

// Runs test_z(n, z) 500 times and fits the number of R_GOOD outcomes to a
// binomial. Returns whatever fit_to_binomial returns for (good count, 500).
std::pair<double, double> test_nz_pair(rand_type n, rand_type z)
{
  const int ntrials = 500;

  // One counter per ResultType value, indexed by the enum's integer value.
  int tally[3] = {0, 0, 0};
  for (int trial = 0; trial != ntrials; ++trial)
  {
    const ResultType outcome = test_z(n, z);
    ++tally[outcome];
  }

  // Only the successes feed the fit; the R_BAD / R_HASHFAIL counts are not reported.
  return fit_to_binomial(tally[R_GOOD], ntrials);
}

// Performs one incremental trial on a single freshly seeded sketch.
//
// Indices 0..max_z-1 are inserted one at a time; after each insertion the
// sketch is sampled, and good[i] is incremented when the sample taken right
// after inserting index i (i.e. with i+1 elements inserted) is GOOD.
// The sample state is reset after every query so the sketch can be sampled again.
//
// `good` must point to at least max_z counters; they are only incremented here,
// never cleared.
void test_n_one(rand_type n, rand_type* good, rand_type max_z)
{
  Sketch sketch(n*n, gen_seed(), 1, 1);
  for (rand_type idx = 0; idx != max_z; ++idx)
  {
    sketch.update(idx);

    // A ZERO result is tolerated here; only GOOD is counted.
    const SampleResult outcome = sketch.sample().result;
    if (outcome == GOOD)
      good[idx] += 1;

    sketch.reset_sample_state();
  }
}

// Runs 500 incremental trials (test_n_one) and prints, for every index i in
// [0, max_z) with max_z = 1 + n*n/4, one line of the form
//   "<i>: <p> +- <sigma>"
// where (p, sigma) is the binomial fit of the number of trials whose sample was
// GOOD at index i. Index i corresponds to i+1 inserted elements.
// Afterwards prints "WORST" followed by the line for the index with the lowest
// value of p - 3*sigma (the first such index on ties).
void test_n(rand_type n)
{
  const int ntrials = 500;
  const rand_type max_z = 1 + (n*n)/4;

  // The counters must start at zero: test_n_one only ever increments them.
  // (A plain `new rand_type[max_z]` would leave them uninitialised.)
  std::vector<rand_type> good(max_z, 0);
  for (int trial = 0; trial < ntrials; trial++)
    test_n_one(n, good.data(), max_z);

  double worst_3sigma = 1;
  rand_type worst_i = 0;
  for (rand_type i = 0; i < max_z; i++)
  {
    const auto fit = fit_to_binomial(good[i], ntrials);
    const double ans = fit.first;
    const double stddev = fit.second;
    // '\n' instead of std::endl: avoid flushing the stream once per data point.
    std::cout << i << ": " << ans << " +- " << stddev << '\n';
    if (ans - 3 * stddev < worst_3sigma)
    {
      worst_i = i;
      worst_3sigma = ans - 3 * stddev;
    }
  }

  const auto worst_fit = fit_to_binomial(good[worst_i], ntrials);
  const double ans = worst_fit.first;
  const double stddev = worst_fit.second;
  std::cout << "WORST" << std::endl;
  std::cout << worst_i << ": " << ans << " +- " << stddev << std::endl;
}

int main()
{
std::cout << CCAlgConfiguration() << std::endl;
rand_type n = 1 << 13;
std::cout << "TESTING: " << n << " TO " << (n*n)/4 << std::endl;
test_n(n);
}
63 changes: 63 additions & 0 deletions tools/sum_sketch_testing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import sys
import re

"""
The purpose of this file is to parse the output of sketch_testing.cpp into summary statistics
That is, we can answer questions like "how many data points are 2 stddev above .8"
or "What is the mean of the data"
"""

# A non-negative decimal number such as "1", "0.78" or ".5".
# Note: this fragment contains one capturing group (the optional "digits." prefix),
# which shifts the group numbers of the combined pattern below.
prob = r"([0-9]*[.])?[0-9]+"
# The integer index that starts each data line.
which = r"[0-9]+"

# Matches data lines of the form "<index>: <value> +- <stddev>".
# Groups: 1 = index, 2 = value, 3 = value's optional prefix,
#         4 = stddev, 5 = stddev's optional prefix.
# Raw strings are used throughout so that "\+" reaches the regex engine as an
# escaped plus sign instead of being an invalid string escape sequence.
pattern = re.compile(r"(" + which + r"): (" + prob + r") \+- (" + prob + r")")

def parse(filename):
    """Read the output of sketch_testing.cpp and extract its data points.

    Only the first 4,000,000 lines of the file are considered. Every line that
    matches the module-level ``pattern`` contributes one tuple
    ``(index, value, stddev)``; all other lines are skipped.

    Returns the list of tuples in file order.
    """
    with open(filename) as handle:
        rows = handle.readlines()[:4000000]

    parsed = []
    for row in rows:
        found = pattern.match(row)
        if not found:
            continue
        # Group 3 is the optional prefix inside the first number, so the
        # second number is group 4.
        entry = (int(found.group(1)), float(found.group(2)), float(found.group(4)))
        parsed.append(entry)
    return parsed

# Prints the fraction of data points whose lower bound, value - sigmas * stddev,
# is strictly greater than target.
#   stats:  iterable of tuples; element [1] is read as the value and [2] as its stddev
#   target: threshold the lower bound is compared against
#   sigmas: number of standard deviations subtracted from the value
# Also prints "BELOW" for points that do not clear the threshold.
# NOTE(review): indentation is not visible in this view; print("BELOW") is presumably
# inside the else branch (one line per failing point) - confirm against the original file.
# NOTE(review): with no data points, above + below is 0 and the final division raises
# ZeroDivisionError.
def above(stats, target, sigmas):
# Local counters; inside this function they shadow the function's own name.
above = 0
below = 0


for s in stats:
if (s[1] - sigmas * s[2] > target):
above += 1
else:
below += 1
print("BELOW")

# Fraction of points above the target.
print (above / (above + below))


def mean(stats, sigmas):
    """Print the mean of ``value - sigmas * stddev`` over all data points.

    ``stats`` is an iterable of tuples whose element [1] is the value and
    element [2] its standard deviation. An empty ``stats`` raises
    ZeroDivisionError.
    """
    total = 0
    n_points = 0
    for entry in stats:
        lower_bound = entry[1] - sigmas * entry[2]
        total += lower_bound
        n_points += 1
    print(total / n_points)


# Script driver: the first command-line argument is the file holding the
# output of sketch_testing.cpp.
stats = parse(sys.argv[1])

# Report the fraction of data points whose raw value (0 sigmas) exceeds 0.76.
above(stats, target=0.76, sigmas=0)

# Alternative summaries, kept disabled:
#above(stats, 0.78, 1)
#above(stats, 0.78, 2)

#mean(stats, 3)







0 comments on commit b4aa8d5

Please sign in to comment.