From a70a2f6ac77e052ca6c2d875541e13c4bbc8cda1 Mon Sep 17 00:00:00 2001
From: Joshua Moerman <lakseru@gmail.com>
Date: Fri, 18 Sep 2015 13:17:35 +0200
Subject: [PATCH] Some script I use to make statistical learning graphs

---
 src/learning_graph.cpp | 171 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 src/learning_graph.cpp
diff --git a/src/learning_graph.cpp b/src/learning_graph.cpp
new file mode 100644
index 0000000..3afa768
--- /dev/null
+++ b/src/learning_graph.cpp
@@ -0,0 +1,171 @@
+#include <docopt.h>
+
+#include <boost/range/algorithm/sort.hpp>
+#include <boost/range/algorithm/unique.hpp>
+
+#include <cstdint>
+#include <cmath>
+#include <fstream>
+#include <future>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+using namespace std;
+
+static const char USAGE[] = // usage text handed to docopt::docopt() in main(); it is parsed at runtime
+    R"(Generate a statistical learning graph from multiple runs
+
+    Usage:
+      learning_graph <file> ...
+
+    Options:
+      -h, --help       Show this screen
+      --version        Show version
+)";
+
+struct datapoint { // one row of an input file; fields are declared in the order main() reads them
+	uint64_t states; // key the output is grouped by; accumulate_dataset() never changes it
+	uint64_t learning_queries; // the four counters below are turned into running totals by
+	uint64_t learning_inputs; // accumulate_dataset(), and main() adds all four together to
+	uint64_t testing_queries; // get the single value it interpolates per row
+	uint64_t testing_inputs;
+};
+
+using dataset = vector<datapoint>; // all rows parsed from one input file, in file order
+
+static void accumulate_dataset(dataset & ds) { // per-row counters -> running totals, in place
+	for (size_t i = 1; i < ds.size(); ++i) { // start at 1: `size() - 1` wraps when ds is empty
+		ds[i].learning_queries += ds[i - 1].learning_queries;
+		ds[i].learning_inputs += ds[i - 1].learning_inputs;
+		ds[i].testing_queries += ds[i - 1].testing_queries;
+		ds[i].testing_inputs += ds[i - 1].testing_inputs;
+	}
+}
+
+// Writes min, Q1, median, Q3 and max of selector(element) to `out`, tab-separated and
+// without a trailing newline. Quartiles are linearly interpolated between the two nearest
+// order statistics. `container` is copied before sorting, so the caller's order is kept.
+// Throws std::invalid_argument when `container` is empty (there is nothing to print).
+template <typename C, typename S>
+void print_quantiles(C const & container, S && selector, std::ostream & out) {
+	if (container.empty()) throw std::invalid_argument("print_quantiles: empty container");
+
+	auto sorted_container = container;
+	std::sort(sorted_container.begin(), sorted_container.end(),
+	          [&](auto const & l, auto const & r) { return selector(l) < selector(r); });
+
+	const std::size_t last = sorted_container.size() - 1;
+	const auto quantile = [&](double p) {
+		const double position = p * last;
+		const auto lower = static_cast<std::size_t>(std::floor(position));
+		// Clamp: with a single element `lower + 1` would index one past the end.
+		const std::size_t upper = lower < last ? lower + 1 : last;
+		const double lower_weight = 1 - std::fmod(position, 1);
+		return lower_weight * selector(sorted_container[lower])
+		       + (1 - lower_weight) * selector(sorted_container[upper]);
+	};
+
+	out << selector(sorted_container.front()) << '\t';
+	out << quantile(0.25) << '\t';
+	out << quantile(0.50) << '\t';
+	out << quantile(0.75) << '\t';
+	out << selector(sorted_container.back());
+}
+
+// Reads every <file> (whitespace-separated rows of five unsigned counters), accumulates each
+// run and prints, per distinct state count, min/Q1/median/Q3/max of the interpolated totals.
+int main(int argc, char * argv[]) {
+	const auto args = docopt::docopt(USAGE, {argv + 1, argv + argc}, true, __DATE__ __TIME__);
+
+	vector<future<dataset>> dataset_futures;
+	for (auto const & filename : args.at("<file>").asStringList()) {
+		dataset_futures.emplace_back(async([filename] {
+			ifstream file(filename); // read-only: fstream's in|out mode fails on unwritable files
+			if (!file) throw runtime_error("Could not open file " + filename);
+
+			dataset s;
+			datapoint p;
+			while (file >> p.states >> p.learning_queries >> p.learning_inputs >> p.testing_queries
+			       >> p.testing_inputs) {
+				s.push_back(p);
+			}
+
+			accumulate_dataset(s);
+
+			return s;
+		}));
+	}
+
+	vector<dataset> datasets;
+	clog << "datasets";
+	for (auto & f : dataset_futures) {
+		datasets.emplace_back(f.get());
+		clog << ' ' << datasets.back().size();
+		if (datasets.back().size() == 0) throw runtime_error("empty dataset");
+	}
+	clog << endl;
+
+	vector<size_t> state_values;
+	state_values.reserve(datasets[0].size());
+
+	// lazy way of doing things: collect every state count, then sort and deduplicate
+	for (auto && ds : datasets)
+		for (auto && x : ds) state_values.push_back(x.states);
+
+	sort(state_values.begin(), state_values.end());
+	state_values.erase(unique(state_values.begin(), state_values.end()), state_values.end());
+
+	// id(state_value) -> [total query size]
+	vector<vector<double>> data;
+	data.reserve(state_values.size());
+
+	// per dataset: the last row passed (current), the row being approached (next) and the end
+	struct it_pair {
+		dataset::const_iterator current, next, end;
+	};
+	vector<it_pair> iterators(datasets.size());
+	for (size_t i = 0; i < datasets.size(); ++i)
+		iterators[i] = {datasets[i].begin(), datasets[i].begin(), datasets[i].end()};
+
+	for (auto const & state : state_values) {
+		data.push_back({});
+		for (auto & it : iterators) {
+			while (it.next != it.end && it.next->states < state) {
+				it.current = it.next;
+				++it.next;
+			}
+
+			// this run has no row at or beyond `state` any more, we can skip it
+			if (it.next == it.end) continue;
+
+			// this run's current row is already past `state` (nothing to interpolate from yet)
+			if (it.current->states > state) continue;
+
+			// if we're spot on, update current
+			if (it.next->states == state) it.current = it.next;
+
+			const auto v2 = it.next->learning_queries + it.next->learning_inputs
+			                + it.next->testing_queries + it.next->testing_inputs;
+			const auto v1 = it.current->learning_queries + it.current->learning_inputs
+			                + it.current->testing_queries + it.current->testing_inputs;
+			const auto ratio
+			    = it.next->states == state
+			          ? 1.0
+			          : (state - it.current->states) / double(it.next->states - it.current->states);
+			const auto v = ratio * v2 + (1.0 - ratio) * v1;
+			data.back().push_back(v);
+		}
+	}
+
+	for (auto & v : data) sort(v.begin(), v.end());
+
+	cout << "s\tmin\tQ1\tQ2\tQ3\tmax" << endl; // one tab per column, matching the rows below
+	for (size_t i = 0; i < state_values.size(); ++i) {
+		const auto & v = data[i]; // no copy; print_quantiles takes it by const reference
+		if (v.empty()) continue;
+		cout << state_values[i] << '\t';
+		print_quantiles(v, [](auto const & x) { return x; }, cout);
+		cout << '\n';
+	}
+}