From a70a2f6ac77e052ca6c2d875541e13c4bbc8cda1 Mon Sep 17 00:00:00 2001
From: Joshua Moerman
Date: Fri, 18 Sep 2015 13:17:35 +0200
Subject: [PATCH] Some script I use to make statistical learning graphs

---
 src/learning_graph.cpp | 191 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 src/learning_graph.cpp

diff --git a/src/learning_graph.cpp b/src/learning_graph.cpp
new file mode 100644
index 0000000..3afa768
--- /dev/null
+++ b/src/learning_graph.cpp
@@ -0,0 +1,191 @@
+// NOTE(review): the header names in this include list were lost; they are
+// restored from what the code below uses -- confirm against the build.
+#include <docopt.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <fstream>
+#include <future>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+static const char USAGE[] =
+    R"(Generate a statistical learning graph from multiple runs
+
+    Usage:
+      learning_graph <file> ...
+
+    Options:
+      -h, --help                       Show this screen
+      --version                        Show version
+)";
+
+// One whitespace-separated record of an input file: a state count followed by
+// four query/input counters.
+struct datapoint {
+    uint64_t states;
+    uint64_t learning_queries;
+    uint64_t learning_inputs;
+    uint64_t testing_queries;
+    uint64_t testing_inputs;
+};
+
+using dataset = vector<datapoint>;
+
+// Turns the four counters of every record into running totals, in place;
+// `states` is left untouched.
+static void accumulate_dataset(dataset & ds) {
+    // Start at 1 so an empty dataset never evaluates `size() - 1`, which wraps
+    // around for an unsigned size and would index far out of bounds.
+    for (size_t i = 1; i < ds.size(); ++i) {
+        ds[i].learning_queries += ds[i - 1].learning_queries;
+        ds[i].learning_inputs += ds[i - 1].learning_inputs;
+        ds[i].testing_queries += ds[i - 1].testing_queries;
+        ds[i].testing_inputs += ds[i - 1].testing_inputs;
+    }
+}
+
+// Writes min, Q1, median, Q3 and max of `selector(x)` over `container` to
+// `out`, tab-separated and without a trailing newline. The quartiles are
+// linearly interpolated between neighbouring sorted values. Writes nothing
+// for an empty container.
+template <typename C, typename S>
+void print_quantiles(C const & container, S && selector, ostream & out) {
+    if (container.empty()) return;
+
+    auto sorted_container = container;
+    sort(sorted_container.begin(), sorted_container.end(),
+         [&](auto const & l, auto const & r) { return selector(l) < selector(r); });
+
+    const size_t last = sorted_container.size() - 1;
+
+    const auto quantile = [&](double p) -> double {
+        const double position = p * last;
+        const auto lower = static_cast<size_t>(floor(position));
+        // Clamp the upper neighbour: for a single element `lower + 1` would read
+        // past the end. Its weight is 0 there, so the result is unchanged.
+        const auto upper = min(lower + 1, last);
+        const double lower_weight = 1 - fmod(position, 1);
+        return lower_weight * selector(sorted_container[lower])
+               + (1 - lower_weight) * selector(sorted_container[upper]);
+    };
+
+    out << selector(sorted_container.front()) << '\t';
+    out << quantile(0.25) << '\t';
+    out << quantile(0.50) << '\t';
+    out << quantile(0.75) << '\t';
+    out << selector(sorted_container.back());
+}
+
+// Reads one run per <file> argument, then prints for every state count seen in
+// any run the five-number summary of the (interpolated) accumulated totals.
+int main(int argc, char * argv[]) {
+    const auto args = docopt::docopt(USAGE, {argv + 1, argv + argc}, true, __DATE__ __TIME__);
+
+    // Each input file is parsed in its own std::async task.
+    vector<future<dataset>> dataset_futures;
+    for (auto const & filename : args.at("<file>").asStringList()) {
+        dataset_futures.emplace_back(async([filename] {
+            // Input-only stream: a plain fstream also requests write access and
+            // fails on read-only files.
+            ifstream file(filename);
+            if (!file) throw runtime_error("Could not open file " + filename);
+
+            dataset s;
+            datapoint p;
+            while (file >> p.states >> p.learning_queries >> p.learning_inputs >> p.testing_queries
+                   >> p.testing_inputs) {
+                s.push_back(p);
+            }
+
+            accumulate_dataset(s);
+
+            return s;
+        }));
+    }
+
+    vector<dataset> datasets;
+    clog << "datasets";
+    for (auto & f : dataset_futures) {
+        datasets.emplace_back(f.get());
+        clog << ' ' << datasets.back().size();
+        if (datasets.back().size() == 0) throw runtime_error("empty dataset");
+    }
+    clog << endl;
+
+    vector<uint64_t> state_values;
+    if (!datasets.empty()) state_values.reserve(datasets.front().size());
+
+    // lazy way of doing things
+    for (auto && ds : datasets)
+        for (auto && x : ds) state_values.push_back(x.states);
+
+    sort(state_values.begin(), state_values.end());
+    state_values.erase(unique(state_values.begin(), state_values.end()), state_values.end());
+
+    // id(state_value) -> [total query size]
+    vector<vector<double>> data;
+    data.reserve(state_values.size());
+
+    // we keep track of the current timestamp for the different datasets
+    struct it_pair {
+        dataset::const_iterator current, next, end;
+    };
+    vector<it_pair> iterators(datasets.size());
+    for (size_t i = 0; i < datasets.size(); ++i)
+        iterators[i] = {datasets[i].begin(), datasets[i].begin(), datasets[i].end()};
+
+    // sum of all four (accumulated) counters of one record
+    const auto total = [](datapoint const & p) {
+        return p.learning_queries + p.learning_inputs + p.testing_queries + p.testing_inputs;
+    };
+
+    for (auto const & state : state_values) {
+        data.emplace_back();
+        for (auto & it : iterators) {
+            while (it.next != it.end && it.next->states < state) {
+                it.current = it.next;
+                ++it.next;
+            }
+
+            // one run stopped prior to the others, we can skip it
+            if (it.next == it.end) continue;
+
+            // this run has no record at or below `state` yet, we can skip it
+            if (it.current->states > state) continue;
+
+            // if we're spot on, update current
+            if (it.next->states == state) it.current = it.next;
+
+            const auto v2 = total(*it.next);
+            const auto v1 = total(*it.current);
+            const auto ratio
+                = it.next->states == state
+                      ? 1.0
+                      : (state - it.current->states) / double(it.next->states - it.current->states);
+            const auto v = ratio * v2 + (1.0 - ratio) * v1;
+            data.back().push_back(v);
+        }
+    }
+
+    for (auto & v : data) {
+        sort(v.begin(), v.end());
+    }
+
+    // NOTE(review): the header has a doubled tab between Q1 and Q2 while the rows
+    // below use single tabs -- left as is, confirm what consumers expect.
+    cout << "s\tmin\tQ1\t\tQ2\tQ3\tmax" << endl;
+    for (size_t i = 0; i < state_values.size(); ++i) {
+        auto const & v = data[i];
+        if (v.empty()) continue;
+        cout << state_values[i] << '\t';
+        print_quantiles(v, [](auto const & x) { return x; }, cout);
+        cout << endl;
+    }
+}