From 5f827b2c1cf5c66c855636c8b7e9d9d16f35925b Mon Sep 17 00:00:00 2001
From: Joshua Moerman <lakseru@gmail.com>
Date: Tue, 14 Apr 2015 11:55:20 +0200
Subject: [PATCH] Adds a Trie data structure (to be used)

---
 lib/trie.cpp      |   9 ++++
 lib/trie.hpp      |  81 +++++++++++++++++++++++++++++++++++
 src/trie_test.cpp | 105 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 lib/trie.cpp
 create mode 100644 lib/trie.hpp
 create mode 100644 src/trie_test.cpp
diff --git a/lib/trie.cpp b/lib/trie.cpp
new file mode 100644
index 0000000..7c2b07c
--- /dev/null
+++ b/lib/trie.cpp
@@ -0,0 +1,9 @@
+#include "trie.hpp"
+
+// Gathers the words of \p t by copying each word that for_each() reports,
+// in visiting order, into the returned vector.
+std::vector<std::vector<size_t>> flatten(const trie& t) {
+	std::vector<std::vector<size_t>> words;
+	t.for_each([&words](std::vector<size_t> const& w) { words.emplace_back(w); });
+	return words;
+}
diff --git a/lib/trie.hpp b/lib/trie.hpp
new file mode 100644
index 0000000..79dead3
--- /dev/null
+++ b/lib/trie.hpp
@@ -0,0 +1,81 @@
+#pragma once
+
+#include <boost/optional.hpp>
+
+#include <stack>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+///
+/// \brief A Trie data structure used to remove prefixes in a set of words
+///
+/// The data structure only works for words over size_t. In principle the symbols
+/// can be unbounded, however having very large symbols degrades the performance
+/// a lot. Some random testing shows that for symbols <= 50 the performance is
+/// similar to std::set (which is solving a different problem).
+///
+/// Tests : 1M words, avg words length 4 (geometric dist.), alphabet 50 symbols
+/// trie reduction 58% in 1.15s
+/// set  reduction 49% in 0.92s
+///
+/// I did not implement any iterators, as those are quite hard to get right.
+/// There are, however, "internal iterators" exposed as a for_each() member
+/// function (if only we had coroutines already...)
+///
+struct trie {
+	/// \brief Inserts a word (given by iterators \p begin and \p end)
+	/// \returns true if a new branch was created; false if the word is empty,
+	/// already stored, or a prefix of a stored word
+	template <typename Iterator> bool insert(Iterator begin, Iterator end) {
+		if (begin == end) return false;
+
+		size_t const i = *begin++; // by-value iterator: the caller's is untouched
+		if (i >= branches.size()) branches.resize(i + 1);
+
+		auto& b = branches[i];
+		if (b) return b->insert(begin, end);
+		b = trie();
+		count++; // counted before recursing, so a throw below keeps count exact
+		b->insert(begin, end);
+		return true;
+	}
+
+	/// \brief Inserts a word given as range \p r
+	/// \returns true if the element was inserted, false if already there
+	template <typename Range> bool insert(Range const& r) {
+		using std::begin; // lets built-in arrays work; ADL still finds others
+		using std::end;
+		return insert(begin(r), end(r));
+	}
+
+	/// \p function is applied to all words (not to the prefixes)
+	template <typename Fun> void for_each(Fun&& function) const {
+		std::vector<size_t> word;
+		for_each_impl(std::forward<Fun>(function), word);
+	}
+
+	private:
+	template <typename Fun>
+	void for_each_impl(Fun&& function, std::vector<size_t>& word) const {
+		if (count == 0) {
+			const auto& cword = word;
+			function(cword); // we don't want function to modify word
+			return;
+		}
+		for (size_t i = 0; i < branches.size(); ++i) {
+			auto const& b = branches[i];
+			if (!b) continue;
+			word.push_back(i); // extend the path, visit the subtree, restore
+			b->for_each_impl(function, word);
+			word.pop_back();
+		}
+	}
+
+	size_t count = 0; ///< number of engaged branches; 0 marks a leaf
+	std::vector<boost::optional<trie>> branches; ///< branches[i] = child for symbol i
+};
+
+/// \brief Flattens a trie \p t by copying every word for_each() reports
+/// \returns an array of words (without the prefixes)
+std::vector<std::vector<size_t>> flatten(trie const& t);
diff --git a/src/trie_test.cpp b/src/trie_test.cpp
new file mode 100644
index 0000000..9819602
--- /dev/null
+++ b/src/trie_test.cpp
@@ -0,0 +1,105 @@
+#include <trie.hpp>
+
+#include <algorithm>
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <set>
+#include <stdexcept>
+#include <vector>
+
+using namespace std;
+
+using word = vector<size_t>; // a word is a sequence of size_t symbols
+
+// Minimal assertion helper for the tests in this file: does nothing when
+// \p r is true and throws runtime_error("error in trie") otherwise.
+static void check(bool r) { if (!r) throw runtime_error("error in trie"); }
+
+static void test() {
+	// Words in insertion order; is_new is the result insert() has to report.
+	struct step { word input; bool is_new; };
+	const vector<step> steps = {
+	    {{1, 2, 3}, true},
+	    {{1, 2, 3}, false},   // exact duplicate
+	    {{2, 3}, true},
+	    {{1, 2}, false},      // prefix of {1, 2, 3}
+	    {{5, 5, 5}, true},
+	    {{5, 5, 3}, true},
+	    {{5, 5, 3, 1}, true}, // extends {5, 5, 3}, which is a prefix from now on
+	};
+
+	trie t;
+	for (const auto& s : steps) check(t.insert(s.input) == s.is_new);
+
+	// flatten() reports {1,2,3}, {2,3}, {5,5,5} and {5,5,3,1}
+	check(flatten(t).size() == 4);
+
+	t.for_each([](const word& w) {
+		for (const size_t symbol : w) cout << symbol << ' ';
+		cout << '\n';
+	});
+	cout << endl;
+}
+
+// Fills a trie and a std::set from the same random corpus, timing each, and
+// prints the number of words either container ends up with.
+static void performance() {
+	vector<word> corpus(1000000);
+
+	std::random_device rd;
+	std::mt19937 generator(rd());
+	uniform_int_distribution<int> unfair_coin(0, 3);
+	uniform_int_distribution<size_t> symbol(0, 50 - 1);
+
+	// Every word gets at least one symbol; the coin then stops it 1 time in 4
+	generate(begin(corpus),
+	         end(corpus),
+	         [&] {
+		         word w;
+		         while (unfair_coin(generator) || w.empty()) {
+			         w.push_back(symbol(generator));
+		         }
+		         return w;
+		     });
+
+	size_t size = corpus.size();
+	size_t total_size = 0; // plain size_t sum: no <numeric>, no narrower 0ul
+	for (auto const& w : corpus) total_size += w.size();
+
+	cout << size << " words\n";
+	cout << total_size << " symbols\n";
+	cout << total_size / double(size) << " average word length\n";
+	cout << endl;
+
+	using clock = std::chrono::high_resolution_clock;
+	using seconds = std::chrono::duration<double>;
+
+	auto t_start = clock::now();
+	trie t;
+	for (auto&& w : corpus) t.insert(w);
+	auto t_end = clock::now();
+
+	auto s_start = clock::now();
+	set<word> s;
+	for (auto&& w : corpus) s.insert(w);
+	auto s_end = clock::now();
+
+	// Counting the stored words is done outside both timed regions
+	size_t trie_size = flatten(t).size();
+	size_t set_size = s.size();
+	cout << trie_size << " words in the trie\n";
+	cout << trie_size / double(size) << " ratio\n";
+	cout << seconds(t_end - t_start).count() << " seconds\n";
+	cout << endl;
+
+	cout << set_size << " words in the set\n";
+	cout << set_size / double(size) << " ratio\n";
+	cout << seconds(s_end - s_start).count() << " seconds\n";
+	cout << endl;
+}
+
+// Entry point: runs the functional test, then the trie-vs-set benchmark.
+// A failed check() throws and nothing here catches it, so the run ends there.
+// Falling off the end of main is equivalent to returning 0.
+int main() { test(); performance(); }