From 5f827b2c1cf5c66c855636c8b7e9d9d16f35925b Mon Sep 17 00:00:00 2001 From: Joshua Moerman Date: Tue, 14 Apr 2015 11:55:20 +0200 Subject: [PATCH] Adds a Trie datastructure (to be used) --- lib/trie.cpp | 9 ++++ lib/trie.hpp | 81 +++++++++++++++++++++++++++++++++++ src/trie_test.cpp | 105 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 lib/trie.cpp create mode 100644 lib/trie.hpp create mode 100644 src/trie_test.cpp diff --git a/lib/trie.cpp b/lib/trie.cpp new file mode 100644 index 0000000..7c2b07c --- /dev/null +++ b/lib/trie.cpp @@ -0,0 +1,9 @@ +#include "trie.hpp" + +std::vector> flatten(const trie& t) { + std::vector> ret; + + t.for_each([&ret](auto&& w) { ret.push_back(w); }); + + return ret; +} diff --git a/lib/trie.hpp b/lib/trie.hpp new file mode 100644 index 0000000..79dead3 --- /dev/null +++ b/lib/trie.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include + +#include +#include +#include +#include + +/// +/// \brief A Trie datastructure used to remove prefixes in a set of words +/// +/// The datastructure only works for words over size_t. In principle the symbols +/// can be unbounded, however having very large symbols degrades the performance +/// a lot. Some random testing shows that for symbols <= 50 the performance is +/// similar to std::set (which is solving a different problem). +/// +/// Tests : 1M words, avg word length 4 (geometric dist.), alphabet 50 symbols +/// trie reduction 58% in 1.15s +/// set reduction 49% in 0.92s +/// +/// I did not implement any iterators, as those are quite hard to get right. +/// There are, however, "internal iterators" exposed as a for_each() member +/// function (if only we had coroutines already...) 
+/// +struct trie { + /// \brief Inserts a word (given by iterators \p begin and \p end) + /// \returns true if the element was inserted, false if already there (also when it is only a prefix of a stored word) + template bool insert(Iterator&& begin, Iterator&& end) { + if (begin == end) return false; + + size_t i = *begin++; + if (i >= branches.size()) branches.resize(i + 1); + + auto& b = branches[i]; + if (b) return b->insert(begin, end); + + b = trie(); + b->insert(begin, end); + count++; + return true; + } + + /// \brief Inserts a word given as range \p r + /// \returns true if the element was inserted, false if already there (also when it is only a prefix of a stored word) + template bool insert(Range const& r) { + return insert(begin(r), end(r)); + } + + /// \p function is applied to all words (not to the prefixes) + template void for_each(Fun&& function) const { + std::vector word; + return for_each_impl(std::forward(function), word); + } + + private: + template + void for_each_impl(Fun&& function, std::vector& word) const { + if (count == 0) { + const auto& cword = word; + function(cword); // we don't want function to modify word + return; + } + + for (size_t i = 0; i < branches.size(); ++i) { + auto const& b = branches[i]; + if (b) { + word.push_back(i); + b->for_each_impl(function, word); + word.resize(word.size() - 1); + } + } + } + + size_t count = 0; + std::vector> branches; +}; + +/// \brief Flattens a trie \p t +/// \returns an array of words (without the prefixes) +std::vector> flatten(trie const& t); diff --git a/src/trie_test.cpp b/src/trie_test.cpp new file mode 100644 index 0000000..9819602 --- /dev/null +++ b/src/trie_test.cpp @@ -0,0 +1,105 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +using word = vector; + +static void check(bool r) { + if (!r) throw runtime_error("error in trie"); +} + +static void test() { + word w1 = {1, 2, 3}; + word w2 = {2, 3}; + word w3 = {1, 2}; + word w4 = {5, 5, 5}; + word w5 = {5, 5, 3}; + word w6 = {5, 5, 3, 1}; + + trie t; + 
check(t.insert(w1)); + check(!t.insert(w1)); + check(t.insert(w2)); + check(!t.insert(w3)); + check(t.insert(w4)); + check(t.insert(w5)); + check(t.insert(w6)); + + check(flatten(t).size() == 4); + + t.for_each([](auto&& w) { + for (auto&& i : w) cout << i << ' '; + cout << '\n'; + }); + cout << endl; +} + +static void performance() { + vector corpus(1000000); + + std::random_device rd; + std::mt19937 generator(rd()); + uniform_int_distribution unfair_coin(0, 3); + uniform_int_distribution symbol(0, 50 - 1); + + generate(begin(corpus), + end(corpus), + [&] { + word w; + while (unfair_coin(generator) || w.empty()) { + w.push_back(symbol(generator)); + } + return w; + }); + + size_t size = corpus.size(); + size_t total_size + = accumulate(begin(corpus), + end(corpus), + 0ul, + [](auto l, auto&& r) { return l + r.size(); }); + + cout << size << " words\n"; + cout << total_size << " symbols\n"; + cout << total_size / double(size) << " average word length\n"; + cout << endl; + + using clock = std::chrono::high_resolution_clock; + using time = std::chrono::time_point; + using seconds = std::chrono::duration; + + auto t_start = clock::now(); + trie t; + for (auto&& w : corpus) t.insert(w); + auto t_end = clock::now(); + + auto s_start = clock::now(); + set s; + for (auto&& w : corpus) s.insert(w); + auto s_end = clock::now(); + + size_t trie_size = flatten(t).size(); + size_t set_size = s.size(); + cout << trie_size << " words in the trie\n"; + cout << trie_size / double(size) << " ratio\n"; + cout << seconds(t_end - t_start).count() << " seconds\n"; + cout << endl; + + cout << set_size << " words in the set\n"; + cout << set_size / double(size) << " ratio\n"; + cout << seconds(s_end - s_start).count() << " seconds\n"; + cout << endl; +} + +int main() { + test(); + performance(); +}