1
Fork 0
mirror of https://github.com/Jaxan/hybrid-ads.git synced 2025-04-27 23:17:44 +02:00

Improved the trie data structure to be more (memory) efficient

This commit is contained in:
Joshua Moerman 2017-07-28 14:48:14 +01:00
parent 4a5af92354
commit 44f4cb3b76

View file

@ -1,45 +1,37 @@
#pragma once #pragma once
#include <boost/optional.hpp> #include <algorithm>
#include <memory>
#include <stack>
#include <stdexcept>
#include <utility> #include <utility>
#include <vector> #include <vector>
/// ///
/// \brief A Trie datastructure used to remove prefixes in a set of words /// \brief A Trie datastructure used to remove prefixes in a set of words.
/// /// Insert-only. Iteration over the structure only uses longest matches.
/// The datastructure only works for words over integral unsigned types. In principle the symbols
/// can be unbounded, however having very large symbols degrades the performance a lot. Some random
/// testing shows that for symbols <= 50 the performance is similar to std::set (which is solving a
/// different problem).
/// ///
/// Tests : 1M words, avg words length 4 (geometric dist.), alphabet 50 symbols /// Tests : 1M words, avg words length 4 (geometric dist.), alphabet 50 symbols
/// trie reduction 58% in 1.15s /// trie reduction 58% in 0.4s
/// set reduction 49% in 0.92s /// set reduction 49% in 1.1s
/// ///
/// I did not implement any iterators, as those are quite hard to get right. /// I did not implement any iterators, as those are quite hard to get right.
/// There are, however, "internal iterators" exposed as a for_each() member /// There are, however, "internal iterators" exposed as a for_each() member
/// function (if only we had coroutines already...) /// function (if only we had coroutines already...)
/// ///
/// TODO: implement `bool member(...)`
///
template <typename T> struct trie { template <typename T> struct trie {
static_assert(std::is_integral<T>::value && std::is_unsigned<T>::value, "");
/// \brief Inserts a word (given by iterators \p begin and \p end) /// \brief Inserts a word (given by iterators \p begin and \p end)
/// \returns true if the element was inserted, false if already there /// \returns true if the element was inserted, false if already there
template <typename Iterator> bool insert(Iterator && begin, Iterator && end) { template <typename Iterator> bool insert(Iterator && begin, Iterator && end) {
if (begin == end) return false; if (!node) {
node.reset(new trie_node());
size_t i = *begin++; if (begin == end) {
if (i >= branches.size()) branches.resize(i + 1); return true;
}
}
auto & b = branches[i]; return node->insert(begin, end);
if (b) return b->insert(begin, end);
b = trie();
b->insert(begin, end);
return true;
} }
/// \brief Inserts a word given as range \p r /// \brief Inserts a word given as range \p r
@ -48,34 +40,67 @@ template <typename T> struct trie {
/// \brief Applies \p function to all word (not to the prefixes) /// \brief Applies \p function to all word (not to the prefixes)
template <typename Fun> void for_each(Fun && function) const { template <typename Fun> void for_each(Fun && function) const {
std::vector<T> word; if (node) {
return for_each_impl(std::forward<Fun>(function), word); node->for_each(std::forward<Fun>(function));
} else {
// empty set, so we don't call the function
}
} }
/// \brief Empties the complete set /// \brief Empties the complete set
void clear() { branches.clear(); } void clear() { node.reset(nullptr); }
private: private:
template <typename Fun> void for_each_impl(Fun && function, std::vector<T> & word) const { struct trie_node;
size_t count = 0; std::unique_ptr<trie_node> node = nullptr;
for (T i = 0; i < branches.size(); ++i) {
auto const & b = branches[i]; // A node always contains the empty word
if (b) { struct trie_node {
++count; template <typename Iterator> bool insert(Iterator && begin, Iterator && end) {
word.push_back(i); if (begin == end) return false;
b->for_each_impl(function, word);
T i = *begin++;
auto it = find(i);
if (it != data.end() && it->first == i) {
return it->second.insert(begin, end);
}
// else, does not yet exist
it = data.emplace(it, i, trie_node());
it->second.insert(begin, end);
return true;
}
template <typename Fun> void for_each(Fun && function) const {
std::vector<T> word;
return for_each_impl(std::forward<Fun>(function), word);
}
private:
template <typename Fun> void for_each_impl(Fun && function, std::vector<T> & word) const {
if (data.empty()) {
// we don't want function to modify word
const auto & cword = word;
function(cword);
}
for (auto const & kv : data) {
// for each letter, we extend the word, recurse and remove extension.
word.push_back(kv.first);
kv.second.for_each_impl(function, word);
word.resize(word.size() - 1); word.resize(word.size() - 1);
} }
} }
if (count == 0) { typename std::vector<std::pair<T, trie_node>>::iterator find(T const & key) {
const auto & cword = word; return std::lower_bound(
function(cword); // we don't want function to modify word data.begin(), data.end(), key,
return; [](std::pair<T, trie_node> const & kv, T const & k) { return kv.first < k; });
} }
}
std::vector<boost::optional<trie>> branches; std::vector<std::pair<T, trie_node>> data;
};
}; };
/// \brief Flattens a trie \p t /// \brief Flattens a trie \p t
@ -90,7 +115,7 @@ template <typename T> std::vector<std::vector<T>> flatten(trie<T> const & t) {
template <typename T> std::pair<size_t, size_t> total_size(trie<T> const & t) { template <typename T> std::pair<size_t, size_t> total_size(trie<T> const & t) {
size_t count = 0; size_t count = 0;
size_t total_count = 0; size_t total_count = 0;
t.for_each([&count, &total_count](std::vector<T> const & w) { t.for_each([&count, &total_count](std::vector<T> const & w) {
++count; ++count;
total_count += w.size(); total_count += w.size();
}); });