mirror of
https://github.com/Jaxan/hybrid-ads.git
synced 2025-04-27 23:17:44 +02:00
Improved the trie data structure to be more (memory) efficient
This commit is contained in:
parent
4a5af92354
commit
44f4cb3b76
1 changed files with 66 additions and 41 deletions
103
lib/trie.hpp
103
lib/trie.hpp
|
@ -1,81 +1,106 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <boost/optional.hpp>
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
#include <stack>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
///
|
///
|
||||||
/// \brief A Trie data structure used to remove prefixes in a set of words
|
/// \brief A Trie data structure used to remove prefixes in a set of words.
|
||||||
///
|
/// Insert-only. Iteration over the structure only uses longest matches.
|
||||||
/// The datastructure only works for words over integral unsigned types. In principle the symbols
|
|
||||||
/// can be unbounded, however having very large symbols degrades the performance a lot. Some random
|
|
||||||
/// testing shows that for symbols <= 50 the performance is similar to std::set (which is solving a
|
|
||||||
/// different problem).
|
|
||||||
///
|
///
|
||||||
/// Tests : 1M words, avg words length 4 (geometric dist.), alphabet 50 symbols
|
/// Tests : 1M words, avg words length 4 (geometric dist.), alphabet 50 symbols
|
||||||
/// trie reduction 58% in 1.15s
|
/// trie reduction 58% in 0.4s
|
||||||
/// set reduction 49% in 0.92s
|
/// set reduction 49% in 1.1s
|
||||||
///
|
///
|
||||||
/// I did not implement any iterators, as those are quite hard to get right.
|
/// I did not implement any iterators, as those are quite hard to get right.
|
||||||
/// There are, however, "internal iterators" exposed as a for_each() member
|
/// There are, however, "internal iterators" exposed as a for_each() member
|
||||||
/// function (if only we had coroutines already...)
|
/// function (if only we had coroutines already...)
|
||||||
///
|
///
|
||||||
|
/// TODO: implement `bool member(...)`
|
||||||
|
///
|
||||||
template <typename T> struct trie {
|
template <typename T> struct trie {
|
||||||
static_assert(std::is_integral<T>::value && std::is_unsigned<T>::value, "");
|
|
||||||
|
|
||||||
/// \brief Inserts a word (given by iterators \p begin and \p end)
|
/// \brief Inserts a word (given by iterators \p begin and \p end)
|
||||||
/// \returns true if the element was inserted, false if already there
|
/// \returns true if the element was inserted, false if already there
|
||||||
template <typename Iterator> bool insert(Iterator && begin, Iterator && end) {
|
template <typename Iterator> bool insert(Iterator && begin, Iterator && end) {
|
||||||
if (begin == end) return false;
|
if (!node) {
|
||||||
|
node.reset(new trie_node());
|
||||||
|
|
||||||
size_t i = *begin++;
|
if (begin == end) {
|
||||||
if (i >= branches.size()) branches.resize(i + 1);
|
|
||||||
|
|
||||||
auto & b = branches[i];
|
|
||||||
if (b) return b->insert(begin, end);
|
|
||||||
|
|
||||||
b = trie();
|
|
||||||
b->insert(begin, end);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return node->insert(begin, end);
|
||||||
|
}
|
||||||
|
|
||||||
/// \brief Inserts a word given as range \p r
|
/// \brief Inserts a word given as range \p r
|
||||||
/// \returns true if the element was inserted, false if already there
|
/// \returns true if the element was inserted, false if already there
|
||||||
template <typename Range> bool insert(Range const & r) { return insert(begin(r), end(r)); }
|
template <typename Range> bool insert(Range const & r) { return insert(begin(r), end(r)); }
|
||||||
|
|
||||||
/// \brief Applies \p function to all words (not to the prefixes)
|
/// \brief Applies \p function to all words (not to the prefixes)
|
||||||
|
template <typename Fun> void for_each(Fun && function) const {
|
||||||
|
if (node) {
|
||||||
|
node->for_each(std::forward<Fun>(function));
|
||||||
|
} else {
|
||||||
|
// empty set, so we don't call the function
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// \brief Empties the complete set
|
||||||
|
void clear() { node.reset(nullptr); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct trie_node;
|
||||||
|
std::unique_ptr<trie_node> node = nullptr;
|
||||||
|
|
||||||
|
// A node always contains the empty word
|
||||||
|
struct trie_node {
|
||||||
|
template <typename Iterator> bool insert(Iterator && begin, Iterator && end) {
|
||||||
|
if (begin == end) return false;
|
||||||
|
|
||||||
|
T i = *begin++;
|
||||||
|
auto it = find(i);
|
||||||
|
|
||||||
|
if (it != data.end() && it->first == i) {
|
||||||
|
return it->second.insert(begin, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
// else, does not yet exist
|
||||||
|
it = data.emplace(it, i, trie_node());
|
||||||
|
it->second.insert(begin, end);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Fun> void for_each(Fun && function) const {
|
template <typename Fun> void for_each(Fun && function) const {
|
||||||
std::vector<T> word;
|
std::vector<T> word;
|
||||||
return for_each_impl(std::forward<Fun>(function), word);
|
return for_each_impl(std::forward<Fun>(function), word);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// \brief Empties the complete set
|
|
||||||
void clear() { branches.clear(); }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <typename Fun> void for_each_impl(Fun && function, std::vector<T> & word) const {
|
template <typename Fun> void for_each_impl(Fun && function, std::vector<T> & word) const {
|
||||||
size_t count = 0;
|
if (data.empty()) {
|
||||||
for (T i = 0; i < branches.size(); ++i) {
|
// we don't want function to modify word
|
||||||
auto const & b = branches[i];
|
const auto & cword = word;
|
||||||
if (b) {
|
function(cword);
|
||||||
++count;
|
}
|
||||||
word.push_back(i);
|
|
||||||
b->for_each_impl(function, word);
|
for (auto const & kv : data) {
|
||||||
|
// for each letter, we extend the word, recurse and remove extension.
|
||||||
|
word.push_back(kv.first);
|
||||||
|
kv.second.for_each_impl(function, word);
|
||||||
word.resize(word.size() - 1);
|
word.resize(word.size() - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (count == 0) {
|
typename std::vector<std::pair<T, trie_node>>::iterator find(T const & key) {
|
||||||
const auto & cword = word;
|
return std::lower_bound(
|
||||||
function(cword); // we don't want function to modify word
|
data.begin(), data.end(), key,
|
||||||
return;
|
[](std::pair<T, trie_node> const & kv, T const & k) { return kv.first < k; });
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<boost::optional<trie>> branches;
|
std::vector<std::pair<T, trie_node>> data;
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
/// \brief Flattens a trie \p t
|
/// \brief Flattens a trie \p t
|
||||||
|
|
Loading…
Add table
Reference in a new issue