mirror of
https://github.com/Jaxan/hybrid-ads.git
synced 2025-04-27 23:17:44 +02:00
Adds a Trie data structure (to be used)
This commit is contained in:
parent
2b600cca08
commit
5f827b2c1c
3 changed files with 195 additions and 0 deletions
9
lib/trie.cpp
Normal file
9
lib/trie.cpp
Normal file
|
@ -0,0 +1,9 @@
|
|||
#include "trie.hpp"
|
||||
|
||||
// Collects every word stored in the trie \p t into a plain vector of words.
// The words are appended in the order in which trie::for_each() visits them;
// each visited word is copied, because for_each() only lends it out by
// const reference.
std::vector<std::vector<size_t>> flatten(const trie& t) {
    std::vector<std::vector<size_t>> words;

    auto collect = [&words](const std::vector<size_t>& word) {
        words.push_back(word);
    };
    t.for_each(collect);

    return words;
}
|
81
lib/trie.hpp
Normal file
81
lib/trie.hpp
Normal file
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#include <boost/optional.hpp>
|
||||
|
||||
#include <stack>
|
||||
#include <stdexcept>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
///
|
||||
/// \brief A Trie datastructure used to remove prefixes in a set of words
|
||||
///
|
||||
/// The datastructure only works for words over size_t. In principle the symbols
|
||||
/// can be unbounded, however having very large symbols degrades the performance
|
||||
/// a lot. Some random testing shows that for symbols <= 50 the performance is
|
||||
/// similar to std::set (which is solving a different problem).
|
||||
///
|
||||
/// Tests : 1M words, avg words length 4 (geometric dist.), alphabet 50 symbols
|
||||
/// trie reduction 58% in 1.15s
|
||||
/// set reduction 49% in 0.92s
|
||||
///
|
||||
/// I did not implement any iterators, as those are quite hard to get right.
|
||||
/// There are, however, "internal iterators" exposed as a for_each() member
|
||||
/// function (if only we had coroutines already...)
|
||||
///
|
||||
struct trie {
|
||||
/// \brief Inserts a word (given by iterators \p begin and \p end)
|
||||
/// \returns true if the element was inserted, false if already there
|
||||
template <typename Iterator> bool insert(Iterator&& begin, Iterator&& end) {
|
||||
if (begin == end) return false;
|
||||
|
||||
size_t i = *begin++;
|
||||
if (i >= branches.size()) branches.resize(i + 1);
|
||||
|
||||
auto& b = branches[i];
|
||||
if (b) return b->insert(begin, end);
|
||||
|
||||
b = trie();
|
||||
b->insert(begin, end);
|
||||
count++;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \brief Inserts a word given as range \p r
|
||||
/// \returns true if the element was inserted, false if already there
|
||||
template <typename Range> bool insert(Range const& r) {
|
||||
return insert(begin(r), end(r));
|
||||
}
|
||||
|
||||
/// \p function is applied to all word (not to the prefixes)
|
||||
template <typename Fun> void for_each(Fun&& function) const {
|
||||
std::vector<size_t> word;
|
||||
return for_each_impl(std::forward<Fun>(function), word);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename Fun>
|
||||
void for_each_impl(Fun&& function, std::vector<size_t>& word) const {
|
||||
if (count == 0) {
|
||||
const auto& cword = word;
|
||||
function(cword); // we don't want function to modify word
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < branches.size(); ++i) {
|
||||
auto const& b = branches[i];
|
||||
if (b) {
|
||||
word.push_back(i);
|
||||
b->for_each_impl(function, word);
|
||||
word.resize(word.size() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t count = 0;
|
||||
std::vector<boost::optional<trie>> branches;
|
||||
};
|
||||
|
||||
/// \brief Flattens a trie \p t
|
||||
/// \returns an array of words (without the prefixes)
|
||||
/// \note Defined in lib/trie.cpp; the words are collected via trie::for_each().
std::vector<std::vector<size_t>> flatten(trie const& t);
|
105
src/trie_test.cpp
Normal file
105
src/trie_test.cpp
Normal file
|
@ -0,0 +1,105 @@
|
|||
#include <trie.hpp>

#include <algorithm>
#include <chrono>
#include <iostream>
#include <numeric>
#include <random>
#include <set>
#include <stdexcept>
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
using word = vector<size_t>;
|
||||
|
||||
// Minimal assertion helper for the tests below: does nothing when \p r holds
// and throws std::runtime_error("error in trie") otherwise.
static void check(bool r) {
    if (r) return;
    throw std::runtime_error("error in trie");
}
|
||||
|
||||
static void test() {
|
||||
word w1 = {1, 2, 3};
|
||||
word w2 = {2, 3};
|
||||
word w3 = {1, 2};
|
||||
word w4 = {5, 5, 5};
|
||||
word w5 = {5, 5, 3};
|
||||
word w6 = {5, 5, 3, 1};
|
||||
|
||||
trie t;
|
||||
check(t.insert(w1));
|
||||
check(!t.insert(w1));
|
||||
check(t.insert(w2));
|
||||
check(!t.insert(w3));
|
||||
check(t.insert(w4));
|
||||
check(t.insert(w5));
|
||||
check(t.insert(w6));
|
||||
|
||||
check(flatten(t).size() == 4);
|
||||
|
||||
t.for_each([](auto&& w) {
|
||||
for (auto&& i : w) cout << i << ' ';
|
||||
cout << '\n';
|
||||
});
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
static void performance() {
|
||||
vector<word> corpus(1000000);
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
uniform_int_distribution<int> unfair_coin(0, 3);
|
||||
uniform_int_distribution<size_t> symbol(0, 50 - 1);
|
||||
|
||||
generate(begin(corpus),
|
||||
end(corpus),
|
||||
[&] {
|
||||
word w;
|
||||
while (unfair_coin(generator) || w.empty()) {
|
||||
w.push_back(symbol(generator));
|
||||
}
|
||||
return w;
|
||||
});
|
||||
|
||||
size_t size = corpus.size();
|
||||
size_t total_size
|
||||
= accumulate(begin(corpus),
|
||||
end(corpus),
|
||||
0ul,
|
||||
[](auto l, auto&& r) { return l + r.size(); });
|
||||
|
||||
cout << size << " words\n";
|
||||
cout << total_size << " symbols\n";
|
||||
cout << total_size / double(size) << " average word length\n";
|
||||
cout << endl;
|
||||
|
||||
using clock = std::chrono::high_resolution_clock;
|
||||
using time = std::chrono::time_point<clock>;
|
||||
using seconds = std::chrono::duration<double>;
|
||||
|
||||
auto t_start = clock::now();
|
||||
trie t;
|
||||
for (auto&& w : corpus) t.insert(w);
|
||||
auto t_end = clock::now();
|
||||
|
||||
auto s_start = clock::now();
|
||||
set<word> s;
|
||||
for (auto&& w : corpus) s.insert(w);
|
||||
auto s_end = clock::now();
|
||||
|
||||
size_t trie_size = flatten(t).size();
|
||||
size_t set_size = s.size();
|
||||
cout << trie_size << " words in the trie\n";
|
||||
cout << trie_size / double(size) << " ratio\n";
|
||||
cout << seconds(t_end - t_start).count() << " seconds\n";
|
||||
cout << endl;
|
||||
|
||||
cout << set_size << " words in the set\n";
|
||||
cout << set_size / double(size) << " ratio\n";
|
||||
cout << seconds(s_end - s_start).count() << " seconds\n";
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
// Entry point: runs the functional test first, then the benchmark. A failed
// check() throws and is not caught here.
int main() {
    test();
    performance();
    return 0;
}
|
Loading…
Add table
Reference in a new issue