Joshua Moerman
11 years ago
commit
fcf5765cb5
15 changed files with 601 additions and 0 deletions
@ -0,0 +1,4 @@ |
|||
*.user |
|||
build-* |
|||
.DS_Store |
|||
|
@ -0,0 +1,14 @@ |
|||
# Top-level build script for the Genetics project.
#
# cmake_minimum_required() has to come before project(): it establishes the
# policy settings and version check that project() relies on.
cmake_minimum_required(VERSION 2.8)
project(Genetics)

# Project headers live in include/; the sources use C++14 features.
include_directories("${PROJECT_SOURCE_DIR}/include/")
add_definitions(-std=c++1y)

# Boost is a hard requirement; its libraries are collected in ${libs} so the
# subdirectories can link against them.
find_package(Boost REQUIRED COMPONENTS filesystem program_options serialization system)
include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
set(libs ${libs} ${Boost_LIBRARIES})

# Put all executables directly in the build directory.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

add_subdirectory("include")
add_subdirectory("lib")
add_subdirectory("src")
@ -0,0 +1,38 @@ |
|||
# Genetic Sequence Formulizer |
|||
This program takes a sequence entered by the user and tries to find a formula that produces it. I made this to play around with genetic programming. It was inspired by the following two talks: |
|||
|
|||
* [Reducing Wasted Evaluations in Cartesian Genetic Programming, by Brian Goldman](https://www.youtube.com/watch?v=3HsgVHv1ho8) |
|||
* [Automatic Algorithm Invention with a GPU, by Wes Faler](https://www.youtube.com/watch?v=xQDazGrKsuM) |
|||
|
|||
I used a technique similar to the one described in the first presentation (the one called *single*), as it seemed to be the easiest one. The nice thing is that there is always just one parent, but there are inactive genes which are allowed to mutate freely. |
|||
|
|||
## Examples |
|||
Some actual output of the program (truncated): |
|||
|
|||
$ ./main 0 0 0 0 0 0 0 |
|||
your input: 0, 0, 0, 0, 0, 0, 0, |
|||
formula: (x-x)*x |
|||
continuation: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
|||
|
|||
$ ./main 1 2 3 |
|||
your input: 1, 2, 3, |
|||
formula: ((x-18)/(x-18))+x |
|||
continuation: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ... |
|||
|
|||
$ ./main 1 2 4 8 |
|||
your input: 1, 2, 4, 8, |
|||
formula: 2^x |
|||
continuation: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, ... |
|||
|
|||
$ ./main 37 |
|||
your input: 37, |
|||
formula: (10+12)+15 |
|||
continuation: 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, |
|||
|
|||
## Notes |
|||
|
|||
* The algorithm does not yet simplify the formula (I should take this into account when scoring the genes). |
|||
* The sequences are 0-based. |
|||
* The constants used in the genes are bounded by 20 in absolute value (so in the last example, it needed to add multiple constants). |
|||
* It treats ```1/0``` as ```INT_MAX``` (we need some safety as we evaluate randomly generated expressions). |
|||
* I may be using some C++14 features. Install a recent clang or gcc to get this awesomeness. |
@ -0,0 +1,4 @@ |
|||
|
|||
# Hack to show headers in Qt Creator |
|||
file(GLOB headers "*.hpp") |
|||
add_library(dummy ${headers} dummy.cpp) |
@ -0,0 +1,2 @@ |
|||
// Needed for the hack to show headers in Qt Creator
|
|||
static int x = x; |
@ -0,0 +1,21 @@ |
|||
#pragma once |
|||
|
|||
#include "genome.hpp" |
|||
#include <vector> |
|||
|
|||
using Score = double; |
|||
|
|||
struct Evolver{ |
|||
// stores the genome with its score
|
|||
// first element is the best one (after calling next_generation()).
|
|||
std::vector<std::pair<Genome, Score>> current_generation; |
|||
|
|||
// goal which we are trying to achieve
|
|||
std::vector<int> goal; |
|||
|
|||
// evaluates current generation, picks the best one, and generate new generation
|
|||
void next_generation(); |
|||
}; |
|||
|
|||
// Create a random generation to start with
|
|||
Evolver create_evolver(size_t population_size, size_t genome_size); |
@ -0,0 +1,61 @@ |
|||
#pragma once |
|||
|
|||
#include <cstddef>
#include <iosfwd>
#include <string>
#include <vector>
|||
|
|||
// TODO: make this a decent type, and also make the variable "x" a decent type
|
|||
using Node = int; |
|||
|
|||
struct Value { |
|||
bool is_node; |
|||
union { |
|||
int constant; |
|||
Node node; // -1 reserved for "x"
|
|||
} value; |
|||
}; |
|||
|
|||
struct Gen { |
|||
enum Type { |
|||
kAdd, |
|||
kSub, |
|||
kMult, |
|||
kDiv, |
|||
kPow, |
|||
kTypes |
|||
} type; |
|||
Value x, y; |
|||
}; |
|||
|
|||
struct Genome { |
|||
std::vector<Gen> genes; |
|||
std::vector<bool> active_genes; |
|||
Node output; |
|||
|
|||
// calculates which genes are active (used internally)
|
|||
void calculate_active_genes(); |
|||
|
|||
// evaluates expression with input x
|
|||
int evaluate_on(int x) const; |
|||
|
|||
// mutate a single bit of some gene, returns true when a active gene was altered
|
|||
bool mutate_random_bit(); |
|||
|
|||
// checks whether the invariant still holds (for debugging)
|
|||
bool invariant() const; |
|||
|
|||
// returns a string in a (more or less) readable format
|
|||
std::string as_formula() const; |
|||
|
|||
// some trivial getters (to silence warnings)
|
|||
Gen & gene(int n) { return genes[static_cast<size_t>(n)]; } |
|||
Gen const & gene(int n) const { return genes[static_cast<size_t>(n)]; } |
|||
bool is_active(int n) const { return active_genes[static_cast<size_t>(n)]; } |
|||
}; |
|||
|
|||
// the obvious I/O routines
|
|||
std::ostream & operator<<(std::ostream& out, Value const & g); |
|||
std::ostream & operator<<(std::ostream& out, Gen const & g); |
|||
std::ostream & operator<<(std::ostream& out, Genome const & g); |
|||
|
|||
// generates a random genome with the specified number of genes/nodes
|
|||
Genome random_genome(size_t number_of_genes); |
@ -0,0 +1,7 @@ |
|||
#pragma once |
|||
|
|||
// I needed a safe division in order to avoid SIGFPE
|
|||
int divi(int x, int y); |
|||
|
|||
// Integer power function
|
|||
int powi(int base, int exp); |
@ -0,0 +1,4 @@ |
|||
|
|||
file(GLOB sources "*.cpp") |
|||
add_library(common STATIC ${sources}) |
|||
target_link_libraries(common ${libs}) |
@ -0,0 +1,81 @@ |
|||
#include "evolve.hpp" |
|||
#include "utilities.hpp" |
|||
|
|||
#include <map> |
|||
#include <iterator> |
|||
#include <cmath> |
|||
#include <iostream> |
|||
|
|||
// Sum of squared errors between `goal` and what `genome` produces.
//
// The genome is evaluated on the 0-based index of every element of `goal`.
// A return value of 0 means the genome reproduces the goal exactly.
static Score score(const std::vector<int>& goal, Genome const & genome){
    Score sum_of_squares = 0;
    for(size_t i = 0; i < goal.size(); ++i) {
        // Convert both operands to Score *before* subtracting. Doing the
        // subtraction in int can overflow (undefined behaviour) when the
        // genome evaluates to a value close to the limits of int.
        const Score expected = goal[i];
        const Score actual = genome.evaluate_on(static_cast<int>(i));
        const Score error = expected - actual;
        sum_of_squares += error * error;
    }
    return sum_of_squares;
}
|||
|
|||
// Value-returning counterpart of std::advance: yields a copy of `it` moved
// by `d` positions and leaves the caller's iterator untouched.
template <typename It, typename Dist>
It advance2(It it, Dist d){
    return std::next(it, d);
}
|||
|
|||
// Non-owning view on the tail of a container: iterating over it visits the
// elements of `c` starting at position `s` up to the end.
template <typename C>
struct subrange_t {
    C & c;   // the viewed container (held by reference)
    int s;   // number of leading elements to skip

    subrange_t(C & container, int skip)
    : c(container), s(skip)
    {}

    auto begin(){ return std::next(c.begin(), s); }
    auto end(){ return c.end(); }

    auto begin() const { return std::next(c.begin(), s); }
    auto end() const { return c.end(); }
};
|||
|
|||
template <typename C> |
|||
auto subrange(C & c, int start){ |
|||
return subrange_t<C>(c, start); |
|||
} |
|||
|
|||
void Evolver::next_generation(){ |
|||
// evaluate (if needed)
|
|||
for(auto&& g : current_generation){ |
|||
if(g.second < 0) { |
|||
g.second = score(goal, g.first); |
|||
} |
|||
} |
|||
|
|||
// pick best no worse than parent
|
|||
auto best = current_generation[0]; |
|||
for(auto&& g : current_generation){ |
|||
if(g.second <= best.second){ |
|||
best = g; |
|||
} |
|||
} |
|||
|
|||
// continue with the best as parent
|
|||
current_generation[0] = best; |
|||
int count = 0; |
|||
for(auto& g : subrange(current_generation, 1)){ |
|||
count++; |
|||
g = best; |
|||
for(int j = 0; j < count; ++j){ |
|||
while(!g.first.mutate_random_bit()){} |
|||
} |
|||
g.second = -1; |
|||
} |
|||
} |
|||
|
|||
// Builds an Evolver whose population consists of `population_size` random
// genomes with `genome_size` genes each. The goal is left empty.
Evolver create_evolver(size_t population_size, size_t genome_size){
    Evolver evolver;
    auto & population = evolver.current_generation;
    population.reserve(population_size);

    // Every genome starts with score -1, i.e. "not evaluated yet".
    while(population.size() < population_size){
        population.emplace_back(random_genome(genome_size), -1);
    }
    return evolver;
}
@ -0,0 +1,227 @@ |
|||
#include "genome.hpp" |
|||
#include "utilities.hpp" |
|||
|
|||
#include <random> |
|||
#include <functional> |
|||
#include <iostream> |
|||
#include <stack> |
|||
#include <cassert> |
|||
|
|||
// Pretty printing with ANSI escape sequences.
// Constants are wrapped in colour code 36, nodes and the variable in colour
// code 31 (red); code 39 switches back to the default foreground colour.
// The variable (node -1) is printed as "x". Inactive genes additionally get
// a gray background, which is handled by the Genome output operator.
static std::string color(int n, bool node){
    if(!node){
        return "\x1B[36m" + std::to_string(n) + "\x1b[39m";
    }
    if(n == -1){
        return "\x1B[31mx\x1b[39m";
    }
    return "\x1B[31m" + std::to_string(n) + "\x1b[39m";
}
|||
|
|||
std::ostream & operator<<(std::ostream& out, const Value& v){ |
|||
if (v.is_node) { |
|||
return out << color(v.value.node, true); |
|||
} else { |
|||
return out << color(v.value.constant, false); |
|||
} |
|||
} |
|||
|
|||
std::ostream & operator<<(std::ostream& out, Gen const & g){ |
|||
switch (g.type) { |
|||
case Gen::kAdd: return out << "[+ " << g.x << ' ' << g.y << ']'; |
|||
case Gen::kSub: return out << "[- " << g.x << ' ' << g.y << ']'; |
|||
case Gen::kMult: return out << "[* " << g.x << ' ' << g.y << ']'; |
|||
case Gen::kDiv: return out << "[/ " << g.x << ' ' << g.y << ']'; |
|||
case Gen::kPow: return out << "[^ " << g.x << ' ' << g.y << ']'; |
|||
default: return out << "[error]"; |
|||
} |
|||
} |
|||
|
|||
std::ostream & operator<<(std::ostream& out, Genome const & g){ |
|||
for(unsigned int i = 0; i < g.genes.size(); ++i){ |
|||
if(g.active_genes[i]){ |
|||
out << g.genes[i]; |
|||
} else { |
|||
out << "\x1B[47m" << g.genes[i] << "\x1B[49m"; |
|||
} |
|||
} |
|||
return out << g.output; |
|||
} |
|||
|
|||
void Genome::calculate_active_genes(){ |
|||
active_genes.assign(genes.size(), false); |
|||
|
|||
// NOTE: prohibits concurrency, but it's a speedup for now
|
|||
static std::stack<Node> nodes; |
|||
nodes.push(output); |
|||
while(!nodes.empty()){ |
|||
const auto n = nodes.top(); |
|||
nodes.pop(); |
|||
if(n == -1 || is_active(n)) continue; |
|||
active_genes[static_cast<size_t>(n)] = true; |
|||
|
|||
auto const & g = gene(n); |
|||
if (g.x.is_node) { |
|||
nodes.push(g.x.value.node); |
|||
} |
|||
if (g.y.is_node) { |
|||
nodes.push(g.y.value.node); |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Applies the binary operation `op` to `x` and `y`.
//
// Addition, subtraction and multiplication are carried out on unsigned
// values, so results that do not fit in an int wrap around instead of being
// undefined behaviour (signed overflow); the genomes are random expressions
// and overflow easily. Converting the wrapped value back to int is
// implementation-defined before C++20 and wraps on two's complement targets.
// Division and power are delegated to the safe helpers divi() and powi().
// An unknown operation yields 0.
static int calc_op(Gen::Type const & op, int const & x, int const & y){
    const unsigned int ux = static_cast<unsigned int>(x);
    const unsigned int uy = static_cast<unsigned int>(y);
    switch (op) {
    case Gen::kAdd:  return static_cast<int>(ux + uy);
    case Gen::kSub:  return static_cast<int>(ux - uy);
    case Gen::kMult: return static_cast<int>(ux * uy);
    case Gen::kDiv:  return divi(x, y);
    case Gen::kPow:  return powi(x, y);
    default: return 0;
    }
}
|||
|
|||
int Genome::evaluate_on(int x) const{ |
|||
// NOTE: prohibits concurrency, but it's a speedup for now
|
|||
static std::vector<int> results; |
|||
results.assign(output + 1, 0); |
|||
|
|||
for(int i = 0; i <= output; ++i){ |
|||
if(!is_active(i)) continue; |
|||
|
|||
auto const & g = gene(i); |
|||
auto const xv = g.x.is_node ? (g.x.value.node == -1 ? x : results[g.x.value.node]) : g.x.value.constant; |
|||
auto const yv = g.y.is_node ? (g.y.value.node == -1 ? x : results[g.y.value.node]) : g.y.value.constant; |
|||
|
|||
results[i] = calc_op(g.type, xv, yv); |
|||
} |
|||
return results[output]; |
|||
} |
|||
|
|||
// FIXME: encapsulate all this
|
|||
using Generator = std::mt19937; |
|||
using Uniform = std::uniform_int_distribution<int>; |
|||
using Params = Uniform::param_type; |
|||
|
|||
static Generator generator{0}; |
|||
static Uniform distribution; |
|||
static auto random_type = std::bind(distribution, generator, Params(0, Gen::kTypes-1)); |
|||
static auto random_const = std::bind(distribution, generator, Params(-20, 20)); |
|||
static auto random_node = [](int n){ return distribution(generator, Params(0, n-1)); }; |
|||
|
|||
// Draws a random operand for gene number n: a constant, the variable "x",
// or (only when n > 0) a reference to one of the nodes 0 .. n-1.
static Value random_value(Node n){
    Value v;

    // Without earlier nodes (n == 0) only the first two kinds are possible.
    const int last_kind = n ? 2 : 1;
    const int kind = distribution(generator, Params(0, last_kind));

    if(kind == 0){
        v.is_node = false;
        v.value.constant = random_const();
    } else if(kind == 1){
        v.is_node = true;
        v.value.node = -1;
    } else if(kind == 2){
        v.is_node = true;
        v.value.node = random_node(n);
    }

    return v;
}
|||
|
|||
// Builds a random gene for position n: a random operation with two random
// operands (drawn in the order type, x, y).
static Gen random_gen(Node n){
    const auto type = static_cast<Gen::Type>(random_type());
    const Value first = random_value(n);
    const Value second = random_value(n);
    return Gen{type, first, second};
}
|||
|
|||
// Generates a random genome with `number_of_genes` genes, a random output
// node, and up-to-date activity information.
//
// Requires number_of_genes > 0: an output node has to exist, and drawing it
// from an empty range would violate the precondition of the distribution.
Genome random_genome(size_t number_of_genes){
    assert(number_of_genes > 0);

    Genome g;
    // FIXME: don't silence the warning, fix the node-numbering
    const int nn = static_cast<int>(number_of_genes);
    g.genes.reserve(number_of_genes);
    for(int i = 0; i < nn; ++i){
        // Gene i may only refer to the nodes 0 .. i-1.
        g.genes.emplace_back(random_gen(i));
    }
    g.output = random_node(nn);
    g.calculate_active_genes();
    return g;
}
|||
|
|||
bool Genome::mutate_random_bit(){ |
|||
// FIXME: make this more robust
|
|||
const auto fields_per_gen = 3; |
|||
const auto bit = distribution(generator, Params(0, fields_per_gen * genes.size())); // inclusive
|
|||
if(bit == fields_per_gen * genes.size()){ |
|||
output = random_node(genes.size()); |
|||
calculate_active_genes(); |
|||
assert(invariant()); |
|||
return true; |
|||
} else { |
|||
const auto n = bit / fields_per_gen; |
|||
const auto field = bit % fields_per_gen; |
|||
auto & g = gene(n); |
|||
switch(field){ |
|||
case 0: |
|||
g.type = Gen::Type(random_type()); |
|||
break; |
|||
case 1: |
|||
g.x = random_value(n); |
|||
break; |
|||
case 2: |
|||
g.y = random_value(n); |
|||
break; |
|||
} |
|||
if(is_active(n)){ |
|||
calculate_active_genes(); |
|||
assert(invariant()); |
|||
return true; |
|||
} else { |
|||
assert(invariant()); |
|||
return false; |
|||
} |
|||
} |
|||
} |
|||
|
|||
bool Genome::invariant() const{ |
|||
for(int n = 0; n < genes.size(); ++n){ |
|||
Gen const & g = gene(n); |
|||
if(g.x.is_node && g.x.value.node >= n) return false; |
|||
if(g.y.is_node && g.y.value.node >= n) return false; |
|||
} |
|||
return true; |
|||
} |
|||
|
|||
std::string Genome::as_formula() const{ |
|||
static std::vector<std::string> results; |
|||
results.assign(output+1, "err"); |
|||
|
|||
for(int i = 0; i <= output; ++i){ |
|||
if(!is_active(i)) continue; |
|||
|
|||
auto const & g = gene(i); |
|||
std::string x; |
|||
std::string y; |
|||
|
|||
if(g.x.is_node){ |
|||
if(g.x.value.node == -1) x = "x"; |
|||
else x = "(" + results[g.x.value.node] + ")"; |
|||
} else { |
|||
x = std::to_string(g.x.value.constant); |
|||
} |
|||
|
|||
if(g.y.is_node){ |
|||
if(g.y.value.node == -1) y = "x"; |
|||
else y = "(" + results[g.y.value.node] + ")"; |
|||
} else { |
|||
y = std::to_string(g.y.value.constant); |
|||
} |
|||
|
|||
switch (g.type) { |
|||
case Gen::kAdd: results[i] = x + "+" + y; break; |
|||
case Gen::kSub: results[i] = x + "-" + y; break; |
|||
case Gen::kMult: results[i] = x + "*" + y; break; |
|||
case Gen::kDiv: results[i] = x + "/" + y; break; |
|||
case Gen::kPow: results[i] = x + "^" + y; break; |
|||
default: results[i] = "err"; |
|||
} |
|||
} |
|||
return results[output]; |
|||
} |
|||
|
@ -0,0 +1,32 @@ |
|||
#include "utilities.hpp" |
|||
|
|||
#include <limits> |
|||
|
|||
using namespace std; |
|||
|
|||
// Division that never traps.
//  - x / 0 yields the largest int for positive x, the smallest int for
//    negative x, and 1 for x == 0.
//  - INT_MIN / -1 (whose true result does not fit in an int and raises
//    SIGFPE) yields the largest int.
// Every other case is plain integer division.
int divi(int x, int y){
    const int lowest = std::numeric_limits<int>::min();
    const int highest = std::numeric_limits<int>::max();

    if(y == 0){
        if(x == 0) return 1;
        return x < 0 ? lowest : highest;
    }
    if(y == -1 && x == lowest){
        return highest;
    }
    return x / y;
}
|||
|
|||
// Integer power, computed with exponentiation by squaring.
//
// Returns 0 for a negative exponent and 1 for exp == 0. The arithmetic is
// done on unsigned values, so a result that does not fit in an int wraps
// around instead of being undefined behaviour. This also covers the final
// squaring step, which can exceed the int range even when the result itself
// fits (e.g. powi(2, 16)). Converting the wrapped value back to int is
// implementation-defined before C++20 and wraps on two's complement targets.
int powi(int base, int exp){
    if(exp < 0) return 0;

    unsigned int result = 1;
    unsigned int factor = static_cast<unsigned int>(base);
    while (exp) {
        if (exp & 1) {
            result *= factor;
        }
        exp >>= 1;
        factor *= factor;
    }
    return static_cast<int>(result);
}
@ -0,0 +1,8 @@ |
|||
|
|||
file(GLOB sources "*.cpp") |
|||
|
|||
foreach(source ${sources}) |
|||
get_filename_component(exec ${source} NAME_WE) |
|||
add_executable(${exec} ${source}) |
|||
target_link_libraries(${exec} common ${libs}) |
|||
endforeach() |
@ -0,0 +1,41 @@ |
|||
#include <iostream> |
|||
#include <vector> |
|||
#include <limits> |
|||
|
|||
// I ran into some trouble with division, so I decided to test the boundary
|
|||
// cases. Turns out that INT_MIN / -1 overflows and signals a SIGFPE :(.
|
|||
|
|||
using namespace std; |
|||
using T = long int; |
|||
|
|||
// Division with the trapping cases intercepted. Whenever a plain division
// would have trapped, "SIGFPE" is printed and a substitute value is returned.
static T myDiv(T x, T y){
    using Limits = std::numeric_limits<T>;

    if(y == 0){
        std::cout << "SIGFPE";
        if(x < 0) return Limits::min();
        if(x == 0) return 1;
        return Limits::max();
    }

    // somehow this is wrong for shorts and chars, but ok for all bigger integral types
    const bool overflows = Limits::is_signed && x == Limits::min() && y == -1;
    if(overflows){
        std::cout << "SIGFPE";
        return Limits::max();
    }
    return x / y;
}
|||
|
|||
int main(){ |
|||
// The bad guys
|
|||
vector<T> v = {numeric_limits<T>::min(), -1, 0, 1, numeric_limits<T>::max()}; |
|||
|
|||
for(auto&&x : v){ |
|||
std::cout << x << " "; |
|||
} |
|||
std::cout << std::endl; |
|||
|
|||
for(auto&& x : v){ |
|||
for(auto&& y : v){ |
|||
std::cout << x << " / " << y << " = " << std::flush; |
|||
std::cout << myDiv(x, y) << std::endl; |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,57 @@ |
|||
#include "genome.hpp" |
|||
#include "evolve.hpp" |
|||
|
|||
#include <boost/program_options.hpp> |
|||
#include <iostream> |
|||
|
|||
int main(int argc, char** argv){ |
|||
namespace po = boost::program_options; |
|||
|
|||
po::options_description opts; |
|||
opts.add_options() |
|||
("input", po::value<std::vector<int>>()->multitoken(), "sequence to generate a formula for (e.g. 1 2 4 8)") |
|||
("help,h", po::bool_switch(), "show this help"); |
|||
|
|||
po::positional_options_description file_opts; |
|||
file_opts.add("input", -1); |
|||
|
|||
po::variables_map vm; |
|||
po::store(po::command_line_parser(argc, argv).options(opts).positional(file_opts).run(), vm); |
|||
po::notify(vm); |
|||
|
|||
if(vm["help"].as<bool>()){ |
|||
std::cout << "Genetic Sequence Formulizer, version " << __DATE__ << "\n"; |
|||
std::cout << "seq2form [-h] <sequence of non-negative integers>\n"; |
|||
std::cout << opts << std::endl; |
|||
return 0; |
|||
} |
|||
|
|||
// TODO: make these program options
|
|||
const auto genome_size = 16; |
|||
const auto population = 10; |
|||
const auto generations = 1000001 / population; |
|||
|
|||
auto evolver = create_evolver(population, genome_size); |
|||
evolver.goal = vm["input"].as<std::vector<int>>(); |
|||
|
|||
std::cout << "your input:\t"; |
|||
for(auto&& x : evolver.goal) std::cout << x << ", "; |
|||
std::cout << std::endl; |
|||
|
|||
// GO! (until we find a perfect solution, or we hit the generations-bound)
|
|||
for(int i = 0; i < generations; ++i){ |
|||
evolver.next_generation(); |
|||
if(evolver.current_generation[0].second <= 0) { |
|||
break; |
|||
} |
|||
} |
|||
|
|||
// SHOW!
|
|||
auto & best_genome = evolver.current_generation[0].first; |
|||
std::cout << "formula:\t" << best_genome.as_formula() << std::endl; |
|||
std::cout << "continuation:\t"; |
|||
for(int i = 0; i < 50; ++i){ |
|||
std::cout << best_genome.evaluate_on(i) << ", "; |
|||
} |
|||
std::cout << std::endl; |
|||
} |
Reference in new issue