commit fcf5765cb5042543447349b75d2b3aa8fb6b45e9 Author: Joshua Moerman Date: Mon Mar 17 20:24:33 2014 +0100 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..781b756 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.user +build-* +.DS_Store + diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..d208e3a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,14 @@ +project(Genetics) +cmake_minimum_required(VERSION 2.8) + +include_directories("${PROJECT_SOURCE_DIR}/include/") +add_definitions(-std=c++1y) + +find_package(Boost REQUIRED COMPONENTS filesystem program_options serialization system) +include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) +set(libs ${libs} ${Boost_LIBRARIES}) + +set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +add_subdirectory("include") +add_subdirectory("lib") +add_subdirectory("src") diff --git a/README.md b/README.md new file mode 100644 index 0000000..024d01b --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# Genetic Sequence Formulizer +This program will take a sequence entered by the user and tries to find a formula producing this sequence. I made this to play around with genetic programming. Inspired by the following two talks: + +* [Reducing Wasted Evaluations in Cartesian Genetic Programming, by Brian Goldman](https://www.youtube.com/watch?v=3HsgVHv1ho8) +* [Automatic Algorithm Invention with a GPU, by Wes Faler](https://www.youtube.com/watch?v=xQDazGrKsuM) + +I used a technique similar to the one described in the first presentation (the one called *single*), as it seemed to be the easiest one. The nice thing is that there is always just one parent, but there are inactive genes which are allowed to mutate freely. + +## Examples +Some actual output of the program (truncated): + + $ ./main 0 0 0 0 0 0 0 + your input: 0, 0, 0, 0, 0, 0, 0, + formula: (x-x)*x + continuation: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... + + $ ./main 1 2 3 + your input: 1, 2, 3, + formula: ((x-18)/(x-18))+x + continuation: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ... + + $ ./main 1 2 4 8 + your input: 1, 2, 4, 8, + formula: 2^x + continuation: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, ... + + $ ./main 37 + your input: 37, + formula: (10+12)+15 + continuation: 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, + +## Notes + +* The algorithm does not yet simplify the formula (I should take this into account when scoring the genes). +* The sequences are 0-based. +* The constants used in the genes are bound to 20 (so in the last example, it needed multiple constants added). +* It treats ```1/0``` as ```INT_MAX``` (we need some safety as we evaluate randomly generated expressions). +* I may be using some c++14 features. Install a new clang or gcc to get this awesomeness. \ No newline at end of file diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt new file mode 100644 index 0000000..987fe23 --- /dev/null +++ b/include/CMakeLists.txt @@ -0,0 +1,4 @@ + +# Hack to show headers in Qt Creator +file(GLOB headers "*.hpp") +add_library(dummy ${headers} dummy.cpp) diff --git a/include/dummy.cpp b/include/dummy.cpp new file mode 100644 index 0000000..7dcf12c --- /dev/null +++ b/include/dummy.cpp @@ -0,0 +1,2 @@ +// Needed for the hack to show headers in Qt Creator +static int x = x; diff --git a/include/evolve.hpp b/include/evolve.hpp new file mode 100644 index 0000000..e2b3aed --- /dev/null +++ b/include/evolve.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "genome.hpp" +#include + +using Score = double; + +struct Evolver{ + // stores the genome with its score + // first element is the best one (after calling next_generation()). + std::vector> current_generation; + + // goal which we are trying to achieve + std::vector goal; + + // evaluates current generation, picks the best one, and generate new generation + void next_generation(); +}; + +// reate a random generation to start with +Evolver create_evolver(size_t population_size, size_t genome_size); diff --git a/include/genome.hpp b/include/genome.hpp new file mode 100644 index 0000000..254d33e --- /dev/null +++ b/include/genome.hpp @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +// TODO: make this a decent type, and also make the variable "x" a decent type +using Node = int; + +struct Value { + bool is_node; + union { + int constant; + Node node; // -1 reserved for "x" + } value; +}; + +struct Gen { + enum Type { + kAdd, + kSub, + kMult, + kDiv, + kPow, + kTypes + } type; + Value x, y; +}; + +struct Genome { + std::vector genes; + std::vector active_genes; + Node output; + + // calculates which genes are active (used internally) + void calculate_active_genes(); + + // evaluates expression with input x + int evaluate_on(int x) const; + + // mutate a single bit of some gene, returns true when a active gene was altered + bool mutate_random_bit(); + + // checks whether the invariant still holds (for debugging) + bool invariant() const; + + // returns a string in a (more or less) readable format + std::string as_formula() const; + + // some trivial getters (to silence warnings) + Gen & gene(int n) { return genes[static_cast(n)]; } + Gen const & gene(int n) const { return genes[static_cast(n)]; } + bool is_active(int n) const { return active_genes[static_cast(n)]; } +}; + +// the obvious I/O routines +std::ostream & operator<<(std::ostream& out, Value const & g); +std::ostream & operator<<(std::ostream& out, Gen const & g); +std::ostream & operator<<(std::ostream& out, Genome const & g); + +// generates a random genome with the specified number of genes/nodes +Genome random_genome(size_t number_of_genes); diff --git a/include/utilities.hpp b/include/utilities.hpp new file mode 100644 index 0000000..eedfc0e --- /dev/null +++ b/include/utilities.hpp @@ -0,0 +1,7 @@ +#pragma once + +// I needed a safe division in order to avoid SIGFPE +int divi(int x, int y); + +// Integer power function +int powi(int base, int exp); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt new file mode 100644 index 0000000..6196f11 --- /dev/null +++ b/lib/CMakeLists.txt @@ -0,0 +1,4 @@ + +file(GLOB sources "*.cpp") +add_library(common STATIC ${sources}) +target_link_libraries(common ${libs}) diff --git a/lib/evolve.cpp b/lib/evolve.cpp new file mode 100644 index 0000000..abbdd79 --- /dev/null +++ b/lib/evolve.cpp @@ -0,0 +1,81 @@ +#include "evolve.hpp" +#include "utilities.hpp" + +#include +#include +#include +#include + +static Score score(const std::vector& goal, Genome const & genome){ + Score ss = 0; + for(int i = 0; i < goal.size(); ++i) { + Score error = goal[i] - genome.evaluate_on(i); + ss += error * error; + } + return ss; +} + +template +It advance2(It it, Dist d){ + std::advance(it, d); + return it; +} + +template +struct subrange_t { + C & c; + int s; + + subrange_t(C & c_, int s_) + : c(c_), s(s_) + {} + + auto begin(){ return advance2(c.begin(), s); } + auto end(){ return c.end(); } + + auto begin() const { return advance2(c.begin(), s); } + auto end() const { return c.end(); } +}; + +template +auto subrange(C & c, int start){ + return subrange_t(c, start); +} + +void Evolver::next_generation(){ + // evaluate (if needed) + for(auto&& g : current_generation){ + if(g.second < 0) { + g.second = score(goal, g.first); + } + } + + // pick best no worse than parent + auto best = current_generation[0]; + for(auto&& g : current_generation){ + if(g.second <= best.second){ + best = g; + } + } + + // continue with the best as parent + current_generation[0] = best; + int count = 0; + for(auto& g : subrange(current_generation, 1)){ + count++; + g = best; + for(int j = 0; j < count; ++j){ + while(!g.first.mutate_random_bit()){} + } + g.second = -1; + } +} + +Evolver create_evolver(size_t population_size, size_t genome_size){ + Evolver e; + e.current_generation.reserve(population_size); + for(size_t i = 0; i < population_size; ++i){ + e.current_generation.emplace_back(random_genome(genome_size), -1); + } + return e; +} diff --git a/lib/genome.cpp b/lib/genome.cpp new file mode 100644 index 0000000..65daf16 --- /dev/null +++ b/lib/genome.cpp @@ -0,0 +1,227 @@ +#include "genome.hpp" +#include "utilities.hpp" + +#include +#include +#include +#include +#include + +// Pretty printing nowadays obviously needs colors +// blue == constant. red == node or variable +// gray background == inactive gene +static std::string color(int n, bool node){ + if(node && n == -1) return "\x1B[31mx\x1b[39m"; + if(node) return "\x1B[31m" + std::to_string(n) + "\x1b[39m"; + return "\x1B[36m" + std::to_string(n) + "\x1b[39m"; +} + +std::ostream & operator<<(std::ostream& out, const Value& v){ + if (v.is_node) { + return out << color(v.value.node, true); + } else { + return out << color(v.value.constant, false); + } +} + +std::ostream & operator<<(std::ostream& out, Gen const & g){ + switch (g.type) { + case Gen::kAdd: return out << "[+ " << g.x << ' ' << g.y << ']'; + case Gen::kSub: return out << "[- " << g.x << ' ' << g.y << ']'; + case Gen::kMult: return out << "[* " << g.x << ' ' << g.y << ']'; + case Gen::kDiv: return out << "[/ " << g.x << ' ' << g.y << ']'; + case Gen::kPow: return out << "[^ " << g.x << ' ' << g.y << ']'; + default: return out << "[error]"; + } +} + +std::ostream & operator<<(std::ostream& out, Genome const & g){ + for(unsigned int i = 0; i < g.genes.size(); ++i){ + if(g.active_genes[i]){ + out << g.genes[i]; + } else { + out << "\x1B[47m" << g.genes[i] << "\x1B[49m"; + } + } + return out << g.output; +} + +void Genome::calculate_active_genes(){ + active_genes.assign(genes.size(), false); + + // NOTE: prohibits concurrency, but it's a speedup for now + static std::stack nodes; + nodes.push(output); + while(!nodes.empty()){ + const auto n = nodes.top(); + nodes.pop(); + if(n == -1 || is_active(n)) continue; + active_genes[static_cast(n)] = true; + + auto const & g = gene(n); + if (g.x.is_node) { + nodes.push(g.x.value.node); + } + if (g.y.is_node) { + nodes.push(g.y.value.node); + } + } +} + +static int calc_op(Gen::Type const & op, int const & x, int const & y){ + switch (op) { + case Gen::kAdd: return x + y; + case Gen::kSub: return x - y; + case Gen::kMult: return x * y; + case Gen::kDiv: return divi(x, y); + case Gen::kPow: return powi(x, y); + default: return 0; + } +} + +int Genome::evaluate_on(int x) const{ + // NOTE: prohibits concurrency, but it's a speedup for now + static std::vector results; + results.assign(output + 1, 0); + + for(int i = 0; i <= output; ++i){ + if(!is_active(i)) continue; + + auto const & g = gene(i); + auto const xv = g.x.is_node ? (g.x.value.node == -1 ? x : results[g.x.value.node]) : g.x.value.constant; + auto const yv = g.y.is_node ? (g.y.value.node == -1 ? x : results[g.y.value.node]) : g.y.value.constant; + + results[i] = calc_op(g.type, xv, yv); + } + return results[output]; +} + +// FIXME: encapsulate all this +using Generator = std::mt19937; +using Uniform = std::uniform_int_distribution; +using Params = Uniform::param_type; + +static Generator generator{0}; +static Uniform distribution; +static auto random_type = std::bind(distribution, generator, Params(0, Gen::kTypes-1)); +static auto random_const = std::bind(distribution, generator, Params(-20, 20)); +static auto random_node = [](int n){ return distribution(generator, Params(0, n-1)); }; + +static Value random_value(Node n){ + Value v; + switch(distribution(generator, Params(0, n ? 2 : 1))){ + case 0: v.is_node = false; + v.value.constant = random_const(); + break; + case 1: v.is_node = true; + v.value.node = -1; + break; + case 2: v.is_node = true; + v.value.node = random_node(n); + break; + } + + return v; +} + +static Gen random_gen(Node n){ + Gen g; + g.type = Gen::Type(random_type()); + g.x = random_value(n); + g.y = random_value(n); + return g; +} + +Genome random_genome(size_t number_of_genes){ + Genome g; + // FIXME: don't silence the warning, fix the node-numbering + const int nn = static_cast(number_of_genes); + for(int i = 0; i < nn; ++i){ + g.genes.emplace_back(random_gen(i)); + } + g.output = random_node(nn); + g.calculate_active_genes(); + return g; +} + +bool Genome::mutate_random_bit(){ + // FIXME: make this more robust + const auto fields_per_gen = 3; + const auto bit = distribution(generator, Params(0, fields_per_gen * genes.size())); // inclusive + if(bit == fields_per_gen * genes.size()){ + output = random_node(genes.size()); + calculate_active_genes(); + assert(invariant()); + return true; + } else { + const auto n = bit / fields_per_gen; + const auto field = bit % fields_per_gen; + auto & g = gene(n); + switch(field){ + case 0: + g.type = Gen::Type(random_type()); + break; + case 1: + g.x = random_value(n); + break; + case 2: + g.y = random_value(n); + break; + } + if(is_active(n)){ + calculate_active_genes(); + assert(invariant()); + return true; + } else { + assert(invariant()); + return false; + } + } +} + +bool Genome::invariant() const{ + for(int n = 0; n < genes.size(); ++n){ + Gen const & g = gene(n); + if(g.x.is_node && g.x.value.node >= n) return false; + if(g.y.is_node && g.y.value.node >= n) return false; + } + return true; +} + +std::string Genome::as_formula() const{ + static std::vector results; + results.assign(output+1, "err"); + + for(int i = 0; i <= output; ++i){ + if(!is_active(i)) continue; + + auto const & g = gene(i); + std::string x; + std::string y; + + if(g.x.is_node){ + if(g.x.value.node == -1) x = "x"; + else x = "(" + results[g.x.value.node] + ")"; + } else { + x = std::to_string(g.x.value.constant); + } + + if(g.y.is_node){ + if(g.y.value.node == -1) y = "x"; + else y = "(" + results[g.y.value.node] + ")"; + } else { + y = std::to_string(g.y.value.constant); + } + + switch (g.type) { + case Gen::kAdd: results[i] = x + "+" + y; break; + case Gen::kSub: results[i] = x + "-" + y; break; + case Gen::kMult: results[i] = x + "*" + y; break; + case Gen::kDiv: results[i] = x + "/" + y; break; + case Gen::kPow: results[i] = x + "^" + y; break; + default: results[i] = "err"; + } + } + return results[output]; +} + diff --git a/lib/utilities.cpp b/lib/utilities.cpp new file mode 100644 index 0000000..c0c47d5 --- /dev/null +++ b/lib/utilities.cpp @@ -0,0 +1,32 @@ +#include "utilities.hpp" + +#include + +using namespace std; + +int divi(int x, int y){ + // clearly we don't want a division by zero + if(!y) { + if(x < 0) return numeric_limits::min(); + if(x == 0) return 1; + return numeric_limits::max(); + } + // in this situation there is a overflow which causes a SIGFPE + if(x == numeric_limits::min() && y == -1) { + return numeric_limits::max(); + } + return x / y; +} + +// exponentiation by squaring +int powi(int base, int exp){ + if(exp < 0) return 0; + int res = 1; + while (exp) { + if (exp & 1) + res *= base; + exp >>= 1; + base *= base; + } + return res; +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..56b78f0 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,8 @@ + +file(GLOB sources "*.cpp") + +foreach(source ${sources}) + get_filename_component(exec ${source} NAME_WE) + add_executable(${exec} ${source}) + target_link_libraries(${exec} common ${libs}) +endforeach() diff --git a/src/div_test.cpp b/src/div_test.cpp new file mode 100644 index 0000000..bfee8d8 --- /dev/null +++ b/src/div_test.cpp @@ -0,0 +1,41 @@ +#include +#include +#include + +// I ran into some trouble with division, so I decided to test the boundary +// cases. Turns out that INT_MIN / -1 overflows and signals a SIGFPE :(. + +using namespace std; +using T = long int; + +static T myDiv(T x, T y){ + if(!y) { + std::cout << "SIGFPE"; + if(x < 0) return numeric_limits::min(); + if(x == 0) return 1; + return numeric_limits::max(); + } + // somehow this is wrong for shorts and chars, but ok for all bigger integral types + if(numeric_limits::is_signed && x == numeric_limits::min() && y == -1) { + std::cout << "SIGFPE"; + return numeric_limits::max(); + } + return x / y; +} + +int main(){ + // The bad guys + vector v = {numeric_limits::min(), -1, 0, 1, numeric_limits::max()}; + + for(auto&&x : v){ + std::cout << x << " "; + } + std::cout << std::endl; + + for(auto&& x : v){ + for(auto&& y : v){ + std::cout << x << " / " << y << " = " << std::flush; + std::cout << myDiv(x, y) << std::endl; + } + } +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..66a4bb4 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,57 @@ +#include "genome.hpp" +#include "evolve.hpp" + +#include +#include + +int main(int argc, char** argv){ + namespace po = boost::program_options; + + po::options_description opts; + opts.add_options() + ("input", po::value>()->multitoken(), "sequence to generate a formula for (e.g. 1 2 4 8)") + ("help,h", po::bool_switch(), "show this help"); + + po::positional_options_description file_opts; + file_opts.add("input", -1); + + po::variables_map vm; + po::store(po::command_line_parser(argc, argv).options(opts).positional(file_opts).run(), vm); + po::notify(vm); + + if(vm["help"].as()){ + std::cout << "Genetic Sequence Formulizer, version " << __DATE__ << "\n"; + std::cout << "seq2form [-h] \n"; + std::cout << opts << std::endl; + return 0; + } + + // TODO: make these program options + const auto genome_size = 16; + const auto population = 10; + const auto generations = 1000001 / population; + + auto evolver = create_evolver(population, genome_size); + evolver.goal = vm["input"].as>(); + + std::cout << "your input:\t"; + for(auto&& x : evolver.goal) std::cout << x << ", "; + std::cout << std::endl; + + // GO! (until we find a perfect solution, or we hit the generations-bound) + for(int i = 0; i < generations; ++i){ + evolver.next_generation(); + if(evolver.current_generation[0].second <= 0) { + break; + } + } + + // SHOW! + auto & best_genome = evolver.current_generation[0].first; + std::cout << "formula:\t" << best_genome.as_formula() << std::endl; + std::cout << "continuation:\t"; + for(int i = 0; i < 50; ++i){ + std::cout << best_genome.evaluate_on(i) << ", "; + } + std::cout << std::endl; +}