Browse Source

Initial commit

master
Joshua Moerman 10 years ago
commit
fcf5765cb5
  1. 4
      .gitignore
  2. 14
      CMakeLists.txt
  3. 38
      README.md
  4. 4
      include/CMakeLists.txt
  5. 2
      include/dummy.cpp
  6. 21
      include/evolve.hpp
  7. 61
      include/genome.hpp
  8. 7
      include/utilities.hpp
  9. 4
      lib/CMakeLists.txt
  10. 81
      lib/evolve.cpp
  11. 227
      lib/genome.cpp
  12. 32
      lib/utilities.cpp
  13. 8
      src/CMakeLists.txt
  14. 41
      src/div_test.cpp
  15. 57
      src/main.cpp

4
.gitignore

@ -0,0 +1,4 @@
*.user
build-*
.DS_Store

14
CMakeLists.txt

@ -0,0 +1,14 @@
project(Genetics)
cmake_minimum_required(VERSION 2.8)
include_directories("${PROJECT_SOURCE_DIR}/include/")
add_definitions(-std=c++1y)
find_package(Boost REQUIRED COMPONENTS filesystem program_options serialization system)
include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
set(libs ${libs} ${Boost_LIBRARIES})
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
add_subdirectory("include")
add_subdirectory("lib")
add_subdirectory("src")

38
README.md

@ -0,0 +1,38 @@
# Genetic Sequence Formulizer
This program will take a sequence entered by the user and tries to find a formula producing this sequence. I made this to play around with genetic programming. Inspired by the following two talks:
* [Reducing Wasted Evaluations in Cartesian Genetic Programming, by Brian Goldman](https://www.youtube.com/watch?v=3HsgVHv1ho8)
* [Automatic Algorithm Invention with a GPU, by Wes Faler](https://www.youtube.com/watch?v=xQDazGrKsuM)
I used a technique similar to the one described in the first presentation (the one called *single*), as it seemed to be the easiest one. The nice thing is that there is always just one parent, but there are inactive genes which are allowed to mutate freely.
## Examples
Some actual output of the program (truncated):
$ ./main 0 0 0 0 0 0 0
your input: 0, 0, 0, 0, 0, 0, 0,
formula: (x-x)*x
continuation: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ ./main 1 2 3
your input: 1, 2, 3,
formula: ((x-18)/(x-18))+x
continuation: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...
$ ./main 1 2 4 8
your input: 1, 2, 4, 8,
formula: 2^x
continuation: 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, ...
$ ./main 37
your input: 37,
formula: (10+12)+15
continuation: 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
## Notes
* The algorithm does not yet simplify the formula (I should take this into account when scoring the genes).
* The sequences are 0-based.
* The constants used in the genes are bound to 20 (so in the last example, it needed multiple constants added).
* It treats ```1/0``` as ```INT_MAX``` (we need some safety as we evaluate randomly generated expressions).
* I may be using some c++14 features. Install a new clang or gcc to get this awesomeness.

4
include/CMakeLists.txt

@ -0,0 +1,4 @@
# Hack to show headers in Qt Creator
file(GLOB headers "*.hpp")
add_library(dummy ${headers} dummy.cpp)

2
include/dummy.cpp

@ -0,0 +1,2 @@
// Needed for the hack to show headers in Qt Creator
static int x = x;

21
include/evolve.hpp

@ -0,0 +1,21 @@
#pragma once
#include "genome.hpp"
#include <vector>
using Score = double;
struct Evolver{
// stores the genome with its score
// first element is the best one (after calling next_generation()).
std::vector<std::pair<Genome, Score>> current_generation;
// goal which we are trying to achieve
std::vector<int> goal;
// evaluates current generation, picks the best one, and generate new generation
void next_generation();
};
// reate a random generation to start with
Evolver create_evolver(size_t population_size, size_t genome_size);

61
include/genome.hpp

@ -0,0 +1,61 @@
#pragma once
#include <vector>
#include <iosfwd>
// TODO: make this a decent type, and also make the variable "x" a decent type
using Node = int;
struct Value {
bool is_node;
union {
int constant;
Node node; // -1 reserved for "x"
} value;
};
struct Gen {
enum Type {
kAdd,
kSub,
kMult,
kDiv,
kPow,
kTypes
} type;
Value x, y;
};
struct Genome {
std::vector<Gen> genes;
std::vector<bool> active_genes;
Node output;
// calculates which genes are active (used internally)
void calculate_active_genes();
// evaluates expression with input x
int evaluate_on(int x) const;
// mutate a single bit of some gene, returns true when a active gene was altered
bool mutate_random_bit();
// checks whether the invariant still holds (for debugging)
bool invariant() const;
// returns a string in a (more or less) readable format
std::string as_formula() const;
// some trivial getters (to silence warnings)
Gen & gene(int n) { return genes[static_cast<size_t>(n)]; }
Gen const & gene(int n) const { return genes[static_cast<size_t>(n)]; }
bool is_active(int n) const { return active_genes[static_cast<size_t>(n)]; }
};
// the obvious I/O routines
std::ostream & operator<<(std::ostream& out, Value const & g);
std::ostream & operator<<(std::ostream& out, Gen const & g);
std::ostream & operator<<(std::ostream& out, Genome const & g);
// generates a random genome with the specified number of genes/nodes
Genome random_genome(size_t number_of_genes);

7
include/utilities.hpp

@ -0,0 +1,7 @@
#pragma once
// I needed a safe division in order to avoid SIGFPE
int divi(int x, int y);
// Integer power function
int powi(int base, int exp);

4
lib/CMakeLists.txt

@ -0,0 +1,4 @@
file(GLOB sources "*.cpp")
add_library(common STATIC ${sources})
target_link_libraries(common ${libs})

81
lib/evolve.cpp

@ -0,0 +1,81 @@
#include "evolve.hpp"
#include "utilities.hpp"
#include <map>
#include <iterator>
#include <cmath>
#include <iostream>
static Score score(const std::vector<int>& goal, Genome const & genome){
Score ss = 0;
for(int i = 0; i < goal.size(); ++i) {
Score error = goal[i] - genome.evaluate_on(i);
ss += error * error;
}
return ss;
}
template <typename It, typename Dist>
It advance2(It it, Dist d){
std::advance(it, d);
return it;
}
template <typename C>
struct subrange_t {
C & c;
int s;
subrange_t(C & c_, int s_)
: c(c_), s(s_)
{}
auto begin(){ return advance2(c.begin(), s); }
auto end(){ return c.end(); }
auto begin() const { return advance2(c.begin(), s); }
auto end() const { return c.end(); }
};
template <typename C>
auto subrange(C & c, int start){
return subrange_t<C>(c, start);
}
void Evolver::next_generation(){
// evaluate (if needed)
for(auto&& g : current_generation){
if(g.second < 0) {
g.second = score(goal, g.first);
}
}
// pick best no worse than parent
auto best = current_generation[0];
for(auto&& g : current_generation){
if(g.second <= best.second){
best = g;
}
}
// continue with the best as parent
current_generation[0] = best;
int count = 0;
for(auto& g : subrange(current_generation, 1)){
count++;
g = best;
for(int j = 0; j < count; ++j){
while(!g.first.mutate_random_bit()){}
}
g.second = -1;
}
}
Evolver create_evolver(size_t population_size, size_t genome_size){
Evolver e;
e.current_generation.reserve(population_size);
for(size_t i = 0; i < population_size; ++i){
e.current_generation.emplace_back(random_genome(genome_size), -1);
}
return e;
}

227
lib/genome.cpp

@ -0,0 +1,227 @@
#include "genome.hpp"
#include "utilities.hpp"
#include <random>
#include <functional>
#include <iostream>
#include <stack>
#include <cassert>
// Pretty printing nowadays obviously needs colors
// blue == constant. red == node or variable
// gray background == inactive gene
static std::string color(int n, bool node){
if(node && n == -1) return "\x1B[31mx\x1b[39m";
if(node) return "\x1B[31m" + std::to_string(n) + "\x1b[39m";
return "\x1B[36m" + std::to_string(n) + "\x1b[39m";
}
std::ostream & operator<<(std::ostream& out, const Value& v){
if (v.is_node) {
return out << color(v.value.node, true);
} else {
return out << color(v.value.constant, false);
}
}
std::ostream & operator<<(std::ostream& out, Gen const & g){
switch (g.type) {
case Gen::kAdd: return out << "[+ " << g.x << ' ' << g.y << ']';
case Gen::kSub: return out << "[- " << g.x << ' ' << g.y << ']';
case Gen::kMult: return out << "[* " << g.x << ' ' << g.y << ']';
case Gen::kDiv: return out << "[/ " << g.x << ' ' << g.y << ']';
case Gen::kPow: return out << "[^ " << g.x << ' ' << g.y << ']';
default: return out << "[error]";
}
}
std::ostream & operator<<(std::ostream& out, Genome const & g){
for(unsigned int i = 0; i < g.genes.size(); ++i){
if(g.active_genes[i]){
out << g.genes[i];
} else {
out << "\x1B[47m" << g.genes[i] << "\x1B[49m";
}
}
return out << g.output;
}
void Genome::calculate_active_genes(){
active_genes.assign(genes.size(), false);
// NOTE: prohibits concurrency, but it's a speedup for now
static std::stack<Node> nodes;
nodes.push(output);
while(!nodes.empty()){
const auto n = nodes.top();
nodes.pop();
if(n == -1 || is_active(n)) continue;
active_genes[static_cast<size_t>(n)] = true;
auto const & g = gene(n);
if (g.x.is_node) {
nodes.push(g.x.value.node);
}
if (g.y.is_node) {
nodes.push(g.y.value.node);
}
}
}
static int calc_op(Gen::Type const & op, int const & x, int const & y){
switch (op) {
case Gen::kAdd: return x + y;
case Gen::kSub: return x - y;
case Gen::kMult: return x * y;
case Gen::kDiv: return divi(x, y);
case Gen::kPow: return powi(x, y);
default: return 0;
}
}
int Genome::evaluate_on(int x) const{
// NOTE: prohibits concurrency, but it's a speedup for now
static std::vector<int> results;
results.assign(output + 1, 0);
for(int i = 0; i <= output; ++i){
if(!is_active(i)) continue;
auto const & g = gene(i);
auto const xv = g.x.is_node ? (g.x.value.node == -1 ? x : results[g.x.value.node]) : g.x.value.constant;
auto const yv = g.y.is_node ? (g.y.value.node == -1 ? x : results[g.y.value.node]) : g.y.value.constant;
results[i] = calc_op(g.type, xv, yv);
}
return results[output];
}
// FIXME: encapsulate all this
using Generator = std::mt19937;
using Uniform = std::uniform_int_distribution<int>;
using Params = Uniform::param_type;
static Generator generator{0};
static Uniform distribution;
static auto random_type = std::bind(distribution, generator, Params(0, Gen::kTypes-1));
static auto random_const = std::bind(distribution, generator, Params(-20, 20));
static auto random_node = [](int n){ return distribution(generator, Params(0, n-1)); };
static Value random_value(Node n){
Value v;
switch(distribution(generator, Params(0, n ? 2 : 1))){
case 0: v.is_node = false;
v.value.constant = random_const();
break;
case 1: v.is_node = true;
v.value.node = -1;
break;
case 2: v.is_node = true;
v.value.node = random_node(n);
break;
}
return v;
}
static Gen random_gen(Node n){
Gen g;
g.type = Gen::Type(random_type());
g.x = random_value(n);
g.y = random_value(n);
return g;
}
Genome random_genome(size_t number_of_genes){
Genome g;
// FIXME: don't silence the warning, fix the node-numbering
const int nn = static_cast<int>(number_of_genes);
for(int i = 0; i < nn; ++i){
g.genes.emplace_back(random_gen(i));
}
g.output = random_node(nn);
g.calculate_active_genes();
return g;
}
bool Genome::mutate_random_bit(){
// FIXME: make this more robust
const auto fields_per_gen = 3;
const auto bit = distribution(generator, Params(0, fields_per_gen * genes.size())); // inclusive
if(bit == fields_per_gen * genes.size()){
output = random_node(genes.size());
calculate_active_genes();
assert(invariant());
return true;
} else {
const auto n = bit / fields_per_gen;
const auto field = bit % fields_per_gen;
auto & g = gene(n);
switch(field){
case 0:
g.type = Gen::Type(random_type());
break;
case 1:
g.x = random_value(n);
break;
case 2:
g.y = random_value(n);
break;
}
if(is_active(n)){
calculate_active_genes();
assert(invariant());
return true;
} else {
assert(invariant());
return false;
}
}
}
bool Genome::invariant() const{
for(int n = 0; n < genes.size(); ++n){
Gen const & g = gene(n);
if(g.x.is_node && g.x.value.node >= n) return false;
if(g.y.is_node && g.y.value.node >= n) return false;
}
return true;
}
std::string Genome::as_formula() const{
static std::vector<std::string> results;
results.assign(output+1, "err");
for(int i = 0; i <= output; ++i){
if(!is_active(i)) continue;
auto const & g = gene(i);
std::string x;
std::string y;
if(g.x.is_node){
if(g.x.value.node == -1) x = "x";
else x = "(" + results[g.x.value.node] + ")";
} else {
x = std::to_string(g.x.value.constant);
}
if(g.y.is_node){
if(g.y.value.node == -1) y = "x";
else y = "(" + results[g.y.value.node] + ")";
} else {
y = std::to_string(g.y.value.constant);
}
switch (g.type) {
case Gen::kAdd: results[i] = x + "+" + y; break;
case Gen::kSub: results[i] = x + "-" + y; break;
case Gen::kMult: results[i] = x + "*" + y; break;
case Gen::kDiv: results[i] = x + "/" + y; break;
case Gen::kPow: results[i] = x + "^" + y; break;
default: results[i] = "err";
}
}
return results[output];
}

32
lib/utilities.cpp

@ -0,0 +1,32 @@
#include "utilities.hpp"
#include <limits>
using namespace std;
int divi(int x, int y){
// clearly we don't want a division by zero
if(!y) {
if(x < 0) return numeric_limits<int>::min();
if(x == 0) return 1;
return numeric_limits<int>::max();
}
// in this situation there is a overflow which causes a SIGFPE
if(x == numeric_limits<int>::min() && y == -1) {
return numeric_limits<int>::max();
}
return x / y;
}
// exponentiation by squaring
int powi(int base, int exp){
if(exp < 0) return 0;
int res = 1;
while (exp) {
if (exp & 1)
res *= base;
exp >>= 1;
base *= base;
}
return res;
}

8
src/CMakeLists.txt

@ -0,0 +1,8 @@
file(GLOB sources "*.cpp")
foreach(source ${sources})
get_filename_component(exec ${source} NAME_WE)
add_executable(${exec} ${source})
target_link_libraries(${exec} common ${libs})
endforeach()

41
src/div_test.cpp

@ -0,0 +1,41 @@
#include <iostream>
#include <vector>
#include <limits>
// I ran into some trouble with division, so I decided to test the boundary
// cases. Turns out that INT_MIN / -1 overflows and signals a SIGFPE :(.
using namespace std;
using T = long int;
static T myDiv(T x, T y){
if(!y) {
std::cout << "SIGFPE";
if(x < 0) return numeric_limits<T>::min();
if(x == 0) return 1;
return numeric_limits<T>::max();
}
// somehow this is wrong for shorts and chars, but ok for all bigger integral types
if(numeric_limits<T>::is_signed && x == numeric_limits<T>::min() && y == -1) {
std::cout << "SIGFPE";
return numeric_limits<T>::max();
}
return x / y;
}
int main(){
// The bad guys
vector<T> v = {numeric_limits<T>::min(), -1, 0, 1, numeric_limits<T>::max()};
for(auto&&x : v){
std::cout << x << " ";
}
std::cout << std::endl;
for(auto&& x : v){
for(auto&& y : v){
std::cout << x << " / " << y << " = " << std::flush;
std::cout << myDiv(x, y) << std::endl;
}
}
}

57
src/main.cpp

@ -0,0 +1,57 @@
#include "genome.hpp"
#include "evolve.hpp"
#include <boost/program_options.hpp>
#include <iostream>
int main(int argc, char** argv){
namespace po = boost::program_options;
po::options_description opts;
opts.add_options()
("input", po::value<std::vector<int>>()->multitoken(), "sequence to generate a formula for (e.g. 1 2 4 8)")
("help,h", po::bool_switch(), "show this help");
po::positional_options_description file_opts;
file_opts.add("input", -1);
po::variables_map vm;
po::store(po::command_line_parser(argc, argv).options(opts).positional(file_opts).run(), vm);
po::notify(vm);
if(vm["help"].as<bool>()){
std::cout << "Genetic Sequence Formulizer, version " << __DATE__ << "\n";
std::cout << "seq2form [-h] <sequence of non-negative integers>\n";
std::cout << opts << std::endl;
return 0;
}
// TODO: make these program options
const auto genome_size = 16;
const auto population = 10;
const auto generations = 1000001 / population;
auto evolver = create_evolver(population, genome_size);
evolver.goal = vm["input"].as<std::vector<int>>();
std::cout << "your input:\t";
for(auto&& x : evolver.goal) std::cout << x << ", ";
std::cout << std::endl;
// GO! (until we find a perfect solution, or we hit the generations-bound)
for(int i = 0; i < generations; ++i){
evolver.next_generation();
if(evolver.current_generation[0].second <= 0) {
break;
}
}
// SHOW!
auto & best_genome = evolver.current_generation[0].first;
std::cout << "formula:\t" << best_genome.as_formula() << std::endl;
std::cout << "continuation:\t";
for(int i = 0; i < 50; ++i){
std::cout << best_genome.evaluate_on(i) << ", ";
}
std::cout << std::endl;
}