commit fcf5765cb5042543447349b75d2b3aa8fb6b45e9
Author: Joshua Moerman <lakseru@gmail.com>
Date:   Mon Mar 17 20:24:33 2014 +0100

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..781b756
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.user
+build-*
+.DS_Store
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..d208e3a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,14 @@
+project(Genetics)
+cmake_minimum_required(VERSION 2.8)
+
+include_directories("${PROJECT_SOURCE_DIR}/include/")
+add_definitions(-std=c++1y)
+
+find_package(Boost REQUIRED COMPONENTS filesystem program_options serialization system)
+include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+set(libs ${libs} ${Boost_LIBRARIES})
+
+set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+add_subdirectory("include")
+add_subdirectory("lib")
+add_subdirectory("src")
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..024d01b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+# Genetic Sequence Formulizer
+This program will take a sequence entered by the user and tries to find a formula producing this sequence. I made this to play around with genetic programming. Inspired by the following two talks:
+
+* [Reducing Wasted Evaluations in Cartesian Genetic Programming, by Brian Goldman](https://www.youtube.com/watch?v=3HsgVHv1ho8)
+* [Automatic Algorithm Invention with a GPU, by Wes Faler](https://www.youtube.com/watch?v=xQDazGrKsuM)
+
+I used a technique similar to the one described in the first presentation (the one called *single*), as it seemed to be the easiest one. The nice thing is that there is always just one parent, but there are inactive genes which are allowed to mutate freely.
+
+## Examples
+Some actual output of the program (truncated):
+
+	$ ./main 0 0 0 0 0 0 0
+	your input:	0, 0, 0, 0, 0, 0, 0, 
+	formula:	(x-x)*x
+	continuation:	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
+	
+	$ ./main 1 2 3
+	your input:	1, 2, 3, 
+	formula:	((x-18)/(x-18))+x
+	continuation:	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...
+	
+	$ ./main 1 2 4 8
+	your input:	1, 2, 4, 8, 
+	formula:	2^x
+	continuation:	1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, ...
+	
+	$ ./main 37
+	your input:	37, 
+	formula:	(10+12)+15
+	continuation:	37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 
+
+## Notes
+
+* The algorithm does not yet simplify the formula (I should take this into account when scoring the genes).
+* The sequences are 0-based.
+* The constants used in the genes are bound to 20 (so in the last example, it needed multiple constants added).
+* It treats ```1/0``` as ```INT_MAX``` (we need some safety as we evaluate randomly generated expressions).
+* I may be using some c++14 features. Install a new clang or gcc to get this awesomeness.
\ No newline at end of file
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 0000000..987fe23
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,4 @@
+
+# Hack to show headers in Qt Creator
+file(GLOB headers "*.hpp")
+add_library(dummy ${headers} dummy.cpp)
diff --git a/include/dummy.cpp b/include/dummy.cpp
new file mode 100644
index 0000000..7dcf12c
--- /dev/null
+++ b/include/dummy.cpp
@@ -0,0 +1,2 @@
+// Needed for the hack to show headers in Qt Creator
+static int x = x;
diff --git a/include/evolve.hpp b/include/evolve.hpp
new file mode 100644
index 0000000..e2b3aed
--- /dev/null
+++ b/include/evolve.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "genome.hpp"
+#include <vector>
+
+using Score = double;
+
+struct Evolver{
+	// stores the genome with its score
+	// first element is the best one (after calling next_generation()).
+	std::vector<std::pair<Genome, Score>> current_generation;
+
+	// goal which we are trying to achieve
+	std::vector<int> goal;
+
+	// evaluates current generation, picks the best one, and generate new generation
+	void next_generation();
+};
+
+// reate a random generation to start with
+Evolver create_evolver(size_t population_size, size_t genome_size);
diff --git a/include/genome.hpp b/include/genome.hpp
new file mode 100644
index 0000000..254d33e
--- /dev/null
+++ b/include/genome.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <vector>
+#include <iosfwd>
+
+// TODO: make this a decent type, and also make the variable "x" a decent type
+using Node = int;
+
+struct Value {
+	bool is_node;
+	union {
+		int constant;
+		Node node; // -1 reserved for "x"
+	} value;
+};
+
+struct Gen {
+	enum Type {
+		kAdd,
+		kSub,
+		kMult,
+		kDiv,
+		kPow,
+		kTypes
+	} type;
+	Value x, y;
+};
+
+struct Genome {
+	std::vector<Gen> genes;
+	std::vector<bool> active_genes;
+	Node output;
+
+	// calculates which genes are active (used internally)
+	void calculate_active_genes();
+
+	// evaluates expression with input x
+	int evaluate_on(int x) const;
+
+	// mutate a single bit of some gene, returns true when a active gene was altered
+	bool mutate_random_bit();
+
+	// checks whether the invariant still holds (for debugging)
+	bool invariant() const;
+
+	// returns a string in a (more or less) readable format
+	std::string as_formula() const;
+
+	// some trivial getters (to silence warnings)
+	Gen & gene(int n) { return genes[static_cast<size_t>(n)]; }
+	Gen const & gene(int n) const { return genes[static_cast<size_t>(n)]; }
+	bool is_active(int n) const { return active_genes[static_cast<size_t>(n)]; }
+};
+
+// the obvious I/O routines
+std::ostream & operator<<(std::ostream& out, Value const & g);
+std::ostream & operator<<(std::ostream& out, Gen const & g);
+std::ostream & operator<<(std::ostream& out, Genome const & g);
+
+// generates a random genome with the specified number of genes/nodes
+Genome random_genome(size_t number_of_genes);
diff --git a/include/utilities.hpp b/include/utilities.hpp
new file mode 100644
index 0000000..eedfc0e
--- /dev/null
+++ b/include/utilities.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+// I needed a safe division in order to avoid SIGFPE
+int divi(int x, int y);
+
+// Integer power function
+int powi(int base, int exp);
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
new file mode 100644
index 0000000..6196f11
--- /dev/null
+++ b/lib/CMakeLists.txt
@@ -0,0 +1,4 @@
+
+file(GLOB sources "*.cpp")
+add_library(common STATIC ${sources})
+target_link_libraries(common ${libs})
diff --git a/lib/evolve.cpp b/lib/evolve.cpp
new file mode 100644
index 0000000..abbdd79
--- /dev/null
+++ b/lib/evolve.cpp
@@ -0,0 +1,81 @@
+#include "evolve.hpp"
+#include "utilities.hpp"
+
+#include <map>
+#include <iterator>
+#include <cmath>
+#include <iostream>
+
+static Score score(const std::vector<int>& goal, Genome const & genome){
+	Score ss = 0;
+	for(int i = 0; i < goal.size(); ++i) {
+		Score error = goal[i] - genome.evaluate_on(i);
+		ss += error * error;
+	}
+	return ss;
+}
+
+template <typename It, typename Dist>
+It advance2(It it, Dist d){
+	std::advance(it, d);
+	return it;
+}
+
+template <typename C>
+struct subrange_t {
+	C & c;
+	int s;
+
+	subrange_t(C & c_, int s_)
+	: c(c_), s(s_)
+	{}
+
+	auto begin(){ return advance2(c.begin(), s); }
+	auto end(){ return c.end(); }
+
+	auto begin() const { return advance2(c.begin(), s); }
+	auto end() const { return c.end(); }
+};
+
+template <typename C>
+auto subrange(C & c, int start){
+	return subrange_t<C>(c, start);
+}
+
+void Evolver::next_generation(){
+	// evaluate (if needed)
+	for(auto&& g : current_generation){
+		if(g.second < 0) {
+			g.second = score(goal, g.first);
+		}
+	}
+
+	// pick best no worse than parent
+	auto best = current_generation[0];
+	for(auto&& g : current_generation){
+		if(g.second <= best.second){
+			best = g;
+		}
+	}
+
+	// continue with the best as parent
+	current_generation[0] = best;
+	int count = 0;
+	for(auto& g : subrange(current_generation, 1)){
+		count++;
+		g = best;
+		for(int j = 0; j < count; ++j){
+			while(!g.first.mutate_random_bit()){}
+		}
+		g.second = -1;
+	}
+}
+
+Evolver create_evolver(size_t population_size, size_t genome_size){
+	Evolver e;
+	e.current_generation.reserve(population_size);
+	for(size_t i = 0; i < population_size; ++i){
+		e.current_generation.emplace_back(random_genome(genome_size), -1);
+	}
+	return e;
+}
diff --git a/lib/genome.cpp b/lib/genome.cpp
new file mode 100644
index 0000000..65daf16
--- /dev/null
+++ b/lib/genome.cpp
@@ -0,0 +1,227 @@
+#include "genome.hpp"
+#include "utilities.hpp"
+
+#include <random>
+#include <functional>
+#include <iostream>
+#include <stack>
+#include <cassert>
+
+// Pretty printing nowadays obviously needs colors
+// blue == constant. red == node or variable
+// gray background == inactive gene
+static std::string color(int n, bool node){
+	if(node && n == -1) return "\x1B[31mx\x1b[39m";
+	if(node) return "\x1B[31m" + std::to_string(n) + "\x1b[39m";
+	return "\x1B[36m" + std::to_string(n) + "\x1b[39m";
+}
+
+std::ostream & operator<<(std::ostream& out, const Value& v){
+	if (v.is_node) {
+		return out << color(v.value.node, true);
+	} else {
+		return out << color(v.value.constant, false);
+	}
+}
+
+std::ostream & operator<<(std::ostream& out, Gen const & g){
+	switch (g.type) {
+		case Gen::kAdd: return out << "[+ " << g.x << ' ' << g.y << ']';
+		case Gen::kSub: return out << "[- " << g.x << ' ' << g.y << ']';
+		case Gen::kMult: return out << "[* " << g.x << ' ' << g.y << ']';
+		case Gen::kDiv: return out << "[/ " << g.x << ' ' << g.y << ']';
+		case Gen::kPow: return out << "[^ " << g.x << ' ' << g.y << ']';
+		default: return out << "[error]";
+	}
+}
+
+std::ostream & operator<<(std::ostream& out, Genome const & g){
+	for(unsigned int i = 0; i < g.genes.size(); ++i){
+		if(g.active_genes[i]){
+			out << g.genes[i];
+		} else {
+			out << "\x1B[47m" << g.genes[i] << "\x1B[49m";
+		}
+	}
+	return out << g.output;
+}
+
+void Genome::calculate_active_genes(){
+	active_genes.assign(genes.size(), false);
+
+	// NOTE: prohibits concurrency, but it's a speedup for now
+	static std::stack<Node> nodes;
+	nodes.push(output);
+	while(!nodes.empty()){
+		const auto n = nodes.top();
+		nodes.pop();
+		if(n == -1 || is_active(n)) continue;
+		active_genes[static_cast<size_t>(n)] = true;
+
+		auto const & g = gene(n);
+		if (g.x.is_node) {
+			nodes.push(g.x.value.node);
+		}
+		if (g.y.is_node) {
+			nodes.push(g.y.value.node);
+		}
+	}
+}
+
+static int calc_op(Gen::Type const & op, int const & x, int const & y){
+	switch (op) {
+		case Gen::kAdd: return x + y;
+		case Gen::kSub: return x - y;
+		case Gen::kMult: return x * y;
+		case Gen::kDiv: return divi(x, y);
+		case Gen::kPow: return powi(x, y);
+		default: return 0;
+	}
+}
+
+int Genome::evaluate_on(int x) const{
+	// NOTE: prohibits concurrency, but it's a speedup for now
+	static std::vector<int> results;
+	results.assign(output + 1, 0);
+
+	for(int i = 0; i <= output; ++i){
+		if(!is_active(i)) continue;
+
+		auto const & g = gene(i);
+		auto const xv = g.x.is_node ? (g.x.value.node == -1 ? x : results[g.x.value.node]) : g.x.value.constant;
+		auto const yv = g.y.is_node ? (g.y.value.node == -1 ? x : results[g.y.value.node]) : g.y.value.constant;
+
+		results[i] = calc_op(g.type, xv, yv);
+	}
+	return results[output];
+}
+
+// FIXME: encapsulate all this
+using Generator = std::mt19937;
+using Uniform = std::uniform_int_distribution<int>;
+using Params = Uniform::param_type;
+
+static Generator generator{0};
+static Uniform distribution;
+static auto random_type = std::bind(distribution, generator, Params(0, Gen::kTypes-1));
+static auto random_const = std::bind(distribution, generator, Params(-20, 20));
+static auto random_node = [](int n){ return distribution(generator, Params(0, n-1)); };
+
+static Value random_value(Node n){
+	Value v;
+	switch(distribution(generator, Params(0, n ? 2 : 1))){
+		case 0:	v.is_node = false;
+			v.value.constant = random_const();
+			break;
+		case 1:	v.is_node = true;
+			v.value.node = -1;
+			break;
+		case 2:	v.is_node = true;
+			v.value.node = random_node(n);
+			break;
+	}
+
+	return v;
+}
+
+static Gen random_gen(Node n){
+	Gen g;
+	g.type = Gen::Type(random_type());
+	g.x = random_value(n);
+	g.y = random_value(n);
+	return g;
+}
+
+Genome random_genome(size_t number_of_genes){
+	Genome g;
+	// FIXME: don't silence the warning, fix the node-numbering
+	const int nn = static_cast<int>(number_of_genes);
+	for(int i = 0; i < nn; ++i){
+		g.genes.emplace_back(random_gen(i));
+	}
+	g.output = random_node(nn);
+	g.calculate_active_genes();
+	return g;
+}
+
+bool Genome::mutate_random_bit(){
+	// FIXME: make this more robust
+	const auto fields_per_gen = 3;
+	const auto bit = distribution(generator, Params(0, fields_per_gen * genes.size())); // inclusive
+	if(bit == fields_per_gen * genes.size()){
+		output = random_node(genes.size());
+		calculate_active_genes();
+		assert(invariant());
+		return true;
+	} else {
+		const auto n = bit / fields_per_gen;
+		const auto field = bit % fields_per_gen;
+		auto & g = gene(n);
+		switch(field){
+			case 0:
+				g.type = Gen::Type(random_type());
+				break;
+			case 1:
+				g.x = random_value(n);
+				break;
+			case 2:
+				g.y = random_value(n);
+				break;
+		}
+		if(is_active(n)){
+			calculate_active_genes();
+			assert(invariant());
+			return true;
+		} else {
+			assert(invariant());
+			return false;
+		}
+	}
+}
+
+bool Genome::invariant() const{
+	for(int n = 0; n < genes.size(); ++n){
+		Gen const & g = gene(n);
+		if(g.x.is_node && g.x.value.node >= n) return false;
+		if(g.y.is_node && g.y.value.node >= n) return false;
+	}
+	return true;
+}
+
+std::string Genome::as_formula() const{
+	static std::vector<std::string> results;
+	results.assign(output+1, "err");
+
+	for(int i = 0; i <= output; ++i){
+		if(!is_active(i)) continue;
+
+		auto const & g = gene(i);
+		std::string x;
+		std::string y;
+
+		if(g.x.is_node){
+			if(g.x.value.node == -1) x = "x";
+			else x = "(" + results[g.x.value.node] + ")";
+		} else {
+			x = std::to_string(g.x.value.constant);
+		}
+
+		if(g.y.is_node){
+			if(g.y.value.node == -1) y = "x";
+			else y = "(" + results[g.y.value.node] + ")";
+		} else {
+			y = std::to_string(g.y.value.constant);
+		}
+
+		switch (g.type) {
+			case Gen::kAdd: results[i] = x + "+" + y; break;
+			case Gen::kSub: results[i] = x + "-" + y; break;
+			case Gen::kMult: results[i] = x + "*" + y; break;
+			case Gen::kDiv: results[i] = x + "/" + y; break;
+			case Gen::kPow: results[i] = x + "^" + y; break;
+			default: results[i] = "err";
+		}
+	}
+	return results[output];
+}
+
diff --git a/lib/utilities.cpp b/lib/utilities.cpp
new file mode 100644
index 0000000..c0c47d5
--- /dev/null
+++ b/lib/utilities.cpp
@@ -0,0 +1,32 @@
+#include "utilities.hpp"
+
+#include <limits>
+
+using namespace std;
+
+int divi(int x, int y){
+	// clearly we don't want a division by zero
+	if(!y) {
+		if(x < 0) return numeric_limits<int>::min();
+		if(x == 0) return 1;
+		return numeric_limits<int>::max();
+	}
+	// in this situation there is a overflow which causes a SIGFPE
+	if(x == numeric_limits<int>::min() && y == -1) {
+		return numeric_limits<int>::max();
+	}
+	return x / y;
+}
+
+// exponentiation by squaring
+int powi(int base, int exp){
+	if(exp < 0) return 0;
+	int res = 1;
+	while (exp) {
+		if (exp & 1)
+			res *= base;
+		exp >>= 1;
+		base *= base;
+	}
+	return res;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..56b78f0
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,8 @@
+
+file(GLOB sources "*.cpp")
+
+foreach(source ${sources})
+	get_filename_component(exec ${source} NAME_WE)
+	add_executable(${exec} ${source})
+	target_link_libraries(${exec} common ${libs})
+endforeach()
diff --git a/src/div_test.cpp b/src/div_test.cpp
new file mode 100644
index 0000000..bfee8d8
--- /dev/null
+++ b/src/div_test.cpp
@@ -0,0 +1,41 @@
+#include <iostream>
+#include <vector>
+#include <limits>
+
+// I ran into some trouble with division, so I decided to test the boundary
+// cases. Turns out that INT_MIN / -1 overflows and signals a SIGFPE :(.
+
+using namespace std;
+using T = long int;
+
+static T myDiv(T x, T y){
+	if(!y) {
+		std::cout << "SIGFPE";
+		if(x < 0) return numeric_limits<T>::min();
+		if(x == 0) return 1;
+		return numeric_limits<T>::max();
+	}
+	// somehow this is wrong for shorts and chars, but ok for all bigger integral types
+	if(numeric_limits<T>::is_signed && x == numeric_limits<T>::min() && y == -1) {
+		std::cout << "SIGFPE";
+		return numeric_limits<T>::max();
+	}
+	return x / y;
+}
+
+int main(){
+	// The bad guys
+	vector<T> v = {numeric_limits<T>::min(), -1, 0, 1, numeric_limits<T>::max()};
+
+	for(auto&&x : v){
+		std::cout << x << " ";
+	}
+	std::cout << std::endl;
+
+	for(auto&& x : v){
+		for(auto&& y : v){
+			std::cout << x << " / " << y << " = " << std::flush;
+			std::cout << myDiv(x, y) << std::endl;
+		}
+	}
+}
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..66a4bb4
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,57 @@
+#include "genome.hpp"
+#include "evolve.hpp"
+
+#include <boost/program_options.hpp>
+#include <iostream>
+
+int main(int argc, char** argv){
+	namespace po = boost::program_options;
+
+	po::options_description opts;
+	opts.add_options()
+		("input", po::value<std::vector<int>>()->multitoken(), "sequence to generate a formula for (e.g. 1 2 4 8)")
+		("help,h", po::bool_switch(), "show this help");
+
+	po::positional_options_description file_opts;
+	file_opts.add("input", -1);
+
+	po::variables_map vm;
+	po::store(po::command_line_parser(argc, argv).options(opts).positional(file_opts).run(), vm);
+	po::notify(vm);
+
+	if(vm["help"].as<bool>()){
+		std::cout << "Genetic Sequence Formulizer, version " << __DATE__ << "\n";
+		std::cout << "seq2form [-h] <sequence of non-negative integers>\n";
+		std::cout << opts << std::endl;
+		return 0;
+	}
+
+	// TODO: make these program options
+	const auto genome_size = 16;
+	const auto population = 10;
+	const auto generations = 1000001 / population;
+
+	auto evolver = create_evolver(population, genome_size);
+	evolver.goal = vm["input"].as<std::vector<int>>();
+
+	std::cout << "your input:\t";
+	for(auto&& x : evolver.goal) std::cout << x << ", ";
+	std::cout << std::endl;
+
+	// GO! (until we find a perfect solution, or we hit the generations-bound)
+	for(int i = 0; i < generations; ++i){
+		evolver.next_generation();
+		if(evolver.current_generation[0].second <= 0) {
+			break;
+		}
+	}
+
+	// SHOW!
+	auto & best_genome = evolver.current_generation[0].first;
+	std::cout << "formula:\t" << best_genome.as_formula() << std::endl;
+	std::cout << "continuation:\t";
+	for(int i = 0; i < 50; ++i){
+		std::cout << best_genome.evaluate_on(i) << ", ";
+	}
+	std::cout << std::endl;
+}