diff --git a/wavelet/wavelet_parallel_2D_mockup.cpp b/wavelet/wavelet_parallel_2D_mockup.cpp
index 153f0f8..c95ef00 100644
--- a/wavelet/wavelet_parallel_2D_mockup.cpp
+++ b/wavelet/wavelet_parallel_2D_mockup.cpp
@@ -7,11 +7,14 @@
 #include "wavelet.hpp"
 #include "wavelet_parallel.hpp"
 
-static unsigned int P;
-static unsigned int W;
-static unsigned int H;
+static struct {
+	unsigned int P;
+	unsigned int W;
+	unsigned int H;
+	unsigned int M;
+	bool check_results;
+} globals;
 
-static std::vector<double> data;
 static std::vector<double> seqr;
 static std::vector<double> parr;
 
@@ -38,6 +41,12 @@ struct block {
 		bsp::push_reg(hcomm.data(), hcomm.size());
 		bsp::push_reg(vcomm.data(), vcomm.size());
 	}
+
+	void pop() {
+		bsp::pop_reg(data.data());
+		bsp::pop_reg(hcomm.data());
+		bsp::pop_reg(vcomm.data());
+	}
 };
 
 // Communicate vertical data from b (strided) to b2 (not strided)
@@ -110,26 +119,43 @@ static void hstep(wvlt::par::proc_info const & pi, plan_2D const & plan, std::ve
 	}
 }
 
+// gets globals from processor 0
+static void get_globals(){
+	bsp::push_reg(&globals);
+	bsp::sync();
+	bsp::get(0, &globals, 0, &globals);
+	bsp::sync();
+	bsp::pop_reg(&globals);
+}
+
+// fake data
+double data(unsigned int x, unsigned int y){
+	return x*y;
+}
+
 static void par_wavelet_2D(){
-	bsp::begin(P);
+	bsp::begin(globals.P);
+	get_globals();
 
 	const wvlt::par::proc_info d(bsp::nprocs(), bsp::pid());
-	const wvlt::par::plan_1D horizontal(W, W/d.p, 1);
-	const wvlt::par::plan_1D vertical(H, H/d.p, 1);
+	const wvlt::par::plan_1D horizontal(globals.W, globals.W/d.p, globals.M);
+	const wvlt::par::plan_1D vertical(globals.H, globals.H/d.p, globals.M);
 	const plan_2D plan{horizontal, vertical};
 
 	auto bbb = block(plan);
+	// We allocated everything up front, we don't actually need to do this
+	// but it's easy, as we don't have to think of this later.
 	std::vector<block> blocks(d.p, bbb);
 	std::vector<double> hfinish(2 * d.p * vertical.b, 0.0);
 	std::vector<double> vfinish(horizontal.b * 2 * d.p, 0.0);
 
-	// Direct read because MCBSP can do this ;D
+	// As we will be generating the data, no need to sync
 	for(unsigned int b = 0; b < blocks.size(); ++b){
 		unsigned int x_start = b * horizontal.b;
 		unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b;
 		for(unsigned int y = 0; y < vertical.b; ++y){
 			for(unsigned int x = 0; x < horizontal.b; ++x){
-				auto v = data[x_start + x + horizontal.n*(y_start+y)];
+				auto v = data(x_start + x, y_start + y);
 				blocks[b].data[x + horizontal.b*y] = v;
 			}
 		}
@@ -150,29 +176,32 @@ static void par_wavelet_2D(){
 			stride <<= plan.horizontal.m;
 		}
 
-		// finish parallely
+		// fan in to the right processor
 		unsigned int hh = horizontal.b/2;
 		for(unsigned int b = 0; b < blocks.size(); ++b){
 			unsigned int t = (d.s - b + d.p)%d.p;
 			unsigned int x_start = b * 2;
 			auto ptr = blocks[b].data.data();
 			for(unsigned int y = 0; y < vertical.b; ++y){
-				bsp::put(t, &ptr[0  + horizontal.b*y], hfinish.data(), 0 + x_start + 2*d.p*y, 1);
-				bsp::put(t, &ptr[hh + horizontal.b*y], hfinish.data(), 1 + x_start + 2*d.p*y, 1);
+				// processor, source, dest, offset
+				bsp::put(t, &ptr[0  + horizontal.b*y], hfinish.data(), 0 + x_start + 2*d.p*y);
+				bsp::put(t, &ptr[hh + horizontal.b*y], hfinish.data(), 1 + x_start + 2*d.p*y);
 			}
 		}
 		bsp::sync();
 
+		// last step of the algorithm
 		for(unsigned int y = 0; y < vertical.b; ++y){
 			wvlt::wavelet(hfinish.data() + 2*d.p*y, 2*d.p, 1);
 		}
 
+		// fan out to the right processor
 		for(unsigned int y = 0; y < vertical.b; ++y){
 			for(unsigned int t = 0; t < d.p; ++t){
 				unsigned int b = (t - d.s + d.p)%d.p;
 				unsigned int x_start = b * 2;
-				bsp::put(t, &hfinish[0 + x_start + 2*d.p*y], blocks[b].data.data(), 0  + horizontal.b*y, 1);
-				bsp::put(t, &hfinish[1 + x_start + 2*d.p*y], blocks[b].data.data(), hh + horizontal.b*y, 1);
+				bsp::put(t, &hfinish[0 + x_start + 2*d.p*y], blocks[b].data.data(), 0  + horizontal.b*y);
+				bsp::put(t, &hfinish[1 + x_start + 2*d.p*y], blocks[b].data.data(), hh + horizontal.b*y);
 			}
 		}
 		bsp::sync();
@@ -186,29 +215,32 @@ static void par_wavelet_2D(){
 			stride <<= plan.vertical.m;
 		}
 
-		// finish parallely
+		// fan in to the right processor
 		unsigned int hh = vertical.b/2;
 		for(unsigned int b = 0; b < blocks.size(); ++b){
 			unsigned int t = b;
 			unsigned int y_start = (d.s - b + d.p)%d.p * 2;
 			auto ptr = blocks[b].data.data();
 			for(unsigned int x = 0; x < horizontal.b; ++x){
-				bsp::put(t, &ptr[x + 0 *horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 0), 1);
-				bsp::put(t, &ptr[x + hh*horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 1), 1);
+				// processor, source, dest, offset
+				bsp::put(t, &ptr[x + 0 *horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 0));
+				bsp::put(t, &ptr[x + hh*horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 1));
 			}
 		}
 		bsp::sync();
 
+		// last step of the algorithm
 		for(unsigned int x = 0; x < horizontal.b; ++x){
 			wvlt::wavelet(vfinish.data() + x, 2*d.p, horizontal.b);
 		}
 
+		// fan out to the right processor
 		for(unsigned int x = 0; x < horizontal.b; ++x){
 			for(unsigned int t = 0; t < d.p; ++t){
 				unsigned int b = d.s;
 				unsigned int y_start = (t - b + d.p)%d.p * 2;
-				bsp::put(t, &vfinish[x + horizontal.b*(y_start + 0)], blocks[b].data.data(), horizontal.b*0  + x, 1);
-				bsp::put(t, &vfinish[x + horizontal.b*(y_start + 1)], blocks[b].data.data(), horizontal.b*hh + x, 1);
+				bsp::put(t, &vfinish[x + horizontal.b*(y_start + 0)], blocks[b].data.data(), horizontal.b*0  + x);
+				bsp::put(t, &vfinish[x + horizontal.b*(y_start + 1)], blocks[b].data.data(), horizontal.b*hh + x);
 			}
 		}
 		bsp::sync();
@@ -217,25 +249,33 @@ static void par_wavelet_2D(){
 	double time2 = bsp::time();
 	if(d.s==0) printf("parallel version\t%f\n", time2 - time1);
 
-	// Direct write because MCBSP can do this ;D
-	for(unsigned int b = 0; b < blocks.size(); ++b){
-		unsigned int x_start = b * horizontal.b;
-		unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b;
-		for(unsigned int y = 0; y < vertical.b; ++y){
-			for(unsigned int x = 0; x < horizontal.b; ++x){
-				auto v = blocks[b].data[x + horizontal.b*y];
-				parr[x_start + x + horizontal.n*(y_start+y)] = v;
+	if(globals.check_results){
+		bsp::push_reg(parr.data(), parr.size());
+		bsp::sync();
+
+		for(unsigned int b = 0; b < blocks.size(); ++b){
+			unsigned int x_start = b * horizontal.b;
+			unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b;
+			for(unsigned int y = 0; y < vertical.b; ++y){
+				for(unsigned int x = 0; x < horizontal.b; ++x){
+					auto v = blocks[b].data[x + horizontal.b*y];
+					bsp::put(0, &v, parr.data(), x_start + x + horizontal.n*(y_start+y));
+				}
 			}
 		}
+		bsp::sync();
 	}
 
 	bsp::end();
 }
 
 static void seq_wavelet_2D(){
-	seqr = data;
+	for(unsigned int y = 0; y < globals.H; ++y)
+		for(unsigned int x = 0; x < globals.W; ++x)
+			seqr[x + globals.W*y] = data(x, y);
+
 	auto time1 = timer::clock::now();
-	wvlt::wavelet_2D(seqr.data(), W, H);
+	wvlt::wavelet_2D(seqr.data(), globals.W, globals.H);
 	auto time2 = timer::clock::now();
 	printf("sequential version\t%f\n", timer::from_dur(time2 - time1));
 }
@@ -258,22 +298,61 @@ static int compare_results(std::vector<double> const & lh, std::vector<double> c
 }
 
 int main(int argc, char** argv){
-	P = 2;
-	H = 1024;
-	W = 1024;
+	bsp::init(par_wavelet_2D, argc, argv);
+	namespace po = boost::program_options;
+
+	// Describe program options
+	po::options_description opts;
+	opts.add_options()
+		("p", po::value<unsigned int>(), "number of processors")
+		("w", po::value<unsigned int>(), "width of image")
+		("h", po::value<unsigned int>(), "height of image")
+		("m", po::value<unsigned int>()->default_value(1), "the variable m")
+		("help", po::bool_switch(), "show this help")
+		("show-input", po::bool_switch(), "shows the given input")
+		("seq", po::bool_switch(), "also runs the sequential algorithm")
+		("check", po::bool_switch(), "enables correctness checks");
+	po::variables_map vm;
+
+	// Parse and set options
+	try {
+		po::store(po::parse_command_line(argc, argv, opts), vm);
+		po::notify(vm);
+
+		if(vm["help"].as<bool>()){
+			std::cout << "Parallel wavelet mockup" << std::endl;
+			std::cout << opts << std::endl;
+			return 0;
+		}
 
-	data.assign(W*H, 0.0);
-	for(unsigned int y = 0; y < H; ++y)
-		for(unsigned int x = 0; x < W; ++x)
-			data[x + W*y] = x*y;
-	seqr.assign(W*H, 0.0);
-	parr.assign(W*H, 0.0);
+		globals.P = vm["p"].as<unsigned int>();
+		globals.W = vm["w"].as<unsigned int>();
+		globals.H = vm["h"].as<unsigned int>();
+		globals.M = vm["m"].as<unsigned int>();
+		globals.check_results = vm["check"].as<bool>();
+
+		if(!is_pow_of_two(globals.P)) throw po::error("p is not a power of two");
+		if(!is_pow_of_two(globals.W)) throw po::error("w is not a power of two");
+		if(!is_pow_of_two(globals.H)) throw po::error("h is not a power of two");
+	} catch(std::exception& e){
+		std::cout << colors::red("ERROR: ") << e.what() << std::endl;
+		std::cout << opts << std::endl;
+		return 1;
+	}
+
+	if(vm["show-input"].as<bool>()){
+		std::cout << "w\t" << globals.W << "h\t" << globals.H << "\np\t" << globals.P << "\nm\t" << globals.M << std::endl;
+	}
+
+	seqr.assign(globals.W*globals.H, 0.0);
+	parr.assign(globals.W*globals.H, 0.0);
 
-	bsp::init(par_wavelet_2D, argc, argv);
 	par_wavelet_2D();
 	seq_wavelet_2D();
 
-	double threshold = 1.0e-8;
-	std::cout << "Checking results ";
-	compare_results(seqr, parr, threshold);
+	if(globals.check_results){
+		double threshold = 1.0e-8;
+		std::cout << "Checking results ";
+		compare_results(seqr, parr, threshold);
+	}
 }