diff --git a/include/basics.hpp b/include/basics.hpp
index dbca2ba..52faf7c 100644
--- a/include/basics.hpp
+++ b/include/basics.hpp
@@ -32,8 +32,9 @@
 #include <algorithm>
 
 namespace pixel_formats {
-	inline uint8_t clamp(int n){
-		return std::min(255, std::max(0, n));
+	template <typename T>
+	uint8_t clamp(T n){
+		return uint8_t(std::min<T>(255, std::max<T>(0, n)));
 	}
 	
 	struct gray {
@@ -53,27 +54,25 @@ namespace pixel_formats {
 		static const size_t bits_per_color = 8;
 		
 		rgb()
-		: red(0)
-		, green(0)
-		, blue(0)
+		: r(0)
+		, g(0)
+		, b(0)
 		{}
 		
 		rgb(double red, double green, double blue)
-		: red(clamp(255*red))
-		, green(clamp(255*green))
-		, blue(clamp(255*blue))
+		: r(clamp(255*red))
+		, g(clamp(255*green))
+		, b(clamp(255*blue))
 		{}
 		
 		rgb(int red, int green, int blue)
-		: red(clamp(red))
-		, green(clamp(green))
-		, blue(clamp(blue))
+		: r(clamp(red))
+		, g(clamp(green))
+		, b(clamp(blue))
 		{}
 		
 	private:
-		uint8_t red;
-		uint8_t green;
-		uint8_t blue;
+		uint8_t r, g, b;
 	};
 	
 	struct bgr{
@@ -81,27 +80,25 @@ namespace pixel_formats {
 		static const size_t bits_per_color = 8;
 		
 		bgr()
-		: blue(0)
-		, green(0)
-		, red(0)
+		: b(0)
+		, g(0)
+		, r(0)
 		{}
 		
 		bgr(double red, double green, double blue)
-		: blue(clamp(255*blue))
-		, green(clamp(255*green))
-		, red(clamp(255*red))
+		: b(clamp(255*blue))
+		, g(clamp(255*green))
+		, r(clamp(255*red))
 		{}
 		
 		bgr(int red, int green, int blue)
-		: blue(clamp(blue))
-		, green(clamp(green))
-		, red(clamp(red))
+		: b(clamp(blue))
+		, g(clamp(green))
+		, r(clamp(red))
 		{}
 		
 	private:
-		uint8_t blue;
-		uint8_t green;
-		uint8_t red;
+		uint8_t b, g, r;
 	};
 
 	template <typename PixelType>
diff --git a/include/png.hpp b/include/png.hpp
index d7e60c9..2afbe6a 100644
--- a/include/png.hpp
+++ b/include/png.hpp
@@ -140,7 +140,7 @@ namespace png{
 			if(bit_depth < 8) throw std::runtime_error("Bitdepths lower than 8 are not supported (yet)");
 			
 			row.resize(row_size);
-			png_read_row(png_ptr, (png_bytep)row.data(), 0);
+			png_read_row(png_ptr, png_bytep(row.data()), 0);
 		}
 		
 		~istream(){
@@ -161,7 +161,7 @@ namespace png{
 				x = 0;
 				++y;
 				if(y < height)
-					png_read_row(png_ptr, (png_bytep)row.data(), 0);
+					png_read_row(png_ptr, png_bytep(row.data()), 0);
 			}
 			
 			return *this;
@@ -171,7 +171,7 @@ namespace png{
 			return valid;
 		}
 		
-		uint32_t get_width() const { return  row.size(); }
+		uint32_t get_width() const { return row.size(); }
 		uint32_t get_height() const { return height; }
 		
 	private:
diff --git a/include/utilities.hpp b/include/utilities.hpp
index 029fb85..688a512 100644
--- a/include/utilities.hpp
+++ b/include/utilities.hpp
@@ -15,22 +15,25 @@ bool is_even(Int n){
 
 // calculates integer 2-log such that:
 // 2^(two_log(x)) >= x > 2^(two_log(x) - 1)
-unsigned int two_log(unsigned int x){
+inline unsigned int two_log(unsigned int x){
 	if(x <= 1) return 0;
-	return 8*sizeof(unsigned int) - __builtin_clz(x-1);
+	return 8*sizeof(unsigned int) - unsigned(__builtin_clz(x-1));
 }
 
-// Makes numbers human-readable
-// eg 2300000 becomes 2M
-inline std::string human_string(int n){
+// Makes numbers human-readable with one decimal
+// eg 2350000 becomes 2.3M
+template <typename Int>
+inline std::string human_string(Int n, std::string suffix = ""){
 	static const std::string names [] = {"", "K", "M", "G"};
-	int i = 0;
+	unsigned int i = 0;
+	Int m = 10*n;
 	while(n > 1000 && i < sizeof(names)){
 		n /= 1000;
+		m /= 1000;
 		++i;
 	}
 	// cast is to make the old gcc 4.4 happy (it doesn't have all overloads of to_string)
-	return std::to_string((long long)n) + names[i];
+	return std::to_string(n) + "." + std::to_string(m % 10) + names[i] + suffix;
 }
 
 inline std::string field(std::string const & str){
@@ -43,7 +46,7 @@ inline std::string field(std::string const & str){
 // Prints a vector with brackets and commas
 // Does not work recursively!
 template <typename T>
-void print_vec(std::vector<T> v){
+void print_vec(std::vector<T> const & v){
 	auto it = v.begin(), end = v.end();
 	std::cout << "{" << *it++;
 	while(it != end) std::cout << ", " << *it++;
@@ -59,8 +62,8 @@ struct timer{
 	std::string name;
 	time begin;
 
-	timer(std::string name)
-	: name(name)
+	timer(std::string name_)
+	: name(name_)
 	, begin(clock::now())
 	{}
 
@@ -75,10 +78,10 @@ struct timer{
 };
 
 namespace colors {
-	std::string red(std::string s){
+	inline std::string red(std::string s){
 		return "\x1b[31m" + s + "\x1b[39m";
 	}
-	std::string green(std::string s){
+	inline std::string green(std::string s){
 		return "\x1b[32m" + s + "\x1b[39m";
 	}
 }
diff --git a/wavelet/defines.hpp b/wavelet/defines.hpp
new file mode 100644
index 0000000..acf51fa
--- /dev/null
+++ b/wavelet/defines.hpp
@@ -0,0 +1,2 @@
+#define NEXP 20
+
diff --git a/wavelet/jcmp.cpp b/wavelet/jcmp.cpp
new file mode 100644
index 0000000..1fde5e7
--- /dev/null
+++ b/wavelet/jcmp.cpp
@@ -0,0 +1,134 @@
+#include <includes.hpp>
+#include <utilities.hpp>
+
+#include <boost/filesystem.hpp>
+#include <png.hpp>
+
+#include "jcmp.hpp"
+#include "wavelet.hpp"
+
+using namespace wvlt::V2;
+
+// Can be any functions from R+ -> R
+// as long as backward is its inverse
+static double forward(double x){
+	//return std::log(1 + std::log(1 + x));
+	return std::log(x);
+}
+
+static double backward(double x){
+	//return std::exp(std::exp(x) - 1) - 1;
+	return std::exp(x);
+}
+
+// note: we take a copy, because we will modify it in place
+static jcmp::image compress(std::vector<double> image, jcmp::uint width, double threshold, unsigned int& nzeros){
+	jcmp::uint height = image.size() / width;
+	assert(is_pow_of_two(width));
+	assert(is_pow_of_two(height));
+
+	// wavelet transform in x-direction
+	for(unsigned int i = 0; i < height; ++i){
+		wavelet(&image[i*width], width, 1);
+	}
+
+	// wavelet transform in y-direction
+	for(unsigned int i = 0; i < width; ++i){
+		wavelet(&image[i], height, width);
+	}
+
+	double min_abs = 10000.0;
+	double max_abs = 0.0;
+
+	for(auto& el : image){
+		auto absel = std::abs(el);
+		if(absel > threshold) {
+			min_abs = std::min(min_abs, absel);
+			max_abs = std::max(max_abs, absel);
+		} else {
+			el = 0;
+		}
+	}
+
+	jcmp::quantization q(&forward, &backward, max_abs, min_abs);
+
+	// save the principal coefficients
+	std::vector<jcmp::coefficient> v;
+	for(unsigned int y = 0; y < height; ++y){
+		for(unsigned int x = 0; x < width; ++x){
+			auto&& el = image[x + width*y];
+			if(el != 0) v.push_back({q.forwards(el), jcmp::uint(x), jcmp::uint(y)});
+		}
+	}
+
+	nzeros = v.size();
+	return jcmp::image(width, height, q.p, std::move(v));
+}
+
+static std::vector<double> decompress(jcmp::image in){
+	auto width = in.header.width;
+	auto height = in.header.height;
+	assert(is_pow_of_two(width));
+	assert(is_pow_of_two(height));
+
+	auto q = in.header.get_quantization(&forward, &backward);
+
+	std::vector<double> image(width * height, 0.0);
+
+	// read in coefficient on coordinates
+	for(auto it = in.data.begin(); it != in.data.end(); ++it){
+		auto&& x = *it;
+		image[x.x + width*x.y] = q.backwards(x.c);
+	}
+
+	in.clear();
+
+	// inverse wavelet transform in y-direction
+	for(unsigned int i = 0; i < width; ++i){
+		unwavelet(&image[i], height, width);
+	}
+
+	// inverse wavelet transform in x-direction
+	for(unsigned int i = 0; i < height; ++i){
+		unwavelet(&image[i*width], width, 1);
+	}
+
+	return image;
+}
+
+int main(){
+	namespace fs = boost::filesystem;
+
+	fs::path directory("images");
+	fs::directory_iterator eod;
+	for(fs::directory_iterator it(directory); it != eod; ++it){
+		auto && path = it->path();
+		if(path.extension() != ".png") continue;
+
+		// open file
+		std::string filename = path.string();
+		std::cout << field("file") << filename << std::endl;
+		png::istream image(filename);
+
+		auto width = image.get_width();
+		auto height = image.get_height();
+
+		// read into vector
+		std::vector<double> image_vec;
+		image_vec.reserve(width * height);
+		for(unsigned char c = 0; image >> c;) image_vec.push_back(c/255.0);
+
+		// compress and decompress to see how we lost information
+		unsigned int nzeros = 0;
+		auto compressed_vec = decompress(compress(image_vec, width, 0.1, nzeros));
+
+		// output some information
+		std::cout << field("raw") << human_string(sizeof(uint8_t) * image_vec.size(), "b") << std::endl;
+		std::cout << field("compressed") << human_string(sizeof(jcmp::coefficient) * nzeros, "b") << std::endl;
+
+		// save to output file
+		std::string cfilename = "compressed/" + path.filename().string();
+		png::gray_ostream compressed_image(width, height, cfilename);
+		for(unsigned int i = 0; i < compressed_vec.size(); ++i) compressed_image << compressed_vec[i];
+	}
+}
diff --git a/wavelet/jcmp.hpp b/wavelet/jcmp.hpp
new file mode 100644
index 0000000..9922791
--- /dev/null
+++ b/wavelet/jcmp.hpp
@@ -0,0 +1,3 @@
+#pragma once
+
+#include "jcmp_image.hpp"
diff --git a/wavelet/compressed_image.hpp b/wavelet/jcmp_image.hpp
similarity index 72%
rename from wavelet/compressed_image.hpp
rename to wavelet/jcmp_image.hpp
index 4cfa171..0094412 100644
--- a/wavelet/compressed_image.hpp
+++ b/wavelet/jcmp_image.hpp
@@ -2,26 +2,40 @@
 
 #include <includes.hpp>
 
+#include "jcmp_quantization.hpp"
+
 // joshua's compression :D
 namespace jcmp {
 	typedef uint32_t uint;
 
-	struct __attribute__ ((__packed__)) header {
-		const char signature[4];
+	struct header {
+		char signature[4];
 		uint width;
 		uint height;
 		uint length;
+		quantize_params qp;
 
-		header(uint width = 0, uint height = 0, uint length = 0)
+		header() = default;
+		header(uint width_, uint height_, uint length_, quantize_params const & p)
 		: signature{'J', 'C', 'M', 'P'}
-		, width(width)
-		, height(height)
-		, length(length)
+		, width(width_)
+		, height(height_)
+		, length(length_)
+		, qp(p)
 		{}
+
+		template <typename F>
+		quantization get_quantization(F const & f, F const & f_inv){
+			quantization q;
+			q.p = qp;
+			q.f = f;
+			q.f_inv = f_inv;
+			return q;
+		}
 	};
 
 	struct __attribute__ ((__packed__)) coefficient {
-		double c;
+		uint8_t c;
 		uint x;
 		uint y;
 	};
@@ -32,13 +46,14 @@ namespace jcmp {
 
 		image() = default;
 
-		image(uint width, uint height, std::vector<coefficient> const & data_in)
-		: header(width, height, data_in.size())
+		image(uint width, uint height, quantize_params const & p, std::vector<coefficient> const & data_in)
+		: header(width, height, data_in.size(), p)
 		, data(data_in)
 		{}
 
 		void clear(){
 			header.length = header.height = header.width = 0;
+			header.qp.f_max_abs = header.qp.f_min_abs = 0;
 			data.clear();
 		}
 	};
@@ -80,4 +95,7 @@ namespace jcmp {
 		file.sgetn(reinterpret_cast<char*>(image.data.data()), image.data.size() * sizeof(coefficient));
 		return image;
 	}
+
+	static_assert(sizeof(header) == 32,     "struct not propery packed");
+	static_assert(sizeof(coefficient) == 9, "struct not propery packed");
 }
diff --git a/wavelet/compressed_image_test.cpp b/wavelet/jcmp_image_test.cpp
similarity index 68%
rename from wavelet/compressed_image_test.cpp
rename to wavelet/jcmp_image_test.cpp
index 0b511d3..4e7f983 100644
--- a/wavelet/compressed_image_test.cpp
+++ b/wavelet/jcmp_image_test.cpp
@@ -1,9 +1,10 @@
 #include <includes.hpp>
 #include <utilities.hpp>
 
-#include "compressed_image.hpp"
+#include "jcmp_image.hpp"
+
 namespace jcmp{
-	bool operator==(coefficient const & lhs, coefficient const & rhs){
+	inline bool operator==(coefficient const & lhs, coefficient const & rhs){
 		return lhs.c == rhs.c && lhs.x == rhs.x && lhs.y == rhs.y;
 	}
 }
@@ -18,17 +19,17 @@ struct gen{
 			++y;
 		}
 		// only for testing purpose :D
-		return {rand() / double(RAND_MAX), x, y};
+		return {uint8_t(rand()), x, y};
 	}
 };
 
-void test_correctness(){
+static void test_correctness(){
 	const int w = 1024, h = 512;
 	std::vector<jcmp::coefficient> v(w*h / 100);
 	std::generate(v.begin(), v.end(), gen{w, 0, 0});
 
 	{	// iterator method
-		jcmp::write_to_file({w, h}, v.begin(), v.end(), "test.jcmp");
+		jcmp::write_to_file({w, h, 0, {0.0, 0.0}}, v.begin(), v.end(), "test.jcmp");
 		auto j = jcmp::read_from_file("test.jcmp");
 
 		assert(w == j.header.width);
@@ -37,7 +38,7 @@ void test_correctness(){
 	}
 
 	{	// copy method
-		jcmp::image i(w, h, std::move(v));
+		jcmp::image i(w, h, {0.0, 0.0}, std::move(v));
 
 		jcmp::write_to_file(i, "test.jcmp");
 		auto j = jcmp::read_from_file("test.jcmp");
@@ -48,27 +49,27 @@ void test_correctness(){
 	}
 }
 
-void test_speed(){
+static void test_speed(){
 	const int w = 4000, h = 8000;
 	std::vector<jcmp::coefficient> input(w*h / 20);
 	std::generate(input.begin(), input.end(), gen{w, 0, 0});
 
 	for(int i = 0; i < 10; ++i){
 		{	timer t("stream");
-			jcmp::write_to_file({w, h}, input.begin(), input.end(), "stream.jcmp");
+			jcmp::write_to_file({w, h, 0, {0.0, 0.0}}, input.begin(), input.end(), "stream.jcmp");
 		}
 
 		std::vector<jcmp::coefficient> copy(w*h / 20);
 		std::copy(input.begin(), input.end(), copy.begin());
 
 		{	timer t("move");
-			jcmp::image i(w, h, std::move(copy));
-			jcmp::write_to_file(i, "move.jcmp");
+			jcmp::image j(w, h, {0.0, 0.0}, std::move(copy));
+			jcmp::write_to_file(j, "move.jcmp");
 		}
 
 		{	timer t("copy");
-			jcmp::image i(w, h, input);
-			jcmp::write_to_file(i, "copy.jcmp");
+			jcmp::image j(w, h, {0.0, 0.0}, input);
+			jcmp::write_to_file(j, "copy.jcmp");
 		}
 	}
 }
diff --git a/wavelet/jcmp_quantization.hpp b/wavelet/jcmp_quantization.hpp
new file mode 100644
index 0000000..fbf2a4c
--- /dev/null
+++ b/wavelet/jcmp_quantization.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+namespace jcmp {
+	// input in [0, 1], output [0, 255]
+	inline uint8_t clamp_round(double x){
+		if(x <= 0.0) return 0;
+		if(x >= 1.0) return 255;
+		return uint8_t(std::round(255.0*x));
+	}
+
+	// parameters which need to be stored in the file
+	// to be able to dequantize
+	struct quantize_params {
+		double f_max_abs;
+		double f_min_abs;
+	};
+
+	// Quantization may be non linear (given by f and f_inv)
+	struct quantization {
+		std::function<double(double)> f;
+		std::function<double(double)> f_inv;
+		quantize_params p;
+
+		quantization() = default;
+
+		template <typename F>
+		quantization(F const & f_, F const & f_inv_, double max_abs, double min_abs)
+		: f(f_)
+		, f_inv(f_inv_)
+		, p{f(max_abs), f(min_abs)}
+		{}
+
+		uint8_t forwards(double x){
+			double y = std::abs(x);
+			y = (f(y) - p.f_min_abs) / (p.f_max_abs - p.f_min_abs);
+			if(x < 0) y = -y;
+			return clamp_round(0.5 * (y + 1.0));
+		}
+
+		double backwards(uint8_t x){
+			double y = 2.0 * x / 255.0 - 1.0;
+			y = (p.f_max_abs - p.f_min_abs) * y;
+			if(y < 0) return -f_inv(-y + p.f_min_abs);
+			return f_inv(y + p.f_min_abs);
+		}
+	};
+
+	static_assert(sizeof(quantize_params) == 16, "struct not propery packed");
+}
diff --git a/wavelet/jcmp_quantization_test.cpp b/wavelet/jcmp_quantization_test.cpp
new file mode 100644
index 0000000..ef942f4
--- /dev/null
+++ b/wavelet/jcmp_quantization_test.cpp
@@ -0,0 +1,8 @@
+#include <includes.hpp>
+#include <utilities.hpp>
+
+#include "jcmp_quantization.hpp"
+
+int main(){
+	// no tests yet
+}
diff --git a/wavelet/periodic_iterator_test.cpp b/wavelet/periodic_iterator_test.cpp
index 22f6c5e..26f06b6 100644
--- a/wavelet/periodic_iterator_test.cpp
+++ b/wavelet/periodic_iterator_test.cpp
@@ -1,7 +1,5 @@
-#include <iostream>
-#include <algorithm>
-#include <iterator>
-#include <vector>
+#include <includes.hpp>
+#include <utilities.hpp>
 
 #include <boost/assign.hpp>
 
diff --git a/wavelet/striding_iterator_test.cpp b/wavelet/striding_iterator_test.cpp
index b00e883..d94af05 100644
--- a/wavelet/striding_iterator_test.cpp
+++ b/wavelet/striding_iterator_test.cpp
@@ -1,5 +1,5 @@
-#include <iostream>
-#include <vector>
+#include <includes.hpp>
+#include <utilities.hpp>
 
 #include <boost/assign.hpp>
 
diff --git a/wavelet/wavelet.hpp b/wavelet/wavelet.hpp
index 77a083c..900c981 100644
--- a/wavelet/wavelet.hpp
+++ b/wavelet/wavelet.hpp
@@ -1,102 +1,3 @@
 #pragma once
 
-#include <includes.hpp>
-
-#include "striding_iterator.hpp"
-#include "periodic_iterator.hpp"
-#include "wavelet_constants.hpp"
-
-namespace wvlt {
-	namespace V1 {
-		// Apply the matrix Wn with the DAUB4 coefficients
-		template <typename Iterator>
-		void wavelet_mul(Iterator begin, Iterator end){
-			auto mul = end - begin;
-			std::vector<double> out(mul, 0.0);
-			for(int i = 0; i < mul; i += 2){
-				out[i]   = std::inner_product(evn_coef, evn_coef+4, periodic(begin, end) + i, 0.0);
-				out[i+1] = std::inner_product(odd_coef, odd_coef+4, periodic(begin, end) + i, 0.0);
-			}
-			for(int i = 0; i < mul; ++i){
-				*begin++ = out[i];
-			}
-		}
-
-		// Apply inverse of the matrix Wn with the DAUB4 coefficients
-		template <typename Iterator>
-		void wavelet_inv(Iterator begin, Iterator end){
-			auto mul = end - begin;
-			std::vector<double> out(mul, 0.0);
-			Iterator bc = begin;
-			for(int i = 0; i < mul; i += 2, begin += 2){
-				Iterator b2 = begin + 1;
-				for(int j = 0; j < 4; ++j){
-					out[(i+j) % mul] += *begin * evn_coef[j] + *b2 * odd_coef[j];
-				}
-			}
-			for(int i = 0; i < mul; ++i){
-				*bc++ = out[i];
-			}
-		}
-
-		// Shuffle works with an extra copy, might be inefficient, but it is at least correct ;)
-		// It applies the even-odd-sort matrix Sn
-		template <typename Iterator>
-		void shuffle(Iterator begin, Iterator end){
-			typedef typename std::iterator_traits<Iterator>::value_type value_type;
-			auto s = end - begin;
-			assert(s % 2 == 0);
-
-			std::vector<value_type> v(s);
-			std::copy(strided(begin  , 2), strided(end  , 2), v.begin());
-			std::copy(strided(begin+1, 2), strided(end+1, 2), v.begin() + s/2);
-			std::copy(v.begin(), v.end(), begin);
-		}
-
-		template <typename Iterator>
-		void unshuffle(Iterator begin, Iterator end){
-			typedef typename std::iterator_traits<Iterator>::value_type value_type;
-			auto s = end - begin;
-			assert(s % 2 == 0);
-
-			std::vector<value_type> v(s);
-			std::copy(begin, begin + s/2, strided(v.begin(),   2));
-			std::copy(begin + s/2, end,   strided(v.begin()+1, 2));
-			std::copy(v.begin(), v.end(), begin);
-		}
-
-		// Combines the matrix Wn and Sn recusrively
-		// Only works for inputs of size 2^m
-		template <typename Iterator>
-		void wavelet(Iterator begin, Iterator end){
-			auto s = end - begin;
-			assert(s >= 4);
-			for(int i = s; i >= 4; i >>= 1){
-				// half interval
-				end = begin + i;
-				assert(is_pow_of_two(end - begin));
-
-				// multiply with Wn
-				wavelet_mul(begin, end);
-				// then with Sn
-				shuffle(begin, end);
-			}
-		}
-
-		template <typename Iterator>
-		void unwavelet(Iterator begin, Iterator end){
-			auto s = end - begin;
-			assert(s >= 4);
-			for(int i = 4; i <= s; i <<= 1){
-				// double interval
-				end = begin + i;
-				assert(is_pow_of_two(end - begin));
-
-				// unshuffle: Sn^-1
-				unshuffle(begin, end);
-				// then Wn^-1
-				wavelet_inv(begin, end);
-			}
-		}
-	}
-}
+#include "wavelet_2.hpp"
diff --git a/wavelet/wavelet2.cpp b/wavelet/wavelet2.cpp
deleted file mode 100644
index 3e3c505..0000000
--- a/wavelet/wavelet2.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <includes.hpp>
-
-#include <boost/filesystem.hpp>
-#include <png.hpp>
-#include <utilities.hpp>
-
-#include "compressed_image.hpp"
-#include "striding_iterator.hpp"
-#include "periodic_iterator.hpp"
-#include "wavelet.hpp"
-
-using namespace wvlt::V1;
-
-// note: we take a copy, because we will modify it in place
-jcmp::image compress(std::vector<double> image, int width, double threshold, int& zeros){
-	auto height = image.size() / width;
-	assert(is_pow_of_two(width));
-	assert(is_pow_of_two(height));
-
-	// wavelet transform in x-direction
-	for(int i = 0; i < height; ++i){
-		wavelet(image.begin() + i*width, image.begin() + (i+1)*width);
-	}
-
-	// wavelet transform in y-direction
-	for(int i = 0; i < width; ++i){
-		wavelet(strided(image.begin() + i, width), strided(image.end() + i, width));
-	}
-
-	// save the principal coefficients
-	std::vector<jcmp::coefficient> v;
-	for(int y = 0; y < height; ++y){
-		for(int x = 0; x < width; ++x){
-			auto&& el = image[x + width*y];
-			if(std::abs(el) > threshold) v.push_back({el, jcmp::uint(x), jcmp::uint(y)});
-			else ++zeros;
-		}
-	}
-
-	return jcmp::image(width, height, std::move(v));
-}
-
-std::vector<double> decompress(jcmp::image in){
-	auto width = in.header.width;
-	auto height = in.header.height;
-	assert(is_pow_of_two(width));
-	assert(is_pow_of_two(height));
-
-	std::vector<double> image(width * height, 0.0);
-
-	// read in coefficient on coordinates
-	for(auto it = in.data.begin(); it != in.data.end(); ++it){
-		auto&& x = *it;
-		image[x.x + width*x.y] = x.c;
-	}
-
-	in.clear();
-
-	// inverse wavelet transform in y-direction
-	for(int i = 0; i < width; ++i){
-		unwavelet(strided(image.begin() + i, width), strided(image.end() + i, width));
-	}
-
-	// inverse wavelet transform in x-direction
-	for(int i = 0; i < height; ++i){
-		unwavelet(image.begin() + i*width, image.begin() + (i+1)*width);
-	}
-
-	return image;
-}
-
-int main(){
-	namespace fs = boost::filesystem;
-
-	fs::path directory("images");
-	fs::directory_iterator eod;
-	for(fs::directory_iterator it(directory); it != eod; ++it){
-		auto && path = it->path();
-		if(path.extension() != ".png") continue;
-
-		// open file
-		std::string filename = path.string();
-		std::cout << field("file") << filename << std::endl;
-		png::istream image(filename);
-
-		auto width = image.get_width();
-		auto height = image.get_height();
-
-		// read into vector
-		std::vector<double> image_vec;
-		image_vec.reserve(width * height);
-		for(unsigned char c = 0; image >> c;) image_vec.push_back(c/255.0);
-
-		// compress and decompress to see how we lost information
-		int zeros = 0;
-		auto compressed_vec = decompress(compress(image_vec, width, 0.5, zeros));
-
-		// output some information
-		std::cout << field("raw") << human_string(image_vec.size()) << std::endl;
-		std::cout << field("compressed") << human_string(image_vec.size() - zeros) << std::endl;
-
-		// save to output file
-		std::string cfilename = "compressed/" + path.filename().string();
-		png::gray_ostream compressed_image(width, height, cfilename);
-		for(int i = 0; i < compressed_vec.size(); ++i) compressed_image << compressed_vec[i];
-	}
-}
diff --git a/wavelet/wavelet_1.hpp b/wavelet/wavelet_1.hpp
new file mode 100644
index 0000000..77a083c
--- /dev/null
+++ b/wavelet/wavelet_1.hpp
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <includes.hpp>
+
+#include "striding_iterator.hpp"
+#include "periodic_iterator.hpp"
+#include "wavelet_constants.hpp"
+
+namespace wvlt {
+	namespace V1 {
+		// Apply the matrix Wn with the DAUB4 coefficients
+		template <typename Iterator>
+		void wavelet_mul(Iterator begin, Iterator end){
+			auto mul = end - begin;
+			std::vector<double> out(mul, 0.0);
+			for(int i = 0; i < mul; i += 2){
+				out[i]   = std::inner_product(evn_coef, evn_coef+4, periodic(begin, end) + i, 0.0);
+				out[i+1] = std::inner_product(odd_coef, odd_coef+4, periodic(begin, end) + i, 0.0);
+			}
+			for(int i = 0; i < mul; ++i){
+				*begin++ = out[i];
+			}
+		}
+
+		// Apply inverse of the matrix Wn with the DAUB4 coefficients
+		template <typename Iterator>
+		void wavelet_inv(Iterator begin, Iterator end){
+			auto mul = end - begin;
+			std::vector<double> out(mul, 0.0);
+			Iterator bc = begin;
+			for(int i = 0; i < mul; i += 2, begin += 2){
+				Iterator b2 = begin + 1;
+				for(int j = 0; j < 4; ++j){
+					out[(i+j) % mul] += *begin * evn_coef[j] + *b2 * odd_coef[j];
+				}
+			}
+			for(int i = 0; i < mul; ++i){
+				*bc++ = out[i];
+			}
+		}
+
+		// Shuffle works with an extra copy, might be inefficient, but it is at least correct ;)
+		// It applies the even-odd-sort matrix Sn
+		template <typename Iterator>
+		void shuffle(Iterator begin, Iterator end){
+			typedef typename std::iterator_traits<Iterator>::value_type value_type;
+			auto s = end - begin;
+			assert(s % 2 == 0);
+
+			std::vector<value_type> v(s);
+			std::copy(strided(begin  , 2), strided(end  , 2), v.begin());
+			std::copy(strided(begin+1, 2), strided(end+1, 2), v.begin() + s/2);
+			std::copy(v.begin(), v.end(), begin);
+		}
+
+		template <typename Iterator>
+		void unshuffle(Iterator begin, Iterator end){
+			typedef typename std::iterator_traits<Iterator>::value_type value_type;
+			auto s = end - begin;
+			assert(s % 2 == 0);
+
+			std::vector<value_type> v(s);
+			std::copy(begin, begin + s/2, strided(v.begin(),   2));
+			std::copy(begin + s/2, end,   strided(v.begin()+1, 2));
+			std::copy(v.begin(), v.end(), begin);
+		}
+
+		// Combines the matrix Wn and Sn recusrively
+		// Only works for inputs of size 2^m
+		template <typename Iterator>
+		void wavelet(Iterator begin, Iterator end){
+			auto s = end - begin;
+			assert(s >= 4);
+			for(int i = s; i >= 4; i >>= 1){
+				// half interval
+				end = begin + i;
+				assert(is_pow_of_two(end - begin));
+
+				// multiply with Wn
+				wavelet_mul(begin, end);
+				// then with Sn
+				shuffle(begin, end);
+			}
+		}
+
+		template <typename Iterator>
+		void unwavelet(Iterator begin, Iterator end){
+			auto s = end - begin;
+			assert(s >= 4);
+			for(int i = 4; i <= s; i <<= 1){
+				// double interval
+				end = begin + i;
+				assert(is_pow_of_two(end - begin));
+
+				// unshuffle: Sn^-1
+				unshuffle(begin, end);
+				// then Wn^-1
+				wavelet_inv(begin, end);
+			}
+		}
+	}
+}
diff --git a/wavelet/wavelet2.hpp b/wavelet/wavelet_2.hpp
similarity index 84%
rename from wavelet/wavelet2.hpp
rename to wavelet/wavelet_2.hpp
index 5d97935..0c22702 100644
--- a/wavelet/wavelet2.hpp
+++ b/wavelet/wavelet_2.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <utilities.hpp>
 #include "wavelet_constants.hpp"
 
 /* Rewrite of the basic functions
@@ -69,17 +70,23 @@ namespace wvlt{
 			x[i+stride] = y2;
 		}
 
-		inline void wavelet(double* x, unsigned int size){
+		// size indicates number of elements to process (so this is different from above!)
+		inline void wavelet(double* x, unsigned int size, unsigned int stride){
 			assert(x && is_pow_of_two(size) && size >= 4);
+			auto full_size = stride*size;
 			for(unsigned int i = 1; i <= size / 4; i <<= 1){
-				wavelet_mul(x, x[0], x[i], size, i);
+				auto j = stride * i;
+				wavelet_mul(x, x[0], x[j], full_size, j);
 			}
 		}
 
-		inline void unwavelet(double* x, unsigned int size){
+		// size indicates number of elements to process (so this is different from above!)
+		inline void unwavelet(double* x, unsigned int size, unsigned int stride){
 			assert(x && is_pow_of_two(size) && size >= 4);
+			auto full_size = stride*size;
 			for(unsigned int i = size / 4; i >= 1; i >>= 1){
-				wavelet_inv(x, x[size-i], x[size-2*i], size, i);
+				auto j = stride * i;
+				wavelet_inv(x, x[full_size-j], x[full_size-2*j], full_size, j);
 			}
 		}
 	}
diff --git a/wavelet/mockup.cpp b/wavelet/wavelet_parallel_mockup.cpp
similarity index 94%
rename from wavelet/mockup.cpp
rename to wavelet/wavelet_parallel_mockup.cpp
index ae4a846..eed79a4 100644
--- a/wavelet/mockup.cpp
+++ b/wavelet/wavelet_parallel_mockup.cpp
@@ -2,7 +2,7 @@
 #include <utilities.hpp>
 #include <bsp.hpp>
 
-#include "wavelet2.hpp"
+#include "wavelet.hpp"
 #include "defines.hpp"
 
 #ifndef NEXP
@@ -68,7 +68,7 @@ static void par_wavelet_base(distribution const & d, double* x, double* next, do
 
 	// proc zero has the privilige/duty to finish the job
 	if(d.s == 0) {
-		wvlt::wavelet(proczero, 2*d.p);
+		wvlt::wavelet(proczero, 2*d.p, 1);
 		// and to send it back to everyone
 		for(unsigned int t = 0; t < d.p; ++t){
 			for(unsigned int i = 0; i < 2; ++i){
@@ -102,7 +102,7 @@ static void par_wavelet(){
 
 	double time1 = bsp::time();
 
-	for(int i = 0; i < ITERS; ++i){
+	for(unsigned int i = 0; i < ITERS; ++i){
 		par_wavelet_base(d, x.data(), next.data(), proczero.data());
 		bsp::sync();
 	}
@@ -133,8 +133,8 @@ static void seq_wavelet(){
 	for(unsigned int i = 0; i < N; ++i) v[i] = data(i);
 
 	{	auto time1 = timer::clock::now();
-		for(int i = 0; i < ITERS; ++i){
-			wvlt::wavelet(v.data(), v.size());
+		for(unsigned int i = 0; i < ITERS; ++i){
+			wvlt::wavelet(v.data(), v.size(), 1);
 		}
 		auto time2 = timer::clock::now();
 		printf("sequential version\t%f\n", timer::from_dur(time2 - time1));
@@ -165,8 +165,8 @@ static void check_equality(double threshold){
 // Checks whether inverse gives us the data back
 // NOTE: modifies the global seq_result
 static void check_inverse(double threshold){
-	for(int i = 0; i < ITERS; ++i){
-		wvlt::unwavelet(seq_result.data(), seq_result.size());
+	for(unsigned int i = 0; i < ITERS; ++i){
+		wvlt::unwavelet(seq_result.data(), seq_result.size(), 1);
 	}
 	bool same = true;
 	for(unsigned int i = 0; i < N; ++i){
diff --git a/wavelet/wavelet_test.cpp b/wavelet/wavelet_test.cpp
index 85737bd..0e6eebf 100644
--- a/wavelet/wavelet_test.cpp
+++ b/wavelet/wavelet_test.cpp
@@ -1,19 +1,10 @@
 #include <includes.hpp>
 #include <utilities.hpp>
 
-#include "wavelet.hpp"
-#include "wavelet2.hpp"
+#include "wavelet_1.hpp"
+#include "wavelet_2.hpp"
 
-
-template <typename T>
-void print_vec(std::vector<T> v){
-	auto it = v.begin(), end = v.end();
-	std::cout << "{" << *it++;
-	while(it != end) std::cout << ", " << *it++;
-	std::cout << "}\n";
-}
-
-void timing_test(){
+static void timing_test(){
 	std::vector<double> input1 = {-1.0, -2.0, 2.0, 1.0, -3.0, -4.0, 4.0, 3.0};
 	std::vector<double> input2 = input1;
 	int test_size = 10;
@@ -36,15 +27,15 @@ void timing_test(){
 	print_vec(input2);
 }
 
-void correctness_test(){
+static void correctness_test(){
 	std::vector<double> input1 = {-1.0, -2.0, 2.0, 1.0, -3.0, -4.0, 4.0, 3.0};
 	std::vector<double> input2 = input1;
 
 	wvlt::V1::wavelet(input1.begin(), input1.end());
 	wvlt::V1::unwavelet(input1.begin(), input1.end());
 
-	wvlt::V2::wavelet(input2.data(), input2.size());
-	wvlt::V2::unwavelet(input2.data(), input2.size());
+	wvlt::V2::wavelet(input2.data(), input2.size(), 1);
+	wvlt::V2::unwavelet(input2.data(), input2.size(), 1);
 
 	std::cout << "V1\t"; print_vec(input1);
 	std::cout << "V2\t"; print_vec(input2);