From 572d257e1de6a6b67cad1ee498ed01dfb9424dfa Mon Sep 17 00:00:00 2001
From: Joshua Moerman <lakseru@gmail.com>
Date: Sat, 18 Jan 2014 12:06:27 +0100
Subject: [PATCH] Adds an optimization in pow_two, updates some comments,
 reorders some code

---
 include/utilities.hpp               |  9 +++++--
 wavelet/wavelet_parallel.hpp        | 37 +++++++++++++----------------
 wavelet/wavelet_parallel_mockup.cpp |  9 ++++---
 3 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/include/utilities.hpp b/include/utilities.hpp
index a480d04..34c61de 100644
--- a/include/utilities.hpp
+++ b/include/utilities.hpp
@@ -20,10 +20,15 @@ inline unsigned int two_log(unsigned int x){
 	return 8*sizeof(unsigned int) - unsigned(__builtin_clz(x-1));
 }
 
-// calculates 2^x (NOTE: can be improved by exponentiation by squaring)
+// calculates 2^x (using exponentiation by squaring)
 inline unsigned int pow_two(unsigned int x){
+	unsigned int base = 2;
 	unsigned int y = 1;
-	while(x--) y *= 2;
+	while(x){
+		if(x & 1) y *= base;
+		x >>= 1;
+		base *= base;
+	}
 	return y;
 }
 
diff --git a/wavelet/wavelet_parallel.hpp b/wavelet/wavelet_parallel.hpp
index 4fb8a99..059be6f 100644
--- a/wavelet/wavelet_parallel.hpp
+++ b/wavelet/wavelet_parallel.hpp
@@ -6,17 +6,17 @@
 #include "wavelet.hpp"
 
 /* In the following function we assume any in-parameter to be already
- * bsp::pushed. And the functions won't do any bsp::sync at the end. Both
- * conventions make it possible to chains functions with lesser syncs.
+ * bsp::pushed, if needed. And the functions won't do any bsp::sync at the end.
+ * Both conventions make it possible to chain functions with fewer syncs.
  *
- * Distribution is block distribution.
+ * Distribution is block distribution. Wavelet is in-place.
  */
 
 namespace wvlt {
 	namespace par {
-		// The structs proc_info and plan_1D contain some often
-		// used values in the parallel algorithm, they also
-		// precompute some constants.
+		// The structs proc_info and plan_1D contain some often used
+		// values in the parallel algorithm, they also precompute some
+		// constants.
 
 		// p = nproc(), s = pid()
 		// prev/next = previous and next processor index
@@ -29,8 +29,9 @@ namespace wvlt {
 		};
 
 		// n = inputisze, b = blocksize, m = step_size
-		// Cm = communication size
-		// TODO: describe other vars
+		// Cm = communication size, small_steps = total number of steps
+		// in the wavelet transform, big_steps = number of supersteps
+		// doing m small steps, remainder = small_steps - m*big_steps.
 		struct plan_1D {
 			unsigned int n, b, m, Cm, small_steps, big_steps, remainder;
 
@@ -46,13 +47,14 @@ namespace wvlt {
 			return plan;
 		}
 
-		inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
+		// Does one big step: one communication step, then m computation steps
+		inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
+			// Communication
 			for(unsigned int i = 0; i < plan.Cm; ++i){
-				bsp::put(pi.prev, &x[stride*i], other, i, 1);
+				bsp::put(d.prev, &x[stride*i], other, i, 1);
 			}
-		}
-
-		inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
+			bsp::sync();
+			// Computation
 			unsigned int end = pow_two(plan.m);
 			for(unsigned int i = 1; i < end; i <<= 1){
 				wavelet_mul(x, other[0], other[i], size, stride*i);
@@ -60,12 +62,7 @@ namespace wvlt {
 			}
 		}
 
-		inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
-			comm_step(d, plan, x, other, size, stride);
-			bsp::sync();
-			comp_step(d, plan, x, other, size, stride);
-		}
-
+		// Does the local part of the algorithm
 		inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){
 			// do steps of size m
 			unsigned int stride = 1;
@@ -79,7 +76,7 @@ namespace wvlt {
 				step(d, get_remainder(plan), x, other, size, stride);
 		}
 
-		// block distributed parallel wavelet, result is also in block distribution (in-place in x)
+		// The whole parallel algorithm
 		inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){
 			// First do the local part
 			base(d, plan, x, next, plan.b);
diff --git a/wavelet/wavelet_parallel_mockup.cpp b/wavelet/wavelet_parallel_mockup.cpp
index d4f2080..5067e2d 100644
--- a/wavelet/wavelet_parallel_mockup.cpp
+++ b/wavelet/wavelet_parallel_mockup.cpp
@@ -82,8 +82,8 @@ static void par_wavelet(){
 	// So this is not part of the parallel program anymore
 	bsp::pop_reg(proczero.data());
 	bsp::pop_reg(next.data());
-	next.clear();
 	proczero.clear();
+	next.clear();
 
 	if(globals.check_results){
 		bsp::push_reg(par_result.data(), par_result.size());
@@ -144,6 +144,7 @@ int main(int argc, char** argv){
 		("iterations", po::value<unsigned int>()->default_value(5), "number of iterations")
 		("help", po::bool_switch(), "show this help")
 		("show-input", po::bool_switch(), "shows the given input")
+		("seq", po::bool_switch(), "also runs the sequential algorithm")
 		("check", po::bool_switch(), "enables correctness checks");
 	po::variables_map vm;
 
@@ -182,8 +183,10 @@ int main(int argc, char** argv){
 		seq_result.assign(globals.N, 0.0);
 	}
 
-	// Run both versions (will print timings)
-	seq_wavelet();
+	// Run sequential algorithm if needed
+	if(globals.check_results || vm["seq"].as<bool>())
+		seq_wavelet();
+	// Always run parallel algorithm
 	par_wavelet();
 
 	// Checking equality of algorithms