From 572d257e1de6a6b67cad1ee498ed01dfb9424dfa Mon Sep 17 00:00:00 2001 From: Joshua Moerman Date: Sat, 18 Jan 2014 12:06:27 +0100 Subject: [PATCH] Adds an optimisation in pow_two, Updates some comments, reordered some stuff --- include/utilities.hpp | 9 +++++-- wavelet/wavelet_parallel.hpp | 37 +++++++++++++---------------- wavelet/wavelet_parallel_mockup.cpp | 9 ++++--- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/include/utilities.hpp b/include/utilities.hpp index a480d04..34c61de 100644 --- a/include/utilities.hpp +++ b/include/utilities.hpp @@ -20,10 +20,15 @@ inline unsigned int two_log(unsigned int x){ return 8*sizeof(unsigned int) - unsigned(__builtin_clz(x-1)); } -// calculates 2^x (NOTE: can be improved by exponentiation by squaring) +// calculates 2^x (by squaring) inline unsigned int pow_two(unsigned int x){ + unsigned int base = 2; unsigned int y = 1; - while(x--) y *= 2; + while(x){ + if(x & 1) y *= base; + x >>= 1; + base *= base; + } return y; } diff --git a/wavelet/wavelet_parallel.hpp b/wavelet/wavelet_parallel.hpp index 4fb8a99..059be6f 100644 --- a/wavelet/wavelet_parallel.hpp +++ b/wavelet/wavelet_parallel.hpp @@ -6,17 +6,17 @@ #include "wavelet.hpp" /* In the following function we assume any in-parameter to be already - * bsp::pushed. And the functions won't do any bsp::sync at the end. Both - * conventions make it possible to chains functions with lesser syncs. + * bsp::pushed, if needed. And the functions won't do any bsp::sync at the end. + * Both conventions make it possible to chain functions with fewer syncs. * - * Distribution is block distribution. + * Distribution is block distribution. Wavelet is in-place. */ namespace wvlt { namespace par { - // The structs proc_info and plan_1D contain some often - // used values in the parallel algorithm, they also - // precompute some constants. 
+ // The structs proc_info and plan_1D contain some often used + // values in the parallel algorithm, they also precompute some + // constants. // p = nproc(), s = pid() // prev/next = previous and next processor index @@ -29,8 +29,9 @@ namespace wvlt { }; // n = inputisze, b = blocksize, m = step_size - // Cm = communication size - // TODO: describe other vars + // Cm = communication size, small_steps = total number of steps + // in the wavelet transform, big_steps = number of supersteps + // doing m small steps, remainder = small_steps - m*big_steps. struct plan_1D { unsigned int n, b, m, Cm, small_steps, big_steps, remainder; @@ -46,13 +47,14 @@ namespace wvlt { return plan; } - inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ + // Does one big step: so 1 comm. step and m comp. steps + inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ + // Communication for(unsigned int i = 0; i < plan.Cm; ++i){ - bsp::put(pi.prev, &x[stride*i], other, i, 1); + bsp::put(d.prev, &x[stride*i], other, i, 1); } - } - - inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ + bsp::sync(); + // Computation unsigned int end = pow_two(plan.m); for(unsigned int i = 1; i < end; i <<= 1){ wavelet_mul(x, other[0], other[i], size, stride*i); @@ -60,12 +62,7 @@ namespace wvlt { } } - inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ - comm_step(d, plan, x, other, size, stride); - bsp::sync(); - comp_step(d, plan, x, other, size, stride); - } - + // Does the local part of the algorithm inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){ // do steps of size m unsigned int stride = 1; @@ -79,7 +76,7 @@ namespace wvlt { step(d, 
get_remainder(plan), x, other, size, stride); } - // block distributed parallel wavelet, result is also in block distribution (in-place in x) + // The whole parallel algorithm inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){ // First do the local part base(d, plan, x, next, plan.b); diff --git a/wavelet/wavelet_parallel_mockup.cpp b/wavelet/wavelet_parallel_mockup.cpp index d4f2080..5067e2d 100644 --- a/wavelet/wavelet_parallel_mockup.cpp +++ b/wavelet/wavelet_parallel_mockup.cpp @@ -82,8 +82,8 @@ static void par_wavelet(){ // So this is not part of the parallel program anymore bsp::pop_reg(proczero.data()); bsp::pop_reg(next.data()); - next.clear(); proczero.clear(); + next.clear(); if(globals.check_results){ bsp::push_reg(par_result.data(), par_result.size()); @@ -144,6 +144,7 @@ int main(int argc, char** argv){ ("iterations", po::value()->default_value(5), "number of iterations") ("help", po::bool_switch(), "show this help") ("show-input", po::bool_switch(), "shows the given input") + ("seq", po::bool_switch(), "also runs the sequential algorithm") ("check", po::bool_switch(), "enables correctness checks"); po::variables_map vm; @@ -182,8 +183,10 @@ int main(int argc, char** argv){ seq_result.assign(globals.N, 0.0); } - // Run both versions (will print timings) - seq_wavelet(); + // Run sequential algorithm if needed + if(globals.check_results || vm["seq"].as<bool>()) + seq_wavelet(); + // Always run parallel algorithm par_wavelet(); // Checking equality of algorithms