diff --git a/wavelet/wavelet_2.hpp b/wavelet/wavelet_2.hpp index 4057a95..52e7a1c 100644 --- a/wavelet/wavelet_2.hpp +++ b/wavelet/wavelet_2.hpp @@ -81,6 +81,13 @@ namespace wvlt{ } } + inline void wavelet_2D(double* in, unsigned int width, unsigned int height){ + for(unsigned int y = 0; y < height; ++y) + wavelet(in + y*width, width, 1); + for(unsigned int x = 0; x < width; ++x) + wavelet(in + x, height, width); + } + // size indicates number of elements to process (so this is different from above!) inline void unwavelet(double* x, unsigned int size, unsigned int stride){ assert(x && is_pow_of_two(size) && size >= 4); @@ -90,5 +97,12 @@ namespace wvlt{ wavelet_inv(x, x[full_size-j], x[full_size-2*j], full_size, j); } } + + inline void unwavelet_2D(double* in, unsigned int width, unsigned int height){ + for(unsigned int x = 0; x < width; ++x) + unwavelet(in + x, height, width); + for(unsigned int y = 0; y < height; ++y) + unwavelet(in + y*width, width, 1); + } } } diff --git a/wavelet/wavelet_parallel.hpp b/wavelet/wavelet_parallel.hpp index f8d1c0a..4fb8a99 100644 --- a/wavelet/wavelet_parallel.hpp +++ b/wavelet/wavelet_parallel.hpp @@ -14,59 +14,79 @@ namespace wvlt { namespace par { - // Convenience container of some often-used values - // n = inputisze, p = nproc(), s = pid() - // b = blocksize, prev/next = previous and next processor index - struct distribution { - unsigned int n, p, s, b, prev, next; - - distribution(unsigned int n_, unsigned int p_, unsigned int s_) - : n(n_), p(p_), s(s_), b(n/p), prev((s-1+p)%p), next((s+1)%p) + // The structs proc_info and plan_1D contain some often + // used values in the parallel algorithm, they also + // precompute some constants. + + // p = nproc(), s = pid() + // prev/next = previous and next processor index + struct proc_info { + unsigned int p, s, prev, next; + + proc_info(unsigned int p_, unsigned int s_) + : p(p_), s(s_), prev((s-1+p)%p), next((s+1)%p) + {} + }; + + // n = inputisze, b = blocksize, m = step_size + // Cm = communication size + // TODO: describe other vars + struct plan_1D { + unsigned int n, b, m, Cm, small_steps, big_steps, remainder; + + plan_1D(unsigned int n_, unsigned int b_, unsigned int m_) + : n(n_), b(b_), m(m_), Cm(pow_two(m+1) - 2), small_steps(two_log(b) - 1), big_steps(small_steps/m), remainder(small_steps - m*big_steps) {} }; - inline unsigned int communication_size(unsigned int m){ - return pow_two(m+1) - 2; + inline plan_1D get_remainder(plan_1D plan){ + plan.m = plan.remainder; + plan.Cm = pow_two(plan.m+1) - 2; + plan.remainder = 0; + return plan; } - inline void step(distribution const & d, double* x, double* other, unsigned int size, unsigned int stride, unsigned int m){ - unsigned int t = d.prev; - unsigned int Cm = communication_size(m); - for(unsigned int i = 0; i < Cm; ++i){ - bsp::put(t, &x[stride*i], other, i, 1); + inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ + for(unsigned int i = 0; i < plan.Cm; ++i){ + bsp::put(pi.prev, &x[stride*i], other, i, 1); } - bsp::sync(); + } - unsigned int end = pow_two(m); + inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ + unsigned int end = pow_two(plan.m); for(unsigned int i = 1; i < end; i <<= 1){ wavelet_mul(x, other[0], other[i], size, stride*i); if(i < end/2) wavelet_mul_base(other, 2*end - 2*i, i); } } - inline void base(distribution const & d, double* x, double* other, unsigned int size, unsigned int m){ - unsigned int t = two_log(d.b); - unsigned int steps = (t-1)/m; + inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ + comm_step(d, plan, x, other, size, stride); + bsp::sync(); + comp_step(d, plan, x, other, size, stride); + } + inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){ + // do steps of size m unsigned int stride = 1; - for(unsigned int i = steps; i; i--){ - step(d, x, other, size, stride, m); - stride <<= m; + for(unsigned int i = plan.big_steps; i; i--){ + step(d, plan, x, other, size, stride); + stride <<= plan.m; } - unsigned int remaining = (t-1) - m*steps; - if(remaining) - step(d, x, other, size, stride, remaining); + // in the case m didn't divide the total number of small steps, do the remaining part + if(plan.remainder) + step(d, get_remainder(plan), x, other, size, stride); } // block distributed parallel wavelet, result is also in block distribution (in-place in x) - inline void wavelet(distribution const & d, double* x, double* next, double* proczero, unsigned int m){ + inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){ // First do the local part - base(d, x, next, d.b, m); + base(d, plan, x, next, plan.b); // Then do a fan in (i.e. 2 elements to proc zero) for(unsigned int i = 0; i < 2; ++i){ - bsp::put(0, &x[i * d.b/2], proczero, d.s * 2 + i); + bsp::put(0, &x[i * plan.b/2], proczero, d.s * 2 + i); } bsp::sync(); @@ -76,7 +96,7 @@ namespace wvlt { // and to send it back to everyone for(unsigned int t = 0; t < d.p; ++t){ for(unsigned int i = 0; i < 2; ++i){ - bsp::put(t, &proczero[t*2 + i], x, i * d.b/2); + bsp::put(t, &proczero[t*2 + i], x, i * plan.b/2); } } } diff --git a/wavelet/wavelet_parallel_mockup.cpp b/wavelet/wavelet_parallel_mockup.cpp index 7134a66..4441efe 100644 --- a/wavelet/wavelet_parallel_mockup.cpp +++ b/wavelet/wavelet_parallel_mockup.cpp @@ -7,7 +7,7 @@ #include "wavelet_parallel.hpp" // Number of iterations to improve time measurements -static unsigned int ITERS = 1; +static unsigned int iterations = 1; // Static :(, will be set in main static unsigned int P; @@ -23,12 +23,12 @@ static double data(unsigned int global_index){ } // NOTE: does not synchronize -static void read_and_distribute_data(wvlt::par::distribution const & d, double* x){ - std::vector r(d.b); +static void read_and_distribute_data(wvlt::par::proc_info const & d, wvlt::par::plan_1D plan, double* x){ + std::vector r(plan.b); for(unsigned int t = 0; t < d.p; ++t){ - r.assign(d.b, 0.0); - for(unsigned int i = 0; i < d.b; ++i){ - r[i] = data(i + t*d.b); + r.assign(plan.b, 0.0); + for(unsigned int i = 0; i < plan.b; ++i){ + r[i] = data(i + t*plan.b); } bsp::put(t, r.data(), x, 0, r.size()); } @@ -36,16 +36,14 @@ static void read_and_distribute_data(wvlt::par::distribution const & d, double* static void par_wavelet(){ bsp::begin(P); - wvlt::par::distribution d(N, bsp::nprocs(), bsp::pid()); - - unsigned int m = 2; - unsigned int Cm = wvlt::par::communication_size(m); + const wvlt::par::proc_info d(bsp::nprocs(), bsp::pid()); + const wvlt::par::plan_1D plan(N, N/d.p, 2); // We allocate and push everything up front, since we need it anyways // (so peak memory is the same). This saves us 1 bsp::sync() // For convenience and consistency we use std::vector - std::vector x(d.b, 0.0); - std::vector next(Cm, 0.0); + std::vector x(plan.b, 0.0); + std::vector next(plan.Cm, 0.0); std::vector proczero(d.s == 0 ? 2*d.p : 1, 0.0); bsp::push_reg(x.data(), x.size()); @@ -56,13 +54,13 @@ static void par_wavelet(){ // processor zero reads data from file // gives each proc its own piece - if(d.s == 0) read_and_distribute_data(d, x.data()); + if(d.s == 0) read_and_distribute_data(d, plan, x.data()); bsp::sync(); // do the parallel wavelet!!! double time1 = bsp::time(); - for(unsigned int i = 0; i < ITERS; ++i){ - wvlt::par::wavelet(d, x.data(), next.data(), proczero.data(), m); + for(unsigned int i = 0; i < iterations; ++i){ + wvlt::par::wavelet(d, plan, x.data(), next.data(), proczero.data()); bsp::sync(); } double time2 = bsp::time(); @@ -78,7 +76,7 @@ static void par_wavelet(){ bsp::push_reg(par_result.data(), par_result.size()); bsp::sync(); - bsp::put(0, x.data(), par_result.data(), d.s * d.b, d.b); + bsp::put(0, x.data(), par_result.data(), d.s * plan.b, plan.b); bsp::sync(); bsp::pop_reg(par_result.data()); @@ -91,7 +89,7 @@ static void seq_wavelet(){ for(unsigned int i = 0; i < N; ++i) v[i] = data(i); { auto time1 = timer::clock::now(); - for(unsigned int i = 0; i < ITERS; ++i){ + for(unsigned int i = 0; i < iterations; ++i){ wvlt::wavelet(v.data(), v.size(), 1); } auto time2 = timer::clock::now(); @@ -144,7 +142,7 @@ int main(int argc, char** argv){ N = vm["n"].as(); P = vm["p"].as(); - ITERS = vm["iterations"].as(); + iterations = vm["iterations"].as(); if(!is_pow_of_two(N)) throw po::error("n is not a power of two"); if(!is_pow_of_two(P)) throw po::error("p is not a power of two"); @@ -169,7 +167,7 @@ int main(int argc, char** argv){ std::cout << "Checking results "; compare_results(seq_result, par_result, threshold); - for(int i = 0; i < ITERS; ++i) wvlt::unwavelet(seq_result.data(), seq_result.size(), 1); + for(unsigned int i = 0; i < iterations; ++i) wvlt::unwavelet(seq_result.data(), seq_result.size(), 1); for(unsigned int i = 0; i < par_result.size(); ++i) par_result[i] = data(i); std::cout << "Checking inverse ";