Splits distribution into proc_info and plan. Refacters some variables.
This commit is contained in:
parent
28f7497e0a
commit
763d7e25d8
3 changed files with 80 additions and 48 deletions
|
@ -81,6 +81,13 @@ namespace wvlt{
|
|||
}
|
||||
}
|
||||
|
||||
inline void wavelet_2D(double* in, unsigned int width, unsigned int height){
|
||||
for(unsigned int y = 0; y < height; ++y)
|
||||
wavelet(in + y*width, width, 1);
|
||||
for(unsigned int x = 0; x < width; ++x)
|
||||
wavelet(in + x, height, width);
|
||||
}
|
||||
|
||||
// size indicates number of elements to process (so this is different from above!)
|
||||
inline void unwavelet(double* x, unsigned int size, unsigned int stride){
|
||||
assert(x && is_pow_of_two(size) && size >= 4);
|
||||
|
@ -90,5 +97,12 @@ namespace wvlt{
|
|||
wavelet_inv(x, x[full_size-j], x[full_size-2*j], full_size, j);
|
||||
}
|
||||
}
|
||||
|
||||
inline void unwavelet_2D(double* in, unsigned int width, unsigned int height){
|
||||
for(unsigned int x = 0; x < width; ++x)
|
||||
unwavelet(in + x, height, width);
|
||||
for(unsigned int y = 0; y < height; ++y)
|
||||
unwavelet(in + y*width, width, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,59 +14,79 @@
|
|||
|
||||
namespace wvlt {
|
||||
namespace par {
|
||||
// Convenience container of some often-used values
|
||||
// n = inputisze, p = nproc(), s = pid()
|
||||
// b = blocksize, prev/next = previous and next processor index
|
||||
struct distribution {
|
||||
unsigned int n, p, s, b, prev, next;
|
||||
// The structs proc_info and plan_1D contain some often
|
||||
// used values in the parallel algorithm, they also
|
||||
// precompute some constants.
|
||||
|
||||
distribution(unsigned int n_, unsigned int p_, unsigned int s_)
|
||||
: n(n_), p(p_), s(s_), b(n/p), prev((s-1+p)%p), next((s+1)%p)
|
||||
// p = nproc(), s = pid()
|
||||
// prev/next = previous and next processor index
|
||||
struct proc_info {
|
||||
unsigned int p, s, prev, next;
|
||||
|
||||
proc_info(unsigned int p_, unsigned int s_)
|
||||
: p(p_), s(s_), prev((s-1+p)%p), next((s+1)%p)
|
||||
{}
|
||||
};
|
||||
|
||||
inline unsigned int communication_size(unsigned int m){
|
||||
return pow_two(m+1) - 2;
|
||||
// n = inputisze, b = blocksize, m = step_size
|
||||
// Cm = communication size
|
||||
// TODO: describe other vars
|
||||
struct plan_1D {
|
||||
unsigned int n, b, m, Cm, small_steps, big_steps, remainder;
|
||||
|
||||
plan_1D(unsigned int n_, unsigned int b_, unsigned int m_)
|
||||
: n(n_), b(b_), m(m_), Cm(pow_two(m+1) - 2), small_steps(two_log(b) - 1), big_steps(small_steps/m), remainder(small_steps - m*big_steps)
|
||||
{}
|
||||
};
|
||||
|
||||
inline plan_1D get_remainder(plan_1D plan){
|
||||
plan.m = plan.remainder;
|
||||
plan.Cm = pow_two(plan.m+1) - 2;
|
||||
plan.remainder = 0;
|
||||
return plan;
|
||||
}
|
||||
|
||||
inline void step(distribution const & d, double* x, double* other, unsigned int size, unsigned int stride, unsigned int m){
|
||||
unsigned int t = d.prev;
|
||||
unsigned int Cm = communication_size(m);
|
||||
for(unsigned int i = 0; i < Cm; ++i){
|
||||
bsp::put(t, &x[stride*i], other, i, 1);
|
||||
inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
for(unsigned int i = 0; i < plan.Cm; ++i){
|
||||
bsp::put(pi.prev, &x[stride*i], other, i, 1);
|
||||
}
|
||||
}
|
||||
bsp::sync();
|
||||
|
||||
unsigned int end = pow_two(m);
|
||||
inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
unsigned int end = pow_two(plan.m);
|
||||
for(unsigned int i = 1; i < end; i <<= 1){
|
||||
wavelet_mul(x, other[0], other[i], size, stride*i);
|
||||
if(i < end/2) wavelet_mul_base(other, 2*end - 2*i, i);
|
||||
}
|
||||
}
|
||||
|
||||
inline void base(distribution const & d, double* x, double* other, unsigned int size, unsigned int m){
|
||||
unsigned int t = two_log(d.b);
|
||||
unsigned int steps = (t-1)/m;
|
||||
|
||||
unsigned int stride = 1;
|
||||
for(unsigned int i = steps; i; i--){
|
||||
step(d, x, other, size, stride, m);
|
||||
stride <<= m;
|
||||
inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
comm_step(d, plan, x, other, size, stride);
|
||||
bsp::sync();
|
||||
comp_step(d, plan, x, other, size, stride);
|
||||
}
|
||||
|
||||
unsigned int remaining = (t-1) - m*steps;
|
||||
if(remaining)
|
||||
step(d, x, other, size, stride, remaining);
|
||||
inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){
|
||||
// do steps of size m
|
||||
unsigned int stride = 1;
|
||||
for(unsigned int i = plan.big_steps; i; i--){
|
||||
step(d, plan, x, other, size, stride);
|
||||
stride <<= plan.m;
|
||||
}
|
||||
|
||||
// in the case m didn't divide the total number of small steps, do the remaining part
|
||||
if(plan.remainder)
|
||||
step(d, get_remainder(plan), x, other, size, stride);
|
||||
}
|
||||
|
||||
// block distributed parallel wavelet, result is also in block distribution (in-place in x)
|
||||
inline void wavelet(distribution const & d, double* x, double* next, double* proczero, unsigned int m){
|
||||
inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){
|
||||
// First do the local part
|
||||
base(d, x, next, d.b, m);
|
||||
base(d, plan, x, next, plan.b);
|
||||
|
||||
// Then do a fan in (i.e. 2 elements to proc zero)
|
||||
for(unsigned int i = 0; i < 2; ++i){
|
||||
bsp::put(0, &x[i * d.b/2], proczero, d.s * 2 + i);
|
||||
bsp::put(0, &x[i * plan.b/2], proczero, d.s * 2 + i);
|
||||
}
|
||||
bsp::sync();
|
||||
|
||||
|
@ -76,7 +96,7 @@ namespace wvlt {
|
|||
// and to send it back to everyone
|
||||
for(unsigned int t = 0; t < d.p; ++t){
|
||||
for(unsigned int i = 0; i < 2; ++i){
|
||||
bsp::put(t, &proczero[t*2 + i], x, i * d.b/2);
|
||||
bsp::put(t, &proczero[t*2 + i], x, i * plan.b/2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
#include "wavelet_parallel.hpp"
|
||||
|
||||
// Number of iterations to improve time measurements
|
||||
static unsigned int ITERS = 1;
|
||||
static unsigned int iterations = 1;
|
||||
|
||||
// Static :(, will be set in main
|
||||
static unsigned int P;
|
||||
|
@ -23,12 +23,12 @@ static double data(unsigned int global_index){
|
|||
}
|
||||
|
||||
// NOTE: does not synchronize
|
||||
static void read_and_distribute_data(wvlt::par::distribution const & d, double* x){
|
||||
std::vector<double> r(d.b);
|
||||
static void read_and_distribute_data(wvlt::par::proc_info const & d, wvlt::par::plan_1D plan, double* x){
|
||||
std::vector<double> r(plan.b);
|
||||
for(unsigned int t = 0; t < d.p; ++t){
|
||||
r.assign(d.b, 0.0);
|
||||
for(unsigned int i = 0; i < d.b; ++i){
|
||||
r[i] = data(i + t*d.b);
|
||||
r.assign(plan.b, 0.0);
|
||||
for(unsigned int i = 0; i < plan.b; ++i){
|
||||
r[i] = data(i + t*plan.b);
|
||||
}
|
||||
bsp::put(t, r.data(), x, 0, r.size());
|
||||
}
|
||||
|
@ -36,16 +36,14 @@ static void read_and_distribute_data(wvlt::par::distribution const & d, double*
|
|||
|
||||
static void par_wavelet(){
|
||||
bsp::begin(P);
|
||||
wvlt::par::distribution d(N, bsp::nprocs(), bsp::pid());
|
||||
|
||||
unsigned int m = 2;
|
||||
unsigned int Cm = wvlt::par::communication_size(m);
|
||||
const wvlt::par::proc_info d(bsp::nprocs(), bsp::pid());
|
||||
const wvlt::par::plan_1D plan(N, N/d.p, 2);
|
||||
|
||||
// We allocate and push everything up front, since we need it anyways
|
||||
// (so peak memory is the same). This saves us 1 bsp::sync()
|
||||
// For convenience and consistency we use std::vector
|
||||
std::vector<double> x(d.b, 0.0);
|
||||
std::vector<double> next(Cm, 0.0);
|
||||
std::vector<double> x(plan.b, 0.0);
|
||||
std::vector<double> next(plan.Cm, 0.0);
|
||||
std::vector<double> proczero(d.s == 0 ? 2*d.p : 1, 0.0);
|
||||
|
||||
bsp::push_reg(x.data(), x.size());
|
||||
|
@ -56,13 +54,13 @@ static void par_wavelet(){
|
|||
|
||||
// processor zero reads data from file
|
||||
// gives each proc its own piece
|
||||
if(d.s == 0) read_and_distribute_data(d, x.data());
|
||||
if(d.s == 0) read_and_distribute_data(d, plan, x.data());
|
||||
bsp::sync();
|
||||
|
||||
// do the parallel wavelet!!!
|
||||
double time1 = bsp::time();
|
||||
for(unsigned int i = 0; i < ITERS; ++i){
|
||||
wvlt::par::wavelet(d, x.data(), next.data(), proczero.data(), m);
|
||||
for(unsigned int i = 0; i < iterations; ++i){
|
||||
wvlt::par::wavelet(d, plan, x.data(), next.data(), proczero.data());
|
||||
bsp::sync();
|
||||
}
|
||||
double time2 = bsp::time();
|
||||
|
@ -78,7 +76,7 @@ static void par_wavelet(){
|
|||
bsp::push_reg(par_result.data(), par_result.size());
|
||||
bsp::sync();
|
||||
|
||||
bsp::put(0, x.data(), par_result.data(), d.s * d.b, d.b);
|
||||
bsp::put(0, x.data(), par_result.data(), d.s * plan.b, plan.b);
|
||||
bsp::sync();
|
||||
|
||||
bsp::pop_reg(par_result.data());
|
||||
|
@ -91,7 +89,7 @@ static void seq_wavelet(){
|
|||
for(unsigned int i = 0; i < N; ++i) v[i] = data(i);
|
||||
|
||||
{ auto time1 = timer::clock::now();
|
||||
for(unsigned int i = 0; i < ITERS; ++i){
|
||||
for(unsigned int i = 0; i < iterations; ++i){
|
||||
wvlt::wavelet(v.data(), v.size(), 1);
|
||||
}
|
||||
auto time2 = timer::clock::now();
|
||||
|
@ -144,7 +142,7 @@ int main(int argc, char** argv){
|
|||
|
||||
N = vm["n"].as<unsigned int>();
|
||||
P = vm["p"].as<unsigned int>();
|
||||
ITERS = vm["iterations"].as<unsigned int>();
|
||||
iterations = vm["iterations"].as<unsigned int>();
|
||||
|
||||
if(!is_pow_of_two(N)) throw po::error("n is not a power of two");
|
||||
if(!is_pow_of_two(P)) throw po::error("p is not a power of two");
|
||||
|
@ -169,7 +167,7 @@ int main(int argc, char** argv){
|
|||
std::cout << "Checking results ";
|
||||
compare_results(seq_result, par_result, threshold);
|
||||
|
||||
for(int i = 0; i < ITERS; ++i) wvlt::unwavelet(seq_result.data(), seq_result.size(), 1);
|
||||
for(unsigned int i = 0; i < iterations; ++i) wvlt::unwavelet(seq_result.data(), seq_result.size(), 1);
|
||||
for(unsigned int i = 0; i < par_result.size(); ++i) par_result[i] = data(i);
|
||||
|
||||
std::cout << "Checking inverse ";
|
||||
|
|
Reference in a new issue