|
|
@ -14,59 +14,79 @@ |
|
|
|
|
|
|
|
namespace wvlt { |
|
|
|
namespace par { |
|
|
|
// Convenience container of some often-used values
|
|
|
|
// n = inputisze, p = nproc(), s = pid()
|
|
|
|
// b = blocksize, prev/next = previous and next processor index
|
|
|
|
struct distribution { |
|
|
|
unsigned int n, p, s, b, prev, next; |
|
|
|
|
|
|
|
distribution(unsigned int n_, unsigned int p_, unsigned int s_) |
|
|
|
: n(n_), p(p_), s(s_), b(n/p), prev((s-1+p)%p), next((s+1)%p) |
|
|
|
// The structs proc_info and plan_1D contain some often
|
|
|
|
// used values in the parallel algorithm, they also
|
|
|
|
// precompute some constants.
|
|
|
|
|
|
|
|
// p = nproc(), s = pid()
|
|
|
|
// prev/next = previous and next processor index
|
|
|
|
struct proc_info { |
|
|
|
unsigned int p, s, prev, next; |
|
|
|
|
|
|
|
proc_info(unsigned int p_, unsigned int s_) |
|
|
|
: p(p_), s(s_), prev((s-1+p)%p), next((s+1)%p) |
|
|
|
{} |
|
|
|
}; |
|
|
|
|
|
|
|
// n = inputisze, b = blocksize, m = step_size
|
|
|
|
// Cm = communication size
|
|
|
|
// TODO: describe other vars
|
|
|
|
struct plan_1D { |
|
|
|
unsigned int n, b, m, Cm, small_steps, big_steps, remainder; |
|
|
|
|
|
|
|
plan_1D(unsigned int n_, unsigned int b_, unsigned int m_) |
|
|
|
: n(n_), b(b_), m(m_), Cm(pow_two(m+1) - 2), small_steps(two_log(b) - 1), big_steps(small_steps/m), remainder(small_steps - m*big_steps) |
|
|
|
{} |
|
|
|
}; |
|
|
|
|
|
|
|
inline unsigned int communication_size(unsigned int m){ |
|
|
|
return pow_two(m+1) - 2; |
|
|
|
inline plan_1D get_remainder(plan_1D plan){ |
|
|
|
plan.m = plan.remainder; |
|
|
|
plan.Cm = pow_two(plan.m+1) - 2; |
|
|
|
plan.remainder = 0; |
|
|
|
return plan; |
|
|
|
} |
|
|
|
|
|
|
|
inline void step(distribution const & d, double* x, double* other, unsigned int size, unsigned int stride, unsigned int m){ |
|
|
|
unsigned int t = d.prev; |
|
|
|
unsigned int Cm = communication_size(m); |
|
|
|
for(unsigned int i = 0; i < Cm; ++i){ |
|
|
|
bsp::put(t, &x[stride*i], other, i, 1); |
|
|
|
inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ |
|
|
|
for(unsigned int i = 0; i < plan.Cm; ++i){ |
|
|
|
bsp::put(pi.prev, &x[stride*i], other, i, 1); |
|
|
|
} |
|
|
|
bsp::sync(); |
|
|
|
} |
|
|
|
|
|
|
|
unsigned int end = pow_two(m); |
|
|
|
inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ |
|
|
|
unsigned int end = pow_two(plan.m); |
|
|
|
for(unsigned int i = 1; i < end; i <<= 1){ |
|
|
|
wavelet_mul(x, other[0], other[i], size, stride*i); |
|
|
|
if(i < end/2) wavelet_mul_base(other, 2*end - 2*i, i); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
inline void base(distribution const & d, double* x, double* other, unsigned int size, unsigned int m){ |
|
|
|
unsigned int t = two_log(d.b); |
|
|
|
unsigned int steps = (t-1)/m; |
|
|
|
inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){ |
|
|
|
comm_step(d, plan, x, other, size, stride); |
|
|
|
bsp::sync(); |
|
|
|
comp_step(d, plan, x, other, size, stride); |
|
|
|
} |
|
|
|
|
|
|
|
inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){ |
|
|
|
// do steps of size m
|
|
|
|
unsigned int stride = 1; |
|
|
|
for(unsigned int i = steps; i; i--){ |
|
|
|
step(d, x, other, size, stride, m); |
|
|
|
stride <<= m; |
|
|
|
for(unsigned int i = plan.big_steps; i; i--){ |
|
|
|
step(d, plan, x, other, size, stride); |
|
|
|
stride <<= plan.m; |
|
|
|
} |
|
|
|
|
|
|
|
unsigned int remaining = (t-1) - m*steps; |
|
|
|
if(remaining) |
|
|
|
step(d, x, other, size, stride, remaining); |
|
|
|
// in the case m didn't divide the total number of small steps, do the remaining part
|
|
|
|
if(plan.remainder) |
|
|
|
step(d, get_remainder(plan), x, other, size, stride); |
|
|
|
} |
|
|
|
|
|
|
|
// block distributed parallel wavelet, result is also in block distribution (in-place in x)
|
|
|
|
inline void wavelet(distribution const & d, double* x, double* next, double* proczero, unsigned int m){ |
|
|
|
inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){ |
|
|
|
// First do the local part
|
|
|
|
base(d, x, next, d.b, m); |
|
|
|
base(d, plan, x, next, plan.b); |
|
|
|
|
|
|
|
// Then do a fan in (i.e. 2 elements to proc zero)
|
|
|
|
for(unsigned int i = 0; i < 2; ++i){ |
|
|
|
bsp::put(0, &x[i * d.b/2], proczero, d.s * 2 + i); |
|
|
|
bsp::put(0, &x[i * plan.b/2], proczero, d.s * 2 + i); |
|
|
|
} |
|
|
|
bsp::sync(); |
|
|
|
|
|
|
@ -76,7 +96,7 @@ namespace wvlt { |
|
|
|
// and to send it back to everyone
|
|
|
|
for(unsigned int t = 0; t < d.p; ++t){ |
|
|
|
for(unsigned int i = 0; i < 2; ++i){ |
|
|
|
bsp::put(t, &proczero[t*2 + i], x, i * d.b/2); |
|
|
|
bsp::put(t, &proczero[t*2 + i], x, i * plan.b/2); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|