Adds an optimization in pow_two, updates some comments, and reorders some code
This commit is contained in:
parent
1520a4928d
commit
572d257e1d
3 changed files with 30 additions and 25 deletions
|
@ -20,10 +20,15 @@ inline unsigned int two_log(unsigned int x){
|
||||||
return 8*sizeof(unsigned int) - unsigned(__builtin_clz(x-1));
|
return 8*sizeof(unsigned int) - unsigned(__builtin_clz(x-1));
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculates 2^x (NOTE: can be improved by exponentiation by squaring)
|
// calculates 2^x (by squaring)
|
||||||
inline unsigned int pow_two(unsigned int x){
|
inline unsigned int pow_two(unsigned int x){
|
||||||
|
unsigned int base = 2;
|
||||||
unsigned int y = 1;
|
unsigned int y = 1;
|
||||||
while(x--) y *= 2;
|
while(x){
|
||||||
|
if(x & 1) y *= base;
|
||||||
|
x >>= 1;
|
||||||
|
base *= base;
|
||||||
|
}
|
||||||
return y;
|
return y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,17 +6,17 @@
|
||||||
#include "wavelet.hpp"
|
#include "wavelet.hpp"
|
||||||
|
|
||||||
/* In the following function we assume any in-parameter to be already
|
/* In the following function we assume any in-parameter to be already
|
||||||
* bsp::pushed. And the functions won't do any bsp::sync at the end. Both
|
* bsp::pushed, if needed. And the functions won't do any bsp::sync at the end.
|
||||||
* conventions make it possible to chains functions with lesser syncs.
|
* Both conventions make it possible to chain functions with fewer syncs.
|
||||||
*
|
*
|
||||||
* Distribution is block distribution.
|
* Distribution is block distribution. Wavelet is in-place.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
namespace wvlt {
|
namespace wvlt {
|
||||||
namespace par {
|
namespace par {
|
||||||
// The structs proc_info and plan_1D contain some often
|
// The structs proc_info and plan_1D contain some often used
|
||||||
// used values in the parallel algorithm, they also
|
// values in the parallel algorithm, they also precompute some
|
||||||
// precompute some constants.
|
// constants.
|
||||||
|
|
||||||
// p = nproc(), s = pid()
|
// p = nproc(), s = pid()
|
||||||
// prev/next = previous and next processor index
|
// prev/next = previous and next processor index
|
||||||
|
@ -29,8 +29,9 @@ namespace wvlt {
|
||||||
};
|
};
|
||||||
|
|
||||||
// n = input size, b = blocksize, m = step_size
|
// n = input size, b = blocksize, m = step_size
|
||||||
// Cm = communication size
|
// Cm = communication size, small_steps = total number of steps
|
||||||
// TODO: describe other vars
|
// in the wavelet transform, big_steps = number of supersteps
|
||||||
|
// doing m small steps, remainder = small_steps - m*big_steps.
|
||||||
struct plan_1D {
|
struct plan_1D {
|
||||||
unsigned int n, b, m, Cm, small_steps, big_steps, remainder;
|
unsigned int n, b, m, Cm, small_steps, big_steps, remainder;
|
||||||
|
|
||||||
|
@ -46,13 +47,14 @@ namespace wvlt {
|
||||||
return plan;
|
return plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
// Does one big step: so 1 comm. step and m comp. steps
|
||||||
|
inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||||
|
// Communication
|
||||||
for(unsigned int i = 0; i < plan.Cm; ++i){
|
for(unsigned int i = 0; i < plan.Cm; ++i){
|
||||||
bsp::put(pi.prev, &x[stride*i], other, i, 1);
|
bsp::put(d.prev, &x[stride*i], other, i, 1);
|
||||||
}
|
}
|
||||||
}
|
bsp::sync();
|
||||||
|
// Computation
|
||||||
inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
|
||||||
unsigned int end = pow_two(plan.m);
|
unsigned int end = pow_two(plan.m);
|
||||||
for(unsigned int i = 1; i < end; i <<= 1){
|
for(unsigned int i = 1; i < end; i <<= 1){
|
||||||
wavelet_mul(x, other[0], other[i], size, stride*i);
|
wavelet_mul(x, other[0], other[i], size, stride*i);
|
||||||
|
@ -60,12 +62,7 @@ namespace wvlt {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
// Does the local part of the algorithm
|
||||||
comm_step(d, plan, x, other, size, stride);
|
|
||||||
bsp::sync();
|
|
||||||
comp_step(d, plan, x, other, size, stride);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){
|
inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){
|
||||||
// do steps of size m
|
// do steps of size m
|
||||||
unsigned int stride = 1;
|
unsigned int stride = 1;
|
||||||
|
@ -79,7 +76,7 @@ namespace wvlt {
|
||||||
step(d, get_remainder(plan), x, other, size, stride);
|
step(d, get_remainder(plan), x, other, size, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
// block distributed parallel wavelet, result is also in block distribution (in-place in x)
|
// The whole parallel algorithm
|
||||||
inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){
|
inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){
|
||||||
// First do the local part
|
// First do the local part
|
||||||
base(d, plan, x, next, plan.b);
|
base(d, plan, x, next, plan.b);
|
||||||
|
|
|
@ -82,8 +82,8 @@ static void par_wavelet(){
|
||||||
// So this is not part of the parallel program anymore
|
// So this is not part of the parallel program anymore
|
||||||
bsp::pop_reg(proczero.data());
|
bsp::pop_reg(proczero.data());
|
||||||
bsp::pop_reg(next.data());
|
bsp::pop_reg(next.data());
|
||||||
next.clear();
|
|
||||||
proczero.clear();
|
proczero.clear();
|
||||||
|
next.clear();
|
||||||
|
|
||||||
if(globals.check_results){
|
if(globals.check_results){
|
||||||
bsp::push_reg(par_result.data(), par_result.size());
|
bsp::push_reg(par_result.data(), par_result.size());
|
||||||
|
@ -144,6 +144,7 @@ int main(int argc, char** argv){
|
||||||
("iterations", po::value<unsigned int>()->default_value(5), "number of iterations")
|
("iterations", po::value<unsigned int>()->default_value(5), "number of iterations")
|
||||||
("help", po::bool_switch(), "show this help")
|
("help", po::bool_switch(), "show this help")
|
||||||
("show-input", po::bool_switch(), "shows the given input")
|
("show-input", po::bool_switch(), "shows the given input")
|
||||||
|
("seq", po::bool_switch(), "also runs the sequential algorithm")
|
||||||
("check", po::bool_switch(), "enables correctness checks");
|
("check", po::bool_switch(), "enables correctness checks");
|
||||||
po::variables_map vm;
|
po::variables_map vm;
|
||||||
|
|
||||||
|
@ -182,8 +183,10 @@ int main(int argc, char** argv){
|
||||||
seq_result.assign(globals.N, 0.0);
|
seq_result.assign(globals.N, 0.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run both versions (will print timings)
|
// Run sequential algorithm if needed
|
||||||
seq_wavelet();
|
if(globals.check_results || vm["seq"].as<bool>())
|
||||||
|
seq_wavelet();
|
||||||
|
// Always run parallel algorithm
|
||||||
par_wavelet();
|
par_wavelet();
|
||||||
|
|
||||||
// Checking equality of algorithms
|
// Checking equality of algorithms
|
||||||
|
|
Reference in a new issue