Adds an optimization in pow_two, updates some comments, reorders some code
This commit is contained in:
parent
1520a4928d
commit
572d257e1d
3 changed files with 30 additions and 25 deletions
|
@ -20,10 +20,15 @@ inline unsigned int two_log(unsigned int x){
|
|||
return 8*sizeof(unsigned int) - unsigned(__builtin_clz(x-1));
|
||||
}
|
||||
|
||||
// calculates 2^x (NOTE: can be improved by exponentiation by squaring)
|
||||
// calculates 2^x (by squaring)
|
||||
inline unsigned int pow_two(unsigned int x){
|
||||
unsigned int base = 2;
|
||||
unsigned int y = 1;
|
||||
while(x--) y *= 2;
|
||||
while(x){
|
||||
if(x & 1) y *= base;
|
||||
x >>= 1;
|
||||
base *= base;
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
|
|
|
@ -6,17 +6,17 @@
|
|||
#include "wavelet.hpp"
|
||||
|
||||
/* In the following function we assume any in-parameter to be already
|
||||
* bsp::pushed. And the functions won't do any bsp::sync at the end. Both
|
||||
* conventions make it possible to chains functions with lesser syncs.
|
||||
* bsp::pushed, if needed. And the functions won't do any bsp::sync at the end.
|
||||
* Both conventions make it possible to chain functions with fewer syncs.
|
||||
*
|
||||
* Distribution is block distribution.
|
||||
* Distribution is block distribution. Wavelet is in-place.
|
||||
*/
|
||||
|
||||
namespace wvlt {
|
||||
namespace par {
|
||||
// The structs proc_info and plan_1D contain some often
|
||||
// used values in the parallel algorithm, they also
|
||||
// precompute some constants.
|
||||
// The structs proc_info and plan_1D contain some often used
|
||||
// values in the parallel algorithm, they also precompute some
|
||||
// constants.
|
||||
|
||||
// p = nproc(), s = pid()
|
||||
// prev/next = previous and next processor index
|
||||
|
@ -29,8 +29,9 @@ namespace wvlt {
|
|||
};
|
||||
|
||||
// n = inputsize, b = blocksize, m = step_size
|
||||
// Cm = communication size
|
||||
// TODO: describe other vars
|
||||
// Cm = communication size, small_steps = total number of steps
|
||||
// in the wavelet transform, big_steps = number of supersteps
|
||||
// doing m small steps, remainder = small_steps - m*big_steps.
|
||||
struct plan_1D {
|
||||
unsigned int n, b, m, Cm, small_steps, big_steps, remainder;
|
||||
|
||||
|
@ -46,13 +47,14 @@ namespace wvlt {
|
|||
return plan;
|
||||
}
|
||||
|
||||
inline void comm_step(proc_info const & pi, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
// Does one big step: so 1 comm. step and m comp. steps
|
||||
inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
// Communication
|
||||
for(unsigned int i = 0; i < plan.Cm; ++i){
|
||||
bsp::put(pi.prev, &x[stride*i], other, i, 1);
|
||||
bsp::put(d.prev, &x[stride*i], other, i, 1);
|
||||
}
|
||||
}
|
||||
|
||||
inline void comp_step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
bsp::sync();
|
||||
// Computation
|
||||
unsigned int end = pow_two(plan.m);
|
||||
for(unsigned int i = 1; i < end; i <<= 1){
|
||||
wavelet_mul(x, other[0], other[i], size, stride*i);
|
||||
|
@ -60,12 +62,7 @@ namespace wvlt {
|
|||
}
|
||||
}
|
||||
|
||||
inline void step(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size, unsigned int stride){
|
||||
comm_step(d, plan, x, other, size, stride);
|
||||
bsp::sync();
|
||||
comp_step(d, plan, x, other, size, stride);
|
||||
}
|
||||
|
||||
// Does the local part of the algorithm
|
||||
inline void base(proc_info const & d, plan_1D const & plan, double* x, double* other, unsigned int size){
|
||||
// do steps of size m
|
||||
unsigned int stride = 1;
|
||||
|
@ -79,7 +76,7 @@ namespace wvlt {
|
|||
step(d, get_remainder(plan), x, other, size, stride);
|
||||
}
|
||||
|
||||
// block distributed parallel wavelet, result is also in block distribution (in-place in x)
|
||||
// The whole parallel algorithm
|
||||
inline void wavelet(proc_info const & d, plan_1D const & plan, double* x, double* next, double* proczero){
|
||||
// First do the local part
|
||||
base(d, plan, x, next, plan.b);
|
||||
|
|
|
@ -82,8 +82,8 @@ static void par_wavelet(){
|
|||
// So this is not part of the parallel program anymore
|
||||
bsp::pop_reg(proczero.data());
|
||||
bsp::pop_reg(next.data());
|
||||
next.clear();
|
||||
proczero.clear();
|
||||
next.clear();
|
||||
|
||||
if(globals.check_results){
|
||||
bsp::push_reg(par_result.data(), par_result.size());
|
||||
|
@ -144,6 +144,7 @@ int main(int argc, char** argv){
|
|||
("iterations", po::value<unsigned int>()->default_value(5), "number of iterations")
|
||||
("help", po::bool_switch(), "show this help")
|
||||
("show-input", po::bool_switch(), "shows the given input")
|
||||
("seq", po::bool_switch(), "also runs the sequential algorithm")
|
||||
("check", po::bool_switch(), "enables correctness checks");
|
||||
po::variables_map vm;
|
||||
|
||||
|
@ -182,8 +183,10 @@ int main(int argc, char** argv){
|
|||
seq_result.assign(globals.N, 0.0);
|
||||
}
|
||||
|
||||
// Run both versions (will print timings)
|
||||
seq_wavelet();
|
||||
// Run sequential algorithm if needed
|
||||
if(globals.check_results || vm["seq"].as<bool>())
|
||||
seq_wavelet();
|
||||
// Always run parallel algorithm
|
||||
par_wavelet();
|
||||
|
||||
// Checking equality of algorithms
|
||||
|
|
Reference in a new issue