|
@ -7,11 +7,14 @@ |
|
|
#include "wavelet.hpp" |
|
|
#include "wavelet.hpp" |
|
|
#include "wavelet_parallel.hpp" |
|
|
#include "wavelet_parallel.hpp" |
|
|
|
|
|
|
|
|
static unsigned int P; |
|
|
static struct { |
|
|
static unsigned int W; |
|
|
unsigned int P; |
|
|
static unsigned int H; |
|
|
unsigned int W; |
|
|
|
|
|
unsigned int H; |
|
|
|
|
|
unsigned int M; |
|
|
|
|
|
bool check_results; |
|
|
|
|
|
} globals; |
|
|
|
|
|
|
|
|
static std::vector<double> data; |
|
|
|
|
|
static std::vector<double> seqr; |
|
|
static std::vector<double> seqr; |
|
|
static std::vector<double> parr; |
|
|
static std::vector<double> parr; |
|
|
|
|
|
|
|
@ -38,6 +41,12 @@ struct block { |
|
|
bsp::push_reg(hcomm.data(), hcomm.size()); |
|
|
bsp::push_reg(hcomm.data(), hcomm.size()); |
|
|
bsp::push_reg(vcomm.data(), vcomm.size()); |
|
|
bsp::push_reg(vcomm.data(), vcomm.size()); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
void pop() { |
|
|
|
|
|
bsp::pop_reg(data.data()); |
|
|
|
|
|
bsp::pop_reg(hcomm.data()); |
|
|
|
|
|
bsp::pop_reg(vcomm.data()); |
|
|
|
|
|
} |
|
|
}; |
|
|
}; |
|
|
|
|
|
|
|
|
// Communicate vertical data from b (strided) to b2 (not strided)
|
|
|
// Communicate vertical data from b (strided) to b2 (not strided)
|
|
@ -110,26 +119,43 @@ static void hstep(wvlt::par::proc_info const & pi, plan_2D const & plan, std::ve |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// gets globals from processor 0
|
|
|
|
|
|
static void get_globals(){ |
|
|
|
|
|
bsp::push_reg(&globals); |
|
|
|
|
|
bsp::sync(); |
|
|
|
|
|
bsp::get(0, &globals, 0, &globals); |
|
|
|
|
|
bsp::sync(); |
|
|
|
|
|
bsp::pop_reg(&globals); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// fake data
|
|
|
|
|
|
double data(unsigned int x, unsigned int y){ |
|
|
|
|
|
return x*y; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
static void par_wavelet_2D(){ |
|
|
static void par_wavelet_2D(){ |
|
|
bsp::begin(P); |
|
|
bsp::begin(globals.P); |
|
|
|
|
|
get_globals(); |
|
|
|
|
|
|
|
|
const wvlt::par::proc_info d(bsp::nprocs(), bsp::pid()); |
|
|
const wvlt::par::proc_info d(bsp::nprocs(), bsp::pid()); |
|
|
const wvlt::par::plan_1D horizontal(W, W/d.p, 1); |
|
|
const wvlt::par::plan_1D horizontal(globals.W, globals.W/d.p, globals.M); |
|
|
const wvlt::par::plan_1D vertical(H, H/d.p, 1); |
|
|
const wvlt::par::plan_1D vertical(globals.H, globals.H/d.p, globals.M); |
|
|
const plan_2D plan{horizontal, vertical}; |
|
|
const plan_2D plan{horizontal, vertical}; |
|
|
|
|
|
|
|
|
auto bbb = block(plan); |
|
|
auto bbb = block(plan); |
|
|
|
|
|
// We allocated everything up front, we don't actually need to do this
|
|
|
|
|
|
// but it's easy, as we don't have to think of this later.
|
|
|
std::vector<block> blocks(d.p, bbb); |
|
|
std::vector<block> blocks(d.p, bbb); |
|
|
std::vector<double> hfinish(2 * d.p * vertical.b, 0.0); |
|
|
std::vector<double> hfinish(2 * d.p * vertical.b, 0.0); |
|
|
std::vector<double> vfinish(horizontal.b * 2 * d.p, 0.0); |
|
|
std::vector<double> vfinish(horizontal.b * 2 * d.p, 0.0); |
|
|
|
|
|
|
|
|
// Direct read because MCBSP can do this ;D
|
|
|
// As we will be generating the data, no need to sync
|
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
unsigned int x_start = b * horizontal.b; |
|
|
unsigned int x_start = b * horizontal.b; |
|
|
unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b; |
|
|
unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b; |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
auto v = data[x_start + x + horizontal.n*(y_start+y)]; |
|
|
auto v = data(x_start + x, y_start + y); |
|
|
blocks[b].data[x + horizontal.b*y] = v; |
|
|
blocks[b].data[x + horizontal.b*y] = v; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
@ -150,29 +176,32 @@ static void par_wavelet_2D(){ |
|
|
stride <<= plan.horizontal.m; |
|
|
stride <<= plan.horizontal.m; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// finish parallely
|
|
|
// fan in to the right processor
|
|
|
unsigned int hh = horizontal.b/2; |
|
|
unsigned int hh = horizontal.b/2; |
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
unsigned int t = (d.s - b + d.p)%d.p; |
|
|
unsigned int t = (d.s - b + d.p)%d.p; |
|
|
unsigned int x_start = b * 2; |
|
|
unsigned int x_start = b * 2; |
|
|
auto ptr = blocks[b].data.data(); |
|
|
auto ptr = blocks[b].data.data(); |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
bsp::put(t, &ptr[0 + horizontal.b*y], hfinish.data(), 0 + x_start + 2*d.p*y, 1); |
|
|
// processor, source, dest, offset
|
|
|
bsp::put(t, &ptr[hh + horizontal.b*y], hfinish.data(), 1 + x_start + 2*d.p*y, 1); |
|
|
bsp::put(t, &ptr[0 + horizontal.b*y], hfinish.data(), 0 + x_start + 2*d.p*y); |
|
|
|
|
|
bsp::put(t, &ptr[hh + horizontal.b*y], hfinish.data(), 1 + x_start + 2*d.p*y); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
bsp::sync(); |
|
|
bsp::sync(); |
|
|
|
|
|
|
|
|
|
|
|
// last step of the algorithm
|
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
wvlt::wavelet(hfinish.data() + 2*d.p*y, 2*d.p, 1); |
|
|
wvlt::wavelet(hfinish.data() + 2*d.p*y, 2*d.p, 1); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// fan out to the right processor
|
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int t = 0; t < d.p; ++t){ |
|
|
for(unsigned int t = 0; t < d.p; ++t){ |
|
|
unsigned int b = (t - d.s + d.p)%d.p; |
|
|
unsigned int b = (t - d.s + d.p)%d.p; |
|
|
unsigned int x_start = b * 2; |
|
|
unsigned int x_start = b * 2; |
|
|
bsp::put(t, &hfinish[0 + x_start + 2*d.p*y], blocks[b].data.data(), 0 + horizontal.b*y, 1); |
|
|
bsp::put(t, &hfinish[0 + x_start + 2*d.p*y], blocks[b].data.data(), 0 + horizontal.b*y); |
|
|
bsp::put(t, &hfinish[1 + x_start + 2*d.p*y], blocks[b].data.data(), hh + horizontal.b*y, 1); |
|
|
bsp::put(t, &hfinish[1 + x_start + 2*d.p*y], blocks[b].data.data(), hh + horizontal.b*y); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
bsp::sync(); |
|
|
bsp::sync(); |
|
@ -186,29 +215,32 @@ static void par_wavelet_2D(){ |
|
|
stride <<= plan.vertical.m; |
|
|
stride <<= plan.vertical.m; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// finish parallely
|
|
|
// fan in to the right processor
|
|
|
unsigned int hh = vertical.b/2; |
|
|
unsigned int hh = vertical.b/2; |
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
unsigned int t = b; |
|
|
unsigned int t = b; |
|
|
unsigned int y_start = (d.s - b + d.p)%d.p * 2; |
|
|
unsigned int y_start = (d.s - b + d.p)%d.p * 2; |
|
|
auto ptr = blocks[b].data.data(); |
|
|
auto ptr = blocks[b].data.data(); |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
bsp::put(t, &ptr[x + 0 *horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 0), 1); |
|
|
// processor, source, dest, offset
|
|
|
bsp::put(t, &ptr[x + hh*horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 1), 1); |
|
|
bsp::put(t, &ptr[x + 0 *horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 0)); |
|
|
|
|
|
bsp::put(t, &ptr[x + hh*horizontal.b], vfinish.data(), x + horizontal.b*(y_start + 1)); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
bsp::sync(); |
|
|
bsp::sync(); |
|
|
|
|
|
|
|
|
|
|
|
// last step of the algorithm
|
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
wvlt::wavelet(vfinish.data() + x, 2*d.p, horizontal.b); |
|
|
wvlt::wavelet(vfinish.data() + x, 2*d.p, horizontal.b); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// fan out to the right processor
|
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
for(unsigned int t = 0; t < d.p; ++t){ |
|
|
for(unsigned int t = 0; t < d.p; ++t){ |
|
|
unsigned int b = d.s; |
|
|
unsigned int b = d.s; |
|
|
unsigned int y_start = (t - b + d.p)%d.p * 2; |
|
|
unsigned int y_start = (t - b + d.p)%d.p * 2; |
|
|
bsp::put(t, &vfinish[x + horizontal.b*(y_start + 0)], blocks[b].data.data(), horizontal.b*0 + x, 1); |
|
|
bsp::put(t, &vfinish[x + horizontal.b*(y_start + 0)], blocks[b].data.data(), horizontal.b*0 + x); |
|
|
bsp::put(t, &vfinish[x + horizontal.b*(y_start + 1)], blocks[b].data.data(), horizontal.b*hh + x, 1); |
|
|
bsp::put(t, &vfinish[x + horizontal.b*(y_start + 1)], blocks[b].data.data(), horizontal.b*hh + x); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
bsp::sync(); |
|
|
bsp::sync(); |
|
@ -217,25 +249,33 @@ static void par_wavelet_2D(){ |
|
|
double time2 = bsp::time(); |
|
|
double time2 = bsp::time(); |
|
|
if(d.s==0) printf("parallel version\t%f\n", time2 - time1); |
|
|
if(d.s==0) printf("parallel version\t%f\n", time2 - time1); |
|
|
|
|
|
|
|
|
// Direct write because MCBSP can do this ;D
|
|
|
if(globals.check_results){ |
|
|
|
|
|
bsp::push_reg(parr.data(), parr.size()); |
|
|
|
|
|
bsp::sync(); |
|
|
|
|
|
|
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
for(unsigned int b = 0; b < blocks.size(); ++b){ |
|
|
unsigned int x_start = b * horizontal.b; |
|
|
unsigned int x_start = b * horizontal.b; |
|
|
unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b; |
|
|
unsigned int y_start = (d.s - b + d.p)%d.p * vertical.b; |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int y = 0; y < vertical.b; ++y){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
for(unsigned int x = 0; x < horizontal.b; ++x){ |
|
|
auto v = blocks[b].data[x + horizontal.b*y]; |
|
|
auto v = blocks[b].data[x + horizontal.b*y]; |
|
|
parr[x_start + x + horizontal.n*(y_start+y)] = v; |
|
|
bsp::put(0, &v, parr.data(), x_start + x + horizontal.n*(y_start+y)); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
bsp::sync(); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
bsp::end(); |
|
|
bsp::end(); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
static void seq_wavelet_2D(){ |
|
|
static void seq_wavelet_2D(){ |
|
|
seqr = data; |
|
|
for(unsigned int y = 0; y < globals.H; ++y) |
|
|
|
|
|
for(unsigned int x = 0; x < globals.W; ++x) |
|
|
|
|
|
seqr[x + globals.W*y] = data(x, y); |
|
|
|
|
|
|
|
|
auto time1 = timer::clock::now(); |
|
|
auto time1 = timer::clock::now(); |
|
|
wvlt::wavelet_2D(seqr.data(), W, H); |
|
|
wvlt::wavelet_2D(seqr.data(), globals.W, globals.H); |
|
|
auto time2 = timer::clock::now(); |
|
|
auto time2 = timer::clock::now(); |
|
|
printf("sequential version\t%f\n", timer::from_dur(time2 - time1)); |
|
|
printf("sequential version\t%f\n", timer::from_dur(time2 - time1)); |
|
|
} |
|
|
} |
|
@ -258,22 +298,61 @@ static int compare_results(std::vector<double> const & lh, std::vector<double> c |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int main(int argc, char** argv){ |
|
|
int main(int argc, char** argv){ |
|
|
P = 2; |
|
|
bsp::init(par_wavelet_2D, argc, argv); |
|
|
H = 1024; |
|
|
namespace po = boost::program_options; |
|
|
W = 1024; |
|
|
|
|
|
|
|
|
// Describe program options
|
|
|
|
|
|
po::options_description opts; |
|
|
|
|
|
opts.add_options() |
|
|
|
|
|
("p", po::value<unsigned int>(), "number of processors") |
|
|
|
|
|
("w", po::value<unsigned int>(), "width of image") |
|
|
|
|
|
("h", po::value<unsigned int>(), "height of image") |
|
|
|
|
|
("m", po::value<unsigned int>()->default_value(1), "the variable m") |
|
|
|
|
|
("help", po::bool_switch(), "show this help") |
|
|
|
|
|
("show-input", po::bool_switch(), "shows the given input") |
|
|
|
|
|
("seq", po::bool_switch(), "also runs the sequential algorithm") |
|
|
|
|
|
("check", po::bool_switch(), "enables correctness checks"); |
|
|
|
|
|
po::variables_map vm; |
|
|
|
|
|
|
|
|
|
|
|
// Parse and set options
|
|
|
|
|
|
try { |
|
|
|
|
|
po::store(po::parse_command_line(argc, argv, opts), vm); |
|
|
|
|
|
po::notify(vm); |
|
|
|
|
|
|
|
|
|
|
|
if(vm["help"].as<bool>()){ |
|
|
|
|
|
std::cout << "Parallel wavelet mockup" << std::endl; |
|
|
|
|
|
std::cout << opts << std::endl; |
|
|
|
|
|
return 0; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
data.assign(W*H, 0.0); |
|
|
globals.P = vm["p"].as<unsigned int>(); |
|
|
for(unsigned int y = 0; y < H; ++y) |
|
|
globals.W = vm["w"].as<unsigned int>(); |
|
|
for(unsigned int x = 0; x < W; ++x) |
|
|
globals.H = vm["h"].as<unsigned int>(); |
|
|
data[x + W*y] = x*y; |
|
|
globals.M = vm["m"].as<unsigned int>(); |
|
|
seqr.assign(W*H, 0.0); |
|
|
globals.check_results = vm["check"].as<bool>(); |
|
|
parr.assign(W*H, 0.0); |
|
|
|
|
|
|
|
|
if(!is_pow_of_two(globals.P)) throw po::error("p is not a power of two"); |
|
|
|
|
|
if(!is_pow_of_two(globals.W)) throw po::error("w is not a power of two"); |
|
|
|
|
|
if(!is_pow_of_two(globals.H)) throw po::error("h is not a power of two"); |
|
|
|
|
|
} catch(std::exception& e){ |
|
|
|
|
|
std::cout << colors::red("ERROR: ") << e.what() << std::endl; |
|
|
|
|
|
std::cout << opts << std::endl; |
|
|
|
|
|
return 1; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if(vm["show-input"].as<bool>()){ |
|
|
|
|
|
std::cout << "w\t" << globals.W << "h\t" << globals.H << "\np\t" << globals.P << "\nm\t" << globals.M << std::endl; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
seqr.assign(globals.W*globals.H, 0.0); |
|
|
|
|
|
parr.assign(globals.W*globals.H, 0.0); |
|
|
|
|
|
|
|
|
bsp::init(par_wavelet_2D, argc, argv); |
|
|
|
|
|
par_wavelet_2D(); |
|
|
par_wavelet_2D(); |
|
|
seq_wavelet_2D(); |
|
|
seq_wavelet_2D(); |
|
|
|
|
|
|
|
|
|
|
|
if(globals.check_results){ |
|
|
double threshold = 1.0e-8; |
|
|
double threshold = 1.0e-8; |
|
|
std::cout << "Checking results "; |
|
|
std::cout << "Checking results "; |
|
|
compare_results(seqr, parr, threshold); |
|
|
compare_results(seqr, parr, threshold); |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|