Commit 0957e76c authored by Paal Kvamme's avatar Paal Kvamme
Browse files

Speed up range() by using correct type in accumulator. Noticeable when writing...

Speed up range() by using correct type in accumulator. Noticeable when writing int8 or int16 files. Also remove a hand-optimization that didn't have much effect.
parent d60df6da
......@@ -487,6 +487,7 @@ ZgyInternalBulk::readToExistingBuffer(
// case the request did in fact include such samples we will
// initialize the entire result buffer to the default value.
// TODO-Performance can I refactor the code so this isn't needed?
// Or can I make it fill only the actual padding area?
result->fill(as_float ? defaultvalue : defaultstorage);
std::vector<ReadRequest> requests;
......@@ -1639,7 +1640,7 @@ ZgyInternalBulk::_writeAlignedRegion(
std::vector<std::shared_ptr<const WriteBrickArgPack>> const_queue(worksize);
std::vector<std::shared_ptr<const WriteNowArgPack>> normal_queue(worksize);
std::atomic<int> errors(0);
#pragma omp parallel for if(enable_compress_mt())
#pragma omp parallel for if(enable_compress_mt() && worksize > 1)
for (std::int64_t ix = 0; ix < static_cast<std::int64_t>(worksize); ++ix) {
try {
const index3_t surveypos = work[ix]; // user's start i0,j0,k0 rounded down
......@@ -1647,6 +1648,7 @@ ZgyInternalBulk::_writeAlignedRegion(
std::shared_ptr<DataBuffer> brick = constbrick;
if (!brick) {
brick = DataBuffer::makeDataBuffer3d(nullptr, 0, bs, data->datatype());
// TODO-Performance, any way of avoiding the fill()?
brick->fill(defaultstorage);
brick->copyFrom(data.get(), // source
start.data(), surveypos.data(), // in survey coords
......@@ -1834,9 +1836,10 @@ ZgyInternalBulk::writeRegion(
// Or somehow replace all padding samples with defaultvalue.
// The range is in storage values. So e.g. for int8 it will be no
// more than [-128..+127].
// TODO-Performance: Add an OpenMP loop here or in data->range()
// since the caller will always be single threaded. Ideally a new
// rangeMT() should work whether the buffer is contiguous or not.
// Note that range() can be expensive. I have seen it take 5% of
// total time in a copy application. But a single call to range()
// is often fast enough to not warrant an OpenMP loop. See range()
// for more details.
std::pair<double, double> minmax = data->range();
if (minmax.first <= minmax.second) {
......
......@@ -816,37 +816,52 @@ DataBufferNd<T,NDim>::clear()
_data.reset();
}
/**
* \brief Find lowest and highest sample excluding +/- Inf and NaN.
*
 * \details The OpenMP directive is commented out because it might
* unfortunately turn out to run slower. The inner loop is very
* inexpensive which means the code is sensitive to how much overhead
* OpenMP will add. I have made real-world measurements but only in
* one hardware / OS / compiler environment and only for float input.
* I observed that 1 million elements was the point where
* multi-threading barely started to have a benefit. In a typical
 * scenario I don't expect to see more than 5 times that number. That
* is a typical size of one brick-column. With 5 million floats the
* task ran 3.5 times faster using 16 threads. With a less capable
* OpenMP installation it might easily have run slower. It could
* potentially run a *lot* slower. In my test, range() accounted for
* 5% of total elapsed time. So I am, weighing a potential 5%
* performance improvement against the risk of slowing down. I am
* chicken. I hope I can instead parallelize this at a higher level.
* Making this issue moot.
*/
template <typename T, int NDim>
std::pair<double, double>
DataBufferNd<T,NDim>::range() const
{
SimpleTimer tt(AdHocTimers::instance().range, timers_on());
double min = std::numeric_limits<double>::infinity();
double max = -std::numeric_limits<double>::infinity();
T min = std::numeric_limits<T>::max();
T max = std::numeric_limits<T>::lowest();
if (isScalar()) {
if (IsFiniteT(scalarAsDouble()))
min = max = scalarAsDouble();
if (IsFiniteT(scalarValue()))
min = max = scalarValue();
}
else if (contiguous()) {
// Short cut: No holes. Treat as 1d.
const T *ptr = data();
const T *end = ptr + allocsize();
for (; ptr < end; ++ptr) {
if (IsFiniteT(*ptr)) {
min = max = *ptr;
break;
}
}
// Due to the loop above I can skip the check for min > max
for (; ptr < end; ++ptr) {
const T value = *ptr;
const T * const ptr = data();
const std::int64_t totalsize = allocsize();
//#pragma omp parallel for reduction(min: min) reduction(max: max) if(totalsize >= 1*1024*1024)
for (std::int64_t ii=0; ii<totalsize; ++ii) {
const T value = ptr[ii];
if (!IsFiniteT(value)) continue;
else if (min > value) min = value;
else if (max < value) max = value;
if (min > value) min = value;
if (max < value) max = value;
}
}
else {
const T * const ptr = data();
//#pragma omp parallel for -- NOT TESTED YET
for (int ii=0; ii<_size[0]; ++ii) {
for (int jj=0; jj<_size[1]; ++jj) {
for (int kk=0; kk<_size[2]; ++kk) {
......@@ -859,7 +874,7 @@ DataBufferNd<T,NDim>::range() const
}
}
}
return std::pair<double, double>(min, max);
return std::make_pair(static_cast<double>(min), static_cast<double>(max));
}
template <typename T, int NDim>
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment