Commit 6710b5b4 authored by Paal Kvamme's avatar Paal Kvamme
Browse files

Merge branch 'kvamme62/copysubset-shortcut' into 'master'

Add commented-out CopySubset shortcut

See merge request !82
parents 01e1055b b4abf40a
Pipeline #50229 passed with stages
in 14 minutes and 57 seconds
......@@ -28,6 +28,7 @@
#include <cmath>
#include <iostream>
#include <iomanip>
#include <atomic>
namespace InternalZGY {
#if 0
......@@ -114,6 +115,20 @@ namespace {
};
}
#if 0
/**
* The default for OPENZGY_COPYSUBSET_SHORTCUT is 0, i.e. off, because
* it hasn't been shown to give a measurable speedup and it
* hasn't been sufficiently tested.
*/
static int
copysubset_shortcut()
{
  // Function-local static: the environment variable is looked up only
  // the first time this function is called, and the resulting value is
  // reused by every later call. Falls back to 0 when the variable is
  // not set (0 is the default passed to getNumericEnv).
  static const int shortcut_level =
    Environment::getNumericEnv("OPENZGY_COPYSUBSET_SHORTCUT", 0);
  return shortcut_level;
}
#endif
/**
* Copy subset of one array into a subset of another.
*
......@@ -188,6 +203,78 @@ CopySubset(std::int32_t ndim,
cpysize = srcsize;
}
#if 0
static std::atomic<std::int64_t> totalcall{0}, fastcall{0};
++totalcall;
if (ndim==3 && copysubset_shortcut() > 1) {
static auto three= [](const std::int64_t *a) -> std::string {
if (!a)
return std::string("(nullptr)");
std::stringstream ss;
ss << "(" << a[0] << "," << a[1] << "," << a[2] << ")";
return ss.str();
};
if (totalcall > 0 && (totalcall % 1000) == 0 && copysubset_shortcut() > 1) {
std::cerr << "Fast CopySubset " << fastcall << "/" << totalcall
<< " " << three(srcsize) << " * " << sizeof(T)
<< " lockfree " << totalcall.is_lock_free()
<< std::endl;
}
if (copysubset_shortcut() > 2)
std::cerr << "CopySubset shortcut " << copysubset_shortcut()
<< " srcorig " << three(srcorig)
<< " srcsize " << three(srcsize)
<< " srcstride " << three(srcstride)
<< " dstorig " << three(dstorig)
<< " dstsize " << three(dstsize)
<< " dststride " << three(dststride)
<< " cpyorig " << three(cpyorig)
<< " cpysize " << three(cpysize)
<< std::endl;
}
// Do a short cut if the entire copy operation can be made
// using a single memcpy. The test is very specific and there
// are other cases where a single memcpy would have worked.
// But inside OpenZGY I believe this is the only one that
// matters. The code will be hit when reading or writing a
// single brick. NOTE that it might be better to test for that
// case at a higher level. In which case the test here can
// be removed. Initially the test needs to be here to debug
// a particular issue.
if (ndim == 3 &&
copysubset_shortcut() &&
// Source is C-Contiguous?
srcstride[2] == 1 &&
srcstride[1] == srcsize[0] &&
srcstride[0] == srcsize[0] * srcsize[1] &&
// Neither source nor destination has any offset?
srcorig[0] == 0 &&
srcorig[1] == 0 &&
srcorig[2] == 0 &&
dstorig[0] == 0 &&
dstorig[1] == 0 &&
dstorig[2] == 0 &&
// Same size?
srcsize[0] == dstsize[0] &&
srcsize[1] == dstsize[1] &&
srcsize[2] == dstsize[2] &&
// Same stride?
srcstride[1] == dststride[1] &&
srcstride[2] == dststride[2] &&
srcstride[0] == dststride[0] &&
// At this point, target must also be C-Contiguous.
// Not clipped by survey?
cpyorig[0] <= srcorig[0] && cpysize[0] >= srcsize[0] + cpyorig[0] &&
cpyorig[1] <= srcorig[1] && cpysize[1] >= srcsize[1] + cpyorig[1] &&
cpyorig[2] <= srcorig[2] && cpysize[2] >= srcsize[2] + cpyorig[2])
{
++fastcall;
memcpy(dstbuf, srcbuf, dstsize[0] * dstsize[1] * dstsize[2] * sizeof(T));
return;
}
#endif
// Calculate overlap range.
const std::int64_t beg = std::max(std::max(dstorig[0], srcorig[0]), cpyorig[0]);
const std::int64_t end = std::min(std::min(dstorig[0] + dstsize[0], srcorig[0] + srcsize[0]), cpyorig[0] + cpysize[0]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment