Commit c60a6b3d authored by Paal Kvamme's avatar Paal Kvamme
Browse files

Merge branch 'kvamme62/features-and-tests' into 'master'

Correctly flag a ZGY file as v3 or v4

See merge request !78
parents 59ad4229 6da89693
Pipeline #50205 passed with stages
in 9 minutes and 7 seconds
......@@ -857,30 +857,11 @@ public:
// it is obviously not possible to read everything up front.
_accessor_rw.reset(new InternalZGY::ZgyInternalBulk(_fd, _meta_rw, _meta_rw, compress));
// If low resolution data is not available, mark as dirty because
// even if there has been no change to the bulk data the code should
// by default try to create lods and statistics on close.
if (_meta->ih().nlods() !=
InternalZGY::LookupTable::usableBrickLOD
(this->_meta->ih().lodsizes(),
this->_meta->ih().brickoffsets(),
this->_meta->blup().lup(),
this->_meta->blup().lupend()))
this->_dirty = true;
// If statistics are missing this also indicates that genlod needs
// to be called. Without this extra check the statistics for a
// tiny 1-brick file might not be updated. Because such a file
// always has all its (zero) lowres bricks.
if (this->_meta->ih().scnt() == 0)
this->_dirty = true;
// Also if the histogram is missing everything must be rebuilt.
// The converse is not true. The histogram range may be set
already even if finalize was not run. With an incremental update
// the histogram cannot change. With a full rebuild it can grow.
const InternalZGY::IHistHeaderAccess& hh = this->_meta->hh();
if (hh.samplecount() == 0 || hh.minvalue() > hh.maxvalue())
// If low resolution data, statistics, or histogram are not
// available, mark as dirty because even if there has been no
// change to the bulk data the code should by default try to
// create lods and statistics on close.
if (!_meta->has_finalized_feature())
this->_dirty = true;
// Consistency checks: Only files uploaded by OpenZGY can be updated.
......@@ -1156,7 +1137,8 @@ public:
// If this is the first write then these steps are no-ops
// 1. Clear the statistics because it is out of date.
this->_meta_rw->ih().setstats(0,0,0,0,0);
constexpr double inf{std::numeric_limits<double>::infinity()};
this->_meta_rw->ih().setstats(0,0,0,inf,-inf);
// 2. Clear the histogram, except that we have already started
// deciding the histogram range to use as the range of all
......@@ -1191,15 +1173,7 @@ public:
//std::cout << "Was nlods " << nl1 << " code " << code1
// << " is nlods " << nl2 << " code " << code2 << std::endl;
// 4. Change the version number to 4, to signify that the old
// ZGY reader should no try to access this file. Low resolution
// bricks would appear to be present but have all zero samples.
// TODO-Low: Enhancement: With update support, when a file is
// later opened and properly finalized it may be possible to
// change the version number back to 3. Issue goes away when
// we upgrade the ZGY file format.
_meta_rw->fh().set_version
(std::max(_meta_rw->fh().version(), (std::uint32_t)4));
// 4. Change the version number to 4 is now done in _close_internal().
}
/**
......@@ -1286,8 +1260,7 @@ public:
// this and many similar places need to be updated.
this->_meta_rw->ih().setstats
(scaled_stats.getcnt(), scaled_stats.getsum(), scaled_stats.getssq(),
static_cast<float>(scaled_stats.getmin()),
static_cast<float>(scaled_stats.getmax()));
scaled_stats.getmin(), scaled_stats.getmax());
this->_meta_rw->hh().sethisto(scaled_histo.getmin(), scaled_histo.getmax(),
scaled_histo.getbins(), scaled_histo.getsize());
if (false)
......@@ -1453,6 +1426,9 @@ public:
if (!this->_fd || !this->_accessor_rw || !this->_meta_rw)
return; // looks like _close_internal already called.
// Prevent ZGY-Public from opening the file if appropriate.
this->_meta_rw->fh().set_version(_meta_rw->can_old_library_read() ? 3 : 4);
if (!this->errorflag()) {
this->_meta_rw->flushMeta(this->_fd);
}
......
......@@ -531,7 +531,7 @@ ZgyInternalBulk::ZgyInternalBulk(
if (metadata_rw) {
const IHistHeaderAccess& hh = metadata_rw->hh();
const IInfoHeaderAccess& ih = metadata_rw->ih();
if (hh.minvalue() >= hh.maxvalue()) {
if (hh.minvalue() <= hh.maxvalue()) {
if (hh.samplecount() != 0 || hh.minvalue() != 0 || hh.maxvalue() != 0) {
if (ih.datatype() == RawDataType::Float32) {
if (_logger(1))
......@@ -807,14 +807,6 @@ ZgyInternalBulk::readToNewBuffer(
*
* This method must be nonvirtual because it is also called from the
* constructor.
*
* The test for pre-existing low resolution bricks attempted here is
* somewhat paranoid. Checking statistics and histogram should have
* been sufficient because if called from the constructor then
* ZgyInternalMeta::initFromReopen() will have cleared the statistics
* if lowres was missing. If called from finalize() the caller knows
* whether lowres exists or not and if not should not have asked us to
* turn it on.
*/
void
ZgyInternalBulk::trackedBricksTryEnable(bool on)
......@@ -826,17 +818,9 @@ ZgyInternalBulk::trackedBricksTryEnable(bool on)
_modified_histo.reset();
}
else if (on) {
const IHistHeaderAccess& hh = _metadata_rw->hh();
const IInfoHeaderAccess& ih = _metadata_rw->ih();
const std::int64_t nlods = static_cast<std::int64_t>(ih.lodsizes().size());
const std::int64_t havelods = LookupTable::usableBrickLOD
(ih.lodsizes(), ih.brickoffsets(),
_metadata_rw->blup().lup(), _metadata_rw->blup().lupend());
const bool complete = (havelods == nlods &&
hh.samplecount() != 0 &&
hh.minvalue() <= hh.maxvalue() &&
ih.scnt() != 0);
if (complete) {
if (_metadata_rw->can_finalize_incremental()) {
const IHistHeaderAccess& hh = _metadata_rw->hh();
const IInfoHeaderAccess& ih = _metadata_rw->ih();
_modified_bricks.resize(ih.brickoffsets().back(), 0);
_modified_stats.reset(new StatisticData
(ih.scnt(), /*inf=*/0, ih.ssum(), ih.sssq(),
......
......@@ -484,6 +484,21 @@ LookupTable::usableBrickLOD(
(std::int32_t)lodsizes.size();
}
bool
LookupTable::hasBrickCompression(
const std::vector<std::uint64_t>& blup,
const std::vector<std::uint64_t>& bend)
{
constexpr std::int64_t bytesperbrick{1}; // Bogus, we only care about type.
for (std::size_t ix = 0; ix < blup.size(); ++ix) {
LookupTable::LutInfo info =
LookupTable::getBrickFilePositionFromIndex(ix, blup, bend, bytesperbrick);
if (info.status == BrickStatus::Compressed)
return true;
}
return false;
}
/**
* \brief Logically erase all low resolution bricks.
*
......
......@@ -157,6 +157,10 @@ public:
const std::vector<std::uint64_t>& blup,
const std::vector<std::uint64_t>& bend);
static bool hasBrickCompression(
const std::vector<std::uint64_t>& blup,
const std::vector<std::uint64_t>& bend);
static void setNoBrickLOD(
const std::vector<std::array<std::int64_t,3>>& lodsizes,
const std::vector<std::int64_t>& brickoffsets,
......
......@@ -861,7 +861,7 @@ public:
virtual const std::vector<std::int64_t>& brickoffsets() const override { return _cached_brickoffsets; }
// Write support.
virtual void setstats(std::int64_t scnt, double ssum, double sssq,
float smin, float smax)
double smin, double smax)
{
throw OpenZGY::Errors::ZgyInternalError("Writing InfoHeader is only supported for the latest version.");
}
......@@ -1015,13 +1015,14 @@ public:
virtual const std::vector<std::int64_t>& alphaoffsets() const override { return _cached_alphaoffsets; }
virtual const std::vector<std::int64_t>& brickoffsets() const override { return _cached_brickoffsets; }
virtual void setstats(std::int64_t scnt, double ssum, double sssq,
float smin, float smax)
double smin, double smax)
{
_pod._scnt = scnt;
_pod._ssum = ssum;
_pod._sssq = sssq;
_pod._smin = smin;
_pod._smax = smax;
// TODO-Low: After the next ZGY version the casts must be removed.
_pod._smin = static_cast<float>(smin);
_pod._smax = static_cast<float>(smax);
}
};
......@@ -2233,14 +2234,7 @@ ZgyInternalMeta::initFromReopen(const ZgyInternalWriterArgs& args_in, bool compr
ih.gpiline(), ih.gpxline(),
ih.gpx(), ih.gpy());
const std::int64_t nlods = static_cast<std::int64_t>(ih.lodsizes().size());
const std::int64_t havelods = LookupTable::usableBrickLOD
(ih.lodsizes(), ih.brickoffsets(),
this->_blup->lup(), this->_blup->lupend());
const bool complete = (havelods == nlods &&
this->_hh->samplecount() != 0 &&
ih.scnt() != 0);
if (!complete) {
if (!has_finalized_feature()) {
// Clear histogram, statistics, and low resolution data because
// everything will be rebuilt on finalize. The only thing kept
// is the previous histogram range. See ZgyInternalBulk ctor.
......@@ -2267,32 +2261,24 @@ ZgyInternalMeta::initFromReopen(const ZgyInternalWriterArgs& args_in, bool compr
&this->_blup->lup(), &this->_blup->lupend());
}
if (complete && compress) {
// Disallow opening a finalized file, even if uncompressed, if the
// second open specifies compression. Because we must assume that
// the application is going to finalize in this session and that
// would not be allowed. To be really pedantic I could allow the
// compressor to be set as long as the lodcompressor is not,
// but this is getting ridiculous.
throw OpenZGY::Errors::ZgyUserError
("A finalized file cannot have compressed data appended.");
}
if (complete) {
// Disallow any write of a file that has been finalized with compressed
// low resolution bricks. Because with current rules you won't be
// allowed to re-finalize it. That counts as an update. Need to catch
// this error early. If the application is allowed to start writing but
// not finalize at the end then the file would essentially be corrupted.
const std::vector<std::uint64_t>& blup = this->_blup->lup();
const std::vector<std::uint64_t>& bend = this->_blup->lupend();
const std::int64_t perbrick = ih.bytesperbrick();
for (std::size_t ii=0; ii<blup.size(); ++ii) {
if (LookupTable::getBrickFilePositionFromIndex
(ii, blup, bend, perbrick).status == BrickStatus::Compressed)
throw OpenZGY::Errors::ZgyUserError
("A finalized compressed file cannot be opened for update.");
}
if (!can_append_bulk(compress)) {
// Disallow opening a finalized file if it was or will be compressed.
// Because we must assume that the application is going to finalize
// in this session and that would not be allowed, and the attempt
// would leave the file corrupted. To be really pedantic I could
// allow the compressor to be set as long as the lodcompressor is
// not, but this is getting ridiculous.
// TODO-@@@-Medium: It should still be possible to open a compressed
// file for the sole purpose of changing annotation and world coords.
// That would not require a finalize. So ideally the test should be
// deferred to the first write. But there are other problems such as
// also deferring the re-open segment.
if (compress)
throw OpenZGY::Errors::ZgyUserError
("A finalized file cannot have compressed data appended.");
else
throw OpenZGY::Errors::ZgyUserError
("A finalized compressed file cannot be opened for update.");
}
}
......@@ -2395,4 +2381,196 @@ ZgyInternalMeta::dump(std::ostream& out, const std::string& prefix)
if (_blup) _blup->dump(out, prefix + " ");
}
/*
* Caveat: Adding bloat to this class. Find a better design?
*
* The has_xxx_feature() and can_xxx() tests will respectively
* collect information about the file and inform the caller about
* what is allowed.
*
* Initially there were just "was/is compressed" and "was finalized"
* checks but things get more complicated when a file can be finalized
* with respect to low resolution data but not statistics or vice
* versa. Or the file may be small enough to not even have low
* resolution bricks.
*
* The tests that determine the version number are similar to the tests
* that decide whether incremental builds are allowed and whether a
* compressed file may be re-opened.
*
* Why does this matter?
*
* Petrel creates an empty file that will be written to later. This
* file must not be finalized because that would prevent it from being
* later written with compressed data. And if the finalize was done
* with compression it would prevent even uncompressed writes. If no
* compression at all is involved then a finalize at this point is
* "only" a serious performance issue.
*/
/**
 * \brief Statistics and histogram are good.
 *
 * \details
 * Good statistics and histogram have both sane limits (min<=max) and
 * nonzero counts. Note a few implementation details: An empty
 * histogram might have limits (0,0) if never updated, and it might
 * have a sane range coupled with a zero count because even if the
 * file didn't get finalized it might need to know what the range
 * would have been.
 */
bool
ZgyInternalMeta::has_stathist_feature() const
{
  if (_hh->samplecount() == 0)
    return false;
  if (_ih->scnt() == 0)
    return false;
  return _ih->smin() <= _ih->smax();
}
/**
 * \brief Entire file is just a single brick.
 *
 * \details
 * A survey small enough to fit inside a single brick triggers several
 * corner cases, such as never having any low resolution bricks.
 */
bool
ZgyInternalMeta::has_tiny_feature() const
{
  // A single lod level means the full resolution data is all there is.
  return _ih->lodsizes().size() == 1;
}
/**
 * \brief Has low resolution. Always false for single-brick files.
 *
 * \details
 * Returns true if finalize has been run and low resolution bricks
 * have been stored. For tiny files where all samples fit into a
 * single brick, i.e. has_tiny_feature() == true, this will return
 * false because the file still won't have (and won't need) lowres.
 */
bool
ZgyInternalMeta::has_lowres_feature() const
{
  const std::int64_t wanted =
    static_cast<std::int64_t>(_ih->lodsizes().size());
  const std::int64_t actual = LookupTable::usableBrickLOD
    (_ih->lodsizes(), _ih->brickoffsets(), _blup->lup(), _blup->lupend());
  return actual == wanted && wanted > 1;
}
/**
* \brief Contains compressed bricks.
*
* \details
* TODO-@@@-Low: Performance: Might need to cache the result.
* In that case, marking the cache dirty later on is a hassle.
*/
bool
ZgyInternalMeta::has_compression_feature() const
{
return InternalZGY::LookupTable::hasBrickCompression
(_blup->lup(), _blup->lupend());
}
/**
 * \brief File is fully finalized.
 *
 * \details
 * Both low resolution bricks (if needed), statistics, and histogram
 * are present. If this test fails we might as well remove all the
 * results of a previous finalize.
 */
bool
ZgyInternalMeta::has_finalized_feature() const
{
  if (!has_stathist_feature())
    return false;
  // Tiny (single brick) files never have or need lowres bricks.
  return has_tiny_feature() || has_lowres_feature();
}
/**
 * \brief Readable by the old library.
 *
 * \details
 * When OpenZGY saves a file that isn't finalized then it is flagged
 * as v4 because the old reader cannot handle that. The same happens
 * if the file contains compressed data.
 *
 * If histogram or statistics are missing due to the file not being
 * finalized then this does not in itself prevent the old library
 * from using it. It is assumed that application code will react
 * properly when it sees a range with min>max or count==0.
 *
 * A tiny uncompressed file that was not finalized will still be
 * usable by ZGY-Public because it never has lowres.
 *
 * The version number (3 or 4) should reflect the current state of the
 * file, not its history. So e.g. a file that was v4 only due to
 * missing low resolution data should be changed to v3 when finalized.
 * The test thus needs to be done right before closing the file.
 */
bool
ZgyInternalMeta::can_old_library_read() const
{
  // Any compressed brick immediately rules out the old reader.
  if (has_compression_feature())
    return false;
  // Otherwise lowres must be present, or not needed at all.
  return has_tiny_feature() || has_lowres_feature();
}
/**
 * \brief Eligible for incremental finalize.
 *
 * \details
 * The test is similar to can_old_library_read(). If low resolution data
 * is missing then not only can't the file be opened by old zgy, it can't
 * be incrementally finalized either because there is no starting point.
 * The test here is stricter because even if only statistics and/or
 * histogram are bad then incremental finalize is still disallowed.
 *
 * The user can still elect to do a full finalize. In fact, that is
 * currently the default.
 *
 * Compressed files can't be finalized more than once. Nor can
 * uncompressed files if the latest reopen asked for compression.
 */
bool
ZgyInternalMeta::can_finalize_incremental() const
{
  if (has_compression_feature())
    return false;
  return has_finalized_feature();
}
/**
 * \brief Allowed to append (not necessarily update) bulk data.
 *
 * \details
 * Compressed and finalized files are not allowed to have data appended
 * to them. Not because the write itself would be a problem, but because
 * re-finalizing the file counts as an update, not an append, so that
 * would not be allowed.
 *
 * In the special single-brick case, appending is technically allowed
 * even for compressed finalized data because no low resolution bricks
 * need to be updated. This is of academic interest only (or it might
 * change the wording of error messages) because a compressed file
 * must have at least one brick of real data. And since the entire
 * file is just one brick, there are no more empty bricks. So there
 * is no space to append anything anyway.
 *
 * If the file is currently uncompressed but this current session will
 * be adding compressed data then it still needs to be considered to
 * be compressed. The caller needs to provide the will_compress flag
 * because only the accessor knows what it is planning to do.
 *
 * Caveat: Ideally the test should be made on the first write and not
 * the file reopen, because maybe the application just wanted to update
 * the metadata. Which ought to still be allowed. But that special case
 * also triggers other issues. How to prevent (for cloud access) the
 * last segment from being re-opened needlessly.
 */
bool
ZgyInternalMeta::can_append_bulk(bool will_compress) const
{
  // With no compression anywhere, past or planned, appending is fine.
  if (!has_compression_feature() && !will_compress)
    return true;
  // Compression is or will be involved. Only allowed when the file is
  // not finalized with lowres data (tiny files never have lowres).
  return has_tiny_feature() || !has_lowres_feature();
}
} // namespace
......@@ -333,7 +333,7 @@ public:
virtual double defaultvalue() const = 0;
// Write support.
virtual void setstats(std::int64_t scnt, double ssum, double sssq,
float smin, float smax) = 0;
double smin, double smax) = 0;
};
/////////////////////////////////////////////////////////////////////////////
......@@ -475,6 +475,22 @@ public:
return old;
}
std::int64_t flushMeta(const std::shared_ptr<FileADT>& file);
private:
// Caveat: Adding bloat to this class. Find a better design?
// Add methods to get derived information from one or more of the headers.
// Add methods that essentially forward the request to the static class
// LookupTable. Being static, all context needed by LookupTable must be
// passed in every call. And most of what it needs comes from our headers.
bool has_stathist_feature() const; // Statistics and histogram are good.
bool has_tiny_feature() const; // Entire file is just a single brick.
bool has_lowres_feature() const; // Has low resolution. False if tiny.
bool has_compression_feature() const; // Contains compressed bricks.
public:
bool has_finalized_feature() const; // Fully finalized.
bool can_old_library_read() const; // Readable by the old library.
bool can_finalize_incremental() const; // Eligible for incremental finalize.
bool can_append_bulk(bool) const; // Allowed to append bulk data.
private:
bool _logger(int priority, const std::string& ss = std::string()) const;
bool _logger(int priority, const std::ios& ss) const;
......
......@@ -713,8 +713,8 @@ do_test_finalize(int mode)
TEST_CHECK(stat.cnt == 0);
TEST_CHECK(stat.sum == 0);
TEST_CHECK(stat.ssq == 0);
TEST_CHECK(stat.min == 0);
TEST_CHECK(stat.max == 0);
TEST_CHECK(stat.min == std::numeric_limits<double>::infinity());
TEST_CHECK(stat.max == -std::numeric_limits<double>::infinity());
TEST_CHECK(hist.samplecount == 0);
TEST_CHECK_(hist.minvalue == 42, "minvalue expect 42 got %lg", hist.minvalue);
TEST_CHECK_(hist.maxvalue == 42, "maxvalue expect 42 got %lg", hist.maxvalue);
......@@ -1171,7 +1171,7 @@ test_compress_noop()
std::shared_ptr<const FileStatistics> stats =
do_test_copy_slurp_8(filename, lad.name(), "Null");
TEST_CHECK(stats->fileVersion() != 3); // v4 set even if no actual compressed
TEST_CHECK(stats->fileVersion() == 3); // v4 only set if actual compressed
TEST_CHECK(stats->isCompressed() == false); // looked for compressed bricks.
TEST_CHECK(stats->fileSize() == 4*64*64*64*4); // header plus 3 normal bricks
TEST_CHECK(stats->brickNormalCount() == 3); // 1 lod0, 1 lod1, 1 lod2
......
......@@ -315,6 +315,8 @@ do_test_reopen(const std::string& filename, TestTwiceFlags flags)
return (flags & f) != TestTwiceFlags::nothing;
};
constexpr double inf{std::numeric_limits<double>::infinity()};
// If not finalized on the last close there will be no lowres, stats, histo.
const int expect_nlods =
(flagset(TestTwiceFlags::step2_finalize) ? 5 :
......@@ -413,7 +415,7 @@ do_test_reopen(const std::string& filename, TestTwiceFlags flags)
(expect_15 * -15 * -15.0) +
(expect_10 * -10 * -10.0);
const double expect_stat_min = !expect_finalized ? 0.0 :
const double expect_stat_min = !expect_finalized ? inf :
(expect_15 + expect_10 == 0 ? 0.0 : -15.0);
// Note the quirk with incremental finalize. If there were ever
......@@ -421,7 +423,7 @@ do_test_reopen(const std::string& filename, TestTwiceFlags flags)
// range even if those samples were removed later.
// Unless of course the caller asked for an incremental finalize
// but didn't get it because step 1 did not finalize.
const double expect_stat_max = !expect_finalized ? 0 :
const double expect_stat_max = !expect_finalized ? -inf :
(flagset(TestTwiceFlags::step1_write) &&
flagset(TestTwiceFlags::step1_finalize) &&
flagset(TestTwiceFlags::step2_replace) &&
......@@ -1741,6 +1743,318 @@ test_reopen_rwlod()
#endif
}
/**
* The short version: When OpenZGY saves a file that isn't finalized
* then it is flagged as v4 because the old reader cannot handle that.
* Just like compressed data will do. Also there will be no
* incremental finalize on the next close since there is no starting
* point. On the positive side, compressed files that are not
* finalized can still be re-opened and appended to.
*
* The long version: The above is complicated by the fact that
* finalize does more than one thing so there exists a "halfway
* finalized" state where low resolution data is present and
* statistics are not. Or vice versa. Or low resolution is missing
* because it is not needed for tiny files.
*
* The version number (3 or 4) should reflect the current state of the
* file, not its history. So e.g. a file that was v4 only due to
* missing low resolution data should be changed to v3 when finalized.
* The tests that determine the version number are similar to the tests
* that decide whether incremental builds are allowed and whether a
* compressed file may be re-opened.
*
* Why does this matter?
*
* Petrel creates an empty file that will be written to later. This
* file must not be finalized because that would prevent it from being
* later written with compressed data. And if the finalize was done
* with compression it would prevent even uncompressed writes. If no
* compression at all is involved then a finalize at this point is
* "only" a serious performance issue.
*
* So:
* - Create empty file not finalized. It will be version 4.
* Except the special single-brick case which will be v3
* because there isn't any lowres to be generated.
* It doesn't matter whether compression is requested;
* the file will be uncompressed because it has no data.
*
* Features:
* (ZFP) Has compressed data.
* (LOD) Has low resolution data (N/A for single brick file).
* (S/H) Has statistics and histogram (empty histogram due to wrong range fails)
*
* Behavior depending on the above:
* - Mark as V4 if ZFP || !LOD.
* - Not allowed to reopen if ZFP && LOD.
* - Track changes for BuildIncremental if LOD && S/H
*
* Behavior for single-brick file which cannot have LOD:
* - Mark as V4 if ZFP
* - Not allowed to reopen if ZFP.
* - Never track changes.
*
* Notes:
* - Applications using the old readers are assumed to handle missing
* statistics and histogram correctly, so there is no need to flag
* as v4 only for that purpose.
*
* - A single brick file cannot have lowres, this lack will not trigger
* the v4 flag either.
*
* - Allowing BuildIncremental has a stricter check. Err on the side
* of forcing a BuildFull where an incremental might have sufficed.
* Here, missing S/H makes us assume the file is not finalized yet.
* A single brick file always fails this test because incremental
* builds would be pointless. (test: See if the statistics can shrink).
*
* - Reopen of a compressed file is not allowed if finalized because the
* low resolution bricks (just like the full resolution) can only be
* written once. Reopen of a compressed single brick file is allowed
* for consistency even though it would be pointless. Not due to the