Commit fc1e5de7 authored by Paal Kvamme's avatar Paal Kvamme
Browse files

Merge branch 'kvamme62/filestats' into 'master'

Allow application to retrieve compression factor and similar statistics.

See merge request !50
parents db0a7844 203fb19c
Pipeline #30548 passed with stages
in 10 minutes and 22 seconds
......@@ -299,6 +299,132 @@ public:
std::vector<std::int64_t>(hh.bins(),
hh.bins() + hh.bincount()));
}
/**
 * \brief Collect per-entry usage and compression statistics for this file.
 * \details
 * Walks every alpha tile and data brick position, for every LOD level,
 * classifying each entry as Missing, Constant, Normal, or Compressed and
 * accumulating counts and sizes into a FileStatistics value. Derived
 * numbers (used size, compression factor, ...) are pre-computed here so
 * the public FileStatistics class can stay a plain data holder.
 * Note: _file_size is NOT set here; the overrides in ZgyReader and
 * ZgyWriter fill it in because only they can see the file descriptor.
 */
virtual FileStatistics filestats() const override
{
  using InternalZGY::LookupTable;
  using InternalZGY::BrickStatus;
  // On-disk size of one uncompressed alpha tile resp. data brick.
  const std::int64_t bytesperalpha = _meta->ih().bytesperalpha();
  const std::int64_t bytesperbrick = _meta->ih().bytesperbrick();
  // Raw lookup tables: alpha tile offsets, brick offsets, brick end offsets.
  const std::vector<std::uint64_t>& alup = _meta->alup().lup();
  const std::vector<std::uint64_t>& blup = _meta->blup().lup();
  const std::vector<std::uint64_t>& bend = _meta->blup().lupend();
  FileStatistics result;
  result._file_version = _meta->fh().version();
  result._alpha_normal_size_per_entry = bytesperalpha;
  result._brick_normal_size_per_entry = bytesperbrick;
  // result._file_size = _fd->xx_size(); Available in ZgyReader and ZgyWriter only.
  // TODO-Low: Fix this kludge.
  // I happen to know that in V3 and V4 the headers are all stored
  // consecutively and the brick lookup table comes last.
  result._header_size = _meta->oh().bricklupoff() + _meta->oh().bricklupsize();
  const std::vector<std::array<std::int64_t,3>>& lodsizes = _meta->ih().lodsizes();
  const std::vector<std::int64_t>& alphaoffsets = _meta->ih().alphaoffsets();
  const std::vector<std::int64_t>& brickoffsets = _meta->ih().brickoffsets();
  // This had been much simpler if I made a getBrickFilePositionByIndex.
  // And possibly moving part of this code inside class LookupTable.
  // I only need to iterate over the contents of alup, blup, bend
  // but here I need to iterate over brick position and let class
  // LookupTable convert that back to a raw index.
  // "size" starts at the full-resolution survey size and is halved
  // (rounded up) per LOD at the bottom of the outer loop.
  std::array<std::int64_t,3> size = _meta->ih().size();
  const std::array<std::int64_t,3> bs = _meta->ih().bricksize();
  for (std::int64_t lod= 0; lod < _meta->ih().nlods(); ++lod) {
    for (std::int64_t ii = 0; ii < size[0]; ii += bs[0]) {
      for (std::int64_t jj = 0; jj < size[1]; jj += bs[1]) {
        // One alpha tile per (i,j) column at each LOD; classify it.
        LookupTable::LutInfo info =
          LookupTable::getAlphaFilePosition
          (ii/bs[0], jj/bs[1], lod,
           lodsizes,
           alphaoffsets, alup, /*aend,*/
           bytesperalpha);
        //std::cout<< zgydump_format(ii/bs[0], jj/bs[1], 0, lod, info) << "\n";
        switch (info.status) {
        case BrickStatus::Missing: result._alpha_missing_count += 1; break;
        case BrickStatus::Constant: result._alpha_constant_count += 1; break;
        case BrickStatus::Normal:
          result._alpha_normal_count += 1;
          break;
        case BrickStatus::Compressed:
          result._alpha_compressed_count += 1;
          result._alpha_compressed_size += info.size_in_file;
          break;
        }
        for (std::int64_t kk = 0; kk < size[2]; kk += bs[2]) {
          // Classify each data brick. NOTE: this "info" intentionally
          // shadows the alpha "info" declared above; both are local.
          LookupTable::LutInfo info =
            LookupTable::getBrickFilePosition
            (ii/bs[0], jj/bs[1], kk/bs[2], lod,
             lodsizes,
             brickoffsets, blup, bend,
             bytesperbrick);
          //std::cout << zgydump_format(ii/bs[0], jj/bs[1], kk/bs[2], lod, info) << "\n";
          switch (info.status) {
          case BrickStatus::Missing: result._brick_missing_count += 1; break;
          case BrickStatus::Constant: result._brick_constant_count += 1; break;
          case BrickStatus::Normal:
            result._brick_normal_count += 1; break;
          case BrickStatus::Compressed:
            result._brick_compressed_count += 1;
            result._brick_compressed_size += info.size_in_file;
            break;
          }
        }
      }
    }
    // Shrink to the survey size of the next (coarser) LOD level.
    size[0] = (size[0]+1)/2;
    size[1] = (size[1]+1)/2;
    size[2] = (size[2]+1)/2;
  }
  // TODO-Low: Keep track of wasted_size and padding_size.
  // Padding gets added in ZgyInternalMeta::flushMeta(). I need to
  // replicate the logic here. The alternative is to scan for the
  // lowest brick offset. But even that isn't completely reliable
  // because there might be wasted blocks between end of padding and
  // start of first block. And, do I really care at all?
  //result._padding_size = roundup(result._header_size,
  //                               result._brick_size_per_entry);
  //result._wasted_size = result._file_size - result._usedSize();
  // DERIVED INFORMATION:
  // The following could also have been generated on the fly in some
  // member function. I pre-calculate it here instead, to limit the
  // amount of code visible in the public api.h header file.
  // File size not including padding and holes.
  result._used_size =
    ((result._alpha_normal_count * result._alpha_normal_size_per_entry) + result._alpha_compressed_size +
     (result._brick_normal_count * result._brick_normal_size_per_entry) + result._brick_compressed_size +
     result._header_size);
  // As used_size if the file is/was uncompressed.
  result._used_if_uncompressed =
    (((result._alpha_normal_count + result._alpha_compressed_count) * result._alpha_normal_size_per_entry) +
     ((result._brick_normal_count + result._brick_compressed_count) * result._brick_normal_size_per_entry) +
     result._header_size);
  // Is there at least one brick flagged as compressed?
  result._is_compressed =
    (result._alpha_compressed_count + result._brick_compressed_count > 0);
  // Relative size of this possibly compressed file compared to uncompressed.
  result._compression_factor =
    (result._used_if_uncompressed > 0 ?
     result._used_size / (double)result._used_if_uncompressed :
     1);
  // Slightly different definition of compression factor.
  // Doesn't work because file_size not set yet.
  // Besides, I like the other one better.
  //result._compression_factor =
  //  (result._file_size > 0 ?
  //   result._used_size + (result._file_size-(result._used_if_uncompressed)) / (double)result._file_size:
  //   1);
  return result;
}
};
/**
......@@ -548,6 +674,17 @@ public:
}
// Metadata remains accessible. Not sure whether this is a good idea.
}
/**
 * \brief File statistics, with the physical file size filled in.
 * \details The ZgyMeta base class cannot see the file descriptor, so
 * this override exists only to add the actual on-storage size to the
 * statistics collected by the base implementation.
 */
virtual FileStatistics filestats() const override
{
  FileStatistics stats = ZgyMeta::filestats();
  stats._file_size = _fd->xx_eof();
  return stats;
}
};
/**
......@@ -964,6 +1101,17 @@ public:
_accessor_rw->set_errorflag(value);
_meta_rw->set_errorflag(value);
}
/**
 * \brief File statistics including the physical file size.
 * \details Delegates to ZgyMeta::filestats() for all the counting work,
 * then patches in the file size which only this class can obtain from
 * its file descriptor.
 */
virtual FileStatistics filestats() const override
{
  FileStatistics fs = ZgyMeta::filestats();
  fs._file_size = _fd->xx_eof();
  return fs;
}
};
/**
......
// Copyright 2017-2020, Schlumberger
// Copyright 2017-2021, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -135,6 +135,8 @@ class IOContext;
class IZgyReader;
namespace Impl {
class ZgyMeta;
class ZgyReader;
class ZgyWriter;
class EnumMapper;
}
......@@ -378,6 +380,194 @@ public:
}
};
/**
* \brief Return value from filestats().
* \details
*
* Meta information about the file that might be reported to a curious
* end user. Such as the compression factor that was achieved. Or whether
* this file was written by a legacy application that chose to store
* alpha tiles in the file. The information is not needed to consume
* the data stored in the file.
*
* This is a concrete class, not an interface. This is done to make
* the class a POD and ensure it is copyable. Note this means you
should not assign the filestats() result to a reference.
*
* Note that the reported compression is about file sizes, not the
* requested signal to noise factor. The latter would need to be
* saved explicitly in the file. Currently it isn't.
*
* Normally the only wasted size in a ZGY file is the padding between
the header area and the first brick. For a file with alpha tiles
(cannot currently be created by OpenZGY) there might be more data
* lost due to alignment. And files written to cloud storage in a way
* that updates the same brick more than once may end up with holes.
*
* In the statistics, the padding after the header area is captured
* in padding_size and all other holes are captured in wasted_size.
* Caveat: Leftover space after the end of a compressed block will
* currently not be detected.
*/
class OPENZGY_API FileStatistics
{
// The code that fills in the contents do so by accessing the data fields
// directly. There isn't much point in the additional boilerplace code
// to implement write accessors.
friend class Impl::ZgyMeta;
friend class Impl::ZgyReader;
friend class Impl::ZgyWriter;
private:
std::int64_t _file_version;
std::int64_t _file_size;
std::int64_t _header_size;
//std::int64_t _padding_size;
//std::int64_t _wasted_size;
std::int64_t _alpha_normal_count;
std::int64_t _alpha_normal_size_per_entry;
std::int64_t _alpha_compressed_count;
std::int64_t _alpha_compressed_size;
std::int64_t _alpha_missing_count;
std::int64_t _alpha_constant_count;
std::int64_t _brick_normal_count;
std::int64_t _brick_normal_size_per_entry;
std::int64_t _brick_compressed_count;
std::int64_t _brick_compressed_size;
std::int64_t _brick_missing_count;
std::int64_t _brick_constant_count;
// Derived information
std::int64_t _used_size;
std::int64_t _used_if_uncompressed;
double _compression_factor;
bool _is_compressed;
public:
FileStatistics()
: _file_version(0)
, _file_size(0)
, _header_size(0)
//, _padding_size(0)
//, _wasted_size(0)
, _alpha_normal_count(0)
, _alpha_normal_size_per_entry(0)
, _alpha_compressed_count(0)
, _alpha_compressed_size(0)
, _alpha_missing_count(0)
, _alpha_constant_count(0)
, _brick_normal_count(0)
, _brick_normal_size_per_entry(0)
, _brick_compressed_count(0)
, _brick_compressed_size(0)
, _brick_missing_count(0)
, _brick_constant_count(0)
, _used_size(0)
, _used_if_uncompressed(0)
, _compression_factor(1.0)
, _is_compressed(false)
{
}
/// Version number from the main file header.
std::int64_t fileVersion() const { return _file_version; }
/// Total size of the file on disk or cloud.
std::int64_t fileSize() const { return _file_size; }
/// Size of all headers.
std::int64_t headerSize() const { return _header_size; }
// Wasted due to first brick alignment.
//std::int64_t paddingSize() const { return _padding_size; }
// Wasted due to other reasons.
//std::int64_t wastedSize() const { return _wasted_size; }
/// Number of uncompressed tiles.
std::int64_t alphaNormalCount() const { return _alpha_normal_count; }
/// Size used by one uncompressed tile.
std::int64_t alphaNormalSizePerEntry() const { return _alpha_normal_size_per_entry; }
/// Number of compressed tiles.
std::int64_t alphaCompressedCount() const { return _alpha_compressed_count; }
/// Total size used by compressed tiles.
std::int64_t alphaCcompressedSize() const { return _alpha_compressed_size; }
/// Number of compressed tiles.
std::int64_t alphaMissingCount() const { return _alpha_missing_count; }
/// Number of constant value tiles.
std::int64_t alphaConstantCount() const { return _alpha_constant_count; }
/// Number of uncompressed bricks.
std::int64_t brickNormalCount() const { return _brick_normal_count; }
/// Size used by one uncompressed brick.
std::int64_t brickNormalSizePerEntry() const { return _brick_normal_size_per_entry; }
/// Number of compressed bricks.
std::int64_t brickCompressedCount() const { return _brick_compressed_count; }
/// Total size used by compressed bricks.
std::int64_t brickCompressedSize() const { return _brick_compressed_size; }
/// Number of compressed bricks.
std::int64_t brickMissingCount() const { return _brick_missing_count; }
/// Number of constant value bricks.
std::int64_t brickConstantCount() const { return _brick_constant_count; }
// Derived information
/**
* \brief Space used by headers and data bricks.
* \details
* File size not including padding and holes, combining all LOD
* levels and the main headers. The padding between the header area
* and the first brick will not be included. Nor will holes between
* uncompressed bricks contribute. Currently any holes between
* compressed bricks are not detected which means that they will be
* counted as used. This can be derived from the other information.
*/
std::int64_t usedSize() const { return _used_size; }
/**
* \brief Space needed if the file is/was uncompressed.
* \details
* As used_size if the file is/was uncompressed. This can be derived
* from the other information.
*/
std::int64_t usedIfUncompressed() const { return _used_if_uncompressed; }
/**
* \brief Measure how successful the compression was.
* \details
* Estimate the relative size of this possibly compressed file
* compared to the same file if uncompressed. Will be 1.0 if file is
* already uncompressed but a value of 1.0 doesn't technically imply
* that the file is not compressed. Padding is ignored so the result
* will not match precisely what you get by uncompressing the file
* and storing it on disk. Also not taken into account is that the
* padding after the header area might differ between the compressed
* and uncompressed formats.
*/
double compressionFactor() const { return _compression_factor; }
/**
* True if at least one brick is flagged as compressed, even in the
* unlikely case where the compression didn't actually reduce the
* file size. This can be derived from the other information.
*/
bool isCompressed() const { return _is_compressed; }
/**
* For debugging. Output most of the information to the supplied ostream.
*/
void dump(std::ostream& out, const std::string& prefix = "") const {
out << prefix << "ZGY version " << _file_version
<< " file compressed to "
<< int(100.0 * _compression_factor) << "% of original\n"
<< prefix << "Size: "
<< _file_size << " bytes of which "
<< _header_size << " are in headers and "
<< _file_size - _used_size << " wasted\n"
<< prefix << "Alpha: "
<< _alpha_missing_count << " missing, "
<< _alpha_constant_count << " constant, "
<< _alpha_normal_count << " normal ("
<< _alpha_normal_count * _alpha_normal_size_per_entry << " bytes), "
<< _alpha_compressed_count << " compressed ("
<< _alpha_compressed_size << " bytes)\n"
<< prefix << "Brick: "
<< _brick_missing_count << " missing, "
<< _brick_constant_count << " constant, "
<< _brick_normal_count << " normal ("
<< _brick_normal_count * _brick_normal_size_per_entry << " bytes), "
<< _brick_compressed_count << " compressed ("
<< _brick_compressed_size << " bytes)\n";
}
};
/**
* \brief Argument package for creating a ZGY file.
*
......@@ -719,6 +909,7 @@ public:
virtual void dump(std::ostream&) const = 0; /**< \brief Output in human readable form for debugging. */
virtual SampleStatistics statistics() const = 0; /**< \brief Statistics of all sample values on the file. */
virtual SampleHistogram histogram() const = 0; /**< \brief Histogram of all sample values on the file. */
virtual FileStatistics filestats() const = 0; /**< \brief For display purposes only. */
};
/**
......
......@@ -2084,6 +2084,9 @@ ZgyInternalMeta::flushMeta(const std::shared_ptr<FileADT>& file)
// end of the header. Especially if we are writing to the cloud.
// Forget about using leftover space in the header to store the
// first few alpha tiles. We probably won't be writing those anyway.
// TODO-Low: Should there be padding also for the compressed case?
// Not strictly needed, but if writing directly to the cloud
// the padding ensures that all segments have nice sizes.
IHeaderAccess::podbytes_t allbytes;
for (const auto& it : bytes)
std::copy(it.begin(), it.end(), std::back_inserter(allbytes));
......
// Copyright 2017-2020, Schlumberger
// Copyright 2017-2021, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -405,7 +405,7 @@ private:
public:
// Actually internal. Used by OpenZGY::ZgyMeta, ZgyInternalBulk, GenLodC.
const IFileHeaderAccess& fh() const { return *_fh; }
//const IOffsetHeaderAccess& oh() const { return *_oh; }
const IOffsetHeaderAccess& oh() const { return *_oh; }
const IInfoHeaderAccess& ih() const { return *_ih; }
const IHistHeaderAccess& hh() const { return *_hh; }
const ILookupTableAccess& alup() const { return *_alup; }
......
......@@ -212,6 +212,12 @@ public:
return writer_->histogram();
}
/// Thread-safe forwarder: obtain the usage statistics from the
/// underlying writer while holding the instance mutex.
virtual FileStatistics filestats() const override
{
  const std::lock_guard<std::mutex> guard(mutex_);
  return writer_->filestats();
}
// FUNCTIONS FROM IZgyTools
virtual void transform(const corners_t& from, const corners_t& to, std::vector<std::array<float64_t,2>>& data) const override
{
......
// Copyright 2017-2020, Schlumberger
// Copyright 2017-2021, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -67,6 +67,7 @@ public:
virtual void dump(std::ostream&) const {throw std::runtime_error("dump() has not been mocked");}
virtual SampleStatistics statistics() const {throw std::runtime_error("statistics() has not been mocked");}
virtual SampleHistogram histogram() const {throw std::runtime_error("histogram() has not been mocked");}
virtual FileStatistics filestats() const {throw std::runtime_error("filestats() has not been mocked");}
// Functions from IZgyTools
virtual void transform(const corners_t& from, const corners_t& to, std::vector<std::array<float64_t,2>>&) const {throw std::runtime_error("coord handling has not been mocked");}
virtual std::array<float64_t,2> transform1(const corners_t& from, const corners_t& to, const std::array<float64_t,2>&) const {throw std::runtime_error("coord handling has not been mocked");}
......
......@@ -121,8 +121,8 @@ static void dump_api(std::shared_ptr<OpenZGY::IZgyReader> rr, std::ostream& out)
const OpenZGY::IZgyReader& r = *rr.get();
std::streamsize oldprec = std::cout.precision();
std::ios_base::fmtflags oldflags = std::cout.flags();
out << "File format = " << r.datatype() << "\n";
out << "File format and version = " << r.datatype()
<< " ZGY version " << r.filestats().fileVersion() << "\n";
out << "Size I,J,K = " << r.size() << "\n";
out << "Brick size I,J,K = " << r.bricksize() << "\n";
out << "Number of bricks I,J,K = " << r.brickcount()[0] << "\n";
......@@ -151,6 +151,7 @@ static void dump_api(std::shared_ptr<OpenZGY::IZgyReader> rr, std::ostream& out)
<< r.zunitfactor() << " '"
<< r.zunitname() << "'\n";
out << "Ordered Corner Points Legend = [ <i>, <j>] { <inline>, <xline>} ( <easting>, <northing>)" << "\n";
r.filestats().dump(out);
for (int ii=0; ii<4; ++ii)
out << "Ordered Corner Point " << ii << " = ["
<< std::fixed << std::setprecision(0)
......@@ -196,7 +197,7 @@ void test_readmeta()
if (verbose()) {
std::cout << "\n";
dump_api(reader, std::cout);
//reader->dump(std::cout);
reader->filestats().dump(std::cout, "filestats: ");
}
const OpenZGY::IZgyReader& r(*reader);
......@@ -232,6 +233,13 @@ void test_readmeta()
TEST_CHECK(r.zunitdim() == UnitDimension::unknown);
TEST_CHECK(r.zunitfactor() == 1.0);
TEST_CHECK(r.zunitname() == "");
const FileStatistics filestats = r.filestats();
TEST_CHECK(filestats.fileVersion() == 3);
TEST_CHECK(filestats.fileSize() == 12320768);
TEST_CHECK(filestats.alphaNormalCount() == 5);
TEST_CHECK(filestats.alphaNormalSizePerEntry() == 64*64);
TEST_CHECK(filestats.brickNormalCount() == 45);
TEST_CHECK(filestats.brickNormalSizePerEntry() == 64*64*64);
// actual
const IZgyReader::corners_t& index = r.indexcorners();
......@@ -266,6 +274,16 @@ void test_readmeta()
reader->close();
}
void test_readcmeta()
{
std::shared_ptr<OpenZGY::IZgyReader> reader = OpenZGY::IZgyReader::open(get_testdata("Compressed.zgy"));
if (verbose()) {
std::cout << "\n";
dump_api(reader, std::cout);
reader->filestats().dump(std::cout, "filestats: ");
}
}
void test_readconst()
{
std::shared_ptr<OpenZGY::IZgyReader> reader = OpenZGY::IZgyReader::open(get_testdata("Empty-v3.zgy"));
......@@ -1412,6 +1430,8 @@ public:
{
register_test("api.zgywriterargs", test_ZgyWriterArgs);
register_test("api.readmeta", test_readmeta);
// Test file is not checked in yet.
//register_test("api.readcmeta", test_readcmeta);
register_test("api.readconst", test_readconst);
register_test("api.readbulk", test_readbulk);
register_test("api.readbadvt", test_readbadvt);
......
// Copyright 2017-2020, Schlumberger
// Copyright 2017-2021, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -276,13 +276,14 @@ dump_basic(std::shared_ptr<OpenZGY::IZgyReader> r, const std::string& filename,
const SampleStatistics stat = r->statistics();
const SampleHistogram hist = r->histogram();
const FileStatistics filestats = r->filestats();
// Note that file size in bytes, ZGY version, and projection system
// are not available in the API. They might not be useful anyway.
os << "File name = '" << filename << "'\n"
<< "File size (bytes) = " << "?\n" // << r->_fd.xx_eof() << "\n"
<< "File format and version = " << r->datatype() << " ZGY version ?\n" // << r._accessor._metadata._fh._version << "\n"
<< "File size (bytes) = " << filestats.fileSize() << "\n"
<< "File format and version = " << r->datatype() << " ZGY version " << filestats.fileVersion() << "\n"
<< "Brick size I,J,K = " << "(" << r->bricksize()[0] << ", " << r->bricksize()[1] << ", " << r->bricksize()[2] << ")\n"
<< "Number of bricks I,J,K = " << "(" << r->brickcount()[0][0] << ", " << r->brickcount()[0][1] << ", " << r->brickcount()[0][2] << ")\n"
<< "Number of LODs = " << r->nlods() << "\n"
......@@ -322,8 +323,7 @@ dump_summary_brick_offsets(std::shared_ptr<OpenZGY::IZgyReader> r, std::ostream&
{
// TBD -- need access to internals
// See all_brick(), all_alpha(), summary_brick_offsets() in the Python version.
os << "Alpha status = ?\n";
os << "Brick status = ?\n";
r->filestats().dump(os);
}
void
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment