// Copyright 2017-2020, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/** file: impl/bulk.cpp
 *
 * This file contains a number of classes and free functions dealing with
 * bulk data access. There is a similar file impl.meta that deals with
 * metadata. Nothing in this file should be directly visible to users of
 * the public API.
 *
 *     impl.bulk.ZgyInternalBulk:
 *
 *         * Internal methods to read and write bulk.
 *         * Needs access to much of the meta data information.
 *           Currently this is handled by giving this class references to
 *           the ZgyInternalMeta instance. This gives access to more
 *           metadata than the class actually needs.
 *         * Refactoring notes: There are a couple of sections in this
 *           class that ought to have been separated out. As it is now
 *           the class does too much.
 *
 *             - Write support
 *             - Lod generation
 *             - Histogram generation
 *
 *     Code flow for writing:
 *
 *     The main logic is a chain of methods where each function ends
 *     by calling the next level down. So they might easily be refactored
 *     into a set of methods to be called sequentially. Or (horror!) into
 *     a single function. The order is:
 *
 *     writeRegion (arbitrary region, user's value_type)
 *
 *         Apply conversion float -> storage and read/modify/write logic. Keep
 *         track of min and max sample range for the file as a whole.
 *
 *     _writeAlignedRegion (arbitrary aligned region, native vt)
 *
 *         Apply splitting into bricks.
 *         Expand bricks at survey edge to full size unless r/m/w already did.
 *         Convert sample position to brick number. I.e. divide by bricksize.
 *         The next layer (_writeOneBrick) can be called with one brick
 *         at a time so we can re-use the buffer. On the other hand,
 *         passing a list of bricks or even all the bricks in the request
 *         might help parallelization.
 *
 *     _writeOneBrick (single brick, native vt)
 *
 *         Apply conversion from scalar to bulk or vice versa in some cases.
 *         Disallow compression in some circumstances. This is why
 *         compression should not be done in the higher levels.
 *         If the data end up constant then call _writeOneConstantBrick
 *         and we are done. So skip the rest.
 *
 *     _writeOneNormalBrick
 *
 *         Apply padding, compression, byte swapping, and convert from
 *         DataBuffer to void* + size suitable for the file i/o layer.
 *
 *     _writeWithRetry
 *
 *        Allocate space on the bulk file as needed.
 *        Handle retries needed if the segment (on the cloud) was closed.
 *        Update the lookup table if a new block was allocated.
 *
 *     Code flow for reading
 *
 *     readToNewBuffer
 *     readToExistingBuffer | readConstantValue
 *     _partsNeeded
 *     _getBrickFilePosition
 *     _deliverOneBrick
 *     _file.xx_readv
 */

#include "enum.h"
#include "types.h"
#include "bulk.h"
#include "meta.h"
#include "file.h"
#include "databuffer.h"
#include "arrayops.h"
#include "structaccess.h"
#include "subtiling.h"
#include "logger.h"
#include "compression.h"
#include "../exception.h"

#include <algorithm>
#include <limits>
#include <cmath>

namespace {
  /**
   * \brief Make a fake shared_ptr from a plain unsafe pointer.
   *
   * Used when a method or a data member expects a smart pointer while
   * the application has an unsafe pointer that is guaranteed to remain
   * valid long enough. The returned smart pointer does nothing on
   * cleanup.
   *
   * CAVEAT: Using this method indicates a code smell.
   * Note: The method is duplicated in bulk.cpp, databuffer.cpp,
   * and api.cpp. There isn't much point in consolidating them.
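   *
   * Example (illustrative; Foo is a stand-in type):
   *
   *   Foo stackobj;
   *   std::shared_ptr<Foo> tmp = fake_shared(&stackobj);
   *   // tmp's deleter is a no-op; tmp must not outlive stackobj.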
   */
  template<typename T>
  static inline std::shared_ptr<T>
  fake_shared(T* p)
  {
    return std::shared_ptr<T>(p, [](T* p){});
  }
}

namespace InternalZGY {
#if 0
}
#endif

using namespace InternalZGY::ArrayOps;
using namespace InternalZGY::Formatters;

struct IJK
{
  std::int64_t i0;
  std::int64_t j0;
  std::int64_t k0;
  std::int64_t ni;
  std::int64_t nj;
  std::int64_t nk;
  std::ostream& dump(std::ostream &os) const {
    os << "start (" << i0 << ", " << j0 << ", " << k0 << ")"
       << " size (" << ni << ", " << nj << ", " << nk << ")";
    return os;
  }
  friend std::ostream& operator<<(std::ostream& os, const IJK& ijk) {
    return ijk.dump(os);
  }
};

/**
 * Argument package used by _writeOneBrick(), _writeOneNormalBrick(),
 * _writeOneConstantBrick(). One size fits all, which is in general
 * not a good idea, but it is a compromise between readability
 * and ease of implementing multi-threading.
 *
 * The original implementation processed a single brick at a time.
 * Using this class it becomes reasonably easy to pass around
 * lists of bricks to be handled.
 *
 *   * brickpos, lod: Always set and not modified by layers below.
 *   * data:          Actual bulk data to output. May be changed.
 *   * compressor:    Might be cleared. N/A if brick ends up constant.
 *   * fileoffset:    Only when known. Never known yet in _writeOneBrick(),
 *                    and N/A for constant bricks.
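 *
 * Construction sketch (illustrative; the argument values come from the
 * caller):
 *
 *   WriteBrickArgPack args(brickpos, lod, data, compressor, 0);
 *   // fileoffset is 0 here because it is not known until
 *   // _writeWithRetry() has allocated file space.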
 */
struct WriteBrickArgPack
{
  std::array<std::int64_t,3>  brickpos;
  std::int32_t                lod;
  std::shared_ptr<DataBuffer> data;
  compressor_t                compressor; // TODO-minor, caller use std::ref?
  //bool                        use_compressor; // or this?
  std::int64_t                fileoffset;
  WriteBrickArgPack(const std::array<std::int64_t,3>& brickpos_in,
                    std::int32_t lod_in,
                    const std::shared_ptr<DataBuffer>& data_in,
                    const compressor_t& compressor_in,
                    std::int64_t fileoffset_in)
    : brickpos(brickpos_in)
    , lod(lod_in)
    , data(data_in)
    , compressor(compressor_in)
    , fileoffset(fileoffset_in)
  {
  }
  std::string toString() const
  {
    std::stringstream ss;
    ss << "pos=" << brickpos
       << ", lod=" << lod
       << ", size=" << data->size3d();
    if (data->isScalar())
      ss << ", scalar=" << data->scalarAsDouble();
    if (compressor)
      ss << ", compressor=*";
    if (fileoffset)
      ss << ", fileoffset=" << std::hex << fileoffset << std::dec;
    return ss.str();
  }
};

/**
 * TODO-Low might want to fold this into LookupTable::LutInfo.
 *
 * Add position in samples. LookupTable::getBrickFilePosition()
 * cannot easily store this because it only has the brick position
 * and would need to know the brick size to get sample position.
 *
 * Add constvalue after decoding.
 */
struct LutInfoEx : public LookupTable::LutInfo
{
  // Inherited: status, offset_in_file, size_in_file, raw_constant;
  IJK    survey_position;
  double double_constvalue;

  LutInfoEx(const LookupTable::LutInfo& info, const IJK& pos_in, double constvalue_in)
    : LookupTable::LutInfo(info)
    , survey_position(pos_in)
    , double_constvalue(constvalue_in)
  {
  }
};

/**
 * Duplicated between impl/bulk.cpp and impl/meta.cpp but sets
 * different flags.
 *
 * Start a critical section where any exception means that the
 * owner class should be permanently flagged with _is_bad = True.
 * Typically this is used to prevent secondary errors after a
 * write failure that has most likely corrupted the entire file.
 * The exception itself will not be caught.
 *
 * The _is_bad flag normally means that any further attempts
 * to access this class, at least for writing, will raise a
 * ZgyCorruptedFile exception. Regardless of what the exception
 * was that caused the flag to be set.
 *
 * C++ note: Unlike Python it isn't trivial (or portable) to check
 * whether a destructor is being called due to leaving scope normally
 * or due to unwinding an exception. So in the C++ version the code
 * should explicitly call disarm() at the end of the critical section.
 *
 * Thread safety:
 * The class itself is not thread safe but this should not be an issue,
 * because instances are meant to be short lived. Typically used inside
 * a method and not accessible outside.
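 *
 * Usage sketch (illustrative; the method name is a stand-in):
 *
 *   void ZgyInternalBulk::someWriteOperation()
 *   {
 *     ErrorsWillCorruptFile guard(this);
 *     // ... writes that might throw and corrupt the file ...
 *     guard.disarm(); // reached only if no exception unwound the scope
 *   }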
 */
class ZgyInternalBulk::ErrorsWillCorruptFile
{
  ZgyInternalBulk *_owner;
public:
  explicit ErrorsWillCorruptFile(ZgyInternalBulk* owner) : _owner(owner)
  {
  }
  ~ErrorsWillCorruptFile()
  {
    if (_owner)
      _owner->set_errorflag(true);
    _owner = nullptr;
  }
  void disarm()
  {
    _owner = nullptr;
  }
};

/**
 * ZgyInternalBulk is used both from ZgyReader, which is in general not
 * allowed to change any state to make it thread safe, and from the
 * non threadsafe ZgyWriter.
 *
 * To mitigate the risk of accidentally modifying data using a ZgyReader
 * there is both a const and a mutable pointer to the underlying
 * ZgyInternalMeta. When instantiated from a ZgyReader the mutable pointer
 * will be empty. When instantiated from a ZgyWriter the two pointers
 * will be identical. This mitigation by itself will cause a null pointer
 * exception if trying to modify data that shouldn't be changed.
 * This of course assumes that ZgyInternalMeta is const-correct so that
 * no state can be changed by using a const pointer to a ZgyInternalMeta
 * instance.
 *
 * An additional mitigation is to not use the _metadata_rw directly but
 * instead call a _get_metadata_rw() method. If ZgyInternalBulk is
 * const-correct then you get a compile time error if trying to call
 * _get_metadata_rw() from inside a method declared as const. If not,
 * _get_metadata_rw() is still preferable because it can raise a proper
 * error message instead of the null pointer exception.
 */
ZgyInternalBulk::ZgyInternalBulk(
    const std::shared_ptr<FileADT>& file,
    const std::shared_ptr<const ZgyInternalMeta>& metadata,
    const std::shared_ptr<ZgyInternalMeta>& metadata_rw,
    bool compressed_write,
    const LoggerFn& logger)
  : _file(file)
  , _metadata(metadata)
  , _metadata_rw(metadata_rw)
  , _update_mode(compressed_write? UpdateMode::Constant : UpdateMode::Always)
  , _compressed_write(compressed_write)
  , _is_bad(false)
  , _written_sample_min(std::numeric_limits<double>::infinity())
  , _written_sample_max(-std::numeric_limits<double>::infinity())
  , _loggerfn(logger ? logger : LoggerBase::standardCallback(LoggerBase::getVerboseFromEnv("OPENZGY_VERBOSE"), "openzgy-bulk:", ""))
{
}

/**
 * \brief Get hint about all constant region.
 *
 * Check to see if the specified region is known to have all samples
 * set to the same value. Returns a pair of (is_const, const_value).
 *
 * The function only makes inexpensive checks so it might return
 * is_const=false even if the region was in fact constant. It will not
 * make the opposite mistake. This method is only intended as a hint
 * to improve performance.
 *
 * For int8 and int16 files the caller may specify whether to scale
 * the values or not. Even if unscaled the function returns the value
 * as a double.
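 *
 * Usage sketch (illustrative; useConstant is a stand-in):
 *
 *   std::pair<bool,double> c = readConstantValue(start, size, lod, true);
 *   if (c.first)
 *     useConstant(c.second); // every sample in the region equals c.second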
 */
std::pair<bool,double>
ZgyInternalBulk::readConstantValue(
      const std::array<std::int64_t,3>& start,
      const std::array<std::int64_t,3>& size,
      int32_t lod, bool as_float) const
{
  _validateUserPosition(start, size, lod);
  const double defaultstorage = this->_metadata->ih().defaultstorage();
  std::vector<LutInfoEx> bricklist = _partsNeeded(start, size, lod);
  const double nan = std::numeric_limits<double>::quiet_NaN();
  double result = nan;
  bool first = true;
  for (const LutInfoEx& brick : bricklist) {
    if (_logger(5))
      _logger(5, std::stringstream()
              << "brick " << brick.survey_position
              << " " << (int)brick.status << "\n");
    switch (brick.status) {
    case BrickStatus::Constant:
      // Two NaN values should compare equal in this test.
      if (!first && result != brick.double_constvalue &&
          !(std::isnan(result) && std::isnan(brick.double_constvalue)))
        return std::make_pair(false,nan);
      result = brick.double_constvalue;
      break;
    case BrickStatus::Missing:
      if (!first && result != defaultstorage)
        return std::make_pair(false,nan);
      result = defaultstorage;
      break;
    default:
      return std::make_pair(false,nan);
    }
    first = false;
  }
  if (as_float && std::isfinite(result)) {
    result = (result * this->_metadata->ih().storagetofloat_slope() +
              this->_metadata->ih().storagetofloat_intercept());
  }
  return std::make_pair(true,result);
}

/**
 * Read bulk data starting at "start" in index space and store the
 * result in the provided DataBuffer. Start should be in the range
 * (0,0,0) to Size-1. The count of samples to read is implied by the
 * size of the DataBuffer that is passed in. The valid data types for
 * the result are float32 (in which case samples stored as int8 or
 * int16 will be scaled) or the files's storage value type (in which
 * case there is no scaling). It is valid to pass a size that
 * includes the padding area between the survey and the end of the
 * current brick. But not past that point.
 */
void
ZgyInternalBulk::readToExistingBuffer(
      const std::shared_ptr<DataBuffer>& result,
      const std::array<std::int64_t,3>& start,
      int32_t lod, bool as_float) const
{
  _validateUserPosition(start, result->size3d(), lod);
  RawDataType expected_dtype = as_float ? RawDataType::Float32 : _metadata->ih().datatype();
  // When called from ZgyReader.read(), the check on expected_dtype represents
  // a user error trying to convert to an integral type, while the others
  // should not be possible to trigger.
  if (!result || result->isScalar() || !result->voidData())
    throw OpenZGY::Errors::ZgyInternalError("Reading to missing or wrong buffer type.");
  if (result->datatype() != expected_dtype)
    throw OpenZGY::Errors::ZgyUserError("Requested data type not supported for this file.");

  // Need a default value to use when trying to read a brick that
  // was never written, or to fill in a brick that was only partly
  // written. To avoid non intuitive behavior the same value should
  // be used for both cases, and it should be a value that would
  // also be allowed to store as a regular sample. Use the value
  // that becomes 0.0 after conversion to float. Or as close as
  // possible to that value if the coding range is not zero centric.
  // Always use zero for floating point data. (Not NaN...)
  // Dead traces don't have any special handling apart from the
  // alpha flag. They still contain whatever data was written to them.

  const double defaultstorage = this->_metadata->ih().defaultstorage();
  const double defaultvalue = this->_metadata->ih().defaultvalue();

  // Make a separate pass to gather all the bricks we need to read.
  // FUTURE: we might fetch some of them in parallel and we might be
  // able to combine bricks to read larger blocks at a time. Both changes
  // can have a dramatic performance impact on cloud access.

  std::vector<LutInfoEx> bricklist = _partsNeeded(start, result->size3d(), lod);

  // After all bricks have been processed, the padding past the
  // end of the survey might still not have been touched. Just in
  // case the request did in fact include such samples we will
  // initialize the entire result buffer to the default value.
  // TODO-Performance can I refactor the code so this isn't needed?
  result->fill(as_float ? defaultvalue : defaultstorage);

  std::vector<ReadRequest> requests;
  for (const LutInfoEx& brick : bricklist) {
    // Functor that accepts a raw void* data + size and copies it into the
    // correct place in result. result, start, and as_float come straight
    // from user code. b.position and b.status vary per brick which is why
    // I copy them to make sure the up-to-date contents are captured. raw and
    // rawsize (the arguments to the lambda) are what was read from storage.

    const std::array<std::int64_t,3> bpos
      {brick.survey_position.i0,
       brick.survey_position.j0,
       brick.survey_position.k0};
    const BrickStatus bstatus = brick.status;
    auto deliverance = [this,result,start,as_float,bpos,bstatus](const void* raw, std::int64_t rawsize) {
                         this->_deliverOneBrick
                           (result, start, bpos, raw, rawsize,
                            bstatus, as_float);
                       };
    switch (brick.status) {
    case BrickStatus::Missing:
      if (_logger(2))
        _logger(2, std::stringstream()
                << "  Reading brick at " << brick.survey_position
                << " not found, use " << defaultstorage << "\n");
      deliverance(&defaultstorage, sizeof(double));
      break;

    case BrickStatus::Constant:
      if (_logger(2))
        _logger(2, std::stringstream()
                << "  Reading constant brick at "
                << brick.survey_position << "\n");
      deliverance(&brick.double_constvalue, sizeof(double));
      break;

    case BrickStatus::Normal:
      if (_logger(2))
        _logger(2, std::stringstream()
                << "  Reading brick at " << brick.survey_position
                << " from file offset " << std::hex
                << (std::intptr_t)brick.offset_in_file << std::dec << "\n");
      requests.push_back(ReadRequest{brick.offset_in_file, this->_metadata->ih().bytesperbrick(), deliverance});
      break;

    case BrickStatus::Compressed:
      if (_logger(2))
        _logger(2, std::stringstream()
                << "  Reading compressed brick at " << brick.survey_position
                << " from file offset " << std::hex
                << (std::intptr_t)brick.offset_in_file << std::dec
                << " size " << brick.size_in_file << "\n");
      // TODO-Worry obscure corner case, might need to re-try if we didn't
      // get enough data. I try to prevent this with the rule that
      // no compressed brick may be larger than the uncompressed version.
      requests.push_back(ReadRequest{brick.offset_in_file, brick.size_in_file, deliverance});
      break;

    default:
      throw OpenZGY::Errors::ZgyInternalError("Internal error, bad brick status");
    }
  }
  if (_logger(2))
    _logger(2, std::stringstream()
            << requests.size() << " read requests are queued\n");
  if (!requests.empty())
    this->_file->xx_readv(requests, false, false, true, UsageHint::Data);
}

/**
 * Read bulk data starting at "start" in index space with size "size".
 * Return the result in a newly allocated DataBuffer. The result will
 * either be a scalar (constant-value) buffer or a regular buffer.
 *
 * Pass check_constant=true to check extra hard for all-constant data.
 * A region written with all samples identical, as opposed to a region
 * flagged as constant without taking up space, will also be detected.
 *
 * Start should be in the range (0,0,0) to Size-1. It is valid to pass
 * a size that includes the padding area between the survey and the
 * end of the current brick. But not past that point.
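 *
 * Usage sketch (illustrative; handleConstant is a stand-in):
 *
 *   std::shared_ptr<DataBuffer> buf =
 *     readToNewBuffer(start, size, lod, true, true); // as_float, check_constant
 *   if (buf->isScalar())
 *     handleConstant(buf->scalarAsDouble()); // whole region is one value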
 */
std::shared_ptr<DataBuffer>
ZgyInternalBulk::readToNewBuffer(
    const std::array<std::int64_t,3>& start,
    const std::array<std::int64_t,3>& size,
    int32_t lod, bool as_float, bool check_constant) const
{
  _validateUserPosition(start, size, lod);
  std::shared_ptr<DataBuffer> result;
  RawDataType dtype = (as_float ?
                       RawDataType::Float32 :
                       this->_metadata->ih().datatype());
  std::pair<bool,double> cvalue = this->readConstantValue(start, size, lod, as_float);
  if (cvalue.first) {
    result = DataBuffer::makeDataBuffer3d(&cvalue.second, sizeof(cvalue.second), size, dtype);
  }
  else {
    result = DataBuffer::makeDataBuffer3d(nullptr, 0, size, dtype);
    this->readToExistingBuffer(result, start, lod, as_float);
    if (check_constant && result->isAllSame(result->size3d().data())) {
      double scalar = result->scalarAsDouble(); // returns the first value
      result = DataBuffer::makeDataBuffer3d(&scalar, sizeof(scalar), size, dtype);
    }
  }
  return result;
}

/**
 * Mitigate the problem that ZgyInternalBulk is used both for reading
 * and writing, making it difficult to keep track of thread safety.
 * The function below forces the code to be more explicit read vs write.
 *
 * See ZgyInternalBulk::ZgyInternalBulk for more details.
 */
std::shared_ptr<ZgyInternalMeta>
ZgyInternalBulk::_get_metadata_rw()
{
  if (!_metadata_rw)
    throw OpenZGY::Errors::ZgyUserError("ZGY file not open for write");
  return _metadata_rw;
}

/**
 * Convenience for invoking _loggerfn with a simple message.
 * This isn't much different from invoking the callback directly.
 * But it makes debugging slightly simpler to have an easy place
 * to set a breakpoint. It also adds more symmetry with respect
 * to the stringstream version, which does add value.
 */
bool
ZgyInternalBulk::_logger(int priority, const std::string& message) const
{
  return _loggerfn(priority, message);
}

/**
 * Convenience for invoking _loggerfn with a stringstream.
 * Due to a somewhat naughty cast, the function can be called as:
 *
 *   if (_logger(pri))
 *     _logger(pri, std::stringstream() << some << data << here);
 *
 * The first line is optional. It just prevents the expression in
 * the second line from being evaluated if debugging is disabled.
 */
bool
ZgyInternalBulk::_logger(int priority, const std::ios& ss) const
{
  auto sstream = dynamic_cast<const std::stringstream*>(&ss);
  return _logger(priority, sstream ? sstream->str() : std::string());
}

/**
 * Check that i,j,k,lod are all inside valid bounds. Throw if not.
 * If used to validate an alpha tile then k should be passed as 0.
 * This is similar to LookupTable::_validatePosition() but checks
 * the entire region, not just one brick at a time. And the input
 * is in trace numbers not brick numbers.
 *
 * The exception message is currently rather verbose.
 * I might want to shorten it a bit; skipping the actual position.
 *
 * Note on test coverage: For defensive coding these checks are done
 * both here at a high level and in lookuptable. This hurts coverage
 * since it might not be possible to get those other checks to fail.
 * And some errors may be caught even earlier e.g. if trying to make
 * a DataBuffer with negative size.
 */
void
ZgyInternalBulk::_validateUserPosition(
    const std::array<std::int64_t,3>& start,
    const std::array<std::int64_t,3>& size,
    int32_t lod) const
{
  std::string error;
  const IInfoHeaderAccess& ih = this->_metadata->ih();
  const std::array<std::int64_t,3> one{1,1,1} ;
  const std::array<std::int64_t,3> bs  = this->_metadata->ih().bricksize();
  const std::array<std::int64_t,3> end = ((start + size + bs - one) / bs) * bs;
  const std::int64_t nlods = static_cast<std::int64_t>(ih.lodsizes().size());
  if (lod < 0 || lod >= nlods) {
    std::stringstream ss;
    ss << "Requested lod " << lod <<
      " is outside the valid range 0 to " << nlods << " inclusive";
    _logger(1, ss.str() + "\n");
    throw OpenZGY::Errors::ZgyUserError(ss.str());
  }
  const std::array<std::int64_t,3>& ssize = ih.lodsizes()[lod] * bs;
  if (start[0] < 0 || end[0] > ssize[0] || size[0] <= 0 ||
      start[1] < 0 || end[1] > ssize[1] || size[1] <= 0 ||
      start[2] < 0 || end[2] > ssize[2] || size[2] <= 0) {
    std::stringstream ss;
    ss << "Requested region"
       << " from (" << start[0] << ", " << start[1] << ", " << start[2] << ")"
       << " to (" << end[0] << ", " << end[1] << ", " << end[2] << ")"
       << " lod " << lod
       << " is empty or outside the valid range"
       << " (0, 0, 0)"
       << " to " << ssize[0] << ", " << ssize[1] << ", " << ssize[2] << ")"
       ;
    _logger(1, ss.str() + "\n");
    throw OpenZGY::Errors::ZgyUserError(ss.str());
  }
}

/**
 * Scale integral data from storage according to the coding range.
 * The operation is a no-op if file_dtype is float. The data must
 * be known to be in storage domain, we don't try to guess based on
 * the DataBuffer's valuetype.
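 *
 * The per-sample conversion is float = storage * slope + intercept,
 * with (slope, intercept) obtained from ih().storagetofloat().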
 */
std::shared_ptr<DataBuffer>
ZgyInternalBulk::_scaleDataToFloat(
      const std::shared_ptr<DataBuffer>& data) const
{
  // The data buffer type should match file_dtype but I won't bother to check.
  // Also, if result ends up empty this means no conversion is needed.
  std::array<double,2> factors = this->_metadata->ih().storagetofloat();
  std::shared_ptr<DataBuffer> result = data->scaleToFloat(factors);
  return result ? result : data;
}

std::shared_ptr<DataBuffer>
ZgyInternalBulk::_scaleDataToStorage(
      const std::shared_ptr<DataBuffer>& data) const
{
  std::array<double,2> factors = this->_metadata->ih().storagetofloat();
  std::shared_ptr<DataBuffer> result =
    data->scaleToStorage(factors, _metadata->ih().datatype());
  return result ? result : data;
}

namespace {
  template<typename T> double _decodeConstantT(std::uint32_t in)
  {
    if (in == 1) // ZGY v1 could only store constant-zero.
      return 0;
    T value;
    byteswapT(&in);
    memcpy(&value, &in, sizeof(T));
    byteswapT(&value);
    return static_cast<double>(value);
  }

  template<typename T> std::uint32_t _encodeConstantT(double in)
  {
    if (std::numeric_limits<T>::is_integer)
      in = std::max((double)std::numeric_limits<T>::lowest(),
                    std::min((double)std::numeric_limits<T>::max(),
                             std::round(in)));
    T value = static_cast<T>(in);
    std::uint32_t result{0};
    byteswapT(&value);
    memcpy(&result, &value, sizeof(T));
    byteswapT(&result);
    return result;
  }
}

/**
 * Decode the constant brick value from the re-purposed file offset
 * stored in the lookup table in the ZGY file. Return as a double,
 * which should have enough precision to precisely represent all
 * supported value types.
 *
 * In ZGY v2 and later the actual constant value is stored on the file
 * as little-endian in the first 1, 2, or 4 bytes in the 64-bit slot
 * that normally holds the file offset. The v1 format does not store
 * arbitrary constants. If it had, it would be subject to the swapping
 * of low and high 32-bit words.
 *
 * Interpreting that value in C++ safely and portably is tricky.
 *
 * The input to this method has already been byteswapped to machine
 * byte order and contains the least significant 32 bits of the stored
 * file offset. So the correct handling is to mask out the least
 * significant bytes (however many we need) and return that. It may be
 * simpler to temporarily byte swap back to little endian so the code
 * can copy "first N bytes" instead. Because, memcpy seems to be the
 * safest approach to avoid casts that are technically illegal.
 *
 * The same reasoning applies to the inverse _encodeConstantT().
 * The returned result will be byteswapped before being written out,
 * so the constant needs to be stored in the least significant part
 * of the result and not the first N bytes of the result.
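 *
 * Worked example (illustrative): an int16 constant of -1 occupies the
 * least significant 16 bits of the slot, so the machine-order 32-bit
 * input is 0x0000FFFF and _decodeConstantT<std::int16_t> returns -1.0,
 * not 65535.0. _encodeConstantT<std::int16_t>(-1.0) maps back to
 * 0x0000FFFF.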
 *
 * TODO-Low: Idea: Could I convert a 0x80xxx pointer to the pointer's address
 * and set the size to one sample? This might actually work...
 * The "1" pointer from the v1 format might even be treated the same way;
 * the upper 23 bits of the pointer will be all zero so we could
 * point to that. Just be careful to not overwrite it. And caller
 * may need the not-converted version to test whether this is a constant.
 * Or can it simply test for size == 1 sample?
 *
 * TODO-Low: Idea: passing the pointer to the first byte might actually
 * be easier to deal with.
 */
/*static*/ double
ZgyInternalBulk::_decodeConstant(std::uint32_t in, RawDataType dtype)
{
  switch (dtype) {
  case RawDataType::SignedInt8:    return _decodeConstantT<std::int8_t>(in);
  case RawDataType::UnsignedInt8:  return _decodeConstantT<std::uint8_t>(in);
  case RawDataType::SignedInt16:   return _decodeConstantT<std::int16_t>(in);
  case RawDataType::UnsignedInt16: return _decodeConstantT<std::uint16_t>(in);
  case RawDataType::SignedInt32:   return _decodeConstantT<std::int32_t>(in);
  case RawDataType::UnsignedInt32: return _decodeConstantT<std::uint32_t>(in);
  case RawDataType::Float32:       return _decodeConstantT<float>(in);
  case RawDataType::IbmFloat32:
  default: throw OpenZGY::Errors::ZgyInternalError("Unrecognized type enum");
  }
}

/**
 * Inverse of _decodeConstant(). See that function for details.
 */
/*static*/ std::int32_t
ZgyInternalBulk::_encodeConstant(double in, RawDataType dtype)
{
  switch (dtype) {
  case RawDataType::SignedInt8:    return _encodeConstantT<std::int8_t>(in);
  case RawDataType::UnsignedInt8:  return _encodeConstantT<std::uint8_t>(in);
  case RawDataType::SignedInt16:   return _encodeConstantT<std::int16_t>(in);
  case RawDataType::UnsignedInt16: return _encodeConstantT<std::uint16_t>(in);
  case RawDataType::SignedInt32:   return _encodeConstantT<std::int32_t>(in);
  case RawDataType::UnsignedInt32: return _encodeConstantT<std::uint32_t>(in);
  case RawDataType::Float32:       return _encodeConstantT<float>(in);
  case RawDataType::IbmFloat32:
  default: throw OpenZGY::Errors::ZgyInternalError("Unrecognized type enum");
  }
}

/**
 * Return the list of bricks needed to cover the entire area given by
 * start and size. The resulting list gives the sample position
 * relative to the start of the survey. To get the brick position you
 * need to divide by brick size. Also compute file offsets etc. from
 * the lookup tables. This will indirectly validate start and size.
 *
 * Implementation note: Don't collect the brick list in one loop and
 * then compute lookup info later. If requested size is ridiculously
 * large this may cause us to run out of memory before getting to the
 * validation stage.
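 *
 * Example (illustrative, bricksize 64^3): start=(10,0,130), size=(60,1,2)
 * gives brick0=(0,0,128) and brickN=(64,0,128), i.e. the two bricks whose
 * sample positions are (0,0,128) and (64,0,128).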
 */
std::vector<LutInfoEx>
ZgyInternalBulk::_partsNeeded(
      const std::array<std::int64_t,3>& start,
      const std::array<std::int64_t,3>& size,
      int32_t lod) const
{
  std::vector<LutInfoEx> bricklist;
  const IInfoHeaderAccess& ih = this->_metadata->ih();
  const std::array<std::int64_t,3> bs = this->_metadata->ih().bricksize();
  const ILookupTableAccess& blup = this->_metadata->blup();
  const std::array<std::int64_t,3>
    brick0{(start[0] / bs[0]) * bs[0],
           (start[1] / bs[1]) * bs[1],
           (start[2] / bs[2]) * bs[2]};
  const std::array<std::int64_t,3>
    brickN{((start[0] + size[0] - 1) / bs[0]) * bs[0],
           ((start[1] + size[1] - 1) / bs[1]) * bs[1],
           ((start[2] + size[2] - 1) / bs[2]) * bs[2]};
  std::array<std::int64_t,3> iter;
  for (iter[0] = brick0[0]; iter[0] <= brickN[0]; iter[0] += bs[0]) {
    for (iter[1] = brick0[1]; iter[1] <= brickN[1]; iter[1] += bs[1]) {
      for (iter[2] = brick0[2]; iter[2] <= brickN[2]; iter[2] += bs[2]) {
        IJK pos{iter[0], iter[1], iter[2], bs[0], bs[1], bs[2]};
        LookupTable::LutInfo info = LookupTable::getBrickFilePosition
          (pos.i0/pos.ni, pos.j0/pos.nj, pos.k0/pos.nk, lod,
           /**/ih.lodsizes(), ih.brickoffsets(), blup.lup(), blup.lupend(),
           ih.bytesperbrick());
        bricklist.push_back
          (LutInfoEx(info, pos,
                     _decodeConstant(info.raw_constant, ih.datatype())));
      }
    }
  }
  return bricklist;
}

/**
 * This is the final step in readToExistingBuffer(). The data has
 * been read from storage, so now it needs to be copied back to
 * the user. This function may be invoked multiple times if data
 * was needed from more than one brick.
 *
 * Arguments:
 *    result   -- user's buffer which was passed to readToExistingBuffer().
 *    start    -- user's supplied position as a 3-tuple of index values.
 *    as_float -- user's choice about converting data.
 *    startpos -- position of the start of this brick.
 *    raw      -- bulk data for this brick as it exists on file.
 *    rawsize  -- size in BYTES of "raw".
 *    brickstatus -- normal, compressed, constant, ...
 *
 * This low level function technically deals with raw bytes as input and
 * and the higher level DataBuffer as output.
 *
 * The function will always be called to deliver one full brick. If the raw
 * data only contains a single sample then this is a constant-value brick.
 * Otherwise there should be exactly bricksize-in-bytes available.
 * If the brick size is (1,1,1) (not even possible right now) then the
 * distinction between constant-value and regular goes away.
 *
 * TODO-Worry: beware this kludge: If passing sizeof(double) data then this also
 * represents a constant-value brick where the constant needs to be cast
 * to the correct type. There are some highly theoretical cases where this
 * becomes ambiguous.
 *
 * Caveat: If the data pointer is not correctly aligned to the type of data
 * then the returned DataBuffer will also have a misaligned pointer.
 * Another caveat: Even though result is reference counted, its buffer
 * might end up set to "raw" which means it must not be used after the
 * delivery function returns. Mitigated by adding a clear() member to
 * DataBuffer. That doesn't help if somebody already has a raw pointer though.
 */
void
ZgyInternalBulk::_deliverOneBrick(
      const std::shared_ptr<DataBuffer>& result,
      const std::array<std::int64_t,3>& start,
      const std::array<std::int64_t,3>& startpos,
      const void *raw, std::int64_t rawsize,
      BrickStatus brickstatus, bool as_float) const
{
  if (_logger(5))
    _logger(5, std::stringstream()
            << "  Special delivery! type " << (int)brickstatus
            << " user buffer start "
            <<"(" << start[0] << ", " << start[1] << ", " << start[2] << ")"
            << " this brick start "
            <<"(" << startpos[0] <<
            ", " << startpos[1]
            << ", " << startpos[2] << ")"
            << " got " << rawsize << " bytes.\n");

  const RawDataType dtype = _metadata->ih().datatype();
  const std::array<std::int64_t,3> bricksize = _metadata->ih().bricksize();

  // TODO-Low const-correctness? Might not be worth the trouble.
  std::shared_ptr<DataBuffer> data;

  // TODO-Low this might not be the right place.
  // Note that if the file contained compressed integral data
  // and the application asked for float data back, the decompressor
  // would be responsible for the int to float conversion but NOT the
  // scaling from storage values to real values. Currently this is
  // N/A because compressed files are always float. There is almost
  // nothing to gain by compressing integral data.
  // The one case I can think of is lossless compression of
  // discrete data such as classification. Not supported but might
  // be supported in the future. In that case retrieving float
  // does not make sense and must be disallowed.

  // TODO-Low, to make it clear what is going on I should probably
  // rename Compressed to CompressedFloat.

  switch (brickstatus) {
  case BrickStatus::Compressed: {
    // I know that "raw" stays in scope long enough, so use a fake shared_ptr.
    // TODO-Low: Maybe the decompressor should simply input raw, rawsize.
    rawdata_t rawdata(fake_shared(raw), rawsize);
    rawdata_t tmp = CompressFactoryImpl::decompress(rawdata, brickstatus, bricksize);
    if (!tmp.first || !tmp.second)
      throw OpenZGY::Errors::ZgyFormatError("Compression type not recognized");
    if (tmp.second != bricksize[0]*bricksize[1]*bricksize[2]*(int)sizeof(float))
      throw OpenZGY::Errors::ZgyInternalError("Wrong decompressed size");
    auto floatdata = std::static_pointer_cast<const float>(tmp.first);
    // TODO-Low: Const-correctness of DataBuffer, at least at runtime.
    // Maybe just an overloaded constructor with const arg that will
    // set a readonly flag. And have both data() and rwdata() methods.
    auto floatunconst = std::const_pointer_cast<float>(floatdata);
    data.reset(new DataBufferNd<float,3>(floatunconst, bricksize));
    break;
  }
  case BrickStatus::Normal: {
    // TODO-Medium: byteswap, maybe clone first if not owndata.
    data = DataBuffer::makeDataBuffer3d(const_cast<void*>(raw), rawsize, bricksize, dtype);
    if (_metadata->fh().version() == 1) {
      // Rare case. Does not need to be performant.
      if (rawsize != bricksize[0]*bricksize[1]*bricksize[2]*data->itemsize())
        throw OpenZGY::Errors::ZgyInternalError("Wrong size of subtiled data");
      data = data->clone();
      subtiling(bricksize, data->itemsize(), data->voidData().get(), raw, true);
    }
    break;
  }
  default:
    // TODO-Medium: byteswap? Or did the constvalue decode do that?
    data = DataBuffer::makeDataBuffer3d(const_cast<void*>(raw), rawsize, bricksize, dtype);
  }

  if (as_float && dtype != RawDataType::Float32)
    data = _scaleDataToFloat(data);

  // Note, we might pass in the survey range as well to force
  // all padding bytes to be set to the default value. Less
  // surprises for the caller. It may look a bit odd if the user
  // does a flood-fill of the entire survey to a given value and
  // later sees that the content is different in the padding area.
  // But, the caller should ignore the padding.
  //
  // On write the padding samples should also be forced to contain
  // the same value. If nothing else, to help compression. But for
  // efficiency reasons the value is not specified. Typically it
  // will be the default "absent" value but if the rest of the
  // brick has a const value and no allocated disk space then
  // they will inherit that constant.

  result->copyFrom(data.get(), startpos.data(), start.data(), nullptr, nullptr);

  // Belt and suspenders mode...
  // The bulk data inside the DataBuffer I created will now go out of scope;
  // it is only valid for the duration of this function. The data smart pointer
  // should also be going out of scope now. But in case it doesn't I will
  // zero out the buffer pointer.

  data->clear();
}

// write support //

/**
 * Compute the size of the used (inside survey) area of a data buffer
 * with size "size". size will probably always be equal to bricksize
 * but the function should still work if it isn't.
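 *
 * Example (illustrative): with bricksize 64, surveysize 1000, brickpos 15
 * and lod 0: available = 1000 - 15*64 = 40, so used[dim] = min(40, size[dim]).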
 */
std::array<std::int64_t,3>
ZgyInternalBulk::_usedPartOfBrick(
      const std::array<std::int64_t,3>& size,
      const std::array<std::int64_t,3>& brickpos_in,
      std::int32_t lod) const
{
  std::array<std::int64_t,3> used{0,0,0};
  const IInfoHeaderAccess& ih = this->_metadata->ih();
  for (int dim=0; dim<3; ++dim) {
    const std::int64_t bricksize     = ih.bricksize()[dim];
    const std::int64_t surveysize0   = ih.size()[dim];
    const std::int64_t brickpos      = brickpos_in[dim];
    const std::int64_t lodfactor     = static_cast<std::int64_t>(1) << lod;
    const std::int64_t surveysizelod = (surveysize0 + lodfactor - 1) / lodfactor;
    const std::int64_t available     = surveysizelod - (brickpos * bricksize);
    used[dim] = std::max((std::int64_t)0, std::min(available, size[dim]));
  }
  return used;
}

/**
 * Return True if all useful samples in this brick have the same value.
 * Padding samples outside the survey are not useful and should not
 * be checked since they could easily have been set to a padding value
 * that doesn't match the interesting stuff.
 */
bool
ZgyInternalBulk::_isUsedPartOfBrickAllConstant(
      const std::shared_ptr<DataBuffer>& data,
      const std::array<std::int64_t,3>& brickpos,
      int32_t lod) const
{
  const std::array<std::int64_t,3> used =
    this->_usedPartOfBrick(data->size3d(), brickpos, lod);
  return data->isAllSame(used.data());
}

/**
 * Pad unused parts of the data buffer by replicating the last samples,
 * but only up to a multiple of 'modulo' samples. Handles just one
 * dimension, so caller will typically invoke us three times.
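 *
 * Example (illustrative): with used[dim]=70, modulo=4 and a buffer size
 * of 128 along dim, the loop pads slices 70 and 71 and stops at 72,
 * the next multiple of 4.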
 */
/*static*/ void
ZgyInternalBulk::_setPaddingToEdge(
      const std::shared_ptr<DataBuffer>& data,
      const std::array<std::int64_t,3>& used,
      int modulo, int dim)
{
  std::int64_t slice = used[dim];
  while (slice > 0 && slice < data->size3d()[dim] && (slice % modulo) != 0) {
    std::array<std::int64_t,3> srcorig{0,0,0};
    std::array<std::int64_t,3> dstorig{0,0,0};
    std::array<std::int64_t,3> cpyorig{0,0,0};