lodalgo.cpp 38.1 KB
Newer Older
1
// Copyright 2017-2021, Schlumberger
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//Adapted from: Zgy/ArrayBasic/ArrayTile

#include "lodalgo.h"
#include "lodsampling.h"
#include "roundandclip.h"
#include "databuffer.h"
21
#include "fancy_timers.h"
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#include "environment.h"
#include "../exception.h"

#include <memory.h>
#include <math.h>
#include <assert.h>
#include <limits>
#include <typeinfo>
#include <memory>
#include <atomic>
#include <omp.h>

namespace InternalZGY {
#if 0
}
#endif

/**
 * \brief Weighted arithmetic average of 8 neighboring samples.
 *
 * Each sample is weighted agaist how common this value is in the survey
 * as a whole. Rare values (likely very high of very low) get higher
 * priority. This prevents tiles from looking more and more washed out
 * the more they get decimated.
 *
 * TODO-Performance: Profile this function and try to speed it up.
 * It is by far the most expensize of the decimators, ~10x the cost
 * of LowPass. By default it is used for lod 2+ only. So it gets called
 * with just 1/7 of the data passed to LowPass. In sum the code still
 * spends more CPU cycles on this than on LowPass.
52
53
54
55
 *
 * \details Thread safety:
 * Modification may lead to a data race. This should not be an issue.
 * Instances are meant to be short lived and used only from one place.
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
 */
template<typename T>
class LodWeightedAverage
{
  // Information passed to constructor
  const std::int64_t* const hist_;
  const int bins_;
  const double minHist_;
  const double maxHist_;
  // pre-calculated in constructor
  double hsum_;
  double factor_;
  double offset_;

public:
  LodWeightedAverage(const std::int64_t *hist, int bins, double minHist, double maxHist) :
    hist_(hist), bins_(bins), minHist_(minHist), maxHist_(maxHist)
  {
    // find total number of samples to base weighting on
    hsum_ = 0.0;
    for (int dsti = 0; dsti < bins_; dsti++){
      hsum_ += static_cast<double>(hist_[dsti]);
    }

    // Avoid divide by zero in weights below
    if (hsum_ < 1.0){
      hsum_ = 1.0;
    }

    // Linear transform from sample value to bin number.
    // Note that result is supposed to be rounded to nearest integer.
    factor_ = IsFiniteD(maxHist) && IsFiniteD(minHist) && maxHist>minHist && bins>1 ? ((bins-1)/(maxHist-minHist)) : 0.0;
    offset_ = -minHist * factor_;
  }

  static const LodAlgorithm algorithm = LodAlgorithm::WeightedAverage;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    T data[8] = { s0, s1, s2, s3, s4, s5, s6, s7};

    // calculate weighted sum for 2x2x2 part of src brick
    double weight(0.0), sum(0.0);
    for (int ii=0; ii<8; ++ii) {

      // get value from src brick
      const T val = data[ii];

      if (!IsFiniteT(val))
          continue;

      // Get histogram bin number for this sample, then look up this
      // sample's frequency. See HistogramData::get().
      double histv;
      int hVal= RoundD2I(val * factor_ + offset_);
      if (static_cast<unsigned int>(hVal) < static_cast<unsigned int>(bins_)) {
        histv = static_cast<double>(hist_[hVal]);
      } else {
        histv = 0.0;  // outside histogram, frequency definitely 0
      }
      // If frequency is reported as 0, this is an inconsistency.
      // Since we know it occurs at least one time.
      if (histv<1.0){ histv = 1.0; }

      // update weighted sum
      sum += val * hsum_ / histv;

      // update weight
      weight += hsum_ / histv;
    }

    double wval;
    if (weight != 0) // use weighted value, at least one finite value found.
      wval = sum / weight;
    else // all 8 values were infinite or NaN, just pick one of them.
      wval = s0;

    if (wval > maxHist_){ wval = maxHist_; }
    if (wval < minHist_){ wval = minHist_; }

    // assign weighted value to destination brick
    return static_cast<T>(wval);
  }
};

141
142
143
144
/**
 * \brief Arithmetic average of 8 neighboring integer samples.
 * \details Thread safety: Safe. Instance holds no data.
 */
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
template<typename T>
class LodAverage
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Average;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    double sum =
      static_cast<double>(s0) +
      static_cast<double>(s1) +
      static_cast<double>(s2) +
      static_cast<double>(s3) +
      static_cast<double>(s4) +
      static_cast<double>(s5) +
      static_cast<double>(s6) +
      static_cast<double>(s7);
    return static_cast<T>(sum/8);
  }
};

166
167
168
169
/**
 * \brief Arithmetic average of 8 neighboring float samples, ignoring NaN.
 * \details Thread safety: Safe. Instance holds no data.
 */
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
template<>
class LodAverage<float>
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Average;
  float operator()(float s0, float s1, float s2, float s3, float s4, float s5, float s6, float s7,
                   int /*i*/, int /*j*/, int /*k*/)
  {
    double sum = 0;
    int count = 0;
    if (IsFiniteD(s0)) { sum += s0; ++count; }
    if (IsFiniteD(s1)) { sum += s1; ++count; }
    if (IsFiniteD(s2)) { sum += s2; ++count; }
    if (IsFiniteD(s3)) { sum += s3; ++count; }
    if (IsFiniteD(s4)) { sum += s4; ++count; }
    if (IsFiniteD(s5)) { sum += s5; ++count; }
    if (IsFiniteD(s6)) { sum += s6; ++count; }
    if (IsFiniteD(s7)) { sum += s7; ++count; }
    if (count == 0)
      return s0; // all infinite, so this we return.
    return static_cast<float>(sum/count);
  }
};

/**
 * \brief Arithmetic average, excluding zero, of integer data.
 *
 * \details This is not useful for seismic data. It might be handy
 * if samples are by nature integer values, not just floating point
 * numbers (such as seismic amplitude) crammed into a too-small integer.
200
201
 *
 * Thread safety: Safe. Instance holds no data.
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
 */
template<typename T>
class LodAverageNon0
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::AverageNon0;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
    int /*i*/, int /*j*/, int /*k*/)
  {
    double sum =
      static_cast<double>(s0)+
      static_cast<double>(s1)+
      static_cast<double>(s2)+
      static_cast<double>(s3)+
      static_cast<double>(s4)+
      static_cast<double>(s5)+
      static_cast<double>(s6)+
      static_cast<double>(s7);
    int count = 0;
    if (s0 != 0) ++count;
    if (s1 != 0) ++count;
    if (s2 != 0) ++count;
    if (s3 != 0) ++count;
    if (s4 != 0) ++count;
    if (s5 != 0) ++count;
    if (s6 != 0) ++count;
    if (s7 != 0) ++count;
    if (count == 0)
      return 0;
    else
      return static_cast<T>(sum / count);
  }
};

/**
 * \brief Arithmetic average, excluding NaN and zero, of float data.
 *
 * \details CAVEAT: This is not useful for seismic data. It might be handy
 * if samples are by nature integer values, not just floating point
 * numbers (such as seismic amplitude) crammed into a too-small integer.
 * This means that running the algorithm on float data is probably a mistake.
243
244
 *
 * Thread safety: Safe. Instance holds no data.
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
 */
template<>
class LodAverageNon0<float>
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::AverageNon0;
  float operator()(float s0, float s1, float s2, float s3, float s4, float s5, float s6, float s7,
    int /*i*/, int /*j*/, int /*k*/)
  {
    double sum = 0;
    int count = 0;
    if (IsFiniteD(s0) && s0 != 0) { sum += s0; ++count; }
    if (IsFiniteD(s1) && s1 != 0) { sum += s1; ++count; }
    if (IsFiniteD(s2) && s2 != 0) { sum += s2; ++count; }
    if (IsFiniteD(s3) && s3 != 0) { sum += s3; ++count; }
    if (IsFiniteD(s4) && s4 != 0) { sum += s4; ++count; }
    if (IsFiniteD(s5) && s5 != 0) { sum += s5; ++count; }
    if (IsFiniteD(s6) && s6 != 0) { sum += s6; ++count; }
    if (IsFiniteD(s7) && s7 != 0) { sum += s7; ++count; }
    if (count == 0) {
      // All infinite or zero. If at least one zero return that, else return any of the infinite values.
      if (s0 == 0 || s1 == 0 || s2 == 0 || s3 == 0 || s4 == 0 || s5 == 0 || s6 == 0 || s7 == 0)
        return 0;
      else
        return s0;
    }
    return static_cast<float>(sum / count);
  }
};

275
276
277
278
/**
 * brief Median of 8 neighboring integer samples.
 * \details Thread safety: Safe. Instance holds no data.
 */
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
template<typename T>
class LodMedian
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Median;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    T data[8] = { s0, s1, s2, s3, s4, s5, s6, s7};

    // Bubble sort the array of 8 source numbers.
    // Yes, Bubble sort. The array is short enough
    // that this should not be a problem.
    // I might not even bother bailing out as soon
    // as the array is proven to be sorted.
    for (int ii=7; ii>0; --ii)
      for (int jj=0; jj<ii; ++jj)
        if (data[jj]>data[jj+1])
          {
            T tmp = data[jj+1];
            data[jj+1] = data[jj];
            data[jj] = tmp;
          }
    return static_cast<T>(((float)data[3] + (float)data[4])/2.0);
  }
};

306
307
308
309
/**
 * \brief Median of 8 neighboring float samples, ignoring NaN.
 * \details Thread safety: Safe. Instance holds no data.
 */
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
template<>
class LodMedian<float>
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Median;
  float operator()(float s0, float s1, float s2, float s3, float s4, float s5, float s6, float s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    float data[8] = { s0, s1, s2, s3, s4, s5, s6, s7};
    int count = 0;
    if (IsFiniteD(s0)) { data[count++] = s0; };
    if (IsFiniteD(s1)) { data[count++] = s1; };
    if (IsFiniteD(s2)) { data[count++] = s2; };
    if (IsFiniteD(s3)) { data[count++] = s3; };
    if (IsFiniteD(s4)) { data[count++] = s4; };
    if (IsFiniteD(s5)) { data[count++] = s5; };
    if (IsFiniteD(s6)) { data[count++] = s6; };
    if (IsFiniteD(s7)) { data[count++] = s7; };

    for (int ii=count-1; ii>0; --ii)
      for (int jj=0; jj<ii; ++jj)
        if (data[jj]>data[jj+1])
          {
            float tmp = data[jj+1];
            data[jj+1] = data[jj];
            data[jj] = tmp;
          }
    if (count == 0)
      return s0; // All NaN
    else if (count%2 == 1)
      return data[count/2]; // odd number of bins - return the center one
    else
      return (data[count/2-1] + data[count/2])/2;
  }
};

346
347
348
349
/**
 * \brief Minimum of 8 neighboring samples, excluding NaN
 * \details Thread safety: Safe. Instance holds no data.
 */
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
template<typename T>
class LodMinimum
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Minimum;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    T result = s0;
    // Invert the test, so if result is NaN the test will succeed
    // and we will favor returning a result that is not NaN.
    if (!(result <= s1)) result = s1;
    if (!(result <= s2)) result = s2;
    if (!(result <= s3)) result = s3;
    if (!(result <= s4)) result = s4;
    if (!(result <= s5)) result = s5;
    if (!(result <= s6)) result = s6;
    if (!(result <= s7)) result = s7;
    return result;
  }
};

372
373
374
375
/**
 * \brief Maximum of 8 neighboring samples, excluding NaN
 * \details Thread safety: Safe. Instance holds no data.
 */
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
template<typename T>
class LodMaximum
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Maximum;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    T result = s0;
    if (!(result >= s1)) result = s1;
    if (!(result >= s2)) result = s2;
    if (!(result >= s3)) result = s3;
    if (!(result >= s4)) result = s4;
    if (!(result >= s5)) result = s5;
    if (!(result >= s6)) result = s6;
    if (!(result >= s7)) result = s7;
    return result;
  }
};

/**
 * \brief Weird algorithm, probably not useful.
 *
 * Return either the minimum or the maximum value in a checkerboard pattern.
400
401
 *
 * Thread safety: Safe. Instance holds no data.
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
 */
template<typename T>
class LodMinMax
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::MinMax;
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int i, int j, int k)
  {
    T result = s0;
    if ((i^j^k)&1) {
      // Return the minumum
      if (!(result <= s1)) result = s1;
      if (!(result <= s2)) result = s2;
      if (!(result <= s3)) result = s3;
      if (!(result <= s4)) result = s4;
      if (!(result <= s5)) result = s5;
      if (!(result <= s6)) result = s6;
      if (!(result <= s7)) result = s7;
    } else {
      // Return the maximum
      if (!(result >= s1)) result = s1;
      if (!(result >= s2)) result = s2;
      if (!(result >= s3)) result = s3;
      if (!(result >= s4)) result = s4;
      if (!(result >= s5)) result = s5;
      if (!(result >= s6)) result = s6;
      if (!(result >= s7)) result = s7;
    }

    return result;
  }
};

436
437
438
439
/**
 * \brief Set the low resolution data to all zeros.
 * \details Thread safety: Safe. Instance holds no data.
 */
440
441
442
443
444
445
446
447
448
449
450
template<typename T>
class LodAllZero
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::AllZero;
  T operator()(T, T, T, T, T, T, T, T, int, int, int)
  {
    return static_cast<T>(0);
  }
};

451
452
453
454
/**
 * \brief Use just one of the 8 input values.
 * \details Thread safety: Safe. Instance holds no data.
 */
455
456
457
458
459
460
461
462
463
464
465
template<typename T>
class LodDecimate
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::Decimate;
  T operator()(T s0, T, T, T, T, T, T, T, int, int, int)
  {
    return s0;
  }
};

466
467
468
469
/**
 * \brief Use just one of the 8 integral input values.
 * \details Thread safety: Safe. Instance holds no data.
 */
470
471
472
473
474
475
476
477
478
479
480
template<typename T>
class LodDecimateSkipNaN
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::DecimateSkipNaN;
  T operator()(T s0, T, T, T, T, T, T, T, int, int, int)
  {
    return s0;
  }
};

481
482
483
484
/**
 * \brief Use just one of the 8 float input values, looking for one not NaN.
 * \details Thread safety: Safe. Instance holds no data.
 */
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
template<>
class LodDecimateSkipNaN<float>
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::DecimateSkipNaN;
  float operator()(float s0, float s1, float s2, float s3, float s4, float s5, float s6, float s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    if (IsFiniteD(s0)) return s0;
    if (IsFiniteD(s1)) return s1;
    if (IsFiniteD(s2)) return s2;
    if (IsFiniteD(s3)) return s3;
    if (IsFiniteD(s4)) return s4;
    if (IsFiniteD(s5)) return s5;
    if (IsFiniteD(s6)) return s6;
    if (IsFiniteD(s7)) return s7;
    return s0; // All are infinite, so this is what we return.
  }
};

#ifdef _MSC_VER
#pragma warning (push)
#pragma warning (disable: 4127) // "conditional expression is constant" deliberate in template code
#endif

/**
 * \brief Most frequent value.
 *
 * Return the value that occurs most frequently in the input.
 * If there is a tie, return the first one.
 * NaN or 0 are returned if and only if all inputs have this value.
 * If the input is a mix of 0 and NaN only, the result is 0.
 *
 * Since there are tests for equality, this algorithm probably only
 * makes sense for integral data.
520
521
 *
 * Thread safety: Safe. Instance holds no data.
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
 */
template<typename T, bool SkipNaN, bool SkipZero>
class LodMostFrequent
{
public:
  static const LodAlgorithm algorithm = LodAlgorithm::DecimateSkipNaN; // TODO-Low, this is wrong, and depends on template args.
  T operator()(T s0, T s1, T s2, T s3, T s4, T s5, T s6, T s7,
               int /*i*/, int /*j*/, int /*k*/)
  {
    T in[8] = {s0, s1, s2, s3, s4, s5, s6, s7};
    signed char counts[8] = {0};
    for (int jj=1; jj<8; ++jj)    // how many inputs after the first one
      if (in[0]==in[jj])          // have the same value as the first?
        ++counts[0];              // end result will be in range 0..7
    if (counts[0] == 7)           // all 8 values are equal and not NaN
      return in[0];               // so this is a short cut
    for (int ii=1; ii<8; ++ii)    // start at 1, counts[0] done above
      for (int jj=ii+1;jj<8;++jj) // how many inputs above this one
        if (in[ii]==in[jj])       // have the same numerical value?
          ++counts[ii];           // NaN will always end up with count 0
    if (SkipNaN)                  // typically true if T is floating point
      for (int ii=0; ii<8; ++ii)  // caller use std::numeric_limits to check
        if (!(IsFiniteD(in[ii]))) // this checks both NaN and +/- inf
          counts[ii] = -2;        // strong bias against NaN in result
    if (SkipZero)                 // set depending on application's choice
      for (int ii=0; ii<8; ++ii)  // note: comparing against storage 0 not
        if (in[ii] == 0)          // converted 0, maybe we should change that?
          counts[ii] = -1;        // weaker bias against 0 than against NaN
    int ok = 0;                   // locate entry with the highest count
    for (int ii=1; ii<8; ++ii)    // note that all loops might be unrolled
      if (counts[ii]>counts[ok])  // could almost avoid building the in[] array
        ok=ii;                    // found a better result
    return in[ok];                // the one place where in[] should be array.
  }
};

#ifdef _MSC_VER
#pragma warning (pop) // restore "conditional expression is constant"
#endif

/**
 * Generic LOD calculation that can be used for all algorithms where
 * the resulting sample depends only on the 8 surrounding samples
 * in the source brick. I.e. currently everything except the lowpass
 * algorithm that needs a window of size 10. The actual algorithm to
 * be used is passed as a functor. If any calculation needs to be done
 * up front, that can be handled by the functor's constructor.
 */
template <typename T, typename F>
void createGenericLevelOfDetail(
    T* dst,
    const std::array<std::int64_t,3>&dsize,
    const std::array<std::int64_t,3>&dstride,
    const T* src,
    const std::array<std::int64_t,3>&ssize,
    const std::array<std::int64_t,3>&sstride,
    F function)
{
  //printf("@createGenericLevelOfDetail<%s,%s> Algo=%d\n",
  //        typeid(T).name(), typeid(F).name(), (int)function.algorithm);

  // The samples of the output where all 8 inputs are available.
  const std::array<std::int64_t,3> core
    {std::min(ssize[0]/2,dsize[0]),
     std::min(ssize[1]/2,dsize[1]),
     std::min(ssize[2]/2,dsize[2])};

  const bool needpad = core[0]<dsize[0] || core[1]<dsize[1] || core[2]<dsize[2];

  // TODO-Low: this is temporary, see discussion later down.
  // If made permanent it should at least do a more efficient fill.
  if (needpad)
    for (std::int64_t ii = 0; ii < dsize[0]; ++ii)
      for (std::int64_t jj = 0; jj < dsize[1]; ++jj)
        for (std::int64_t kk = 0; kk < dsize[2]; ++kk)
          dst[ii*dstride[0] + jj*dstride[1] + kk*dstride[2]] = 0;

  // Given offset of first input, where are the other 8?
  const std::array<std::int64_t,8> offsets
    {0*sstride[0] + 0*sstride[1] + 0*sstride[2],
     0*sstride[0] + 0*sstride[1] + 1*sstride[2],
     0*sstride[0] + 1*sstride[1] + 0*sstride[2],
     0*sstride[0] + 1*sstride[1] + 1*sstride[2],
     1*sstride[0] + 0*sstride[1] + 0*sstride[2],
     1*sstride[0] + 0*sstride[1] + 1*sstride[2],
     1*sstride[0] + 1*sstride[1] + 0*sstride[2],
     1*sstride[0] + 1*sstride[1] + 1*sstride[2]};

  for (std::int64_t ii = 0; ii < core[0]; ++ii) {
    for (std::int64_t jj = 0; jj < core[1]; ++jj) {
      const T* cursrc = src + 2*ii*sstride[0] + 2*jj*sstride[1];
      T*       curdst = dst +   ii*dstride[0] +   jj*dstride[1];
      for (std::int64_t kk = 0; kk < core[2]; ++kk,
             cursrc += 2*sstride[2], curdst += dstride[2]) {
        // Invoke the functor to calculate a single sample from 8 input samples.
        // The reason the cast is safe: While the product of size[] is int64,
        // size in each dimension is limited by int32. The size is declared
        // as int64_t[] simply to reduce the risk of forgetting to widen if
        // computing e.g. size[0]*size[1]*size[2].
        // TODO-Low: Can I simply remove the 3 last args, and the MinMax algo?
        *curdst = function(cursrc[offsets[0]], cursrc[offsets[1]],
                           cursrc[offsets[2]], cursrc[offsets[3]],
                           cursrc[offsets[4]], cursrc[offsets[5]],
                           cursrc[offsets[6]], cursrc[offsets[7]],
                           (int)ii, (int)jj, (int)kk);
      }
    }
  }

  if (needpad) {
    // Ambition levels to handle the last il/xl/sample if input size is odd:
    // 0 - Leave the samples unchanged. It is the callers responsibility to
    //     padd input to even size or to fill the buffer with defaultvalue.
    // 1 - Set the last sample to zero. Can be done here, but it is simpler
    //     albeit slightly more expensive to just zero the buffer first.
    // 2 - Set the last sample to simple decimation i.e. use just one input
    //     sample even when we have 2 or 4 available.
    // 3 - Use the available 1, 2, or 4 samples. Which is the "correct"
    //     solution but it is doubtful whether anybody will notice.
    // TODO-Low choose what to do here.
    if (core[0] < dsize[0])
      for (std::int64_t ii = core[0]; ii < dsize[0]; ++ii)
        for (std::int64_t jj = 0; jj < core[1]; ++jj)
          for (std::int64_t kk = 0; kk < core[2]; ++kk)
            dst[ii*dstride[0] + jj*dstride[1] + kk*dstride[2]] = 0;

    if (core[1] < dsize[1])
      for (std::int64_t ii = 0; ii < dsize[0]; ++ii)
        for (std::int64_t jj = core[1]; jj < dsize[1]; ++jj)
          for (std::int64_t kk = 0; kk < core[2]; ++kk)
            dst[ii*dstride[0] + jj*dstride[1] + kk*dstride[2]] = 0;

    if (core[2] < dsize[2])
      for (std::int64_t ii = 0; ii < dsize[0]; ++ii)
        for (std::int64_t jj = 0; jj < dsize[1]; ++jj)
          for (std::int64_t kk = core[2]; kk < dsize[2]; ++kk)
            dst[ii*dstride[0] + jj*dstride[1] + kk*dstride[2]] = 0;

  }
}

/**
 * \brief Lowpass decimation vertically, simple decimation horizontally.
 *
 * Unlike the other algorithms this one needs full traces as input.
 * Or at least prefers full traces, to avoid brick artifacts vertically.
 */
template <typename T>
void createGenericLevelOfDetailLoPass(
    T* dst,
    const std::array<std::int64_t,3>&dsize,
    const std::array<std::int64_t,3>&dstride,
    const T* src,
    const std::array<std::int64_t,3>&ssize,
    const std::array<std::int64_t,3>&sstride)
{
  //printf("@createLODLoPass<%s> size (%d, %d, %d) -> (%d, %d, %d)\n",
  //       typeid(T).name(),
  //       (int)ssize[0], (int)ssize[1], (int)ssize[2],
  //       (int)dsize[0], (int)dsize[1], (int)dsize[2]);

  for (int ii=0; ii<3; ++ii)
    if ((ssize[ii]+1) / 2 != dsize[ii])
      throw OpenZGY::Errors::ZgyInternalError("createLODLoPass buffer size mismatch.");

  std::unique_ptr<double[]> vecSrc (new double[std::max(ssize[2], 2*dsize[2])]);
  std::unique_ptr<double[]> vecDst (new double[dsize[2]]);

  for (std::int64_t ii = 0; ii < dsize[0]; ++ii) {
    for (std::int64_t jj = 0; jj < dsize[1]; ++jj) {
      const T* cursrc = src + 2*ii*sstride[0] + 2*jj*sstride[1];
      T*       curdst = dst +   ii*dstride[0] +   jj*dstride[1];
      // Copy a single trace of source data. If not enough source
      // (need twice the output size) then replicate the last value.
      // This supports the case where ssize[2] is odd.
      for (std::int64_t kk = 0; kk < ssize[2]; ++kk)
        vecSrc[kk] = cursrc[kk*sstride[2]];
      for (std::int64_t kk = ssize[2]; kk < 2*dsize[2]; ++kk)
        vecSrc[kk] = vecSrc[ssize[2]-1];
      // Filter and downsample vector.
      // The reason the cast is safe: While the product of size[] is int64,
      // size in each dimension is limited by int32. The size is declared
      // as int64_t[] simply to reduce the risk of forgetting to widen if
      // computing e.g. size[0]*size[1]*size[2].
      LodSampling::downSample1D(
        vecDst.get(),
        static_cast<int>(dsize[2]),
        vecSrc.get(),
        static_cast<int>(2*dsize[2]));
      // Convert back from double (which the lowpass filter uses)
      // to the storage valuetype. RoundAndClip() takes care of
      // clipping to the limits of T, just in case the result ended
      // up with slightly larger amplitude than the original and
      // that result also overflowed the target type. No clipping
      // is done for floats.
      for (std::int64_t kk = 0; kk < dsize[2]; ++kk)
        curdst[kk*dstride[2]] = RoundAndClip<T>(vecDst[kk]);
    }
  }
}

/**
 * Create a single LOD brick based on the data from one LOD level below.
 * The valuetype of the source and target must be the same, but can be almost
 * any scalar type.
 *
 * Histogram information is passed in for use by some of the algorithms.
 * The histogram should be in "storage" units. That only affects its
 * limits though. TODO-Worry make sure these are set correctly. The old accessor
 * assumed that for integral data the limits were simply the entire range.
 *
 * When doing lowpass filtering of integral data, the actual calculation is
 * done using doubles.  We need to clip the result to the range of the data
 * value type to prevent overflows, since the lowpass filter can give a result
 * slightly outside the input range.
 *
 * When doing lowpass filtering of float data there will be no clipping.
 * Some samples might end up slightly outside the actual value range of the
 * input data. Note behavior change: The old accessor did some clipping to
 * avoid that case. Which just added to the confusion because the range it
 * clipped to might be inaccurate.
 */
template <typename T>
void createLevelOfDetail(
    T* dst,
    const std::array<std::int64_t,3>&dsize,
    const std::array<std::int64_t,3>&dstride,
    const T* src,
    const std::array<std::int64_t,3>&ssize,
    const std::array<std::int64_t,3>&sstride,
    LodAlgorithm algorithm,
    const std::int64_t *hist, int bins, double minHist, double maxHist)
{
    switch (algorithm) {
    case LodAlgorithm::LowPass:
      createGenericLevelOfDetailLoPass(dst, dsize, dstride, src, ssize, sstride);
      break;

    case LodAlgorithm::WeightedAverage:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodWeightedAverage<T>(hist, bins, minHist, maxHist));
      break;

    case   LodAlgorithm::Average:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodAverage<T>());
      break;

    case   LodAlgorithm::Median:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodMedian<T>());
      break;

    case   LodAlgorithm::Minimum:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodMinimum<T>());
      break;

    case   LodAlgorithm::Maximum:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodMaximum<T>());
      break;

    case   LodAlgorithm::MinMax:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodMinMax<T>());
      break;

    case   LodAlgorithm::Decimate:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodDecimate<T>());
      break;

    case   LodAlgorithm::DecimateSkipNaN:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodDecimateSkipNaN<T>());
      break;

    //case   LodAlgorithm::DecimateRandom:
      //createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodDecimateRandom<T>());
      //break;

    case   LodAlgorithm::AllZero:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodAllZero<T>());
      break;

    //case   LodAlgorithm::WhiteNoise:
      //createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodWhiteNoise<T>());
      //break;

    case   LodAlgorithm::MostFrequent:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodMostFrequent<T, !std::numeric_limits<T>::is_integer, false>());
      break;

    case   LodAlgorithm::MostFrequentNon0:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodMostFrequent<T, !std::numeric_limits<T>::is_integer, true>());
      break;

    case   LodAlgorithm::AverageNon0:
      createGenericLevelOfDetail(dst, dsize, dstride, src, ssize, sstride, LodAverageNon0<T>());
      break;

          default:
      throw OpenZGY::Errors::ZgyInternalError("Unsupported decimation type");
    }
}

/**
 * New version with "modernized" types.
 * Not sure if it belongs in this file or higher up.
 */
template <typename T>
void createLodT(const std::shared_ptr<DataBuffer>& result,
                const std::shared_ptr<const DataBuffer>& input,
                LodAlgorithm algorithm,
                const std::int64_t* hist,
                std::int32_t bincount,
                double histogram_min,
                double histogram_max)
{
  typedef DataBufferNd<T,3> buffer_t;
  typedef const DataBufferNd<T,3> constbuffer_t;
  std::shared_ptr<buffer_t> resultT =
    std::dynamic_pointer_cast<buffer_t>(result);
  std::shared_ptr<constbuffer_t> inputT =
    std::dynamic_pointer_cast<constbuffer_t>(input);
  if (result == nullptr || input == nullptr)
    throw OpenZGY::Errors::ZgyInternalError("createLodT null buffer(s) passed.");
  if (resultT == nullptr)
    throw OpenZGY::Errors::ZgyInternalError("createLodT wrong result data type.");
  if (inputT == nullptr)
    throw OpenZGY::Errors::ZgyInternalError("createLodT wrong input data type.");
  createLevelOfDetail(resultT->data(), resultT->size3d(), resultT->stride3d(),
                      inputT->data(), inputT->size3d(), inputT->stride3d(),
                      algorithm,
                      hist, bincount, histogram_min, histogram_max);
}

/**
 * \brief Main entry point for low resolution compute.
 *
 * Create a single low resolution brick 1/8th the size of the input.
 * This is the single threaded version.
 */
void createLodST(const std::shared_ptr<DataBuffer>& result,
                 const std::shared_ptr<const DataBuffer>& input,
                 LodAlgorithm algorithm,
                 const std::int64_t* hist,
                 std::int32_t bincount,
                 double histogram_min,
                 double histogram_max)
{
866
867
  static SummaryPrintingTimerEx timer("createLod");
  SimpleTimerEx tt(timer);
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
  switch (result->datatype()) {
  case RawDataType::SignedInt8:    createLodT<std::int8_t>  (result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::UnsignedInt8:  createLodT<std::uint8_t> (result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::SignedInt16:   createLodT<std::int16_t> (result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::UnsignedInt16: createLodT<std::uint16_t>(result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::SignedInt32:   createLodT<std::int32_t> (result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::UnsignedInt32: createLodT<std::uint32_t>(result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::Float32:       createLodT<float>        (result, input, algorithm, hist, bincount, histogram_min, histogram_max); break;
  case RawDataType::IbmFloat32:
  default: throw OpenZGY::Errors::ZgyInternalError("Unrecognized type enum");
  }
}

/**
 * Run the low resolution compute on just a part of the provided
 * data buffer. The part will be one of more slices in the slowest
 * changing direction. If passing the last 3 parameters as "entire survey"
 * then this is equivalent to calling createLodST() directly.
 */
void createLodPart(const std::shared_ptr<DataBuffer>& result,
                   const std::shared_ptr<const DataBuffer>& input,
                   LodAlgorithm algorithm,
                   const std::int64_t* hist,
                   std::int32_t bincount,
                   double histogram_min,
                   double histogram_max,
                   int slice_dim,
                   std::int64_t slice_beg,
                   std::int64_t slice_count)
{
  // If at the end of the survey, we might process fewer slices. Or zero
  // slices e.g. if slices_per_thread was rounded up to make it even.
  const std::int64_t survey_end = input->size3d()[slice_dim];
  slice_count = std::min(slice_beg + slice_count, survey_end) - slice_beg;
  if (slice_count <= 0)
    return;
  // Cannot happen. Unless a progremmer messes up...
  // Note that slice_count may be odd if we are at the end of the survey.
  // In that case, remember that the output size rounds up.
  if (slice_beg % 2 != 0)
    throw OpenZGY::Errors::ZgyInternalError("Slice start must be even.");
  // This check is probably not needed but it matches current usage.
  // If this changes then more unit tests may be needed.
  if (!result->is_cstride() || !input->is_cstride())
    throw OpenZGY::Errors::ZgyInternalError("createLodPart expects c-contiguous buffers");
  // Create a view on the buffers so the actual lod generator still
  // thinks we start at zero.
  auto ibuf = input->slice1(slice_dim, slice_beg, slice_count);
  auto obuf = result->slice1(slice_dim, slice_beg/2, (slice_count+1)/2);
  createLodST(obuf, ibuf, algorithm,
              hist, bincount, histogram_min, histogram_max);
}

/**
 * \brief Main entry point for low resolution compute.
 *
 * Create a single low resolution brick 1/8th the size of the input.
 * This is the multi-threaded version.
 *
 * Caveat: If parallelizing is also done at a higher level then
 * consider omp_set_max_active_levels(), especially if the higher
 * level might only be able to use 2 or 4 threads.
 *
 * One argument for parallelizing on a higher level instead
 * is that we might then be able to read and compute at the
 * same time. N/A for local file access because buffer cache
 * prefectch can give us that automatically. I think.
 *
 * TODO-Low fall back to single threaded if the input is small
 * (e.g. less than 16 slices or less than 64^3 samples) or if
 * the caller asks for cheaper algorithms. Low priority because
 * those cases are (a) rare and (b) should run fast anyway, even
 * if single thread *might* give a slight improvement.
 *
 * Using a simple "copy" app between two local ssd files I measured
 * the time for lod compute as 4% of finalize time or 2% of total time
 * with 8 threads. Compared to 23% / 13% for a single thread. So the
 * MT case had a 10% overall speedup on this test.
 *
 * Switching to the much cheaper "Decimate" algorithm (just pick one
 * of every 8 samples) then even in the single threaded case the
 * algorithm only accounts for < 1% so there is not much to gain.
 *
 * | Algorithm(s) used for Onnia | Time     |
 * | :-------------------------- | -------: |
 * | Decimate                    |    8 sec |
 * | Average                     |   37 sec |
 * | LowPass                     |   72 sec |
 * | WeightedAverage             |  732 sec |
 * | Decimate,WeightedAverage    |  102 sec |
 * | LowPass,Decimate            |   64 sec |
 * | LowPass,WeightedAverage     |  159 sec |
 */
void createLodMT(const std::shared_ptr<DataBuffer>& result,
                 const std::shared_ptr<const DataBuffer>& input,
                 LodAlgorithm algorithm,
                 const std::int64_t* hist,
                 std::int32_t bincount,
                 double histogram_min,
                 double histogram_max)
{
969
970
  static SummaryPrintingTimerEx timerST("createLod[ST]");
  static SummaryPrintingTimerEx timerMT("createLod[MT]");
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
  const auto isize   = input->size3d();
  const auto istride = input->stride3d();
  const auto ostride = result->stride3d();
  static const auto slowest = [](std::int64_t a,std::int64_t b,std::int64_t c)
                              {
                                return (a>b && a>c ? 0 :
                                        b>a && b>c ? 1 :
                                        c>a && c>b ? 2 : -1);
                              };
  int input_slice_dim = slowest(istride[0], istride[1], istride[2]);
  int output_slice_dim = slowest(ostride[0], ostride[1], ostride[2]);
  // Technically this should could with any buffer layouts. But this
  // means a lot of extra testing, less efficiency, and caveats such
  // as updating a byte might be implemented as read/modify/write of
  // 4 or 8 bytes. So, always slice the slowest dim.
  if (input_slice_dim != -1 && input_slice_dim == output_slice_dim) {
987
    SimpleTimerEx tt(timerMT);
988
989
990
991
992
993
994
995
996
997
998
999
1000
    std::atomic<int> errors(0);
    std::string first_error;
#pragma omp parallel
    {
      std::int64_t slices_per_thread = (isize[input_slice_dim]-1) / omp_get_num_threads() + 1;
      if (slices_per_thread % 2 == 1)
        ++slices_per_thread;
#if 0
      if (omp_get_thread_num() == 0)
        std::cout << "LOD compute " << isize[input_slice_dim]
                  << " slices using " << omp_get_num_threads()
                  << " threads with " << slices_per_thread
                  << " slices per thread."
For faster browsing, not all history is shown. View entire blame