// Copyright 2017-2021, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

/**
\file: file.h
\brief Low level I/O, abstract layer.

This file contains the base class for low level I/O either to on-prem data
using the regular read and write methods of the OS or to a cloud back-end.
\code{.unparsed}
    // TODO-Low: To improve isolation, user visible context such as
    // OpenZGY::SeismicStoreIOContext could be copied into an equivalent
    // InternalZGY::SDConfig. Base class InternalZGY::Config in this header.
    // Ditto for plain files, an InternalZGY::FileConfig defined here.
    // The downside is that it gets more tedious to maintain.
    InternalZGY::Config:
      InternalZGY::FileConfig(Config):
      InternalZGY::SDConfig(Config):

        * Details such as user credentials etc. established when the
          file is open. Specific to the backend type.
        * Note that there is currently no way to pass a configuration
          object along with every read and write request. This might
          have been useful for a server type application but would
          require the config parameter to ripple across at least 50
          existing methods. I doubt this would be worth the trouble.

    InternalZGY::FileADT:                                 <=== file.h
      InternalZGY::LocalFile(FileADT):                    <=== file_local.h
        InternalZGY::LocalFileOther(LocalFile):           <=== file_local.h
        InternalZGY::LocalFileLinux(LocalFile):           <=== file_local.h
      InternalZGY::SeismicStoreFile(FileADT):             <=== file_sd.h
      InternalZGY::SeismicStoreFileDelayedWrite(FileADT): <=== file_sd.h

        * Higher level code should only access the polymorphic FileADT
          base class and the InternalZGY::FileFactory that creates an
          instance of the desired type.
\endcode
*/

#include <cstdint>
#include <vector>
#include <string>
#include <memory>
#include <functional>
#include <mutex>

#include "../declspec.h"

// Forward declaration only. This header refers to OpenZGY::IOContext
// solely through pointers, so the full definition is not needed here.
namespace OpenZGY {
  class IOContext;
}

namespace InternalZGY {
#if 0
}
#endif

// Forward declaration. In this header the type is only held through
// std::shared_ptr members, so the complete definition is not required.
class SummaryPrintingTimerEx;

/**
 * Mode a file is opened in; passed to the FileADT factory functions.
 *
 * Closed is the special case described for FileADT::xx_close(): it is
 * used when a backend handle is needed (e.g. for deleting files) but
 * the handle does not represent an open file.
 *
 * NOTE(review): the exact semantics of ReadOnly, ReadWrite and Truncate
 * are implemented by the concrete back-ends and are not visible in this
 * header.
 */
enum class OpenMode
{
  Closed = 0,
  ReadOnly,
  ReadWrite,
  Truncate,
};

/**
 * Optional hint accepted by FileADT::xx_read(), xx_readv() and
 * xx_write() through their "usagehint" argument.
 *
 * NOTE(review): Compressed and Mixed deliberately or accidentally share
 * the same numeric value (0x40), so they cannot be told apart at run
 * time. Confirm whether that aliasing is intended before relying on it.
 */
enum class UsageHint
{
  Unknown    = 0,
  TextFile   = 1 << 0,  // 0x01
  Header     = 1 << 4,  // 0x10
  Data       = 1 << 5,  // 0x20
  Compressed = 1 << 6,  // 0x40
  Mixed      = 1 << 6,  // 0x40, same value as Compressed
};

/**
 * Single entry in a scatter/gather read request.
 *
 * Each entry names a region of the file (offset and size) together
 * with the functor that will be invoked to hand back the bytes read.
 *
 * Thread safety:
 * Modification may lead to a data race. This should not be an issue,
 * because instances are only meant to be modified when created or
 * copied or assigned prior to being made available to others.
 */
class ReadRequest
{
public:
  /** Read-only pointer to the delivered bytes. */
  typedef const void* data_t;
  /** Callback receiving the data pointer and the number of bytes delivered. */
  typedef std::function<void(data_t, std::int64_t)> delivery_t;

  std::int64_t offset;  ///< File offset of the region to read.
  std::int64_t size;    ///< Number of bytes requested.
  delivery_t delivery;  ///< Invoked to pass back the data that was read.

  /**
   * Store the three parts of the request verbatim; no validation is
   * done here.
   */
  ReadRequest(std::int64_t offset_in, std::int64_t size_in, const delivery_t& delivery_in)
    : offset(offset_in)
    , size(size_in)
    , delivery(delivery_in)
  {
  }
};

/** The requests that make up one scatter/gather read. */
typedef std::vector<ReadRequest> ReadList;
/** A list of request lists. */
typedef std::vector<ReadList> ReadDoubleList;

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
 * \brief Internal interface for I/O operations.
 *
 * Public methods are prefixed with xx_ for practical reasons.
 * It makes it more obvious that an invocation is being made
 * on the FileADT interface. It also becomes simple to search
 * for usage.
 *
 * The class contains some protected static convenience methods
 * that specializations might need. So it isn't technically a
 * pure interface.
 *
 * Thread safety: Interfaces and classes that only contain static
 * methods do not have race conditions.
 */
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
class FileADT
{
public:
  virtual ~FileADT();

  /**
   * Read binary data from the file. Both size and offset are mandatory.
   * I.e. caller is not allowed to read "the entire file", and not
   * allowed to "read from where I left off the last time".
   * The actual reading will be done in a derived class.
   * The base class only validates the arguments.
   */
  virtual void xx_read(void *data, std::int64_t offset, std::int64_t size, UsageHint usagehint=UsageHint::Unknown) = 0;

  /**
   * Read binary data from multiple regions in the file. Each part
   * of the request specifies offset, size, and a delivery functor
   * which will be invoked to pass back the returned bulk.
   *
   * Arguments:
   *     parallel_ok:  If true then the delivery functor might be called
   *                   simultaneously from multiple worker threads.
   *                   The function itself will block until all the data
   *                   has been read or an error occurs.
   *     immutable_ok: If true the caller promises that the delivery
   *                   functor will not try to modify the data buffer.
   *                   Pass False e.g. if the functor may need to byteswap
   *                   the data it has read from file.
162
163
164
165
166
   *                   With the current implementation the bulk layer
   *                   will uncondiionally pass false because it doesn't
   *                   know yet whether byeswap and/or subtiling is needed.
   *                   With the current implementation this doesn't add
   *                   much cost to the cloud reader so this is probably ok.
167
168
169
   *     transient_ok: If true the caller promises that the delivery
   *                   functor will not keep a reference to the data buffer
   *                   after the functor returns.
170
171
172
173
174
175
   *                   With smart pointers it is possible to check whether
   *                   the delivery functor kept its promise and signal
   *                   a fatal error if it didn't. The reason that the code
   *                   doesn't just allow keeping a pointer and look at the
   *                   refcount on return is that this might make a future
   *                   cache module less efficient.
176
177
178
179
   *
   * The delivery functor is called as
   *     fn(void* data, std::int64_t size)
   *
180
181
182
   * size can in some cases be more than originally requested due to
   * caching and possibly less if end of file was encountered.
   *
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
   * FUTURE: a new argument partial_ok may be set to True if it is ok to
   * call the delivery functor with less data than requested, and to keep
   * calling it until all data has been delivered. The signature of the
   * delivery functor gets changed to fn(data, offset, size). Offset is the
   * absolute file offset. I.e. not relative to the requested offset.
   * Passing partial_ok=True might elide some buffer copies if the
   * caller is doing something simple (such as reading an uncompressed
   * brick) where partial copies are possible, and the backend is in the
   * cloud, and a longer lived cache is being maintained, and the cache
   * block size is smaller than the requested size. That is a lot of ifs.
   * There was some code to handle partial_ok but it has been removed.
   * Get it from the git history if you really want it.
   */
  virtual void xx_readv(const ReadList& requests, bool parallel_ok=false, bool immutable_ok=false, bool transient_ok=false, UsageHint usagehint=UsageHint::Unknown) = 0;

  /**
   * Write binary data to the file. Offset is mandatory. I.e. caller
   * is not allowed to "write to where I left off the last time".
   * The actual writing will be done in a derived class.
   * The base class only validates the arguments.
   */
  virtual void xx_write(const void* data, std::int64_t offset, std::int64_t size, UsageHint usagehint=UsageHint::Unknown) = 0;

  /**
207
208
209
210
211
212
213
214
215
216
   * Close the file. This should be done exactly once if the file was
   * opened normally, and not at all if the file was "opened" with mode
   * OpenMode::Closed. The latter is used when a backend handle is needed
   * e.g. for deleting files and the handle doesn't represent an open file.
   * Explicitly calling xx_close() more than once will raise an error.
   *
   * If the application forgets to close then the destructor will do so.
   * But in that case any exceptions will be caught and swallowed.
   *
   * Thread safety: Must not be called concurrently with any other method.
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
   */
  virtual void xx_close() = 0;

  /**
   * Return the current end-of-file, i.e. the file size.
   */
  virtual std::int64_t xx_eof() const = 0;

  /**
   * Return true if the file is on the cloud.
   * This might trigger some optimizations.
   */
  virtual bool xx_iscloud() const = 0;

protected:
  static std::string _nice(std::int64_t n);
  static void _validate_read(void *data, std::int64_t offset, std::int64_t size, std::int64_t eof, OpenMode mode);
  static void _validate_write(const void *data, std::int64_t offset, std::int64_t size, OpenMode mode);
  static void _validate_readv(const ReadList& requests, std::int64_t eof, OpenMode mode);
236
public: // Actually internal. Used by ConsolidateRequests.
237
238
  static void _deliver_old(const ReadRequest::delivery_t& fn, ReadRequest::data_t data, std::int64_t offset, std::int64_t size, bool transient);
  static void _deliver(const ReadRequest::delivery_t& fn, const std::shared_ptr<const void>& data, std::int64_t offset, std::int64_t size, bool transient);
239
240
241
242
243

public:
  static std::shared_ptr<FileADT> factory(const std::string& filename, OpenMode mode, const OpenZGY::IOContext *iocontext);
};

244
245
246
/**
 * Factory for instanciating an appropriate concrete FileADT instance.
 *
247
 * Thread safety: Synchronized using a lock.
248
 */
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
class OPENZGY_TEST_API FileFactory
{
public:
  typedef std::function<std::shared_ptr<FileADT>(const std::string&, OpenMode, const OpenZGY::IOContext*)> factory_t;

public:
  std::shared_ptr<FileADT> create(const std::string& filename, OpenMode mode, const OpenZGY::IOContext *iocontext);
  void add_factory(const factory_t& factory);
  static FileFactory& instance();

private:
  FileFactory();
  FileFactory(const FileFactory&) = delete;
  FileFactory& operator=(const FileFactory&) = delete;

private:
  std::vector<factory_t> _registry;
266
  std::mutex _mutex;
267
268
269
270
271
272
273
};

/**
 * \brief Implementation of some methods that might be shared.
 *
 * Using this class is optional. Concrete classes can inherit
 * directly from FileADT if they want to.
274
275
276
277
 *
 * Thread safety: By design, all FileADT specializations are expected to
 * allow concurrent reads but no guaranteed about anything else.
 * TODO-High: Need to analyze individual methods for thread safety issues.
278
279
280
281
 */
class FileCommon : public FileADT
{
protected:
282
283
284
285
286
  OpenMode _mode;         // Only xx_close() allowed to change these,
  std::string _name;      // and that one is non threadsafe anyway.
  /**
   * Keep track of EOF: Initially set on file open; later kept
   * up to date if we write to the file.
287
   * Thread safety: Synchronized by the per-file mutex.
288
   */
289
  std::int64_t _eof;
290
291
  std::shared_ptr<SummaryPrintingTimerEx> _rtimer; // Access is thread safe
  std::shared_ptr<SummaryPrintingTimerEx> _wtimer; // Access is thread safe
292
293

public:
294
  FileCommon(const std::string& filename, OpenMode mode);
295
  // NEW functions
296
  // TODO-Worry: All implementations need to be thread safe.
297
  virtual std::int64_t _real_eof() const;
298
299
300
  // TODO-Worry: calls xx_eof() and _real_eof() which need to be threadsafe.
  // Currently not true for SeismicStoreFileDelayedWrite, but this
  // check method won't be used there.
301
  void _check_short_read(std::int64_t offset, std::int64_t size, std::int64_t got) const;
302
303
304
};

}