// Copyright 2017-2020, Schlumberger
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

/**
\file: file.h
\brief Low level I/O, abstract layer.

This file contains the base class for low level I/O either to on-prem data
using the regular read and write methods of the OS or to a cloud back-end.
\code{.unparsed}
    // TODO-Low: To improve isolation, user visible context such as
    // OpenZGY::SeismicStoreIOContext could be copied into an equivalent
    // InternalZGY::SDConfig. Base class InternalZGY::Config in this header.
    // Ditto for plain files, an InternalZGY::FileConfig defined here.
    // The downside is that it gets more tedious to maintain.
    InternalZGY::Config:
      InternalZGY::FileConfig(Config):
      InternalZGY::SDConfig(Config):

        * Details such as user credentials etc. established when the
          file is open. Specific to the backend type.
        * Note that there is currently no way to pass a configuration
          object along with every read and write request. This might
          have been useful for a server type application but would
          require the config parameter to ripple across at least 50
          existing methods. I doubt this would be worth the trouble.

    InternalZGY::FileADT:                                 <=== file.h
      InternalZGY::LocalFile(FileADT):                    <=== file_local.h
        InternalZGY::LocalFileOther(LocalFile):           <=== file_local.h
        InternalZGY::LocalFileLinux(LocalFile):           <=== file_local.h
      InternalZGY::SeismicStoreFile(FileADT):             <=== file_sd.h
      InternalZGY::SeismicStoreFileDelayedWrite(FileADT): <=== file_sd.h

        * Higher level code should only access the polymorphic FileADT
          base class and the InternalZGY::FileFactory that creates an
          instance of the desired type.
\endcode
*/

#include <cstdint>
#include <vector>
#include <string>
#include <memory>
#include <functional>
#include <mutex>

#include "../declspec.h"
#include "timer.h"

namespace OpenZGY {
  class IOContext;
}

namespace InternalZGY {
#if 0
}
#endif

/**
 * How a file handle was (or is to be) opened.
 *
 * The numeric values are spelled out so they are visible at a glance;
 * they are the same values the implicit numbering would produce.
 */
enum class OpenMode
{
  /// Handle does not represent an open file, e.g. a backend handle
  /// that is only needed for deleting files.
  Closed    = 0,
  /// Opened for reading.
  ReadOnly  = 1,
  /// Opened for both reading and writing.
  ReadWrite = 2,
  /// NOTE(review): presumably create or overwrite the file -- confirm.
  Truncate  = 3,
};

/**
 * Hint passed along with read and write requests describing what
 * kind of data the request concerns.
 *
 * NOTE(review): the values look like bit flags, but Compressed and
 * Mixed share the same value (0x40) and therefore compare equal.
 * Confirm whether that aliasing is intended before relying on it.
 */
enum class UsageHint
{
  Unknown    = 0,
  TextFile   = 1 << 0,  // 0x01
  Header     = 1 << 4,  // 0x10
  Data       = 1 << 5,  // 0x20
  Compressed = 1 << 6,  // 0x40
  Mixed      = 1 << 6,  // 0x40, same value as Compressed
};

/**
 * Single entry in a scatter/gather read request.
 *
 * Thread safety:
 * Modification may lead to a data race. This should not be an issue,
 * because instances are only meant to be modified when created or
 * copied or assigned prior to being made available to others.
 */
class ReadRequest
{
public:
  /// Functor used to hand back the data that was read. It is invoked
  /// with a pointer to the data and a size.
  using delivery_t = std::function<void(const void*, std::int64_t)>;

  std::int64_t offset;   ///< Offset in the file of the region to read.
  std::int64_t size;     ///< Size of the region to read.
  delivery_t   delivery; ///< Invoked to pass back the returned bulk.

  /// Store the three parts of the request. The functor is copied.
  ReadRequest(std::int64_t offset_in, std::int64_t size_in, const delivery_t& delivery_in)
    : offset{offset_in}, size{size_in}, delivery{delivery_in}
  {}
};

/// The requests making up one scatter/gather read.
using ReadList = std::vector<ReadRequest>;
/// A list of request lists.
using ReadDoubleList = std::vector<ReadList>;

/**
 * \brief Internal interface for I/O operations.
 *
 * Public methods are prefixed with xx_ for practical reasons.
 * It makes it more obvious that an invocation is being made
 * on the FileADT interface. It also becomes simple to search
 * for usage.
 *
 * The class contains some protected static convenience methods
 * that specializations might need. So it isn't technically a
 * pure interface.
 *
 * Thread safety: Interfaces and classes that only contain static
 * methods do not have race conditions.
 */
class FileADT
{
public:
  virtual ~FileADT();

  /**
   * Read binary data from the file. Both size and offset are mandatory.
   * I.e. caller is not allowed to read "the entire file", and not
   * allowed to "read from where I left off the last time".
   * The actual reading will be done in a derived class.
   * The base class only validates the arguments.
   */
  virtual void xx_read(void *data, std::int64_t offset, std::int64_t size, UsageHint usagehint=UsageHint::Unknown) = 0;

  /**
   * Read binary data from multiple regions in the file. Each part
   * of the request specifies offset, size, and a delivery functor
   * which will be invoked to pass back the returned bulk.
   *
   * Arguments:
   *     parallel_ok:  If true then the delivery functor might be called
   *                   simultaneously from multiple worker threads.
   *                   The function itself will block until all the data
   *                   has been read or an error occurs.
   *     immutable_ok: If true the caller promises that the delivery
   *                   functor will not try to modify the data buffer.
   *                   Pass false e.g. if the functor may need to byteswap
   *                   the data it has read from file.
   *     transient_ok: If true the caller promises that the delivery
   *                   functor will not keep a reference to the data buffer
   *                   after the functor returns.
   *
   * The delivery functor is called as
   *     fn(const void* data, std::int64_t size)
   *
   * FUTURE: a new argument partial_ok may be set to true if it is ok to
   * call the delivery functor with less data than requested, and to keep
   * calling it until all data has been delivered. The signature of the
   * delivery functor gets changed to fn(data, offset, size). Offset is the
   * absolute file offset. I.e. not relative to the requested offset.
   * Passing partial_ok=true might elide some buffer copies if the
   * caller is doing something simple (such as reading an uncompressed
   * brick) where partial copies are possible, and the backend is in the
   * cloud, and a longer lived cache is being maintained, and the cache
   * block size is smaller than the requested size. That is a lot of ifs.
   * There was some code to handle partial_ok but it has been removed.
   * Get it from the git history if you really want it.
   */
  virtual void xx_readv(const ReadList& requests, bool parallel_ok=false, bool immutable_ok=false, bool transient_ok=false, UsageHint usagehint=UsageHint::Unknown) = 0;

  /**
   * Write binary data to the file. Offset is mandatory. I.e. caller
   * is not allowed to "write to where I left off the last time".
   * The actual writing will be done in a derived class.
   * The base class only validates the arguments.
   */
  virtual void xx_write(const void* data, std::int64_t offset, std::int64_t size, UsageHint usagehint=UsageHint::Unknown) = 0;

  /**
   * Close the file. This should be done exactly once if the file was
   * opened normally, and not at all if the file was "opened" with mode
   * OpenMode::Closed. The latter is used when a backend handle is needed
   * e.g. for deleting files and the handle doesn't represent an open file.
   * Explicitly calling xx_close() more than once will raise an error.
   *
   * If the application forgets to close then the destructor will do so.
   * But in that case any exceptions will be caught and swallowed.
   *
   * Thread safety: Must not be called concurrently with any other method.
   */
  virtual void xx_close() = 0;

  /**
   * Return the current end-of-file, i.e. the file size.
   */
  virtual std::int64_t xx_eof() const = 0;

  /**
   * Return true if the file is on the cloud.
   * This might trigger some optimizations.
   */
  virtual bool xx_iscloud() const = 0;

protected:
  // Static convenience helpers for specializations. Their definitions
  // are not visible in this header.
  // NOTE(review): _nice() presumably formats a number for use in
  // messages, and the _validate_*() helpers presumably check the
  // arguments of the corresponding xx_*() call against the end-of-file
  // and open mode they are given -- confirm against the implementation.
  static std::string _nice(std::int64_t n);
  static void _validate_read(void *data, std::int64_t offset, std::int64_t size, std::int64_t eof, OpenMode mode);
  static void _validate_write(const void *data, std::int64_t offset, std::int64_t size, OpenMode mode);
  static void _validate_readv(const ReadList& requests, std::int64_t eof, OpenMode mode);

public:
  // Create a file instance for the given name, open mode and I/O context.
  // NOTE(review): presumably forwards to FileFactory -- confirm.
  static std::shared_ptr<FileADT> factory(const std::string& filename, OpenMode mode, const OpenZGY::IOContext *iocontext);
};

/**
 * Factory for instantiating an appropriate concrete FileADT instance.
 *
 * Thread safety: Synchronized using a lock.
 */
class OPENZGY_TEST_API FileFactory
{
public:
  /**
   * Signature of a function that can be registered with add_factory().
   * It receives the file name, the open mode and the I/O context and
   * returns a shared pointer to a FileADT.
   */
  typedef std::function<std::shared_ptr<FileADT>(const std::string&, OpenMode, const OpenZGY::IOContext*)> factory_t;

public:
  /**
   * Create a FileADT instance for the given file name, mode and context.
   * NOTE(review): the implementation is not visible in this header;
   * presumably it consults the registered factories -- confirm.
   */
  std::shared_ptr<FileADT> create(const std::string& filename, OpenMode mode, const OpenZGY::IOContext *iocontext);

  /**
   * Register another factory function.
   */
  void add_factory(const factory_t& factory);

  /**
   * Access the FileFactory instance. The constructor is private and
   * copying is disabled, so this is the only way to obtain one.
   */
  static FileFactory& instance();

private:
  FileFactory();
  FileFactory(const FileFactory&) = delete;
  FileFactory& operator=(const FileFactory&) = delete;

private:
  std::vector<factory_t> _registry; // Registered factory functions.
  std::mutex _mutex;                // The lock used for synchronization.
};

/**
 * \brief Implementation of some methods that might be shared.
 *
 * Using this class is optional. Concrete classes can inherit
 * directly from FileADT if they want to.
 *
 * Thread safety: By design, all FileADT specializations are expected to
 * allow concurrent reads but no guarantees about anything else.
 * TODO-High: Need to analyze individual methods for thread safety issues.
 */
class FileCommon : public FileADT
{
protected:
263
264
265
266
267
  OpenMode _mode;         // Only xx_close() allowed to change these,
  std::string _name;      // and that one is non threadsafe anyway.
  /**
   * Keep track of EOF: Initially set on file open; later kept
   * up to date if we write to the file.
268
   * Thread safety: Synchronized by the per-file mutex.
269
   */
270
  std::int64_t _eof;
271
272
  std::shared_ptr<SummaryTimer> _rtimer; // Access is thread safe
  std::shared_ptr<SummaryTimer> _wtimer; // Access is thread safe
273
274

public:
275
  FileCommon(const std::string& filename, OpenMode mode);
276
  // NEW functions
277
  // WARNING: users responsible for thread safety.
278
  virtual std::int64_t _real_eof() const;
279
280
  // WARNING: calls xx_eof() and _real_eof() which might not be threadsafe.
  void _check_short_read(std::int64_t offset, std::int64_t size, std::int64_t got) const;
281
282
283
};

}