Commit b28f45e0 authored by Mark Hewitt

dataload ingest - support sequence file for ordered loading (ref. standard reference-data)

parent b5985575
Pipeline #73887 passed with stages in 2 minutes and 25 seconds
@@ -28,13 +28,18 @@ For more information, specify the `-h` flag:
 Change Log
 ==========
 
+0.0.20
+------
+- dataload support sequence file for ordered loading (ref. standard reference-data)
+
 0.0.19
 ------
 - schema add --overwrite-existing option
 - merge dataload checkrefs code (wip)
 - user friendly output mode
-- updated dataload helper text
+- improved dataload helper text
 
 0.0.18
 ------
@@ -12,4 +12,4 @@
 """ OSDU command line environment"""
-__VERSION__ = "0.0.19"
+__VERSION__ = "0.0.20"
@@ -44,7 +44,7 @@ VERIFY_BATCH_SIZE = 200
 @click.option(
     "-p",
     "--path",
-    help="Path to a manifest file or files to ingest.",
+    help="Path to a sequence file, manifest file or folder with manifest files to ingest.",
     type=click.Path(exists=True, file_okay=True, dir_okay=True, readable=True, resolve_path=True),
     required=True,
 )
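With this change the same `-p/--path` flag accepts a sequence file as well as a single manifest file or a folder of manifests, e.g. `osdu dataload ingest -p loading-sequence.json` (the `osdu` entry-point name and the file name here are illustrative assumptions, not part of this diff).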
@@ -118,7 +118,14 @@ def ingest(
     logger.debug("Files list: %s", files)
     runids = _ingest_files(
-        state.config, manifest_files, files, runid_log, batch_size, wait, skip_existing, simulate
+        state.config,
+        manifest_files,
+        files,
+        runid_log,
+        batch_size,
+        wait,
+        skip_existing,
+        simulate,
     )
     print(runids)
     return runids
@@ -139,54 +146,93 @@ def _ingest_files(  # noqa:C901 pylint: disable=R0912,too-many-nested-blocks
         for filepath in manifest_files:
             if filepath.endswith(".json"):
                 with open(filepath) as file:
-                    manifest = json.load(file)
+                    json_string = file.read()
 
-                # Note this code currently assumes only one of MasterData, ReferenceData or Data exists!
+                # for reference data do replacements (acl, legal happens later)
+                json_string = json_string.replace(
+                    "{{NAMESPACE}}", config.get("core", CONFIG_DATA_PARTITION_ID)
+                )
+                manifest = json.loads(json_string)
                 if not manifest:
                     logger.error("Error with file %s. File is empty.", filepath)
                 else:
                     logger.info("Processing %s.", filepath)
-                    if "ReferenceData" in manifest and len(manifest["ReferenceData"]) > 0:
-                        _update_legal_and_acl_tags_all(config, manifest["ReferenceData"])
-                        if batch_size is None and not skip_existing:
-                            _create_and_submit(config, manifest, runids, runid_log_handle, simulate)
-                        else:
-                            data_objects += manifest["ReferenceData"]
-                            file_batch_size = (
-                                len(data_objects) if skip_existing and not batch_size else batch_size
-                            )
-                            data_objects = _process_batch(
-                                config,
-                                file_batch_size,
-                                "ReferenceData",
-                                data_objects,
-                                runids,
-                                runid_log_handle,
-                                skip_existing,
-                                simulate,
-                            )
-                    elif "MasterData" in manifest and len(manifest["MasterData"]) > 0:
-                        _update_legal_and_acl_tags_all(config, manifest["MasterData"])
-                        if batch_size is None and not skip_existing:
-                            _create_and_submit(config, manifest, runids, runid_log_handle, simulate)
-                        else:
-                            data_objects += manifest["MasterData"]
-                            file_batch_size = (
-                                len(data_objects) if skip_existing and not batch_size else batch_size
-                            )
-                            data_objects = _process_batch(
-                                config,
-                                file_batch_size,
-                                "MasterData",
-                                data_objects,
-                                runids,
-                                runid_log_handle,
-                                skip_existing,
-                                simulate,
-                            )
-                    elif "Data" in manifest:
-                        _update_work_products_metadata(config, manifest["Data"], files, simulate)
-                        _create_and_submit(config, manifest, runids, runid_log_handle, simulate)
+
+                    if isinstance(manifest, list):
+                        _ingest_json_as_sequence_file(
+                            config,
+                            files,
+                            runid_log,
+                            batch_size,
+                            skip_existing,
+                            simulate,
+                            runids,
+                            manifest,
+                        )
+                    else:
+                        # Note this code currently assumes only one of MasterData, ReferenceData or Data exists!
+                        if "ReferenceData" in manifest and len(manifest["ReferenceData"]) > 0:
+                            _update_legal_and_acl_tags_all(config, manifest["ReferenceData"])
+                            if batch_size is None and not skip_existing:
+                                _create_and_submit(
+                                    config, manifest, runids, runid_log_handle, simulate
+                                )
+                            else:
+                                data_objects += manifest["ReferenceData"]
+                                file_batch_size = (
+                                    len(data_objects)
+                                    if skip_existing and not batch_size
+                                    else batch_size
+                                )
+                                data_objects = _process_batch(
+                                    config,
+                                    file_batch_size,
+                                    "ReferenceData",
+                                    data_objects,
+                                    runids,
+                                    runid_log_handle,
+                                    skip_existing,
+                                    simulate,
+                                )
+                        elif "MasterData" in manifest and len(manifest["MasterData"]) > 0:
+                            _update_legal_and_acl_tags_all(config, manifest["MasterData"])
+                            if batch_size is None and not skip_existing:
+                                _create_and_submit(
+                                    config, manifest, runids, runid_log_handle, simulate
+                                )
+                            else:
+                                data_objects += manifest["MasterData"]
+                                file_batch_size = (
+                                    len(data_objects)
+                                    if skip_existing and not batch_size
+                                    else batch_size
+                                )
+                                data_objects = _process_batch(
+                                    config,
+                                    file_batch_size,
+                                    "MasterData",
+                                    data_objects,
+                                    runids,
+                                    runid_log_handle,
+                                    skip_existing,
+                                    simulate,
+                                )
+                        elif "Data" in manifest:
+                            _update_work_products_metadata(
+                                config, manifest["Data"], files, simulate
+                            )
+                            _create_and_submit(
+                                config, manifest, runids, runid_log_handle, simulate
+                            )
+                        else:
+                            logger.error(
+                                "No manifest ReferenceData, MasterData or Data section found in %s.",
+                                filepath,
+                            )
+            else:
+                logger.warning("Skipping %s - no .json extension.", filepath)
     finally:
         if runid_log_handle is not None:
             runid_log_handle.close()
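The new read-then-replace step above templates the data partition id into reference-data manifests before they are parsed. A minimal standalone sketch of that substitution, with the partition id hard-coded to a hypothetical "opendes" and an invented record id (in the diff it comes from config.get("core", CONFIG_DATA_PARTITION_ID)):

    import json

    # Hypothetical manifest fragment using the {{NAMESPACE}} placeholder.
    json_string = '{"id": "{{NAMESPACE}}:reference-data--OperatingEnvironment:External:"}'

    # Same replace-then-parse flow as the diff, with the partition id inlined.
    manifest = json.loads(json_string.replace("{{NAMESPACE}}", "opendes"))
    print(manifest["id"])  # opendes:reference-data--OperatingEnvironment:External: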
@@ -197,6 +243,30 @@ def _ingest_files(  # noqa:C901 pylint: disable=R0912,too-many-nested-blocks
     return runids
 
 
+def _ingest_json_as_sequence_file(
+    config, files, runid_log, batch_size, skip_existing, simulate, runids, sequence_file
+):
+    logger.info(
+        "Processing as sequence file. Will wait for each entry to complete before submitting new."
+    )
+    if all(isinstance(entry, dict) and "FileName" in entry for entry in sequence_file):
+        for entry in sequence_file:
+            _sequence_run_ids = _ingest_files(
+                config,
+                [entry["FileName"]],
+                files,
+                runid_log,
+                batch_size,
+                True,
+                skip_existing,
+                simulate,
+            )
+            runids.extend(_sequence_run_ids)
+    else:
+        logger.error("Invalid sequence file.")
+
+
 def _process_batch(
     config, batch_size, data_type, data_objects, runids, runid_log_handle, skip_existing, simulate
 ):
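For reference, _ingest_json_as_sequence_file treats a top-level JSON array as a sequence file and, by passing True for the wait argument of _ingest_files, loads each entry to completion before submitting the next; that is what makes ordered loading of interdependent reference data possible. A minimal sketch of the file shape it accepts (the file names are hypothetical):

    import json

    # Ordered loading sequence: each entry names a manifest file, and order
    # matters because later manifests may reference records loaded earlier.
    sequence = [
        {"FileName": "reference-data/unit-quantity.json"},
        {"FileName": "reference-data/unit-of-measure.json"},
    ]

    # The same shape check the new function applies before ingesting anything.
    assert all(isinstance(entry, dict) and "FileName" in entry for entry in sequence)

    with open("loading-sequence.json", "w") as f:
        json.dump(sequence, f, indent=2)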