Commit 40f03e49 authored by Yan Sushchynski (EPAM)'s avatar Yan Sushchynski (EPAM)
Browse files

GONRG-3452: Move Ingestion from Python SDK

parent be271a83
**/.idea
**/.DS_Store
**/.vscode
# Byte-compiled / optimized / DLL files
**/__pycache__/
**/*.py[cod]
**/*$py.class
.pytest_cache/*
*.log
# C extensions
**/*.so
# Distribution / packaging
**/.Python
**/build/
**/develop-eggs/
**/dist/
**/downloads/
**/eggs/
**/.eggs/
**/lib/
**/lib64/
**/parts/
**/sdist/
**/var/
**/wheels/
**/share/python-wheels/
**/*.egg-info/
**/.installed.cfg
**/*.egg
**/MANIFEST
**/venv
# ignore coverage.py
htmlcov/*
.coverage
# Image used by every job that does not declare its own.
default:
  image: "python:3.6-slim-buster"

# Variables shared by all jobs in the pipeline.
variables:
  OSDU_API_LIBS_DIR: "$CI_BUILDS_DIR"
  CLOUD_PROVIDER: "provider_test"
  BUILD_TAG: "$CI_COMMIT_TAG"
  BUILD_COMMIT_SHORT_SHA: "$CI_COMMIT_SHORT_SHA"
  BUILD_ID: "$CI_PIPELINE_IID"

# Stages are executed in the order listed.
stages:
  - linters
  - test
  - deploy
# Lints osdu_ingestion/libs with pylint; allow_failure keeps a failing lint
# from failing the pipeline.
pylint:
  stage: linters
  allow_failure: true
  script:
    - python -m pip install setuptools pylint pylint_quotes pylint-exit
    - pip install -r requirements.txt
    # Record pylint's exit status instead of stopping on it, then exit with
    # that status on the next line.
    - pylint --rcfile=.pylintrc osdu_ingestion/libs || EXIT_CODE=$?
    - exit ${EXIT_CODE}
# Checks import ordering in osdu_ingestion/libs with isort (check-only,
# verbose); allow_failure keeps a failing check from failing the pipeline.
isort:
  stage: linters
  allow_failure: true
  script:
    - python -m pip install setuptools isort
    # Record isort's exit status instead of stopping on it, then exit with
    # that status on the next line.
    - isort -c -v osdu_ingestion/libs || EXIT_CODE=$?
    - exit ${EXIT_CODE}
# Installs the development requirements and runs the libs unit tests.
test-libs:
  stage: test
  script:
    - pip install -r requirements-dev.txt
    - python -m pytest ./osdu_ingestion/tests/libs-unit-tests
# Builds sdist and wheel archives and uploads them to this project's GitLab
# PyPI package registry. Runs only when triggered manually.
create-package:
  stage: deploy
  when: manual
  script:
    - pip install -r requirements.txt
    - pip install twine
    - python setup.py sdist bdist_wheel
    # Folded scalar: the lines below are joined with single spaces into one
    # shell command. Twine authenticates with the job token.
    - >-
      TWINE_PASSWORD=${CI_JOB_TOKEN}
      TWINE_USERNAME=gitlab-ci-token
      python -m twine upload
      --repository-url ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi
      dist/*
[MASTER]
# Specify a configuration file.
#rcfile=.pylintrc
# Profiled execution.
profile=no
# Add <file or directory> to the black list. It should be a base name, not a
# path. You may set this option multiple times.
ignore=.git, .venv, .idea, CVS
# Pickle collected data for later comparisons.
persistent=yes
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=pylint_quotes
[MESSAGES CONTROL]
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by commas (,) or put this option
# multiple times.
enable=c-extension-no-member
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by commas (,) or put this
# option multiple times.
disable=no-member, no-self-use
[REPORTS]
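# Python expression which should return a score of at most 10 (10 is the
# highest score). You have access to the variables error, warning, refactor,
# convention and statement, which respectively contain the number of messages
# in each category and the total number of statements analyzed. This is used
# by the global evaluation report (R0004).
# NOTE(review): the expression below starts from 13.0 rather than 10.0, so the
# result can exceed 10 -- confirm this offset is intentional.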
evaluation=13.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html
output-format=text
# Include message's id in output
include-ids=yes
# Tells whether to display a full report or only the messages
reports=yes
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=yes
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
[BASIC]
# Required attributes for module, separated by a comma
required-attributes=
# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression which should only match correct module level names
const-rgx=
# Regular expression which should only match correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Regular expression which should only match correct function names
function-rgx=[a-z_][a-z0-9_]{2,60}$
# Regular expression which should only match correct method names
method-rgx=[a-z_][a-z0-9_]{2,60}$
# Regular expression which should only match correct instance attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match correct list comprehension /
# generator expression variable names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Good variable names which should always be accepted, separated by a comma
good-names=e,f,i,j,k,ex,Run,_
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Regular expression which should only match functions or classes name which do
# not require a docstring
no-docstring-rgx=__.*__
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually "    " (4 spaces) or "\t"
# (1 tab).
indent-string=' '
# Set the linting for string quotes
string-quote=double
triple-quote=double
docstring-quote=double
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
# Format style used to check logging format string. `old` means using %
# formatting, `new` is for `{}` formatting, and `fstr` is for f-strings.
logging-format-style=new
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[DESIGN]
# Maximum number of arguments for function / method
max-args=7
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=0
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of boolean expressions in an if statement
max-bool-expr=5
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException, Exception
# OSDU Ingestion Lib
## Contents
* [Introduction](#introduction)
* [Getting Started](#getting-started)
  * [Installation from Package Registry](#installation-from-package-registry)
* [Testing](#testing)
  * [Running Ingestion libs Tests](#running-ingestion-libs-tests)
* [Licence](#licence)
# Introduction
OSDU Ingestion Lib is a package to implement steps of Manifest Based Ingestion.
OSDU Ingestion Lib is cloud platform-agnostic by design.
OSDU Ingestion Lib provides several components for the ingestion process in the `osdu_ingestion.libs` package. Among them:
- validating OSDU entities against corresponding schemas;
- ensuring referential integrity;
- finding parent-child relationships between entities;
- storing records in Storage Service;
- etc.
# Getting Started
## Installation from Package Registry
```sh
pip install osdu-ingestion --extra-index-url https://community.opengroup.org/api/v4/projects/823/packages/pypi/simple
```
## Testing
### Running ingestion libs tests
```shell
export CLOUD_PROVIDER=provider_test
pip install -r requirements-dev.txt
python -m pytest ./osdu_ingestion/tests/libs-unit-tests
```
## Licence
Copyright © Google LLC
Copyright © EPAM Systems
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
[http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0)
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
A package to interface with OSDU microservices
# Copyright 2021 Google LLC
# Copyright 2021 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2020 Google LLC
# Copyright 2020 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2020 Google LLC
# Copyright 2020 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module-level constants."""

# NOTE(review): the units of TIMEOUT and WAIT are not stated in this module --
# presumably seconds; confirm at the call sites.
RETRIES = 3
TIMEOUT = 1
WAIT = 10

FIRST_STORED_RECORD_INDEX = 0

# Paths to extend schema fields with surrogate keys
DATA_TYPES_WITH_SURROGATE_KEYS = (
    "dataset",
    "work-product",
    "work-product-component",
)

# Schema definition id shared by the first two surrogate-key paths below.
_ABSTRACT_WPC_GROUP_TYPE = "{{data-partition-id}}:wks:AbstractWPCGroupType:1.0.0"

SURROGATE_KEYS_PATHS = [
    (
        "definitions",
        _ABSTRACT_WPC_GROUP_TYPE,
        "properties",
        "Datasets",
        "items",
    ),
    (
        "definitions",
        _ABSTRACT_WPC_GROUP_TYPE,
        "properties",
        "Artefacts",
        "items",
        "properties",
        "ResourceID",
    ),
    (
        "properties",
        "data",
        "allOf",
        1,
        "properties",
        "Components",
        "items",
    ),
]

# Batch sizes
SEARCH_ID_BATCH_SIZE = 25
SAVE_RECORDS_BATCH_SIZE = 500

# Section names
DATA_SECTION = "Data"
DATASETS_SECTION = "Datasets"
MASTER_DATA_SECTION = "MasterData"
REFERENCE_DATA_SECTION = "ReferenceData"
WORK_PRODUCT_SECTION = "WorkProduct"
WORK_PRODUCT_COMPONENTS_SECTION = "WorkProductComponents"
# Copyright 2020 Google LLC
# Copyright 2020 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Context module."""
import dataclasses
@dataclasses.dataclass
class Context:
    """Class to store data-partition-id and AppKey."""

    data_partition_id: str
    app_key: str

    @classmethod
    def populate(cls, ctx: dict) -> "Context":
        """
        Populates Context dataclass from dagrun.conf dict.

        The input dict is only read, never modified, so the same conf can be
        passed to ``populate`` more than once.

        :param ctx: dict with a ``Payload`` mapping that holds ``AppKey`` and,
            optionally, ``data-partition-id``; when the payload has no
            ``data-partition-id``, the top-level ``dataPartitionId`` key of
            ``ctx`` is used instead
        :type ctx: dict
        :raises KeyError: if ``Payload`` or ``AppKey`` is missing, or if
            neither partition-id key is present
        :return: populated Context
        :rtype: Context
        """
        # Plain lookup instead of ``pop``: popping removed "Payload" from the
        # caller's dict, so a second call on the same conf raised KeyError.
        ctx_payload = ctx["Payload"]
        try:
            data_partition_id = ctx_payload["data-partition-id"]
        except KeyError:
            # to support some DAGs payload interface
            data_partition_id = ctx["dataPartitionId"]
        return cls(app_key=ctx_payload["AppKey"],
                   data_partition_id=data_partition_id)
# Copyright 2020 Google LLC
# Copyright 2020 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Exceptions module."""
from typing import List
from osdu_ingestion.libs.utils import create_skipped_entity_info
class RecordsNotSearchableError(Exception):
    """Raised when the expected totalCount of records differs from the actual one."""
class PipelineFailedError(Exception):
    """Raised when the pipeline has failed."""
class EmptyManifestError(Exception):
    """Raised when the manifest field is empty."""
class GetSchemaError(Exception):
    """Raised when a schema cannot be found."""
class SRNNotFound(Exception):
    """Raised when an SRN cannot be found."""
class NotOSDUSchemaFormatError(Exception):
    """Raised when a schema does not correspond to the OSDU format."""
class FileSourceError(Exception):
    """Raised when no file exists under the given URI path."""
class UploadFileError(Exception):
    """Raised when an error occurs while uploading a file into OSDU."""
class TokenRefresherNotPresentError(Exception):
    """Raised when a token refresher is not present in the "refresh_token" decorator."""
class NoParentEntitySystemSRNError(Exception):
    """Raised when a parent entity has no system-generated SRN."""
class InvalidFileRecordData(Exception):
    """Raised when file data lacks mandatory fields."""
class GenericManifestSchemaError(Exception):
    """Raised when a generic manifest schema is invalid."""
class BaseEntityValidationError(Exception):
"""
Base Error for failed validations.
"""