Commit b447b528 authored by Spencer Sutton's avatar Spencer Sutton
Browse files

Merge

parents eab1c62a 4246648d
......@@ -5,6 +5,8 @@
* [Introduction](#introduction)
* [Getting Started](#getting-started)
* * [Installation from source](#installation-from-source)
* * [Installation from Package Registry](#installation-from-package-registry)
* [Testing](#testing)
* * [Running E2E Tests](#running-e2e-tests)
* * [Running Ingestion libs Tests](#running-ingestion-libs-tests)
......@@ -35,7 +37,7 @@ cloud storages. In this `osdu_api.providers` folder CSP code is stored.
# Getting Started
To install this package:
## Installation from source
1. Pull the latest Python SDK's changes from https://community.opengroup.org/osdu/platform/system/sdks/common-python-sdk
......@@ -59,20 +61,26 @@ cd path/to/python-sdk
pip uninstall osdu-api
````
6. Install all needed dependencies
6. Install Python SDK
```sh
pip install -r requirements.txt
python setup.py install
```
7. Install Python SDK
Example import after installing:
`from osdu_api.storage.record_client import RecordClient`
## Installation from Package Registry
```sh
python setup.py install
pip install 'osdu-api' --extra-index-url=https://community.opengroup.org/api/v4/projects/148/packages/pypi/simple
```
Example import after installing:
`from osdu_api.storage.record_client import RecordClient`
**Note**: If the SDK is being installed in an environment where the `requests` and `tenacity` packages are not already installed, run:
```sh
pip install 'osdu-api[all]' --extra-index-url=https://community.opengroup.org/api/v4/projects/148/packages/pypi/simple
```
## Testing
......
0.10.1
......@@ -12,11 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import os
from configparser import SafeConfigParser
import requests
from osdu_api.configuration.base_config_manager import BaseConfigManager
from osdu_api.configuration.config_manager import DefaultConfigManager
from osdu_api.model.http_method import HttpMethod
......@@ -25,31 +26,42 @@ class BaseClient:
Base client that is meant to be extended by service specific clients
"""
def __init__(self, config_manager: BaseConfigManager = None, data_partition_id = None):
    """
    Base client gets initialized with configuration values and a bearer token
    based on provider-specific logic

    :param config_manager: ConfigManager to get configs, defaults to None
        (``_parse_config`` then falls back to ``DefaultConfigManager``)
    :type config_manager: BaseConfigManager, optional
    :param data_partition_id: Data partition id forwarded to ``_parse_config``,
        defaults to None
    :type data_partition_id: str, optional
    """
    # Parse the configuration exactly once, using the post-merge signature.
    self._parse_config(config_manager, data_partition_id)
    self.unauth_retries = 0

    # `use_service_principal` is read with `getbool`, so it is a real bool here
    # and no longer needs to be compared against the strings 'True' / 'true'.
    if self.use_service_principal:
        self._refresh_service_principal_token()
def _parse_config(self, data_partition_id):
def _parse_config(self, config_manager: BaseConfigManager = None, data_partition_id = None):
"""
Example config file:
[environment]
data_partition_id=opendes
storage_url=https://[STORAGE_ENDPOINT]/api/storage/v2
search_url=https://[SEARCH_ENDPOINT]/api/search/v2
data_workflow_url=https://[WORKFLOW_ENDPOINT]/api/data-workflow/v1
file_dms_url=https://[FILE_DMS_ENDPOINT]/api/filedms/v2
dataset_registry_url=https://[DATASET_REGISTRY_URL]/api/dataset-registry/v1
[provider]
name=aws
entitlements_module_name=entitlements_client
Parse config.
:param config_manager: ConfigManager to get configs, defaults to None
:type config_manager: BaseConfigManager, optional
"""
config_manager = config_manager or DefaultConfigManager()
self.data_partition_id = config_manager.get('environment', 'data_partition_id')
self.provider = config_manager.get('provider', 'name')
self.data_workflow_url = config_manager.get('environment', 'data_workflow_url')
self.dataset_url = config_manager.get('environment', 'dataset_url')
self.entitlements_url = config_manager.get('environment', 'entitlements_url')
self.file_dms_url = config_manager.get('environment', 'file_dms_url')
self.legal_url = config_manager.get('environment', 'legal_url')
self.schema_url = config_manager.get('environment', 'schema_url')
self.search_url = config_manager.get('environment', 'search_url')
self.storage_url = config_manager.get('environment', 'storage_url')
self.ingestion_workflow_url = config_manager.get('environment', 'ingestion_workflow_url')
self.provider = config_manager.get('provider', 'name')
self.use_service_principal = config_manager.getbool('environment', 'use_service_principal', False)
if self.use_service_principal:
self.service_principal_module_name = config_manager.get('provider', 'service_principal_module_name')
config_parser = SafeConfigParser(os.environ)
config_file_name = 'osdu_api.ini'
found_names = config_parser.read(config_file_name)
......@@ -57,23 +69,10 @@ class BaseClient:
raise Exception('Could not find osdu_api.ini config file')
if data_partition_id is None:
self.data_partition_id = config_parser.get('environment', 'data_partition_id')
self.data_partition_id = config_manager.get('environment', 'data_partition_id')
else:
self.data_partition_id = data_partition_id
self.storage_url = config_parser.get('environment', 'storage_url')
self.search_url = config_parser.get('environment', 'search_url')
self.data_workflow_url = config_parser.get('environment', 'data_workflow_url')
self.file_dms_url = config_parser.get('environment', 'file_dms_url')
self.legal_url = config_parser.get('environment', 'legal_url')
self.entitlements_url = config_parser.get('environment', 'entitlements_url')
self.dataset_url = config_parser.get('environment', 'dataset_url')
self.use_service_principal = config_parser.get('environment', 'use_service_principal')
self.schema_url = config_parser.get('environment', 'schema_url')
self.ingestion_workflow_url = config_parser.get('environment', 'ingestion_workflow_url')
self.provider = config_parser.get('provider', 'name')
self.service_principal_module_name = config_parser.get('provider', 'service_principal_module_name')
def _refresh_service_principal_token(self):
"""
The path to the logic to get a valid bearer token is dynamically injected based on
......
# Copyright 2021 Google LLC
# Copyright 2021 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
class BaseConfigManager:
    """
    Interface of a configuration manager used by OSDU clients.

    Config values are addressed by a section and an option name, the same way
    they are laid out in an .ini file.
    """
    # NOTE(review): the class does not derive from abc.ABC, so the
    # @abc.abstractmethod markers below document intent but are not enforced
    # at instantiation time.

    @abc.abstractmethod
    def get(self, section: str, option: str, default=None) -> str:
        """
        Get config value.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value, defaults to None.
        :type default: str, optional
        :return: Config value.
        :rtype: str
        """

    @abc.abstractmethod
    def getint(self, section: str, option: str, default: int = None) -> int:
        """
        Get config value as int.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value, defaults to None
        :type default: int, optional
        :return: Config value.
        :rtype: int
        """

    @abc.abstractmethod
    def getfloat(self, section: str, option: str, default: float = None) -> float:
        """
        Get config value as float.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value, defaults to None
        :type default: float, optional
        :return: Config value.
        :rtype: float
        """

    @abc.abstractmethod
    def getbool(self, section: str, option: str, default: bool = None) -> bool:
        """
        Get config value as bool.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value, defaults to None
        :type default: bool, optional
        :return: Config value
        :rtype: bool
        """
# Copyright 2021 Google LLC
# Copyright 2021 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import configparser
import os
from osdu_api.configuration.base_config_manager import BaseConfigManager
"""
Default Config Manager to work with .ini files.
The .ini file's path can be:
1. passed directly to DefaultConfigManager,
2. obtained from the OSDU_API_CONFIG_INI Env Var.
If neither of these options is provided, the file 'osdu_api.ini' will be taken from the current working directory.
"""
class DefaultConfigManager(BaseConfigManager):
    """
    This configuration manager is used for getting different configurations for OSDU clients.

    Values are read from an .ini file parsed with ``configparser.ConfigParser``.
    """

    def __init__(self, config_file_path: str = None):
        """
        Read the .ini config file by its path and parse it.

        :param config_file_path: Path to config .ini file; if it is not provided, then 'OSDU_API_CONFIG_INI' env var will be used, defaults to None
        :type config_file_path: str, optional
        :raises configparser.Error: If the .ini file can't be read.
        """
        self._parser = self._read_config(config_file_path)

    def _read_config(self, config_file_path: str = None) -> configparser.ConfigParser:
        """
        The .ini file's path can be:
        1. passed directly to DefaultConfigManager,
        2. obtained from OSDU_API_CONFIG_INI Env Var.
        If neither of these options is provided, the file 'osdu_api.ini' will be
        taken from the current working directory.

        Example config file:
        [environment]
        data_partition_id=opendes
        storage_url=https://[STORAGE_ENDPOINT]/api/storage/v2
        search_url=https://[SEARCH_ENDPOINT]/api/search/v2
        data_workflow_url=https://[WORKFLOW_ENDPOINT]/api/data-workflow/v1
        file_dms_url=https://[FILE_DMS_ENDPOINT]/api/filedms/v2
        dataset_registry_url=https://[DATASET_REGISTRY_URL]/api/dataset-registry/v1
        [provider]
        name=aws
        entitlements_module_name=entitlements_client

        :param config_file_path: Explicit path to the .ini file, defaults to None
        :type config_file_path: str, optional
        :raises configparser.Error: If the .ini file can't be read.
        :return: ConfigParser with parsed configs.
        :rtype: configparser.ConfigParser
        """
        config_file_path = (
            config_file_path
            or os.environ.get("OSDU_API_CONFIG_INI")
            or "osdu_api.ini"
        )
        parser = configparser.ConfigParser()
        # ConfigParser.read() returns the list of files it managed to parse;
        # an empty list means the file is missing or unreadable.
        config_read_results = parser.read(config_file_path)
        if not config_read_results:
            raise configparser.Error(f"Could not find the config file in '{config_file_path}'.")
        return parser

    def get(self, section: str, option: str, default: str = None) -> str:
        """
        Get config value.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value used when the option is missing; ignored
            unless it is a str, defaults to None.
        :type default: str, optional
        :raises configparser.Error: If the option is missing and no usable default is given.
        :return: Config value.
        :rtype: str
        """
        # Only pass `fallback` when a usable default exists; omitting it keeps
        # configparser's "raise on missing option" behaviour without touching
        # its private _UNSET sentinel.
        extra = {"fallback": default} if isinstance(default, str) else {}
        return self._parser.get(section=section, option=option, **extra)

    def getint(self, section: str, option: str, default: int = None) -> int:
        """
        Get config value as int.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value used when the option is missing; ignored
            unless it is an int, defaults to None
        :type default: int, optional
        :raises configparser.Error: If the option is missing and no usable default is given.
        :return: Config value.
        :rtype: int
        """
        extra = {"fallback": default} if isinstance(default, int) else {}
        return self._parser.getint(section=section, option=option, **extra)

    def getfloat(self, section: str, option: str, default: float = None) -> float:
        """
        Get config value as float.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value used when the option is missing; int
            defaults are accepted and converted to float, defaults to None
        :type default: float, optional
        :raises configparser.Error: If the option is missing and no usable default is given.
        :return: Config value.
        :rtype: float
        """
        # Accept ints (e.g. default=0) as well as floats; bool is excluded even
        # though it subclasses int, because it is not a meaningful float default.
        is_number = isinstance(default, (int, float)) and not isinstance(default, bool)
        extra = {"fallback": float(default)} if is_number else {}
        return self._parser.getfloat(section=section, option=option, **extra)

    def getbool(self, section: str, option: str, default: bool = None) -> bool:
        """
        Get config value as bool.

        :param section: Section of ini file.
        :type section: str
        :param option: Param of the section.
        :type option: str
        :param default: Default value used when the option is missing; ignored
            unless it is a bool, defaults to None
        :type default: bool, optional
        :raises configparser.Error: If the option is missing and no usable default is given.
        :return: Config value
        :rtype: bool
        """
        extra = {"fallback": default} if isinstance(default, bool) else {}
        return self._parser.getboolean(section=section, option=option, **extra)
# Copyright 2020 Google LLC
# Copyright 2020 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import os
import yaml
from osdu_api.libs.exceptions import ConfigurationError
"""
This module is used for initializing configurations, such as OSDU API endpoints, vendor info etc.
Requires the environmental variable 'OSDU_API_CONFIG' to be specified as a path to yaml-config file.
If there is 'airflow_vars' in this yaml-file, then we get configs from Airflow Variables, otherwise we get all the configs
from this yaml-file.
"""
class ConfigSource(enum.Enum):
    """Enumerates the places ConfigManager can read its configuration values from."""

    # Explicit values match what enum.auto() assigns (1, 2, ... in definition order).
    AIRFLOW = 1
    YAML = 2
class ConfigManager(object):
    """
    This configuration manager is used for getting different configurations for OSDU clients. Depending on the
    configuration yaml-file, it can return configurations specified either in Airflow or in the yaml-file.
    """

    def __init__(self):
        """
        Load the yaml-file referenced by 'OSDU_API_CONFIG' and detect the config source.

        :raises ConfigurationError: if the env var is not set or the file selects no config source.
        """
        self._config_source = None
        self.configs = None
        self._parse_yaml_config()

    def get_config_value(self, value: str) -> str:
        """
        :param value: configuration value we attempt to get. Example: 'storage_api_url'.
        :return: configuration value.
        """
        if self._config_source is ConfigSource.YAML:
            return self._read_yaml_variable(value)
        elif self._config_source is ConfigSource.AIRFLOW:
            return self._read_airflow_variable(value)

    def _parse_yaml_config(self):
        """
        Open "OSDU_API_CONFIG" yaml-file.
        If the variable 'airflow_vars' is specified in this file, then we assume, that configs are specified in Airflow variables.
        Else if the variable 'yaml_vars' is specified, then we assume that all configs are stored in this file.

        :raises ConfigurationError: if 'OSDU_API_CONFIG' is not set, or neither
            'airflow_vars' nor 'yaml_vars' is set in the file.
        """
        config_file_location = os.environ.get("OSDU_API_CONFIG")
        if not config_file_location:
            raise ConfigurationError(
                "Configuration file is not specified in the environment variable 'OSDU_API_CONFIG'")
        with open(config_file_location, 'r') as config_file:
            # yaml.load() without an explicit Loader can construct arbitrary
            # objects on old PyYAML and is a TypeError on PyYAML >= 6;
            # safe_load() parses plain config data on every version.
            # An empty file yields None, so fall back to an empty mapping and
            # let the source check below report the problem.
            self.configs = yaml.safe_load(config_file) or {}
        if self.configs.get("airflow_vars"):
            # Imported lazily so Airflow is only required when it is the config source.
            import airflow
            self.airflow_variables = airflow.models.Variable
            self._config_source = ConfigSource.AIRFLOW
        elif self.configs.get("yaml_vars"):
            self._config_source = ConfigSource.YAML
        else:
            raise ConfigurationError(
                "There is no option for getting osdu_api configs. Either 'airflow_vars' or 'yaml_vars' must be set."
            )

    def _read_yaml_variable(self, value: str) -> str:
        """
        :param value: configuration value we attempt to get from yaml-file. Example: 'storage_api_url'.
        :return: configuration value
        :raises ConfigurationError: if the value can't be looked up in the parsed yaml configs.
        """
        try:
            return self.configs[value]
        except (KeyError, TypeError) as err:
            # KeyError: the key is absent; TypeError: configs is not subscriptable
            # (e.g. still None) or the key is unhashable.
            raise ConfigurationError(f"Can't get value '{value}' from configs.") from err

    def _read_airflow_variable(self, value: str) -> str:
        """
        :param value: configuration value we attempt to get from Airflow variables. Example: 'storage_api_url'.
        :return: configuration value
        :raises ConfigurationError: if Airflow can't provide the variable.
        """
        try:
            return self.airflow_variables.get(value)
        except Exception as err:
            # Any lookup failure is reported uniformly; the original cause is
            # kept via exception chaining. KeyboardInterrupt/SystemExit are no
            # longer swallowed.
            raise ConfigurationError(f"Can't get value '{value}' from Airflow configs.") from err
......@@ -31,6 +31,9 @@ SURROGATE_KEYS_PATHS = [
("properties", "data", "allOf", 1, "properties", "Components", "items"),
]
# NOTE(review): presumably the number of ids sent per Search request batch —
# confirm against the code that uses it.
SEARCH_ID_BATCH_SIZE = 25

# Names of manifest sections, used as keys when traversing a manifest.
DATA_SECTION = "Data"
DATASETS_SECTION = "Datasets"
MASTER_DATA_SECTION = "MasterData"
......
# Copyright 2021 Google LLC
# Copyright 2021 EPAM Systems
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import List
import dataclasses
from osdu_api.libs.constants import DATA_SECTION, DATASETS_SECTION, MASTER_DATA_SECTION, \
REFERENCE_DATA_SECTION, WORK_PRODUCT_SECTION, WORK_PRODUCT_COMPONENTS_SECTION
from osdu_api.libs.exceptions import EmptyManifestError
# Module-level logger. NOTE(review): getLogger() called without a name returns
# the root logger — confirm this is intended rather than logging.getLogger(__name__).
logger = logging.getLogger()
@dataclasses.dataclass()
class ManifestEntity:
    """
    Dataclass representing a single entity of a linearized Manifest.

    Args:
        entity_data: Content of entity
        manifest_path: Path to the entity inside the manifest.
            E.g. 'ReferenceData' or 'Data.WorkProduct'
    """
    entity_data: dict
    manifest_path: str

    def __eq__(self, other: object) -> bool:
        """
        Two entities are equal when both their content and their manifest path match.

        Returns NotImplemented for objects that are not ManifestEntity, so a
        comparison with an unrelated object evaluates to False instead of
        raising AttributeError.
        """
        if not isinstance(other, ManifestEntity):
            return NotImplemented
        return self.entity_data == other.entity_data \
            and self.manifest_path == other.manifest_path
class ManifestLinearizer:
"""Class to linearize manifest and extract all manifest records"""
def _populate_manifest_entity(self, entity_data: dict, manifest_path: str):
    """
    Wrap one raw manifest entity into a ManifestEntity.

    :param entity_data: content of the manifest entity
    :param manifest_path: path of the entity inside the manifest,
        stored on the resulting ManifestEntity
    :return: ManifestEntity built from the two arguments
    """
    wrapped_entity = ManifestEntity(
        entity_data=entity_data,
        manifest_path=manifest_path,
    )
    return wrapped_entity
def _traverse_list(
    self,
    manifest_entities: List[dict],
    manifest_path: str
) -> List[ManifestEntity]:
    """
    Wrap every raw entity of a manifest section into a ManifestEntity.

    All resulting entities share the same manifest_path; input order is kept.
    """
    return [
        self._populate_manifest_entity(raw_entity, manifest_path)
        for raw_entity in manifest_entities
    ]
def linearize_manifest(self, manifest: dict) -> List[ManifestEntity]:
    """
    Flatten the manifest into a list of its records.

    Records are collected in this order: reference data, master data, the
    work product, work product components, datasets. Missing or empty
    sections are skipped.

    :param manifest: Manifest
    :return: list of records
    :raises EmptyManifestError: if the manifest is empty
    """
    if not manifest:
        raise EmptyManifestError

    records = []

    # Top-level sections that hold plain lists of entities.
    for top_level_section in (REFERENCE_DATA_SECTION, MASTER_DATA_SECTION):
        section_items = manifest.get(top_level_section)
        if section_items:
            records += self._traverse_list(section_items, top_level_section)

    data_section = manifest.get(DATA_SECTION)
    if not data_section:
        return records

    # The work product is a single entity, not a list.
    work_product = data_section.get(WORK_PRODUCT_SECTION)
    if work_product:
        work_product_path = f"{DATA_SECTION}.{WORK_PRODUCT_SECTION}"
        records.append(
            self._populate_manifest_entity(work_product, work_product_path)
        )

    for nested_section in (WORK_PRODUCT_COMPONENTS_SECTION, DATASETS_SECTION):
        nested_items = data_section.get(nested_section)
        if nested_items:
            nested_path = f"{DATA_SECTION}.{nested_section}"
            records += self._traverse_list(nested_items, nested_path)

    return records
def assemble_manifest(self, linearized_manifest: List[ManifestEntity],
manifest_kind: str = None) -> dict: