Commit 620ec100 authored by Aleksandr Spivakov (EPAM)
Browse files

GONRG-2913: Added support for whitelist reference patterns to exclude from...

GONRG-2913: Added support for whitelist reference patterns to exclude from manifest integrity validation
parent 34303d2c
Pipeline #59984 passed with stages
in 1 minute and 53 seconds
......@@ -55,6 +55,7 @@ class ManifestIntegrity:
token_refresher,
file_source_validator: FileSourceValidator,
context: Context,
whitelist_ref_patterns: str = None,
):
self.search_url = search_url
self.token_refresher = token_refresher
......@@ -73,11 +74,42 @@ class ManifestIntegrity:
self.DATASET_ID_PATTERN,
self.SURROGATE_KEY_PATTERN
]
self.whitelist_ref_patterns = self._compile_whitelist_ref_patterns(whitelist_ref_patterns)
super().__init__()
def _compile_whitelist_ref_patterns(self, whitelist_ref_patterns: str) -> List[regex.Pattern]:
"""
Trying to parse whitelist reference regexp patterns from string into list of regexp compiled patterns
:param whitelist_ref_patterns: string containing various whitelist reference regexp patterns prepared for compilation
:return: list of regexp compiled patterns or nothing
"""
if not whitelist_ref_patterns:
return []
try:
logger.debug(whitelist_ref_patterns)
whitelist_ref_patterns = whitelist_ref_patterns.replace('\r\n', '\n').strip().split('\n')
return [
re.compile(r"{}".format(pattern), re.I + re.M)
for pattern in whitelist_ref_patterns
]
except Exception as e:
logger.error(f"Unable to init whitelist reference patterns: {whitelist_ref_patterns}", exc_info=e)
return []
def _match_id_with_pattern(self, pattern: regex.Pattern, source: str) -> List[str]:
return pattern.findall(source)
def _match_id_with_whitelist_pattern(self, pattern: regex.Pattern, source: str) -> List[Dict[str, str]]:
"""
Expects whitelist pattern containing (key) and (value) regexp groups
:param pattern: compiled regexp whitelist pattern
:param source: source to search with pattern
:return: pattern matches filtered by groups
"""
return [match.groupdict() for match in pattern.finditer(source)]
def _collect_ids_by_data_types(self, manifest_section: dict, data_type: str):
"""
Collect manifest entities ids by their data types.
......@@ -140,6 +172,23 @@ class ManifestIntegrity:
logger.debug(f"References of {entity.get('id')}: {references}")
return references
def _extract_whitelist_references(self, entity: dict) -> Set[str]:
"""
Extract whitelisted references from the entity.
:param entity: Manifest's entity
:return: Set of whitelisted ids to other entities or records.
"""
manifest_str = json.dumps(entity)
whitelist_references = set()
for pattern in self.whitelist_ref_patterns:
whitelist_references.update(
{match.get('value') for match in self._match_id_with_whitelist_pattern(pattern, manifest_str)}
)
logger.debug(f"Whitelist references of {entity.get('id')}: {whitelist_references}")
return whitelist_references
def _filter_not_found_ids(
self,
external_references: Iterable[EntityId],
......@@ -223,6 +272,9 @@ class ManifestIntegrity:
:param entity: Manifest's entity.
"""
references = self._extract_references(entity)
if self.whitelist_ref_patterns:
whitelist_references = self._extract_whitelist_references(entity)
references.difference_update(whitelist_references)
external_references = self._extract_external_references(entity, references)
# Search OSDU references that haven't been searched before
......
\"(?P<key>CurveUnit)\":\s?\"(?P<value>[\w\d:-]*:GAPI:)\"
\"(?P<key>CurveUnit)\":\s?\"(?P<value>[\w\d:-]*:V\/V:)\"
[
"osdu:reference-data--ResourceSecurityClassification:RESTRICTED",
"osdu:master-data--Wellbore:1013",
"osdu:reference-data--UnitOfMeasure:M",
"osdu:reference-data--UnitOfMeasure:US/F",
"osdu:reference-data--UnitOfMeasure:G/C3"
]
......@@ -17,6 +17,8 @@ import os
# Root of the test data: the "data" directory next to this module.
DATA_PATH_PREFIX = f"{os.path.dirname(__file__)}/data"
# Text file with whitelist reference regexp patterns; the integrity tests read it
# as a plain string and compile it into whitelist patterns.
# NOTE(review): presumably one pattern per line - confirm against the data file.
MANIFEST_REFERENCE_PATTERNS_WHITELIST = f"{DATA_PATH_PREFIX}/reference_patterns_whitelist.txt"
# JSON files under the "manifests" data directory.
MANIFEST_GENERIC_SCHEMA_PATH = f"{DATA_PATH_PREFIX}/manifests/schema_Manifest.1.0.0.json"
MANIFEST_NEW_GENERIC_SCHEMA_PATH = f"{DATA_PATH_PREFIX}/manifests/new_schema_Manifest.1.0.0.json"
MANIFEST_GENERIC_PATH = f"{DATA_PATH_PREFIX}/manifests/Manifest.1.0.0.json"
......@@ -56,6 +58,7 @@ REF_RESULT_WELL_PATH = f"{DATA_PATH_PREFIX}/master/ref_result_r3_Well.json"
# Well log manifest; the integrity tests load it as JSON and take the first
# item of Data.WorkProductComponents as the entity under test.
MANIFEST_WELLLOG_PATH = f"{DATA_PATH_PREFIX}/workProduct/r3_Welllog.json"
TRAVERSAL_WELLLOG_PATH = f"{DATA_PATH_PREFIX}/workProduct/traversal_r3_Welllog.json"
REF_RESULT_WELLLOG_PATH = f"{DATA_PATH_PREFIX}/workProduct/ref_result_r3_Welllog.json"
# Expected external reference ids of the well log entity once the whitelisted
# references have been removed.
REF_RESULT_WHITELIST_WELLLOG_PATH = f"{DATA_PATH_PREFIX}/workProduct/ref_result_whitelist_r3_Welllog.json"
BATCH_MANIFEST_WELLBORE = f"{DATA_PATH_PREFIX}/batchManifest/Wellbore.0.3.0.json"
DATA_INTEGRITY_VALID_DATA = f"{DATA_PATH_PREFIX}/data_integrity/valid_data.json"
......
......@@ -25,7 +25,8 @@ from unittest.mock import patch
import pytest
from mock_providers import get_test_credentials
from file_paths import MANIFEST_WELL_PATH, REF_RESULT_WELL_PATH, MANIFEST_WELLLOG_PATH, \
REF_RESULT_WELLLOG_PATH, MANIFEST_GENERIC_PATH
REF_RESULT_WELLLOG_PATH, REF_RESULT_WHITELIST_WELLLOG_PATH, MANIFEST_GENERIC_PATH, \
MANIFEST_REFERENCE_PATTERNS_WHITELIST
from osdu_api.libs.exceptions import ValidationIntegrityError
from osdu_api.libs.refresh_token import BaseTokenRefresher
from osdu_api.libs.context import Context
......@@ -57,6 +58,12 @@ class TestIntegrityProvider:
context)
return manifest_integrity
@pytest.fixture
def whitelist_ref_patterns_str(self) -> str:
    """
    Provide the raw content of the whitelist reference patterns file.

    :return: the whole file content as a single string (not a list), ready to be
        passed to ``_compile_whitelist_ref_patterns``
    """
    with open(MANIFEST_REFERENCE_PATTERNS_WHITELIST) as f:
        return f.read()
@pytest.mark.parametrize(
"conf_path,ref_result_file",
[
......@@ -79,6 +86,36 @@ class TestIntegrityProvider:
test_data, references)
assert set(e.id for e in manifest_records) == set(expected_result)
@pytest.mark.parametrize(
    "conf_path,ref_result_file",
    [
        pytest.param(
            MANIFEST_WELLLOG_PATH,
            REF_RESULT_WHITELIST_WELLLOG_PATH,
            id="Validate manifest WPC"
        )
    ]
)
def test_extract_references_with_applied_ref_patterns_whitelist(self, monkeypatch,
                                                                manifest_integrity,
                                                                whitelist_ref_patterns_str: str,
                                                                conf_path: str,
                                                                ref_result_file: str):
    """
    Check that whitelisted references are excluded from the external references.

    The whitelist patterns are compiled from the fixture string and installed on the
    integrity object; the references left after subtracting the whitelisted ones must
    resolve to exactly the expected external ids.
    """
    with open(ref_result_file) as expected_file:
        expected_ids = json.load(expected_file)
    with open(conf_path) as manifest_file:
        manifest = json.load(manifest_file)
    work_product_component = manifest["Data"]["WorkProductComponents"][0]

    # Install the whitelist on the instance before any reference extraction.
    manifest_integrity.whitelist_ref_patterns = \
        manifest_integrity._compile_whitelist_ref_patterns(whitelist_ref_patterns_str)
    manifest_integrity._collect_manifest_entities_ids(manifest)

    remaining_references = manifest_integrity._extract_references(work_product_component)
    remaining_references.difference_update(
        manifest_integrity._extract_whitelist_references(work_product_component)
    )
    external_references = manifest_integrity._extract_external_references(
        work_product_component, remaining_references)

    actual_ids = {reference.id for reference in external_references}
    assert actual_ids == set(expected_ids)
@pytest.mark.parametrize(
"conf_path,ref_result_file",
[
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment