diff --git a/src/dags/libs/exceptions.py b/src/dags/libs/exceptions.py index f933923c03e631366bcaea3d48a0235ec4e7aa61..8fba0851888c03d932f2f41a3ae293297a7567da 100644 --- a/src/dags/libs/exceptions.py +++ b/src/dags/libs/exceptions.py @@ -54,6 +54,7 @@ class FileSourceError(Exception): class UploadFileError(Exception): """Raise when there is an error while uploading a file into OSDU.""" + class TokenRefresherNotPresentError(Exception): """Raise when token refresher is not present in "refresh_token' decorator.""" pass @@ -63,6 +64,7 @@ class NoParentEntitySystemSRNError(Exception): """Raise when parent entity doesn't have system-generated SRN.""" pass + class NoParentEntitySystemSRNError(Exception): """ Raise when parent entity doesn't have system-generated SRN. @@ -72,3 +74,7 @@ class NoParentEntitySystemSRNError(Exception): class InvalidFileRecordData(Exception): """Raise when file data does not contain mandatory fields.""" + + +class GenericManifestSchemaError(Exception): + """Raise when a generic manifest schema is invalid.""" diff --git a/src/dags/libs/validation/validate_file_source.py b/src/dags/libs/validation/validate_file_source.py index 90199fc2878a46b68e43f55173ecd6f2b830873c..2d60bcc686f897314b58ad79bc827e483ba1d0c5 100644 --- a/src/dags/libs/validation/validate_file_source.py +++ b/src/dags/libs/validation/validate_file_source.py @@ -35,7 +35,7 @@ class FileSourceValidator: :return: True if validation succeed or field was populated. :rtype: bool """ - if file_source_info["FileSource"]: + if file_source_info["FileSource"].strip(): return True else: return False diff --git a/src/dags/libs/validation/validate_schema.py b/src/dags/libs/validation/validate_schema.py index 217e3048a3e016d7886fc7bae7053823d70d2a13..25944d998d65600342ca63269fbe3b13ca65ee3a 100644 --- a/src/dags/libs/validation/validate_schema.py +++ b/src/dags/libs/validation/validate_schema.py @@ -17,14 +17,14 @@ import copy import logging -from typing import Any, List, Union, Tuple, Dict +from typing import Any, List, Union, Tuple, Union import jsonschema import requests import tenacity from jsonschema import exceptions from libs.context import Context -from libs.exceptions import EmptyManifestError, NotOSDUSchemaFormatError +from libs.exceptions import GenericManifestSchemaError, NotOSDUSchemaFormatError from libs.traverse_manifest import ManifestEntity from libs.mixins import HeadersMixin from osdu_api.libs.auth.authorization import TokenRefresher, authorize @@ -144,7 +144,7 @@ class SchemaValidator(HeadersMixin): response["$id"] = uri return response - def get_schema(self, kind: str) -> dict: + def get_schema(self, kind: str) -> Union[dict, None]: """Fetch schema from Schema service. :param kind: The kind of the scheema to fetch @@ -156,9 +156,10 @@ class SchemaValidator(HeadersMixin): manifest_schema_uri = f"{self.schema_service}/{kind}" try: response = self.get_schema_request(manifest_schema_uri) - except Exception as e: + except requests.HTTPError as e: logger.error(f"Error on getting schema of kind '{kind}'") - raise e + logger.error(e) + return None return response @staticmethod @@ -240,19 +241,32 @@ class SchemaValidator(HeadersMixin): """ Add 'surrogate-key:.+' to patterns of specific fields. - :param schema: - :return: + :param schema: Original schema. + :return: Extended schema. """ schema = self._extend_id_patterns_with_surrogate_keys(schema) schema = self._extend_referential_patterns_with_surrogate_keys(schema) return schema - def _validate_entity(self, entity: dict, schema: dict = None): + def _validate_entity(self, entity: dict, schema: dict = None) -> bool: """ - Validate the 'data' field of any entity against a schema got by entity's kind. + Validate an entity against a schema. + If the entity is valid, then return True, otherwise, False. + If the schema isn't passed, get it from Schema service. + + :param entity: A manifest's entity. + :param schema: The schema to validate an entity against. + :return: Validation result. """ + if not entity.get("kind"): + return False + if not schema: schema = self.get_schema(entity["kind"]) + if not schema: + logger.warning(f"{entity['kind']} is not present in Schema service.") + return False + try: schema = self._add_surrogate_keys_to_patterns(schema) self._validate_against_schema(schema, entity) @@ -266,9 +280,10 @@ class SchemaValidator(HeadersMixin): def _validate_against_schema(self, schema: dict, data: Any): """ - Validate any data against schema. - :param schema: - :param data: + Validate any data against schema. If the data is not valid, raises ValidationError. + + :param schema: The schema to validate an entity against. + :param data: Any data to validate against schema. :return: """ resolver = OSDURefResolver( @@ -288,7 +303,14 @@ class SchemaValidator(HeadersMixin): :param manifest: :return: Manifest schema """ + if not manifest.get("kind"): + raise GenericManifestSchemaError(f"There is no kind in the Manifest.") + schema = self.get_schema(manifest["kind"]) + if not schema: + raise GenericManifestSchemaError( + f"There is no schema for Manifest kind {manifest['kind']}") + schema_without_refs = copy.deepcopy(schema) if schema_without_refs.get("properties"): self._clear_data_fields(schema_without_refs["properties"]) @@ -296,7 +318,12 @@ class SchemaValidator(HeadersMixin): self._clear_data_fields(schema_without_refs) logger.debug("Schema without refs") logger.debug(f"{schema_without_refs}") - self._validate_against_schema(schema_without_refs, manifest) + + try: + self._validate_against_schema(schema_without_refs, manifest) + except jsonschema.ValidationError as err: + raise GenericManifestSchemaError(f"Manifest failed generic schema validation.\n {err}") + return schema def validate_manifest(self, manifest_records: List[ManifestEntity]) -> List[ManifestEntity]: diff --git a/src/plugins/operators/process_manifest_r3.py b/src/plugins/operators/process_manifest_r3.py index 5c3744c590a9f26db38ed30651cacf0f03c3997e..4e7e4a3f499220f1ac999db5a1b5c4976ff7109f 100644 --- a/src/plugins/operators/process_manifest_r3.py +++ b/src/plugins/operators/process_manifest_r3.py @@ -27,7 +27,7 @@ from airflow.models import BaseOperator, Variable from jsonschema import SchemaError from libs.context import Context from libs.exceptions import EmptyManifestError, NotOSDUSchemaFormatError, \ - UploadFileError, GetSchemaError + UploadFileError, GetSchemaError, GenericManifestSchemaError from libs.source_file_check import SourceFileChecker from libs.handle_file import FileHandler from libs.refresh_token import AirflowTokenRefresher @@ -96,8 +96,10 @@ class ProcessManifestOperatorR3(BaseOperator, ReceivingContextMixin): for single_manifest in manifest[slice_start_index:slice_end_index]: logger.debug(f"processing {single_manifest}") try: - record_ids.extend(single_manifest_processor.process_manifest(single_manifest, True)) - except (UploadFileError, HTTPError, GetSchemaError, SchemaError) as e: + record_ids.extend( + single_manifest_processor.process_manifest(single_manifest, True)) + except (UploadFileError, HTTPError, GetSchemaError, SchemaError, + GenericManifestSchemaError) as e: logger.error(f"Can't process {single_manifest}") logger.error(e) continue diff --git a/tests/plugin-unit-tests/test_file_source_validator.py b/tests/plugin-unit-tests/test_file_source_validator.py index 43acc65dcb553e0e26a3b64d860ff8dadb5aaca5..21bdf73f2f01372ac92eb333fd67d3c564bc6ebc 100644 --- a/tests/plugin-unit-tests/test_file_source_validator.py +++ b/tests/plugin-unit-tests/test_file_source_validator.py @@ -52,6 +52,22 @@ class TestFileSourceValidator: for i in range(len(expected_datasets)): assert expected_datasets[i] == filtered_datasets[i] + @pytest.mark.parametrize("input_datasets_path,wrong_file_source", [ + pytest.param(FILES_SOURCE_VALID, " "), + pytest.param(FILES_SOURCE_VALID, " "), + pytest.param(FILES_SOURCE_VALID, "\t"), + ]) + def test_FileSource_space_chars(self, provide_datasets: list, input_datasets_path: str, wrong_file_source: str): + """Test invalid inputs with spaces.""" + file_source_validator = FileSourceValidator() + for ds in provide_datasets: + ds["data"]["DatasetProperties"]["FileSourceInfo"]["FileSource"] = wrong_file_source + + filtered_datasets = file_source_validator.filter_valid_datasets(provide_datasets) + filtered_datasets = sorted(filtered_datasets, key=lambda k: k["id"]) + assert not filtered_datasets + + @pytest.mark.parametrize("input_datasets_path", [ pytest.param(FILES_SOURCE_INVALID), pytest.param(FILE_COLLECTIONS_INVALID), diff --git a/tests/plugin-unit-tests/test_schema_validator_r3.py b/tests/plugin-unit-tests/test_schema_validator_r3.py index e7a770343a542056b15735ee211f978e9a566ae2..8e7ecc2d60a19db64a92e4b5b824078e0b65d35e 100644 --- a/tests/plugin-unit-tests/test_schema_validator_r3.py +++ b/tests/plugin-unit-tests/test_schema_validator_r3.py @@ -194,8 +194,7 @@ class TestSchemaValidator: "get", lambda *args, **kwargs: MockSchemaResponse("{}", http.HTTPStatus.INTERNAL_SERVER_ERROR)) - with pytest.raises(requests.HTTPError): - schema_validator.get_schema(kind) + assert not schema_validator.get_schema(kind) @pytest.mark.parametrize( "manifest_file,schema_file",