Commit 8b2ffc7a authored by Igor Zimovets (EPAM)

Merge branch 'master' into GONRG-4118-Update-Helms-with-imagePullPolicy-value

parents 9de1b65a 62055c53
Pipeline #87552 failed in 15 seconds
@@ -46,6 +46,9 @@ include:
   - project: "osdu/platform/ci-cd-pipelines"
     file: "build/python.yml"
+  - project: "osdu/platform/ci-cd-pipelines"
+    file: "scanners/fossa-python.yml"
   - project: "osdu/platform/ci-cd-pipelines"
     file: "scanners/gitlab-ultimate.yml"
@@ -128,9 +131,6 @@ osdu-gcp-containerize-gitlab:
   image: docker:19.03
   cache: {}
   tags: ["osdu-medium"]
-  only:
-    variables:
-      - $OSDU_GCP == 'true'
   variables:
     IMAGE_TAG: $CI_REGISTRY_IMAGE/osdu-gcp:$CI_COMMIT_SHORT_SHA
     IMAGE_TAG_LATEST: $CI_REGISTRY_IMAGE/osdu-gcp:latest
@@ -192,9 +192,6 @@ osdu-gcp-test-python:
   stage: integration
   image: gcr.io/google.com/cloudsdktool/cloud-sdk
   needs: ["osdu-gcp-deploy-deployment"]
-  only:
-    variables:
-      - $OSDU_GCP == 'true' && $OSDU_GCP_INT_TEST_TYPE == 'python'
   script:
     - apt-get install -y python3-venv
     - python3 -m venv env
@@ -224,136 +221,10 @@ osdu-gcp-test:
   extends:
     - .osdu-gcp-variables
 
-osdu-gcp-dev2-test:
-  extends:
-    - .osdu-gcp-dev2-variables
-
 # Allow failure on private development deployments
 ibm-deploy-devpri:
   allow_failure: true
-
-# --------------------------------------------------------------------------------
-# Experimental FOSSA jobs. These will be promoted to the standard ci-cd-pipelines after
-# they've had some testing in a real project
-
-fossa-analyze:
-  image: $CI_REGISTRY/divido/fossa-with-cache/incremental:latest
-  stage: scan
-  needs: ['compile-and-unit-test']
-  rules:
-    - if: $FOSSA_API_KEY
-  variables:
-    FOSSA_OUTPUT_DIR: fossa-output
-  artifacts:
-    paths:
-      - fossa-output
-    when: always
-    expire_in: 2 days
-  script:
-    # fossa-with-cache needs a CI_COMMIT_BRANCH defined to know how to parse the FOSSA API results
-    # When building tags, this isn't defined by GitLab. In that case, we use the tag name instead. If that's not defined
-    # then things will fail and we'll have to make this smarter
-    - test -z "$CI_COMMIT_BRANCH" && export CI_COMMIT_BRANCH="$CI_COMMIT_TAG"
-    - |
-      if [ ! -e all-requirements.txt ]; then
-        echo "I was expecting a file named 'all-requirements.txt' to have been generated by compile-and-unit-test"
-        echo "However, that file doesn't seem to exist"
-        echo "----------------------------------------"
-        echo "That file should have been the output of a 'pip freeze', so that I knew what the full list of deep"
-        echo "dependencies were. I can't reasonably generate that in this job, because I don't know what python image"
-        echo "is appropriate. If this structure has been changed in the build/python.yml, you may need to update this"
-        echo "logic as well (in scanners/fossa-python.yml)"
-        exit 1
-      fi
-    # This variable is used by the python build environment to refer to the set of requirements that need to
-    # be compiled down into the single 'all-requirements.txt'. Here, we override it to supply fossa-with-cache
-    # with a direct answer.
-    - PIP_REQUIREMENTS=all-requirements.txt fossa-with-cache
-
-fossa-check-notice:
-  image: $CI_REGISTRY/divido/fossa-with-cache/incremental:latest
-  stage: scan
-  needs: ['fossa-analyze']
-  tags: ['osdu-small']
-  rules:
-    - if: $FOSSA_API_KEY
-  artifacts:
-    when: on_failure
-    paths:
-      - fossa-output/cached-NOTICE
-      - fossa-output/generated-clean-NOTICE
-    expire_in: 2 days
-  script:
-    # Check to see if a newer commit exists for the pipeline's branch, and if it does, use that NOTICE instead of this one's
-    - |
-      if [ "$CI_COMMIT_BRANCH" != "" ]; then
-        colorCmd="\e[32;1m"
-        colorReset="\e[0m"
-        function echoCmd() {
-          echo -e "${colorCmd}>" "$@" "${colorReset}"
-        }
-        echoCmd git fetch
-        git fetch
-        echoCmd git diff --name-only HEAD origin/$CI_COMMIT_BRANCH
-        branchDiffs="$(git diff --name-only HEAD origin/$CI_COMMIT_BRANCH)"
-        echo $branchDiffs
-        echo "--------------------"
-        if [ "$branchDiffs" == "NOTICE" ]; then
-          echo "The branch associated with this pipeline ($CI_COMMIT_BRANCH) has been changed, but the only changes are the NOTICE file"
-          echo "I will use the NOTICE file from origin/$CI_COMMIT_BRANCH ($(git rev-parse --short origin/$CI_COMMIT_BRANCH)) as the basis for comparison"
-          echoCmd git checkout origin/$CI_COMMIT_BRANCH -- NOTICE
-          git checkout origin/$CI_COMMIT_BRANCH -- NOTICE
-        elif [ "$branchDiffs" == "" ]; then
-          echo "The branch associated with this pipeline ($CI_COMMIT_BRANCH) has not been changed since the commit that spawned this pipeline"
-          echo "I will use the NOTICE file from the pipeline's commit ($CI_COMMIT_SHORT_SHA) as the basis for comparison"
-        else
-          echo "The branch associated with this pipeline ($CI_COMMIT_BRANCH) has been changed, but the changes include more than just the NOTICE file"
-          echo "I will use the NOTICE file from the pipeline's commit ($CI_COMMIT_SHORT_SHA) as the basis for comparison"
-        fi
-      fi
-    # Use a cached NOTICE if available, otherwise use a generated one
-    - |
-      if [ -e fossa-output/cached-NOTICE ]; then
-        fossaGeneratedNotice=fossa-output/cached-NOTICE;
-      elif [ -e fossa-output/generated-clean-NOTICE ]; then
-        fossaGeneratedNotice=fossa-output/generated-clean-NOTICE
-      else
-        echo "Couldn't find either a cached-NOTICE or generated-clean-NOTICE in the fossa-output/ directory"
-        echo
-        echo "At least one of these should have been generated by a previous job stage (fossa-analyze) and stored"
-        echo "as an artifact. Something must be wrong in the CI setup"
-        exit 1
-      fi
-      echo "Comparing with $fossaGeneratedNotice"
-    # If the comparison finds differences, let the user know what to do next
-    - |
-      if ! fossa-compare-notices NOTICE $fossaGeneratedNotice; then
-        echo --------------------------------------------------------------------------------
-        echo "There are differences in the NOTICE file"
-        echo "Please review these differences, and if they look appropriate based on your"
-        echo "changes, update the committed NOTICE file"
-        echo "--------------------"
-        echo "If you make changes to the NOTICE file (and only the NOTICE file), you can"
-        echo "re-run this single stage of the pipeline alone rather than the whole pipeline"
-        echo "One way to achieve this:"
-        echo "$ wget -O NOTICE '${CI_PROJECT_URL}/-/jobs/${CI_JOB_ID}/artifacts/raw/${fossaGeneratedNotice}?inline=false'"
-        echo "$ git add NOTICE"
-        echo "$ git commit -m 'Updating NOTICE'"
-        echo "$ git push -o ci.skip"
-        echo "Then retry this job"
-        exit 1
-      fi
@@ -9,7 +9,7 @@ Apache-2.0
 The following software have components provided under the terms of this license:
 
 - aiobotocore (from https://github.com/aio-libs/aiobotocore)
-- aiohttp (from https://github.com/aio-libs/aiohttp/)
+- aiohttp (from https://github.com/aio-libs/aiohttp)
 - async-timeout (from https://github.com/aio-libs/async_timeout/)
 - boto3 (from https://github.com/boto/boto3)
 - botocore (from https://github.com/boto/botocore)
@@ -226,9 +226,9 @@ MIT
 The following software have components provided under the terms of this license:
 
 - PyJWT (from http://github.com/jpadilla/pyjwt)
-- PyYAML
+- PyYAML (from http://pyyaml.org/wiki/PyYAML)
 - adal (from https://github.com/AzureAD/azure-activedirectory-library-for-python)
-- aiohttp (from https://github.com/aio-libs/aiohttp/)
+- aiohttp (from https://github.com/aio-libs/aiohttp)
 - aioitertools (from https://github.com/jreese/aioitertools)
 - aioredis (from https://github.com/aio-libs/aioredis)
 - anyio (from https://pypi.org/project/anyio/3.3.0/, https://pypi.org/project/anyio/3.4.0/)
......
@@ -38,6 +38,9 @@ class ChunkGroup:
     paths: List[str]
     dtypes: List[str]
 
+ColumnLabel = str
+ColumnDType = str
+
 
 class BulkCatalog:
     """Represent a bulk catalog
     Example:
@@ -67,7 +70,7 @@ class BulkCatalog:
         self.columns: List[ChunkGroup] = []
 
     @property
-    def all_columns_dtypes(self) -> Dict[str, str]:
+    def all_columns_dtypes(self) -> Dict[ColumnLabel, ColumnDType]:
         """Returns all columns with their dtype
         Returns:
             Dict[str, str]: a dict { column label : column dtype }
@@ -120,11 +123,17 @@ class BulkCatalog:
         paths: List[str]
 
     def get_paths_for_columns(self, labels: Iterable[str], base_path: str) -> List[ColumnsPaths]:
-        """Returns the paths to load data of the requested columns grouped by paths"""
+        """Returns the paths to load data of the requested columns grouped by paths
+        Args:
+            labels (Iterable[str]): List of desired columns. If None or empty select all columns.
+            base_path (str): Base path as prefix to chunks path
+        Returns:
+            List[ColumnsPaths]: The requested columns grouped by paths
+        """
         grouped_files = []
         for col_group in self.columns:
-            matching_columns = col_group.labels.intersection(labels)
+            matching_columns = col_group.labels.intersection(labels) if labels else col_group.labels
             if matching_columns:
                 grouped_files.append(self.ColumnsPaths(
                     labels=matching_columns,
......
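The selection rule changed in get_paths_for_columns above (a None or empty labels argument now selects every column group) can be illustrated with a small, self-contained sketch; ChunkGroupSketch and paths_for_columns are simplified, hypothetical stand-ins for ChunkGroup and BulkCatalog.get_paths_for_columns, not the project's actual classes:

# Hypothetical stand-ins for ChunkGroup / BulkCatalog.get_paths_for_columns; names are illustrative only.
from dataclasses import dataclass
from typing import Iterable, List, Optional, Set

@dataclass
class ChunkGroupSketch:
    labels: Set[str]   # columns stored in this group of chunks
    paths: List[str]   # parquet chunk paths holding those columns

def paths_for_columns(groups: List[ChunkGroupSketch],
                      labels: Optional[Iterable[str]]) -> List[ChunkGroupSketch]:
    """None or empty labels selects every group, mirroring the new behavior."""
    wanted = set(labels) if labels else None
    selected = []
    for group in groups:
        matching = group.labels & wanted if wanted else group.labels
        if matching:
            selected.append(ChunkGroupSketch(labels=matching, paths=group.paths))
    return selected

groups = [ChunkGroupSketch({"MD", "GR"}, ["chunk_0.parquet"]),
          ChunkGroupSketch({"RHOB"}, ["chunk_1.parquet"])]
assert len(paths_for_columns(groups, None)) == 2      # all column groups selected
assert len(paths_for_columns(groups, ["GR"])) == 1    # only the group containing GR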
@@ -33,14 +33,16 @@ from app.conf import Config
 from .dask_worker_plugin import DaskWorkerPlugin
 from .errors import BulkRecordNotFound, BulkNotProcessable, internal_bulk_exceptions
 from .traces import map_with_trace, submit_with_trace
-from .utils import (by_pairs, do_merge, worker_capture_timing_handlers,
+from .utils import (WDMS_INDEX_NAME, by_pairs, do_merge, worker_capture_timing_handlers,
                     get_num_rows, set_index, index_union)
 from ..dataframe_validators import is_reserved_column_name, DataFrameValidationFunc
 from .. import DataframeSerializerSync
 from . import storage_path_builder as pathBuilder
 from . import session_file_meta as session_meta
 from ..bulk_id import new_bulk_id
-from .bulk_catalog import BulkCatalog, ChunkGroup, load_bulk_catalog, save_bulk_catalog
+from .bulk_catalog import (BulkCatalog, ChunkGroup,
+                           async_load_bulk_catalog,
+                           async_save_bulk_catalog)
 from ..mime_types import MimeType
 from .dask_data_ipc import DaskNativeDataIPC, DaskLocalFileDataIPC
 from . import dask_worker_write_bulk as bulk_writer
@@ -50,8 +52,7 @@ def read_with_dask(path: Union[str, List[str]], **kwargs) -> dd.DataFrame:
     """call dask.dataframe.read_parquet with default parameters
     Dask read_parquet parameters:
         chunksize='25M': if chunk are too small, we aggregate them until we reach chunksize
-        aggregate_files=True: because we are passing a list of path when commiting a session,
-                              aggregate_files is needed when paths are different
+        aggregate_files=True: aggregate_files is needed when files are in different folders
     Args:
         path (Union[str, List[str]]): a file, a folder or a list of files
     Returns:
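For context, the read_with_dask defaults described in this docstring amount to roughly the following sketch, assuming a dask version (as pinned by this project) whose read_parquet accepts chunksize and aggregate_files; this is not the project's actual implementation:

# Hedged sketch of the defaults described above.
import dask.dataframe as dd

def read_with_dask_sketch(path, **kwargs):
    # small row groups are aggregated up to ~25M, and aggregation may span
    # files in different folders, hence aggregate_files=True
    return dd.read_parquet(path, chunksize="25M", aggregate_files=True, **kwargs)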
@@ -74,8 +75,8 @@ def _load_index_from_meta(meta, **kwargs):
                                         **kwargs).index
 
 
-def _index_union_tuple(t):
-    return index_union(*t)
+def _index_union_tuple(indexes: Tuple[pd.Index, Optional[pd.Index]]):
+    return index_union(*indexes)
 
 
 class DaskBulkStorage:
@@ -165,14 +166,17 @@ class DaskBulkStorage:
            - if columns is None, we load all columns
         Returns: Future<dd.dataframe>
         """
-        if columns is None:
-            columns = catalog.all_columns_dtypes.keys()
-
         record_path = pathBuilder.record_path(self.base_directory, catalog.record_id, self.protocol)
         files_to_load = catalog.get_paths_for_columns(columns, record_path)
 
         # read all chunk for requested columns
         def read_parquet_files(f):
             return read_with_dask(f.paths, columns=f.labels, storage_options=self._parameters.storage_options)
 
         dfs = self._map_with_trace(read_parquet_files, files_to_load)
+        index_df = self._read_index_from_catalog_index_path(catalog)
+        if index_df:
+            dfs.append(index_df)
 
         if not dfs:
             raise RuntimeError("cannot find requested columns")
@@ -187,11 +191,11 @@ class DaskBulkStorage:
         """Load columns from parquet files in the bulk_path.
         Returns: Future<dd.DataFrame>
         """
-        bulk_path = pathBuilder.record_bulk_path(self.base_directory, record_id, bulk_id, self.protocol)
-        catalog = load_bulk_catalog(self._fs, bulk_path)
+        catalog = await self.get_bulk_catalog(record_id, bulk_id, generate_if_not_exists=False)
         if catalog is not None:
             return self._load_bulk_from_catalog(catalog, columns)
 
         # No catalog means that we can read the folder as a parquet dataset. (legacy behavior)
+        bulk_path = pathBuilder.record_bulk_path(self.base_directory, record_id, bulk_id, self.protocol)
         return self._read_parquet(bulk_path, columns=columns)
 
     @with_trace('read_stat')
@@ -215,7 +219,7 @@ class DaskBulkStorage:
         Args:
             record_id (str): the record id on which belongs the bulk.
             bulk_id (str): the bulk id to load.
-            columns (List[str], optional): columns to load. If None all all available columns. Defaults to None.
+            columns (List[str], optional): columns to load. If None, all available columns. Defaults to None.
         Raises:
             BulkRecordNotFound: If bulk data cannot be found.
         Returns:
@@ -223,7 +227,10 @@ class DaskBulkStorage:
         """
         try:
             future_df = await self._load_bulk(record_id, bulk_id, columns=columns)
-            return await future_df
+            dataframe = await future_df
+            if columns and set(dataframe.columns) != set(columns):
+                raise BulkRecordNotFound(record_id, bulk_id)
+            return dataframe
         except (OSError, RuntimeError) as exp:
             raise BulkRecordNotFound(record_id, bulk_id) from exp
@@ -237,17 +244,18 @@ class DaskBulkStorage:
                             engine='pyarrow', schema="infer", compression='snappy')
 
     @capture_timings('get_bulk_catalog')
-    async def get_bulk_catalog(self, record_id: str, bulk_id: str) -> BulkCatalog:
+    async def get_bulk_catalog(self, record_id: str, bulk_id: str, generate_if_not_exists=True) -> BulkCatalog:
         bulk_path = pathBuilder.record_bulk_path(self.base_directory, record_id, bulk_id)
-        catalog = load_bulk_catalog(self._fs, bulk_path)
+        catalog = await async_load_bulk_catalog(self._fs, bulk_path)
         if catalog:
             return catalog
-        # For legacy bulk, construct a catalog on the fly
-        try:
-            return await self._build_catalog_from_path(bulk_path, record_id)
-        except FileNotFoundError as e:
-            raise BulkRecordNotFound(record_id, bulk_id) from e
+        if generate_if_not_exists:
+            # For legacy bulk, construct a catalog on the fly
+            try:
+                return await self._build_catalog_from_path(bulk_path, record_id)
+            except FileNotFoundError as error:
+                raise BulkRecordNotFound(record_id, bulk_id) from error
 
     @capture_timings('_build_catalog_from_path')
     async def _build_catalog_from_path(self, path: str, record_id: str) -> BulkCatalog:
@@ -285,13 +293,24 @@ class DaskBulkStorage:
 
         return catalog
 
+    def _read_index_from_catalog_index_path(self, catalog: BulkCatalog) -> Optional[dd.DataFrame]:
+        """Returns a Future dask dataframe or None if index path is not in the catalog"""
+        if catalog.index_path:
+            index_path = pathBuilder.full_path(self.base_directory, catalog.record_id,
+                                               catalog.index_path, self.protocol)
+            return self._read_parquet(index_path)
+        return None
+
+    @capture_timings('_future_load_index')
     async def _future_load_index(self, record_id: str, bulk_id: str) -> Awaitable[pd.Index]:
-        """load the dataframe index of the specified record"""
+        """Loads the dataframe index of the specified record
+        index should be save in a specific folder but for bulk prior to catalog creation
+        we read one column and retreive the index associated with it.
+        """
         catalog = await self.get_bulk_catalog(record_id, bulk_id)
-        if catalog.index_path:
-            index_path = pathBuilder.full_path(self.base_directory, record_id, catalog.index_path, self.protocol)
-            future_df = self._read_parquet(index_path)
-        else:
-            # only read one column to get the index. It doesn't seems possible to get the index directly.
+        future_df = self._read_index_from_catalog_index_path(catalog)
+        if future_df is None:
+            # read one column to get the index. (It doesn't seems possible to get the index directly)
             first_column = next(iter(catalog.all_columns_dtypes))
             future_df = await self._load_bulk(record_id, bulk_id, [first_column])
         return self._submit_with_trace(lambda df: df.index.compute(), future_df)
@@ -331,7 +350,7 @@ class DaskBulkStorage:
     async def _fill_catalog_columns_info(
         self, catalog: BulkCatalog, session_metas, bulk_id: str
     ) -> Optional[BulkCatalog]:
-        """ build the catalog from the session."""
+        """Build the catalog from the session."""
         catalog_columns = set(catalog.all_columns_dtypes)
 
         for chunks_metas in session_meta.get_next_chunk_files(session_metas):
@@ -392,10 +411,12 @@ class DaskBulkStorage:
         self._ensure_dir_tree_exists(index_folder)
         index_path = pathBuilder.join(index_folder, 'index.parquet')
 
-        f_pdf = await self.client.scatter(pd.DataFrame(index=index))
+        dataframe = pd.DataFrame(index=index)
+        dataframe.index.name = WDMS_INDEX_NAME
+        f_pdf = await self.client.scatter(dataframe)
         await self._submit_with_trace(DataframeSerializerSync.to_parquet, f_pdf, index_path,
                                       storage_options=self._parameters.storage_options)
 
         return index_path
 
     @capture_timings('session_commit')
@@ -414,8 +435,8 @@ class DaskBulkStorage:
         """
         bulk_id = new_bulk_id()
 
-        chunk_metas = await session_meta.get_chunks_metadata(self._fs, self.base_directory, session)
-        if len(chunk_metas) == 0:# there is no files in this session
+        chunk_metas = await session_meta.get_chunks_metadata(self._fs, self.protocol, self.base_directory, session)
+        if len(chunk_metas) == 0:  # there is no files in this session
             raise BulkNotProcessable(message="No data to commit")
 
         if from_bulk_id:
@@ -437,7 +458,8 @@ class DaskBulkStorage:
             self._fill_catalog_columns_info(catalog, chunk_metas, bulk_id)
         )
 
-        save_bulk_catalog(self._fs, commit_path, catalog)
+        fcatalog = await self.client.scatter(catalog)
+        await async_save_bulk_catalog(self._fs, commit_path, fcatalog)
         return bulk_id
 
     @internal_bulk_exceptions
......
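The legacy fallback described in the _future_load_index docstring above (when no dedicated index parquet exists, read a single column, since the index travels with any column) can be sketched as follows; the path and column name are illustrative assumptions:

# Illustrative only: reading one column of a parquet dataset still yields the full index.
import dask.dataframe as dd

ddf = dd.read_parquet("some/record/bulk_path", columns=["GR"])  # any single column works
index = ddf.index.compute()  # the pandas Index associated with that column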
@@ -3,6 +3,7 @@ import json
 
 import fsspec
 import pandas as pd
 
+from app.bulk_persistence.dask.utils import WDMS_INDEX_NAME
 from app.model.model_chunking import DataframeBasicDescribe
@@ -69,6 +70,9 @@ def write_bulk_without_session(data_handle,
     trace_dataframe_attributes(df)
 
+    # set the name of the index column
+    df.index.name = WDMS_INDEX_NAME
+
     # 3- build blob filename and final full blob path
     filename = session_meta.generate_chunk_filename(df)
     full_file_path = path_builder.join(bulk_base_path, filename + '.parquet')
@@ -115,8 +119,9 @@ def add_chunk_in_session(data_handle,
     trace_dataframe_attributes(df)
 
-    # sort column by names  # TODO could it be avoided ? then we could keep input untouched and save serialization step?
+    # sort column by names and set index column name  # TODO could it be avoided ? then we could keep input untouched and save serialization step?
     df = df[sorted(df.columns)]
+    df.index.name = WDMS_INDEX_NAME
 
     # 3- build blob filename and final full blob path
     filename = session_meta.generate_chunk_filename(df)
......
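The reason the chunk writers above now set df.index.name before serializing can be seen in a small pandas round trip; "wdms_index" is only an assumed value for WDMS_INDEX_NAME, whose real definition lives in app.bulk_persistence.dask.utils:

# Sketch: a named index survives the parquet round trip, so chunks written by
# different code paths expose the same, predictable index column name.
import pandas as pd

WDMS_INDEX_NAME = "wdms_index"  # assumption; the real constant is imported in the code above

df = pd.DataFrame({"GR": [90.1, 85.3]}, index=[1000.0, 1000.5])
df.index.name = WDMS_INDEX_NAME
df.to_parquet("chunk.parquet", engine="pyarrow")

assert pd.read_parquet("chunk.parquet").index.name == WDMS_INDEX_NAME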
@@ -34,7 +34,7 @@ from .storage_path_builder import add_protocol, record_session_path
 
 class SessionFileMeta:
     """The class extract information about chunks."""
 
-    def __init__(self, fs, file_path: str, lazy: bool = True) -> None:
+    def __init__(self, fs, protocol: str, file_path: str, lazy: bool = True) -> None:
         """
         Args: