Skip to content

Commit 3eec153

Browse files
authoredOct 22, 2024
Merge pull request #8 from morand-g/main
geoenrich 0.6.3
2 parents 5e26c5d + a48b2d6 commit 3eec153

File tree

10 files changed

+56
-19
lines changed

10 files changed

+56
-19
lines changed
 

‎.zenodo.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
"license": "GPL-3.0",
2020

21-
"title": "GeoEnrich v0.6.2: a new tool for scientists to painlessly enrich species occurrence data with environmental variables",
21+
"title": "GeoEnrich v0.6.3: a new tool for scientists to painlessly enrich species occurrence data with environmental variables",
2222

2323
"related_identifiers": [
2424
{

‎CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## v0.6.3
2+
3+
#### New functions:
4+
- Added possibility to merge duplicate variables in geoenrich.exports.collate_npy
5+
- Added population density and distance to port
6+
7+
#### Bug fixes:
8+
- Fixed issue that occurred when degenerate dimensions are present in remote netcdf file
9+
10+
111
## v0.6.2
212

313
#### New functions:

‎README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# **geoenrich 0.6.2**
1+
# **geoenrich 0.6.3**
22

33
[![Read the Docs](https://img.shields.io/readthedocs/geoenrich)](https://geoenrich.readthedocs.io/en/latest/)
44
[![License](https://img.shields.io/github/license/morand-g/geoenrich?color=green)](https://github.com/morand-g/geoenrich/blob/main/LICENSE)

‎docs/source/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
author = 'Gaétan Morand (UMR Marbec)'
2626

2727
# The full version, including alpha/beta/rc tags
28-
release = '0.6.2'
28+
release = '0.6.3'
2929

3030

3131
# -- General configuration ---------------------------------------------------

‎docs/source/index.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
geoenrich 0.6.2 documentation
1+
geoenrich 0.6.3 documentation
22
==============================
33

44
|Read the Docs| |License| |PyPI| |Python versions| |Last commit| |DOI|

‎geoenrich/data/catalog.csv

+2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ current3d-u,Copernicus,0.25°,7d,1993-01-01 - 2022-12-28,50 levels,dataset-armor
55
current3d-v,Copernicus,0.25°,7d,1993-01-01 - 2022-12-28,50 levels,dataset-armor-3d-rep-weekly,vgo
66
diatoms,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,DIATO
77
dinophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,DINO
8+
distance-to-port,Global Fishing Watch,0.01°,,,surface,http://tds.marbec-tools.ird.fr/thredds/dodsC/testAll/geoenrich/dist_port.nc,Band1
89
eke,,,,,,calculated,eke
910
fsle,Aviso+,0.04°,1d,1994-01-04 – ongoing,surface,https://tds.aviso.altimetry.fr/thredds/dodsC/dataset-duacs-dt-global-allsat-madt-fsle,fsle_max
1011
fsle-orientation,Aviso+,0.04°,1d,1994-01-04 – ongoing,surface,https://tds.aviso.altimetry.fr/thredds/dodsC/dataset-duacs-dt-global-allsat-madt-fsle,theta_max
@@ -20,6 +21,7 @@ organic-carbon3d,Copernicus,0.25°,7d,1998-01-01 - 2021-12-29,36 levels,cmems_ob
2021
oxygen,Copernicus,0.25°,1d,1993-01-01 - 2022-12-31,75 levels,cmems_mod_glo_bgc_my_0.25deg_P1D-m,o2
2122
ph,Copernicus,0.25°,1m,1985-01-01 - 2022-12-01,surface,dataset-carbon-rep-monthly,ph
2223
picophytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PICO
24+
pop-density-log,GPW v3v4,0.0083°,5y,1995-2020,surface,http://tds.marbec-tools.ird.fr/thredds/dodsC/testAll/geoenrich/pop_density3.nc,Band1
2325
primary-production,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-pp_my_l4-multi-4km_P1M,PP
2426
prochlorophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PROCHLO
2527
prokaryotes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PROKAR

‎geoenrich/enrichment.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def enrich(dataset_ref, var_id, geo_buff = None, time_buff = None, depth_request
6363
var_id (str): ID of the variable to download.
6464
geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers).
6565
time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date.
66-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
66+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
6767
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
6868
slice (int tuple): Slice of the enrichment file to use for enrichment.
6969
maxpoints(int): Maximum number of points to download.
@@ -254,7 +254,7 @@ def enrich_download(geodf, varname, var_id, url, geo_buff, time_buff, depth_requ
254254
url (str): Dataset url (including credentials if needed).
255255
geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers).
256256
time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date.
257-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
257+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
258258
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
259259
maxpoints(int): Maximum number of points to download.
260260
force_download(bool): If True, download data regardless of cache status.
@@ -366,7 +366,7 @@ def enrich_copernicus(geodf, varname, var_id, dataset_id, geo_buff, time_buff, d
366366
dataset_id (str): Copernicus dataset ID.
367367
geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers).
368368
time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date.
369-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
369+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
370370
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
371371
maxpoints(int): Maximum number of points to download.
372372
force_download(bool): If True, download data regardless of cache status.
@@ -559,7 +559,7 @@ def row_enrich(row, remote_ds, local_ds, bool_ds, dimdict, var, depth_request, d
559559
bool_ds (netCDF4.Dataset): Local dataset recording whether data has already been downloaded.
560560
dimdict (dict): Dictionary of dimensions as returned by :func:`geoenrich.satellite.get_metadata`.
561561
var (dict): Variable dictionary as returned by :func:`geoenrich.satellite.get_metadata`.
562-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
562+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
563563
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
564564
force_download(bool): If True, download data regardless of cache status.
565565
Returns:
@@ -679,7 +679,7 @@ def calculate_indices(row, dimdict, var, depth_request, downsample):
679679
row (pandas.Series): GeoDataFrame row to enrich.
680680
dimdict (dict): Dictionary of dimensions as returned by geoenrich.satellite.get_metadata.
681681
var (dict): Variable dictionary as returned by geoenrich.satellite.get_metadata.
682-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
682+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
683683
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
684684
Returns:
685685
dict: Dictionary of indices for each dimension (keys are standard dimension names).
@@ -727,12 +727,16 @@ def calculate_indices(row, dimdict, var, depth_request, downsample):
727727
# if depth is a dimension, select surface layer, nearest lower value or everything
728728

729729
if ('depth' in dimdict) and (dimdict['depth']['name'] in var['params']):
730-
if depth_request == 'nearest' and pd.notna(row['bestz']):
730+
if depth_request == 'nearest_lower' and pd.notna(row['bestz']):
731731
diffs = (row['bestz'] - dimdict['depth']['vals']).astype('float')
732732
diffs[diffs < 0] = np.nan
733733
d1 = np.nanargmin(diffs)
734734
ind['depth'] = {'min': d1, 'max': d1, 'best': d1, 'step': 1}
735735

736+
elif depth_request == 'nearest' and pd.notna(row['bestz']):
737+
d1 = np.argmin( np.abs(row['bestz'] - dimdict['depth']['vals'] ) )
738+
ind['depth'] = {'min': d1, 'max': d1, 'best': d1, 'step': 1}
739+
736740
elif depth_request == 'all':
737741
ind['depth'] = {'min': 0, 'max': len(dimdict['depth']['vals']) - 1, 'best': None, 'step': 1}
738742

@@ -1087,7 +1091,7 @@ def get_enrichment_id(enrichments, var_id, geo_buff, time_buff, depth_request, d
10871091
var_id (str): ID of the variable to download.
10881092
geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers).
10891093
time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date.
1090-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
1094+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
10911095
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
10921096
10931097
Returns:
@@ -1124,7 +1128,7 @@ def save_enrichment_config(dataset_ref, enrichment_id, var_id, geo_buff, time_bu
11241128
var_id (str): ID of the variable to download.
11251129
geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers).
11261130
time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date.
1127-
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data.
1131+
depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest available depth. 'nearest_lower' -> closest lower available depth. Anything else downloads surface data.
11281132
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key.
11291133
Returns:
11301134
None

‎geoenrich/exports.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -665,7 +665,7 @@ def export_raster(dataset_ref, occ_id, var_id, path = Path('./'), geo_buff = Non
665665
print('Abort. Array is smaller than 2x2 pixels.')
666666

667667

668-
def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {'example-var': 2}):
668+
def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {'example-var': 2}, duplicates = {'var_to_remove':'var_to_keep'}):
669669

670670
"""
671671
Export a 3D numpy array with all layers for each occurrence of a dataset.
@@ -676,7 +676,9 @@ def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {
676676
data_path (str): path where numpy files will be saved.
677677
output_res (int) : output data resolution along lat and lon axes.
678678
slice (list[int]): if not None, only process the given slice of the dataset.
679-
dimension3: provides the expected 3rd dimension length (time dimension * depth dimension) for each variable where it is larger than 1.
679+
dimension3 (dict): provides the expected 3rd dimension length (time dimension * depth dimension) for each variable where it is larger than 1.
680+
duplicates (dict): dictionary of variables which should be merged. If var_to_keep is empty, data from var_to_remove are used instead.
681+
680682
681683
Returns:
682684
None
@@ -717,8 +719,12 @@ def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {
717719

718720
# Export np arrays for each occurrence
719721

722+
var_list = [en['parameters']['var_id'] for en in enrichments]
723+
for v in duplicates.keys():
724+
var_list.remove(v)
725+
720726
for occ_id in tqdm(ids):
721-
all_bands = []
727+
all_bands = {}
722728
for en in enrichments:
723729

724730
params = en['parameters']
@@ -742,12 +748,26 @@ def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {
742748
stack = True,
743749
squeeze = False,
744750
target_len = target_len)
745-
all_bands.append(band)
751+
all_bands[var_id] = band
752+
753+
# replace missing values with value from duplicate variable; and remove said duplicates
754+
for to_rem in duplicates:
755+
if np.isnan(all_bands[duplicates[to_rem]]).all():
756+
all_bands[duplicates[to_rem]] = all_bands[to_rem]
757+
all_bands.pop(to_rem)
758+
759+
var_data = [all_bands[k] for k in var_list]
746760

747-
to_save = np.concatenate(all_bands, -1)
761+
to_save = np.concatenate(var_data, -1)
748762
np.save(folderpath / (str(occ_id) + '.npy'), to_save)
749763

750764

765+
with open(folderpath / '0000_npy_metadata.txt', 'w') as f:
766+
for line in var_list:
767+
f.write(f"{line}\n")
768+
769+
770+
751771
# close NC datasets
752772

753773
for en in enrichments:

‎geoenrich/satellite.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ def create_nc(var):
246246
bool_ds = nc.Dataset(str(pathd), mode = 'w')
247247

248248
for name, dimension in remote_ds.dimensions.items():
249-
if getattr(remote_ds.variables[name], 'standard_name', 'Unknown') == 'time' or name in ['time', 'time_agg']:
249+
if name in remote_ds.variables and \
250+
(getattr(remote_ds.variables[name], 'standard_name', 'Unknown') == 'time' or name in ['time', 'time_agg']):
250251
local_ds.createDimension(name, None)
251252
bool_ds.createDimension(name, None)
252253
else:

‎setup.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = geoenrich
3-
version = 0.6.2
3+
version = 0.6.3
44
author = Gaétan Morand (UMR Marbec)
55
author_email = gaetan.morand@ird.fr
66
description = A package to enrich your geo-referenced data (e.g. species occurrences) with environmental data.

0 commit comments

Comments
 (0)
Please sign in to comment.