
Commit

zxcv
FuhuXia committed Feb 25, 2025
1 parent a12e0a9 commit 5e36b96
Showing 7 changed files with 69 additions and 279 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -44,7 +44,7 @@ CKAN_SMTP_PASSWORD=pass
CKAN_SMTP_MAIL_FROM=ckan@localhost

# Extensions
CKAN__PLUGINS=envvars tracking image_view text_view datagov_harvest ckan_harvester geodatagov z3950_harvester arcgis_harvester geodatagov_geoportal_harvester waf_harvester_collection geodatagov_csw_harvester geodatagov_doc_harvester geodatagov_waf_harvester spatial_metadata spatial_query s3test datajson datajson_harvest
CKAN__PLUGINS=tracking harvest datagov_harvest ckan_harvester geodatagov z3950_harvester arcgis_harvester geodatagov_geoportal_harvester waf_harvester_collection geodatagov_csw_harvester geodatagov_doc_harvester geodatagov_waf_harvester spatial_metadata spatial_query s3test datajson datajson_harvest envvars

# Harvest settings
CKAN__HARVEST__MQ__TYPE=redis
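Note on the plugin list change above: image_view and text_view are dropped, harvest is added explicitly, and envvars moves from the front of the list to the end, plausibly so its environment-variable overrides apply after the other plugins load. For reference, a minimal sketch of the naming convention the envvars extension follows (illustrative code only; the helper name is made up, not the extension's API):

import os

def envvar_to_config_key(name: str) -> str:
    # Hypothetical helper: CKAN__-prefixed variables map to CKAN config
    # keys, with double underscores becoming dots, e.g.
    # CKAN__HARVEST__MQ__TYPE -> ckan.harvest.mq.type
    return name[len("CKAN__"):].lower().replace("__", ".")

os.environ.setdefault("CKAN__HARVEST__MQ__TYPE", "redis")
for var, value in sorted(os.environ.items()):
    if var.startswith("CKAN__"):
        print(envvar_to_config_key(var), "=", value)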
40 changes: 8 additions & 32 deletions ckanext/geodatagov/cli.py
@@ -8,7 +8,7 @@
import tempfile
import warnings
from typing import Optional
from sqlalchemy import func, and_, select, desc
from sqlalchemy import func, and_

import boto3
import ckan.logic as logic
@@ -20,7 +20,7 @@
from ckan.lib.search.common import make_connection
from ckan.lib.search.index import NoopSearchIndex, PackageSearchIndex
from ckan.model.meta import Session as session
from ckanext.tracking.cli.tracking import update_tracking
from ckanext.tracking.cli import tracking
from ckanext.tracking.model import TrackingSummary as ts

from ckanext.geodatagov.search import GeoPackageSearchQuery
@@ -730,38 +730,14 @@ def harvest_object_relink(harvest_source_id: Optional[str]):
@click.argument("start_date", required=False)
def tracking_update(start_date: Optional[str]):
"""ckan tracking update with customized options and output"""
update_all(start_date)


def update_all(start_date: Optional[str] = None):
if start_date:
date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
else:
# No date given. Find the most recent tracking data and start from
# 2 days before it, in case newer data is available.
# If no data exists, use 2020-01-01 as the start date.
stmt = select(ts).order_by(desc(ts.tracking_date))
result = session.scalars(stmt).first()
if result:
date = result.tracking_date
date += datetime.timedelta(-2)
# convert date to datetime
combine = datetime.datetime.combine
date = combine(date, datetime.time(0))
else:
date = datetime.datetime(2020, 1, 1)
start_date_solrsync = date
end_date = datetime.datetime.now()

while date < end_date:
stop_date = date + datetime.timedelta(1)
update_tracking(date)
click.echo("tracking updated for {}".format(date))
date = stop_date

update_tracking_solr(start_date_solrsync)
# Override update_tracking_solr in the tracking module so that
# update_all uses the customized Solr indexing defined below
tracking.update_tracking_solr = update_tracking_solr
tracking.update_all(start_date)

def update_tracking_solr(start_date: datetime.datetime):
"""copied from ckanext/tracking/cli/tracking.py
but with customized way of doing solr indexing
"""
results = (
session.query(ts.package_id)
.filter(
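Note on the rewritten tracking_update above: instead of duplicating the upstream date loop, it rebinds update_tracking_solr on the tracking module and delegates to tracking.update_all, which resolves that name through the module's globals at call time. A runnable sketch of the pattern (stand-in names only, not the real modules):

import types

# Tiny stand-in "module" mimicking ckanext.tracking.cli.tracking.
tracking = types.ModuleType("tracking")
exec(
    "def update_tracking_solr(start_date):\n"
    "    print('default solr sync from', start_date)\n"
    "\n"
    "def update_all(start_date):\n"
    "    # Resolved in module globals at call time, so rebinding the\n"
    "    # module attribute changes what gets called here.\n"
    "    update_tracking_solr(start_date)\n",
    tracking.__dict__,
)

def custom_update_tracking_solr(start_date):
    print("customized solr sync from", start_date)

tracking.update_tracking_solr = custom_update_tracking_solr
tracking.update_all("2020-01-01")  # customized solr sync from 2020-01-01

The override only affects names looked up through the module at call time; a name bound earlier via "from tracking import update_tracking_solr" in another module would not see the rebind.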
92 changes: 0 additions & 92 deletions ckanext/geodatagov/commands.py
@@ -41,7 +41,6 @@ class GeoGovCommand(p.SingletonPlugin):
paster geodatagov combine-feeds -c <config>
paster geodatagov harvest-job-cleanup -c <config>
paster geodatagov export-csv -c <config>
paster geodatagov update-dataset-geo-fields -c <config>
'''
p.implements(p.IClick)
summary = __doc__.split('\n')[0]
@@ -107,8 +106,6 @@ def command(self):
if cmd == 'metrics-csv':
self.metrics_csv()
"""
if cmd == 'update-dataset-geo-fields':
self.update_dataset_geo_fields()

def get_user_org_mapping(self, location):
user_org_mapping = open(location)
@@ -654,95 +651,6 @@ def metrics_csv(self):
print(str(datetime.datetime.now()) + ' Done.')
"""

def update_dataset_geo_fields(self):
""" Re-index dataset with geofields
Catalog-classic use `spatial` field with string values (like _California_) or
raw coordinates (like _-17.4,34.2,-17.1,24.6_). Catalog-next take this data and
transform it into a valid GeoJson polygon (with the `translate_spatial` function).
On `package_create` or `package_update` this transformation will happend but
datasets already harvested will not be updated automatically.
"""

# iterate over all datasets

search_backend = config.get('ckanext.spatial.search_backend', 'postgis')
if search_backend != 'solr-bbox':
raise ValueError('Solr is not your default search backend (ckanext.spatial.search_backend)')

datasets = model.Session.query(model.Package).all()
total = len(datasets)
print('Transforming {} datasets.'.format(total))
c = 0
transformed = 0
failed = 0
skipped = 0
results = {
'datasets': {}
}
for dataset in datasets:
c += 1
print('Transforming {}/{}: {}. {} skipped, {} failed, {} transformed'.
format(c, total, dataset.name, skipped, failed, transformed))
results['datasets'][dataset.id] = {'name': dataset.name}
dataset_dict = dataset.as_dict()
extras = dataset_dict['extras']
rolled_up = extras.get('extras_rollup', None)
if rolled_up is None:
results['datasets'][dataset.id]['skip'] = 'No rolled up extras'
skipped += 1
continue
new_extras_rollup = json.loads(rolled_up)

old_spatial = new_extras_rollup.get('spatial', None)
if old_spatial is None:
results['datasets'][dataset.id]['skip'] = 'No rolled up spatial extra found'
skipped += 1
continue
print(' - Old Spatial found "{}"'.format(old_spatial))

try:
# check whether we already have valid spatial data
json.loads(old_spatial)
results['datasets'][dataset.id]['spatial-already-done'] = old_spatial
skipped += 1
continue
except BaseException:
pass

# update the package; the translate_spatial function will fix the spatial data
context = {'user': self.user_name, 'ignore_auth': True}
old_pkg = p.toolkit.get_action('package_show')(context, {'id': dataset.id})
pkg_dict = p.toolkit.get_action('package_update')(context, old_pkg)

# check the results
extras = pkg_dict['extras']
new_spatial = None
for extra in extras:
if extra['key'] == 'spatial':
if old_spatial != extra['value']:
transformed += 1
new_spatial = extra['value']
results['datasets'][dataset.id]['transformation'] = [old_spatial, new_spatial]
else:
results['datasets'][dataset.id]['transformation'] = [old_spatial, 'not found']

if new_spatial is None:
failed += 1
new_spatial = '**** NOT FOUND ****'

print(' - NEW Spatial: "{}"'.format(new_spatial))

print('Final results {} total datasets. {} skipped, {} failed, {} transformed'.
format(total, skipped, failed, transformed))

results.update({
'total': c,
'transformed': transformed,
'skipped': skipped,
'failed': failed
})
return results


def get_response(url):
req = Request(url)
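Note on the removed update_dataset_geo_fields above: its docstring describes the transformation translate_spatial performs on raw coordinates. A hedged sketch of that bounding-box case (the helper below is an illustrative assumption, not the actual ckanext.geodatagov implementation, which also handles place names like California):

import json

def bbox_to_geojson_polygon(spatial):
    # Illustrative only: parse a "minx,miny,maxx,maxy" string into a
    # closed GeoJSON Polygon ring; return None for anything else, which
    # matches what the tests expect for invalid inputs.
    try:
        minx, miny, maxx, maxy = (float(c) for c in spatial.split(","))
    except ValueError:
        return None
    return {
        "type": "Polygon",
        "coordinates": [[
            [minx, miny], [minx, maxy], [maxx, maxy], [maxx, miny], [minx, miny],
        ]],
    }

print(json.dumps(bbox_to_geojson_polygon("-17.4,34.2,-17.1,24.6")))
print(bbox_to_geojson_polygon("1.0,3.0"))  # None: not a 4-value bounding box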
21 changes: 6 additions & 15 deletions ckanext/geodatagov/harvesters/waf_collection.py
@@ -104,24 +104,15 @@ def gather_stage(self, harvest_job):
.first()
)

if existing_harvest_object:
status = "change"
guid = existing_harvest_object.guid
package_id = existing_harvest_object.package_id
else:
status, package_id = "new", None

obj = HarvestObject(
job=harvest_job,
extras=[
HOExtra(key="collection_metadata", value="true"),
HOExtra(key="waf_location", value=collection_metadata_url),
HOExtra(key="status", value=status),
],
guid=guid,
status=status,
package_id=package_id,
job=harvest_job
)
obj.extras = [
HOExtra(key="collection_metadata", value="true"),
HOExtra(key="waf_location", value=collection_metadata_url),
]

queue.fetch_and_import_stages(self, obj)
if obj.state == "ERROR":
self._save_gather_error(
98 changes: 0 additions & 98 deletions ckanext/geodatagov/tests/test_update_geo.py
@@ -1,10 +1,7 @@
import json
import logging

from ckan.tests.helpers import reset_db
from ckan.tests import factories

from ckanext.geodatagov.commands import GeoGovCommand
from ckanext.geodatagov.logic import translate_spatial


@@ -36,98 +33,3 @@ def test_translations(self):
assert translate_spatial('not exists') is None
assert translate_spatial('1.0,3.0') is None
assert translate_spatial('US, Virginia, Fairfax, Reston') is None

def create_datasets(self):

user = factories.Sysadmin(name='sysadmin') # NOQA
self.dataset1 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': 'United States'}
]
)
self.dataset2 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': '34.1,25.2,26.2,27.9'}
]
)

self.dataset3 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': '34.1,25.2,+26.2,+27.9'}
]
)

self.dataset4 = factories.Dataset()

polygon = {
"type": "Polygon",
"coordinates": [
[
[2.05827, 49.8625],
[2.05827, 55.7447],
[-6.41736, 55.7447],
[-6.41736, 49.8625],
[2.05827, 49.8625]
]
]
}
self.dataset5 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': json.dumps(polygon)}
]
)

def test_create_sitemap(self):
""" Run update-dataset-geo-fields command and check results.
We don't expect transformation because catalog-next
already include transformation while save datasets """

self.create_datasets()

cmd = GeoGovCommand()
cmd.user_name = 'sysadmin'
results = cmd.update_dataset_geo_fields()

assert results['total'] == 5
assert results['failed'] == 0
assert results['skipped'] == 5

# this dataset transformed its spatial data
d1 = results['datasets'][self.dataset1['id']]
assert d1['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset1['extras']}
assert extras['old-spatial'] == 'United States'
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys

# this dataset transformed its spatial data
d2 = results['datasets'][self.dataset2['id']]
assert d2['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset2['extras']}
assert extras['old-spatial'] == '34.1,25.2,26.2,27.9'
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys

# This dataset already includes good spatial data
d3 = results['datasets'][self.dataset3['id']]
assert d3['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset3['extras']}
assert extras['old-spatial'] == '34.1,25.2,+26.2,+27.9'
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys

# this dataset doesn't have any spatial data
d4 = results['datasets'][self.dataset4['id']]
assert d4['skip'] == 'No rolled up extras'
extras = {x['key']: x['value'] for x in self.dataset4['extras']}
assert 'old-spatial' not in list(extras.keys())
assert 'spatial' not in list(extras.keys())

# This dataset already includes good spatial data
d5 = results['datasets'][self.dataset5['id']]
assert d5['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset5['extras']}
assert 'old-spatial' in list(extras.keys())
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys
assert extras['old-spatial'] == extras['spatial']
