
Commit

zxcv
FuhuXia committed Feb 25, 2025
1 parent a12e0a9 commit 5e36b96
Showing 7 changed files with 69 additions and 279 deletions.
2 changes: 1 addition & 1 deletion .env
@@ -44,7 +44,7 @@ CKAN_SMTP_PASSWORD=pass
CKAN_SMTP_MAIL_FROM=ckan@localhost

# Extensions
CKAN__PLUGINS=envvars tracking image_view text_view datagov_harvest ckan_harvester geodatagov z3950_harvester arcgis_harvester geodatagov_geoportal_harvester waf_harvester_collection geodatagov_csw_harvester geodatagov_doc_harvester geodatagov_waf_harvester spatial_metadata spatial_query s3test datajson datajson_harvest
CKAN__PLUGINS=tracking harvest datagov_harvest ckan_harvester geodatagov z3950_harvester arcgis_harvester geodatagov_geoportal_harvester waf_harvester_collection geodatagov_csw_harvester geodatagov_doc_harvester geodatagov_waf_harvester spatial_metadata spatial_query s3test datajson datajson_harvest envvars

# Harvest settings
CKAN__HARVEST__MQ__TYPE=redis
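Note on the plugin list change above: image_view and text_view are dropped, harvest is added explicitly, and envvars moves from the front of the list to the end, plausibly so its environment-variable overrides apply after the other plugins load. For reference, a minimal sketch of the naming convention the envvars extension follows (illustrative code only; the helper name is made up, not the extension's API):

import os

def envvar_to_config_key(name: str) -> str:
    # Hypothetical helper: CKAN__-prefixed variables map to CKAN config
    # keys, with double underscores becoming dots, e.g.
    # CKAN__HARVEST__MQ__TYPE -> ckan.harvest.mq.type
    return name[len("CKAN__"):].lower().replace("__", ".")

os.environ.setdefault("CKAN__HARVEST__MQ__TYPE", "redis")
for var, value in sorted(os.environ.items()):
    if var.startswith("CKAN__"):
        print(envvar_to_config_key(var), "=", value)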
40 changes: 8 additions & 32 deletions ckanext/geodatagov/cli.py
@@ -8,7 +8,7 @@
import tempfile
import warnings
from typing import Optional
from sqlalchemy import func, and_, select, desc
from sqlalchemy import func, and_

import boto3
import ckan.logic as logic
@@ -20,7 +20,7 @@
from ckan.lib.search.common import make_connection
from ckan.lib.search.index import NoopSearchIndex, PackageSearchIndex
from ckan.model.meta import Session as session
from ckanext.tracking.cli.tracking import update_tracking
from ckanext.tracking.cli import tracking
from ckanext.tracking.model import TrackingSummary as ts

from ckanext.geodatagov.search import GeoPackageSearchQuery
@@ -730,38 +730,14 @@ def harvest_object_relink(harvest_source_id: Optional[str]):
@click.argument("start_date", required=False)
def tracking_update(start_date: Optional[str]):
"""ckan tracking update with customized options and output"""
update_all(start_date)


def update_all(start_date: Optional[str] = None):
if start_date:
date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
else:
# No date given. Find the most recent tracking data and start from
# 2 days before it, in case newer data is available.
# If no data exists, use 2020-01-01 as the start date.
stmt = select(ts).order_by(desc(ts.tracking_date))
result = session.scalars(stmt).first()
if result:
date = result.tracking_date
date += datetime.timedelta(-2)
# convert date to datetime
combine = datetime.datetime.combine
date = combine(date, datetime.time(0))
else:
date = datetime.datetime(2020, 1, 1)
start_date_solrsync = date
end_date = datetime.datetime.now()

while date < end_date:
stop_date = date + datetime.timedelta(1)
update_tracking(date)
click.echo("tracking updated for {}".format(date))
date = stop_date

update_tracking_solr(start_date_solrsync)
# Override update_tracking_solr in the tracking module so that
# update_all uses the customized Solr indexing defined below
tracking.update_tracking_solr = update_tracking_solr
tracking.update_all(start_date)

def update_tracking_solr(start_date: datetime.datetime):
"""copied from ckanext/tracking/cli/tracking.py
but with customized way of doing solr indexing
"""
results = (
session.query(ts.package_id)
.filter(
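Note on the rewritten tracking_update above: instead of duplicating the upstream date loop, it rebinds update_tracking_solr on the tracking module and delegates to tracking.update_all, which resolves that name through the module's globals at call time. A runnable sketch of the pattern (stand-in names only, not the real modules):

import types

# Tiny stand-in "module" mimicking ckanext.tracking.cli.tracking.
tracking = types.ModuleType("tracking")
exec(
    "def update_tracking_solr(start_date):\n"
    "    print('default solr sync from', start_date)\n"
    "\n"
    "def update_all(start_date):\n"
    "    # Resolved in module globals at call time, so rebinding the\n"
    "    # module attribute changes what gets called here.\n"
    "    update_tracking_solr(start_date)\n",
    tracking.__dict__,
)

def custom_update_tracking_solr(start_date):
    print("customized solr sync from", start_date)

tracking.update_tracking_solr = custom_update_tracking_solr
tracking.update_all("2020-01-01")  # customized solr sync from 2020-01-01

The override only affects names looked up through the module at call time; a name bound earlier via "from tracking import update_tracking_solr" in another module would not see the rebind.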
92 changes: 0 additions & 92 deletions ckanext/geodatagov/commands.py
@@ -41,7 +41,6 @@ class GeoGovCommand(p.SingletonPlugin):
paster geodatagov combine-feeds -c <config>
paster geodatagov harvest-job-cleanup -c <config>
paster geodatagov export-csv -c <config>
paster geodatagov update-dataset-geo-fields -c <config>
'''
p.implements(p.IClick)
summary = __doc__.split('\n')[0]
@@ -107,8 +106,6 @@ def command(self):
if cmd == 'metrics-csv':
self.metrics_csv()
"""
if cmd == 'update-dataset-geo-fields':
self.update_dataset_geo_fields()

def get_user_org_mapping(self, location):
user_org_mapping = open(location)
@@ -654,95 +651,6 @@ def metrics_csv(self):
print(str(datetime.datetime.now()) + ' Done.')
"""

def update_dataset_geo_fields(self):
""" Re-index dataset with geofields
Catalog-classic use `spatial` field with string values (like _California_) or
raw coordinates (like _-17.4,34.2,-17.1,24.6_). Catalog-next take this data and
transform it into a valid GeoJson polygon (with the `translate_spatial` function).
On `package_create` or `package_update` this transformation will happend but
datasets already harvested will not be updated automatically.
"""

# iterate over all datasets

search_backend = config.get('ckanext.spatial.search_backend', 'postgis')
if search_backend != 'solr-bbox':
raise ValueError('Solr is not your default search backend (ckanext.spatial.search_backend)')

datasets = model.Session.query(model.Package).all()
total = len(datasets)
print('Transforming {} datasets.'.format(total))
c = 0
transformed = 0
failed = 0
skipped = 0
results = {
'datasets': {}
}
for dataset in datasets:
c += 1
print('Transforming {}/{}: {}. {} skipped, {} failed, {} transformed'.
format(c, total, dataset.name, skipped, failed, transformed))
results['datasets'][dataset.id] = {'name': dataset.name}
dataset_dict = dataset.as_dict()
extras = dataset_dict['extras']
rolled_up = extras.get('extras_rollup', None)
if rolled_up is None:
results['datasets'][dataset.id]['skip'] = 'No rolled up extras'
skipped += 1
continue
new_extras_rollup = json.loads(rolled_up)

old_spatial = new_extras_rollup.get('spatial', None)
if old_spatial is None:
results['datasets'][dataset.id]['skip'] = 'No rolled up spatial extra found'
skipped += 1
continue
print(' - Old Spatial found "{}"'.format(old_spatial))

try:
# check whether we already have valid spatial data
json.loads(old_spatial)
results['datasets'][dataset.id]['spatial-already-done'] = old_spatial
skipped += 1
continue
except BaseException:
pass

# update the package; the translate_spatial function will fix the spatial data
context = {'user': self.user_name, 'ignore_auth': True}
old_pkg = p.toolkit.get_action('package_show')(context, {'id': dataset.id})
pkg_dict = p.toolkit.get_action('package_update')(context, old_pkg)

# check the results
extras = pkg_dict['extras']
new_spatial = None
for extra in extras:
if extra['key'] == 'spatial':
if old_spatial != extra['value']:
transformed += 1
new_spatial = extra['value']
results['datasets'][dataset.id]['transformation'] = [old_spatial, new_spatial]
else:
results['datasets'][dataset.id]['transformation'] = [old_spatial, 'not found']

if new_spatial is None:
failed += 1
new_spatial = '**** NOT FOUND ****'

print(' - NEW Spatial: "{}"'.format(new_spatial))

print('Final results {} total datasets. {} skipped, {} failed, {} transformed'.
format(total, skipped, failed, transformed))

results.update({
'total': c,
'transformed': transformed,
'skipped': skipped,
'failed': failed
})
return results


def get_response(url):
req = Request(url)
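Note on the removed update_dataset_geo_fields above: its docstring describes the transformation translate_spatial performs on raw coordinates. A hedged sketch of that bounding-box case (the helper below is an illustrative assumption, not the actual ckanext.geodatagov implementation, which also handles place names like California):

import json

def bbox_to_geojson_polygon(spatial):
    # Illustrative only: parse a "minx,miny,maxx,maxy" string into a
    # closed GeoJSON Polygon ring; return None for anything else, which
    # matches what the tests expect for invalid inputs.
    try:
        minx, miny, maxx, maxy = (float(c) for c in spatial.split(","))
    except ValueError:
        return None
    return {
        "type": "Polygon",
        "coordinates": [[
            [minx, miny], [minx, maxy], [maxx, maxy], [maxx, miny], [minx, miny],
        ]],
    }

print(json.dumps(bbox_to_geojson_polygon("-17.4,34.2,-17.1,24.6")))
print(bbox_to_geojson_polygon("1.0,3.0"))  # None: not a 4-value bounding box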
21 changes: 6 additions & 15 deletions ckanext/geodatagov/harvesters/waf_collection.py
@@ -104,24 +104,15 @@ def gather_stage(self, harvest_job):
.first()
)

if existing_harvest_object:
status = "change"
guid = existing_harvest_object.guid
package_id = existing_harvest_object.package_id
else:
status, package_id = "new", None

obj = HarvestObject(
job=harvest_job,
extras=[
HOExtra(key="collection_metadata", value="true"),
HOExtra(key="waf_location", value=collection_metadata_url),
HOExtra(key="status", value=status),
],
guid=guid,
status=status,
package_id=package_id,
job=harvest_job
)
obj.extras = [
HOExtra(key="collection_metadata", value="true"),
HOExtra(key="waf_location", value=collection_metadata_url),
]

queue.fetch_and_import_stages(self, obj)
if obj.state == "ERROR":
self._save_gather_error(
98 changes: 0 additions & 98 deletions ckanext/geodatagov/tests/test_update_geo.py
@@ -1,10 +1,7 @@
import json
import logging

from ckan.tests.helpers import reset_db
from ckan.tests import factories

from ckanext.geodatagov.commands import GeoGovCommand
from ckanext.geodatagov.logic import translate_spatial


@@ -36,98 +33,3 @@ def test_translations(self):
assert translate_spatial('not exists') is None
assert translate_spatial('1.0,3.0') is None
assert translate_spatial('US, Virginia, Fairfax, Reston') is None

def create_datasets(self):

user = factories.Sysadmin(name='sysadmin') # NOQA
self.dataset1 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': 'United States'}
]
)
self.dataset2 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': '34.1,25.2,26.2,27.9'}
]
)

self.dataset3 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': '34.1,25.2,+26.2,+27.9'}
]
)

self.dataset4 = factories.Dataset()

polygon = {
"type": "Polygon",
"coordinates": [
[
[2.05827, 49.8625],
[2.05827, 55.7447],
[-6.41736, 55.7447],
[-6.41736, 49.8625],
[2.05827, 49.8625]
]
]
}
self.dataset5 = factories.Dataset(
extras=[
{'key': 'spatial', 'value': json.dumps(polygon)}
]
)

def test_create_sitemap(self):
""" Run update-dataset-geo-fields command and check results.
We don't expect transformation because catalog-next
already include transformation while save datasets """

self.create_datasets()

cmd = GeoGovCommand()
cmd.user_name = 'sysadmin'
results = cmd.update_dataset_geo_fields()

assert results['total'] == 5
assert results['failed'] == 0
assert results['skipped'] == 5

# this dataset transformed its spatial data
d1 = results['datasets'][self.dataset1['id']]
assert d1['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset1['extras']}
assert extras['old-spatial'] == 'United States'
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys

# this dataset transformed its spatial data
d2 = results['datasets'][self.dataset2['id']]
assert d2['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset2['extras']}
assert extras['old-spatial'] == '34.1,25.2,26.2,27.9'
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys

# This dataset already includes good spatial data
d3 = results['datasets'][self.dataset3['id']]
assert d3['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset3['extras']}
assert extras['old-spatial'] == '34.1,25.2,+26.2,+27.9'
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys

# this dataset doesn't have any spatial data
d4 = results['datasets'][self.dataset4['id']]
assert d4['skip'] == 'No rolled up extras'
extras = {x['key']: x['value'] for x in self.dataset4['extras']}
assert 'old-spatial' not in list(extras.keys())
assert 'spatial' not in list(extras.keys())

# This dataset already includes good spatial data
d5 = results['datasets'][self.dataset5['id']]
assert d5['skip'] == 'No rolled up spatial extra found'
extras = {x['key']: x['value'] for x in self.dataset5['extras']}
assert 'old-spatial' in list(extras.keys())
keys = list(json.loads(extras['spatial']).keys())
assert 'coordinates' in keys
assert extras['old-spatial'] == extras['spatial']
