Rewrite load/normalize/lookup logic for INSPIRE vocabularies

This should resolve #138. It also addresses #137, but tests for that still have to be added.
PublicaMundi · Jun 7, 2015 · d533046 · d533046
1 parent 20310c5
commit d533046
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 34 deletions.
diff --git a/ckanext/publicamundi/lib/metadata/types/_common.py b/ckanext/publicamundi/lib/metadata/types/_common.py
@@ -68,6 +68,17 @@ class FreeKeyword(Object):
     reference_date = None
     date_type = None
 
+    @classmethod
+    def normalize_keyword(cls, s):
+        from inflection import dasherize, underscore
+        return dasherize(underscore(unicode(s)))
+
+    def __init__(self, **kwargs):
+        value = kwargs.get('value')
+        if value:
+            kwargs['value'] = self.normalize_keyword(value)
+        super(FreeKeyword, self).__init__(**kwargs)
+
 @object_null_adapter()
 class GeographicBoundingBox(Object):
 

diff --git a/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py b/ckanext/publicamundi/lib/metadata/types/inspire_metadata.py
@@ -192,10 +192,9 @@ def to_responsible_party(alist):
                     thes_version = None
                 else:
                     thes_version = re.sub(r'^[ ]*version[ ]+(\d\.\d)$', r'\1', thes_version)
-                thes_name = 'keywords-' + vocabularies.munge(thes_title)
                 # Note thes_version can be used to enforce a specific thesaurus version
                 try:
-                    thes = Thesaurus.lookup(name=thes_name)
+                    thes = Thesaurus.lookup(title=thes_title, for_keywords=True)
                 except ValueError:
                     thes = None
             # Treat present keywords depending on if they belong to a thesaurus
@@ -214,7 +213,6 @@ def to_responsible_party(alist):
                 vocab_date = to_date(it['thesaurus']['date'])
                 vocab_datetype = it['thesaurus']['datetype']
                 for keyword in it['keywords']:
-                    # Todo Maybe convert keyword to a canonical form (e.g. munge)
                     free_keywords.append(FreeKeyword(
                         value = keyword,
                         reference_date = vocab_date,

diff --git a/ckanext/publicamundi/lib/metadata/types/thesaurus.py b/ckanext/publicamundi/lib/metadata/types/thesaurus.py
@@ -22,31 +22,40 @@ class Thesaurus(Object):
 
     @property
     def vocabulary(self):
-        spec = vocabularies.get_by_name(self.name)
-        return spec.get('vocabulary') if spec else None
+        vocab = vocabularies.get_by_name(self.name)
+        return vocab.get('vocabulary') if vocab else None
 
     # Factory for Thesaurus
 
     @classmethod
-    def lookup(cls, name):
-        '''Lookup a thesaurus by it's name and return a Thesaurus instance.
-        The metadata for a newly created thesaurus are queried from vocabularies 
-        module.
+    def lookup(cls, name=None, title=None, for_keywords=False):
+        '''Lookup by name or title and return a Thesaurus instance.
+
+        This is a factory method that tries to instantiate a Thesaurus object
+        from a collection of well-known (mostly related to INSPIRE) vocabularies.
         '''
+
+        vocab = None
+
+        if (name is None) and title:
+            name = vocabularies.normalize_thesaurus_title(title, for_keywords)
+
+        if name:
+            vocab = vocabularies.get_by_name(name)
+        else:
+            raise ValueError('Expected a name/title lookup')
 
-        spec = vocabularies.get_by_name(name)
-        if spec:
+        if vocab:
             kwargs = {
-               'title': spec.get('title'),
-               'name': spec.get('name'),
-               'reference_date': spec.get('reference_date'),
-               'version' : spec.get('version'),
-               'date_type': spec.get('date_type'),
+               'title': vocab.get('title'),
+               'name': vocab.get('name'),
+               'reference_date': vocab.get('reference_date'),
+               'version' : vocab.get('version'),
+               'date_type': vocab.get('date_type'),
             }
             return cls(**kwargs)
         else:
-            raise ValueError(
-                'Cannot find an INSPIRE thesaurus named "%s"' %(name))
+            raise ValueError('Cannot find a thesaurus named "%s"' %(name))
 
 @object_null_adapter()
 class ThesaurusTerms(Object):

diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py b/ckanext/publicamundi/lib/metadata/vocabularies/__init__.py
@@ -8,15 +8,13 @@
 
 # Import loader
 
-from ckanext.publicamundi.lib.metadata.vocabularies import json_loader
-
-munge = json_loader.munge
+from ckanext.publicamundi.lib.metadata.vocabularies.json_loader import (
+    make_vocabularies, normalize_keyword, normalize_thesaurus_title)
 
 def _update(data_file, name_prefix='', overwrite=False):
     '''Update the module-global vocabularies from external JSON data.
     '''
-
-    for name, desc in json_loader.make_vocabularies(data_file):
+    for name, desc in make_vocabularies(data_file):
         assert overwrite or not (name in vocabularies), (
             'A vocabulary named %r is allready loaded' % (name))
         vocabularies[name_prefix + name] = desc

diff --git a/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py b/ckanext/publicamundi/lib/metadata/vocabularies/json_loader.py
@@ -6,10 +6,8 @@
 import zope.schema
 from zope.schema.vocabulary import SimpleVocabulary, SimpleTerm
 
-def munge(name):
-    '''Convert human-friendly to machine-friendly terms.
-    
-    Needed when a machine-friendly version is not supplied.
+def _munge(name):
+    '''Convert human-friendly to machine-friendly names.
     '''
 
     re_bad = re.compile('[\(\),]+')
@@ -23,24 +21,31 @@ def munge(name):
 
     return name
 
+def normalize_keyword(name):
+    return _munge(name)
+
+def normalize_thesaurus_title(name, for_keywords=False):
+    if not for_keywords:
+        return _munge(name)
+    else:
+        return _munge('keywords' + ' ' + name)
+
 def make_vocabulary(data):
     '''Convert raw data to a SimpleVocabulary instance.
     
     The input data can be one of the following:
      * a list of human-readable terms or a
      * a dict that maps machine-readable to human-readable terms.
     '''
-
-    # Note: A SimpleTerm is a tuple (value, token, title) 
 
     terms = []
     if isinstance(data, list):
         for t in data:
-            k = munge(t)
+            k = normalize_keyword(t)
             terms.append(SimpleTerm(k, t, t))
     elif isinstance(data, dict):     
         for k, t in data.items():
-            #k = munge(k)
+            #k = normalize_keyword(k)
             terms.append(SimpleTerm(k, t, t))
     return SimpleVocabulary(terms, swallow_duplicates=True)
 
@@ -55,7 +60,7 @@ def make_vocabularies(data_file):
         data = json.loads(fp.read())
 
     for title in (set(data.keys()) - set(['Keywords'])):
-        name = munge(title)
+        name = normalize_thesaurus_title(title)
         desc = {
             'name': name,
             'title': title,
@@ -67,8 +72,7 @@ def make_vocabularies(data_file):
     for title in keywords_data.keys():
         keywords = keywords_data.get(title)
         keywords_terms = make_vocabulary(keywords.get('terms'))
-
-        name = munge('Keywords-' + title)
+        name = normalize_thesaurus_title(title, for_keywords=True)
         desc = {
             'name': name,
             'title': title,