gtoffoli
diff --git a/‎.gitignore
+3 b/‎.gitignore
+3
diff --git a/‎textanalysis/__init__.py b/‎textanalysis/__init__.py
diff --git a/‎textanalysis/admin.py
+3 b/‎textanalysis/admin.py
+3
diff --git a/‎textanalysis/apps.py
+5 b/‎textanalysis/apps.py
+5
diff --git a/‎textanalysis/babelnet.py b/‎textanalysis/babelnet.py
diff --git a/‎textanalysis/forms.py
+16 b/‎textanalysis/forms.py
+16
diff --git a/‎textanalysis/lang/__init__.py b/‎textanalysis/lang/__init__.py
diff --git a/‎textanalysis/lang/babelnet.py b/‎textanalysis/lang/babelnet.py
diff --git a/‎textanalysis/lang/da/__init__.py b/‎textanalysis/lang/da/__init__.py
diff --git a/‎textanalysis/lang/da/basic_vocabulary_da.py
+9 b/‎textanalysis/lang/da/basic_vocabulary_da.py
+9
diff --git a/‎textanalysis/lang/da/utils.py b/‎textanalysis/lang/da/utils.py
diff --git a/‎textanalysis/lang/el/__init__.py b/‎textanalysis/lang/el/__init__.py
diff --git a/‎textanalysis/lang/el/basic_vocabulary_el.py
+102 b/‎textanalysis/lang/el/basic_vocabulary_el.py
+102
diff --git a/‎textanalysis/lang/el/utils.py
+19 b/‎textanalysis/lang/el/utils.py
+19
diff --git a/‎textanalysis/lang/en/__init__.py b/‎textanalysis/lang/en/__init__.py
@@ -150,3 +150,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
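+# Eclipse project files (caution: the pattern below ignores every file and directory whose name starts with a dot)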
+.*
@@ -0,0 +1,3 @@
+from django.contrib import admin
+
+# Register your models here.
@@ -0,0 +1,5 @@
+from django.apps import AppConfig
+
+
+class TextanalysisConfig(AppConfig):
+    """Django application configuration for the ``textanalysis`` app."""
+    # Dotted Python path of the application package this configuration applies to.
+    name = 'textanalysis'
@@ -0,0 +1,16 @@
+
+from django.utils.translation import gettext_lazy as _
+from django import forms
+
+# Choices offered by the ``function`` field of TextAnalysisInputForm:
+# each item is a (stored value, lazily translated label) pair.
+TA_FUNCTION_CHOICES = (
+    ('context', _('Keywords In Context')),
+    ('wordlists', _('Word Lists by POS')),
+    ('nounchunks', _('Noun chunks')),
+    ('readability', _('Text Readability')),
+    ('summarization', _('Text Summarization')),
+    ('dashboard', _('Text Analysis Dashboard')),
+)
+
+class TextAnalysisInputForm(forms.Form):
+    """Form collecting the text to analyze and the analysis function to apply."""
+    # Required multi-line input; per its help text it holds a short text or the URL of a web page.
+    text = forms.CharField(required=True, label=_('text to analyze'), widget=forms.Textarea(attrs={'class':'form-control', 'rows': 10, 'cols': 120,}), help_text=_('short text of a few paragraphs, or url of a web page'))
+    # Required selection among the entries of TA_FUNCTION_CHOICES.
+    function = forms.ChoiceField(required=True, choices=TA_FUNCTION_CHOICES, label=_('text-analysis function'), widget=forms.Select(attrs={'class':'form-control',}))
@@ -0,0 +1,9 @@
+# Placeholder for the Danish basic vocabulary: no entries are defined yet.
+voc_da = [
+]
+
+# Dictionary returned by get_vocabulary(); it is empty as defined here.
+token_level_dict = {}
+
+def get_vocabulary():
+    global token_level_dict
+    return token_level_dict
+    
@@ -0,0 +1,102 @@
+import os
+import pyexcel
+# Spreadsheet with the KELLY Greek word list; load_vocabulary() looks for it in this module's directory.
+file_name = 'KELLY_EL.xlsx'
+# Web page of the resource (presumably where the spreadsheet was obtained -- the value is not used in the code shown here).
+source = 'https://inventory.clarin.gr/lcr/741'
+# Licence and attribution notice for the word list.
+attribution = """KELLY word-list Greek by Institute for Language and Speech Processing - Athena Research Center used under Creative Commons Attribution Non Commercial 4.0 International (https://creativecommons.org/licenses/by-nc/4.0/legalcode, https://creativecommons.org/licenses/by-nc/4.0/). Source: http://hdl.handle.net/11500/ATHENA-0000-0000-25C1-C (CLARIN:EL)"""
+
+# Maps the lower-cased Greek labels found in the "Μέρος του Λόγου (Part of speech)"
+# column to English part-of-speech names; '?' marks labels with no clear equivalent.
+pos_map = {
+   'ουσιαστικό': 'noun',
+   'αντωνυμία': 'pronoun',
+   'επίθετο': 'adjective',
+   'επίθετο (κλιτή μορφή)': 'adjective', # adjective (inflected form)
+   'επίθετο (συγκριτικός βαθμός)': 'adjective', # adjective (comparative degree)
+   'επίθετο (συγκρ. βαθμός)': 'adjective', # adjective (comparative degree, abbreviated label)
+   'άρθρο': 'determiner',
+   'ρήμα': 'verb',
+   'ρήμα (έκφραση)': 'verb', # verbal expression (?)
+   'ρήμα (απρ. έκφρ.)': 'verb', # presumably "impersonal expression" -- TODO confirm
+   'επίρρημα': 'adverb',
+   'επιρρηματική έκφραση': 'adverb', # adverbial expression
+   'πρόθεση': 'preposition',
+   'σύνδεσμος': 'conjunction',
+   'επιφώνημα': 'exclamation',
+   'επιφώνημα (μόριο)': 'exclamation',
+   'αριθμητικό': 'number', # numeral: split_postag() replaces it with adjective + noun
+   'μετοχή': 'adjective', # participle (past participle?)
+   'μόριο': '?', # particle (prefix or suffix?)
+   'έκφραση': '?', # expression (present participle?)
+   'έκρφαση': '?', # misspelled variant of 'έκφραση'
+   'συντομογραφία': '?', # abbreviation
+   'συντομογραφία/σύντμηση': '?', # abbreviation/contraction
+   '': '?', # empty label
+}
+
+# Endings of base forms (apparently adjectives) whose lemma cell also lists
+# alternative endings; split_lemma() removes the matching ending from the base
+# form and attaches each listed alternative ending to the remaining stem.
+base_ends = ['ος', 'ός', 'ής', 'ών',]
+
+# Vocabulary entries of the form [lemma, part of speech, lower-cased CEF level],
+# filled by load_vocabulary() from the KELLY_EL spreadsheet.
+voc_el = [
+]
+
+def list_to_dict(lst):
+    return {k: v for v, k in enumerate(lst)}
+
+def split_lemma(lemma):
+    els = [x.strip() for x in lemma.split(',')]
+    if len(els) == 1:
+        lemmas = [lemma]
+    else:
+        if lemma.count('-'):
+            base = els[0]
+            lemmas = [base]
+            for end in base_ends:
+                if base.endswith(end):
+                    root = base[:-len(end)]
+                    for el in els[1:]:
+                        lemmas.append(root+el[1:])
+        else:
+            lemmas = els
+    return lemmas           
+
+def split_postag(postag):
+    els = [x.strip() for x in postag.split(',')]
+    postags = [pos_map[postag.lower()] for postag in els]
+    if 'number' in postags:
+        postags = ['adjective', 'noun',]
+    return postags
+
+def make_entries(voc_cols_dict, row):
+    entries = []
+    level = row[voc_cols_dict['CEF level']]
+    lemma = row[voc_cols_dict['Λήμμα (Lemma)']]
+    lemmas = split_lemma(lemma)
+    postag = row[voc_cols_dict['Μέρος του Λόγου (Part of speech)']]
+    postags = split_postag(postag)
+    for lemma in lemmas:
+        for postag in postags:
+            entries.append([lemma, postag, level.lower()])
+    return entries
+
+def load_vocabulary (file_name=''):
+    global voc_el
+    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)
+    f = open(file_path, "br")
+    extension = file_name.split(".")[-1]
+    content = f.read()
+    f.close()
+    book = pyexcel.get_book(file_type=extension, file_content=content)
+    book_dict = book.to_dict()
+    voc_table = book_dict["Sheet1"]
+    voc_cols = voc_table[0]
+    voc_rows = voc_table[1:]
+    voc_cols_dict = list_to_dict(voc_cols)
+    # print(voc_cols_dict)
+    for row in voc_rows:
+        voc_el.extend(make_entries(voc_cols_dict, row))
+
+load_vocabulary(file_name)
+"""
+for entry in voc_el:
+    print(entry)
+"""
@@ -0,0 +1,19 @@
+# see http://graficnotes.blogspot.com/2012/08/6.html
+# Lower-case Greek vowels, without accent marks.
+vowels = 'αειηουω'
+# Vowel pairs that count_word_syllables() counts as a single syllable.
+diphthongs = ['αι', 'αη', 'οι', 'οη',]
+# Leading vowel sounds that, placed before a diphthong or a vowel, form the
+# combined groups below (presumably the "improper" diphthongs described on
+# the page referenced above -- TODO confirm).
+abusive = ['ει', 'οι', 'ι', 'υ',]
+abusive_diphthongs = [a+d for a in abusive for d in diphthongs]
+abusive_diphthongs += [a+v for a in abusive for v in vowels]
+# All groups counted as one syllable. The combined groups come first, so that
+# count_word_syllables(), which scans this list in order, removes them before
+# the plain diphthongs.
+vowel_groups = abusive_diphthongs + diphthongs + ['αυ', 'ευ',]
+
+def count_word_syllables(word):
+    n_syllables = 0
+    for group in vowel_groups:
+        if word.count(group):
+            n_syllables += 1
+            word = word.replace(group, '')
+    for c in word:
+        if c in vowels:
+            n_syllables += 1
+    return n_syllables
+
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from django.contrib import admin`
	`2`	`+`
	`3`	`+# Register your models here.`