Skip to content

Commit 65a0a01

Browse files
committed
First version extracted from lang folder of commons repo.
1 parent 30bdeb3 commit 65a0a01

39 files changed

+217320
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,6 @@ cython_debug/
150150
# and can be added to the global gitignore or merged into this file. For a more nuclear
151151
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
152152
#.idea/
153+
154+
# Eclipse
155+
.*

textanalysis/__init__.py

Whitespace-only changes.

textanalysis/admin.py

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from django.contrib import admin
2+
3+
# Register your models here.

textanalysis/apps.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from django.apps import AppConfig
2+
3+
4+
class TextanalysisConfig(AppConfig):
    """Django application configuration for the ``textanalysis`` app."""

    # Dotted Python path of the application this configuration applies to.
    name = 'textanalysis'

textanalysis/babelnet.py

Whitespace-only changes.

textanalysis/forms.py

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
2+
from django.utils.translation import gettext_lazy as _
3+
from django import forms
4+
5+
# (value, translated label) pairs offered by the "function" field of
# TextAnalysisInputForm.
TA_FUNCTION_CHOICES = (
    ('context', _('Keywords In Context')),
    ('wordlists', _('Word Lists by POS')),
    ('nounchunks', _('Noun chunks')),
    ('readability', _('Text Readability')),
    ('summarization', _('Text Summarization')),
    ('dashboard', _('Text Analysis Dashboard')),
)
13+
14+
class TextAnalysisInputForm(forms.Form):
    """Form collecting the text to analyze and the analysis function to run.

    Both fields are required.  According to its help text, ``text`` accepts
    either a short text or the URL of a web page.
    """

    text = forms.CharField(
        required=True,
        label=_('text to analyze'),
        widget=forms.Textarea(attrs={'class':'form-control', 'rows': 10, 'cols': 120,}),
        help_text=_('short text of a few paragraphs, or url of a web page'),
    )
    # One of the keys listed in TA_FUNCTION_CHOICES.
    function = forms.ChoiceField(
        required=True,
        choices=TA_FUNCTION_CHOICES,
        label=_('text-analysis function'),
        widget=forms.Select(attrs={'class':'form-control',}),
    )

textanalysis/lang/__init__.py

Whitespace-only changes.

textanalysis/lang/babelnet.py

Whitespace-only changes.

textanalysis/lang/da/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Vocabulary entries for this language; currently empty.
voc_da = [
]

# Dictionary handed out by get_vocabulary(); currently empty.
token_level_dict = {}


def get_vocabulary():
    """Return the module-level ``token_level_dict``.

    The dictionary itself is returned, not a copy, so changes made by the
    caller are visible to every other user of this module.
    """
    # No ``global`` statement is needed: the name is only read, never rebound.
    return token_level_dict
9+

textanalysis/lang/da/utils.py

Whitespace-only changes.

textanalysis/lang/el/__init__.py

Whitespace-only changes.
+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import os
2+
import pyexcel
3+
# Spreadsheet holding the word list; it is looked up next to this module.
file_name = 'KELLY_EL.xlsx'
# Page the word list was obtained from.
source = 'https://inventory.clarin.gr/lcr/741'
# Attribution text for the word list (CC BY-NC 4.0).
attribution = """KELLY word-list Greek by Institute for Language and Speech Processing - Athena Research Center used under Creative Commons Attribution Non Commercial 4.0 International (https://creativecommons.org/licenses/by-nc/4.0/legalcode, https://creativecommons.org/licenses/by-nc/4.0/). Source: http://hdl.handle.net/11500/ATHENA-0000-0000-25C1-C (CLARIN:EL)"""
6+
7+
# Maps the values of the spreadsheet column "Μέρος του Λόγου (Part of speech)",
# lower-cased, to English part-of-speech names.  '?' marks labels that have no
# usable part-of-speech equivalent.
pos_map = {
    'ουσιαστικό': 'noun',
    'αντωνυμία': 'pronoun',
    'επίθετο': 'adjective',
    'επίθετο (κλιτή μορφή)': 'adjective',  # adjective (inflected form)
    'επίθετο (συγκριτικός βαθμός)': 'adjective',  # adjective (comparative degree)
    'επίθετο (συγκρ. βαθμός)': 'adjective',  # adjective (comparative degree)
    'άρθρο': 'determiner',
    'ρήμα': 'verb',
    'ρήμα (έκφραση)': 'verb',  # verbal expression ?
    'ρήμα (απρ. έκφρ.)': 'verb',
    'επίρρημα': 'adverb',
    'επιρρηματική έκφραση': 'adverb',  # adverbial expression
    'πρόθεση': 'preposition',
    'σύνδεσμος': 'conjunction',
    'επιφώνημα': 'exclamation',
    'επιφώνημα (μόριο)': 'exclamation',
    'αριθμητικό': 'number',  # => adjective, noun (see split_postag)
    'μετοχή': 'adjective',  # participle ?
    'μόριο': '?',  # particle: prefix or suffix ?
    'έκφραση': '?',  # expression, present participle ?
    'έκρφαση': '?',  # misspelling of 'έκφραση' that occurs in the data
    'συντομογραφία': '?',  # abbreviation
    'συντομογραφία/σύντμηση': '?',  # abbreviation/contraction
    '': '?',
}
34+
35+
# Endings of some base forms (adjectives?) for which the spreadsheet lists
# further endings as "-xx" abbreviations; see split_lemma.
base_ends = ['ος', 'ός', 'ής', 'ών',]

# The vocabulary annotated with CEFR level; it is filled by interpreting the
# KELLY_EL file.
voc_el = [
]
41+
42+
def list_to_dict(lst):
    """Return a dict mapping each item of *lst* to its position in the list.

    If an item occurs more than once, the position of its last occurrence
    wins.
    """
    return {item: index for index, item in enumerate(lst)}
44+
45+
def split_lemma(lemma, ends=None):
    """Expand one lemma cell into the list of word forms it denotes.

    The cell is a comma-separated list.  Elements after the first that start
    with ``-`` are abbreviated endings: each is attached to the root of the
    first element, where the root is the first element minus whichever of
    *ends* it finishes with.  Elements that do not start with ``-`` are
    complete forms and are kept unchanged.

    Args:
        lemma: content of the lemma cell, e.g. ``'καλός, -ή, -ό'``.
        ends: base-form endings used to find the root; defaults to the
            module-level ``base_ends``.

    Returns:
        The list of forms, the first element always first.  Every form is
        stripped of surrounding whitespace.  Abbreviated endings are dropped
        when the first element finishes with none of *ends*, because no root
        can be derived for them.
    """
    if ends is None:
        ends = base_ends
    els = [x.strip() for x in lemma.split(',')]
    # Only "-xx" elements are abbreviated endings; a hyphen anywhere else
    # (e.g. inside a compound word) must not trigger the expansion.
    if not any(el.startswith('-') for el in els[1:]):
        return els
    base = els[0]
    root = None
    for end in ends:
        if end and base.endswith(end):
            root = base[:-len(end)]
            break
    lemmas = [base]
    for el in els[1:]:
        if not el.startswith('-'):
            lemmas.append(el)
        elif root is not None:
            lemmas.append(root + el[1:])
    return lemmas
61+
62+
def split_postag(postag, mapping=None):
    """Translate a part-of-speech cell into a list of English POS names.

    The cell is a comma-separated list of labels.  Each label is stripped,
    lower-cased and looked up in *mapping*.

    Args:
        postag: content of the part-of-speech cell.
        mapping: dict from lower-cased label to POS name; defaults to the
            module-level ``pos_map``.

    Returns:
        One POS name per label.  A label missing from *mapping* yields
        ``'?'`` instead of raising ``KeyError``.  If any label maps to
        ``'number'``, the whole result is ``['adjective', 'noun']``.
    """
    if mapping is None:
        mapping = pos_map
    postags = [mapping.get(tag.strip().lower(), '?') for tag in postag.split(',')]
    if 'number' in postags:
        # A numeral is entered both as an adjective and as a noun.
        return ['adjective', 'noun']
    return postags
68+
69+
def make_entries(voc_cols_dict, row):
    """Build the vocabulary entries described by one spreadsheet row.

    Args:
        voc_cols_dict: dict from column title to column index in *row*.
        row: the cell values of one row.

    Returns:
        A list of ``[lemma, postag, level]`` lists, one for every combination
        of a form returned by ``split_lemma`` and a tag returned by
        ``split_postag``; ``level`` is the lower-cased "CEF level" cell.
    """
    # The level is the same for every entry of the row: lower-case it once.
    level = row[voc_cols_dict['CEF level']].lower()
    lemmas = split_lemma(row[voc_cols_dict['Λήμμα (Lemma)']])
    postags = split_postag(row[voc_cols_dict['Μέρος του Λόγου (Part of speech)']])
    return [[form, tag, level] for form in lemmas for tag in postags]
80+
81+
def load_vocabulary(file_name=''):
    """Read a spreadsheet stored next to this module and append it to ``voc_el``.

    The first row of sheet "Sheet1" holds the column titles; every later row
    is converted with ``make_entries`` and the resulting entries are appended
    to the module-level ``voc_el`` list.  Entries are appended, never
    replaced, so calling this twice duplicates them.

    Args:
        file_name: name of the spreadsheet file, relative to the directory of
            this module.  The text after its last dot is passed to pyexcel as
            the file type.
    """
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)
    extension = file_name.split(".")[-1]
    # The context manager closes the file even when reading fails.
    with open(file_path, "rb") as f:
        content = f.read()
    book = pyexcel.get_book(file_type=extension, file_content=content)
    voc_table = book.to_dict()["Sheet1"]
    voc_cols_dict = list_to_dict(voc_table[0])
    # voc_el is extended in place, so no ``global`` statement is needed.
    for row in voc_table[1:]:
        voc_el.extend(make_entries(voc_cols_dict, row))
97+
98+
# Fill voc_el as soon as the module is imported.
load_vocabulary(file_name)

textanalysis/lang/el/utils.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# see http://graficnotes.blogspot.com/2012/08/6.html
vowels = 'αειηουω'
diphthongs = ['αι', 'αη', 'οι', 'οη',]
abusive = ['ει', 'οι', 'ι', 'υ',]
abusive_diphthongs = [a+d for a in abusive for d in diphthongs]
abusive_diphthongs += [a+v for a in abusive for v in vowels]
# Vowel sequences counted as one syllable; longer sequences come first so
# that they are matched before the shorter ones they contain.
vowel_groups = abusive_diphthongs + diphthongs + ['αυ', 'ευ',]


def count_word_syllables(word):
    """Return the number of syllables of *word*.

    Every occurrence of a sequence listed in ``vowel_groups`` counts as one
    syllable and is removed from the word; each vowel left over counts as one
    more.  The word is lower-cased first, because the tables hold lower-case
    letters only.

    NOTE(review): accented vowels (ά, έ, ή, ί, ό, ύ, ώ) are in neither
    ``vowels`` nor ``vowel_groups``, so they are not counted -- confirm
    whether callers strip accents before calling.
    """
    word = word.lower()
    n_syllables = 0
    for group in vowel_groups:
        occurrences = word.count(group)
        if occurrences:
            # replace() removes every occurrence, so count every one of them.
            n_syllables += occurrences
            word = word.replace(group, '')
    n_syllables += sum(1 for c in word if c in vowels)
    return n_syllables
19+

textanalysis/lang/en/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)