First commit
xirdneh committed May 23, 2016
0 parents commit 53f0bab
Showing 12 changed files with 1,335 additions and 0 deletions.
73 changes: 73 additions & 0 deletions .gitignore
@@ -0,0 +1,73 @@
# ignore the populated environment file so as not to commit secrets!
.idea/
designsafe.env
certs/
node_modules/
datadump.json
db.sqlite3*

####
#
# Below is the Github-provided python .gitignore file
#
####

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
#dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log
db.sqlite3

# Sphinx documentation
docs/_build/

# PyBuilder
target/
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2016 Josue Balandrano Coronel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
31 changes: 31 additions & 0 deletions README.md
@@ -0,0 +1,31 @@
# LiveQA submission for TREC-2016

## Introduction

This project is a submission for the [TREC-2016 LiveQA track](https://sites.google.com/site/trecliveqa2016/call-for-participation).
At its core it uses Latent Dirichlet Allocation (LDA) to infer semantic topics and uses the resulting model to construct
a topic probability distribution for each document retrieved from the knowledge base. The Jensen-Shannon
Distance (JSD) between the incoming question and each candidate is then used as a similarity measure, and the most
similar answer is selected as the returned answer. The knowledge base currently used is Yahoo Answers.
A rough sketch of this idea is shown after the dependency list below.

It relies on:

- [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
- [scipy](https://pypi.python.org/pypi/scipy)
- [numpy](https://pypi.python.org/pypi/numpy)
- [nltk](http://www.nltk.org/)
- [gensim](http://radimrehurek.com/gensim/)
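
The snippet below is a minimal sketch of the LDA + JSD matching idea described above, not the project's actual pipeline: it assumes gensim's `LdaModel` and `scipy.stats.entropy`, and the function names, topic count, and the `candidate_docs`/`question_tokens` inputs are illustrative only.

```python
import numpy as np
from gensim import corpora, matutils, models
from scipy.stats import entropy


def js_distance(p, q):
    """Jensen-Shannon distance between two discrete distributions."""
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    m = 0.5 * (p + q)
    return np.sqrt(0.5 * entropy(p, m) + 0.5 * entropy(q, m))


def topic_distribution(lda, dictionary, tokens):
    """Dense topic-probability vector for one tokenized text."""
    bow = dictionary.doc2bow(tokens)
    sparse = lda.get_document_topics(bow, minimum_probability=0.0)
    return matutils.sparse2full(sparse, lda.num_topics)


def best_candidate(question_tokens, candidate_docs, num_topics=50):
    """Return the index of the candidate document closest to the question.

    candidate_docs: list of token lists built from retrieved question/answer pages.
    question_tokens: token list for the incoming LiveQA question.
    """
    dictionary = corpora.Dictionary(candidate_docs)
    corpus = [dictionary.doc2bow(d) for d in candidate_docs]
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    q_dist = topic_distribution(lda, dictionary, question_tokens)
    distances = [js_distance(q_dist, topic_distribution(lda, dictionary, d))
                 for d in candidate_docs]
    return int(np.argmin(distances))
```

Because JSD is symmetric and bounded, ranking candidates by the smallest distance to the question's topic distribution is less sensitive to document length than raw word overlap.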

## Future Work

* [ ] Add knowledge sources other than Yahoo Answers.
* [ ] Improve query construction when searching for candidate question/answer tuples.
* [ ] Add more similarity metrics (aggregation, semantic).
* [ ] Improve NLP processing.
* [ ] Add multi-document summarization when possible.

## References

- [TREC-2016 track LiveQA](https://sites.google.com/site/trecliveqa2016/call-for-participation)
- [Blei et al. Latent Dirichlet Allocation](http://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf)
- [Gensim LDA implementation](https://github.com/piskvorky/gensim/blob/develop/gensim/models/ldamodel.py)
Empty file added __init__.py
Empty file.
Empty file added liveqa/__init__.py
Empty file.
42 changes: 42 additions & 0 deletions liveqa/nltk_utils.py
@@ -0,0 +1,42 @@
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
import re

def preprocess_text(text):
    text = text.lower()
    # Strip URLs, then any non-word characters, then collapse whitespace.
    text = re.sub(r'https?://\S*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s\-_]+', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'\s+', ' ', text, flags=re.MULTILINE)
    #text = re.sub(r'\W\s[\d]{1,3}\s', ' ', text, flags=re.MULTILINE)
    text = text.encode('utf-8')
    return text

def get_word_lists(documents):
    """
    Use also to preprocess any string:
        text = get_word_lists([data])[0]
    """
    word_lists = []
    for d in documents:
        tokens = tokenize(d)
        tokens = remove_stop_words(tokens)
        word_lists.append(tokens)
    return word_lists

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens

def is_int(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

def remove_stop_words(tokens_list):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords += ['http', 'https', 'img', 'src', 'href', 'alt']
    lmtz = WordNetLemmatizer()
    # Drop stopwords and single-character tokens (numbers are kept), then lemmatize.
    filtered_words = [lmtz.lemmatize(w) for w in tokens_list
                      if w not in stopwords and (is_int(w) or len(w) > 1)]
    return filtered_words
151 changes: 151 additions & 0 deletions liveqa/qs_proc.py
@@ -0,0 +1,151 @@
from bs4 import BeautifulSoup
from time import time
from . import nltk_utils
import threading
import requests
import logging
import urllib2
import urllib
import json

logger = logging.getLogger(__name__)
ya_domain = 'https://answers.yahoo.com'
ya_search = 'https://answers.yahoo.com/search/search_result?p='
ya_new = 'https://answers.yahoo.com/dir/index/answer'
ya_list = 'https://answers.yahoo.com/dir/index/discover'

def get_question_details(q_url):
    response = requests.get(q_url)
    html = response.text
    soup = BeautifulSoup(html, 'html5lib')
    q_det = soup.find('div', id='ya-question-detail')
    title = q_det.h1.get_text()
    #q_det = q_det.find_all('div')
    body = q_det.find('span', class_='ya-q-full-text') or q_det.find('span', class_='ya-q-text')
    if body:
        body = body.get_text()
    else:
        body = ''
    best_answer = soup.find('div', id='ya-best-answer') or ''
    if best_answer:
        best_answer = best_answer.find('span', class_='ya-q-full-text').get_text()

    answers_ul = soup.find('ul', id='ya-qn-answers')
    answers = []
    if answers_ul:
        answers_lis = answers_ul.find_all('li')
        answers = []
        for answer in answers_lis:
            answer_dets = answer.select('.answer-detail')
            text = answer_dets[0].get_text()
            upvotes = answer_dets[1].select('[itemprop="upvoteCount"]')[0].get_text()
            upvotes = int(upvotes)
            answers.append({'answer': text, 'upvotes': upvotes})
        answers = sorted(answers, key=lambda x: x['upvotes'], reverse=True)
    if not best_answer:
        if answers:
            best_answer = answers[0]['answer']
            answers = answers[1:]
    return {'title': title, 'body': body, 'best_answer': best_answer, 'answers': answers, 'url': q_url}


def question_to_document(q):
    doc = q['title'] + ' ' + q['body'] + ' ' + q['best_answer']
    at = ''
    for answer in q['answers']:
        at += ' ' + answer['answer']
    return doc + ' ' + at

def get_newest_question():
    response = urllib2.urlopen('https://answers.yahoo.com/dir/index/answer', timeout=10)
    html = response.read()
    soup = BeautifulSoup(html, 'html5lib')
    questions = soup.find('ul', id='ya-answer-tab')
    q_url = ya_domain + questions.li.h3.a['href']
    return q_url

def search(q, q_url, dictionary):
    cnt = 0
    q_split = []
    qs_lis = []
    for w in q.split():
        # Corpus document frequency of each query term (0 if unseen).
        freq = dictionary.dfs.get(dictionary.token2id.get(w, ''), 0)
        q_split.append((w, freq))
    q_split = sorted(q_split, key=lambda x: x[1], reverse=True)
    cnt_max = len(q_split) * 2
    p = 1
    bw = False
    qid = q_url.split('qid=')[1].strip()
    while not bw:
        logger.debug('YA Search Q: %s &s=%s' % (q, p))
        s_url = ya_search + urllib.quote(q)
        if p > 1:
            s_url += '&s=%d' % p
        response = urllib2.urlopen(s_url, timeout=10)
        html = response.read()
        soup = BeautifulSoup(html, 'html5lib')
        qs = soup.find('ul', id='yan-questions')
        lis = qs.find_all('li')
        qs_lis += lis
        #print 'len qs_lis {}'.format(len(qs_lis))
        if len(qs_lis) >= 50 or cnt >= cnt_max:
            bw = True
        if len(lis) < 10 and p == 1 and len(q_split) >= 3:
            # Too few results: drop the rarest query term and restart paging.
            q = ' '.join([w for w in q.split() if w != q_split[-1][0]])
            q_split.pop()
            p = 0
        elif len(lis) < 10:
            bw = True
        cnt += 1
        p += 1
    seen = set()
    ret = []
    for li in qs_lis:
        url = ya_domain + li.a['href']
        ref_qid = url.split('qid=')[1]
        #print 'qid: {} == ref_qid: {}. {}'.format(qid, ref_qid, qid == ref_qid)
        if qid == ref_qid or ref_qid in seen:
            continue
        seen.add(ref_qid)
        ret.append(url)
    return ret

def search_questions(q, q_url, dictionary):
    urls = search(q, q_url, dictionary)
    qs_dets = [{}] * len(urls)
    t0 = time()
    threads = []
    no_threads = 10
    print 'url len: {}'.format(len(urls))
    for i in range(len(urls)):
        t = QThread(i, urls[i], qs_dets)
        threads.append(t)

    # Fetch candidate pages in batches of no_threads threads; round up so the
    # final partial batch is not skipped.
    for j in range((len(threads) + no_threads - 1) / no_threads):
        offset = no_threads * j
        end = offset + no_threads
        if offset + no_threads > len(urls):
            end = len(urls)
        for t in threads[offset:end]:
            t.start()
        for t in threads[offset:end]:
            t.join()
    t1 = time()
    print 'Time fetching candidate qs: {}'.format(t1 - t0)
    return qs_dets

class QThread(threading.Thread):
    def __init__(self, _id, url, texts, *args, **kwargs):
        threading.Thread.__init__(self)
        self._id = _id
        self.url = url
        self.texts = texts

    def _get_art(self, url):
        return get_question_details(url)

    def run(self):
        det = self._get_art(self.url)
        self.texts[self._id] = det

74 changes: 74 additions & 0 deletions liveqa/websearch.py
@@ -0,0 +1,74 @@
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup
from . import nltk_utils
from time import time
import threading
import traceback
import requests
import urllib2
import urllib
import json
import sys

bing_api = 'https://api.datamarket.azure.com/Bing/SearchWeb/v1/Web?$format=json&Query='
bing_key = 'IgVbvvtgQVYI7Yfu9hPgVx0Tmbih1gq5lFOXaIQH4f8'
user_agent = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'

def search(q, q_url):
    search_url = bing_api + urllib.quote(q)
    print 'Search Url: %s\n' % search_url
    try:
        response = requests.get(search_url, auth=HTTPBasicAuth(bing_key, bing_key))
        results = response.json()['d']['results']
        urls = []
        for r in results:
            if r['Url'] != q_url:
                urls.append(r['Url'])
        if len(urls) >= 20:
            return urls[:20]
        else:
            return urls
    except Exception as e:
        print e
        #print response.text
        traceback.print_exc(file=sys.stdout)

class URLThread(threading.Thread):
    def __init__(self, _id, url, texts, *args, **kwargs):
        threading.Thread.__init__(self)
        self._id = _id
        self.url = url
        self.texts = texts

    def _get_art(self, url):
        #print 'requesting: {}'.format(url)
        req = urllib2.Request(url, headers={'User-Agent': user_agent})
        response = urllib2.urlopen(req, timeout=10)
        html = response.read()
        soup = BeautifulSoup(html, 'html5lib')
        # Remove markup that should not end up in the extracted text.
        [s.extract() for s in soup(['script', 'a', 'rel', 'style', 'img', 'link'])]
        text = soup.get_text()
        text = nltk_utils.preprocess_text(text)
        return text

    def run(self):
        txt = self._get_art(self.url)
        self.texts[self._id] = txt

def get_articles(urls):
    corpus = [''] * len(urls)
    t0 = time()
    threads = []
    no_threads = 10
    print 'url len: {}'.format(len(urls))
    for i in range(len(urls)):
        t = URLThread(i, urls[i], corpus)
        threads.append(t)

    # Fetch pages in batches of no_threads threads; round up so the final
    # partial batch is not skipped.
    for j in range((len(threads) + no_threads - 1) / no_threads):
        offset = no_threads * j
        for t in threads[offset:offset + no_threads]:
            t.start()
        for t in threads[offset:offset + no_threads]:
            t.join()
    t1 = time()
    print 'Time fetching urls: {}'.format(t1 - t0)
    return corpus
