From f435e01df2daef7e796919ba819da416c54537f8 Mon Sep 17 00:00:00 2001 From: Mocchaso Date: Sun, 30 Dec 2018 02:03:27 +0900 Subject: [PATCH 1/2] change import io at onmt/io/TextDataset.py, from top of this python code to constructor of class ShardedTextCorpusIterator(object). --- onmt/io/TextDataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onmt/io/TextDataset.py b/onmt/io/TextDataset.py index 2e8a59e5..3eade17c 100644 --- a/onmt/io/TextDataset.py +++ b/onmt/io/TextDataset.py @@ -2,7 +2,6 @@ from collections import Counter from itertools import chain -import io import codecs import sys @@ -297,6 +296,7 @@ def __init__(self, corpus_path, line_truncate, side, shard_size, this iterator should align its step with. """ try: + import io # The codecs module seems to have bugs with seek()/tell(), # so we use io.open(). self.corpus = io.open(corpus_path, "r", encoding="utf-8") From 9ebe430fb3bc4e34ee935ee6ee0c96933cac2dce Mon Sep 17 00:00:00 2001 From: Mocchaso Date: Sun, 30 Dec 2018 02:26:24 +0900 Subject: [PATCH 2/2] add nltk.download(stopwords) --- craigslistbargain/model/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/craigslistbargain/model/parser.py b/craigslistbargain/model/parser.py index 7ea71303..ae895e9d 100644 --- a/craigslistbargain/model/parser.py +++ b/craigslistbargain/model/parser.py @@ -1,5 +1,6 @@ import re import numpy as np +import nltk from nltk import ngrams from nltk.corpus import stopwords from collections import defaultdict @@ -11,6 +12,7 @@ from core.tokenizer import tokenize class Parser(BaseParser): + nltk.download('stopwords') stopwords = set(stopwords.words('english')) stopwords.update(['may', 'might', 'rent', 'new', 'brand', 'low', 'high', 'now', 'available'])