diff --git a/craigslistbargain/model/parser.py b/craigslistbargain/model/parser.py index 7ea71303..ae895e9d 100644 --- a/craigslistbargain/model/parser.py +++ b/craigslistbargain/model/parser.py @@ -1,5 +1,6 @@ import re import numpy as np +import nltk from nltk import ngrams from nltk.corpus import stopwords from collections import defaultdict @@ -11,6 +12,7 @@ from core.tokenizer import tokenize class Parser(BaseParser): + nltk.download('stopwords') stopwords = set(stopwords.words('english')) stopwords.update(['may', 'might', 'rent', 'new', 'brand', 'low', 'high', 'now', 'available']) diff --git a/onmt/io/TextDataset.py b/onmt/io/TextDataset.py index 2e8a59e5..3eade17c 100644 --- a/onmt/io/TextDataset.py +++ b/onmt/io/TextDataset.py @@ -2,7 +2,6 @@ from collections import Counter from itertools import chain -import io import codecs import sys @@ -297,6 +296,7 @@ def __init__(self, corpus_path, line_truncate, side, shard_size, this iterator should align its step with. """ try: + import io # The codecs module seems to have bugs with seek()/tell(), # so we use io.open(). self.corpus = io.open(corpus_path, "r", encoding="utf-8")