topic_modeler.py
"""
Uses UMAP to reduce and cluster OpenAI embeddings of the sample data.
"""
import os
import re
import pandas as pd
from openai import OpenAI, BadRequestError
from dotenv import load_dotenv
from stqdm import stqdm as tqdm
import umap
from sklearn.cluster import SpectralClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from directories import DATA_DIR, EMBEDDINGS_DIR
load_dotenv()
CLIENT = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
SMALL_MODEL = "text-embedding-3-small"
LARGE_MODEL = "text-embedding-3-large"
class TFIDF_Topic_Modeler:
    def __init__(self, tfidf_filename):
        self.tfidf_filepath = EMBEDDINGS_DIR / tfidf_filename

    ### Embeddings
    def attempt_to_find_topics(self, df, columns=["title", "text"]):
        # Compute the 2-D TF-IDF embedding and cache it to disk, or load it
        # from the cache file if one already exists.
        if not os.path.exists(self.tfidf_filepath):
            vectors = self.vectorize(df, columns=columns)
            nmf_reduction = self.reduce_dimensions(vectors)
            data = self.reduce_to_2d(nmf_reduction)
            tfidf_embeddings = pd.DataFrame(data, index=df["post link"])
            tfidf_embeddings.to_pickle(self.tfidf_filepath)
        else:
            tfidf_embeddings = pd.read_pickle(self.tfidf_filepath)
        return tfidf_embeddings

    def vectorize(self, df, columns=["title", "text"]):
        """
        Clean text and create the TF-IDF vectors
        """
        cleaned_text = self._clean_text(df, columns=columns)
        vectorizer = TfidfVectorizer(stop_words="english")
        vectors = vectorizer.fit_transform(cleaned_text)
        return vectors

    def reduce_dimensions(self, vectors, n_components=10):
        # First-stage reduction: NMF on the sparse TF-IDF matrix.
        model = NMF(n_components=n_components)
        reduction = model.fit_transform(vectors)
        return reduction

    def reduce_to_2d(self, reduction):
        # Second-stage reduction: UMAP down to 2-D for plotting/clustering.
        umap_reducer = umap.UMAP(random_state=42, n_jobs=1)
        further_reduction = umap_reducer.fit_transform(reduction)
        return further_reduction

    def _clean_text(self, df, columns=["title", "text"]):
        """
        Preprocess text to prep it for TF-IDF vectorizing
        """
        text = df[columns].apply(lambda row: " ".join(row), axis=1).values
        text = [self._remove_newlines_and_whitespace(txt) for txt in text]
        return text

    @staticmethod
    def _remove_newlines_and_whitespace(text):
        # Replace newlines with spaces, then collapse runs of spaces/tabs
        # into a single space.
        cleaned_text = re.sub(r"\n", " ", text)
        cleaned_text = re.sub(r"[ \t]{2,}", " ", cleaned_text)
        return cleaned_text
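

# A minimal usage sketch (illustrative, not part of the module's entry point):
# it assumes a DataFrame with "post link", "title", and "text" columns, and the
# CSV and cache filenames here are hypothetical. attempt_to_find_topics()
# returns one (x, y) row per post, computing and caching on the first call:
#
#   df = pd.read_csv(DATA_DIR / "posts.csv")       # hypothetical input file
#   modeler = TFIDF_Topic_Modeler("tfidf_2d.pkl")  # hypothetical cache name
#   coords_2d = modeler.attempt_to_find_topics(df)

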
class Topic_Modeler:
    def __init__(self, embeddings=None, source_data=None):
        # self.embeddings_filepath = embeddings_filepath
        # self.source_data_filepath = source_data_filepath
        # self.embeddings = pd.read_pickle(embeddings_filepath)
        # self.source_data = pd.read_csv(source_data_filepath)
        self.embeddings = embeddings
        self.source_data = source_data

    def reduce_dimensions(self):
        # UMAP the high-dimensional OpenAI embeddings down to 2-D.
        self.umap_model = umap.UMAP(random_state=42, n_jobs=1)
        reduction = self.umap_model.fit_transform(self.embeddings.values)
        return reduction

    def cluster(self, reduction, n_clusters):
        # Spectral clustering on the reduced coordinates; returns one
        # integer cluster label per row.
        self.cluster_model = SpectralClustering(n_clusters=n_clusters)
        self.cluster_model.fit(reduction)
        return self.cluster_model.labels_
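

# A minimal sketch of the OpenAI-embedding pipeline (illustrative; assumes
# `embeddings` is the DataFrame returned by get_embeddings() below, and the
# cluster count is an arbitrary choice):
#
#   embeddings = get_embeddings(df)
#   modeler = Topic_Modeler(embeddings=embeddings, source_data=df)
#   reduction = modeler.reduce_dimensions()            # UMAP to 2-D
#   labels = modeler.cluster(reduction, n_clusters=8)  # one label per post

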
def get_embeddings(df):
    # Fetch an OpenAI embedding for each post (title + body) and return them
    # as a DataFrame indexed by "post link", one column per embedding
    # dimension.
    tqdm.pandas(desc=f"Getting OpenAI Embeddings for {df.shape[0]} posts")
    df.set_index("post link", inplace=True)
    data = (df["title"] + "\n\n" + df["text"]).progress_apply(fetch_embeddings_api_call)
    embeddings = pd.DataFrame(data.values, index=df.index)
    # Expand the single column of embedding lists into one column per dimension.
    embeddings = embeddings[0].apply(pd.Series)
    return embeddings


def fetch_embeddings_api_call(source_text):
    try:
        response = CLIENT.embeddings.create(
            input=source_text,
            model=SMALL_MODEL,
        )
        embedding = response.data[0].embedding
        return embedding
    except BadRequestError:
        # e.g. the input exceeds the model's token limit; skip this post.
        return None
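

# Sketch of how the fetched embeddings could be cached for reuse, mirroring
# the TF-IDF caching above (the filename is hypothetical; the module itself
# only caches the TF-IDF output):
#
#   embeddings = get_embeddings(df)
#   embeddings.to_pickle(EMBEDDINGS_DIR / "openai_embeddings.pkl")

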
if __name__ == "__main__":
    filepath = "data/hardie+install.csv"
    df = pd.read_csv(filepath)
    get_embeddings(df)