-import numpy as np
-import random
-
-random.seed(10)
-np.random.seed(0)
-
 from sentence_transformers import SentenceTransformer
 import cuml
 import cudf
 from cuml.metrics import pairwise_distances
 import cupy as cp
 from torch.utils.dlpack import to_dlpack
-from vectorizer import CountVecWrapper
 from ctfidf import ClassTFIDF
 from mmr import mmr
+from utils.sparse_matrix_utils import top_n_idx_sparse, top_n_values_sparse
+from vectorizer.vectorizer import CountVecWrapper
 
-
-class gpu_bertopic:
+class gpu_BERTopic:
     def __init__(self):
         self.top_n_words_df = None
         self.topic_sizes_df = None
@@ -83,7 +77,9 @@ def clustering_hdbscan(self, umap_embeddings, documents):
             documents: DataFrame from the original data
 
         Returns:
-            cluster: HDBSCAN object
+            documents: Modified dataframe with topic labels
+            probabilities: response from cluster.probabilities_, which
+            represents the likelihood of the doc belonging to a cluster.
         """
         cluster = cuml.cluster.HDBSCAN(
             min_cluster_size=10,
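
For context, a minimal usage sketch of cuML's HDBSCAN as the docstring above describes it; the embeddings and parameters here are illustrative, not necessarily this repo's exact settings:

import cuml
import cupy as cp

# Stand-in for the UMAP-reduced embeddings (hypothetical shape).
umap_embeddings = cp.random.rand(1000, 5).astype(cp.float32)

cluster = cuml.cluster.HDBSCAN(min_cluster_size=10).fit(umap_embeddings)
labels = cluster.labels_                 # -1 marks outlier documents
probabilities = cluster.probabilities_   # membership strength per document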
@@ -127,8 +123,7 @@ def new_c_tf_idf(self, document_df, m, ngram_range=(1, 1)):
     def create_topics(self, docs_df):
         """Extract topics from the clusters using a class-based TF-IDF
         Arguments:
-            data: list with documents
-            cluster: HDBSCAN object
+            docs_df: DataFrame containing documents and other information
         Returns:
             tf_idf: The resulting matrix giving a value (importance score) for
             each word per topic
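
Since the docstring references a class-based TF-IDF, here is a hedged sketch of the idea in the style of the BERTopic paper; the repo's actual ClassTFIDF implementation may differ in details:

import cupy as cp

# Per-topic term counts (hypothetical): rows are topics, columns words.
tf = cp.array([[2.0, 0.0, 1.0],
               [0.0, 3.0, 1.0]])
A = tf.sum(axis=1).mean()               # average word count per topic
idf = cp.log(1.0 + A / tf.sum(axis=0))  # down-weight words common to all topics
ctfidf = tf * idf                       # importance score per word, per topic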
@@ -142,52 +137,52 @@ def create_topics(self, docs_df):
         tf_idf, count = self.new_c_tf_idf(docs_df, len(docs_df))
         return tf_idf, count, docs_per_topics_topics, docs_df
 
-    def top_n_idx_sparse(self, matrix, n):
-        """Return indices of top n values in each row of a sparse matrix
-        Retrieved from:
-        https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
-        Args:
-            matrix: The sparse matrix from which to get the
-            top n indices per row
-            n: The number of highest values to extract from each row
-        Returns:
-            indices: The top n indices per row
-        """
-        top_n_idx = []
-        mat_inptr_np_ar = matrix.indptr.get()
-        le_np = mat_inptr_np_ar[:-1]
-        ri_np = mat_inptr_np_ar[1:]
-
-        for le, ri in zip(le_np, ri_np):
-            le = le.item()
-            ri = ri.item()
-            n_row_pick = min(n, ri - le)
-            top_n_idx.append(
-                matrix.indices[
-                    le + cp.argpartition(
-                        matrix.data[le:ri], -n_row_pick
-                    )[-n_row_pick:]
-                ]
-            )
-        return cp.array(top_n_idx)
-
-    def top_n_values_sparse(self, matrix, indices):
-        """Return the top n values for each row in a sparse matrix
-        Args:
-            matrix: The sparse matrix from which to get the top n
-            indices per row
-            indices: The top n indices per row
-        Returns:
-            top_values: The top n scores per row
-        """
-        top_values = []
-        for row, values in enumerate(indices):
-            scores = cp.array(
-                [matrix[row, value] if value is not None
-                 else 0 for value in values]
-            )
-            top_values.append(scores)
-        return cp.array(top_values)
+    # def top_n_idx_sparse(self, matrix, n):
+    #     """Return indices of top n values in each row of a sparse matrix
+    #     Retrieved from:
+    #     https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
+    #     Args:
+    #         matrix: The sparse matrix from which to get the
+    #         top n indices per row
+    #         n: The number of highest values to extract from each row
+    #     Returns:
+    #         indices: The top n indices per row
+    #     """
+    #     top_n_idx = []
+    #     mat_inptr_np_ar = matrix.indptr.get()
+    #     le_np = mat_inptr_np_ar[:-1]
+    #     ri_np = mat_inptr_np_ar[1:]
+
+    #     for le, ri in zip(le_np, ri_np):
+    #         le = le.item()
+    #         ri = ri.item()
+    #         n_row_pick = min(n, ri - le)
+    #         top_n_idx.append(
+    #             matrix.indices[
+    #                 le + cp.argpartition(
+    #                     matrix.data[le:ri], -n_row_pick
+    #                 )[-n_row_pick:]
+    #             ]
+    #         )
+    #     return cp.array(top_n_idx)
+
+    # def top_n_values_sparse(self, matrix, indices):
+    #     """Return the top n values for each row in a sparse matrix
+    #     Args:
+    #         matrix: The sparse matrix from which to get the top n
+    #         indices per row
+    #         indices: The top n indices per row
+    #     Returns:
+    #         top_values: The top n scores per row
+    #     """
+    #     top_values = []
+    #     for row, values in enumerate(indices):
+    #         scores = cp.array(
+    #             [matrix[row, value] if value is not None
+    #             else 0 for value in values]
+    #         )
+    #         top_values.append(scores)
+    #     return cp.array(top_values)
 
     # Topic representation
     def extract_top_n_words_per_topic(
@@ -213,8 +208,8 @@ def extract_top_n_words_per_topic(
         labels = sorted(docs_per_topics_topics.to_arrow().to_pylist())
 
-        indices = self.top_n_idx_sparse(tf_idf, n)
-        scores = self.top_n_values_sparse(tf_idf, indices)
+        indices = top_n_idx_sparse(tf_idf, n)
+        scores = top_n_values_sparse(tf_idf, indices)
         sorted_indices = cp.argsort(scores, 1)
         indices = cp.take_along_axis(indices, sorted_indices, axis=1)
         scores = cp.take_along_axis(scores, sorted_indices, axis=1)
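
The helpers imported from utils.sparse_matrix_utils are no longer shown in this file; a minimal sketch of the top-n-per-row technique they implement, assuming every row stores at least n values (the function name and example data are hypothetical):

import cupy as cp
from cupyx.scipy.sparse import csr_matrix

def top_n_idx_sparse_sketch(matrix, n):
    # For each CSR row, argpartition the row's stored values and keep
    # the column indices of its n largest entries.
    indptr = matrix.indptr.get()
    rows = []
    for le, ri in zip(indptr[:-1], indptr[1:]):
        k = min(n, int(ri - le))
        order = cp.argpartition(matrix.data[le:ri], -k)[-k:]
        rows.append(matrix.indices[int(le) + order])
    return cp.stack(rows)

m = csr_matrix(cp.array([[0.2, 0.0, 0.9], [0.4, 0.5, 0.1]]))
print(top_n_idx_sparse_sketch(m, 2))  # top-2 column indices per row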
@@ -309,9 +304,11 @@ def reduce_topics(self, num_topics, tf_idf, docs_df):
             similarities[topic_to_merge + 1]) - 1
 
         # Adjust topics
-        a = cudf.Series(topic_to_merge_into)
-        docs_df.loc[docs_df["Topic"] == topic_to_merge, "Topic"] = a
-        old_topics = docs_df.sort_values("Topic").Topic.unique()
+        topic_to_merge_into_series = cudf.Series(topic_to_merge_into)
+        docs_df.loc[
+            docs_df["Topic"] == topic_to_merge, "Topic"
+        ] = topic_to_merge_into_series
+        old_topics = docs_df.Topic.unique().sort_values()
         old_topics = old_topics.to_arrow().to_pylist()
         map_topics = {
             old_topic: index - 1 for index,
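
The hunk cuts off inside the dict comprehension; a small sketch of the renumbering it appears to build, with hypothetical topic ids (the index - 1 offset keeps the outlier topic at -1):

old_topics = [-1, 0, 2, 5]  # surviving topic ids after the merge (made up)
map_topics = {old_topic: index - 1
              for index, old_topic in enumerate(old_topics)}
print(map_topics)  # {-1: -1, 0: 0, 2: 1, 5: 2}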
@@ -397,31 +394,31 @@ def get_topic_info(self):
 
         # Note: getting topics in sorted order without using
         # TopicMapper, as in BERTopic
-        test = self.topic_sizes_df.Name.str.split(
+        topic_sizes_df_columns = self.topic_sizes_df.Name.str.split(
             "_", expand=True
         )[[0, 1, 2, 3, 4]]
-        self.original_topic_mapping = test[0]
+        self.original_topic_mapping = topic_sizes_df_columns[0]
         self.new_topic_mapping = sorted(
             self.topic_sizes_df["Topic"].to_pandas()
         )
 
         self.original_topic_mapping = self.original_topic_mapping.to_arrow().to_pylist()
         self.final_topic_mapping = dict(zip(self.new_topic_mapping,
                                             self.original_topic_mapping))
-        test[0] = self.new_topic_mapping
-        test["Name"] = (
-            test[0].astype("str")
+        topic_sizes_df_columns[0] = self.new_topic_mapping
+        topic_sizes_df_columns["Name"] = (
+            topic_sizes_df_columns[0].astype("str")
             + "_"
-            + test[1]
+            + topic_sizes_df_columns[1]
             + "_"
-            + test[2]
+            + topic_sizes_df_columns[2]
             + "_"
-            + test[3]
+            + topic_sizes_df_columns[3]
             + "_"
-            + test[4]
+            + topic_sizes_df_columns[4]
         )
-        self.topic_sizes_df["Name"] = test["Name"]
-        self.topic_sizes_df["Topic"] = test[0]
+        self.topic_sizes_df["Name"] = topic_sizes_df_columns["Name"]
+        self.topic_sizes_df["Topic"] = topic_sizes_df_columns[0]
         return self.topic_sizes_df
 
     def update_topic_size(self, documents):
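
A small sketch of the split-and-rebuild step in the hunk above, with made-up topic names; it assumes cudf's pandas-like string API:

import cudf

names = cudf.Series(["0_gpu_cuda_rapids_cuml", "1_nlp_topic_model_bert"])
parts = names.str.split("_", expand=True)  # one column per "_"-separated field

parts[0] = [5, 7]  # hypothetical renumbered topic ids
rebuilt = (
    parts[0].astype("str")
    + "_" + parts[1] + "_" + parts[2] + "_" + parts[3] + "_" + parts[4]
)
print(rebuilt.to_arrow().to_pylist())
# ['5_gpu_cuda_rapids_cuml', '7_nlp_topic_model_bert']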