
Commit 678da80: addressing reviews part 1
Parent: 2f2727d

16 files changed: +232 / -241 lines

cuBERT_topic_modelling/README.md (+2, -2)
@@ -20,6 +20,6 @@ An [example](berttopic_example.ipynb) notebook is provided, which goes through t
 
 ## Acknowledgement
 
-Our work has been inspired from the [BERTopic library](https://github.com/MaartenGr/BERTopic) and Maarten Grootendorst's [blog](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6) on how to use BERT to create your own topic model.
-
+Our work ports the CPU implementation of the [BERTopic library](https://github.com/MaartenGr/BERTopic) to a python-based GPU backend using NVIDIA RAPIDS.
 
+Please refer to Maarten Grootendorst's [blog](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6) on how to use BERT to create your own topic model.
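To make the GPU data path concrete, here is a minimal sketch that mirrors the usage exercised by the new test file further down in this commit; the random topic labels are placeholders for the labels HDBSCAN would normally assign, and the exact entry points should be read as illustrative rather than a stable public API.

```python
import cudf
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

from cuBERTopic import gpu_BERTopic

# Small document set; random topic labels stand in for HDBSCAN cluster labels.
docs = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
)["data"][:1000]
docs_df = pd.DataFrame(docs, columns=["Document"])
docs_df["Topic"] = np.random.randint(0, 10, len(docs_df))

# Move the data to the GPU and compute the class-based TF-IDF matrix there.
docs_df_gpu = cudf.from_pandas(docs_df)
gpu_topic = gpu_BERTopic()
tf_idf, count = gpu_topic.new_c_tf_idf(docs_df_gpu, len(docs_df_gpu))
# tf_idf is a CuPy CSR matrix of per-topic word importance scores
```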

cuBERT_topic_modelling/cuBERTopic.py (+71, -74)
@@ -1,9 +1,3 @@
-import numpy as np
-import random
-
-random.seed(10)
-np.random.seed(0)
-
 from sentence_transformers import SentenceTransformer
 import cuml
 import cudf
@@ -12,12 +6,12 @@
 from cuml.metrics import pairwise_distances
 import cupy as cp
 from torch.utils.dlpack import to_dlpack
-from vectorizer import CountVecWrapper
 from ctfidf import ClassTFIDF
 from mmr import mmr
+from utils.sparse_matrix_utils import top_n_idx_sparse, top_n_values_sparse
+from vectorizer.vectorizer import CountVecWrapper
 
-
-class gpu_bertopic:
+class gpu_BERTopic:
     def __init__(self):
         self.top_n_words_df = None
         self.topic_sizes_df = None
@@ -83,7 +77,9 @@ def clustering_hdbscan(self, umap_embeddings, documents):
             documents: DataFrame from the original data
 
         Returns:
-            cluster: HDBSCAN object
+            documents: Modified dataframe with topic labels
+            probabilities: response from cluster.probabilities_ which
+            represents the likelihood of the doc belonging to a cluster.
         """
         cluster = cuml.cluster.HDBSCAN(
             min_cluster_size=10,
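The rewritten docstring now matches what the method actually hands back: the documents frame gains a Topic column and the per-document membership strengths come from cluster.probabilities_. A self-contained sketch of that step, using synthetic embeddings in place of real UMAP output (cuML's HDBSCAN exposes labels_ and probabilities_ after fit):

```python
import cudf
import cuml
import cupy as cp

# Synthetic "UMAP" embeddings stand in for the reduced document embeddings.
umap_embeddings = cp.random.rand(200, 5).astype(cp.float32)
documents = cudf.DataFrame({"Document": [f"doc {i}" for i in range(200)]})

cluster = cuml.cluster.HDBSCAN(min_cluster_size=10)
cluster.fit(umap_embeddings)

documents["Topic"] = cluster.labels_       # -1 marks outlier documents
probabilities = cluster.probabilities_     # per-document cluster membership strength
```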
@@ -127,8 +123,7 @@ def new_c_tf_idf(self, document_df, m, ngram_range=(1, 1)):
     def create_topics(self, docs_df):
         """Extract topics from the clusters using a class-based TF-IDF
         Arguments:
-            data: list with documents
-            cluster: HDBSCAN object
+            docs_df: DataFrame containing documents and other information
         Returns:
             tf_idf: The resulting matrix giving a value (importance score) for
             each word per topic
@@ -142,52 +137,52 @@ def create_topics(self, docs_df):
         tf_idf, count = self.new_c_tf_idf(docs_df, len(docs_df))
         return tf_idf, count, docs_per_topics_topics, docs_df
 
-    def top_n_idx_sparse(self, matrix, n):
-        """Return indices of top n values in each row of a sparse matrix
-        Retrieved from:
-        https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
-        Args:
-            matrix: The sparse matrix from which to get the
-            top n indices per row
-            n: The number of highest values to extract from each row
-        Returns:
-            indices: The top n indices per row
-        """
-        top_n_idx = []
-        mat_inptr_np_ar = matrix.indptr.get()
-        le_np = mat_inptr_np_ar[:-1]
-        ri_np = mat_inptr_np_ar[1:]
-
-        for le, ri in zip(le_np, ri_np):
-            le = le.item()
-            ri = ri.item()
-            n_row_pick = min(n, ri - le)
-            top_n_idx.append(
-                matrix.indices[
-                    le + cp.argpartition(
-                        matrix.data[le:ri], -n_row_pick
-                    )[-n_row_pick:]
-                ]
-            )
-        return cp.array(top_n_idx)
-
-    def top_n_values_sparse(self, matrix, indices):
-        """Return the top n values for each row in a sparse matrix
-        Args:
-            matrix: The sparse matrix from which to get the top n
-            indices per row
-            indices: The top n indices per row
-        Returns:
-            top_values: The top n scores per row
-        """
-        top_values = []
-        for row, values in enumerate(indices):
-            scores = cp.array(
-                [matrix[row, value] if value is not None
-                 else 0 for value in values]
-            )
-            top_values.append(scores)
-        return cp.array(top_values)
+    # def top_n_idx_sparse(self, matrix, n):
+    #     """Return indices of top n values in each row of a sparse matrix
+    #     Retrieved from:
+    #     https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
+    #     Args:
+    #         matrix: The sparse matrix from which to get the
+    #         top n indices per row
+    #         n: The number of highest values to extract from each row
+    #     Returns:
+    #         indices: The top n indices per row
+    #     """
+    #     top_n_idx = []
+    #     mat_inptr_np_ar = matrix.indptr.get()
+    #     le_np = mat_inptr_np_ar[:-1]
+    #     ri_np = mat_inptr_np_ar[1:]
+
+    #     for le, ri in zip(le_np, ri_np):
+    #         le = le.item()
+    #         ri = ri.item()
+    #         n_row_pick = min(n, ri - le)
+    #         top_n_idx.append(
+    #             matrix.indices[
+    #                 le + cp.argpartition(
+    #                     matrix.data[le:ri], -n_row_pick
+    #                 )[-n_row_pick:]
+    #             ]
+    #         )
+    #     return cp.array(top_n_idx)
+
+    # def top_n_values_sparse(self, matrix, indices):
+    #     """Return the top n values for each row in a sparse matrix
+    #     Args:
+    #         matrix: The sparse matrix from which to get the top n
+    #         indices per row
+    #         indices: The top n indices per row
+    #     Returns:
+    #         top_values: The top n scores per row
+    #     """
+    #     top_values = []
+    #     for row, values in enumerate(indices):
+    #         scores = cp.array(
+    #             [matrix[row, value] if value is not None
+    #              else 0 for value in values]
+    #         )
+    #         top_values.append(scores)
+    #     return cp.array(top_values)
 
     # Topic representation
     def extract_top_n_words_per_topic(
@@ -213,8 +208,8 @@ def extract_top_n_words_per_topic(
 
         labels = sorted(docs_per_topics_topics.to_arrow().to_pylist())
 
-        indices = self.top_n_idx_sparse(tf_idf, n)
-        scores = self.top_n_values_sparse(tf_idf, indices)
+        indices = top_n_idx_sparse(tf_idf, n)
+        scores = top_n_values_sparse(tf_idf, indices)
         sorted_indices = cp.argsort(scores, 1)
         indices = cp.take_along_axis(indices, sorted_indices, axis=1)
         scores = cp.take_along_axis(scores, sorted_indices, axis=1)
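With the two sparse-matrix helpers now living in utils.sparse_matrix_utils instead of on the class, they can be exercised on their own. A hedged sketch, assuming the relocated functions keep the signatures of the removed methods shown above:

```python
import cupy as cp
from cupyx.scipy.sparse import csr_matrix

from utils.sparse_matrix_utils import top_n_idx_sparse, top_n_values_sparse

# Toy c-TF-IDF-like matrix: 2 "topics" x 4 "words"; each row has at least 2 nonzeros.
tf_idf = csr_matrix(cp.array([
    [0.1, 0.0, 0.7, 0.2],
    [0.4, 0.3, 0.0, 0.6],
], dtype=cp.float32))

indices = top_n_idx_sparse(tf_idf, 2)          # column indices of the 2 largest values per row
scores = top_n_values_sparse(tf_idf, indices)  # the matching scores, row-aligned with `indices`
```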
@@ -309,9 +304,11 @@ def reduce_topics(self, num_topics, tf_idf, docs_df):
                 similarities[topic_to_merge + 1]) - 1
 
             # Adjust topics
-            a = cudf.Series(topic_to_merge_into)
-            docs_df.loc[docs_df["Topic"] == topic_to_merge, "Topic"] = a
-            old_topics = docs_df.sort_values("Topic").Topic.unique()
+            topic_to_merge_into_series = cudf.Series(topic_to_merge_into)
+            docs_df.loc[
+                docs_df["Topic"] == topic_to_merge, "Topic"
+            ] = topic_to_merge_into_series
+            old_topics = docs_df.Topic.unique().sort_values()
             old_topics = old_topics.to_arrow().to_pylist()
             map_topics = {
                 old_topic: index - 1 for index,
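Behind the renaming, the cudf pattern is unchanged: rows whose Topic equals the merged-away id are relabelled through a boolean .loc mask, and the surviving topic ids are collected in sorted order. A tiny standalone illustration of that pattern (the real code wraps the replacement value in cudf.Series; a plain Python int is enough for this toy case):

```python
import cudf

# Toy frame with three topics; merge topic 2 into topic 0.
docs_df = cudf.DataFrame({"Document": ["a", "b", "c", "d"], "Topic": [0, 2, 1, 2]})
topic_to_merge, topic_to_merge_into = 2, 0

docs_df.loc[docs_df["Topic"] == topic_to_merge, "Topic"] = topic_to_merge_into
old_topics = docs_df.Topic.unique().sort_values()  # surviving topic ids, sorted
print(old_topics.to_arrow().to_pylist())           # [0, 1]
```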
@@ -397,31 +394,31 @@ def get_topic_info(self):
 
         # Note: getting topics in sorted order without using
         # TopicMapper, as in BERTopic
-        test = self.topic_sizes_df.Name.str.split(
+        topic_sizes_df_columns = self.topic_sizes_df.Name.str.split(
             "_", expand=True
         )[[0, 1, 2, 3, 4]]
-        self.original_topic_mapping = test[0]
+        self.original_topic_mapping = topic_sizes_df_columns[0]
         self.new_topic_mapping = sorted(
             self.topic_sizes_df["Topic"].to_pandas()
         )
 
         self.original_topic_mapping = self.original_topic_mapping.to_arrow().to_pylist()
         self.final_topic_mapping = dict(zip(self.new_topic_mapping,
                                             self.original_topic_mapping))
-        test[0] = self.new_topic_mapping
-        test["Name"] = (
-            test[0].astype("str")
+        topic_sizes_df_columns[0] = self.new_topic_mapping
+        topic_sizes_df_columns["Name"] = (
+            topic_sizes_df_columns[0].astype("str")
             + "_"
-            + test[1]
+            + topic_sizes_df_columns[1]
             + "_"
-            + test[2]
+            + topic_sizes_df_columns[2]
             + "_"
-            + test[3]
+            + topic_sizes_df_columns[3]
             + "_"
-            + test[4]
+            + topic_sizes_df_columns[4]
         )
-        self.topic_sizes_df["Name"] = test["Name"]
-        self.topic_sizes_df["Topic"] = test[0]
+        self.topic_sizes_df["Name"] = topic_sizes_df_columns["Name"]
+        self.topic_sizes_df["Topic"] = topic_sizes_df_columns[0]
         return self.topic_sizes_df
 
     def update_topic_size(self, documents):
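The frame renamed to topic_sizes_df_columns is simply the underscore-delimited topic Name split into positional columns, with column 0 (the topic id) remapped and the Name rebuilt afterwards. A small hedged illustration of that cudf string-split round trip:

```python
import cudf

# Topic names follow the "<topic id>_<word1>_<word2>_<word3>_<word4>" pattern.
topic_sizes_df = cudf.DataFrame(
    {"Name": ["3_gpu_cuda_rapids_cudf", "0_nasa_space_orbit_moon"]}
)

cols = topic_sizes_df.Name.str.split("_", expand=True)[[0, 1, 2, 3, 4]]
cols[0] = [0, 1]  # remapped topic ids
topic_sizes_df["Name"] = (
    cols[0].astype("str")
    + "_" + cols[1]
    + "_" + cols[2]
    + "_" + cols[3]
    + "_" + cols[4]
)
topic_sizes_df["Topic"] = cols[0]
```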

cuBERT_topic_modelling/mmr.py (-5)
@@ -30,9 +30,6 @@ def mmr(
         List[str]: The selected keywords/keyphrases
     """
 
-    # doc_embedding = doc_embedding.get()
-    # word_embeddings = word_embeddings.get()
-    # words = words.to_arrow().to_pylist()
     # Extract similarity within words, and between words and the document
     word_doc_similarity = pairwise_distances(
         word_embeddings, doc_embedding, metric="cosine"
@@ -43,8 +40,6 @@
     keywords_idx = [cp.argmax(word_doc_similarity).get()]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
 
-    # keywords_idx = keywords_idx.get()
-    # word_similarity = word_similarity.get()
     for _ in range(top_n - 1):
         # Extract similarities within candidates and
         # between candidates and selected keywords/phrases

cuBERT_topic_modelling/setup.py (+7, new file)

@@ -0,0 +1,7 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="cubertopic",
+    version='1.0',
+    packages=find_packages()
+)
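The new setup.py (with find_packages()) makes the cuBERT_topic_modelling directory installable, for example via an editable `pip install -e .`; presumably this is what lets refactored absolute imports such as `utils.sparse_matrix_utils` and `vectorizer.vectorizer` resolve cleanly from the test suite below.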
New test file (+95)

@@ -0,0 +1,95 @@
+from bertopic import BERTopic
+import pandas as pd
+import numpy as np
+import cudf
+from sklearn.datasets import fetch_20newsgroups
+import pytest
+from cupyx.scipy.sparse.csr import csr_matrix
+from cuBERTopic import gpu_BERTopic
+from ctfidf import ClassTFIDF
+from vectorizer.vectorizer import CountVecWrapper
+
+data_trivial = [
+    "This is the first document.",
+    "This document is the second document.",
+    "And this is the third one.",
+    "Is this the first document?",
+]
+
+newsgroup_docs = fetch_20newsgroups(
+    subset="all", remove=("headers", "footers", "quotes")
+)["data"][:1000]
+
+docs_df_trivial = pd.DataFrame(data_trivial, columns=["Document"])
+docs_df_trivial["Topic"] = [1, 2, 0, 1]
+docs_df_trivial = docs_df_trivial.sort_values("Topic")
+
+data_big = fetch_20newsgroups(subset="all")["data"]
+docs_df_big = pd.DataFrame(data_big, columns=["Document"])
+docs_df_big["Topic"] = np.random.randint(0, 100, len(docs_df_big))
+docs_df_big = docs_df_big.sort_values("Topic")
+
+
+def extract_c_tf_idf_scores(documents: pd.DataFrame):
+    cpu_bertopic = BERTopic()
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+        {"Document": " ".join}
+    )
+    cpu_bertopic.c_tf_idf, words = cpu_bertopic._c_tf_idf(
+        documents_per_topic, m=len(documents)
+    )
+    return cpu_bertopic.c_tf_idf, words
+
+
+@pytest.mark.parametrize("docs_df", [(docs_df_trivial), (docs_df_big)])
+def test_ctfidf_values(docs_df):
+    """Test c-TF-IDF values
+    Here we test the values against the _c_tf_idf method from BERTopic
+    to make sure we get the same correctness.
+    """
+    tfidf_score, w = extract_c_tf_idf_scores(docs_df)
+    docs_df_gpu = cudf.from_pandas(docs_df)
+    gpu_topic = gpu_BERTopic()
+    X = gpu_topic.new_c_tf_idf(docs_df_gpu, len(docs_df_gpu))
+    np.testing.assert_almost_equal(X[0].toarray().get(), tfidf_score.toarray())
+
+
+def test_ctfidf_general():
+    """Test c-TF-IDF general
+    Test whether the c-TF-IDF matrix is correctly calculated.
+    This includes the general shape of the matrix as well as the
+    possible values that could occupy the matrix.
+    """
+    nr_topics = 10
+    docs_df = cudf.DataFrame(newsgroup_docs, columns=["Document"])
+    docs_df["Topic"] = np.random.randint(-1, nr_topics, len(newsgroup_docs))
+
+    count = CountVecWrapper(ngram_range=(1, 1))
+    X = count.fit_transform(docs_df)
+    words = count.get_feature_names()
+    multiplier = None
+
+    transformer = ClassTFIDF().fit(
+        X, n_samples=len(newsgroup_docs), multiplier=multiplier
+    )
+
+    c_tf_idf = transformer.transform(X)
+
+    words = words.to_arrow().to_pylist()
+    assert len(words) > 1000
+    assert all([isinstance(x, str) for x in words])
+
+    assert isinstance(X, csr_matrix)
+    assert isinstance(c_tf_idf, csr_matrix)
+
+    assert X.shape[0] == nr_topics + 1
+    assert X.shape[1] == len(words)
+
+    assert c_tf_idf.shape[0] == nr_topics + 1
+    assert c_tf_idf.shape[1] == len(words)
+
+    assert np.min(c_tf_idf) > -1
+    assert np.max(c_tf_idf) < 1
+
+    assert np.min(X) == 0
+
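These new tests use BERTopic's own `_c_tf_idf` as a CPU reference, so numerical agreement with the upstream implementation is checked directly. Assuming the package is installed as above and a GPU with RAPIDS (cudf, cuml, cupy) is available, the c-TF-IDF tests can be run on their own, for example programmatically:

```python
import pytest

# Equivalent to running `pytest -k ctfidf -v` from the repository checkout.
raise SystemExit(pytest.main(["-k", "ctfidf", "-v"]))
```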
