
Commit 678da80: addressing reviews part 1
Parent: 2f2727d

16 files changed: +232 / -241 lines

cuBERT_topic_modelling/README.md (+2, -2)
@@ -20,6 +20,6 @@ An [example](berttopic_example.ipynb) notebook is provided, which goes through t
 
 ## Acknowledgement
 
-Our work has been inspired from the [BERTopic library](https://github.com/MaartenGr/BERTopic) and Maarten Grootendorst's [blog](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6) on how to use BERT to create your own topic model.
-
+Our work ports the CPU implementation of the [BERTopic library](https://github.com/MaartenGr/BERTopic) to a python-based GPU backend using NVIDIA RAPIDS.
 
+Please refer to Maarten Grootendorst's [blog](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6) on how to use BERT to create your own topic model.
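To make the GPU data path concrete, here is a minimal sketch that mirrors the usage exercised by the new test file further down in this commit; the random topic labels are placeholders for the labels HDBSCAN would normally assign, and the exact entry points should be read as illustrative rather than a stable public API.

```python
import cudf
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

from cuBERTopic import gpu_BERTopic

# Small document set; random topic labels stand in for HDBSCAN cluster labels.
docs = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
)["data"][:1000]
docs_df = pd.DataFrame(docs, columns=["Document"])
docs_df["Topic"] = np.random.randint(0, 10, len(docs_df))

# Move the data to the GPU and compute the class-based TF-IDF matrix there.
docs_df_gpu = cudf.from_pandas(docs_df)
gpu_topic = gpu_BERTopic()
tf_idf, count = gpu_topic.new_c_tf_idf(docs_df_gpu, len(docs_df_gpu))
# tf_idf is a CuPy CSR matrix of per-topic word importance scores
```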

cuBERT_topic_modelling/cuBERTopic.py (+71, -74)
@@ -1,9 +1,3 @@
-import numpy as np
-import random
-
-random.seed(10)
-np.random.seed(0)
-
 from sentence_transformers import SentenceTransformer
 import cuml
 import cudf
@@ -12,12 +6,12 @@
 from cuml.metrics import pairwise_distances
 import cupy as cp
 from torch.utils.dlpack import to_dlpack
-from vectorizer import CountVecWrapper
 from ctfidf import ClassTFIDF
 from mmr import mmr
+from utils.sparse_matrix_utils import top_n_idx_sparse, top_n_values_sparse
+from vectorizer.vectorizer import CountVecWrapper
 
-
-class gpu_bertopic:
+class gpu_BERTopic:
     def __init__(self):
         self.top_n_words_df = None
         self.topic_sizes_df = None
@@ -83,7 +77,9 @@ def clustering_hdbscan(self, umap_embeddings, documents):
             documents: DataFrame from the original data
 
         Returns:
-            cluster: HDBSCAN object
+            documents: Modified dataframe with topic labels
+            probabilities: response from cluster.probabilities_ which
+            represents the likelihood of the doc belonging to a cluster.
         """
         cluster = cuml.cluster.HDBSCAN(
             min_cluster_size=10,
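The rewritten docstring now matches what the method actually hands back: the documents frame gains a Topic column and the per-document membership strengths come from cluster.probabilities_. A self-contained sketch of that step, using synthetic embeddings in place of real UMAP output (cuML's HDBSCAN exposes labels_ and probabilities_ after fit):

```python
import cudf
import cuml
import cupy as cp

# Synthetic "UMAP" embeddings stand in for the reduced document embeddings.
umap_embeddings = cp.random.rand(200, 5).astype(cp.float32)
documents = cudf.DataFrame({"Document": [f"doc {i}" for i in range(200)]})

cluster = cuml.cluster.HDBSCAN(min_cluster_size=10)
cluster.fit(umap_embeddings)

documents["Topic"] = cluster.labels_       # -1 marks outlier documents
probabilities = cluster.probabilities_     # per-document cluster membership strength
```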
@@ -127,8 +123,7 @@ def new_c_tf_idf(self, document_df, m, ngram_range=(1, 1)):
     def create_topics(self, docs_df):
         """Extract topics from the clusters using a class-based TF-IDF
         Arguments:
-            data: list with documents
-            cluster: HDBSCAN object
+            docs_df: DataFrame containing documents and other information
         Returns:
             tf_idf: The resulting matrix giving a value (importance score) for
             each word per topic
@@ -142,52 +137,52 @@ def create_topics(self, docs_df):
         tf_idf, count = self.new_c_tf_idf(docs_df, len(docs_df))
         return tf_idf, count, docs_per_topics_topics, docs_df
 
-    def top_n_idx_sparse(self, matrix, n):
-        """Return indices of top n values in each row of a sparse matrix
-        Retrieved from:
-        https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
-        Args:
-            matrix: The sparse matrix from which to get the
-            top n indices per row
-            n: The number of highest values to extract from each row
-        Returns:
-            indices: The top n indices per row
-        """
-        top_n_idx = []
-        mat_inptr_np_ar = matrix.indptr.get()
-        le_np = mat_inptr_np_ar[:-1]
-        ri_np = mat_inptr_np_ar[1:]
-
-        for le, ri in zip(le_np, ri_np):
-            le = le.item()
-            ri = ri.item()
-            n_row_pick = min(n, ri - le)
-            top_n_idx.append(
-                matrix.indices[
-                    le + cp.argpartition(
-                        matrix.data[le:ri], -n_row_pick
-                    )[-n_row_pick:]
-                ]
-            )
-        return cp.array(top_n_idx)
-
-    def top_n_values_sparse(self, matrix, indices):
-        """Return the top n values for each row in a sparse matrix
-        Args:
-            matrix: The sparse matrix from which to get the top n
-            indices per row
-            indices: The top n indices per row
-        Returns:
-            top_values: The top n scores per row
-        """
-        top_values = []
-        for row, values in enumerate(indices):
-            scores = cp.array(
-                [matrix[row, value] if value is not None
-                 else 0 for value in values]
-            )
-            top_values.append(scores)
-        return cp.array(top_values)
+    # def top_n_idx_sparse(self, matrix, n):
+    #     """Return indices of top n values in each row of a sparse matrix
+    #     Retrieved from:
+    #     https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
+    #     Args:
+    #         matrix: The sparse matrix from which to get the
+    #         top n indices per row
+    #         n: The number of highest values to extract from each row
+    #     Returns:
+    #         indices: The top n indices per row
+    #     """
+    #     top_n_idx = []
+    #     mat_inptr_np_ar = matrix.indptr.get()
+    #     le_np = mat_inptr_np_ar[:-1]
+    #     ri_np = mat_inptr_np_ar[1:]
+
+    #     for le, ri in zip(le_np, ri_np):
+    #         le = le.item()
+    #         ri = ri.item()
+    #         n_row_pick = min(n, ri - le)
+    #         top_n_idx.append(
+    #             matrix.indices[
+    #                 le + cp.argpartition(
+    #                     matrix.data[le:ri], -n_row_pick
+    #                 )[-n_row_pick:]
+    #             ]
+    #         )
+    #     return cp.array(top_n_idx)
+
+    # def top_n_values_sparse(self, matrix, indices):
+    #     """Return the top n values for each row in a sparse matrix
+    #     Args:
+    #         matrix: The sparse matrix from which to get the top n
+    #         indices per row
+    #         indices: The top n indices per row
+    #     Returns:
+    #         top_values: The top n scores per row
+    #     """
+    #     top_values = []
+    #     for row, values in enumerate(indices):
+    #         scores = cp.array(
+    #             [matrix[row, value] if value is not None
+    #              else 0 for value in values]
+    #         )
+    #         top_values.append(scores)
+    #     return cp.array(top_values)
 
     # Topic representation
     def extract_top_n_words_per_topic(
@@ -213,8 +208,8 @@ def extract_top_n_words_per_topic(
 
         labels = sorted(docs_per_topics_topics.to_arrow().to_pylist())
 
-        indices = self.top_n_idx_sparse(tf_idf, n)
-        scores = self.top_n_values_sparse(tf_idf, indices)
+        indices = top_n_idx_sparse(tf_idf, n)
+        scores = top_n_values_sparse(tf_idf, indices)
         sorted_indices = cp.argsort(scores, 1)
         indices = cp.take_along_axis(indices, sorted_indices, axis=1)
         scores = cp.take_along_axis(scores, sorted_indices, axis=1)
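With the two sparse-matrix helpers now living in utils.sparse_matrix_utils instead of on the class, they can be exercised on their own. A hedged sketch, assuming the relocated functions keep the signatures of the removed methods shown above:

```python
import cupy as cp
from cupyx.scipy.sparse import csr_matrix

from utils.sparse_matrix_utils import top_n_idx_sparse, top_n_values_sparse

# Toy c-TF-IDF-like matrix: 2 "topics" x 4 "words"; each row has at least 2 nonzeros.
tf_idf = csr_matrix(cp.array([
    [0.1, 0.0, 0.7, 0.2],
    [0.4, 0.3, 0.0, 0.6],
], dtype=cp.float32))

indices = top_n_idx_sparse(tf_idf, 2)          # column indices of the 2 largest values per row
scores = top_n_values_sparse(tf_idf, indices)  # the matching scores, row-aligned with `indices`
```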
@@ -309,9 +304,11 @@ def reduce_topics(self, num_topics, tf_idf, docs_df):
                 similarities[topic_to_merge + 1]) - 1
 
             # Adjust topics
-            a = cudf.Series(topic_to_merge_into)
-            docs_df.loc[docs_df["Topic"] == topic_to_merge, "Topic"] = a
-            old_topics = docs_df.sort_values("Topic").Topic.unique()
+            topic_to_merge_into_series = cudf.Series(topic_to_merge_into)
+            docs_df.loc[
+                docs_df["Topic"] == topic_to_merge, "Topic"
+            ] = topic_to_merge_into_series
+            old_topics = docs_df.Topic.unique().sort_values()
             old_topics = old_topics.to_arrow().to_pylist()
             map_topics = {
                 old_topic: index - 1 for index,
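Behind the renaming, the cudf pattern is unchanged: rows whose Topic equals the merged-away id are relabelled through a boolean .loc mask, and the surviving topic ids are collected in sorted order. A tiny standalone illustration of that pattern (the real code wraps the replacement value in cudf.Series; a plain Python int is enough for this toy case):

```python
import cudf

# Toy frame with three topics; merge topic 2 into topic 0.
docs_df = cudf.DataFrame({"Document": ["a", "b", "c", "d"], "Topic": [0, 2, 1, 2]})
topic_to_merge, topic_to_merge_into = 2, 0

docs_df.loc[docs_df["Topic"] == topic_to_merge, "Topic"] = topic_to_merge_into
old_topics = docs_df.Topic.unique().sort_values()  # surviving topic ids, sorted
print(old_topics.to_arrow().to_pylist())           # [0, 1]
```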
@@ -397,31 +394,31 @@ def get_topic_info(self):
 
         # Note: getting topics in sorted order without using
         # TopicMapper, as in BERTopic
-        test = self.topic_sizes_df.Name.str.split(
+        topic_sizes_df_columns = self.topic_sizes_df.Name.str.split(
             "_", expand=True
         )[[0, 1, 2, 3, 4]]
-        self.original_topic_mapping = test[0]
+        self.original_topic_mapping = topic_sizes_df_columns[0]
         self.new_topic_mapping = sorted(
             self.topic_sizes_df["Topic"].to_pandas()
         )
 
         self.original_topic_mapping = self.original_topic_mapping.to_arrow().to_pylist()
         self.final_topic_mapping = dict(zip(self.new_topic_mapping,
                                             self.original_topic_mapping))
-        test[0] = self.new_topic_mapping
-        test["Name"] = (
-            test[0].astype("str")
+        topic_sizes_df_columns[0] = self.new_topic_mapping
+        topic_sizes_df_columns["Name"] = (
+            topic_sizes_df_columns[0].astype("str")
             + "_"
-            + test[1]
+            + topic_sizes_df_columns[1]
             + "_"
-            + test[2]
+            + topic_sizes_df_columns[2]
             + "_"
-            + test[3]
+            + topic_sizes_df_columns[3]
             + "_"
-            + test[4]
+            + topic_sizes_df_columns[4]
         )
-        self.topic_sizes_df["Name"] = test["Name"]
-        self.topic_sizes_df["Topic"] = test[0]
+        self.topic_sizes_df["Name"] = topic_sizes_df_columns["Name"]
+        self.topic_sizes_df["Topic"] = topic_sizes_df_columns[0]
         return self.topic_sizes_df
 
     def update_topic_size(self, documents):
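The frame renamed to topic_sizes_df_columns is simply the underscore-delimited topic Name split into positional columns, with column 0 (the topic id) remapped and the Name rebuilt afterwards. A small hedged illustration of that cudf string-split round trip:

```python
import cudf

# Topic names follow the "<topic id>_<word1>_<word2>_<word3>_<word4>" pattern.
topic_sizes_df = cudf.DataFrame(
    {"Name": ["3_gpu_cuda_rapids_cudf", "0_nasa_space_orbit_moon"]}
)

cols = topic_sizes_df.Name.str.split("_", expand=True)[[0, 1, 2, 3, 4]]
cols[0] = [0, 1]  # remapped topic ids
topic_sizes_df["Name"] = (
    cols[0].astype("str")
    + "_" + cols[1]
    + "_" + cols[2]
    + "_" + cols[3]
    + "_" + cols[4]
)
topic_sizes_df["Topic"] = cols[0]
```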

cuBERT_topic_modelling/mmr.py (-5)
@@ -30,9 +30,6 @@ def mmr(
         List[str]: The selected keywords/keyphrases
     """
 
-    # doc_embedding = doc_embedding.get()
-    # word_embeddings = word_embeddings.get()
-    # words = words.to_arrow().to_pylist()
     # Extract similarity within words, and between words and the document
     word_doc_similarity = pairwise_distances(
         word_embeddings, doc_embedding, metric="cosine"
@@ -43,8 +40,6 @@
     keywords_idx = [cp.argmax(word_doc_similarity).get()]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
 
-    # keywords_idx = keywords_idx.get()
-    # word_similarity = word_similarity.get()
     for _ in range(top_n - 1):
         # Extract similarities within candidates and
         # between candidates and selected keywords/phrases

cuBERT_topic_modelling/setup.py (+7, new file)

@@ -0,0 +1,7 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="cubertopic",
+    version='1.0',
+    packages=find_packages()
+)
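The new setup.py (with find_packages()) makes the cuBERT_topic_modelling directory installable, for example via an editable `pip install -e .`; presumably this is what lets refactored absolute imports such as `utils.sparse_matrix_utils` and `vectorizer.vectorizer` resolve cleanly from the test suite below.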
New test file (+95)

@@ -0,0 +1,95 @@
+from bertopic import BERTopic
+import pandas as pd
+import numpy as np
+import cudf
+from sklearn.datasets import fetch_20newsgroups
+import pytest
+from cupyx.scipy.sparse.csr import csr_matrix
+from cuBERTopic import gpu_BERTopic
+from ctfidf import ClassTFIDF
+from vectorizer.vectorizer import CountVecWrapper
+
+data_trivial = [
+    "This is the first document.",
+    "This document is the second document.",
+    "And this is the third one.",
+    "Is this the first document?",
+]
+
+newsgroup_docs = fetch_20newsgroups(
+    subset="all", remove=("headers", "footers", "quotes")
+)["data"][:1000]
+
+docs_df_trivial = pd.DataFrame(data_trivial, columns=["Document"])
+docs_df_trivial["Topic"] = [1, 2, 0, 1]
+docs_df_trivial = docs_df_trivial.sort_values("Topic")
+
+data_big = fetch_20newsgroups(subset="all")["data"]
+docs_df_big = pd.DataFrame(data_big, columns=["Document"])
+docs_df_big["Topic"] = np.random.randint(0, 100, len(docs_df_big))
+docs_df_big = docs_df_big.sort_values("Topic")
+
+
+def extract_c_tf_idf_scores(documents: pd.DataFrame):
+    cpu_bertopic = BERTopic()
+    documents_per_topic = documents.groupby(["Topic"], as_index=False).agg(
+        {"Document": " ".join}
+    )
+    cpu_bertopic.c_tf_idf, words = cpu_bertopic._c_tf_idf(
+        documents_per_topic, m=len(documents)
+    )
+    return cpu_bertopic.c_tf_idf, words
+
+
+@pytest.mark.parametrize("docs_df", [(docs_df_trivial), (docs_df_big)])
+def test_ctfidf_values(docs_df):
+    """Test c-TF-IDF values
+    Here we test the values against the _c_tf_idf method from BERTopic
+    to make sure we get the same correctness.
+    """
+    tfidf_score, w = extract_c_tf_idf_scores(docs_df)
+    docs_df_gpu = cudf.from_pandas(docs_df)
+    gpu_topic = gpu_BERTopic()
+    X = gpu_topic.new_c_tf_idf(docs_df_gpu, len(docs_df_gpu))
+    np.testing.assert_almost_equal(X[0].toarray().get(), tfidf_score.toarray())
+
+
+def test_ctfidf_general():
+    """Test c-TF-IDF general
+    Test whether the c-TF-IDF matrix is correctly calculated.
+    This includes the general shape of the matrix as well as the
+    possible values that could occupy the matrix.
+    """
+    nr_topics = 10
+    docs_df = cudf.DataFrame(newsgroup_docs, columns=["Document"])
+    docs_df["Topic"] = np.random.randint(-1, nr_topics, len(newsgroup_docs))
+
+    count = CountVecWrapper(ngram_range=(1, 1))
+    X = count.fit_transform(docs_df)
+    words = count.get_feature_names()
+    multiplier = None
+
+    transformer = ClassTFIDF().fit(
+        X, n_samples=len(newsgroup_docs), multiplier=multiplier
+    )
+
+    c_tf_idf = transformer.transform(X)
+
+    words = words.to_arrow().to_pylist()
+    assert len(words) > 1000
+    assert all([isinstance(x, str) for x in words])
+
+    assert isinstance(X, csr_matrix)
+    assert isinstance(c_tf_idf, csr_matrix)
+
+    assert X.shape[0] == nr_topics + 1
+    assert X.shape[1] == len(words)
+
+    assert c_tf_idf.shape[0] == nr_topics + 1
+    assert c_tf_idf.shape[1] == len(words)
+
+    assert np.min(c_tf_idf) > -1
+    assert np.max(c_tf_idf) < 1
+
+    assert np.min(X) == 0
+
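These new tests use BERTopic's own `_c_tf_idf` as a CPU reference, so numerical agreement with the upstream implementation is checked directly. Assuming the package is installed as above and a GPU with RAPIDS (cudf, cuml, cupy) is available, the c-TF-IDF tests can be run on their own, for example programmatically:

```python
import pytest

# Equivalent to running `pytest -k ctfidf -v` from the repository checkout.
raise SystemExit(pytest.main(["-k", "ctfidf", "-v"]))
```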
