# topkwords.py
# !pip install transformers -q
from transformers import AutoTokenizer, AutoModel, pipeline
import matplotlib.pyplot as plt
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Uncomment these on the first run to download the NLTK data, then comment them out again.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')
'''Trying out different models'''
# Pretrained on a clickbait corpus
# tokenizer = AutoTokenizer.from_pretrained("caush/Clickbait1")
# model = AutoModel.from_pretrained("caush/Clickbait1", output_hidden_states=True, output_attentions=True)
# Pretrained on a clickbait corpus
tokenizer = AutoTokenizer.from_pretrained("valurank/distilroberta-clickbait", add_prefix_space=True)
model = AutoModel.from_pretrained("valurank/distilroberta-clickbait", output_hidden_states=True, output_attentions=True)
# RoBERTa base model, not fine-tuned on a clickbait corpus
# tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
# model = AutoModel.from_pretrained("FacebookAI/roberta-base", output_hidden_states=True, output_attentions=True)
model.eval()  # inference mode: disables dropout
def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens
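# e.g. tokenize_and_remove_stopwords("How to choose the best college for you")
# returns ['choose', 'best', 'college'] with NLTK's default English stopword list.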
def word_space_tokenizer(text):
    return text.split()
def word_to_vectors(text):
    # TODO: stub left unimplemented in the original
    return
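# A minimal sketch of what word_to_vectors might be meant to return, assuming
# the intent is per-token embeddings from the model's last hidden state (an
# assumption, not the author's confirmed design):
# def word_to_vectors(text):
#     inputs = tokenizer(text, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state[0]  # shape: (seq_len, hidden_size)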
def predict(text, top_k=None, plot_graph=False):
    # Playing with different tokenizers
    tokens = tokenize_and_remove_stopwords(text)
    # Hugging Face tokenizer
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
    # inputs = tokenizer("You Won't Believe what happened next!", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    # Tuple of per-layer attention tensors, each (batch, num_heads, seq_len, seq_len)
    attentions = outputs.attentions
    # print(attentions)
    layer_x = -2  # second-to-last layer
    # Attention from the previous layer to the next
    attention_layer_x = attentions[layer_x]
    # CLS token attention from layer X to X+1: (num_heads, seq_len)
    cls_attention = attention_layer_x[0, :, 0, :]
    # Average across heads: (seq_len,)
    avg_cls_attention_to_tokens = cls_attention.mean(dim=0)
    sorted_indices = torch.argsort(avg_cls_attention_to_tokens, descending=True)
    if top_k is None:
        top_k = len(sorted_indices)
    top_K_words_list = [(tokens[i], avg_cls_attention_to_tokens[i].item()) for i in sorted_indices[:top_k]]
    # avg attention plot
    if plot_graph:
        plt.figure(figsize=(10, 6))
        plt.bar(range(len(tokens)), avg_cls_attention_to_tokens.detach().numpy())
        plt.xticks(range(len(tokens)), tokens, rotation=90)
        plt.xlabel("Tokens")
        plt.ylabel("Attention Weight")
        plt.title(f"CLS Token Attention (Layer {layer_x} to Layer {layer_x+1})")
        plt.tight_layout()
        plt.show()
    return top_K_words_list
    # last_hidden_states = outputs.last_hidden_state
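# Example usage: the five highest-attention tokens plus the bar chart
# (argument values here are illustrative):
# print(predict("You won't believe what happened next!", top_k=5, plot_graph=True))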
text = "How to choose the best college for you"
text = "An Open Letter to Jerry Seinfeld from a 'Politically Correct' College Student"
text = "Kids runs away from house to become the greatest Gamer of all time!"
text = "100 Fast ways to make quick quick"
text = "German killed by elephant"
text = "Wow, there are things that you can never put in the freezer!"
text = "How to increase your profit using the same content marketing strategy as Spotify?"
text = "President announce free PS4 for all kids"
print(predict(text))
print(text)
# print(len(outputs.hidden_states))
# print(len(outputs.hidden_states[-1].shape))
# print(outputs.hidden_states[-1].shape)
# print(outputs.hidden_states[-2])
"""Using the pretrained model on the Webis Clickbait data set"""
# pipe = pipeline("text-classification", model="caush/Clickbait1")
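# A minimal sketch of using the commented-out pipeline above; the label names
# depend on the model's config, so the output shown is illustrative only:
# pipe = pipeline("text-classification", model="caush/Clickbait1")
# print(pipe("You won't believe what happened next!"))
# # e.g. [{'label': 'LABEL_1', 'score': 0.97}]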