#!/usr/bin/env python3
"""
YoutubeNLP

Usage:
    python main.py "video_url" [comment_count]

Fetches the video's comments and analyzes them with the Google Cloud
Natural Language API, producing two plots and a CSV file:
- The first plot shows the sentiment distribution of the comments.
- The second plot is a word cloud built from the comment text.
- The CSV contains the username, the comment text, its sentiment score,
  and its keywords.
"""
import sys
import csv
import logging
from itertools import islice
from typing import Dict, List, Iterator, Optional, Any, Tuple

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from youtube_comment_downloader import YoutubeCommentDownloader, SORT_BY_POPULAR
from google.cloud import language_v1

from utils.video_utils import get_video_metadata

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("YoutubeNLP")
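
# NOTE (assumption about deployment): the Google Cloud client used below
# authenticates via Application Default Credentials. A typical setup points
# the GOOGLE_APPLICATION_CREDENTIALS environment variable at a
# service-account key file before running, e.g.:
#   export GOOGLE_APPLICATION_CREDENTIALS="/path/to/key.json"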


def get_comments(url: str, comments_len: int = 100) -> Iterator[dict]:
    """
    Get comments from a YouTube video.

    Args:
        url: YouTube video URL
        comments_len: Maximum number of comments to retrieve

    Returns:
        Iterator of comment dicts as yielded by youtube_comment_downloader
    """
    try:
        downloader = YoutubeCommentDownloader()
        comments = downloader.get_comments_from_url(url, sort_by=SORT_BY_POPULAR)
        # Take at most comments_len comments; islice avoids fetching the rest
        return islice(comments, comments_len)
    except Exception as e:
        logger.error(f"Failed to get comments: {e}")
        return iter([])  # Empty iterator so callers can still loop safely
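

# Example (sketch, with a hypothetical video ID): each comment is a dict,
# so fields are read by key:
#   for c in get_comments("https://www.youtube.com/watch?v=VIDEO_ID", 5):
#       print(c["author"], c["text"])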


def get_sentiment(comment: dict, client: Optional[language_v1.LanguageServiceClient] = None) -> float:
    """
    Get the sentiment score of a comment.

    Args:
        comment: Comment dict from youtube_comment_downloader
        client: Language service client (created if None)

    Returns:
        Sentiment score (-1.0 to 1.0)
    """
    try:
        if client is None:
            client = language_v1.LanguageServiceClient()
        # The text to analyze; the downloader yields dicts, not objects
        text = comment["text"]
        document = language_v1.Document(
            content=text,
            type=language_v1.Document.Type.PLAIN_TEXT)
        # Detect the sentiment of the text
        sentiment = client.analyze_sentiment(document=document).document_sentiment
        return sentiment.score
    except Exception as e:
        logger.warning(f"Failed to analyze sentiment: {e}")
        return 0.0  # Neutral sentiment as fallback
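

# NOTE: document_sentiment also carries a `magnitude` field (overall emotional
# strength); only the normalized score in [-1.0, 1.0] is used here.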


def get_keywords(comment: dict, client: Optional[language_v1.LanguageServiceClient] = None) -> List[str]:
    """
    Extract keywords (noun lemmas) from a comment.

    Args:
        comment: Comment dict from youtube_comment_downloader
        client: Language service client (created if None)

    Returns:
        List of keywords
    """
    try:
        if client is None:
            client = language_v1.LanguageServiceClient()
        # The text to analyze; the downloader yields dicts, not objects
        text = comment["text"]
        document = language_v1.Document(
            content=text,
            type=language_v1.Document.Type.PLAIN_TEXT)
        # Detect syntax and extract tokens
        tokens = client.analyze_syntax(document=document).tokens
        # Keep noun lemmas as keywords (named enum instead of a magic number)
        noun_tag = language_v1.PartOfSpeech.Tag.NOUN
        return [token.lemma for token in tokens if token.part_of_speech.tag == noun_tag]
    except Exception as e:
        logger.warning(f"Failed to extract keywords: {e}")
        return []
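

# Example (sketch): the keyword filter can be widened to other parts of speech
# via the PartOfSpeech.Tag enum, e.g. nouns and verbs together:
#   wanted = {language_v1.PartOfSpeech.Tag.NOUN, language_v1.PartOfSpeech.Tag.VERB}
#   keywords = [t.lemma for t in tokens if t.part_of_speech.tag in wanted]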


def analyze_comments(url: str, comments_len: int = 100) -> Tuple[Dict[str, Any], str]:
    """
    Analyze comments from a YouTube video and generate visualizations.

    Args:
        url: YouTube video URL
        comments_len: Maximum number of comments to analyze

    Returns:
        Tuple containing analysis results and CSV filename
    """
    # Get video metadata
    video_metadata = get_video_metadata(url)
    if not video_metadata:
        logger.error("Failed to get video metadata")
    else:
        logger.info(f"Analyzing video: {video_metadata.title or video_metadata.video_id}")
    # Define the output file based on the video ID if available
    output_filename = f"comments_{video_metadata.video_id}.csv" if video_metadata else "comments.csv"
    # Get comments; materialize the iterator so the emptiness check works
    # (an islice object is always truthy, even when it yields nothing)
    comments = list(get_comments(url, comments_len))
    if not comments:
        logger.error("No comments found or error occurred")
        return {"error": "No comments found"}, output_filename
    # Initialize the Language API client (reused for all comments)
    try:
        nlp_client = language_v1.LanguageServiceClient()
    except Exception as e:
        logger.error(f"Failed to initialize Google Language API: {e}")
        logger.warning("Continuing without sentiment analysis")
        nlp_client = None
    # Create the CSV file
    try:
        with open(output_filename, "w", newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            # Include video metadata in the header
            if video_metadata:
                if video_metadata.title:
                    csv_writer.writerow(["Video Title", video_metadata.title])
                if video_metadata.channel:
                    csv_writer.writerow(["Channel", video_metadata.channel])
                csv_writer.writerow([])  # Empty row for separation
            # Write column headers
            csv_writer.writerow(["user", "comment", "sentiment", "keywords"])
            # Process all comments
            all_comments = ""
            sentiments = []
            comment_count = 0
            for comment in comments:
                comment_count += 1
                if comment_count % 10 == 0:
                    logger.info(f"Processed {comment_count} comments")
                # Get the sentiment of the comment
                sentiment = get_sentiment(comment, nlp_client)
                # Get the keywords of the comment
                keywords = get_keywords(comment, nlp_client)
                # Write the row; the downloader yields dicts, so index by key
                csv_writer.writerow([comment["author"], comment["text"], sentiment, ",".join(keywords)])
                # Accumulate text for the word cloud
                all_comments += comment["text"] + " "
                # Collect the sentiment score
                sentiments.append(sentiment)
        logger.info(f"Analysis complete. Processed {comment_count} comments.")
        # Bundle the results for visualization
        results = {
            "comment_count": comment_count,
            "sentiments": sentiments,
            "wordcloud_text": all_comments,
            "video_metadata": video_metadata.__dict__ if video_metadata else {}
        }
        return results, output_filename
    except Exception as e:
        logger.error(f"Error analyzing comments: {e}")
        return {"error": str(e)}, output_filename


def display_results(results: Dict[str, Any], output_filename: str) -> None:
    """
    Display analysis results including video information and visualizations.

    Args:
        results: Analysis results from analyze_comments
        output_filename: Path to the CSV file containing comment data
    """
    if "error" in results:
        logger.error(f"Analysis error: {results['error']}")
        return
    # Display video information
    video_metadata = results.get("video_metadata", {})
    if video_metadata:
        print("\n" + "=" * 40)
        print("VIDEO INFORMATION:")
        print("=" * 40)
        if "title" in video_metadata and video_metadata["title"]:
            print(f"Title: {video_metadata['title']}")
        if "channel" in video_metadata and video_metadata["channel"]:
            print(f"Channel: {video_metadata['channel']}")
        if "view_count" in video_metadata and video_metadata["view_count"]:
            print(f"Views: {video_metadata['view_count']:,}")
        if "like_count" in video_metadata and video_metadata["like_count"]:
            print(f"Likes: {video_metadata['like_count']:,}")
        # Display embed information
        if "video_id" in video_metadata:
            print("\nEmbed URL:")
            print(f"https://www.youtube.com/embed/{video_metadata['video_id']}")
        print("=" * 40 + "\n")
    # Display comment information
    print(f"Analyzed {results['comment_count']} comments")
    print(f"Results saved to: {output_filename}")
    # Show the sentiment analysis
    if "sentiments" in results and results["sentiments"]:
        plt.figure(figsize=(10, 6))
        plt.hist(results["sentiments"], bins=10, edgecolor='black')
        plt.title('Comment Sentiment Distribution')
        plt.xlabel('Sentiment Score (-1 to +1)')
        plt.ylabel('Number of Comments')
        plt.grid(axis='y', alpha=0.75)
        plt.tight_layout()
        plt.show()
    # Generate and show the word cloud
    if "wordcloud_text" in results and results["wordcloud_text"]:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=200,
            contour_width=3,
            contour_color='steelblue'
        ).generate(results["wordcloud_text"])
        # Show the word cloud
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title('Word Cloud of Comment Text')
        plt.tight_layout()
        plt.show()


def main() -> None:
    """Main entry point for the script."""
    if len(sys.argv) < 2:
        print("Usage: python main.py <youtube_url> [comment_count]")
        sys.exit(1)  # Nonzero exit code signals a usage error
    # Get the URL from the command line
    url = sys.argv[1]
    # Get the optional comment count parameter
    comments_len = 100
    if len(sys.argv) > 2:
        try:
            comments_len = int(sys.argv[2])
        except ValueError:
            logger.warning(f"Invalid comment count: {sys.argv[2]}. Using default (100).")
    # Analyze comments
    results, csv_filename = analyze_comments(url, comments_len)
    # Display results
    display_results(results, csv_filename)


if __name__ == "__main__":
    main()