-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfetchTwitterData.py
160 lines (125 loc) · 5.57 KB
/
fetchTwitterData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import tweepy
import sys
import json
import unicodedata
import os
from collections import defaultdict
#url = 'http://localhost:3300/twitterdata' # Set destination URL here (currently unused)

#IMPORTANT VARIABLES
#Both command-line arguments are required; exit with a usage message rather than a bare IndexError
if len(sys.argv) < 3:
    sys.exit("Usage: {} <twitter_user> <max_tweets>".format(sys.argv[0]))
TWITTER_USER = sys.argv[1] #@Username of the twitter user to be analyzed
MAX_TWEETS = int(sys.argv[2]) #Maximum number of tweets to be scraped from user's profile (inc. retweets)
MAX_MENTIONS = 10 #Maximum number of top mentions to store (i.e. store top 10 mentions)

#Building the JSON format (key order is the order the keys appear in the output file)
tweetData = {
    'direct_tweets': [],
    'images': [],
    'top_mentions': [],
    'profile_picture_url': "",
    'num_followers': "",
    'num_following': "",
}

#Maps the id of a mentioned user to the number of times that user was mentioned
mentionData = defaultdict(int)
def sliceMentions(tweetText, tweetObj):
    '''
    Function: sliceMentions
    Desc: Removes each mention (@someone) from the tweet text so that it will not be analyzed for emotions,
          and counts every user listed in the tweet's 'user_mentions' entities in the module-level
          mentionData mapping (user id -> number of mentions) so that top mentions can be calculated.
          A mention token runs from the '@' (or from a '.' directly in front of it) up to and including
          the next space, or to the end of the text when no space follows.
    Input: tweetText - string containing the entirety of the text of the tweet
           tweetObj - instance of the tweet; its entities['user_mentions'] entries are read by 'id'
    Output: Returns text after removing mentions
    '''
    while '@' in tweetText:
        mentionStart = tweetText.find('@')
        #Include a '.' placed directly before the mention (".@someone").
        #The > 0 guard stops index -1 from wrapping around to the last character of the tweet.
        if mentionStart > 0 and tweetText[mentionStart - 1] == '.':
            mentionStart -= 1
        spaceIndex = tweetText.find(' ', mentionStart)
        #Edge case: If the mention is the last word in the tweet, we will not find a space
        mentionEnd = len(tweetText) if spaceIndex == -1 else spaceIndex + 1
        #Every occurrence of the token is removed, so a repeated mention is handled in one pass
        tweetText = tweetText.replace(tweetText[mentionStart:mentionEnd], "")

    #Count each mentioned user once per tweet, keyed by user id
    for currentMention in tweetObj.entities.get('user_mentions', []):
        mentionedId = currentMention['id']
        mentionData[mentionedId] = mentionData.get(mentionedId, 0) + 1
    return tweetText
def storeTopMentions():
    '''
    Function: storeTopMentions
    Desc: Stores at most MAX_MENTIONS of the most frequently mentioned users (excluding the analyzed
          user, TWITTER_USER_ID) together with how often each was mentioned. Each stored user is
          looked up through api.get_user to obtain the screen name and profile picture URL.
    Output: Appends one entry per user ('user', 'num_mentions', 'profile_picture_url') to
            tweetData['top_mentions']
    '''
    #Most-mentioned first; the analyzed user's own id is skipped
    rankedIds = sorted(mentionData, key=mentionData.get, reverse=True)
    topMentions = [
        (mentionedId, mentionData[mentionedId])
        for mentionedId in rankedIds
        if mentionedId != TWITTER_USER_ID
    ][:MAX_MENTIONS] #slice keeps exactly MAX_MENTIONS entries (the old '>' check kept one extra)

    for user_id, mentioned_frq in topMentions:
        currentUser = api.get_user(user_id=user_id)
        tweetData['top_mentions'].append({
            'user': str("@" + currentUser.screen_name),
            'num_mentions': str(mentioned_frq),
            'profile_picture_url': str(currentUser.profile_image_url_https)
        })
def scrapeImages(tweet):
    '''
    Function: scrapeImages
    Desc: Finds all images contained in a tweet and stores the direct URL and the tweet date.
          Retweets (text beginning with "RT ") and media whose URL contains "video"
          (video thumbnails) are ignored.
    Input: Instance of the tweet (reads entities, extended_entities, full_text and created_at)
    Output: Appends {'URL', 'date'} entries to tweetData['images']
    '''
    if "media" not in tweet.entities:
        return
    #Only a leading "RT " marks a retweet; a substring test would also drop original
    #tweets that merely contain "RT " somewhere in their text (e.g. "START now")
    if tweet.full_text.startswith("RT "):
        return

    #Prefer extended_entities (read by the original code for the media list);
    #fall back to entities if the attribute is absent
    mediaSource = getattr(tweet, "extended_entities", None) or tweet.entities
    for mediaItem in mediaSource.get("media", []):
        imageURL = mediaItem['media_url_https']
        #Ignoring video thumbnails
        if "video" in imageURL:
            continue
        tweetData['images'].append({
            'URL': str(imageURL),
            'date': str(tweet.created_at)
        })
def storeTweet(tweet):
    '''
    Function: storeTweet
    Desc: Stores the date and text of the tweet. A leading retweet marker ("RT ") is dropped and
          mentions are removed from the text (and counted) by sliceMentions before it is stored.
    Input: Instance of the current tweet (reads full_text and created_at)
    Output: Appends a {'text', 'date'} entry to tweetData['direct_tweets']
    '''
    tweetText = str(tweet.full_text)
    #Retweeted tweets all begin with "RT ..": strip only that leading marker.
    #Replacing every "RT " would corrupt ordinary words (e.g. "SMART car" -> "SMAcar").
    if tweetText.startswith("RT "):
        tweetText = tweetText[len("RT "):]
    tweetText = sliceMentions(tweetText, tweet)
    tweetData['direct_tweets'].append({
        'text': str(tweetText),
        'date': str(tweet.created_at)
    })
################ API SETUP #####################
#Credentials are read from the "Twitter" section of config.json.
#The context manager closes the file handle once the config has been parsed.
with open('config.json') as configFile:
    config = json.load(configFile)["Twitter"]
auth = tweepy.OAuthHandler(config["consumer_key"], config["consumer_secret"])
auth.set_access_token(config["access_token"], config["access_secret"])
api = tweepy.API(auth)
#################################################

#Profile information of the analyzed user
userProfile = api.get_user(screen_name=TWITTER_USER)
tweetData['num_followers'] = userProfile.followers_count
tweetData['num_following'] = userProfile.friends_count
#Retrieve high-res version of user's profile picture by dropping the "_normal" size suffix from the URL
profilePictureURL = userProfile.profile_image_url_https.replace("_normal", "")
tweetData['profile_picture_url'] = str(profilePictureURL)
#Used by storeTopMentions to leave the analyzed user out of their own top mentions
TWITTER_USER_ID = userProfile.id

#Walk the user's timeline, stopping once MAX_TWEETS tweets have been processed
timeline = tweepy.Cursor(api.user_timeline, tweet_mode='extended', screen_name=TWITTER_USER).items()
for tweetCount, tweet in enumerate(timeline, start=1):
    scrapeImages(tweet)
    storeTweet(tweet)
    if tweetCount >= MAX_TWEETS:
        break

storeTopMentions()

#The output file is named after the current process id: <pid>.json
with open(str(os.getpid())+'.json', 'w') as f:
    json.dump(tweetData, f, indent=2)

#Disabled: posting the collected data to a destination URL
# request = Request(url, urlencode(tweetData).encode())
# json = urlopen(request).read().decode()