# TwitterAccounts / scripts / twitter_scraper.py
# aus10powell's picture
# Upload twitter_scraper.py
# 07768ac
# raw
# history blame
# 7.17 kB
import snscrape.modules.twitter as sntwitter
import pandas as pd
import datetime as dt
from tqdm import tqdm
import requests
from scripts import sentiment
def get_latest_account_tweets(handle):
    """
    Fetches the latest timeline tweets of an account through the Twitter API (tweepy).
    API credentials are read from the ``AUTHENTICATION`` section of ``tweepy_auth.ini``
    in the current working directory.
    Args:
        handle (str): The screen name of the account whose timeline is fetched.
    Returns:
        A pandas DataFrame with one row per tweet (the raw JSON fields of each tweet),
        sorted chronologically by ``created_at`` (parsed to timezone-aware datetimes),
        plus a ``clean_text`` column (output of ``sentiment.tweet_cleaner``) and a
        ``handle`` column. An empty DataFrame is returned when the timeline is empty.
    """
    import configparser

    import tweepy

    config = configparser.ConfigParser()
    config.read("tweepy_auth.ini")
    # Get the authentication details
    authentication_section = config["AUTHENTICATION"]
    consumer_key = authentication_section["twitter_consumer_key"]
    consumer_secret = authentication_section["twitter_consumer_secret"]
    access_token = authentication_section["twitter_access_token"]
    access_token_secret = authentication_section["twitter_access_token_secret"]
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    # create the API object
    api = tweepy.API(auth)
    # load the tweets from a specific user
    # NOTE(review): the API presumably caps `count` well below this value - confirm.
    tweets = api.user_timeline(
        screen_name=handle, count=10000000, tweet_mode="extended"
    )
    if not tweets:
        # Nothing to sort or clean; avoid a KeyError on the missing columns.
        return pd.DataFrame()
    df_tweets = pd.DataFrame(data=[t._json for t in tweets])
    # Attach the cleaned text BEFORE sorting: a list is assigned positionally,
    # so it must be added while the rows are still in the same order as `tweets`.
    df_tweets["clean_text"] = [
        sentiment.tweet_cleaner(tweet.full_text) for tweet in tweets
    ]
    # Parse the timestamp strings (e.g. "Wed Oct 10 20:19:24 +0000 2018") so the
    # sort is chronological rather than alphabetical by weekday name.
    df_tweets["created_at"] = pd.to_datetime(
        df_tweets["created_at"], format="%a %b %d %H:%M:%S %z %Y"
    )
    df_tweets = df_tweets.sort_values("created_at")
    df_tweets["handle"] = df_tweets.user.iloc[0]["screen_name"]
    return df_tweets
def get_tweets(
    query: str,
) -> list:
    """
    Runs a Twitter search through snscrape and collects the summarised results.
    Args:
        query (str): The search query passed to the snscrape Twitter search scraper.
    Returns:
        A list with one dictionary per scraped tweet, as produced by extract_tweet_info.
    """
    print(f"Fetching tweets with query: {query}")
    scraper = sntwitter.TwitterSearchScraper(query)
    collected = []
    # tqdm only reports progress; every item yielded by the scraper is kept.
    for item in tqdm(scraper.get_items()):
        collected.append(extract_tweet_info(item))
    return collected
def get_replies(username: str, conversation_id: str, max_tweets: int) -> list:
    """
    Fetches the replies for a given Twitter user and conversation, and returns a list of extracted tweet information.
    The search query used is ``to:<username> since_id:<conversation_id> filter:safe``.
    Args:
        username (str): The username of the Twitter user whose replies are to be fetched.
        conversation_id (str): The ID of the conversation for which replies are to be fetched.
        max_tweets (int): The maximum number of tweets to return. Values of zero or
            below yield an empty list.
    Returns:
        A list of extracted tweet information for the replies (at most max_tweets entries).
    """
    from itertools import islice

    print(
        f"Fetching replies for username {username} and conversation {conversation_id}"
    )
    query = f"to:{username} since_id:{conversation_id} filter:safe"
    scraped = sntwitter.TwitterSearchScraper(query).get_items()
    # islice stops after exactly max_tweets items, so no extra item is pulled from
    # the scraper and the result never exceeds the requested maximum.
    limited = islice(scraped, max(max_tweets, 0))
    return [extract_tweet_info(tweet) for tweet in tqdm(limited)]
def get_tweet_by_id_and_username(username: str, tweet_id: str):
    """
    Looks up a tweet by building its status URL and handing that URL to the snscrape search scraper.
    Args:
        username (str): The username of the Twitter user who posted the tweet.
        tweet_id (str): The ID of the tweet to look up.
    Returns:
        The item iterator returned by the search scraper's get_items() for the
        tweet URL (not a single tweet object).
    """
    status_url = "https://twitter.com/{}/status/{}".format(username, tweet_id)
    scraper = sntwitter.TwitterSearchScraper(status_url)
    return scraper.get_items()
def extract_tweet_info(tweet):
    """
    Flattens a scraped tweet object into a plain dictionary.
    Args:
        tweet: A tweet object exposing date, user.username, rawContent, retweetCount,
            id, likeCount, replyCount, inReplyToTweetId, conversationId and viewCount.
    Returns:
        A dictionary holding those values under snake_case keys.
    """
    info = dict(
        date=tweet.date,
        username=tweet.user.username,
        content=tweet.rawContent,
        retweet_count=tweet.retweetCount,
        tweet_id=tweet.id,
        like_count=tweet.likeCount,
        reply_count=tweet.replyCount,
        in_reply_to_tweet_id=tweet.inReplyToTweetId,
        conversation_id=tweet.conversationId,
        view_count=tweet.viewCount,
    )
    return info
def get_follower_ids(
    username: str,
    limit: int = 20,
    start_date: dt.date = dt.date(year=2023, month=3, day=10),
    end_date: dt.date = dt.date(year=2023, month=3, day=22),
):
    """
    Fetches one tweet posted by a user within a date window, together with replies addressed to that user.
    NOTE(review): despite the function name, no follower IDs are produced here;
    the replies look like a stand-in for the accounts interacting with the user - confirm intent.
    Args:
        username (str): The Twitter handle whose tweets and replies are fetched.
        limit (int): Currently unused; kept for backward compatibility.
        start_date (datetime.date): First day of the search window (``since:``).
        end_date (datetime.date): End of the search window (``until:``).
    Returns:
        A tuple ``(one_tweet, replies)``: the last tweet returned by the search (as an
        extracted-info dictionary) and the list of extracted replies (up to 1000).
    Raises:
        IndexError: If the search returns no tweets for the user in the window.
    """
    # Construct the search query using snscrape
    query = f"from:{username} since:{start_date} until:{end_date}"
    tweets = get_tweets(query=query)
    if not tweets:
        # Same exception type as the bare tweets[-1] lookup, with a usable message.
        raise IndexError(f"No tweets found for query: {query}")
    one_tweet = tweets[-1]
    replies = get_replies(
        username=username, conversation_id=one_tweet["tweet_id"], max_tweets=1000
    )
    return one_tweet, replies
def get_twitter_account_info(twitter_handle: str) -> dict:
    """
    Reads basic profile fields of a Twitter user via the snscrape user scraper.
    Args:
        twitter_handle (str): The Twitter username to retrieve information for.
    Returns:
        dict: The display name, username, user id, follower count, friends count and a
        "verified" flag encoded as the string "true" or "false".
    """
    # The scraper's `entity` attribute holds the profile object.
    profile = sntwitter.TwitterUserScraper(twitter_handle).entity
    account_info = {
        "name": profile.displayname,
        "username": profile.username,
        "user_id": profile.id,
        "follower_count": profile.followersCount,
        "friends_count": profile.friendsCount,
    }
    # Anything whose text form is not "false" (case-insensitive) maps to "true".
    is_false = str(profile.verified).lower() == "false"
    account_info["verified"] = "false" if is_false else "true"
    return account_info
if __name__ == "__main__":
    # --- Demo 1: extract the tweets an account posted in a date window ---
    # The dates bound when the account tweeted; replies are not taken into account.
    account = "taylorlorenz"
    start_date = dt.date(2023, 2, 1)
    end_date = dt.date(2023, 3, 11)
    # Build the search query string
    query = "from:{} since:{} until:{}".format(account, start_date, end_date)
    print(f"query: {query}")
    tweets = get_tweets(query=query)
    df_tweets = pd.DataFrame(data=tweets).sort_values("in_reply_to_tweet_id")
    # Save output (comment out to skip writing the CSV)
    df_tweets.to_csv("df_tweets.csv")
    print(df_tweets.head(2))
    print(df_tweets.tail(2))
    print(f"Total Tweets: {len(tweets)}")

    # --- Demo 2: extract a conversation thread from a conversation id ---
    # A tweet from elon musk about turbulent times
    conversation_id = 1620650202305798144
    max_tweets = 3000
    tweets = get_replies(
        username="elonmusk",
        conversation_id=conversation_id,
        max_tweets=max_tweets,
    )
    df_replies = pd.DataFrame(data=tweets)
    # Uncomment to save output
    # df_replies.to_csv("df_replies.csv")
    print(
        f"Number of extracted tweets from conversation_id: {conversation_id}, {len(tweets)}"
    )