Spaces:
Runtime error
Runtime error
import snscrape.modules.twitter as sntwitter | |
import pandas as pd | |
import datetime as dt | |
from tqdm import tqdm | |
import requests | |
from scripts import sentiment | |
import tweepy | |
import configparser | |
import os | |
import pandas as pd | |
from datetime import datetime, date, timedelta | |
def get_latest_account_tweets(handle):
    """
    Fetches the timeline tweets of an account through the Twitter API (tweepy).

    Credentials are read from the ``AUTHENTICATION`` section of ``tweepy_auth.ini``
    when that file exists in the working directory, otherwise from environment
    variables carrying the same key names.

    Args:
        handle (str): Screen name of the account whose timeline is requested.
    Returns:
        A DataFrame built from the raw tweet JSON, sorted by ``created_at``
        (ascending), with two extra columns: ``clean_text`` (the tweet's
        ``full_text`` passed through ``sentiment.tweet_cleaner``) and ``handle``.
        When any step fails, the error message is returned as a string instead.
    """
    try:
        if os.path.exists("tweepy_auth.ini"):
            config = configparser.ConfigParser()
            config.read("tweepy_auth.ini")
            # Get the authentication details
            authentication_section = config["AUTHENTICATION"]
            consumer_key = authentication_section["twitter_consumer_key"]
            consumer_secret = authentication_section["twitter_consumer_secret"]
            access_token = authentication_section["twitter_access_token"]
            access_token_secret = authentication_section["twitter_access_token_secret"]
        else:
            consumer_key = os.environ["twitter_consumer_key"]
            consumer_secret = os.environ["twitter_consumer_secret"]
            access_token = os.environ["twitter_access_token"]
            access_token_secret = os.environ["twitter_access_token_secret"]
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        # create the API object
        api = tweepy.API(auth)
        # load the tweets from a specific user
        # NOTE(review): the requested count is far above what a single timeline
        # call is likely to return - confirm against the API limits.
        tweets = api.user_timeline(
            screen_name=handle, count=10000000, tweet_mode="extended"
        )
        df_tweets = pd.DataFrame(data=[t._json for t in tweets])
        # Attach the cleaned text while the rows are still in the same order as
        # `tweets`: assigning a plain list is positional, so doing this after
        # the sort below would pair each row with another tweet's text.
        df_tweets["clean_text"] = [
            sentiment.tweet_cleaner(tweet.full_text) for tweet in tweets
        ]
        df_tweets["created_at"] = pd.to_datetime(df_tweets["created_at"])
        df_tweets = df_tweets.sort_values("created_at")
        df_tweets["handle"] = df_tweets.user.iloc[0]["screen_name"]
        return df_tweets
    # NOTE(review): `tweepy.TweepError` / `api_code` look like the tweepy 3.x
    # error API - confirm the installed tweepy version still provides them.
    except tweepy.TweepError as e:
        # Handle specific error conditions
        if e.api_code == 63:
            print("User has been suspended.")
        elif e.api_code == 88:
            print("Rate limit exceeded. Please try again later.")
        else:
            print("Error occurred during API call:", str(e))
        return str(e)
    except Exception as e:
        # Best-effort boundary: callers get the message instead of a traceback.
        print("An error occurred:", str(e))
        return str(e)
def get_tweets(
    handle: str,
):
    """
    Scrapes the tweets an account posted during the last 60 days, excluding replies and retweets.

    Args:
        handle (str): The Twitter handle whose tweets are searched.
    Returns:
        A DataFrame with one row per tweet: the fields produced by
        ``extract_tweet_info`` plus the alias columns ``full_text``,
        ``clean_text``, ``handle`` and ``created_at``. When the search yields
        nothing, an empty DataFrame holding the ``date``/``username``/``content``
        columns and the alias columns is returned.
    """
    # Search window: from 60 days ago up to today's date.
    today = datetime.today()
    two_months_ago = today - timedelta(days=2 * 30)
    start_date = two_months_ago.strftime("%Y-%m-%d")
    end_date = today.strftime("%Y-%m-%d")
    query = f"from:{handle} since:{start_date} until:{end_date} -filter:replies -filter:retweets"
    fetched_tweets = sntwitter.TwitterSearchScraper(query).get_items()
    tweets = [extract_tweet_info(tweet) for tweet in tqdm(fetched_tweets)]
    df_tweets = pd.DataFrame(tweets)
    if not tweets:
        # A frame built from an empty list has no columns, so the lookups
        # below would raise KeyError; give it the columns they rely on.
        df_tweets = df_tweets.reindex(columns=["date", "username", "content"])
    df_tweets["full_text"] = df_tweets["content"]
    df_tweets["clean_text"] = df_tweets["full_text"].apply(sentiment.tweet_cleaner)
    df_tweets["handle"] = df_tweets["username"]
    df_tweets["created_at"] = df_tweets["date"]
    return df_tweets
def get_replies(username: str, conversation_id: str, max_tweets: int) -> list:
    """
    Fetches tweets addressed to a Twitter user that were posted after a given tweet ID.

    Args:
        username (str): The username of the Twitter user whose replies are to be fetched.
        conversation_id (str): The tweet ID used as the ``since_id`` lower bound of the search.
        max_tweets (int): The maximum number of tweets to collect.
    Returns:
        A list with at most ``max_tweets`` dictionaries, one per tweet, as
        produced by ``extract_tweet_info``.
    """
    print(
        f"Fetching replies for username {username} and conversation {conversation_id}"
    )
    # NOTE(review): the query filters on `to:` and `since_id:` only; presumably
    # it is not restricted to the conversation thread itself - confirm intent.
    query = f"to:{username} since_id:{conversation_id} filter:safe"
    tweets_list = []
    for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper(query).get_items())):
        # `i` is zero-based, so stop once `max_tweets` items have been stored
        # (a strict `>` would let one extra tweet through).
        if i >= max_tweets:
            break
        tweets_list.append(extract_tweet_info(tweet))
    return tweets_list
def get_tweet_by_id_and_username(username: str, tweet_id: str):
    """
    Builds the status URL of a tweet and runs it through snscrape's search scraper.

    Args:
        username (str): The username of the Twitter user who posted the tweet.
        tweet_id (str): The ID of the tweet to look up.
    Returns:
        The object returned by ``TwitterSearchScraper(...).get_items()`` for
        that URL (presumably an iterator of results rather than a single
        tweet - verify against the caller).
    """
    status_scraper = sntwitter.TwitterSearchScraper(
        f"https://twitter.com/{username}/status/{tweet_id}"
    )
    return status_scraper.get_items()
def extract_tweet_info(tweet):
    """
    Flattens a tweet object into a plain dictionary.

    Args:
        tweet: An object exposing the attributes read below (``date``,
            ``user.username``, ``rawContent``, ``retweetCount``, ``id``,
            ``likeCount``, ``replyCount``, ``inReplyToTweetId``,
            ``conversationId``, ``viewCount``).
    Returns:
        A dictionary with the keys ``date``, ``username``, ``content``,
        ``retweet_count``, ``tweet_id``, ``like_count``, ``reply_count``,
        ``in_reply_to_tweet_id``, ``conversation_id`` and ``view_count``,
        in that order.
    """
    # The author's handle is the only value that lives on a nested object.
    info = {"date": tweet.date, "username": tweet.user.username}
    # Remaining values are direct attributes: (output key, attribute name).
    direct_fields = (
        ("content", "rawContent"),
        ("retweet_count", "retweetCount"),
        ("tweet_id", "id"),
        ("like_count", "likeCount"),
        ("reply_count", "replyCount"),
        ("in_reply_to_tweet_id", "inReplyToTweetId"),
        ("conversation_id", "conversationId"),
        ("view_count", "viewCount"),
    )
    for key, attribute in direct_fields:
        info[key] = getattr(tweet, attribute)
    return info
def get_follower_ids(username: str, limit: int = 20):
    """
    Picks one tweet the account posted between 2023-03-10 and 2023-03-22 and collects the tweets addressed to the account since it.

    Despite its name, this function does not return follower IDs.

    Args:
        username (str): The Twitter handle to search tweets and replies for.
        limit (int): Currently unused; kept so existing callers keep working.
    Returns:
        A tuple ``(one_tweet, replies)``. ``one_tweet`` is the
        ``extract_tweet_info`` dictionary of the last tweet yielded by the
        search and ``replies`` is the list returned by ``get_replies`` for that
        tweet's ID (up to 1000 entries requested). When the search yields no
        tweet, ``(None, [])`` is returned.
    """
    start_date = dt.date(year=2023, month=3, day=10)
    end_date = dt.date(year=2023, month=3, day=22)
    query = f"from:{username} since:{start_date} until:{end_date}"
    # get_tweets() takes only a handle and returns a DataFrame, so the dated
    # query is run directly here to keep working with a list of dictionaries.
    tweets = [
        extract_tweet_info(tweet)
        for tweet in sntwitter.TwitterSearchScraper(query).get_items()
    ]
    if not tweets:
        return None, []
    one_tweet = tweets[-1]
    one_tweet_id = one_tweet["tweet_id"]
    replies = get_replies(
        username=username, conversation_id=one_tweet_id, max_tweets=1000
    )
    return one_tweet, replies
def get_twitter_account_info(twitter_handle: str) -> dict:
    """
    Looks up a Twitter account with snscrape and returns a summary of its profile.

    Args:
        twitter_handle (str): The Twitter username to retrieve information for.
    Returns:
        dict: The keys ``name``, ``username``, ``user_id``, ``follower_count``,
        ``friends_count`` and ``verified``. ``verified`` is the string
        ``"false"`` only when the profile value stringifies to "false"
        (case-insensitive); any other value yields ``"true"``.
    """
    # The scraper's `entity` attribute carries the profile fields read below.
    profile = sntwitter.TwitterUserScraper(twitter_handle).entity
    return {
        "name": profile.displayname,
        "username": profile.username,
        "user_id": profile.id,
        "follower_count": profile.followersCount,
        "friends_count": profile.friendsCount,
        "verified": "false" if str(profile.verified).lower() == "false" else "true",
    }
if __name__ == "__main__":
    ## Testing extracting tweets from an account
    # get_tweets() takes only the handle and builds its own date window and
    # search query, so no query string is assembled here.
    account = "taylorlorenz"
    print(f"account: {account}")
    tweets = get_tweets(handle=account)
    df_tweets = pd.DataFrame(data=tweets)
    df_tweets = df_tweets.sort_values("in_reply_to_tweet_id")
    # Save output (comment out to skip writing the file)
    df_tweets.to_csv("df_tweets.csv")
    print(df_tweets.head(2))
    print(df_tweets.tail(2))
    print(f"Total Tweets: {len(tweets)}")
    ## Testing extracting conversation threads from a conversation ID
    conversation_id = (
        1620650202305798144  # A tweet from elon musk about turbulent times
    )
    max_tweets = 3000
    tweets = get_replies(
        username="elonmusk", conversation_id=conversation_id, max_tweets=max_tweets
    )
    df_replies = pd.DataFrame(data=tweets)
    # Uncomment to save output
    # df_replies.to_csv("df_replies.csv")
    print(
        f"Number of extracted tweets from conversation_id: {conversation_id}, {len(tweets)}"
    )