Spaces:

arpy8
/

Omdena-IREX-Sentiment-Analysis

Build error

App Files Files Community

Omdena-IREX-Sentiment-Analysis / utils /scraper.py

arpy8

ruff format

29137b2 over 1 year ago

raw

history blame contribute delete

3.74 kB

	import os
	import re

	import pandas as pd
	from apify_client import ApifyClient

	# Constants

	# Column layout of the flattened tweet frame: tweet-level fields first, then
	# the fields hoisted from the nested author record, then the tweet body.
	TWEETS_COLUMNS_LIST = (
	    [
	        "url",
	        "createdAt",
	        "id",
	        "isReply",
	        "inReplyToId",
	        "isRetweet",
	        "isQuote",
	        "viewCount",
	        "retweetCount",
	        "likeCount",
	        "replyCount",
	        "lang",
	    ]
	    + [
	        "author__createdAt",
	        "author__location",
	        "author__name",
	        "author__id",
	        "author__description",
	        "author__followers",
	        "author__verified",
	    ]
	    + ["text"]
	)

	# Author columns that are left out of the comments frame.
	REMOVE_COLUMNS_COMMENTS = ["author__name", "author__id", "author__description"]

	# Count-valued columns.
	# NOTE(review): not referenced in this module's visible code; presumably
	# used by callers for integer casting - confirm.
	INT_COLUMNS = [
	    "viewCount", "retweetCount", "likeCount", "replyCount",
	    "author__followers",
	]

	# Actor id and API token are read from the environment once, at import time;
	# each is None when its variable is unset.
	APIFY_ACTOR_ID = os.environ.get("APIFY_ACTOR_ID")
	APIFY_TOKEN = os.environ.get("APIFY_TOKEN")

	# Module-wide Apify client used by the fetch_* functions in this file.
	client = ApifyClient(token=APIFY_TOKEN)


	def flatten_response(response):
	    """Return a flat dictionary built from one raw tweet item.

	    Top-level tweet fields are copied as-is, the nested ``author`` mapping is
	    hoisted into ``author__*`` keys, and both ``createdAt`` values are passed
	    through ``pd.to_datetime``.

	    Args:
	        response: Mapping describing a single tweet. It may carry a nested
	            ``author`` mapping; a missing or null ``author`` is tolerated.

	    Returns:
	        dict: One entry per flattened field. Plain fields that are absent from
	        ``response`` (or from its ``author``) are ``None``; absent timestamps
	        are whatever ``pd.to_datetime(None)`` yields.
	    """
	    # Indexing response["author"] directly raised KeyError for items without
	    # an author (and AttributeError for a null one); fall back to an empty
	    # mapping so every author__* field simply comes back empty instead.
	    author = response.get("author") or {}

	    return {
	        "url": response.get("url"),
	        "createdAt": pd.to_datetime(response.get("createdAt")),
	        "id": response.get("id"),
	        "isReply": response.get("isReply"),
	        # None when the tweet is not a reply
	        "inReplyToId": response.get("inReplyToId"),
	        "isRetweet": response.get("isRetweet"),
	        "isQuote": response.get("isQuote"),
	        "viewCount": response.get("viewCount"),
	        "retweetCount": response.get("retweetCount"),
	        "likeCount": response.get("likeCount"),
	        "replyCount": response.get("replyCount"),
	        "lang": response.get("lang"),
	        "author__createdAt": pd.to_datetime(author.get("createdAt")),
	        "author__location": author.get("location"),
	        "author__name": author.get("name"),
	        "author__id": author.get("id"),
	        "author__description": author.get("description"),
	        "author__followers": author.get("followers"),
	        # The source field is "isVerified"; the flat key drops the "is".
	        "author__verified": author.get("isVerified"),
	        "text": response.get("text"),
	    }


	def fetch_main_tweet_dataframe(url):
	    """Fetch one tweet through the Apify actor and return it as a DataFrame.

	    Args:
	        url: URL of the tweet. It must be a string containing "x.com" or
	            "twitter.com".

	    Returns:
	        A one-row ``pd.DataFrame`` with the ``TWEETS_COLUMNS_LIST`` columns, or
	        a ``{"error": ...}`` dict when the input is rejected or the actor run
	        produced no items.
	    """
	    # Input validation. A non-string (e.g. None) is rejected the same way as a
	    # foreign URL instead of raising TypeError on the ``in`` test.
	    if not isinstance(url, str) or (
	        "x.com" not in url and "twitter.com" not in url
	    ):
	        return {"error": "Input is not a tweet URL"}

	    run = client.actor(APIFY_ACTOR_ID).call(run_input={"startUrls": [url]})

	    # Only the first dataset item is used, so pull just that one; next() with a
	    # default avoids the IndexError an empty dataset used to trigger.
	    items = client.dataset(run["defaultDatasetId"]).iterate_items()
	    first_item = next(iter(items), None)
	    if first_item is None:
	        return {"error": "No data returned for this tweet URL"}

	    # Convert the flattened dictionary to a DataFrame and return
	    return pd.DataFrame([flatten_response(first_item)], columns=TWEETS_COLUMNS_LIST)


	# Matches the numeric id that follows "/status/" (or "/statuses/") in a tweet URL.
	_STATUS_ID_RE = re.compile(r"/status(?:es)?/(\d+)")


	def _extract_tweet_id(url):
	    """Return the tweet id embedded in ``url`` as a string."""
	    match = _STATUS_ID_RE.search(url)
	    if match:
	        return match.group(1)
	    # No "/status/<digits>" part: fall back to the last path segment, ignoring
	    # any query string, fragment or trailing slash.
	    path = url.split("?", 1)[0].split("#", 1)[0]
	    return path.rstrip("/").rsplit("/", 1)[-1]


	def fetch_comments_dataframe(url, *, tweet_language="es", max_items=50):
	    """Fetch the comments of a tweet through the Apify actor as a DataFrame.

	    Args:
	        url: URL of the tweet whose conversation is requested. It must be a
	            string containing "x.com" or "twitter.com".
	        tweet_language: Value forwarded as the actor's ``tweetLanguage`` input.
	        max_items: Value forwarded as the actor's ``maxItems`` input.

	    Returns:
	        A ``pd.DataFrame`` with one row per returned item and the
	        ``TWEETS_COLUMNS_LIST`` columns minus ``REMOVE_COLUMNS_COMMENTS``, or a
	        ``{"error": ...}`` dict when the input is rejected.
	    """
	    # Input validation. A non-string (e.g. None) is rejected the same way as a
	    # foreign URL instead of raising TypeError on the ``in`` test.
	    if not isinstance(url, str) or (
	        "x.com" not in url and "twitter.com" not in url
	    ):
	        return {"error": "Input is not a tweet URL"}

	    # Taking the text after the last "/" broke on share links such as
	    # ".../status/123?s=20", ".../status/123/" or ".../status/123/photo/1".
	    conversation_id = _extract_tweet_id(url)

	    run_input_comment = {
	        "conversationIds": [conversation_id],
	        "tweetLanguage": tweet_language,
	        "maxItems": max_items,
	    }
	    run_comment = client.actor(APIFY_ACTOR_ID).call(run_input=run_input_comment)

	    comment_items = client.dataset(run_comment["defaultDatasetId"]).iterate_items()
	    flattened_responses = [flatten_response(item) for item in comment_items]

	    include_columns = [
	        column
	        for column in TWEETS_COLUMNS_LIST
	        if column not in REMOVE_COLUMNS_COMMENTS
	    ]

	    # Convert the flattened dictionaries to a DataFrame and return
	    return pd.DataFrame(flattened_responses, columns=include_columns)