# app.py — uploaded by Vera-ZWY; commit 8b96174 ("Update app.py", verified).
import gradio as gr
from datasets import load_dataset
import pandas as pd
import sys
import subprocess
from datetime import datetime
from huggingface_hub import HfApi
def get_newest_file(repo_id, prefix):
    """Return the most recently dated file under ``prefix`` in a HuggingFace dataset repo.

    The date is read from the file name: the text after the last underscore,
    up to the first dot, parsed as ``YYYYMMDD``. Files whose name does not
    yield a valid date are ignored.

    Args:
        repo_id: Identifier of the dataset repository to list.
        prefix: Only files whose path starts with this string are considered.

    Returns:
        The path of the file with the latest date, or ``None`` when no file
        matches the prefix or none of the matches carries a parseable date.
    """
    repo_files = HfApi().list_repo_files(repo_id, repo_type="dataset")

    dated_paths = []
    for path in repo_files:
        if not path.startswith(prefix):
            continue
        try:
            stamp = path.split('_')[-1].split('.')[0]
            when = datetime.strptime(stamp, '%Y%m%d')
        except (IndexError, ValueError):
            # Name does not end in a YYYYMMDD stamp; skip it.
            continue
        dated_paths.append((when, path))

    if not dated_paths:
        return None

    # Compare on the date only, so ties resolve to the first file listed.
    _, newest_path = max(dated_paths, key=lambda pair: pair[0])
    return newest_path
def load_data(repo_id, file_path, n_rows=3):
    """Load the first rows of one data file from a HuggingFace dataset repo.

    Args:
        repo_id: Identifier of the dataset repository.
        file_path: Path of the data file inside the repository; it is loaded
            as the ``train`` split.
        n_rows: Number of leading rows to return (default 3).

    Returns:
        A DataFrame holding the first ``n_rows`` rows. If anything goes wrong
        while loading, a one-row DataFrame with a single ``Error`` column
        containing the exception message is returned instead, so the caller
        can display the failure rather than crash.
    """
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        # Slice the dataset before converting: only the preview rows are
        # materialised, instead of turning every row into a DataFrame and
        # then discarding all but the head.
        return pd.DataFrame(dataset[:n_rows])
    except Exception as e:
        # UI boundary: surface the failure as table content.
        return pd.DataFrame({'Error': [str(e)]})
def praw_new_data():
    """Run ``praw.py`` to crawl new data, then reload both preview tables.

    The script is executed with the current Python interpreter. A failure to
    run it is reported in the status message rather than raised, and the
    tables are reloaded either way.

    Returns:
        A 3-tuple ``(status_message, crawled_df, merged_df)``: the outcome of
        the crawl, a preview of the newest crawled file (an empty DataFrame
        when no dated file is found), and a preview of the merged data.
    """
    try:
        # Execute praw.py
        subprocess.run([sys.executable, "praw.py"], check=True)
        status_message = "✅ Successfully crawled new data!"
    except Exception as e:
        status_message = f"❌ Error executing praw.py: {e}"

    # Load and return latest data
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    # NOTE(review): this prefix is "submissions/df_" while load_crawled_data
    # uses "submissions/df_24" — confirm which one is intended.
    newest_file = get_newest_file(repo_id, "submissions/df_")
    if newest_file:
        crawled_df = load_data(repo_id, newest_file)
    else:
        # Keep the crawl outcome visible instead of replacing it, so a failed
        # crawl is not hidden behind the "no files" notice.
        status_message = f"{status_message} No crawled data files found"
        crawled_df = pd.DataFrame()

    # Return current merged data state alongside the crawl result.
    return status_message, crawled_df, load_merged_data()[1]
def merge_data():
    """Run ``merge.py`` to merge the datasets, then reload both preview tables.

    The script is executed with the current Python interpreter. A failure to
    run it is reported in the status message rather than raised, and the
    tables are reloaded either way.

    Returns:
        A 3-tuple ``(status_message, crawled_df, merged_df)``: the outcome of
        the merge, a preview of the latest crawled data, and a preview of the
        merged data.
    """
    try:
        # Execute merge.py
        subprocess.run([sys.executable, "merge.py"], check=True)
        status_message = "✅ Successfully merged data!"
    except Exception as e:
        status_message = f"❌ Error executing merge.py: {e}"

    # Load and return latest merged data
    merged_df = load_merged_data()[1]
    crawled_df = load_crawled_data()[1]
    return status_message, crawled_df, merged_df
def load_crawled_data():
    """Fetch a preview of the newest crawled submissions file.

    Returns:
        A 2-tuple ``(caption, df)``. When a dated file with the
        ``submissions/df_24`` prefix exists, the caption names that file and
        ``df`` is whatever ``load_data`` returns for it; otherwise the caption
        says no data is available and ``df`` is an empty DataFrame.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    latest = get_newest_file(repo_id, "submissions/df_24")

    if not latest:
        return "No crawled data available", pd.DataFrame()

    caption = f"Latest crawled data ({latest}):"
    return caption, load_data(repo_id, latest)
def load_merged_data():
    """Fetch a preview of the merged Reddit data file.

    The merged file lives at a fixed path in the dataset repository, so there
    is no "not found" case to decide here; any loading failure is reported
    inside the DataFrame returned by ``load_data``.

    Returns:
        A 2-tuple ``(caption, df)`` where the caption names the merged file
        and ``df`` is whatever ``load_data`` returns for it.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    # NOTE(review): this directory is "submission/" (singular) while the
    # crawled files are looked up under "submissions/" — confirm the path.
    merged_path = "submission/merged_reddit_data.csv"
    return f"Latest merged data ({merged_path}):", load_data(repo_id, merged_path)
# Create Gradio interface
# Header lists passed to the two preview tables.
_CRAWLED_HEADERS = ["title", "score", "id", "url", "comms_num", "created", "body", "subreddit"]
_MERGED_HEADERS = ["title", "score", "id", "url", "num_comments", "created", "body", "content", "subreddit"]

# Footer text shown below the tables (dataset link plus usage instructions).
_FOOTER_MARKDOWN = (
    "\n"
    "## The full dataset storage at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/\n"
    "### Instructions:\n"
    "1. Click 'Crawl New Data' to fetch new Reddit data\n"
    "2. Click 'Merge Data' to merge the latest datasets\n"
    "3. Tables will automatically update to show the latest data\n"
)

with gr.Blocks(title="Reddit Data Processing") as iface:
    gr.Markdown("# Reddit Data Processing Interface")

    # Status message for operations; both buttons write their outcome here.
    status_text = gr.Textbox(label="Status", interactive=False)

    # One row holding the two action buttons, each in its own column.
    with gr.Row():
        with gr.Column():
            praw_button = gr.Button("Crawl New Data", variant="primary")
        with gr.Column():
            merge_button = gr.Button("Merge Data", variant="primary")

    # Preview of the newest crawled file, loaded once while the UI is built.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Crawled Data (Top 3 Rows)")
            crawled_table = gr.Dataframe(
                headers=_CRAWLED_HEADERS,
                value=load_crawled_data()[1],
                wrap=True,
            )

    # Preview of the merged file, loaded once while the UI is built.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Latest Merged Data (Top 3 Rows)")
            merged_table = gr.Dataframe(
                headers=_MERGED_HEADERS,
                value=load_merged_data()[1],
                wrap=True,
            )

    # Button click handlers: each callback returns (status, crawled, merged),
    # matching the three output components in order.
    praw_button.click(
        fn=praw_new_data,
        outputs=[status_text, crawled_table, merged_table],
    )
    merge_button.click(
        fn=merge_data,
        outputs=[status_text, crawled_table, merged_table],
    )

    gr.Markdown(_FOOTER_MARKDOWN)


# Launch the interface
if __name__ == "__main__":
    iface.launch()