# Hugging Face Space: dashboard tracking PR/issue activity stats for a configured Hub user.
# --- Imports, configuration and shared HTTP client ---------------------------
# Module-level setup: loads credentials from the environment and builds the
# shared httpx client used for all Hub API calls.
import os
from datetime import datetime, timedelta
from functools import lru_cache
from typing import Any, List

import gradio as gr
import httpx
import pandas as pd
import plotly.express as px
import polars as pl
from cachetools import TTLCache, cached
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from httpx import Client
from toolz import concat, frequencies
from tqdm.auto import tqdm

load_dotenv()

# Required settings; a KeyError here means the variable is missing entirely.
token = os.environ["HUGGINGFACE_TOKEN"]
user_agent = os.environ["USER_AGENT"]
user = os.environ["USER_TO_TRACK"]

# Opt in to the faster hf_transfer download backend.
# NOTE(review): set after `datasets` is imported — confirm the library reads
# this lazily; otherwise it must be set before the imports above.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Fail fast on empty values (a bare `assert` would be stripped under `python -O`).
for _name, _value in [
    ("HUGGINGFACE_TOKEN", token),
    ("USER_AGENT", user_agent),
    ("USER_TO_TRACK", user),
]:
    if not _value:
        raise RuntimeError(f"Environment variable {_name} must not be empty")

headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
client = Client(headers=headers, limits=limits, timeout=120.0)
def get_hub_community_activity(user: str) -> List[Any]:
    """Return all discussion/PR activity entries for *user* from the Hub API.

    Pages through the ``recent-activity`` endpoint 100 entries at a time
    until the API returns an empty page.

    Args:
        user: Hub user (or org) whose activity feed is fetched.

    Returns:
        Flat list of raw activity dicts as returned by the API.

    Raises:
        httpx.HTTPStatusError: if the API responds with an error status.
    """
    pages: List[Any] = []
    total = 0
    skip = 0  # was 1, which silently skipped the most recent activity entry
    with tqdm() as pbar:
        while True:
            # Use the shared module client so headers, connection limits and
            # the 120s timeout apply (the original raw httpx.get had no timeout
            # configured beyond the library default).
            r = client.get(
                "https://huggingface.co/api/recent-activity"
                f"?limit=100&activityType=discussion&skip={skip}&entity={user}&feedType=user"
            )
            r.raise_for_status()  # surface HTTP errors instead of failing in .json()
            activity = r.json()["recentActivity"]
            if not activity:
                break
            pages.append(activity)
            total += len(activity)
            # Progress note roughly every 1000 entries (the original tested
            # len(pages) % 1000, i.e. every 1000 *pages*, which never fired).
            if total % 1000 == 0:
                pbar.write(f"Fetched {total} activity entries")
            skip += 100
            pbar.update(100)
    return list(concat(pages))
# def get_hub_community_activity(user: str) -> List[Any]: | |
# all_data = [] | |
# for i in range(1, 2000, 100): | |
# r = httpx.get( | |
# f"https://huggingface.co/api/recent-activity?limit=100&type=discussion&skip={i}&user={user}" | |
# ) | |
# activity = r.json()["recentActivity"] | |
# all_data.append(activity) | |
# return list(concat(all_data)) | |
def parse_date_time(date_time: str) -> datetime:
    """Parse a Hub API timestamp such as ``2023-01-02T03:04:05.123Z``."""
    hub_timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    return datetime.strptime(date_time, hub_timestamp_format)
def parse_pr_data(data):
    """Flatten one raw activity entry into a row dict for the stats table.

    Expects *data* to carry a ``discussionData`` payload (callers filter out
    entries without one).
    """
    discussion = data["discussionData"]
    return {
        "createdAt": parse_date_time(discussion["createdAt"]),
        "pr_number": discussion["num"],
        "status": discussion["status"],
        "repo_id": discussion["repo"]["name"],
        "type": discussion["repo"]["type"],
        "isPullRequest": discussion["isPullRequest"],
    }
def update_data():
    """Refresh the persisted stats dataset with the latest Hub activity.

    Loads the previously pushed dataset (if any), appends freshly fetched
    activity rows, de-duplicates, and pushes back to the Hub when the row
    count changed.

    Returns:
        polars.DataFrame with one row per discussion/PR.
    """
    try:
        previous_df = pl.DataFrame(
            load_dataset(f"librarian-bot/{user}-stats", split="train").data.table
        )
    except FileNotFoundError:
        # First run: no dataset exists yet.
        # NOTE(review): `load_dataset` may raise a different error for a
        # missing Hub repo (e.g. DatasetNotFoundError) — confirm this is hit.
        previous_df = pl.DataFrame()
    data = get_hub_community_activity(user)
    rows = [parse_pr_data(d) for d in data if d.get("discussionData") is not None]
    update_df = pl.DataFrame(rows)
    # Concatenating with an empty, schema-less frame fails in polars, so only
    # concat when there actually is prior data.
    if len(previous_df):
        df = pl.concat([previous_df, update_df]).unique()
    else:
        df = update_df.unique()
    if len(df) != len(previous_df):
        # NOTE(review): loads from "librarian-bot/{user}-stats" but pushes to
        # "{user}-stats" (token owner's namespace) — confirm these resolve to
        # the same repo.
        Dataset(df.to_arrow()).push_to_hub(f"{user}-stats", token=token)
    return df
# def get_pr_status(): | |
# df = update_data() | |
# df = df.filter(pl.col("isPullRequest") is True) | |
# return df.select(pl.col("status").value_counts()) | |
# # return frequencies(x["status"] for x in pr_data) | |
def get_pr_status(user: str):
    """Count pull requests by status for *user*.

    Args:
        user: Hub user whose activity is analysed.

    Returns:
        Mapping of status string (e.g. "open", "closed", "merged") to count.
    """
    all_data = get_hub_community_activity(user)
    # Single pass: keep only entries that have a discussion payload AND are
    # pull requests (the original filtered twice and also left a debug
    # print(all_data) that dumped the whole feed to stdout).
    pr_data = (
        x["discussionData"]
        for x in all_data
        if x.get("discussionData") is not None
        and x["discussionData"].get("isPullRequest", False)
    )
    return frequencies(x["status"] for x in pr_data)
def create_pie():
    """Build a pie chart of PR status counts for the tracked user."""
    # Renamed local: the original `frequencies` shadowed the toolz.frequencies
    # import used elsewhere in this module.
    status_counts = get_pr_status(user)
    df = pd.DataFrame(
        {"status": status_counts.keys(), "number": status_counts.values()}
    )
    return px.pie(df, values="number", names="status", template="seaborn")
def group_status_by_pr_number():
    """Return the mean PR number per status as a pandas DataFrame."""
    activity = get_hub_community_activity(user)
    rows = [
        parse_pr_data(entry)
        for entry in activity
        if entry.get("discussionData", None) is not None
    ]
    grouped = pl.DataFrame(rows).groupby("status").agg(pl.mean("pr_number"))
    return grouped.to_pandas()
def plot_over_time():
    """Plot cumulative PR/issue counts per status over time as a line chart."""
    all_data = get_hub_community_activity(user)
    # Drop entries without a discussion payload, then flatten to row dicts.
    all_data = [d for d in all_data if d.get("discussionData", None) is not None]
    all_data = [parse_pr_data(d) for d in all_data]
    # Truncate timestamps to calendar dates so rows group by day.
    df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
    # Pivot to one column per status, counting entries per day.
    df = df.pivot(
        values=["status"],
        index=["createdAt"],
        columns=["status"],
        aggregate_function="count",
    )
    df = df.fill_null(0)
    # NOTE(review): assumes every one of "open"/"closed"/"merged" occurred at
    # least once — pl.sum raises if a named column is missing; confirm.
    df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
    # Running totals per status; the synthetic "sum" column is excluded below.
    df = df.to_pandas().set_index("createdAt").cumsum()
    return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
# --- Gradio UI ---------------------------------------------------------------
# (A stray bare `create_pie()` call used to run here; its result was discarded
# while still costing a full API crawl, so it has been removed.)
with gr.Blocks() as demo:
    gr.Markdown(f"# {user} PR Stats")
    gr.Markdown(f"Total prs and issues opened by {user}: {len(update_data()):,}")
    with gr.Column():
        gr.Markdown("## Pull requests status")
        gr.Markdown(
            "The below pie chart shows the percentage of pull requests made by"
            " librarian bot that are open, closed or merged"
        )
        gr.Plot(create_pie())
    with gr.Column():
        gr.Markdown("Pull requests opened, closed and merged over time (cumulative)")
        gr.Plot(plot_over_time())
    with gr.Column():
        gr.Markdown("## Pull requests status by PR number")
        gr.DataFrame(group_status_by_pr_number())
demo.launch(debug=True)