Spaces:
Runtime error
Runtime error
Commit
•
75d9f7d
1
Parent(s):
6981528
Update API timeout and filter out invalid data
Browse files
app.py
CHANGED
@@ -26,31 +26,28 @@ assert user
|
|
26 |
|
27 |
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
|
28 |
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
|
29 |
-
client = Client(headers=headers, http2=True, limits=limits, timeout=
|
30 |
|
31 |
|
32 |
@lru_cache(maxsize=None)
|
33 |
-
def get_hub_community_activity(user: str
|
34 |
with tqdm() as pbar:
|
35 |
all_data = []
|
36 |
i = 1
|
37 |
-
while
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
except Exception as e:
|
52 |
-
print(e)
|
53 |
-
continue
|
54 |
|
55 |
return list(concat(all_data))
|
56 |
|
@@ -97,6 +94,7 @@ def update_data():
|
|
97 |
except FileNotFoundError:
|
98 |
previous_df = pl.DataFrame()
|
99 |
data = get_hub_community_activity(user)
|
|
|
100 |
data = [parse_pr_data(d) for d in data]
|
101 |
update_df = pl.DataFrame(data)
|
102 |
df = pl.concat([previous_df, update_df]).unique()
|
@@ -115,9 +113,21 @@ def update_data():
|
|
115 |
@lru_cache(maxsize=512)
|
116 |
def get_pr_status(user: str):
|
117 |
all_data = get_hub_community_activity(user)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
pr_data = (
|
119 |
-
x
|
|
|
|
|
120 |
)
|
|
|
121 |
return frequencies(x["status"] for x in pr_data)
|
122 |
|
123 |
|
@@ -129,6 +139,7 @@ def create_pie():
|
|
129 |
|
130 |
def group_status_by_pr_number():
|
131 |
all_data = get_hub_community_activity(user)
|
|
|
132 |
all_data = [parse_pr_data(d) for d in all_data]
|
133 |
return (
|
134 |
pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
|
@@ -137,6 +148,7 @@ def group_status_by_pr_number():
|
|
137 |
|
138 |
def plot_over_time():
|
139 |
all_data = get_hub_community_activity(user)
|
|
|
140 |
all_data = [parse_pr_data(d) for d in all_data]
|
141 |
df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
|
142 |
df = df.pivot(
|
@@ -146,7 +158,7 @@ def plot_over_time():
|
|
146 |
aggregate_function="count",
|
147 |
)
|
148 |
df = df.fill_null(0)
|
149 |
-
df = df.with_columns(pl.sum(["open", "merged"])).sort("createdAt")
|
150 |
df = df.to_pandas().set_index("createdAt").cumsum()
|
151 |
return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
|
152 |
|
|
|
26 |
|
27 |
headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
|
28 |
limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
|
29 |
+
client = Client(headers=headers, http2=True, limits=limits, timeout=120.0)
|
30 |
|
31 |
|
32 |
@lru_cache(maxsize=None)
def get_hub_community_activity(user: str) -> List[Any]:
    """Fetch every discussion activity entry for a Hub user.

    Pages through the ``recent-activity`` endpoint 100 entries at a time
    until an empty page comes back, then flattens the pages into one list.
    The result is memoised per ``user`` for the lifetime of the process.

    Args:
        user: Entity name interpolated into the ``entity`` query parameter.

    Returns:
        A flat list of the items found under ``recentActivity`` on each page.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    with tqdm() as pbar:
        all_data = []
        # NOTE(review): paging starts at skip=1 rather than 0 -- confirm
        # against the API that this does not drop the newest entry.
        i = 1
        while True:
            # Use the shared module-level client: it already carries the
            # auth headers, the connection limits and the 120 s timeout.
            # The top-level httpx.get() ignores all of that and falls back
            # to httpx's short default timeout.
            r = client.get(
                f"https://huggingface.co/api/recent-activity?limit=100&activityType=discussion&skip={i}&entity={user}&feedType=user",
            )
            # Fail with a clear HTTP error instead of a KeyError below.
            r.raise_for_status()
            activity = r.json()["recentActivity"]
            if not activity:
                break
            all_data.append(activity)
            # all_data holds one entry per page, so this fires every
            # 1000 pages, not every 1000 activity items.
            if len(all_data) % 1000 == 0:
                pbar.write(f"Length of all_data: {len(all_data)}")
            i += 100
            pbar.update(100)
    return list(concat(all_data))
|
53 |
|
|
|
94 |
except FileNotFoundError:
|
95 |
previous_df = pl.DataFrame()
|
96 |
data = get_hub_community_activity(user)
|
97 |
+
data = [d for d in data if d.get("discussionData", None) is not None]
|
98 |
data = [parse_pr_data(d) for d in data]
|
99 |
update_df = pl.DataFrame(data)
|
100 |
df = pl.concat([previous_df, update_df]).unique()
|
|
|
113 |
@lru_cache(maxsize=512)
def get_pr_status(user: str):
    """Count a user's pull requests by status.

    Args:
        user: Entity name passed through to ``get_hub_community_activity``.

    Returns:
        The result of ``frequencies`` over the ``"status"`` value of every
        activity entry whose ``discussionData`` is present and flagged
        ``isPullRequest``.
    """
    all_data = get_hub_community_activity(user)
    # Look up "discussionData" once per entry; entries without it (or with
    # an explicit None) are invalid and skipped.
    discussions = (entry.get("discussionData") for entry in all_data)
    pr_data = (
        discussion
        for discussion in discussions
        if discussion is not None and discussion.get("isPullRequest", False)
    )
    return frequencies(x["status"] for x in pr_data)
|
132 |
|
133 |
|
|
|
139 |
|
140 |
def group_status_by_pr_number():
|
141 |
all_data = get_hub_community_activity(user)
|
142 |
+
all_data = [d for d in all_data if d.get("discussionData", None) is not None]
|
143 |
all_data = [parse_pr_data(d) for d in all_data]
|
144 |
return (
|
145 |
pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
|
|
|
148 |
|
149 |
def plot_over_time():
|
150 |
all_data = get_hub_community_activity(user)
|
151 |
+
all_data = [d for d in all_data if d.get("discussionData", None) is not None]
|
152 |
all_data = [parse_pr_data(d) for d in all_data]
|
153 |
df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
|
154 |
df = df.pivot(
|
|
|
158 |
aggregate_function="count",
|
159 |
)
|
160 |
df = df.fill_null(0)
|
161 |
+
df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
|
162 |
df = df.to_pandas().set_index("createdAt").cumsum()
|
163 |
return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
|
164 |
|