davanstrien (HF staff) committed on
Commit 75d9f7d
1 Parent(s): 6981528

Update API timeout and filter out invalid data

Files changed (1): app.py +33 -21
app.py CHANGED
@@ -26,31 +26,28 @@ assert user
 
 headers = {"user-agent": user_agent, "authorization": f"Bearer {token}"}
 limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)
-client = Client(headers=headers, http2=True, limits=limits, timeout=60.0)
+client = Client(headers=headers, http2=True, limits=limits, timeout=120.0)
 
 
 @lru_cache(maxsize=None)
-def get_hub_community_activity(user: str, max: int = 200_000) -> List[Any]:
+def get_hub_community_activity(user: str) -> List[Any]:
     with tqdm() as pbar:
         all_data = []
         i = 1
-        while i <= max:
-            try:
-                r = client.get(
-                    f"https://huggingface.co/api/recent-activity?limit=100&activityType=discussion&skip={i}1&entity={user}&feedType=user"
-                )
-                activity = r.json()["recentActivity"]
-                if not activity:
-                    break
-                all_data.append(activity)
-                if len(all_data) % 1000 == 0:
-                    # print(f"Length of all_data: {len(all_data)}")
-                    pbar.write(f"Length of all_data: {len(all_data)}")
-                i += 100
-                pbar.update(100)
-            except Exception as e:
-                print(e)
-                continue
+        while True:
+            r = httpx.get(
+                f"https://huggingface.co/api/recent-activity?limit=100&activityType=discussion&skip={i}&entity={user}&feedType=user",
+                headers=headers,
+            )
+            activity = r.json()["recentActivity"]
+            if not activity:
+                break
+            all_data.append(activity)
+            if len(all_data) % 1000 == 0:
+                # print(f"Length of all_data: {len(all_data)}")
+                pbar.write(f"Length of all_data: {len(all_data)}")
+            i += 100
+            pbar.update(100)
 
     return list(concat(all_data))
@@ -97,6 +94,7 @@ def update_data():
     except FileNotFoundError:
         previous_df = pl.DataFrame()
     data = get_hub_community_activity(user)
+    data = [d for d in data if d.get("discussionData", None) is not None]
     data = [parse_pr_data(d) for d in data]
     update_df = pl.DataFrame(data)
     df = pl.concat([previous_df, update_df]).unique()
@@ -115,9 +113,21 @@ def update_data():
 @lru_cache(maxsize=512)
 def get_pr_status(user: str):
     all_data = get_hub_community_activity(user)
+    print(all_data)
+    # pr_data = (
+    #     x["discussionData"] for x in all_data if x["discussionData"]["isPullRequest"]
+    # )
+    all_data = [
+        pr_data
+        for pr_data in all_data
+        if pr_data.get("discussionData", None) is not None
+    ]
     pr_data = (
-        x["discussionData"] for x in all_data if x["discussionData"]["isPullRequest"]
+        x.get("discussionData", {})
+        for x in all_data
+        if x.get("discussionData", {}).get("isPullRequest", False)
     )
+
     return frequencies(x["status"] for x in pr_data)
@@ -129,6 +139,7 @@ def create_pie():
 
 def group_status_by_pr_number():
     all_data = get_hub_community_activity(user)
+    all_data = [d for d in all_data if d.get("discussionData", None) is not None]
    all_data = [parse_pr_data(d) for d in all_data]
     return (
         pl.DataFrame(all_data).groupby("status").agg(pl.mean("pr_number")).to_pandas()
@@ -137,6 +148,7 @@ def group_status_by_pr_number():
 
 def plot_over_time():
     all_data = get_hub_community_activity(user)
+    all_data = [d for d in all_data if d.get("discussionData", None) is not None]
     all_data = [parse_pr_data(d) for d in all_data]
     df = pl.DataFrame(all_data).with_columns(pl.col("createdAt").cast(pl.Date))
     df = df.pivot(
@@ -146,7 +158,7 @@ def plot_over_time():
         aggregate_function="count",
     )
     df = df.fill_null(0)
-    df = df.with_columns(pl.sum(["open", "merged"])).sort("createdAt")
+    df = df.with_columns(pl.sum(["open", "closed", "merged"])).sort("createdAt")
     df = df.to_pandas().set_index("createdAt").cumsum()
     return px.line(df, x=df.index, y=[c for c in df.columns if c != "sum"])
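
The core of the "filter out invalid data" change is defensive access to discussionData, since some activity records come back without it. A minimal standalone sketch of that pattern, with hypothetical sample records (not real API output):

# Hypothetical activity records illustrating the three shapes the new code handles.
records = [
    {"discussionData": {"isPullRequest": True, "status": "open"}},
    {"discussionData": None},  # present but null -> filtered out
    {},                        # key missing entirely -> filtered out
]

# Drop records without usable discussionData, as update_data() now does.
valid = [d for d in records if d.get("discussionData", None) is not None]

# Keep only pull requests, mirroring the rewritten get_pr_status() generator.
pr_data = (
    x.get("discussionData", {})
    for x in valid
    if x.get("discussionData", {}).get("isPullRequest", False)
)
assert [pr["status"] for pr in pr_data] == ["open"]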