anton-l HF staff committed on
Commit
6203b88
•
1 Parent(s): f87f797
Files changed (2) hide show
  1. app.py +28 -25
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,30 +1,30 @@
1
  import gradio as gr
2
  from huggingface_hub import hf_hub_download
3
- import json
4
  import gzip
5
  import urllib
6
-
7
- usernames = {}
8
-
9
- filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset", revision="v2.0.1")
10
- with gzip.open(filepath, 'r') as f:
11
- usernames["v2.0.1"] = json.loads(f.read().decode('utf-8'))
12
-
13
- filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset", revision="v2.0")
14
- with gzip.open(filepath, 'r') as f:
15
- usernames["v2.0"] = json.loads(f.read().decode('utf-8'))
16
-
17
- filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset", revision="v1.2")
18
- with gzip.open(filepath, 'r') as f:
19
- usernames["v1.2"] = json.loads(f.read().decode('utf-8'))
20
-
21
- filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset", revision="v1.1")
22
- with gzip.open(filepath, 'r') as f:
23
- usernames["v1.1"] = json.loads(f.read().decode('utf-8'))
24
-
25
- filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset")
26
- with gzip.open(filepath, 'r') as f:
27
- usernames["v1.0"] = json.loads(f.read().decode('utf-8'))
28
 
29
  text = """\
30
  ![](https://huggingface.co/spaces/lvwerra/in-the-stack-gr/resolve/main/banner.png)
@@ -77,8 +77,11 @@ def issue_url(username, repos):
77
 
78
  def check_username(username, version):
79
  output_md = ""
80
- if username in usernames[version] and len(usernames[version][username])>0:
81
- repos = usernames[version][username]
 
 
 
82
  repo_word = "repository" if len(repos)==1 else "repositories"
83
  if version[:2] == "v2":
84
  output_md += f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack. Check the links to see when it was archived by Software Heritage:\n\n"
 
1
  import gradio as gr
2
  from huggingface_hub import hf_hub_download
3
+ import json_stream as json
4
  import gzip
5
  import urllib
6
+ from collections import defaultdict
7
+ import gc
8
+ import sys
9
+
10
+ usernames = defaultdict(dict)
11
+
12
+ versions = ["v1.0", "v1.1", "v1.2", "v2.0", "v2.0.1"]
13
+ versions = [sys.intern(version) for version in versions]
14
+
15
+ for version in versions:
16
+ print(f"Loading {version}")
17
+ branch = version if version != "v1.0" else "main"
18
+ filepath = hf_hub_download(repo_id="bigcode/the-stack-username-to-repo", filename="username_to_repo.json.gz", repo_type="dataset", revision=branch)
19
+ with gzip.open(filepath, 'r') as f:
20
+ data = json.load(f)
21
+ for username, repos in data.items():
22
+ for repo in repos:
23
+ if repo not in usernames[username]:
24
+ usernames[username][repo] = []
25
+ usernames[username][repo].append(version)
26
+ del data
27
+ gc.collect()
28
 
29
  text = """\
30
  ![](https://huggingface.co/spaces/lvwerra/in-the-stack-gr/resolve/main/banner.png)
 
77
 
78
  def check_username(username, version):
79
  output_md = ""
80
+ repos = []
81
+ if username in usernames:
82
+ repos = [repo for repo, versions in usernames[username].items() if version in versions]
83
+
84
+ if repos:
85
  repo_word = "repository" if len(repos)==1 else "repositories"
86
  if version[:2] == "v2":
87
  output_md += f"**Yes**, there is code from **{len(repos)} {repo_word}** in The Stack. Check the links to see when it was archived by Software Heritage:\n\n"
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ json-stream