Commit 72c2877 (parent: 2171b06), committed by loubnabnl (HF staff)

update code

app.py CHANGED
@@ -2,19 +2,23 @@ import json
 import requests
 import streamlit as st
 
+st.set_page_config(layout="wide")
+with open("utils/table_contents.md", "r") as f:
+    contents = f.read()
+
+st.sidebar.markdown(contents)
+
 st.title("The Stack Bot 🤖")
 
 intro = """
 The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode),
 such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
-
-We show information about existing programming languages and models trained on them. If you trained a model on The Stack, let us know so we feature your model! 🚀
 """
 st.markdown(intro, unsafe_allow_html=True)
 
 @st.cache()
 def load_languages():
-    with open("languages.json", "r") as f:
+    with open("utils/languages.json", "r") as f:
         languages = json.load(f)
     return languages
 
@@ -22,7 +26,11 @@ def how_to_load(language):
     text = f"""
     ```python
     from datasets import load_dataset
-    dataset = load_dataset("bigcode/the-stack", data_dir=f"data/{language}, split="train")
+
+    dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")
+
+    # print first element
+    print(dataset[0])
     ```
     """
     st.markdown(text)
@@ -34,43 +42,37 @@ def load_model(values, language):
     You can also train your own model on The Stack using the instructions below 🚀"""
     st.write(text)
     if st.button("Fine-tune your own model", key=4):
-        st.write("Code available at [GitHub link] + add preview + example of time & required hardware estimation")
+        st.write("Code available at [GitHub link] + add preview")
     else:
-        text = f"""{model} is a model that was trained on the {language} from The Stack. Here's how to use it:"""
+        text = f"""{model} is a model that was trained on the {language.capitalize()} subset of The Stack. Here's how to use it:"""
         code = f"""
         ```python
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
-        device = "cuda" # for GPU usage or "cpu" for CPU usage
-
         tokenizer = AutoTokenizer.from_pretrained({model})
-        model = AutoModelForCausalLM.from_pretrained({model}, trust_remote_code=True).to(device)
+        model = AutoModelForCausalLM.from_pretrained({model}, trust_remote_code=True)
 
-        inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
+        inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt")
         outputs = model.generate(inputs)
         print(tokenizer.decode(outputs[0]))
         ```
         """
         st.write(text)
         st.markdown(code)
-        st.write("The scores of this model are the following:")
-        for key, value in values["scores"].items():
-            st.write(f"{key}: {value}")
+        st.write(f"The scores of this model are the following: {values['scores']}")
 
 def generate_code(
     demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0
 ):
     # call space using its API endpoint
-    try:
-        url = (
-            f"https://hf.space/embed/{demo.lower()}/+/api/predict/"
-        )
-        r = requests.post(
-            url=url, json={"data": [gen_prompt, max_new_tokens, temperature, seed]}
-        )
-        generated_text = r.json()["data"][0]
-    except:
-        generated_text = ""
+    #try:
+    url = (
+        f"{demo}/run/predict/"
+    )
+    r = requests.post(
+        url=url, json={"data": [gen_prompt, max_new_tokens, temperature, seed]}
+    )
+    generated_text = r.json()["data"][0]
    return generated_text
 
 def init_nested_buttons():
@@ -86,9 +88,9 @@ def init_nested_buttons():
 
 languages = load_languages()
 
-col1, col2 = st.columns([1, 2])
+col1, col2 = st.columns([1, 1.5])
 with col1:
-    selected_language = st.selectbox("Languages of The Stack", list(languages.keys()), key=1)
+    selected_language = st.selectbox("Select one of 358 languages in The Stack", list(languages.keys()), key=1)
 
 st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
 code = how_to_load(selected_language)
@@ -101,21 +103,22 @@ if st.session_state["Models trained on dataset"]:
     load_model(languages[selected_language], selected_language)
 
     if languages[selected_language]["model"] and languages[selected_language]["gradio_demo"]:
-        st.write(f"Here's a demo to try the model, for more flxibilty you can use the original at [Gradio demo](hf.co/{languages[selected_language]['gradio_demo']})")
+        st.write(f"Here's a demo to try the model, for more flexibility you can use the [Gradio demo]({languages[selected_language]['gradio_demo']}).")
         gen_prompt = st.text_area(
             "Generate code with prompt:",
-            value="# print hello world",
+            value="# Implement a function to print hello world",
             height=100,
         ).strip()
 
         if st.button("Generate code"):
             st.session_state["Generate code"] = not st.session_state["Generate code"]
         if st.session_state["Generate code"]:
-            generated_text = generate_code(
-                demo=languages[selected_language]["gradio_demo"],
-                gen_prompt=gen_prompt,
-            )
-            if not generated_text:
-                st.write(f"Error: could not generate code. Make sure the Gradio demo at hf.co/{languages[selected_language]['gradio_demo']} works.")
-            else:
-                st.code(generated_text)
+            with st.spinner("Generating code..."):
+                generated_text = generate_code(
+                    demo=languages[selected_language]["gradio_demo"],
+                    gen_prompt=gen_prompt,
+                )
+            if not generated_text:
+                st.markdown(f"Error: could not generate code. Make sure the Gradio demo at [{languages[selected_language]['gradio_demo']}]({languages[selected_language]['gradio_demo']}) works.")
+            else:
+                st.code(generated_text)
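Note that the refactored `generate_code` comments out its `try:` and drops the `except` fallback, so a failed request now raises inside the Streamlit app instead of returning `""` (which the `if not generated_text:` branch expects). A minimal sketch of the same call with the guard restored, assuming the Space exposes Gradio's `/run/predict/` route with the payload shape used above; the `call_space` name and the `timeout` value are illustrative, not part of the commit:

```python
import requests

def call_space(demo_url, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0):
    # POST to the Space's REST route, e.g. https://loubnabnl-santa-demo.hf.space/run/predict/
    try:
        r = requests.post(
            url=f"{demo_url}/run/predict/",
            json={"data": [gen_prompt, max_new_tokens, temperature, seed]},
            timeout=30,  # illustrative; the committed code sets no timeout
        )
        r.raise_for_status()
        return r.json()["data"][0]
    except (requests.RequestException, KeyError, ValueError):
        # mirror the old behaviour: an empty string tells the caller generation failed
        return ""
```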
languages.json → utils/languages.json RENAMED
@@ -1,6 +1,6 @@
-{"python": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
-"java": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
-"javascript": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
+{"python": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "https://loubnabnl-santa-demo.hf.space"},
+"java": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "https://loubnabnl-santa-demo.hf.space"},
+"javascript": {"num_examples": 10, "model": "bigcode/santacoder", "scores": { "HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "https://loubnabnl-santa-demo.hf.space"},
 "typescript": {"num_examples": 10, "model": ""},
 "go": {"num_examples": 10, "model": ""},
 "php": {"num_examples": 10, "model": ""},
utils/table_contents.md ADDED
@@ -0,0 +1,9 @@
+### 📖 Table of contents 📖
+
+1 - [The Stack](https://huggingface.co/bigcode/the-stack) exploration
+
+2 - Models trained on The Stack (e.g. [SantaCoder](https://huggingface.co/bigcode/santacoder))
+
+3 - Demos for code generation
+
+If you trained a model on The Stack, let us know so we can feature it! 🚀