File size: 4,695 Bytes
2171b06
 
 
 
72c2877
 
 
 
 
 
2171b06
 
 
 
 
 
 
 
 
 
72c2877
2171b06
 
 
 
 
 
 
72c2877
 
 
 
 
2171b06
 
 
 
 
 
 
 
 
 
 
72c2877
2171b06
72c2877
2171b06
 
 
 
 
72c2877
2171b06
72c2877
2171b06
 
 
 
 
 
72c2877
2171b06
 
 
 
 
72c2877
 
 
 
 
 
 
 
2171b06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72c2877
2171b06
72c2877
2171b06
 
 
 
 
 
 
 
 
 
 
 
72c2877
2171b06
 
72c2877
2171b06
 
 
 
 
 
72c2877
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import json
import requests
import streamlit as st

# Page-level configuration; Streamlit requires set_page_config to run before
# any other Streamlit command in the script.
st.set_page_config(layout="wide")

# Sidebar content is kept in a markdown file next to the app. Read it as UTF-8
# explicitly: the platform default encoding (e.g. cp1252 on Windows) can
# garble or reject non-ASCII characters such as emoji.
with open("utils/table_contents.md", "r", encoding="utf-8") as f:
    contents = f.read()

st.sidebar.markdown(contents)

st.title("The Stack Bot 🤖")

# Short introduction rendered right below the title.
intro = """
The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode), 
such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
"""
st.markdown(intro, unsafe_allow_html=True)

@st.cache()
def load_languages():
    """Load the per-language metadata from ``utils/languages.json``.

    The result goes through Streamlit's cache decorator, so the file is not
    re-read on every script rerun.

    Returns:
        The parsed JSON document.
    """
    # NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases in
    # favour of ``st.cache_data`` -- consider migrating once the pinned
    # Streamlit version is confirmed.
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open("utils/languages.json", "r", encoding="utf-8") as f:
        return json.load(f)

def how_to_load(language):
    """Render a code snippet showing how to load one language subset.

    Args:
        language: Name interpolated into the ``data/<language>`` directory of
            the ``bigcode/the-stack`` dataset in the displayed snippet.

    Returns:
        None. The snippet is written to the page with ``st.markdown``.
    """
    st.markdown(
        f"""
    ```python
    from datasets import load_dataset

    dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")

    # print first element
    print(dataset[0])
    ```
    """
    )

def load_model(values, language):
    """Show the model trained on a language subset, or invite contributions.

    Args:
        values: Metadata mapping of the language. ``values["model"]`` is
            always read; ``values["scores"]`` is read when a model is set.
        language: Language name; capitalized for display.

    Returns:
        None. Everything is written to the page through Streamlit calls.
    """
    model = values["model"]
    if not model:
        text = f"""No model available for {language.capitalize()}. If you trained a model on this language, let us know at contact@bigcode.com to feature your model!\n\
        You can also train your own model on The Stack using the instructions below 🚀"""
        st.write(text)
        if st.button("Fine-tune your own model", key=4):
            st.write("Code available at [GitHub link] + add preview")
    else:
        text = f"""{model} is a model that was trained on the {language.capitalize()} subset of The Stack. Here's how to use it:"""
        # The checkpoint name is wrapped in quotes so the snippet shown to the
        # user is valid Python (it is a string argument to from_pretrained).
        code = f"""
        ```python
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("{model}")
        model = AutoModelForCausalLM.from_pretrained("{model}", trust_remote_code=True)

        inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt")
        outputs = model.generate(inputs)
        print(tokenizer.decode(outputs[0]))
        ```
        """
        st.write(text)
        st.markdown(code)
        st.write(f"The scores of this model are the following: {values['scores']}")

def generate_code(
    demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0, timeout=60
):
    """Request a completion from a Gradio demo through its HTTP endpoint.

    Args:
        demo: Base URL of the Gradio demo; a trailing slash is tolerated.
        gen_prompt: Prompt text sent as the first element of the payload.
        max_new_tokens: Sent as the second element of the payload.
        temperature: Sent as the third element of the payload.
        seed: Sent as the fourth element of the payload.
        timeout: Seconds to wait for the demo before giving up.

    Returns:
        The first element of the ``data`` field of the JSON response, or an
        empty string when the request fails or the response does not have
        that shape.
    """
    # Strip a trailing slash so the endpoint never contains "//run/predict/".
    url = f"{demo.rstrip('/')}/run/predict/"
    payload = {"data": [gen_prompt, max_new_tokens, temperature, seed]}
    try:
        # Without a timeout an unresponsive demo would block the app forever.
        response = requests.post(url=url, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()["data"][0]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network/HTTP errors, non-JSON bodies and unexpected payload shapes
        # all mean "no generation"; the script checks for a falsy result and
        # shows an error hint instead of crashing.
        return ""

def init_nested_buttons():
    """Seed the toggle flags in session state and draw the models button.

    Each flag is set to ``False`` only when it is not yet present in
    ``st.session_state``. A click on the "Models trained on dataset" button
    flips the flag of the same name.
    """
    for flag in ("Models trained on dataset", "Generate code"):
        if flag not in st.session_state:
            st.session_state[flag] = False

    if st.button("Models trained on dataset"):
        currently_open = st.session_state["Models trained on dataset"]
        st.session_state["Models trained on dataset"] = not currently_open


languages = load_languages()

# Put the selector in the narrower left column so it does not span the page.
col1, col2 = st.columns([1, 1.5])
with col1:
    selected_language = st.selectbox("Select one of 358 languages in The Stack", list(languages), key=1)

# Metadata entry of the selected language, looked up once and reused below.
language_info = languages[selected_language]

st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
# how_to_load renders the snippet itself and returns nothing to keep.
how_to_load(selected_language)
if st.button("More info about the dataset", key=2):
    st.write(f"The dataset contains {language_info['num_examples']} examples.")
    # we can add some stats about files

init_nested_buttons()
# The flag (not the button's return value) drives this section, so it stays
# visible on reruns triggered by the widgets nested inside it.
if st.session_state["Models trained on dataset"]:
    load_model(language_info, selected_language)

    # .get(): an entry without a demo URL simply skips the demo section.
    demo_url = language_info.get("gradio_demo")
    if language_info.get("model") and demo_url:
        st.write(f"Here's a demo to try the model, for more flexibility you can use the [Gradio demo]({demo_url}).")
        gen_prompt = st.text_area(
            "Generate code with prompt:",
            value="# Implement a function to print hello world",
            height=100,
        ).strip()

        if st.button("Generate code"):
            st.session_state["Generate code"] = not st.session_state["Generate code"]
        if st.session_state["Generate code"]:
            with st.spinner("Generating code..."):
                generated_text = generate_code(
                    demo=demo_url,
                    gen_prompt=gen_prompt,
                )
                if not generated_text:
                    st.markdown(f"Error: could not generate code. Make sure the Gradio demo at [{demo_url}]({demo_url}) works.")
                else:
                    st.code(generated_text)