Loubna ben allal committed on
Commit c9e8e4a
1 Parent(s): d490108
app.py ADDED
@@ -0,0 +1,82 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, set_seed
+ import torch
+ import json
+
+
+ # cache the heavy model/tokenizer loads so Streamlit reruns reuse them
+ @st.cache(allow_output_mutation=True)
+ def load_tokenizer(model_ckpt):
+     return AutoTokenizer.from_pretrained(model_ckpt)
+
+ @st.cache(allow_output_mutation=True)
+ def load_model(model_ckpt):
+     model = AutoModelForCausalLM.from_pretrained(model_ckpt, low_cpu_mem_usage=True)
+     return model
+
+ @st.cache()
+ def load_examples():
+     with open("examples.json", "r") as f:
+         examples = json.load(f)
+     return examples
+
+ st.set_page_config(page_icon=':parrot:', layout="wide")
+
+ # warm the cache for all three checkpoints up front
+ tokenizer1 = load_tokenizer("lvwerra/codeparrot")
+ model1 = load_model("lvwerra/codeparrot")
+
+ tokenizer2 = load_tokenizer("facebook/opt-1.3b")
+ model2 = load_model("facebook/opt-1.3b")
+
+ tokenizer3 = load_tokenizer("facebook/incoder-1B")
+ model3 = load_model("facebook/incoder-1B")
+
+ st.sidebar.header("Models:")
+ models = ["CodeParrot", "OPT", "InCoder"]
+ selected_models = st.sidebar.multiselect("Select code generation models to compare",
+                                          models,
+                                          default=["CodeParrot"])
+ st.sidebar.header("Tasks:")
+ tasks = ["Model architecture", "Model evaluation", "Pretraining dataset", "Prompting"]
+ selected_task = st.sidebar.selectbox("Select a task:", tasks, index=0)
+
+ st.title("Code Generation Models👩‍💻")
+
+ architectures = {}
+ datasets = {}
+ pipelines = {}
+ if selected_task == "Model architecture":
+     st.markdown("## Model architectures")
+     for model in selected_models:
+         with open(f"datasets/{model.lower()}.txt", "r") as f:
+             text = f.read()
+         # architectures[model] = text
+         st.markdown(f"### {model}:")
+         st.markdown(text)
+
+ elif selected_task == "Pretraining dataset":
+     st.markdown("## Pretraining Datasets")
+     for model in selected_models:
+         with open(f"datasets/{model.lower()}.txt", "r") as f:
+             text = f.read()
+         # datasets[model] = text
+         st.markdown(f"### {model}:")
+         st.markdown(text)
+
+ elif selected_task == "Prompting":
+     for model_name in selected_models:
+         if model_name == "CodeParrot":
+             tokenizer = load_tokenizer("lvwerra/codeparrot")
+             model = load_model("lvwerra/codeparrot")
+         elif model_name == "InCoder":
+             tokenizer = load_tokenizer("facebook/incoder-1B")
+             model = load_model("facebook/incoder-1B")
+         else:
+             tokenizer = load_tokenizer("facebook/opt-1.3b")
+             model = load_model("facebook/opt-1.3b")
+         # key the pipeline by the display name, not the (reassigned) model object
+         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+         pipelines[model_name] = pipe
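The Prompting branch builds a `text-generation` pipeline per selected model but, as of this commit, never calls them. A minimal sketch of how they could be wired to a prompt box follows; the text area, button, and sampling settings are illustrative assumptions, not part of the commit:

```python
# Hypothetical continuation of app.py: generate from each selected pipeline.
prompt = st.text_area("Enter a prompt:", value="def print_hello_world():")
if st.button("Generate code"):
    set_seed(42)  # set_seed is already imported in app.py
    for model_name, pipe in pipelines.items():
        st.markdown(f"### {model_name}:")
        output = pipe(prompt, max_new_tokens=64, do_sample=True, temperature=0.2)
        st.code(output[0]["generated_text"], language="python")
```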
datasets/codeparrot.txt ADDED
@@ -0,0 +1,9 @@
+ [CodeParrot](https://huggingface.co/lvwerra/codeparrot) was trained on **50GB** of Python data from GitHub repositories: [CodeParrot dataset](https://huggingface.co/datasets/lvwerra/codeparrot-clean). The original dataset contains a lot of duplicated and noisy data. Therefore, the dataset was cleaned with the following steps:
+ - Exact match deduplication
+ - Filtering:
+   - Average line length < 100
+   - Maximum line length < 1000
+   - Alphanumeric character fraction > 0.25
+   - Removal of auto-generated files (keyword search)
+
+ For more details see the preprocessing script in the transformers repository [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/codeparrot).
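A minimal sketch of how these line-level filtering heuristics could be implemented; the function name, the keyword list, and the handling of empty files are assumptions made for illustration, not the actual preprocessing script:

```python
# Illustrative filter following the thresholds listed above (assumptions noted inline).
def passes_filters(code: str) -> bool:
    lines = code.splitlines()
    if not code or not lines:                     # assumption: drop empty files
        return False
    lengths = [len(line) for line in lines]
    if sum(lengths) / len(lengths) >= 100:        # average line length < 100
        return False
    if max(lengths) >= 1000:                      # maximum line length < 1000
        return False
    if sum(ch.isalnum() for ch in code) / len(code) <= 0.25:  # alphanumeric fraction > 0.25
        return False
    # keyword search for auto-generated files (keyword list is an assumption)
    if "auto-generated" in code.lower() or "autogenerated" in code.lower():
        return False
    return True
```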
datasets/incoder.txt ADDED
@@ -0,0 +1,16 @@
+ [InCoder](https://huggingface.co/facebook/incoder-6B) was trained on 216 GB of data from GitHub and StackOverflow covering 28 programming languages: 52 GB are Python, 107 GB are other programming languages, and 57 GB are StackOverflow content that is not code.
+
+ The GitHub data was filtered as follows:
+ - Average line length < 100
+ - Maximum line length < 3000
+ - Alphanumeric character fraction > 0.4
+ - Removal of auto-generated files (keyword search)
+
+ The second component of the data consists of questions, answers, and comments from StackOverflow. It includes:
+ - all questions that have at least one answer
+ - up to ten answers with a non-negative score (sorted by score) per question
+ - up to five comments per question/answer
+
+ Exact match deduplication was performed on the code files.
+
+ For more details please refer to this [paper](https://arxiv.org/pdf/2204.05999.pdf).
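A minimal sketch of the StackOverflow selection rules above; the record layout (dicts with `body`, `score`, `comments`, and `answers` fields) is an assumption made for illustration, not the actual pipeline:

```python
# Illustrative selection of StackOverflow content for one question (data layout assumed).
def select_posts(question: dict):
    if not question["answers"]:                       # keep only questions with >= 1 answer
        return None
    answers = sorted(
        (a for a in question["answers"] if a["score"] >= 0),  # non-negative score only
        key=lambda a: a["score"],
        reverse=True,
    )[:10]                                            # at most ten answers, sorted by score
    return {
        "question": question["body"],
        "question_comments": question["comments"][:5],          # up to five comments
        "answers": [
            {"body": a["body"], "comments": a["comments"][:5]}  # up to five comments each
            for a in answers
        ],
    }
```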
datasets/opt.txt ADDED
@@ -0,0 +1,2 @@
+ [OPT](https://huggingface.co/facebook/opt-30b) was trained on 5 filtered datasets of textual documents. One of them, [The Pile](https://arxiv.org/pdf/2101.00027v1.pdf), includes code; from it OPT used *Pile-CC, OpenWebText2, USPTO, Project Gutenberg, OpenSubtitles, Wikipedia, DM Mathematics and HackerNews*.
+ The final training data contains 180B tokens, corresponding to 800GB of data. For more details please refer to this [paper](https://arxiv.org/abs/2205.01068).