holly123 committed on
Commit 2d38d2c
1 Parent(s): c5a7a0f

Update app.py

Files changed (1)
  1. app.py +115 -62
app.py CHANGED
@@ -1,63 +1,116 @@
- import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, pipeline
-
-
- title = "Code Generator"
- description = "This is a space to convert English text to Python code using [codeparrot-small-text-to-code](https://huggingface.co/codeparrot/codeparrot-small-text-to-code),\
- a code generation model for Python finetuned on [github-jupyter-text](https://huggingface.co/datasets/codeparrot/github-jupyter-text), a dataset of docstrings\
- and their Python code extracted from Jupyter notebooks."
- example = [
-     ["Utility function to compute the accuracy of predictions using metric from sklearn", 65, 0.6, 42],
-     ["Let's implement a function that computes the size of a file called filepath", 60, 0.6, 42],
-     ["Let's implement bubble sort in a helper function:", 87, 0.6, 42],
- ]
-
- # change model to the finetuned one
- tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot-small-text-to-code")
- model = AutoModelForCausalLM.from_pretrained("codeparrot/codeparrot-small-text-to-code")
-
- def make_docstring(gen_prompt):
-     return "\"\"\"\n" + gen_prompt + "\n\"\"\"\n\n"
-
- def code_generation(gen_prompt, max_tokens, temperature=0.6, seed=42):
-     set_seed(seed)
-     pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-     prompt = make_docstring(gen_prompt)
-     generated_text = pipe(prompt, do_sample=True, top_p=0.95, temperature=temperature, max_new_tokens=max_tokens)[0]['generated_text']
-     return generated_text
-
-
- iface = gr.Interface(
-     fn=code_generation,
-     inputs=[
-         gr.Code(lines=10, language="python", label="English instructions"),
-         gr.inputs.Slider(
-             minimum=8,
-             maximum=256,
-             step=1,
-             default=8,
-             label="Number of tokens to generate",
-         ),
-         gr.inputs.Slider(
-             minimum=0,
-             maximum=2.5,
-             step=0.1,
-             default=0.6,
-             label="Temperature",
-         ),
-         gr.inputs.Slider(
-             minimum=0,
-             maximum=1000,
-             step=1,
-             default=42,
-             label="Random seed to use for the generation"
-         )
-     ],
-     outputs=gr.Code(label="Predicted Python code", language="python", lines=10),
-     examples=example,
-     layout="horizontal",
-     theme="peach",
-     description=description,
-     title=title
  )
- iface.launch()
+ from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel, AutoConfig
+ from datasets import load_dataset, DatasetDict
+
+ # Load the datasets
+ ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
+ ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")
+
+ # Dataset dictionary
+ raw_datasets = DatasetDict(
+     {
+         # Training split
+         # "train": ds_train,  # .shuffle().select(range(50000)),
+         "train": ds_train.shuffle().select(range(10000)),
+         # Validation split
+         # "valid": ds_valid,  # .shuffle().select(range(500))
+         "valid": ds_valid.shuffle().select(range(500))
+     }
+ )
+
+ context_length = 128
+ tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
+
+ outputs = tokenizer(
+     # The "content" field of the first two training samples
+     raw_datasets["train"][:2]["content"],
+     # Truncate texts that exceed max_length to the specified maximum length
+     truncation=True,
+     # 128
+     max_length=context_length,
+     # Return the tokens that overflow past max_length as extra chunks
+     return_overflowing_tokens=True,
+     # Return the length of each resulting token sequence
+     return_length=True,
+ )
+
+ print(f"Input IDs length: {len(outputs['input_ids'])}")
+ print(f"Input chunk lengths: {(outputs['length'])}")
+ print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
+
+ def tokenize(element):
+     outputs = tokenizer(
+         element["content"],
+         truncation=True,
+         max_length=context_length,
+         return_overflowing_tokens=True,
+         return_length=True,
+     )
+     input_batch = []
+     for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
+         # Keep only chunks that fill the full context; drop shorter remainders
+         if length == context_length:
+             input_batch.append(input_ids)
+     return {"input_ids": input_batch}
+
+
+ tokenized_datasets = raw_datasets.map(
+     tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
+ )
+
+ print(tokenized_datasets)
+
+ # Create a configuration object for a GPT-2 language model
+ config = AutoConfig.from_pretrained(
+     "gpt2",
+     vocab_size=len(tokenizer),
+     n_ctx=context_length,
+     bos_token_id=tokenizer.bos_token_id,
+     eos_token_id=tokenizer.eos_token_id,
  )
+ # Initialize the model from the config (freshly initialized weights)
+ model = GPT2LMHeadModel(config)
+ # Number of parameters
+ model_size = sum(t.numel() for t in model.parameters())
+
+ print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")
+
+ # Set the tokenizer's padding token to its end-of-sequence token, so that during
+ # collation the EOS token is used as padding and variable-length sequences can be batched.
+ tokenizer.pad_token = tokenizer.eos_token
+ # Data collator for the (causal) language-modeling task
+ data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+
+ out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
+ for key in out:
+     print(f"{key} shape: {out[key].shape}")
+
+ from transformers import Trainer, TrainingArguments
+
+ args = TrainingArguments(
+     output_dir="codeparrot-ds",
+     per_device_train_batch_size=32,
+     per_device_eval_batch_size=32,
+     evaluation_strategy="steps",
+     eval_steps=5_000,
+     logging_steps=5_000,
+     gradient_accumulation_steps=8,
+     num_train_epochs=1,
+     weight_decay=0.1,
+     warmup_steps=1_000,
+     lr_scheduler_type="cosine",
+     learning_rate=5e-4,
+     save_steps=5_000,
+     fp16=False,
+     push_to_hub=False,
+ )
+
+ trainer = Trainer(
+     model=model,
+     tokenizer=tokenizer,
+     args=args,
+     data_collator=data_collator,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["valid"],
+ )
+
+ print(trainer)
+ trainer.train()