Youssefk committed on
Commit
82fe914
1 Parent(s): 3623a75
Files changed (1)
  1. app.py +51 -0
app.py ADDED
@@ -0,0 +1,51 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+ import torch
+ import streamlit as st
+
+ # Load the pre-trained tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
+ model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
+
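+ # Note: at 20B parameters this model needs roughly 80 GB of memory in fp32
+ # (about half that when loaded with torch_dtype=torch.float16), far more
+ # than most single machines provide.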
+
+ dataa = "My name is youssef khemiri i am 21 years old and i am a data scientist"
+
+ # TextDataset expects a path to a text file, so write the training string to disk first
+ with open("train.txt", "w") as f:
+     f.write(dataa)
+
+ # Prepare the dataset
+ train_dataset = TextDataset(
+     tokenizer=tokenizer,
+     file_path="train.txt",
+     block_size=128,
+ )
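+ # (TextDataset is deprecated in recent transformers releases in favor of
+ # the datasets library, but it still works for a quick experiment.)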
+
+ # Prepare the data collator (mlm=False selects the causal-LM objective)
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer, mlm=False,
+ )
+
+ # Initialize the trainer
+ training_args = TrainingArguments(
+     output_dir='./results',          # output directory (required by TrainingArguments)
+     num_train_epochs=3,              # total number of training epochs
+     per_device_train_batch_size=16,  # batch size per device during training
+     save_steps=10_000,               # number of steps between saving checkpoints
+     save_total_limit=2,              # keep at most this many checkpoints
+     prediction_loss_only=True,
+     learning_rate=5e-5,
+ )
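+ # A per-device batch size of 16 on a 20B-parameter model is unlikely to fit
+ # on a single GPU; a smaller batch with gradient_accumulation_steps is the
+ # usual workaround (the commit does not say what hardware it targets).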
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     data_collator=data_collator,
+ )
+
+ # Fine-tune the model
+ trainer.train()
+ st.write("finished training")
+
+ # Run inference with the <human>/<bot> prompt format the chat model expects
+ inputs = tokenizer("<human>: Tell me about youssef khemiri\n<bot>:", return_tensors='pt').to(model.device)
+ outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
+ output_str = tokenizer.decode(outputs[0])
+ st.write(output_str)
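+
+ # Streamlit re-runs this script from the top on every page load, so as
+ # written the fine-tuning job restarts on each visit; caching the trained
+ # model (e.g. with st.cache_resource) is the usual way to avoid that.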