# WLS / app.py
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import torch
import streamlit as st
# Load the pre-trained tokenizer and model
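# Note: this checkpoint has roughly 20B parameters, so a full-precision load
# needs on the order of 80 GB of memory. In practice you would likely load it
# in half precision with automatic device placement (device_map requires the
# accelerate package), e.g.:
#   model = AutoModelForCausalLM.from_pretrained(
#       "togethercomputer/GPT-NeoXT-Chat-Base-20B",
#       torch_dtype=torch.float16, device_map="auto")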
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
dataa = "My name is youssef khemiri i am 21 years old and i am a data scientist"
# Prepare the dataset
train_dataset = TextDataset(
tokenizer=tokenizer,
file_path=dataa,
block_size=128,
)
# Prepare the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
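# With mlm=False the collator builds causal language-modeling batches: the
# labels are a copy of input_ids (pad positions, if any, masked to -100), so
# the model is trained to predict the next token of the biography text.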
# Configure training. Note that output_dir is a required argument of
# TrainingArguments, so it cannot be left commented out.
training_args = TrainingArguments(
    output_dir="./results",          # output directory (required)
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    save_steps=10_000,               # number of steps between checkpoint saves
    save_total_limit=2,              # keep at most this many checkpoints
    prediction_loss_only=True,
    learning_rate=5e-5,
)
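# With the tiny repeated dataset above (roughly a thousand tokens, i.e. only
# a handful of 128-token blocks), each epoch is a single optimizer step;
# raising num_train_epochs or repeating the text more is the usual way to
# get the model to absorb a new fact this small.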
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
# Fine-tune the model
trainer.train()
st.write("finished training")
# Inference: prompt the fine-tuned model in the chat format the base model
# was trained on (<human>: ... <bot>: ...)
inputs = tokenizer("<human>: Tell me about youssef khemiri\n<bot>:", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10, do_sample=True, temperature=0.8)
output_str = tokenizer.decode(outputs[0])
st.write(output_str)
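# The decoded string above still contains the prompt, since generate returns
# prompt + continuation for causal LMs. To display only the model's reply,
# decode just the newly generated tokens:
prompt_len = inputs["input_ids"].shape[1]
reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
st.write(reply)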