Spaces:
Running
Running
from transformers import AutoTokenizer, MT5ForConditionalGeneration | |
from transformers import T5Tokenizer | |
import streamlit as st | |
import pandas as pd | |
from datasets import Dataset | |
import torch | |
from datasets import Dataset, DatasetDict | |
from transformers import Trainer, TrainingArguments | |
# Load the pretrained mT5 checkpoint and the matching tokenizer.
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
# st.write(model)  # disabled debug output of the model object

# Read the CSV whose 'Proverb' and 'Meaning' columns feed the preprocessing
# step further down.
df = pd.read_csv("proverbs.csv")
# Bare expression kept on purpose: presumably relies on Streamlit "magic" to
# render the frame when launched via `streamlit run` -- TODO confirm.
df
# Wrap the frame as a `datasets.Dataset` so it can be mapped and split.
dataset = Dataset.from_pandas(df)
def preprocess_function(examples):
    """Tokenize a batch of proverbs and their meanings for seq2seq training.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(batched=True)``;
            the ``'Proverb'`` column is used as model input and the
            ``'Meaning'`` column as target text.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"``
        entry that holds the target token ids. Padding positions in the
        labels are set to -100 so the loss ignores them.
    """
    inputs = examples['Proverb']
    targets = examples['Meaning']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")
    # Targets are padded to a fixed length; without masking, every pad token
    # would be scored by the cross-entropy loss and dominate short targets.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs
# Tokenize all rows in batches, then hold out 20% of them as a test split.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset, test_dataset = dataset_split["train"], dataset_split["test"]

# Report how many examples ended up in each split.
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")
# Fine-tuning hyperparameters; evaluation is requested once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the pinned transformers version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
)

# Initialize Trainer. Train on the train split and evaluate on the held-out
# test split, so the per-epoch metrics are not measured on data the model
# has already been trained on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned weights and the tokenizer side by side so the directory
# can be loaded again with `from_pretrained`.
model.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
tokenizer.save_pretrained("./fine-tuned-mt5-marathi-proverbs")
# Hub repository id; only referenced by the disabled upload code below.
repo_id = "grpathak22/mt5-proverbs"

# Disabled upload flow, kept for reference:
# # Log in and create the repo
# api = HfApi()
# api.login(token=hf_token)
# api.create_repo(repo_id, exist_ok=True)
# # Initialize the Repository object
# repo = Repository(local_dir="./fine-tuned-mt5-marathi-proverbs", clone_from=repo_id)
# # Push the model and tokenizer to the Hugging Face Hub
# repo.push_to_hub(commit_message="Add fine-tuned MT5 model for Marathi proverbs")
# Sample proverb used to try out the fine-tuned model.
prompt = "अति शहाणा त्याचा बैल रिकामा"

# Switch off dropout for inference; training may have left the model in
# train mode.
model.eval()

# Tokenize the input prompt and move it to the model's device: the Trainer
# may have placed the model on an accelerator, and `generate` fails when the
# inputs live on a different device than the weights.
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

# Generate the output
output_ids = model.generate(input_ids, max_length=256)

# Decode the output to text
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)