# modelo_ner/train_model.py
import subprocess
import os
import pandas as pd
from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
# Install the required dependencies; check=True makes the script fail fast if pip fails
subprocess.run(["pip", "install", "-r", "requirements.txt"], check=True)
# Read the Hugging Face token from the environment variables
hf_token = os.getenv('HUGGINGFACE_TOKEN')
if not hf_token:
    raise ValueError("Hugging Face token not found. Add it as an environment variable in Hugging Face Spaces.")
# Log in to Hugging Face
subprocess.run(["huggingface-cli", "login", "--token", hf_token], check=True)
# Load the dataset (expected columns: 'Sentence #', 'Word', 'Tag');
# forward-fill propagates each sentence id down to its tokens
data = pd.read_csv('NER/ner_dataset.csv', encoding='latin1').ffill()
# Prepare the data: map each tag to an integer id
unique_labels = data['Tag'].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
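# The inverse mapping is not in the original script; it is added here so the model
# config can store human-readable tag names, and it is reused by the optional
# metrics sketch further below.
id2label = {i: label for label, i in label2id.items()}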
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['words'], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens ([CLS], [SEP]) get -100 so the loss ignores them
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Only the first sub-token of each word keeps the word's label
                label_ids.append(label2id[label[word_idx]])
            else:
                # Remaining sub-tokens of the same word are also masked out
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs
# Group the flat token table into one row per sentence, each holding its (word, tag) pairs
grouped = data.groupby('Sentence #').apply(
    lambda s: list(zip(s['Word'].tolist(), s['Tag'].tolist()))
)
grouped = grouped.reset_index(name='words_and_tags')
grouped['words'] = grouped['words_and_tags'].apply(lambda x: [w for w, t in x])
grouped['tags'] = grouped['words_and_tags'].apply(lambda x: [t for w, t in x])
dataset = Dataset.from_pandas(grouped[['words', 'tags']])
# The tokenizer must exist before .map() runs, since tokenize_and_align_labels uses it
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
dataset = dataset.map(tokenize_and_align_labels, batched=True)
# Split the dataset into train and test sets
dataset = dataset.train_test_split(test_size=0.1)
# Load the pre-trained model from Hugging Face; passing the label mappings
# stores the tag names in the model config
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
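# Optional (not in the original script): entity-level evaluation with seqeval.
# A minimal sketch, assuming the `evaluate` and `seqeval` packages are installed
# (e.g. listed in requirements.txt). Pass compute_metrics=compute_metrics to the
# Trainer below to report precision/recall/F1 each epoch instead of loss only.
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    logits, gold = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Drop the -100 positions (special tokens and non-first sub-tokens)
    true_labels = [[id2label[l] for l in row if l != -100] for row in gold]
    true_preds = [
        [id2label[p] for p, l in zip(pred_row, gold_row) if l != -100]
        for pred_row, gold_row in zip(predictions, gold)
    ]
    results = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }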
# Initialize the Trainer; DataCollatorForTokenClassification pads both inputs
# and labels so variable-length sentences can be batched together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=DataCollatorForTokenClassification(tokenizer),
)
# Train the model
trainer.train()
print("Model training complete.")
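# Optional follow-up (not in the original script): persist the fine-tuned model.
# A minimal sketch; './modelo_ner_finetuned' is a hypothetical output directory.
trainer.save_model('./modelo_ner_finetuned')        # saves weights and config
tokenizer.save_pretrained('./modelo_ner_finetuned')
# Since the script already logged in with HUGGINGFACE_TOKEN, the model could also
# be uploaded to the Hub; the repo id below is a placeholder to adjust:
# model.push_to_hub('your-username/modelo_ner')
# tokenizer.push_to_hub('your-username/modelo_ner')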