# NOTE(review): the first three lines were Hugging Face Spaces build-log
# residue ("Spaces:" / "Runtime error") pasted into the source by mistake;
# replaced with this comment so the file parses.
import subprocess
import os

import pandas as pd
from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

# Install the project's dependencies at startup (common Hugging Face Spaces
# pattern).  check=True makes a failed install abort instead of continuing
# with missing packages.
subprocess.run(["pip", "install", "-r", "requirements.txt"], check=True)

# Read the Hugging Face access token from the environment; fail fast with a
# clear message if it was not configured as a Space secret/variable.
hf_token = os.getenv('HUGGINGFACE_TOKEN')
if not hf_token:
    raise ValueError("Token do Hugging Face não encontrado. Adicione o token como uma variável de ambiente no Hugging Face Spaces.")

# Authenticate the CLI with the token; check=True surfaces login failures.
subprocess.run(["huggingface-cli", "login", "--token", hf_token], check=True)

# Load the NER corpus (one token per row).  The 'Sentence #' column is only
# filled on the first row of each sentence, so forward-fill propagates it
# down to the sentence's remaining rows.  .ffill() replaces the deprecated
# fillna(method='ffill') with identical behavior.
data = pd.read_csv('NER/ner_dataset.csv', encoding='latin1').ffill()

# Map each distinct NER tag to an integer class id for the classifier head.
unique_labels = data['Tag'].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to subwords.

    Uses the module-level ``tokenizer`` and ``label2id``.  Each word may be
    split into several subword tokens; only the first subword receives the
    word's label id, while special tokens and continuation subwords get
    ``-100`` so the loss function ignores them.

    Args:
        examples: batch dict with ``'words'`` (list of word lists) and
            ``'tags'`` (parallel list of tag lists).

    Returns:
        The tokenizer's encoding dict with an added ``'labels'`` entry.
    """
    tokenized_inputs = tokenizer(examples['words'], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['tags']):
        # word_ids maps each subword position back to its source word index
        # (None for special tokens like [CLS]/[SEP]).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: excluded from the loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First subword of a word carries the word's tag id.
                label_ids.append(label2id[label[word_idx]])
            else:
                # Continuation subword: masked out of the loss.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs
# Rebuild sentences: one row per sentence holding its (word, tag) pairs in a
# SINGLE column.  The previous .apply(pd.Series) step exploded every tuple
# into its own column, so assigning exactly two column names raised a
# ValueError for any corpus whose sentences have different lengths.
grouped = (
    data.groupby('Sentence #')
    .apply(lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), s['Tag'].values.tolist())])
    .reset_index(name='words_and_tags')
)
# Split the pair list into parallel word and tag lists.
grouped['words'] = grouped['words_and_tags'].apply(lambda x: [w for w, t in x])
grouped['tags'] = grouped['words_and_tags'].apply(lambda x: [t for w, t in x])

dataset = Dataset.from_pandas(grouped[['words', 'tags']])

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Hold out 10% of the sentences for evaluation.
dataset = dataset.train_test_split(test_size=0.1)

# Pre-trained encoder with a fresh token-classification head sized to the
# number of distinct NER tags.
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))
# Training configuration: evaluate once per epoch with standard BERT
# fine-tuning hyperparameters (lr 2e-5, per-device batch size 8, 3 epochs,
# weight decay 0.01).  Checkpoints and logs go to ./results.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer wires the model, hyperparameters, and the train/test splits
# produced by train_test_split above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

# Run fine-tuning; blocks until all epochs complete.
trainer.train()
print("Treinamento do modelo concluído.")