import os
import subprocess

# Install required dependencies before importing them
subprocess.run(["pip", "install", "-r", "requirements.txt"], check=True)

import pandas as pd
from datasets import Dataset
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Retrieve the Hugging Face token from the environment variables
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise ValueError(
        "Hugging Face token not found. Add the token as an environment "
        "variable in Hugging Face Spaces."
    )

# Log in to Hugging Face
subprocess.run(["huggingface-cli", "login", "--token", hf_token], check=True)

# Load the dataset; forward-fill the sparse 'Sentence #' column
data = pd.read_csv("NER/ner_dataset.csv", encoding="latin1").ffill()

# Build the label vocabulary
unique_labels = data["Tag"].unique().tolist()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align one label per word; subword
    continuations and special tokens get -100, which the loss ignores."""
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens ([CLS], [SEP])
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])  # first subword
            else:
                label_ids.append(-100)  # later subwords of the same word
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Group the token-per-row CSV into one row per sentence
grouped = (
    data.groupby("Sentence #")
    .agg(words=("Word", list), tags=("Tag", list))
    .reset_index()
)

dataset = Dataset.from_pandas(grouped[["words", "tags"]])
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split the dataset into train and test
dataset = dataset.train_test_split(test_size=0.1)

# Load the pre-trained Hugging Face model
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Pad input_ids, attention_mask and labels to a common length per batch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

print("Model training complete.")
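
# Optional follow-up: a minimal inference sketch, not part of the original
# script. It assumes training finished above; the save directory './ner-model'
# and the sample sentence are hypothetical choices for illustration.
trainer.save_model("./ner-model")
tokenizer.save_pretrained("./ner-model")

from transformers import pipeline

# The id2label mapping set on the model lets the pipeline emit readable tags;
# aggregation_strategy="simple" groups word pieces into entity spans.
ner = pipeline(
    "token-classification", model="./ner-model", aggregation_strategy="simple"
)
print(ner("George Washington visited New York in 1789."))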