eldavid committed on
Commit 30b8aa6
1 Parent(s): fce85d7

Update train_model.py

Files changed (1)
  1. train_model.py +39 -61
train_model.py CHANGED
@@ -15,62 +15,50 @@ if not hf_token:
  # Log in to Hugging Face
  subprocess.run(["huggingface-cli", "login", "--token", hf_token])

- # Clone the dataset repository
- subprocess.run(["git", "clone", "https://github.com/DigantaD/NER.git"])
- data = pd.read_csv('NER/ner_dataset.csv', encoding='latin1')
- data = data.fillna(method='ffill')
+ # Load the dataset
+ data = pd.read_csv('NER/ner_dataset.csv', encoding='latin1').fillna(method='ffill')

+ # Prepare the data
  unique_labels = data['Tag'].unique().tolist()
-
- class NERDataset:
-     def __init__(self, dataframe):
-         self.dataframe = dataframe
-         self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
-         self.label2id = {label: i for i, label in enumerate(unique_labels)}
-         self.id2label = {i: label for label, i in self.label2id.items()}
-
-     def tokenize_and_align_labels(self, examples):
-         tokenized_inputs = self.tokenizer(examples['words'], truncation=True, is_split_into_words=True)
-         labels = []
-         for i, label in enumerate(examples['tags']):
-             word_ids = tokenized_inputs.word_ids(batch_index=i)
-             previous_word_idx = None
-             label_ids = []
-             for word_idx in word_ids:
-                 if word_idx is None:
-                     label_ids.append(-100)
-                 elif word_idx != previous_word_idx:
-                     label_ids.append(self.label2id[label[word_idx]])
-                 else:
-                     label_ids.append(-100)
-                 previous_word_idx = word_idx
-             labels.append(label_ids)
-         tokenized_inputs['labels'] = labels
-         return tokenized_inputs
-
-     def create_dataset(self):
-         grouped = self.dataframe.groupby('Sentence #').apply(lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), s['Tag'].values.tolist())])
-         grouped = grouped.apply(pd.Series).reset_index()
-         grouped.columns = ['Sentence #', 'words_and_tags']
-
-         grouped['words'] = grouped['words_and_tags'].apply(lambda x: [w for w, t in x])
-         grouped['tags'] = grouped['words_and_tags'].apply(lambda x: [t for w, t in x])
-
-         dataset = Dataset.from_pandas(grouped[['words', 'tags']])
-         dataset = dataset.map(self.tokenize_and_align_labels, batched=True)
-         return dataset
-
- # Instantiate the dataset
- ner_dataset = NERDataset(data)
- dataset = ner_dataset.create_dataset()
+ label2id = {label: i for i, label in enumerate(unique_labels)}
+
+ def tokenize_and_align_labels(examples):
+     tokenized_inputs = tokenizer(examples['words'], truncation=True, is_split_into_words=True)
+     labels = []
+     for i, label in enumerate(examples['tags']):
+         word_ids = tokenized_inputs.word_ids(batch_index=i)
+         previous_word_idx = None
+         label_ids = []
+         for word_idx in word_ids:
+             if word_idx is None:
+                 label_ids.append(-100)
+             elif word_idx != previous_word_idx:
+                 label_ids.append(label2id[label[word_idx]])
+             else:
+                 label_ids.append(-100)
+             previous_word_idx = word_idx
+         labels.append(label_ids)
+     tokenized_inputs['labels'] = labels
+     return tokenized_inputs
+
+ grouped = data.groupby('Sentence #').apply(lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), s['Tag'].values.tolist())])
+ grouped = grouped.apply(pd.Series).reset_index()
+ grouped.columns = ['Sentence #', 'words_and_tags']
+
+ grouped['words'] = grouped['words_and_tags'].apply(lambda x: [w for w, t in x])
+ grouped['tags'] = grouped['words_and_tags'].apply(lambda x: [t for w, t in x])
+
+ dataset = Dataset.from_pandas(grouped[['words', 'tags']])
+ tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
+ dataset = dataset.map(tokenize_and_align_labels, batched=True)

  # Split the dataset into train and test sets
  dataset = dataset.train_test_split(test_size=0.1)

- # Load the pre-trained model
+ # Load the pre-trained model from Hugging Face
  model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

- # Training
+ # Define training arguments
  training_args = TrainingArguments(
      output_dir='./results',
      evaluation_strategy="epoch",
@@ -81,6 +69,7 @@ training_args = TrainingArguments(
      weight_decay=0.01,
  )

+ # Initialize the Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
@@ -88,18 +77,7 @@ trainer = Trainer(
      eval_dataset=dataset['test'],
  )

+ # Train the model
  trainer.train()

- # Save the model
- model.save_pretrained('./ner_model')
- ner_dataset.tokenizer.save_pretrained('./ner_model')
-
- # Check that the model directory was created correctly
- model_dir = './ner_model'
- if os.path.exists(model_dir) and os.path.isdir(model_dir):
-     print(f"Model directory found: {model_dir}")
-     print("Files in the model directory:")
-     for file_name in os.listdir(model_dir):
-         print(file_name)
- else:
-     print(f"Model directory not found: {model_dir}")
+ print("Model training complete.")