EveSa committed on
Commit c0eeece
Parents: 7a29699, 4e410f4

Merge pull request #14 from EveSa/Ling

Files changed (2):
  1. src/fine_tune_T5.py +38 -32
  2. src/inference_t5.py +8 -4
src/fine_tune_T5.py CHANGED
@@ -60,22 +60,20 @@ def datasetmaker(path=str):
 
 
 def generate_batch_sized_chunks(list_elements, batch_size):
-    """split the dataset into smaller batches that we can process simultaneously
+    """Split the dataset into smaller batches that we can process
+    simultaneously.
     Yield successive batch-sized chunks from list_of_elements."""
     for i in range(0, len(list_elements), batch_size):
         yield list_elements[i: i + batch_size]
 
 
-def calculate_metric(
-        dataset,
-        metric,
-        model,
-        tokenizer,
-        batch_size,
-        device,
-        column_text="text",
-        column_summary="summary",
-):
+def calculate_metric(dataset, metric, model, tokenizer,
+                     batch_size, device,
+                     column_text='text',
+                     column_summary='summary'):
+    """Evaluate the model with the ROUGE metric and print a table of the
+    'rouge1', 'rouge2', 'rougeL' and 'rougeLsum' scores."""
+
     article_batches = list(
         str(generate_batch_sized_chunks(dataset[column_text], batch_size))
     )
@@ -127,10 +125,9 @@ def calculate_metric(
 
 
 def convert_ex_to_features(example_batch):
-    input_encodings = tokenizer(
-        example_batch["text"],
-        max_length=1024,
-        truncation=True)
+    """Take a batch of InputExamples and convert them to InputFeatures."""
+    input_encodings = tokenizer(example_batch['text'],
+                                max_length=1024, truncation=True)
 
     labels = tokenizer(
         example_batch["summary"],
@@ -144,20 +141,22 @@ def convert_ex_to_features(example_batch):
     }
 
 
-if __name__ == "__main__":
-    train_dataset = datasetmaker("data/train_extract.jsonl")
-
-    dev_dataset = datasetmaker("data/dev_extract.jsonl")
+if __name__ == '__main__':
+    # build the clean datasets
+    train_dataset = datasetmaker('data/train_extract.jsonl')
+
+    dev_dataset = datasetmaker('data/dev_extract.jsonl')
 
     test_dataset = datasetmaker("data/test_extract.jsonl")
 
-    dataset = datasets.DatasetDict(
-        {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
-    )
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
+    dataset = datasets.DatasetDict({'train': train_dataset,
+                                    'dev': dev_dataset, 'test': test_dataset})
+    # define the device
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    # load the model to be fine-tuned
+    tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')
 
     mt5_config = AutoConfig.from_pretrained(
         "google/mt5-small",
         max_length=128,
@@ -165,9 +164,11 @@ if __name__ == "__main__":
         no_repeat_ngram_size=2,
         num_beams=15,
     )
-    model = AutoModelForSeq2SeqLM.from_pretrained(
-        "google/mt5-small", config=mt5_config
-    ).to(device)
+
+    model = (AutoModelForSeq2SeqLM
+             .from_pretrained('google/mt5-small', config=mt5_config)
+             .to(device))
+    # convert the examples to InputFeatures
 
     dataset_pt = dataset.map(
         convert_ex_to_features,
@@ -178,7 +179,7 @@ if __name__ == "__main__":
 
     data_collator = DataCollatorForSeq2Seq(
         tokenizer, model=model, return_tensors="pt")
-
+    # define the training (fine-tuning) parameters
     training_args = Seq2SeqTrainingArguments(
         output_dir="t5_summary",
         log_level="error",
@@ -198,7 +199,8 @@ if __name__ == "__main__":
         logging_steps=10,
         # push_to_hub = True
     )
-
+    # give the trainer the model
+    # and everything needed for training
     trainer = Seq2SeqTrainer(
         model=model,
         args=training_args,
@@ -211,7 +213,7 @@ if __name__ == "__main__":
 
     trainer.train()
     rouge_metric = evaluate.load("rouge")
-
+    # then evaluate the trained model
     score = calculate_metric(
         test_dataset,
         rouge_metric,
@@ -226,15 +228,19 @@ if __name__ == "__main__":
 
     # fine-tuning finished; save the model
 
-    # save fine-tuned model in local
+    # save the fine-tuned model locally
    os.makedirs("t5_summary", exist_ok=True)
     if hasattr(trainer.model, "module"):
         trainer.model.module.save_pretrained("t5_summary")
     else:
         trainer.model.save_pretrained("t5_summary")
     tokenizer.save_pretrained("t5_summary")
-    # load local model
-    model = AutoModelForSeq2SeqLM.from_pretrained("t5_summary").to(device)
+
+    # load the local model
+    model = (AutoModelForSeq2SeqLM
+             .from_pretrained("t5_summary")
+             .to(device))
+
 
     # put to use: TEST
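
The script stops at the "# put to use: TEST" marker without an actual test call. A minimal sketch of that step, assuming the "t5_summary" checkpoint saved above and reusing the generation settings from mt5_config; the sample text is a made-up placeholder, not part of this commit:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# load the checkpoint written by the training run above (assumed to exist)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("t5_summary")
model = AutoModelForSeq2SeqLM.from_pretrained("t5_summary").to(device)

sample = "Texte d'exemple à résumer."  # hypothetical input document
inputs = tokenizer(sample, max_length=1024, truncation=True,
                   return_tensors="pt").to(device)
summary_ids = model.generate(
    inputs["input_ids"],
    max_length=128,          # mirrors mt5_config in the diff above
    num_beams=15,            # mirrors mt5_config in the diff above
    no_repeat_ngram_size=2,  # mirrors mt5_config in the diff above
)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))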
246
 
src/inference_t5.py CHANGED
@@ -11,12 +11,12 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
 def clean_text(texts: str) -> str:
     texts = texts.lower()
-    texts = contractions.fix(texts)
     texts = texts.translate(str.maketrans("", "", string.punctuation))
     texts = re.sub(r"\n", " ", texts)
     return texts
 
 
+
 def inference_t5(text: str) -> str:
     """
     Predict the summary for an input text
@@ -32,9 +32,13 @@ def inference_t5(text: str) -> str:
     # define the input parameters for the model
     text = clean_text(text)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    tokenizer = AutoTokenizer.from_pretrained("Linggg/t5_summary")
+
+    tokenizer = AutoTokenizer.from_pretrained("Linggg/t5_summary",
+                                              use_auth_token=True)
     # load local model
-    model = AutoModelForSeq2SeqLM.from_pretrained("Linggg/t5_summary").to(device)
+    model = (AutoModelForSeq2SeqLM
+             .from_pretrained("Linggg/t5_summary", use_auth_token=True)
+             .to(device))
+
 
     text_encoding = tokenizer(
         text,
@@ -65,4 +69,4 @@ def inference_t5(text: str) -> str:
 
 # if __name__ == "__main__":
 #     text = input('Entrez votre phrase à résumer : ')
-#     print('summary:', inferenceAPI(text))
+#     print('summary:', inference_t5(text))
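
A hypothetical end-to-end call of the updated inference_t5, for reference; with use_auth_token=True, loading "Linggg/t5_summary" assumes a valid Hugging Face login (for example via huggingface-cli login), and the input sentence is made up:

from src.inference_t5 import inference_t5

# any French text; clean_text lower-cases it and strips punctuation first
text = ("Le gouvernement a présenté mardi un nouveau plan pour réduire "
        "la consommation d'énergie des bâtiments publics.")
print("summary:", inference_t5(text))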