Linggg committed
Commit 4fa4fe8
1 Parent(s): 1aab2b0

Commented the script

Files changed (2):
  1. src/fine_tune_T5.py +15 -10
  2. src/fine_tune_t5.py +0 -204
src/fine_tune_T5.py CHANGED
@@ -54,7 +54,8 @@ def datasetmaker(path=str):
 
 
 def generate_batch_sized_chunks(list_elements, batch_size):
-    """split the dataset into smaller batches that we can process simultaneously
+    """This function splits the dataset into smaller batches
+    that we can process simultaneously.
     Yield successive batch-sized chunks from list_of_elements."""
     for i in range(0, len(list_elements), batch_size):
         yield list_elements[i: i + batch_size]
@@ -64,6 +65,8 @@ def calculate_metric(dataset, metric, model, tokenizer,
                      batch_size, device,
                      column_text='text',
                      column_summary='summary'):
+    """This function evaluates the model with the ROUGE metric and
+    prints a table of the 'rouge1', 'rouge2', 'rougeL' and 'rougeLsum' scores."""
     article_batches = list(
         str(generate_batch_sized_chunks(dataset[column_text], batch_size)))
     target_batches = list(
@@ -106,6 +109,7 @@ def calculate_metric(dataset, metric, model, tokenizer,
 
 
 def convert_ex_to_features(example_batch):
+    """This function takes a batch of input examples and converts it into input features."""
     input_encodings = tokenizer(example_batch['text'],
                                 max_length=1024, truncation=True)
 
@@ -122,7 +126,7 @@ def convert_ex_to_features(example_batch):
 
 
 if __name__ == '__main__':
-
+    # build the clean datasets
     train_dataset = datasetmaker('data/train_extract.jsonl')
 
     dev_dataset = datasetmaker('data/dev_extract.jsonl')
@@ -131,9 +135,9 @@ if __name__ == '__main__':
 
     dataset = datasets.DatasetDict({'train': train_dataset,
                                     'dev': dev_dataset, 'test': test_dataset})
-
+    # set up the device
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
+    # load the model to be trained
     tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')
     mt5_config = AutoConfig.from_pretrained(
         'google/mt5-small',
@@ -145,7 +149,7 @@ if __name__ == '__main__':
     model = (AutoModelForSeq2SeqLM
              .from_pretrained('google/mt5-small', config=mt5_config)
             .to(device))
-
+    # convert the examples into input features
     dataset_pt = dataset.map(
         convert_ex_to_features,
         remove_columns=[
@@ -156,7 +160,7 @@ if __name__ == '__main__':
 
     data_collator = DataCollatorForSeq2Seq(
         tokenizer, model=model, return_tensors="pt")
-
+    # define the training (fine-tuning) arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="t5_summary",
        log_level="error",
@@ -176,7 +180,8 @@ if __name__ == '__main__':
        logging_steps=10,
        # push_to_hub = True
    )
-
+    # give the trainer the model
+    # and everything it needs for training
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
@@ -189,7 +194,7 @@ if __name__ == '__main__':
 
    trainer.train()
    rouge_metric = evaluate.load("rouge")
-
+    # then evaluate the model after training
    score = calculate_metric(
        test_dataset,
        rouge_metric,
@@ -203,14 +208,14 @@ if __name__ == '__main__':
 
    # fine-tuning finished; save the model
 
-    # save fine-tuned model in local
+    # save the fine-tuned model locally
    os.makedirs("t5_summary", exist_ok=True)
    if hasattr(trainer.model, "module"):
        trainer.model.module.save_pretrained("t5_summary")
    else:
        trainer.model.save_pretrained("t5_summary")
    tokenizer.save_pretrained("t5_summary")
-    # load local model
+    # load the model from the local directory
    model = (AutoModelForSeq2SeqLM
             .from_pretrained("t5_summary")
             .to(device))
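For reference, here is a minimal, self-contained sketch (not part of the commit) of the chunked ROUGE evaluation that the new docstrings on generate_batch_sized_chunks and calculate_metric describe. The toy predictions and references are made up for illustration, and the generator is consumed directly as a list of batches, which is how the helper is intended to be used.

# Sketch only: chunked ROUGE scoring on toy data, assuming the `evaluate`
# and `rouge_score` packages are installed.
import evaluate


def generate_batch_sized_chunks(list_elements, batch_size):
    """Yield successive batch-sized chunks from list_elements."""
    for i in range(0, len(list_elements), batch_size):
        yield list_elements[i: i + batch_size]


# hypothetical model outputs and gold summaries
predictions = ["the cat sat on the mat", "it rained all day"]
references = ["a cat was sitting on the mat", "rain fell the whole day"]

rouge = evaluate.load("rouge")
for pred_batch, ref_batch in zip(generate_batch_sized_chunks(predictions, 1),
                                 generate_batch_sized_chunks(references, 1)):
    # accumulate one batch of predictions/references at a time
    rouge.add_batch(predictions=pred_batch, references=ref_batch)

results = rouge.compute()
print({rn: results[rn] for rn in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']})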
src/fine_tune_t5.py DELETED
@@ -1,204 +0,0 @@
-import torch
-import datasets
-from datasets import Dataset, DatasetDict
-import pandas as pd
-from tqdm import tqdm
-import re
-import os
-import nltk
-import string
-import contractions
-from transformers import pipeline
-import evaluate
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
-from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
-from transformers import DataCollatorForSeq2Seq
-
-# cuda out of memory
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:200"
-
-nltk.download('stopwords')
-nltk.download('punkt')
-
-
-def clean_data(texts):
-    texts = texts.lower()
-    texts = contractions.fix(texts)
-    texts = texts.translate(str.maketrans("", "", string.punctuation))
-    texts = re.sub(r'\n', ' ', texts)
-    return texts
-
-def datasetmaker(path=str):
-    data = pd.read_json(path, lines=True)
-    df = data.drop(['url','archive','title','date','compression','coverage','density','compression_bin','coverage_bin','density_bin'], axis=1)
-    tqdm.pandas()
-    df['text'] = df.text.apply(lambda texts: clean_data(texts))
-    df['summary'] = df.summary.apply(lambda summary: clean_data(summary))
-    # df['text'] = df['text'].map(str)
-    # df['summary'] = df['summary'].map(str)
-    dataset = Dataset.from_dict(df)
-    return dataset
-
-# check whether the model is perhaps already good as-is
-
-# test_text = dataset['text'][0]
-# pipe = pipeline('summarization', model=model_ckpt)
-# pipe_out = pipe(test_text)
-# print(pipe_out[0]['summary_text'].replace('.<n>', '.\n'))
-# print(dataset['summary'][0])
-
-def generate_batch_sized_chunks(list_elements, batch_size):
-    """split the dataset into smaller batches that we can process simultaneously
-    Yield successive batch-sized chunks from list_of_elements."""
-    for i in range(0, len(list_elements), batch_size):
-        yield list_elements[i : i + batch_size]
-
-def calculate_metric(dataset, metric, model, tokenizer,
-                     batch_size, device,
-                     column_text='text',
-                     column_summary='summary'):
-    article_batches = list(str(generate_batch_sized_chunks(dataset[column_text], batch_size)))
-    target_batches = list(str(generate_batch_sized_chunks(dataset[column_summary], batch_size)))
-
-    for article_batch, target_batch in tqdm(
-        zip(article_batches, target_batches), total=len(article_batches)):
-
-        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
-                           padding="max_length", return_tensors="pt")
-
-        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
-                                   attention_mask=inputs["attention_mask"].to(device),
-                                   length_penalty=0.8, num_beams=8, max_length=128)
-        ''' the length_penalty parameter ensures that the model does not generate sequences that are too long. '''
-
-        # decode the texts
-        # replace the tokens, then add the decoded texts and the references to the metric
-        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
-                                              clean_up_tokenization_spaces=True)
-                             for s in summaries]
-
-        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
-
-
-        metric.add_batch(predictions=decoded_summaries, references=target_batch)
-
-    # compute and return the ROUGE scores
-    results = metric.compute()
-    rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
-    rouge_dict = dict((rn, results[rn]) for rn in rouge_names)
-    return pd.DataFrame(rouge_dict, index=['T5'])
-
-
-def convert_ex_to_features(example_batch):
-    input_encodings = tokenizer(example_batch['text'], max_length=1024, truncation=True)
-
-    labels = tokenizer(example_batch['summary'], max_length=128, truncation=True)
-
-    return {
-        'input_ids': input_encodings['input_ids'],
-        'attention_mask': input_encodings['attention_mask'],
-        'labels': labels['input_ids']
-    }
-
-if __name__ == '__main__':
-
-    train_dataset = datasetmaker('data/train_extract_100.jsonl')
-
-    dev_dataset = datasetmaker('data/dev_extract_100.jsonl')
-
-    test_dataset = datasetmaker('data/test_extract_100.jsonl')
-
-    dataset = datasets.DatasetDict({'train': train_dataset, 'dev': dev_dataset, 'test': test_dataset})
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
-    mt5_config = AutoConfig.from_pretrained(
-        "google/mt5-small",
-        max_length=128,
-        length_penalty=0.6,
-        no_repeat_ngram_size=2,
-        num_beams=15,
-    )
-    model = (AutoModelForSeq2SeqLM
-             .from_pretrained("google/mt5-small", config=mt5_config)
-             .to(device))
-
-    dataset_pt = dataset.map(convert_ex_to_features, remove_columns=["summary", "text"], batched=True, batch_size=128)
-
-    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")
-
-
-    training_args = Seq2SeqTrainingArguments(
-        output_dir="mt5_sum",
-        log_level="error",
-        num_train_epochs=10,
-        learning_rate=5e-4,
-        # lr_scheduler_type="linear",
-        warmup_steps=0,
-        optim="adafactor",
-        weight_decay=0.01,
-        per_device_train_batch_size=2,
-        per_device_eval_batch_size=1,
-        gradient_accumulation_steps=16,
-        evaluation_strategy="steps",
-        eval_steps=100,
-        predict_with_generate=True,
-        generation_max_length=128,
-        save_steps=500,
-        logging_steps=10,
-        # push_to_hub = True
-    )
-
-
-    trainer = Seq2SeqTrainer(
-        model=model,
-        args=training_args,
-        data_collator=data_collator,
-        # compute_metrics=calculate_metric,
-        train_dataset=dataset_pt['train'],
-        eval_dataset=dataset_pt['dev'].select(range(10)),
-        tokenizer=tokenizer,
-    )
-
-    trainer.train()
-    rouge_metric = evaluate.load("rouge")
-
-    score = calculate_metric(test_dataset, rouge_metric, trainer.model, tokenizer,
-                             batch_size=2, device=device,
-                             column_text='text',
-                             column_summary='summary')
-    print(score)
-
-
-    # fine-tuning finished; to be saved
-
-
-
-    # save fine-tuned model in local
-    os.makedirs("./summarization_t5", exist_ok=True)
-    if hasattr(trainer.model, "module"):
-        trainer.model.module.save_pretrained("./summarization_t5")
-    else:
-        trainer.model.save_pretrained("./summarization_t5")
-    tokenizer.save_pretrained("./summarization_t5")
-    # load local model
-    model = (AutoModelForSeq2SeqLM
-             .from_pretrained("./summarization_t5")
-             .to(device))
-
-
-    # put to use: TEST
-
-
-    # gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}
-    # sample_text = dataset["test"][0]["text"]
-    # reference = dataset["test"][0]["summary"]
-    # pipe = pipeline("summarization", model='./summarization_t5')
-
-    # print("Text:")
-    # print(sample_text)
-    # print("\nReference Summary:")
-    # print(reference)
-    # print("\nModel Summary:")
-    # print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])
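The commented-out test block at the end of the deleted script shows how the saved checkpoint was meant to be exercised. Here is a minimal sketch along the same lines, pointed at the "t5_summary" directory that the current src/fine_tune_T5.py writes; the sample text is a placeholder, and the generation settings are the ones used in the commented-out block.

# Sketch only: load the locally saved fine-tuned model and summarize one text.
from transformers import pipeline

# generation settings taken from the commented-out test block
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}

# placeholder input; in practice this would be an article from the test split
sample_text = "replace this with an article from data/test_extract.jsonl"

pipe = pipeline("summarization", model="t5_summary")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])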