Spaces:

EveSa
/

SummaryProject

Runtime error

App Files Files Community

Linggg committed on Mar 13, 2023

Commit

5925e5f

•

1 Parent(s): d5d5a19

model t5 tout bon + mis sur huggingface

Browse files

Files changed (3) hide show

requirements.txt +97 -19
src/fine_tune_T5.py +136 -110
src/inference_t5.py +15 -13

requirements.txt CHANGED Viewed

@@ -1,27 +1,105 @@
-brotli==1.0.9
-brotlicffi==1.0.9.2
-chardet==5.1.0
 contractions==0.1.73
-cryptography==39.0.2
-Cython==0.29.33
 datasets==2.10.1
-dl==0.1.0
 evaluate==0.4.0
-fastapi==0.94.0
-ipaddr==2.2.0
-keyring==23.13.1
-mock==5.0.1
-mypy_extensions==1.0.0
 nltk==3.8.1
 numpy==1.24.2
-ordereddict==1.1
 pandas==1.5.3
-protobuf==4.22.1
-pyOpenSSL==23.0.0
-simplejson==3.18.3
 torch==1.13.1
-tqdm==4.65.0
 transformers==4.26.1
-urllib3_secure_extra==0.1.0
-uvicorn==0.21.0
-wincertstore==0.2.1

+absl-py==1.4.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+alembic==1.9.4
+anyascii==0.3.1
+anyio==3.6.2
+async-timeout==4.0.2
+attrs==22.2.0
+autopep8==2.0.2
+banal==1.0.6
+blis==0.7.9
+catalogue==2.0.8
+certifi==2022.12.7
+charset-normalizer==3.0.1
+click==8.1.3
+confection==0.0.4
+contourpy==1.0.7
 contractions==0.1.73
+cycler==0.11.0
+cymem==2.0.7
+dataloader==2.0
+dataset==1.6.0
 datasets==2.10.1
+dill==0.3.6
+en-core-web-lg==3.5.0
 evaluate==0.4.0
+fastapi==0.91.0
+filelock==3.9.0
+flake8==6.0.0
+fonttools==4.38.0
+frozenlist==1.3.3
+fsspec==2023.3.0
+greenlet==2.0.2
+h11==0.14.0
+huggingface-hub==0.12.1
+idna==3.4
+importlib-metadata==6.0.0
+importlib-resources==5.12.0
+Jinja2==3.1.2
+joblib==1.2.0
+kiwisolver==1.4.4
+langcodes==3.3.0
+Mako==1.2.4
+MarkupSafe==2.1.2
+matplotlib==3.7.0
+mccabe==0.7.0
+multidict==6.0.4
+multiprocess==0.70.14
+murmurhash==1.0.9
 nltk==3.8.1
 numpy==1.24.2
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+packaging==23.0
 pandas==1.5.3
+pathy==0.10.1
+Pillow==9.4.0
+preshed==3.0.8
+protobuf==3.20.0
+pyahocorasick==2.0.0
+pyarrow==11.0.0
+pycodestyle==2.10.0
+pydantic==1.10.4
+pyflakes==3.0.1
+pyparsing==3.0.9
+python-dateutil==2.8.2
+python-multipart==0.0.5
+pytz==2022.7.1
+PyYAML==6.0
+regex==2022.10.31
+requests==2.28.2
+responses==0.18.0
+rouge-score==0.1.2
+scikit-learn==1.2.1
+scipy==1.10.0
+sentencepiece==0.1.97
+six==1.16.0
+sklearn==0.0.post1
+smart-open==6.3.0
+sniffio==1.3.0
+spacy==3.5.0
+spacy-legacy==3.0.12
+spacy-loggers==1.0.4
+SQLAlchemy==1.4.46
+srsly==2.4.5
+starlette==0.24.0
+summarizer==0.0.7
+textsearch==0.0.24
+thinc==8.1.7
+threadpoolctl==3.1.0
+tokenizers==0.13.2
+tomli==2.0.1
 torch==1.13.1
+tqdm==4.64.1
 transformers==4.26.1
+typer==0.7.0
+typing-extensions==4.4.0
+urllib3==1.26.14
+uvicorn==0.20.0
+wasabi==1.1.1
+xxhash==3.2.0
+yarl==1.8.2
+zipp==3.14.0

src/fine_tune_T5.py CHANGED Viewed

@@ -1,106 +1,127 @@
-import torch
-import datasets
-from datasets import Dataset, DatasetDict
-import pandas as pd
-from tqdm import tqdm
 import re
 import os
-import nltk
 import string
-nltk.download('stopwords')
-nltk.download('punkt')
 import contractions
-from transformers import pipeline
 import evaluate
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,AutoConfig
-from transformers import Seq2SeqTrainingArguments ,Seq2SeqTrainer
-# from transformers import TrainingArguments, Trainer
 from transformers import DataCollatorForSeq2Seq
-def clean_data(texts):
     texts = texts.lower()
     texts = contractions.fix(texts)
     texts = texts.translate(str.maketrans("", "", string.punctuation))
-    texts = re.sub(r'\n',' ',texts)
     return texts
-def datasetmaker (path=str):
     data = pd.read_json(path, lines=True)
-    df = data.drop(['url','archive','title','date','compression','coverage','density','compression_bin','coverage_bin','density_bin'],axis=1)
     tqdm.pandas()
-    df['text'] = df.text.apply(lambda texts : clean_data(texts))
-    df['summary'] = df.summary.apply(lambda summary : clean_data(summary))
-    # df['text'] = df['text'].map(str)
-    # df['summary'] = df['summary'].map(str)
     dataset = Dataset.from_dict(df)
     return dataset
-#voir si le model par hasard esr déjà bien
 # test_text = dataset['text'][0]
-# pipe = pipeline('summarization',model = model_ckpt)
 # pipe_out = pipe(test_text)
-# print (pipe_out[0]['summary_text'].replace('.<n>','.\n'))
 # print(dataset['summary'][0])
 def generate_batch_sized_chunks(list_elements, batch_size):
     """split the dataset into smaller batches that we can process simultaneously
     Yield successive batch-sized chunks from list_of_elements."""
     for i in range(0, len(list_elements), batch_size):
-        yield list_elements[i : i + batch_size]
 def calculate_metric(dataset, metric, model, tokenizer,
-                               batch_size, device,
-                               column_text='text',
-                               column_summary='summary'):
-    article_batches = list(str(generate_batch_sized_chunks(dataset[column_text], batch_size)))
-    target_batches = list(str(generate_batch_sized_chunks(dataset[column_summary], batch_size)))
     for article_batch, target_batch in tqdm(
-        zip(article_batches, target_batches), total=len(article_batches)):
-        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
-                        padding="max_length", return_tensors="pt")
-        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
-                         attention_mask=inputs["attention_mask"].to(device),
-                         length_penalty=0.8, num_beams=8, max_length=128)
-        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
         # Décode les textes
-        # renplacer les tokens, ajouter des textes décodés avec les rédéfences vers la métrique.
-        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
-                                clean_up_tokenization_spaces=True)
-               for s in summaries]
         decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
-        metric.add_batch(predictions=decoded_summaries, references=target_batch)
-    #compute et return les ROUGE scores.
     results = metric.compute()
-    rouge_names = ['rouge1','rouge2','rougeL','rougeLsum']
-    rouge_dict = dict((rn, results[rn] ) for rn in rouge_names )
-    return pd.DataFrame(rouge_dict, index = ['T5'])
 def convert_ex_to_features(example_batch):
-    input_encodings = tokenizer(example_batch['text'],max_length = 1024,truncation = True)
-    labels =tokenizer(example_batch['summary'], max_length = 128, truncation = True )
     return {
-        'input_ids' : input_encodings['input_ids'],
         'attention_mask': input_encodings['attention_mask'],
         'labels': labels['input_ids']
     }
-if __name__=='__main__':
     train_dataset = datasetmaker('data/train_extract.jsonl')
@@ -108,97 +129,102 @@ if __name__=='__main__':
     test_dataset = datasetmaker('data/test_extract.jsonl')
-    dataset = datasets.DatasetDict({'train':train_dataset,'dev':dev_dataset ,'test':test_dataset})
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
     mt5_config = AutoConfig.from_pretrained(
-    "google/mt5-small",
-    max_length=128,
-    length_penalty=0.6,
-    no_repeat_ngram_size=2,
-    num_beams=15,
     )
     model = (AutoModelForSeq2SeqLM
-            .from_pretrained("google/mt5-small", config=mt5_config)
-            .to(device))
-    dataset_pt= dataset.map(convert_ex_to_features,remove_columns=["summary", "text"],batched = True,batch_size=128)
-    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model,return_tensors="pt")
     training_args = Seq2SeqTrainingArguments(
-        output_dir = "mt5_sum",
-        log_level = "error",
-        num_train_epochs = 10,
-        learning_rate = 5e-4,
-        #   lr_scheduler_type = "linear",
-        warmup_steps = 0,
-        optim = "adafactor",
-        weight_decay = 0.01,
-        per_device_train_batch_size = 2,
-        per_device_eval_batch_size = 1,
-        gradient_accumulation_steps = 16,
-        evaluation_strategy = "steps",
-        eval_steps = 100,
         predict_with_generate=True,
-        generation_max_length = 128,
-        save_steps = 500,
-        logging_steps = 10,
         # push_to_hub = True
     )
     trainer = Seq2SeqTrainer(
-        model = model,
-        args = training_args,
-        data_collator = data_collator,
         # compute_metrics = calculate_metric,
         train_dataset=dataset_pt['train'],
         eval_dataset=dataset_pt['dev'].select(range(10)),
-        tokenizer = tokenizer,
     )
     trainer.train()
     rouge_metric = evaluate.load("rouge")
-    score = calculate_metric(test_dataset, rouge_metric, trainer.model, tokenizer,
-                                batch_size=2, device=device,
-                                column_text='text',
-                                column_summary='summary')
-    print (score)
-    #Fine Tuning terminés et à sauvgarder
     # save fine-tuned model in local
-    os.makedirs("./summarization_t5", exist_ok=True)
     if hasattr(trainer.model, "module"):
-        trainer.model.module.save_pretrained("./summarization_t5")
     else:
-        trainer.model.save_pretrained("./summarization_t5")
-    tokenizer.save_pretrained("./summarization_t5")
     # load local model
     model = (AutoModelForSeq2SeqLM
-            .from_pretrained("./summarization_t5")
-            .to(device))
     # mettre en usage : TEST
-    # gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
     # sample_text = dataset["test"][0]["text"]
     # reference = dataset["test"][0]["summary"]
     # pipe = pipeline("summarization", model='./summarization_t5')
-    # print("Text:")
     # print(sample_text)
-    # print("\nReference Summary:")
     # print(reference)
-    # print("\nModel Summary:")
     # print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

 import re
 import os
 import string
 import contractions
+import torch
+import datasets
+from datasets import Dataset
+import pandas as pd
+from tqdm import tqdm
 import evaluate
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
+from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
 from transformers import DataCollatorForSeq2Seq
def clean_text(texts):
    """Normalise a raw string before it is tokenised.

    The input is lower-cased, passed through ``contractions.fix``, stripped
    of every character listed in ``string.punctuation``, and finally has
    its newline characters replaced by single spaces.
    """
    expanded = contractions.fix(texts.lower())
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = expanded.translate(punctuation_table)
    return re.sub(r'\n', ' ', without_punctuation)
def datasetmaker(path: str):
    """Read a JSON-lines file and turn it into a cleaned ``Dataset``.

    Args:
        path: Location of the ``.jsonl`` file; it is handed to
            ``pd.read_json(path, lines=True)``.

    Returns:
        A ``Dataset`` built from the columns that remain once the metadata
        columns listed below are dropped. The ``text`` and ``summary``
        columns have been passed through ``clean_text``.
    """
    metadata_columns = [
        'url',
        'archive',
        'title',
        'date',
        'compression',
        'coverage',
        'density',
        'compression_bin',
        'coverage_bin',
        'density_bin',
    ]
    data = pd.read_json(path, lines=True)
    # DataFrame.drop raises KeyError when one of these columns is missing.
    df = data.drop(metadata_columns, axis=1)
    # tqdm.pandas() registers ``progress_apply`` on pandas objects, so the
    # two cleaning passes below report their progress.
    tqdm.pandas()
    df['text'] = df.text.progress_apply(clean_text)
    df['summary'] = df.summary.progress_apply(clean_text)
    # from_pandas is the DataFrame constructor; from_dict expects a mapping.
    return Dataset.from_pandas(df, preserve_index=False)
+# Check whether the pretrained model already performs well out of the box
 # test_text = dataset['text'][0]
+# pipe = pipeline('summarization', model = model_ckpt)
 # pipe_out = pipe(test_text)
+# print(pipe_out[0]['summary_text'].replace('.<n>', '.\n'))
 # print(dataset['summary'][0])
 def generate_batch_sized_chunks(list_elements, batch_size):
     """split the dataset into smaller batches that we can process simultaneously
     Yield successive batch-sized chunks from list_of_elements."""
     for i in range(0, len(list_elements), batch_size):
+        yield list_elements[i: i + batch_size]
 def calculate_metric(dataset, metric, model, tokenizer,
+                     batch_size, device,
+                     column_text='text',
+                     column_summary='summary'):
+    article_batches = list(
+        str(generate_batch_sized_chunks(dataset[column_text], batch_size)))
+    target_batches = list(
+        str(generate_batch_sized_chunks(dataset[column_summary], batch_size)))
     for article_batch, target_batch in tqdm(
+            zip(article_batches, target_batches), total=len(article_batches)):
+        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
+                           padding="max_length", return_tensors="pt")
+        # parameter for length penalty ensures that the model does not
+        # generate sequences that are too long.
+        summaries = model.generate(
+            input_ids=inputs["input_ids"].to(device),
+            attention_mask=inputs["attention_mask"].to(device),
+            length_penalty=0.8,
+            num_beams=8,
+            max_length=128)
         # Décode les textes
+        # renplacer les tokens, ajouter des textes décodés avec les rédéfences
+        # vers la métrique.
+        decoded_summaries = [
+            tokenizer.decode(
+                s,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True) for s in summaries]
         decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
+        metric.add_batch(
+            predictions=decoded_summaries,
+            references=target_batch)
+    # compute et return les ROUGE scores.
     results = metric.compute()
+    rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
+    rouge_dict = dict((rn, results[rn]) for rn in rouge_names)
+    return pd.DataFrame(rouge_dict, index=['T5'])
 def convert_ex_to_features(example_batch):
+    input_encodings = tokenizer(example_batch['text'],
+                                max_length=1024, truncation=True)
+    labels = tokenizer(
+        example_batch['summary'],
+        max_length=128,
+        truncation=True)
     return {
+        'input_ids': input_encodings['input_ids'],
         'attention_mask': input_encodings['attention_mask'],
         'labels': labels['input_ids']
     }
+if __name__ == '__main__':
     train_dataset = datasetmaker('data/train_extract.jsonl')
     test_dataset = datasetmaker('data/test_extract.jsonl')
+    dataset = datasets.DatasetDict({'train': train_dataset,
+                                    'dev': dev_dataset, 'test': test_dataset})
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')
     mt5_config = AutoConfig.from_pretrained(
+        'google/mt5-small',
+        max_length=128,
+        length_penalty=0.6,
+        no_repeat_ngram_size=2,
+        num_beams=15,
     )
     model = (AutoModelForSeq2SeqLM
+             .from_pretrained('google/mt5-small', config=mt5_config)
+             .to(device))
+    dataset_pt = dataset.map(
+        convert_ex_to_features,
+        remove_columns=[
+            "summary",
+            "text"],
+        batched=True,
+        batch_size=128)
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer, model=model, return_tensors="pt")
     training_args = Seq2SeqTrainingArguments(
+        output_dir="t5_summary",
+        log_level="error",
+        num_train_epochs=10,
+        learning_rate=5e-4,
+        warmup_steps=0,
+        optim="adafactor",
+        weight_decay=0.01,
+        per_device_train_batch_size=2,
+        per_device_eval_batch_size=1,
+        gradient_accumulation_steps=16,
+        evaluation_strategy="steps",
+        eval_steps=100,
         predict_with_generate=True,
+        generation_max_length=128,
+        save_steps=500,
+        logging_steps=10,
         # push_to_hub = True
     )
     trainer = Seq2SeqTrainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
         # compute_metrics = calculate_metric,
         train_dataset=dataset_pt['train'],
         eval_dataset=dataset_pt['dev'].select(range(10)),
+        tokenizer=tokenizer,
     )
     trainer.train()
     rouge_metric = evaluate.load("rouge")
+    score = calculate_metric(
+        test_dataset,
+        rouge_metric,
+        trainer.model,
+        tokenizer,
+        batch_size=2,
+        device=device,
+        column_text='text',
+        column_summary='summary')
+    print(score)
+    # Fine Tuning terminés et à sauvgarder
     # save fine-tuned model in local
+    os.makedirs("t5_summary", exist_ok=True)
     if hasattr(trainer.model, "module"):
+        trainer.model.module.save_pretrained("t5_summary")
     else:
+        trainer.model.save_pretrained("t5_summary")
+    tokenizer.save_pretrained("t5_summary")
     # load local model
     model = (AutoModelForSeq2SeqLM
+             .from_pretrained("t5_summary")
+             .to(device))
     # mettre en usage : TEST
+    # gen_kwargs = {"length_penalty" : 0.8, "num_beams" : 8, "max_length" : 128}
     # sample_text = dataset["test"][0]["text"]
     # reference = dataset["test"][0]["summary"]
     # pipe = pipeline("summarization", model='./summarization_t5')
+    # print("Text :")
     # print(sample_text)
+    # print("\nReference Summary :")
     # print(reference)
+    # print("\nModel Summary :")
     # print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

src/inference_t5.py CHANGED Viewed

@@ -2,21 +2,20 @@
  Allows to predict the summary for a given entry text
 """
 import torch
-import nltk
 import contractions
 import re
 import string
-nltk.download('stopwords')
-nltk.download('punkt')
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-def clean_data(texts):
     texts = texts.lower()
     texts = contractions.fix(texts)
     texts = texts.translate(str.maketrans("", "", string.punctuation))
-    texts = re.sub(r'\n',' ',texts)
     return texts
 def inferenceAPI(text: str) -> str:
     """
     Predict the summary for an input text
@@ -29,13 +28,13 @@ def inferenceAPI(text: str) -> str:
             The summary for the input text
     """
     # On défini les paramètres d'entrée pour le modèle
-    text = clean_data(text)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    tokenizer= (AutoTokenizer.from_pretrained("./summarization_t5"))
     # load local model
     model = (AutoModelForSeq2SeqLM
-            .from_pretrained("./summarization_t5")
-            .to(device))
     text_encoding = tokenizer(
         text,
         max_length=1024,
@@ -55,11 +54,14 @@ def inferenceAPI(text: str) -> str:
     )
     preds = [
-            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-            for gen_id in generated_ids
     ]
     return "".join(preds)
 if __name__ == "__main__":
-     text = input('Entrez votre phrase à résumer : ')
-     print('summary:',inferenceAPI(text))

  Allows to predict the summary for a given entry text
 """
 import torch
 import contractions
 import re
 import string
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
def clean_text(texts: str) -> str:
    """Return ``texts`` normalised for the summarisation model.

    Steps, in order: lower-casing, ``contractions.fix``, removal of all
    ``string.punctuation`` characters, and replacement of each newline
    with a single space.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = texts.lower()
    cleaned = contractions.fix(lowered).translate(strip_punctuation)
    return re.sub(r'\n', ' ', cleaned)
 def inferenceAPI(text: str) -> str:
     """
     Predict the summary for an input text
             The summary for the input text
     """
     # On défini les paramètres d'entrée pour le modèle
+    text = clean_text(text)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = (AutoTokenizer.from_pretrained("Linggg/t5_summary"))
     # load local model
     model = (AutoModelForSeq2SeqLM
+             .from_pretrained("Linggg/t5_summary")
+             .to(device))
     text_encoding = tokenizer(
         text,
         max_length=1024,
     )
     preds = [
+        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        for gen_id in generated_ids
     ]
     return "".join(preds)
 if __name__ == "__main__":
+    '''
+    '''
+    text = input('Entrez votre phrase à résumer : ')
+    print('summary:', inferenceAPI(text))