raj22rishi committed
Commit
fb4a3c6
1 Parent(s): 30ddd1e

Upload 14 files

Files changed (14)
  1. DockerFile +21 -0
  2. LICENSE +21 -0
  3. README.md +49 -12
  4. __init__.py +0 -0
  5. app.py +44 -0
  6. evaluation.py +3 -0
  7. inference_pipeline.py +13 -0
  8. ingest_data.py +5 -0
  9. model_train.py +28 -0
  10. preprocess.py +26 -0
  11. requirements.txt +7 -0
  12. run_pipeline.py +6 -0
  13. training_model.py +13 -0
  14. utils.py +15 -0
DockerFile ADDED
@@ -0,0 +1,21 @@
+ # Use a base image with Python installed
+ FROM python:3.9
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the requirements file to the container
+ COPY requirements.txt .
+
+ # Install the required packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the source code to the container
+ COPY pipeline ./pipeline
+ COPY steps ./steps
+ COPY models ./models
+ COPY run_pipeline.py .
+ COPY utils ./utils
+
+ # Set the command to run when the container starts
+ CMD ["python", "run_pipeline.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Sujal Neupane
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,49 @@
- ---
- title: Text Summarization
- emoji: 🐠
- colorFrom: green
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.31.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ github: https://github.com/SujalNeupane9
+ linkedin: https://www.linkedin.com/in/sujal-neupane-2a9a2b210/
+
+ # Summarization
+
+ This project is a machine learning pipeline for natural language processing tasks. It contains a set of scripts and modules for training and evaluating summarization models on your own data.
+
+ ## Description
+ This repository contains sample code that demonstrates how to train a model for text summarization. The main focus is a basic project template from which the model can be smoothly deployed and used for inference.
+
+ ## Frameworks used
+ * PyTorch
+ * Transformers
+
+ ## Project Structure
+
+ * `pipeline`
+ This directory contains the code for the main data pipeline.
+
+ - `training_model.py`: Code for the training pipeline.
+ - `inference_pipeline.py`: Code for the inference pipeline.
+
+ * `steps`
+ This directory contains the individual steps of the data pipeline.
+
+ - `evaluation.py`: Code for evaluating the model.
+ - `ingest_data.py`: Code for ingesting data into the pipeline.
+ - `preprocess.py`: Data preprocessing code.
+ - `model_train.py`: Model training code.
+
+ * `utils`
+ This directory contains utility functions used throughout the project.
+ - `utils.py`: General utility functions.
+
+ * `run_pipeline.py`
+ The entry point for running the entire data pipeline.
+
+ * `Dockerfile`
+ The Dockerfile for building a Docker image of this project.
+
+ * `requirements.txt`
+ The Python packages required to run the project. Install them with `pip install -r requirements.txt`.
+
+ ## Demo
+ I have already trained a t5-base model and uploaded it to the Hugging Face Hub. The Streamlit demo can be accessed at the following link.
+ https://summarization-2s9wr7njxcgpeuuraprip5.streamlit.app/
+
+ ## License
+ This project is licensed under the MIT License - see the LICENSE file for details.
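For reference, a minimal sketch of querying the same checkpoint outside Streamlit; the model id is the one `app.py` (below) loads from the Hub, and the input text is a placeholder:

```python
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = "Neupane9Sujal/Text_Summarization"  # checkpoint used by app.py
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

text = "Long article text goes here..."  # placeholder input
inputs = tokenizer.encode(text, return_tensors="pt",
                          max_length=512, truncation=True).to(model.device)
summary_ids = model.generate(inputs, num_beams=4, max_length=264, early_stopping=True)
print(tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True))
```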
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,45 @@
+ import streamlit as st
+ import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+ # Load the fine-tuned T5 model and tokenizer
+ model_path = "Neupane9Sujal/Text_Summarization"
+ tokenizer = T5Tokenizer.from_pretrained(model_path)
+ model = T5ForConditionalGeneration.from_pretrained(model_path)
+
+ # Set device and move the model onto it, so it matches the inputs below
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = model.to(device)
+
+ # Function to generate summaries
+ def generate_summary(text):
+     # Tokenize input text
+     inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)
+
+     # Generate summary with beam search
+     summary_ids = model.generate(inputs, num_beams=4, max_length=264, early_stopping=True)
+     summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
+
+     return summary
+
+ # Streamlit app
+ def main():
+     st.title("Text Summarization")
+
+     # User input
+     user_input = st.text_area("Enter the text to summarize")
+
+     # Generate summary button
+     if st.button("Generate Summary"):
+         if user_input.strip() == "":
+             st.warning("Please enter some text.")
+         else:
+             # Generate summary
+             summary = generate_summary(user_input)
+
+             # Display summary
+             st.subheader("Summary")
+             st.write(summary)
+
+ if __name__ == "__main__":
+     main()
evaluation.py ADDED
@@ -0,0 +1,4 @@
+ def evaluate_model(trainer):
+     # Run evaluation and return the metrics so callers can use them
+     eval_metrics = trainer.evaluate()
+     return eval_metrics
inference_pipeline.py ADDED
@@ -0,0 +1,14 @@
+ from utils.utils import tokenize_for_inference
+ from transformers import AutoTokenizer
+
+ def infer_model(trainer):
+     tokenizer = AutoTokenizer.from_pretrained('t5-base')
+     text = input("Enter the text you want to summarize: ")
+     # Move the tokenized inputs to the same device as the trained model
+     tokenized = tokenize_for_inference(text).to(trainer.model.device)
+     generated = trainer.model.generate(tokenized, max_length=256)
+
+     # Convert the generated output back to text
+     summary = tokenizer.decode(generated.squeeze(), skip_special_tokens=True)
+     print(summary)
+     return summary
ingest_data.py ADDED
@@ -0,0 +1,6 @@
+ from datasets import load_dataset
+
+ def get_data():
+     # multi_news provides 'document' and 'summary' columns
+     dataset = load_dataset("multi_news")
+     return dataset
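As a quick sanity check of what this step returns, a small sketch assuming the public `multi_news` dataset (the slice size is arbitrary):

```python
from datasets import load_dataset

# Load a small slice so the check stays fast
dataset = load_dataset("multi_news", split="train[:5]")
print(dataset.column_names)         # ['document', 'summary']
print(dataset[0]["summary"][:200])  # first 200 characters of one reference summary
```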
model_train.py ADDED
@@ -0,0 +1,48 @@
+ import numpy as np
+ from rouge_score import rouge_scorer
+ from transformers import (Seq2SeqTrainer, Seq2SeqTrainingArguments,
+                           T5ForConditionalGeneration, AutoTokenizer,
+                           DataCollatorForSeq2Seq)
+
+ tokenizer = AutoTokenizer.from_pretrained('t5-base')
+
+ def compute_rouge_scores(predictions, references):
+     # Average ROUGE F1 over the evaluation set
+     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+     scores = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
+     return {k: sum(s[k].fmeasure for s in scores) / len(scores)
+             for k in ['rouge1', 'rouge2', 'rougeL']}
+
+ def train_model(tok_ds, num_train_epochs, batch_size):
+     model = T5ForConditionalGeneration.from_pretrained('t5-base')
+     # predict_with_generate makes eval predictions generated token ids
+     # (which can be decoded) rather than raw logits
+     training_args = Seq2SeqTrainingArguments(
+         output_dir="./output",
+         per_device_train_batch_size=batch_size,
+         per_device_eval_batch_size=batch_size,
+         save_total_limit=2,
+         num_train_epochs=num_train_epochs,
+         save_strategy="epoch",
+         learning_rate=2e-5,
+         weight_decay=0.01,
+         fp16=True,
+         predict_with_generate=True,
+     )
+     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=tok_ds["train"],
+         eval_dataset=tok_ds["validation"],
+         data_collator=data_collator,
+         compute_metrics=lambda p: compute_rouge_scores(
+             tokenizer.batch_decode(p.predictions, skip_special_tokens=True),
+             # labels are padded with -100; swap in the pad token before decoding
+             tokenizer.batch_decode(
+                 np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id),
+                 skip_special_tokens=True),
+         ),
+     )
+     trainer.train()
+     return trainer
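Once training finishes, the returned trainer can persist the weights so that `app.py` can load them from the Hub. A minimal sketch; the repo id is hypothetical and assumes you are logged in via `huggingface-cli login`:

```python
from steps.preprocess import preprocess
from steps.model_train import train_model, tokenizer

tok_ds = preprocess()
trainer = train_model(tok_ds, num_train_epochs=3, batch_size=8)
trainer.save_model("./output/final")                           # save locally
trainer.model.push_to_hub("your-username/Text_Summarization")  # hypothetical repo id
tokenizer.push_to_hub("your-username/Text_Summarization")
```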
preprocess.py ADDED
@@ -0,0 +1,27 @@
+ from transformers import AutoTokenizer
+ from .ingest_data import get_data
+
+ # t5-small shares its SentencePiece vocabulary with the t5-base model used downstream
+ model_nm = 't5-small'
+ tokenizer = AutoTokenizer.from_pretrained(model_nm)
+
+ def tokenize_data(x):
+     model_inputs = tokenizer(
+         x['document'],
+         max_length=512,
+         padding=True,
+         truncation=True
+     )
+     labels = tokenizer(
+         x['summary'],
+         max_length=512,
+         padding=True,
+         truncation=True
+     )
+     model_inputs['labels'] = labels['input_ids']
+     return model_inputs
+
+ def preprocess():
+     dataset = get_data()
+     tok_ds = dataset.map(tokenize_data, batched=True)
+     return tok_ds
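Before launching a full training run, the tokenized output can be spot-checked; note that this maps over the entire multi_news dataset, so it takes a while:

```python
from steps.preprocess import preprocess

tok_ds = preprocess()
sample = tok_ds["train"][0]
# Fields the trainer consumes (the original text columns are also retained)
print(sample.keys())             # includes 'input_ids', 'attention_mask', 'labels'
print(len(sample["input_ids"]))  # at most 512 after truncation
```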
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ sentencepiece
+ transformers
+ accelerate
+ torch
+ rouge-score
+ datasets
+ streamlit
run_pipeline.py ADDED
@@ -0,0 +1,6 @@
+ from pipeline.training_model import training_pipeline
+
+ if __name__ == "__main__":
+     num_train_epochs = 3
+     batch_size = 8
+     training_pipeline(num_train_epochs, batch_size)
training_model.py ADDED
@@ -0,0 +1,12 @@
+ from steps.preprocess import preprocess
+ from steps.model_train import train_model
+ from steps.evaluation import evaluate_model
+ from pipeline.inference_pipeline import infer_model
+
+
+ def training_pipeline(num_train_epochs, batch_size):
+     tok_ds = preprocess()
+     trainer = train_model(tok_ds, num_train_epochs, batch_size)
+     eval_metrics = evaluate_model(trainer)
+     print(eval_metrics)
+     infer_model(trainer)
utils.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ from transformers import AutoTokenizer
+
+ model_nm = 't5-base'
+ # Fall back to CPU when no GPU is available
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ def tokenize_for_inference(text):
+     tokenizer = AutoTokenizer.from_pretrained(model_nm)
+     model_inputs = tokenizer.encode(
+         text,
+         max_length=512,
+         padding=True,
+         truncation=True,
+         return_tensors='pt'
+     )
+     return model_inputs.to(device)