Spaces:

raj22rishi
/

Text_Summarization

Sleeping

App Files Files Community

raj22rishi commited on Feb 29

Commit

fb4a3c6

•

1 Parent(s): 30ddd1e

Upload 14 files

Browse files

Files changed (14) hide show

DockerFile +21 -0
LICENSE +21 -0
README.md +49 -12
__init__.py +0 -0
app.py +44 -0
evaluation.py +3 -0
inference_pipeline.py +13 -0
ingest_data.py +5 -0
model_train.py +28 -0
preprocess.py +26 -0
requirements.txt +7 -0
run_pipeline.py +6 -0
training_model.py +13 -0
utils.py +15 -0

DockerFile ADDED Viewed

	@@ -0,0 +1,21 @@

+# Use a base image with Python installed
+FROM python:3.9
+# Set the working directory in the container
+WORKDIR /app
+# Copy the requirements file to the container
+COPY requirements.txt .
+# Install the required packages
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the source code to the container
+COPY pipeline ./pipeline
+COPY steps ./steps
+COPY models ./models
+COPY run_pipeline.py .
+COPY utils ./utils
+# Set the command to run when the container starts
+CMD ["python", "run_pipeline.py"]

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Sujal Neupane
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,12 +1,49 @@
----
-title: Text Summarization
-emoji: 🐠
-colorFrom: green
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.31.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+github: https://github.com/SujalNeupane9
+linkedin: https://www.linkedin.com/in/sujal-neupane-2a9a2b210/
+# Summarization
+This project is a machine learning pipeline for natural language processing tasks. It contains a set of scripts and modules that allow you to train and evaluate various models on your own data.
+## Description
+This repository contains sample code that demonstrates how to train a model for text summarization. The main focus is to provide a basic project template from which the trained model can be smoothly deployed and used for inference.
+## Frameworks used:
+* PyTorch
+* Transformers
+## Project Structure
+* `pipeline`
+This directory contains the code for the main data pipeline.
+- `training_model.py`: Code for the training pipeline.
+- `inference_pipeline.py`: Code for the inference pipeline.
+ * `steps`
+This directory includes various steps involved in the data pipeline.
+- `evaluation.py`: Code for evaluating the model.
+- `ingest_data.py`: Code for ingesting data into the pipeline.
+  - `preprocess.py`: Data preprocessing code.
+  - `model_train.py`: Model training code.
+* `utils`
+This directory contains utility functions used throughout the project.
+  - `utils.py`: General utility functions.
+* `run_pipeline.py`
+This script is the entry point for running the entire data pipeline.
+* `Dockerfile`
+The Dockerfile for creating a Docker image for this project.
+* `requirements.txt`
+List of Python packages required for running the project. Install them using `pip install -r requirements.txt`.
+## Demo
+I have already trained a t5-base model and uploaded it to Hugging Face. The Streamlit demo can be accessed from the following link.
+https://summarization-2s9wr7njxcgpeuuraprip5.streamlit.app/
+## License
+This project is licensed under the MIT License - see the LICENSE file for details.

__init__.py ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import streamlit as st
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+# Load the fine-tuned T5 model and tokenizer
+model_path = "Neupane9Sujal/Text_Summarization"
+tokenizer = T5Tokenizer.from_pretrained(model_path)
+model = T5ForConditionalGeneration.from_pretrained(model_path)
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Function to generate summaries
+def generate_summary(text):
+    # Tokenize input text
+    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)
+    #st.write(inputs.shape)
+    # Generate summary
+    summary_ids = model.generate(inputs, num_beams=4, max_length=264, early_stopping=True)
+    summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
+    return summary
+# Streamlit app
+def main():
+    st.title("Text Summarization")
+    # User input
+    user_input = st.text_area("Enter the text to summarize")
+    # Generate summary button
+    if st.button("Generate Summary"):
+        if user_input.strip() == "":
+            st.warning("Please enter some text.")
+        else:
+            # Generate summary
+            summary = generate_summary(user_input)
+            # Display summary
+            st.subheader("Summary")
+            st.write(summary)
+if __name__ == "__main__":
+    main()

evaluation.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+
2	+ def evaluate_model(trainer):
3	+ eval_metrics = trainer.evaluate()

inference_pipeline.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from utils.utils import tokenize_for_inference
+from transformers import AutoTokenizer
+def infer_model(trainer):
+    tokenizer = AutoTokenizer.from_pretrained('t5-base')
+    text = input("Enter the text you want to summarize: ")
+    tokenized = tokenize_for_inference(text)
+    generated = trainer.model.generate(tokenized, max_length=256)
+    # Convert the generated output back to text
+    summary = tokenizer.decode(generated.squeeze(), skip_special_tokens=True)
+    print(summary)
+    return summary

ingest_data.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from datasets import load_dataset
+def get_data():
+    dataset = load_dataset("multi_news")
+    return dataset

model_train.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration
+def train_model(tok_ds,num_train_epochs,batch_size):
+    model = T5ForConditionalGeneration.from_pretrained('t5-base')
+    training_args = TrainingArguments(
+    output_dir="./output",
+    per_device_train_batch_size=batch_size,
+    per_device_eval_batch_size=batch_size,
+    save_total_limit=2,
+    num_train_epochs=num_train_epochs,
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    weight_decay=0.01,
+    fp16=True
+    )
+    trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tok_ds["train"],
+    eval_dataset=tok_ds["validation"],
+    #data_collator=data_collator,
+    compute_metrics=lambda p: compute_rouge_scores(
+        tokenizer.batch_decode(p.predictions, skip_special_tokens=True),
+        tokenizer.batch_decode(p.label_ids, skip_special_tokens=True),
+        ),
+    )
+    trainer.train()
+    return trainer

preprocess.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from transformers import AutoTokenizer
+from.ingest_data import get_data
+model_nm = 't5-small'
+tokenizer = AutoTokenizer.from_pretrained(model_nm)
+def tokenize_data(x):
+  model_inputs = tokenizer(
+      x['document'],
+      max_length = 512,
+      padding=True,
+      truncation=True
+  )
+  labels = tokenizer(
+      x['summary'],
+      max_length = 512,
+      padding = True,
+      truncation=True
+  )
+  model_inputs['labels'] = labels['input_ids']
+  return model_inputs
+def preprocess():
+    dataset = get_data()
+    tok_ds = dataset.map(tokenize_data, batched=True)
+    return tok_ds

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+sentencepiece
+transformers
+accelerate
+torch
+rouge-score
+datasets
+streamlit

run_pipeline.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from pipeline.training_model import training_pipeline
+if __name__ == "__main__":
+    num_train_epochs=3
+    batch_size=8
+    training_pipeline(num_train_epochs, batch_size)

training_model.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from steps.preprocess import preprocess
+from steps.model_train import train_model
+from steps.evaluation import evaluate_model
+from pipeline.inference_pipeline import infer_model
+def training_pipeline(num_train_epochs,batch_size):
+    tok_ds = preprocess()
+    #data_collator = DataCollatorForSeq2Seq(tokenizer,model=model,return_tensors='pt')
+    trainer = train_model(tok_ds, num_train_epochs, batch_size)
+    trained_model = trainer.model
+    eval_metric = evaluate_model(trainer)
+    infer_model(trainer)

utils.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from transformers import AutoTokenizer
+model_nm = 't5-base'
+device = 'cuda'
+def tokenize_for_inference(text):
+    tokenizer = AutoTokenizer.from_pretrained(model_nm)
+    model_inputs = tokenizer.encode(
+      text,
+      max_length = 512,
+      padding=True,
+      truncation=True,
+        return_tensors='pt'
+    )
+    return model_inputs.to(device)