Jhenderson112 committed on
Commit 19f61e3 · 1 Parent(s): fe64f77

Upload 6 files

Files changed (5)
  1. .DS_Store +0 -0
  2. .gitignore +3 -0
  3. Text_Summarization_T5.ipynb +791 -0
  4. app.py +38 -0
  5. requirements.txt +155 -0
.DS_Store ADDED
Binary file (8.2 kB).
 
.gitignore ADDED
@@ -0,0 +1,3 @@
+ 
+ *.bin
+ *.pt
Text_Summarization_T5.ipynb ADDED
@@ -0,0 +1,791 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c08e675e-437e-4e7d-baee-bd55dda74611",
+ "metadata": {},
+ "source": [
+ "# Abstractive Text Summarization with T5\n",
+ "\n",
+ "This implementation uses Hugging Face Transformers, in particular `AutoModelForSeq2SeqLM` and `AutoTokenizer`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a910e4b5-040d-4499-b5c2-32f3e1ac1c34",
+ "metadata": {},
+ "source": [
+ "## Importing libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "d22ee5a9-1981-4883-a926-db37905ec8b6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Setup done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Installs\n",
+ "!pip install -q evaluate py7zr rouge_score absl-py\n",
+ "\n",
+ "# Imports here\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import nltk\n",
+ "from nltk.tokenize import sent_tokenize\n",
+ "nltk.download(\"punkt\")\n",
+ "\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "import datasets\n",
+ "import transformers\n",
+ "from transformers import (\n",
+ " AutoModelForSeq2SeqLM,\n",
+ " Seq2SeqTrainingArguments,\n",
+ " Seq2SeqTrainer,\n",
+ " AutoTokenizer\n",
+ ")\n",
+ "import evaluate\n",
+ "\n",
+ "# Quality of life fixes\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "from pprint import pprint\n",
+ "\n",
+ "import os\n",
+ "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
+ "\n",
+ "from IPython.display import clear_output\n",
+ "\n",
+ "print(f\"PyTorch version: {torch.__version__}\")\n",
+ "print(f\"Transformers version: {transformers.__version__}\")\n",
+ "print(f\"Datasets version: {datasets.__version__}\")\n",
+ "print(f\"Evaluate version: {evaluate.__version__}\")\n",
+ "\n",
+ "# Get the samsum dataset\n",
+ "samsum = datasets.load_dataset('samsum')\n",
+ "clear_output()\n",
+ "print(\"Setup done!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "bafa753c-0746-4ece-b5eb-4511c9138b09",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'4.27.4'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Verify transformers version\n",
+ "transformers.__version__"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f15204cc-0f21-4dc9-a8e4-429c57b227a9",
+ "metadata": {},
+ "source": [
+ "## Playing around with the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ba5c1425-a776-4201-97e2-bd420ec112fe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " train: Dataset({\n",
+ " features: ['id', 'dialogue', 'summary'],\n",
+ " num_rows: 14732\n",
+ " })\n",
+ " test: Dataset({\n",
+ " features: ['id', 'dialogue', 'summary'],\n",
+ " num_rows: 819\n",
+ " })\n",
+ " validation: Dataset({\n",
+ " features: ['id', 'dialogue', 'summary'],\n",
+ " num_rows: 818\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The samsum dataset shape\n",
+ "samsum"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "5d53736c-a8c7-4fe3-b8f1-566c1d99162b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dialogue:\n",
+ "Ollie: How is your Hebrew?\r\n",
+ "Gabi: Not great. \r\n",
+ "Ollie: Could you translate a letter?\r\n",
+ "Gabi: From Hebrew to English maybe, the opposite I don’t think so\r\n",
+ "Gabi: My writing sucks\r\n",
+ "Ollie: Please help me. I don’t have anyone else to ask\r\n",
+ "Gabi: Send it to me. I’ll try. \n",
+ "\n",
+ " -------------------------------------------------- \n",
+ "\n",
+ "Summary:\n",
+ "Gabi knows a bit of Hebrew, though her writing isn't great. She will try to help Ollie translate a letter.\n"
+ ]
+ }
+ ],
+ "source": [
+ "rand_idx = np.random.randint(0, len(samsum['train']))\n",
+ "\n",
+ "print(f\"Dialogue:\\n{samsum['train'][rand_idx]['dialogue']}\")\n",
+ "print('\\n', '-'*50, '\\n')\n",
+ "print(f\"Summary:\\n{samsum['train'][rand_idx]['summary']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f95359e-c9c4-4ed5-9130-5e2b4a0a83ad",
+ "metadata": {},
+ "source": [
+ "## Preprocessing data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "50b572e6-b37a-4688-94c9-9c45a2c67c51",
+ "metadata": {},
+ "source": [
+ "I'm using the T5 model (Text-to-Text Transfer Transformer)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "13634dfe-5b1a-4515-9476-8ac0637d0362",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_ckpt = 't5-small'\n",
+ "\n",
+ "# Create the tokenizer from the pretrained checkpoint\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "6b0be9fc-029b-4057-9d08-29235e5b4573",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at C:\\Users\\QXLVR\\.cache\\huggingface\\datasets\\samsum\\samsum\\0.0.0\\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\\cache-78c13bd5dd6a016a.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Max source length: 512\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Map: 0%| | 0/15551 [00:00<?, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Max target length: 95\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import concatenate_datasets\n",
+ "# Find the max lengths of the source and target samples\n",
+ "# The maximum total input sequence length after tokenization. \n",
+ "# Sequences that are longer than this will be truncated, sequences shorter will be padded.\n",
+ "tokenized_inputs = concatenate_datasets([samsum[\"train\"], samsum[\"test\"]]).map(lambda x: tokenizer(x[\"dialogue\"], truncation=True), batched=True, remove_columns=[\"dialogue\", \"summary\"])\n",
+ "max_source_length = max([len(x) for x in tokenized_inputs[\"input_ids\"]])\n",
+ "print(f\"Max source length: {max_source_length}\")\n",
+ "\n",
+ "# The maximum total sequence length for target text after tokenization. \n",
+ "# Sequences that are longer than this will be truncated, sequences shorter will be padded.\n",
+ "tokenized_targets = concatenate_datasets([samsum[\"train\"], samsum[\"test\"]]).map(lambda x: tokenizer(x[\"summary\"], truncation=True), batched=True, remove_columns=[\"dialogue\", \"summary\"])\n",
+ "max_target_length = max([len(x) for x in tokenized_targets[\"input_ids\"]])\n",
+ "print(f\"Max target length: {max_target_length}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c43b0864-8b92-4cb9-b159-bc8ec15bcc2d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at C:\\Users\\QXLVR\\.cache\\huggingface\\datasets\\samsum\\samsum\\0.0.0\\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\\cache-073bbcc8f496f07c.arrow\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Map: 0%| | 0/819 [00:00<?, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading cached processed dataset at C:\\Users\\QXLVR\\.cache\\huggingface\\datasets\\samsum\\samsum\\0.0.0\\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\\cache-a43b31cabc78c9c3.arrow\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']\n"
+ ]
+ }
+ ],
+ "source": [
+ "def preprocess_function(\n",
+ " sample, \n",
+ " padding=\"max_length\", \n",
+ " max_source_length=max_source_length,\n",
+ " max_target_length=max_target_length\n",
+ "):\n",
+ " '''\n",
+ " A preprocessing function that will be applied across the dataset.\n",
+ " The inputs and targets will be tokenized and padded/truncated to the max lengths.\n",
+ "\n",
+ " Args:\n",
+ " sample: A dictionary containing the source and target texts (keys are \"dialogue\" and \"summary\") in a list.\n",
+ " padding: Whether to pad the inputs and targets to the max lengths.\n",
+ " max_source_length: The maximum length of the source text.\n",
+ " max_target_length: The maximum length of the target text.\n",
+ " '''\n",
+ " # Add prefix to the input for t5\n",
+ " inputs = ['summarize: ' + s for s in sample['dialogue']]\n",
+ " \n",
+ " # Tokenize inputs, specifying the padding, truncation and max_length\n",
+ " model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)\n",
+ "\n",
+ " # Tokenize targets with the `text_target` keyword argument\n",
+ " labels = tokenizer(text_target=sample['summary'], max_length=max_target_length, padding=padding, truncation=True)\n",
+ "\n",
+ " # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore padding in the loss\n",
+ " if padding == \"max_length\":\n",
+ " labels[\"input_ids\"] = [\n",
+ " [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels[\"input_ids\"]\n",
+ " ]\n",
+ "\n",
+ " # Format and return\n",
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
+ " return model_inputs\n",
+ "\n",
+ "# Map this preprocessing function to our datasets using .map on the samsum variable\n",
+ "tokenized_dataset = samsum.map(preprocess_function, batched=True, remove_columns=[\"dialogue\", \"summary\", \"id\"])\n",
+ "print(f\"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "3becd236-0097-4ae5-9bd6-a91ed332e748",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " train: Dataset({\n",
+ " features: ['input_ids', 'attention_mask', 'labels'],\n",
+ " num_rows: 14732\n",
+ " })\n",
+ " test: Dataset({\n",
+ " features: ['input_ids', 'attention_mask', 'labels'],\n",
+ " num_rows: 819\n",
+ " })\n",
+ " validation: Dataset({\n",
+ " features: ['input_ids', 'attention_mask', 'labels'],\n",
+ " num_rows: 818\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenized_dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "20110839-bb02-4d64-8de7-53253e3f7fe0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "metric = evaluate.load(\"rouge\")\n",
+ "clear_output()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "ca00f91d-8453-4496-a064-525ef437198f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def postprocess_text(preds, labels):\n",
+ " '''\n",
+ " A simple post-processing function to clean up the predictions and labels\n",
+ "\n",
+ " Args:\n",
+ " preds: List[str] of predictions\n",
+ " labels: List[str] of labels\n",
+ " '''\n",
+ " \n",
+ " # strip whitespace on all sentences in preds and labels\n",
+ " preds = [p.strip(' ') for p in preds]\n",
+ " labels = [l.strip(' ') for l in labels]\n",
+ " \n",
+ " # rougeLSum expects newline after each sentence\n",
+ " preds = [\"\\n\".join(sent_tokenize(pred)) for pred in preds]\n",
+ " labels = [\"\\n\".join(sent_tokenize(label)) for label in labels]\n",
+ "\n",
+ " return preds, labels\n",
+ "\n",
+ "def compute_metrics(eval_preds):\n",
+ " \n",
+ " # Fetch the predictions and labels\n",
+ " preds, labels = eval_preds\n",
+ " if isinstance(preds, tuple):\n",
+ " preds = preds[0]\n",
+ " \n",
+ " # Decode the predictions back to text\n",
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
+ " \n",
+ " # Replace -100 in the labels as we can't decode them.\n",
+ " labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
+ " decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
+ "\n",
+ " # Some simple post-processing for ROUGE\n",
+ " decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)\n",
+ "\n",
+ " # Compute ROUGE on the decoded predictions and the decoded labels\n",
+ " result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)\n",
+ " \n",
+ " result = {k: round(v * 100, 4) for k, v in result.items()}\n",
+ " prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]\n",
+ " result[\"gen_len\"] = np.mean(prediction_lens)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b244846-2ebf-4019-a577-3ef07e350f7c",
+ "metadata": {},
+ "source": [
+ "## Creating the model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "49c1ac7c-6400-4a67-b32b-5bdc7330d790",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the pretrained model with AutoModelForSeq2SeqLM, using the model_ckpt variable\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)\n",
+ "\n",
+ "clear_output()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "e027b290-c04f-4241-b238-41787f32abe0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# we want to ignore tokenizer pad token in the loss\n",
+ "label_pad_token_id = -100\n",
+ "\n",
+ "# Data Collator, specifying the tokenizer, model, and label_pad_token_id\n",
+ "# pad_to_multiple_of=8 to speed up training\n",
+ "data_collator = transformers.DataCollatorForSeq2Seq(\n",
+ " tokenizer,\n",
+ " model=model,\n",
+ " label_pad_token_id=label_pad_token_id,\n",
+ " pad_to_multiple_of=8\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "0d20ee86-ac8c-4ae7-9e7c-92283e879e00",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).\n"
+ ]
+ }
+ ],
+ "source": [
+ "import logging\n",
+ "logging.getLogger(\"transformers\").setLevel(logging.WARNING)\n",
+ "\n",
+ "\n",
+ "# Define training hyperparameters in Seq2SeqTrainingArguments\n",
+ "training_args = Seq2SeqTrainingArguments(\n",
+ " output_dir=\"./t5_samsum\", # the output directory\n",
+ " logging_strategy=\"epoch\",\n",
+ " save_strategy=\"epoch\",\n",
+ " evaluation_strategy=\"epoch\",\n",
+ " learning_rate=2e-5,\n",
+ " num_train_epochs=5,\n",
+ " predict_with_generate=True,\n",
+ " per_device_train_batch_size=8,\n",
+ " per_device_eval_batch_size=8,\n",
+ " weight_decay=0.01,\n",
+ " load_best_model_at_end=True,\n",
+ " logging_steps=50,\n",
+ " logging_first_step=False,\n",
+ " fp16=False\n",
+ ")\n",
+ "\n",
+ "# index into the tokenized_dataset variable to get the training and validation data\n",
+ "training_data = tokenized_dataset['train']\n",
+ "eval_data = tokenized_dataset['validation']\n",
+ "\n",
+ "# Create the Trainer for the model\n",
+ "trainer = Seq2SeqTrainer(\n",
+ " model=model, # the model to be trained\n",
+ " args=training_args, # training arguments\n",
+ " train_dataset=training_data, # the training dataset\n",
+ " eval_dataset=eval_data, # the validation dataset\n",
+ " tokenizer=tokenizer, # the tokenizer we used to tokenize our data\n",
+ " compute_metrics=compute_metrics, # the function we defined above to compute metrics\n",
+ " data_collator=data_collator # the data collator we defined above\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "a3b5f21d-b4cb-4f8b-a7fc-cf132ef43c65",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "TrainOutput(global_step=9210, training_loss=1.9861197174436753, metrics={'train_runtime': 3551.1547, 'train_samples_per_second': 20.743, 'train_steps_per_second': 2.594, 'total_flos': 9969277096427520.0, 'train_loss': 1.9861197174436753, 'epoch': 5.0})\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Train the model (this will take a while!)\n",
+ "results = trainer.train()\n",
+ "clear_output()\n",
+ "pprint(results)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ddf8c308",
+ "metadata": {},
+ "source": [
+ "## Evaluating the model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "03e94a7f-2d26-48eb-ab17-cb58b14b93f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res = trainer.evaluate()\n",
+ "clear_output()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "23675ccb-071c-4a4f-8e42-1a71dc628a5c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>eval_loss</th>\n",
+ " <th>eval_rouge1</th>\n",
+ " <th>eval_rouge2</th>\n",
+ " <th>eval_rougeL</th>\n",
+ " <th>eval_rougeLsum</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>t5-small</th>\n",
+ " <td>1.764253</td>\n",
+ " <td>100.0</td>\n",
+ " <td>100.0</td>\n",
+ " <td>100.0</td>\n",
+ " <td>100.0</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " eval_loss eval_rouge1 eval_rouge2 eval_rougeL eval_rougeLsum\n",
+ "t5-small 1.764253 100.0 100.0 100.0 100.0"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cols = [\"eval_loss\", \"eval_rouge1\", \"eval_rouge2\", \"eval_rougeL\", \"eval_rougeLsum\"]\n",
+ "filtered_scores = dict((x , res[x]) for x in cols)\n",
+ "pd.DataFrame([filtered_scores], index=[model_ckpt])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "7c59a731",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "summarizer_pipeline = pipeline(\"summarization\",\n",
+ " model=model,\n",
+ " tokenizer=tokenizer,\n",
+ " device=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "5138f2bc",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dialogue: Adelina: Hi handsome. Where you you come from?\r\n",
+ "Cyprien: What do you mean?\r\n",
+ "Adelina: What do you mean, \"what do you mean\"? It's a simple question, where do you come from?\r\n",
+ "Cyprien: Well I was born in Jarrow, live in London now, so you could say I came from either of those places\r\n",
+ "Cyprien: I was educated in Loughborouogh, so in a sense I came from there.\r\n",
+ "Adelina: OK. \r\n",
+ "Cyprien: In another sense I come from my mother's vagina, but I dare say everyone can say that.\r\n",
+ "Adelina: Are you all right?\r\n",
+ "Cyprien: IN another sense I come from the atoms in the air that I breath or the food I eat, which comes to me from many places, so all I can say is \"I come from Planet Earth\".\r\n",
+ "Adelina: OK, bye. If you're gonna be a dick...\r\n",
+ "Cyprien: Wait, what you got against earthlings?\n",
+ "-------------------------\n",
+ "True Summary: Cyprien irritates Adelina by giving too many responses.\n",
+ "-------------------------\n",
+ "Model Summary: Cyprien came from Jarrow, live in London. She came from Loughborouogh, and came from her mother's vagina.\n",
+ "-------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "rand_idx = np.random.randint(low=0, high=len(samsum[\"test\"]))\n",
+ "sample = samsum[\"test\"][rand_idx]\n",
+ "\n",
+ "dialog = sample[\"dialogue\"]\n",
+ "true_summary = sample[\"summary\"]\n",
+ "\n",
+ "model_summary = summarizer_pipeline(dialog)\n",
+ "clear_output()\n",
+ "\n",
+ "print(f\"Dialogue: {dialog}\")\n",
+ "print(\"-\"*25)\n",
+ "print(f\"True Summary: {true_summary}\")\n",
+ "print(\"-\"*25)\n",
+ "print(f\"Model Summary: {model_summary[0]['summary_text']}\")\n",
+ "print(\"-\"*25)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "f051655f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Your max_length is set to 200, but you input_length is only 94. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Original Text:\n",
+ "\n",
+ "Andy: I need you to come in to work on the weekend.\n",
+ "David: Why boss? I have plans to go on a concert I might not be able to come on the weekend.\n",
+ "Andy: It's important we need to get our paperwork all sorted out for this year. Corporate needs it.\n",
+ "David: But I already made plans and this is news to me on very short notice.\n",
+ "Andy: Be there or you'r fired\n",
+ "\n",
+ "\n",
+ " -------------------------------------------------- \n",
+ "\n",
+ "Generated Summary: \n",
+ "[{'summary_text': 'David has plans to go on a concert. Andy needs to get his paperwork all sorted out for this year. David already made plans.'}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "def create_summary(input_text, model_pipeline=summarizer_pipeline):\n",
+ " summary = model_pipeline(input_text)\n",
+ " return summary\n",
+ "\n",
+ "text = '''\n",
+ "Andy: I need you to come in to work on the weekend.\n",
+ "David: Why boss? I have plans to go on a concert I might not be able to come on the weekend.\n",
+ "Andy: It's important we need to get our paperwork all sorted out for this year. Corporate needs it.\n",
+ "David: But I already made plans and this is news to me on very short notice.\n",
+ "Andy: Be there or you'r fired\n",
+ "'''\n",
+ "\n",
+ "print(f\"Original Text:\\n{text}\")\n",
+ "print('\\n', '-'*50, '\\n')\n",
+ "\n",
+ "summary = create_summary(text)\n",
+ "\n",
+ "print(f\"Generated Summary: \\n{summary}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad5d29a0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
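
The notebook saves checkpoints under `./t5_samsum` (the `output_dir` above) but never reloads them. Below is a minimal inference sketch; it assumes the training run left a loadable checkpoint there (e.g. via `trainer.save_model("./t5_samsum")` or one of the per-epoch `checkpoint-*` folders — the exact directory name is an assumption, not something this commit guarantees).

# Minimal inference sketch. "./t5_samsum" is a placeholder taken from
# output_dir above; point ckpt_dir at whatever checkpoint folder the run produced.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ckpt_dir = "./t5_samsum"  # or e.g. a "./t5_samsum/checkpoint-<step>" subfolder

tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir)

def summarize(dialogue: str) -> str:
    # T5 needs the same "summarize: " task prefix used during preprocessing
    inputs = tokenizer("summarize: " + dialogue, return_tensors="pt",
                       truncation=True, max_length=512)
    output_ids = model.generate(**inputs, max_new_tokens=95, num_beams=4)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(summarize("Ollie: Could you translate a letter?\nGabi: Send it to me. I'll try."))
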
app.py ADDED
@@ -0,0 +1,38 @@
+ from flask import Flask, render_template, request, jsonify
+ from tf_model_api.model_api import ModelAPI
+ 
+ app = Flask(__name__)
+ # Create the model class object
+ summarizer_model = ModelAPI()
+ 
+ @app.route('/')
+ def index():
+     data = {
+         'prompts': ''
+     }
+     return render_template('index.html', data=data)
+ 
+ @app.route('/create-summary', methods=['POST'])
+ def create_summary_response():
+     """
+     Create a summary using the input received
+     from the user.
+     """
+ 
+     data = request.get_json()  # Extract the JSON data from the request
+     text = data.get('text')  # Get the 'text' field from the JSON data
+ 
+     summary = summarizer_model.get_summary(text)
+     if summary:
+         result = {
+             'status': 'success',
+             'result': summary}
+         return jsonify(result), 200
+     else:
+         result = {
+             'status': 'fail'
+         }
+         return jsonify(result), 400
+ 
+ if __name__ == '__main__':
+     app.run()
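
app.py imports `ModelAPI` from `tf_model_api.model_api`, which is not among the files shown here (the commit message says 6 files, but only 5 appear). A hypothetical sketch of that wrapper, inferred purely from app.py's two call sites (`ModelAPI()` and `.get_summary(text)`), might look like:

# Hypothetical tf_model_api/model_api.py -- this module is NOT part of the
# files shown in this commit; names and behavior are inferred from app.py.
from transformers import pipeline

class ModelAPI:
    def __init__(self, model_dir="./t5_samsum"):
        # model_dir is a placeholder; point it at the fine-tuned checkpoint
        self.summarizer = pipeline("summarization", model=model_dir, tokenizer=model_dir)

    def get_summary(self, text):
        # Return None for empty input so app.py's `if summary:` branch sends a 400
        if not text:
            return None
        return self.summarizer(text)[0]["summary_text"]

With the server running (python app.py), the endpoint could be exercised like this, assuming the default Flask dev server address:

import requests

resp = requests.post("http://127.0.0.1:5000/create-summary",
                     json={"text": "Andy: Be there or you're fired"})
print(resp.status_code, resp.json())
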
requirements.txt ADDED
@@ -0,0 +1,155 @@
+ absl-py==2.0.0
+ accelerate==0.23.0
+ aiohttp==3.8.5
+ aiosignal==1.3.1
+ anyio==4.0.0
+ appnope==0.1.3
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.2.3
+ asttokens==2.4.0
+ async-lru==2.0.4
+ async-timeout==4.0.3
+ attrs==23.1.0
+ Babel==2.12.1
+ backcall==0.2.0
+ beautifulsoup4==4.12.2
+ bleach==6.0.0
+ blinker==1.6.2
+ Brotli==1.1.0
+ certifi==2023.7.22
+ cffi==1.15.1
+ charset-normalizer==3.2.0
+ click==8.1.7
+ comm==0.1.4
+ contourpy==1.1.1
+ cycler==0.11.0
+ datasets==2.14.5
+ debugpy==1.8.0
+ decorator==5.1.1
+ defusedxml==0.7.1
+ dill==0.3.7
+ evaluate==0.4.0
+ executing==1.2.0
+ fastjsonschema==2.18.0
+ filelock==3.12.4
+ Flask==2.3.3
+ fonttools==4.42.1
+ fqdn==1.5.1
+ frozenlist==1.4.0
+ fsspec==2023.6.0
+ huggingface-hub==0.17.3
+ idna==3.4
+ inflate64==0.3.1
+ ipykernel==6.25.2
+ ipython==8.15.0
+ ipython-genutils==0.2.0
+ ipywidgets==8.1.1
+ isoduration==20.11.0
+ itsdangerous==2.1.2
+ jedi==0.19.0
+ Jinja2==3.1.2
+ joblib==1.3.2
+ json5==0.9.14
+ jsonpointer==2.4
+ jsonschema==4.19.1
+ jsonschema-specifications==2023.7.1
+ jupyter==1.0.0
+ jupyter-console==6.6.3
+ jupyter-events==0.7.0
+ jupyter-lsp==2.2.0
+ jupyter_client==8.3.1
+ jupyter_core==5.3.2
+ jupyter_server==2.7.3
+ jupyter_server_terminals==0.4.4
+ jupyterlab==4.0.6
+ jupyterlab-pygments==0.2.2
+ jupyterlab-widgets==3.0.9
+ jupyterlab_server==2.25.0
+ kiwisolver==1.4.5
+ MarkupSafe==2.1.3
+ matplotlib==3.8.0
+ matplotlib-inline==0.1.6
+ mistune==3.0.1
+ mpmath==1.3.0
+ multidict==6.0.4
+ multiprocess==0.70.15
+ multivolumefile==0.2.3
+ nbclient==0.8.0
+ nbconvert==7.8.0
+ nbformat==5.9.2
+ nest-asyncio==1.5.8
+ networkx==3.1
+ nltk==3.8.1
+ notebook==7.0.4
+ notebook_shim==0.2.3
+ numpy==1.26.0
+ overrides==7.4.0
+ packaging==23.1
+ pandas==2.1.1
+ pandocfilters==1.5.0
+ parso==0.8.3
+ pexpect==4.8.0
+ pickleshare==0.7.5
+ Pillow==10.0.1
+ platformdirs==3.10.0
+ prometheus-client==0.17.1
+ prompt-toolkit==3.0.39
+ psutil==5.9.5
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ py7zr==0.20.6
+ pyarrow==13.0.0
+ pybcj==1.0.1
+ pycparser==2.21
+ pycryptodomex==3.19.0
+ Pygments==2.16.1
+ pyparsing==3.1.1
+ pyppmd==1.0.0
+ python-dateutil==2.8.2
+ python-json-logger==2.0.7
+ pytz==2023.3.post1
+ PyYAML==6.0.1
+ pyzmq==25.1.1
+ pyzstd==0.15.9
+ qtconsole==5.4.4
+ QtPy==2.4.0
+ referencing==0.30.2
+ regex==2023.8.8
+ requests==2.31.0
+ responses==0.18.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rouge-score==0.1.2
+ rpds-py==0.10.3
+ safetensors==0.3.3
+ seaborn==0.12.2
+ Send2Trash==1.8.2
+ six==1.16.0
+ sniffio==1.3.0
+ soupsieve==2.5
+ stack-data==0.6.2
+ sympy==1.12
+ terminado==0.17.1
+ texttable==1.6.7
+ tinycss2==1.2.1
+ tokenizers==0.13.3
+ torch==2.0.1
+ torchaudio==2.0.2
+ torchvision==0.15.2
+ tornado==6.3.3
+ tqdm==4.66.1
+ traitlets==5.10.1
+ transformers==4.33.3
+ typing_extensions==4.8.0
+ tzdata==2023.3
+ uri-template==1.3.0
+ urllib3==2.0.5
+ wcwidth==0.2.6
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.6.3
+ Werkzeug==2.3.7
+ widgetsnbextension==4.0.9
+ xxhash==3.3.0
+ yarl==1.9.2