Spaces:

noahnsimbe
/

text-summarizer

Runtime error

App Files Files Community

Noah Nsimbe committed on Apr 3

Commit

07c028c

•

1 Parent(s): 5977c4a

support for txt and doc files

Browse files

Files changed (7) hide show

README.md +4 -2
app.ipynb +11 -83
app.py +41 -6
data/doc-file-example.docx +0 -0
data/{example.pdf → pd-file-example.pdf} +0 -0
data/text-file-example.txt +1 -0
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: "Document Summarizer"
 emoji: 📄
 colorFrom: blue
 colorTo: gray
@@ -8,11 +8,13 @@ sdk_version: 4.24.0
 app_file: app.py
 pinned: true
 license: "mit"
-short_description: "Performs document summarization"
 tags:
   - "document"
   - "summary"
   - "summarizer"
 models:
   - "facebook/bart-large-cnn"
 ---

 ---
+title: "Text Summarizer"
 emoji: 📄
 colorFrom: blue
 colorTo: gray
 app_file: app.py
 pinned: true
 license: "mit"
+short_description: "Performs text summarization"
 tags:
   - "document"
+  - "text"
   - "summary"
   - "summarizer"
+  - "summarization"
 models:
   - "facebook/bart-large-cnn"
 ---

app.ipynb CHANGED Viewed

@@ -2,48 +2,18 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: torch in ./venv/lib/python3.11/site-packages (2.2.2)\n",
-      "Requirement already satisfied: torchvision in ./venv/lib/python3.11/site-packages (0.17.2)\n",
-      "Requirement already satisfied: torchaudio in ./venv/lib/python3.11/site-packages (2.2.2)\n",
-      "Requirement already satisfied: pypdf in ./venv/lib/python3.11/site-packages (4.1.0)\n",
-      "Requirement already satisfied: filelock in ./venv/lib/python3.11/site-packages (from torch) (3.13.3)\n",
-      "Requirement already satisfied: typing-extensions>=4.8.0 in ./venv/lib/python3.11/site-packages (from torch) (4.10.0)\n",
-      "Requirement already satisfied: sympy in ./venv/lib/python3.11/site-packages (from torch) (1.12)\n",
-      "Requirement already satisfied: networkx in ./venv/lib/python3.11/site-packages (from torch) (3.2.1)\n",
-      "Requirement already satisfied: jinja2 in ./venv/lib/python3.11/site-packages (from torch) (3.1.3)\n",
-      "Requirement already satisfied: fsspec in ./venv/lib/python3.11/site-packages (from torch) (2024.3.1)\n",
-      "Requirement already satisfied: numpy in ./venv/lib/python3.11/site-packages (from torchvision) (1.26.4)\n",
-      "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in ./venv/lib/python3.11/site-packages (from torchvision) (10.2.0)\n",
-      "Requirement already satisfied: MarkupSafe>=2.0 in ./venv/lib/python3.11/site-packages (from jinja2->torch) (2.1.5)\n",
-      "Requirement already satisfied: mpmath>=0.19 in ./venv/lib/python3.11/site-packages (from sympy->torch) (1.3.0)\n"
-     ]
-    }
-   ],
    "source": [
     "! pip install torch torchvision torchaudio pypdf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/noah/spaces/document-summarizer/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
    "source": [
     "from transformers import pipeline\n",
     "from pypdf import PdfReader"
@@ -51,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -60,20 +30,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Intr oducing\\nthe\\nlatest\\nsmar tphone\\nfr om\\nXYZ\\nT ech.\\nThe\\nXYZ\\nT ech\\nX10\\nf eatur es\\na\\nsleek\\ndesign,\\npower ful\\nper formance,\\nand\\nadv anced\\ncamer a\\ncapabilities.\\nWith\\na\\nquad-camer a\\nsetup,\\nincluding\\na\\n108MP\\nmain\\ncamer a\\nand\\na\\n5x\\noptical\\nz oom\\nlens,\\ny ou\\ncan\\ncaptur e\\nstunning\\nphot os\\nand\\nvideos\\nin\\nany\\nlighting\\ncondition.\\nThe\\nX10\\nalso\\nboasts\\na\\nhigh-r esolution\\nAMOLED\\ndispla y ,\\nfast-char ging\\ntechnology ,\\nand\\n5G\\nconnectivity\\nfor\\nseamless\\nbr owsing\\nand\\nstr eaming.'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "reader = PdfReader(\"data/example.pdf\")\n",
     "number_of_pages = len(reader.pages)\n",
@@ -84,29 +43,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Your min_length=56 must be inferior than your max_length=30.\n",
-      "/Users/noah/spaces/document-summarizer/venv/lib/python3.11/site-packages/transformers/generation/utils.py:1156: UserWarning: Unfeasible length constraints: `min_length` (56) is larger than the maximum possible length (30). Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[{'summary_text': 'The XYZT ech is a high-resolution camera with a range of up to 100 feet. The X10 is equipped with'}]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "results = summarizer(text, max_length=30, do_sample=False)\n",
     "results"
@@ -114,20 +53,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'The XYZT ech is a high-resolution camera with a range of up to 100 feet. The X10 is equipped with'"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "results[0][\"summary_text\"]"
    ]

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "! pip install torch torchvision torchaudio pypdf"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "from transformers import pipeline\n",
     "from pypdf import PdfReader"
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "reader = PdfReader(\"data/example.pdf\")\n",
     "number_of_pages = len(reader.pages)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "results = summarizer(text, max_length=30, do_sample=False)\n",
     "results"
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
     "results[0][\"summary_text\"]"
    ]

app.py CHANGED Viewed

@@ -1,22 +1,52 @@
 import gradio as gr
 from transformers import pipeline
 from pypdf import PdfReader
 summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
 MAX_PDF_SIZE = 10000000
-def summarize(doc: str) -> str:
-    if doc is None:
-        raise gr.Error(f"Please select a PDF to summarize")
-    reader = PdfReader(doc)
     text = ""
     for page in reader.pages:
         text += page.extract_text()
     text_length = len(text)
     if text_length > MAX_PDF_SIZE:
@@ -36,10 +66,15 @@ app = gr.Interface(
     [
         gr.File(
             label="Document to summarize",
         ),
     ],
     gr.Textbox(label="Summary"),
-    examples=[["data/example.pdf"]],
 )
 if __name__ == "__main__":

 import gradio as gr
 from transformers import pipeline
 from pypdf import PdfReader
+import docx
+import os
+# Summarization pipeline, created once at import time from facebook/bart-large-cnn.
 summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
+# summarize() compares len(text) of the extracted text against this limit, so it
+# counts characters and applies to every supported file type, not only PDFs.
 MAX_PDF_SIZE = 10000000
+def extract_text_from_pdf(pdf_file):
+    """Return the text of every page of a PDF as one string.
+
+    ``pdf_file`` is passed unchanged to ``PdfReader``. The text extracted
+    from each page is appended in page order with no separator added
+    between pages.
+    """
+    reader = PdfReader(pdf_file)
     text = ""
     for page in reader.pages:
         text += page.extract_text()
+    return text
+def extract_text_from_doc(docx_file):
+    """Return the text of a Word document, one paragraph per line.
+
+    ``docx_file`` is passed unchanged to ``docx.Document``. Each paragraph's
+    text is followed by a newline, so the result ends with a trailing
+    newline whenever the document has at least one paragraph.
+    """
+    # NOTE(review): summarize() also sends ".doc" files here; python-docx is
+    # documented for .docx -- confirm legacy .doc files actually open.
+    doc = docx.Document(docx_file)
+    text = ""
+    # NOTE(review): only doc.paragraphs is read; text stored elsewhere in the
+    # document (e.g. tables) is presumably not included -- confirm.
+    for paragraph in doc.paragraphs:
+        text += paragraph.text + "\n"
+    return text
+def extract_text_from_txt(txt_file):
+    """Return the entire contents of the text file at ``txt_file``.
+
+    The file is opened in text mode and decoded as UTF-8 with the default
+    (strict) error handling, so a file in another encoding raises
+    ``UnicodeDecodeError`` instead of returning garbled text.
+    """
+    with open(txt_file, "r", encoding="utf-8") as file:
+        text = file.read()
+    return text
+def summarize(doc: str) -> str:
+    if doc is None:
+        raise gr.Error(f"Please select a PDF to summarize")
+    file_extension = os.path.splitext(doc)[1]
+    if file_extension == ".pdf":
+        text = extract_text_from_pdf(doc)
+    elif file_extension == ".txt":
+        text = extract_text_from_txt(doc)
+    elif file_extension == ".docx" or file_extension == ".doc":
+        text = extract_text_from_doc(doc)
+    else:
+        raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
     text_length = len(text)
     if text_length > MAX_PDF_SIZE:
     [
         gr.File(
             label="Document to summarize",
+            file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
         ),
     ],
     gr.Textbox(label="Summary"),
+    examples=[
+        ["data/pd-file-example.pdf"],
+        ["data/doc-file-example.docx"],
+        ["data/text-file-example.txt"],
+    ],
 )
 if __name__ == "__main__":

data/doc-file-example.docx ADDED Viewed

Binary file (292 kB). View file

data/{example.pdf → pd-file-example.pdf} RENAMED Viewed

File without changes

data/text-file-example.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Researchers have discovered a new species of dinosaur in Argentina. The dinosaur, named Bajadasaurus pronuspinax, lived approximately 140 million years ago during the Cretaceous period. It was a herbivore with a long neck and spiky back, similar to the more well-known Stegosaurus. The discovery sheds light on the diversity of dinosaurs in South America during the Cretaceous period.

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ transformers
 torch
 torchvision
 torchaudio
-pypdf

 torch
 torchvision
 torchaudio
+pypdf
+python-docx