Spaces:

noahnsimbe
/

text-summarizer

Runtime error

App Files Community

Noah Nsimbe committed on Apr 14

Commit

b5498e2

•

1 Parent(s): c50f96e

update

Browse files

Files changed (3) hide show

app.ipynb +88 -11
app.py +74 -51
requirements.txt +4 -1

app.ipynb CHANGED Viewed

@@ -6,7 +6,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! pip install torch torchvision torchaudio pypdf"
    ]
   },
   {
@@ -15,8 +15,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
     "from transformers import pipeline\n",
-    "from pypdf import PdfReader"
    ]
   },
   {
@@ -25,7 +29,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")"
    ]
   },
   {
@@ -34,11 +46,61 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "reader = PdfReader(\"data/example.pdf\")\n",
-    "number_of_pages = len(reader.pages)\n",
-    "page = reader.pages[0]\n",
-    "text = page.extract_text()\n",
-    "text"
    ]
   },
   {
@@ -47,8 +109,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = summarizer(text, max_length=30, do_sample=False)\n",
-    "results"
    ]
   },
   {
@@ -57,7 +124,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results[0][\"summary_text\"]"
    ]
   }
  ],

    "metadata": {},
    "outputs": [],
    "source": [
+    "! pip install -U \"transformers[torch]\" pypdf python-docx langdetect sentencepiece sacremoses"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import gradio as gr\n",
     "from transformers import pipeline\n",
+    "from pypdf import PdfReader\n",
+    "import docx\n",
+    "import os\n",
+    "from langdetect import detect"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")\n",
+    "translator_to_french = pipeline(\n",
+    "    task=\"translation_en_to_fr\", model=\"Helsinki-NLP/opus-mt-en-fr\"\n",
+    ")\n",
+    "translator_to_english = pipeline(\n",
+    "    task=\"translation_fr_to_en\", model=\"Helsinki-NLP/opus-mt-fr-en\"\n",
+    ")\n",
+    "\n",
+    "MAX_FILE_SIZE = 10000000"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "class TextExtractor:\n",
+    "    def __init__(self, doc_location: str):\n",
+    "        if doc_location is None:\n",
+    "            raise Exception(f\"Please select a PDF to summarize\")\n",
+    "        self.doc_location = doc_location\n",
+    "\n",
+    "    def extract_text_from_pdf(self):\n",
+    "        reader = PdfReader(self.doc_location)\n",
+    "        text = \"\"\n",
+    "\n",
+    "        for page in reader.pages:\n",
+    "            text += page.extract_text()\n",
+    "\n",
+    "        return text\n",
+    "\n",
+    "    def extract_text_from_doc(self):\n",
+    "        doc = docx.Document(self.doc_location)\n",
+    "        text = \"\"\n",
+    "\n",
+    "        for paragraph in doc.paragraphs:\n",
+    "            text += paragraph.text + \"\\n\"\n",
+    "        return text\n",
+    "\n",
+    "    def extract_text_from_txt(self):\n",
+    "        with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
+    "            text = file.read()\n",
+    "        return text\n",
+    "\n",
+    "    def extract_text_from_txt(self):\n",
+    "        with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
+    "            text = file.read()\n",
+    "        return text\n",
+    "    \n",
+    "    def text_length(self):\n",
+    "        words = self.text.split()\n",
+    "        num_words = len(words)\n",
+    "        return num_words\n",
+    "\n",
+    "    def get_text(self) -> str:\n",
+    "        file_extension = os.path.splitext(self.doc_location)[1]\n",
+    "        if file_extension == \".pdf\":\n",
+    "            self.text = self.extract_text_from_pdf()\n",
+    "        elif file_extension == \".txt\":\n",
+    "            self.text = self.extract_text_from_txt()\n",
+    "        elif file_extension == \".docx\" or file_extension == \".doc\":\n",
+    "            self.text = self.extract_text_from_doc()\n",
+    "        else:\n",
+    "            raise gr.Error(f\"We only support .pdf, .txt, .doc and .docx files\")\n",
+    "\n",
+    "        if len(self.text) > MAX_FILE_SIZE:\n",
+    "            raise gr.Error(\n",
+    "                f\"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters.\"\n",
+    "            )\n",
+    "\n",
+    "        return self.text"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "text_extractor = TextExtractor(\"data/doc-file-example.docx\")\n",
+    "text = text_extractor.get_text()\n",
+    "\n",
+    "text_length = text_extractor.text_length()\n",
+    "summary_length = int(text_length / 2)\n",
+    "\n",
+    "summary = summarizer(text, max_length=summary_length, do_sample=False)[0][\"summary_text\"]"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "detected_lang = detect(summary)\n",
+    "detected_lang"
    ]
   }
  ],

app.py CHANGED Viewed

@@ -1,88 +1,111 @@
 import gradio as gr
-from transformers import BartTokenizer, BartForConditionalGeneration
 from pypdf import PdfReader
 import docx
 import os
-model_name = "facebook/bart-large-cnn"
-tokenizer = BartTokenizer.from_pretrained(model_name)
-model = BartForConditionalGeneration.from_pretrained(model_name)
-MAX_PDF_SIZE = 10000000
-def extract_text_from_pdf(pdf_file):
-    reader = PdfReader(pdf_file)
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
-    return text
-def extract_text_from_doc(docx_file):
-    doc = docx.Document(docx_file)
-    text = ""
-    for paragraph in doc.paragraphs:
-        text += paragraph.text + "\n"
-    return text
-def extract_text_from_txt(txt_file):
-    with open(txt_file, "r", encoding="utf-8") as file:
-        text = file.read()
-    return text
-def summarize(doc: str) -> str:
-    if doc is None:
-        raise gr.Error(f"Please select a PDF to summarize")
-    file_extension = os.path.splitext(doc)[1]
-    if file_extension == ".pdf":
-        text = extract_text_from_pdf(doc)
-    elif file_extension == ".txt":
-        text = extract_text_from_txt(doc)
-    elif file_extension == ".docx" or file_extension == ".doc":
-        text = extract_text_from_doc(doc)
-    else:
-        raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
-    text_length = len(text)
-    if text_length > MAX_PDF_SIZE:
-        raise gr.Error(
-            f"Document characters limit exceeded. Your document should not contain more than {MAX_PDF_SIZE} characters"
-        )
-    summary_length = int(text_length / 3)
-    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
-    results = model.generate(
-        inputs.input_ids,
-        num_beams=4,
-        min_length=30,
-        max_length=summary_length,
-        early_stopping=True,
-    )
-    summary = tokenizer.decode(results[0], skip_special_tokens=True)
     return summary
 app = gr.Interface(
-    summarize,
-    [
         gr.File(
             label="Document to summarize",
             file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
         ),
     ],
-    gr.Textbox(label="Summary"),
     examples=[
         ["data/pd-file-example.pdf"],
         ["data/doc-file-example.docx"],

 import gradio as gr
+from transformers import pipeline
 from pypdf import PdfReader
 import docx
 import os
+from langdetect import detect
+summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
+translator_to_french = pipeline(
+    task="translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr"
+)
+translator_to_english = pipeline(
+    task="translation_fr_to_en", model="Helsinki-NLP/opus-mt-fr-en"
+)
+MAX_FILE_SIZE = 10000000
+class TextExtractor:
+    def __init__(self, doc_location: str):
+        if doc_location is None:
+            raise Exception(f"Please select a PDF to summarize")
+        self.doc_location = doc_location
+    def extract_text_from_pdf(self):
+        reader = PdfReader(self.doc_location)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+        return text
+    def extract_text_from_doc(self):
+        doc = docx.Document(self.doc_location)
+        text = ""
+        for paragraph in doc.paragraphs:
+            text += paragraph.text + "\n"
+        return text
+    def extract_text_from_txt(self):
+        with open(self.doc_location, "r", encoding="utf-8") as file:
+            text = file.read()
+        return text
+    def extract_text_from_txt(self):
+        with open(self.doc_location, "r", encoding="utf-8") as file:
+            text = file.read()
+        return text
+    def text_length(self):
+        words = self.text.split()
+        num_words = len(words)
+        return num_words
+    def get_text(self) -> str:
+        file_extension = os.path.splitext(self.doc_location)[1]
+        if file_extension == ".pdf":
+            self.text = self.extract_text_from_pdf()
+        elif file_extension == ".txt":
+            self.text = self.extract_text_from_txt()
+        elif file_extension == ".docx" or file_extension == ".doc":
+            self.text = self.extract_text_from_doc()
+        else:
+            raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
+        if len(self.text) > MAX_FILE_SIZE:
+            raise gr.Error(
+                f"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters."
+            )
+        return self.text
+def summarize(doc: str, target_language: str) -> str:
+    text_extractor = TextExtractor(doc)
+    text = text_extractor.get_text()
+    text_length = text_extractor.text_length()
+    summary_length = int(text_length / 2)
+    summary = summarizer(text, max_length=summary_length, do_sample=False)[0]["summary_text"]
+    detected_lang = detect(summary)
+    if target_language is None:
+        pass
+    elif detected_lang == "fr" and str(target_language).lower() == "english":
+        summary = translator_to_english(summary)[0]["translation_text"]
+    elif detected_lang == "en" and str(target_language).lower() == "french":
+        summary = translator_to_french(summary)[0]["translation_text"]
     return summary
 app = gr.Interface(
+    fn=summarize,
+    inputs=[
         gr.File(
             label="Document to summarize",
             file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
         ),
+        gr.Radio(
+            label="Translate summary to", choices=["English", "French"], value="English"
+        ),
     ],
+    outputs=gr.Textbox(label="Summary"),
     examples=[
         ["data/pd-file-example.pdf"],
         ["data/doc-file-example.docx"],

requirements.txt CHANGED Viewed

@@ -3,4 +3,7 @@ torch
 torchvision
 torchaudio
 pypdf
-python-docx

 torchvision
 torchaudio
 pypdf
+python-docx
+langdetect
+sentencepiece
+sacremoses