Noah Nsimbe committed on
Commit
07c028c
1 Parent(s): 5977c4a

support for txt and doc files

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: "Document Summarizer"
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: gray
@@ -8,11 +8,13 @@ sdk_version: 4.24.0
8
  app_file: app.py
9
  pinned: true
10
  license: "mit"
11
- short_description: "Performs document summarization"
12
  tags:
13
  - "document"
 
14
  - "summary"
15
  - "summarizer"
 
16
  models:
17
  - "facebook/bart-large-cnn"
18
  ---
 
1
  ---
2
+ title: "Text Summarizer"
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: gray
 
8
  app_file: app.py
9
  pinned: true
10
  license: "mit"
11
+ short_description: "Performs text summarization"
12
  tags:
13
  - "document"
14
+ - "text"
15
  - "summary"
16
  - "summarizer"
17
+ - "summarization"
18
  models:
19
  - "facebook/bart-large-cnn"
20
  ---
app.ipynb CHANGED
@@ -2,48 +2,18 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
- "outputs": [
8
- {
9
- "name": "stdout",
10
- "output_type": "stream",
11
- "text": [
12
- "Requirement already satisfied: torch in ./venv/lib/python3.11/site-packages (2.2.2)\n",
13
- "Requirement already satisfied: torchvision in ./venv/lib/python3.11/site-packages (0.17.2)\n",
14
- "Requirement already satisfied: torchaudio in ./venv/lib/python3.11/site-packages (2.2.2)\n",
15
- "Requirement already satisfied: pypdf in ./venv/lib/python3.11/site-packages (4.1.0)\n",
16
- "Requirement already satisfied: filelock in ./venv/lib/python3.11/site-packages (from torch) (3.13.3)\n",
17
- "Requirement already satisfied: typing-extensions>=4.8.0 in ./venv/lib/python3.11/site-packages (from torch) (4.10.0)\n",
18
- "Requirement already satisfied: sympy in ./venv/lib/python3.11/site-packages (from torch) (1.12)\n",
19
- "Requirement already satisfied: networkx in ./venv/lib/python3.11/site-packages (from torch) (3.2.1)\n",
20
- "Requirement already satisfied: jinja2 in ./venv/lib/python3.11/site-packages (from torch) (3.1.3)\n",
21
- "Requirement already satisfied: fsspec in ./venv/lib/python3.11/site-packages (from torch) (2024.3.1)\n",
22
- "Requirement already satisfied: numpy in ./venv/lib/python3.11/site-packages (from torchvision) (1.26.4)\n",
23
- "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in ./venv/lib/python3.11/site-packages (from torchvision) (10.2.0)\n",
24
- "Requirement already satisfied: MarkupSafe>=2.0 in ./venv/lib/python3.11/site-packages (from jinja2->torch) (2.1.5)\n",
25
- "Requirement already satisfied: mpmath>=0.19 in ./venv/lib/python3.11/site-packages (from sympy->torch) (1.3.0)\n"
26
- ]
27
- }
28
- ],
29
  "source": [
30
  "! pip install torch torchvision torchaudio pypdf"
31
  ]
32
  },
33
  {
34
  "cell_type": "code",
35
- "execution_count": 2,
36
  "metadata": {},
37
- "outputs": [
38
- {
39
- "name": "stderr",
40
- "output_type": "stream",
41
- "text": [
42
- "/Users/noah/spaces/document-summarizer/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
43
- " from .autonotebook import tqdm as notebook_tqdm\n"
44
- ]
45
- }
46
- ],
47
  "source": [
48
  "from transformers import pipeline\n",
49
  "from pypdf import PdfReader"
@@ -51,7 +21,7 @@
51
  },
52
  {
53
  "cell_type": "code",
54
- "execution_count": 3,
55
  "metadata": {},
56
  "outputs": [],
57
  "source": [
@@ -60,20 +30,9 @@
60
  },
61
  {
62
  "cell_type": "code",
63
- "execution_count": 4,
64
  "metadata": {},
65
- "outputs": [
66
- {
67
- "data": {
68
- "text/plain": [
69
- "'Intr oducing\\nthe\\nlatest\\nsmar tphone\\nfr om\\nXYZ\\nT ech.\\nThe\\nXYZ\\nT ech\\nX10\\nf eatur es\\na\\nsleek\\ndesign,\\npower ful\\nper formance,\\nand\\nadv anced\\ncamer a\\ncapabilities.\\nWith\\na\\nquad-camer a\\nsetup,\\nincluding\\na\\n108MP\\nmain\\ncamer a\\nand\\na\\n5x\\noptical\\nz oom\\nlens,\\ny ou\\ncan\\ncaptur e\\nstunning\\nphot os\\nand\\nvideos\\nin\\nany\\nlighting\\ncondition.\\nThe\\nX10\\nalso\\nboasts\\na\\nhigh-r esolution\\nAMOLED\\ndispla y ,\\nfast-char ging\\ntechnology ,\\nand\\n5G\\nconnectivity\\nfor\\nseamless\\nbr owsing\\nand\\nstr eaming.'"
70
- ]
71
- },
72
- "execution_count": 4,
73
- "metadata": {},
74
- "output_type": "execute_result"
75
- }
76
- ],
77
  "source": [
78
  "reader = PdfReader(\"data/example.pdf\")\n",
79
  "number_of_pages = len(reader.pages)\n",
@@ -84,29 +43,9 @@
84
  },
85
  {
86
  "cell_type": "code",
87
- "execution_count": 7,
88
  "metadata": {},
89
- "outputs": [
90
- {
91
- "name": "stderr",
92
- "output_type": "stream",
93
- "text": [
94
- "Your min_length=56 must be inferior than your max_length=30.\n",
95
- "/Users/noah/spaces/document-summarizer/venv/lib/python3.11/site-packages/transformers/generation/utils.py:1156: UserWarning: Unfeasible length constraints: `min_length` (56) is larger than the maximum possible length (30). Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.\n",
96
- " warnings.warn(\n"
97
- ]
98
- },
99
- {
100
- "data": {
101
- "text/plain": [
102
- "[{'summary_text': 'The XYZT ech is a high-resolution camera with a range of up to 100 feet. The X10 is equipped with'}]"
103
- ]
104
- },
105
- "execution_count": 7,
106
- "metadata": {},
107
- "output_type": "execute_result"
108
- }
109
- ],
110
  "source": [
111
  "results = summarizer(text, max_length=30, do_sample=False)\n",
112
  "results"
@@ -114,20 +53,9 @@
114
  },
115
  {
116
  "cell_type": "code",
117
- "execution_count": 8,
118
  "metadata": {},
119
- "outputs": [
120
- {
121
- "data": {
122
- "text/plain": [
123
- "'The XYZT ech is a high-resolution camera with a range of up to 100 feet. The X10 is equipped with'"
124
- ]
125
- },
126
- "execution_count": 8,
127
- "metadata": {},
128
- "output_type": "execute_result"
129
- }
130
- ],
131
  "source": [
132
  "results[0][\"summary_text\"]"
133
  ]
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": null,
6
  "metadata": {},
7
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "source": [
9
  "! pip install torch torchvision torchaudio pypdf"
10
  ]
11
  },
12
  {
13
  "cell_type": "code",
14
+ "execution_count": null,
15
  "metadata": {},
16
+ "outputs": [],
 
 
 
 
 
 
 
 
 
17
  "source": [
18
  "from transformers import pipeline\n",
19
  "from pypdf import PdfReader"
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": null,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
 
30
  },
31
  {
32
  "cell_type": "code",
33
+ "execution_count": null,
34
  "metadata": {},
35
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
36
  "source": [
37
  "reader = PdfReader(\"data/example.pdf\")\n",
38
  "number_of_pages = len(reader.pages)\n",
 
43
  },
44
  {
45
  "cell_type": "code",
46
+ "execution_count": null,
47
  "metadata": {},
48
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "source": [
50
  "results = summarizer(text, max_length=30, do_sample=False)\n",
51
  "results"
 
53
  },
54
  {
55
  "cell_type": "code",
56
+ "execution_count": null,
57
  "metadata": {},
58
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
59
  "source": [
60
  "results[0][\"summary_text\"]"
61
  ]
app.py CHANGED
@@ -1,22 +1,52 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
  from pypdf import PdfReader
 
 
4
 
5
  summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
6
 
7
  MAX_PDF_SIZE = 10000000
8
 
9
 
10
- def summarize(doc: str) -> str:
11
- if doc is None:
12
- raise gr.Error(f"Please select a PDF to summarize")
13
-
14
- reader = PdfReader(doc)
15
  text = ""
16
 
17
  for page in reader.pages:
18
  text += page.extract_text()
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  text_length = len(text)
21
 
22
  if text_length > MAX_PDF_SIZE:
@@ -36,10 +66,15 @@ app = gr.Interface(
36
  [
37
  gr.File(
38
  label="Document to summarize",
 
39
  ),
40
  ],
41
  gr.Textbox(label="Summary"),
42
- examples=[["data/example.pdf"]],
 
 
 
 
43
  )
44
 
45
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  from pypdf import PdfReader
4
+ import docx
5
+ import os
6
 
7
  summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
8
 
9
  MAX_PDF_SIZE = 10000000
10
 
11
 
12
+ def extract_text_from_pdf(pdf_file):
13
+ reader = PdfReader(pdf_file)
 
 
 
14
  text = ""
15
 
16
  for page in reader.pages:
17
  text += page.extract_text()
18
 
19
+ return text
20
+
21
+
22
+ def extract_text_from_doc(docx_file):
23
+ doc = docx.Document(docx_file)
24
+ text = ""
25
+ for paragraph in doc.paragraphs:
26
+ text += paragraph.text + "\n"
27
+ return text
28
+
29
+
30
+ def extract_text_from_txt(txt_file):
31
+ with open(txt_file, "r", encoding="utf-8") as file:
32
+ text = file.read()
33
+ return text
34
+
35
+
36
+ def summarize(doc: str) -> str:
37
+ if doc is None:
38
+ raise gr.Error(f"Please select a PDF to summarize")
39
+
40
+ file_extension = os.path.splitext(doc)[1]
41
+ if file_extension == ".pdf":
42
+ text = extract_text_from_pdf(doc)
43
+ elif file_extension == ".txt":
44
+ text = extract_text_from_txt(doc)
45
+ elif file_extension == ".docx" or file_extension == ".doc":
46
+ text = extract_text_from_doc(doc)
47
+ else:
48
+ raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
49
+
50
  text_length = len(text)
51
 
52
  if text_length > MAX_PDF_SIZE:
 
66
  [
67
  gr.File(
68
  label="Document to summarize",
69
+ file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
70
  ),
71
  ],
72
  gr.Textbox(label="Summary"),
73
+ examples=[
74
+ ["data/pd-file-example.pdf"],
75
+ ["data/doc-file-example.docx"],
76
+ ["data/text-file-example.txt"],
77
+ ],
78
  )
79
 
80
  if __name__ == "__main__":
data/doc-file-example.docx ADDED
Binary file (292 kB). View file
 
data/{example.pdf → pd-file-example.pdf} RENAMED
File without changes
data/text-file-example.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Researchers have discovered a new species of dinosaur in Argentina. The dinosaur, named Bajadasaurus pronuspinax, lived approximately 140 million years ago during the Cretaceous period. It was a herbivore with a long neck and spiky back, similar to the more well-known Stegosaurus. The discovery sheds light on the diversity of dinosaurs in South America during the Cretaceous period.
requirements.txt CHANGED
@@ -2,4 +2,5 @@ transformers
2
  torch
3
  torchvision
4
  torchaudio
5
- pypdf
 
 
2
  torch
3
  torchvision
4
  torchaudio
5
+ pypdf
6
+ python-docx