Spaces:
Runtime error
Runtime error
Noah Nsimbe
committed on
Commit
•
07c028c
1
Parent(s):
5977c4a
support for txt and doc files
Browse files- README.md +4 -2
- app.ipynb +11 -83
- app.py +41 -6
- data/doc-file-example.docx +0 -0
- data/{example.pdf → pd-file-example.pdf} +0 -0
- data/text-file-example.txt +1 -0
- requirements.txt +2 -1
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: "
|
3 |
emoji: 📄
|
4 |
colorFrom: blue
|
5 |
colorTo: gray
|
@@ -8,11 +8,13 @@ sdk_version: 4.24.0
|
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: "mit"
|
11 |
-
short_description: "Performs
|
12 |
tags:
|
13 |
- "document"
|
|
|
14 |
- "summary"
|
15 |
- "summarizer"
|
|
|
16 |
models:
|
17 |
- "facebook/bart-large-cnn"
|
18 |
---
|
|
|
1 |
---
|
2 |
+
title: "Text Summarizer"
|
3 |
emoji: 📄
|
4 |
colorFrom: blue
|
5 |
colorTo: gray
|
|
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: "mit"
|
11 |
+
short_description: "Performs text summarization"
|
12 |
tags:
|
13 |
- "document"
|
14 |
+
- "text"
|
15 |
- "summary"
|
16 |
- "summarizer"
|
17 |
+
- "summarization"
|
18 |
models:
|
19 |
- "facebook/bart-large-cnn"
|
20 |
---
|
app.ipynb
CHANGED
@@ -2,48 +2,18 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
-
"outputs": [
|
8 |
-
{
|
9 |
-
"name": "stdout",
|
10 |
-
"output_type": "stream",
|
11 |
-
"text": [
|
12 |
-
"Requirement already satisfied: torch in ./venv/lib/python3.11/site-packages (2.2.2)\n",
|
13 |
-
"Requirement already satisfied: torchvision in ./venv/lib/python3.11/site-packages (0.17.2)\n",
|
14 |
-
"Requirement already satisfied: torchaudio in ./venv/lib/python3.11/site-packages (2.2.2)\n",
|
15 |
-
"Requirement already satisfied: pypdf in ./venv/lib/python3.11/site-packages (4.1.0)\n",
|
16 |
-
"Requirement already satisfied: filelock in ./venv/lib/python3.11/site-packages (from torch) (3.13.3)\n",
|
17 |
-
"Requirement already satisfied: typing-extensions>=4.8.0 in ./venv/lib/python3.11/site-packages (from torch) (4.10.0)\n",
|
18 |
-
"Requirement already satisfied: sympy in ./venv/lib/python3.11/site-packages (from torch) (1.12)\n",
|
19 |
-
"Requirement already satisfied: networkx in ./venv/lib/python3.11/site-packages (from torch) (3.2.1)\n",
|
20 |
-
"Requirement already satisfied: jinja2 in ./venv/lib/python3.11/site-packages (from torch) (3.1.3)\n",
|
21 |
-
"Requirement already satisfied: fsspec in ./venv/lib/python3.11/site-packages (from torch) (2024.3.1)\n",
|
22 |
-
"Requirement already satisfied: numpy in ./venv/lib/python3.11/site-packages (from torchvision) (1.26.4)\n",
|
23 |
-
"Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in ./venv/lib/python3.11/site-packages (from torchvision) (10.2.0)\n",
|
24 |
-
"Requirement already satisfied: MarkupSafe>=2.0 in ./venv/lib/python3.11/site-packages (from jinja2->torch) (2.1.5)\n",
|
25 |
-
"Requirement already satisfied: mpmath>=0.19 in ./venv/lib/python3.11/site-packages (from sympy->torch) (1.3.0)\n"
|
26 |
-
]
|
27 |
-
}
|
28 |
-
],
|
29 |
"source": [
|
30 |
"! pip install torch torchvision torchaudio pypdf"
|
31 |
]
|
32 |
},
|
33 |
{
|
34 |
"cell_type": "code",
|
35 |
-
"execution_count":
|
36 |
"metadata": {},
|
37 |
-
"outputs": [
|
38 |
-
{
|
39 |
-
"name": "stderr",
|
40 |
-
"output_type": "stream",
|
41 |
-
"text": [
|
42 |
-
"/Users/noah/spaces/document-summarizer/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
43 |
-
" from .autonotebook import tqdm as notebook_tqdm\n"
|
44 |
-
]
|
45 |
-
}
|
46 |
-
],
|
47 |
"source": [
|
48 |
"from transformers import pipeline\n",
|
49 |
"from pypdf import PdfReader"
|
@@ -51,7 +21,7 @@
|
|
51 |
},
|
52 |
{
|
53 |
"cell_type": "code",
|
54 |
-
"execution_count":
|
55 |
"metadata": {},
|
56 |
"outputs": [],
|
57 |
"source": [
|
@@ -60,20 +30,9 @@
|
|
60 |
},
|
61 |
{
|
62 |
"cell_type": "code",
|
63 |
-
"execution_count":
|
64 |
"metadata": {},
|
65 |
-
"outputs": [
|
66 |
-
{
|
67 |
-
"data": {
|
68 |
-
"text/plain": [
|
69 |
-
"'Intr oducing\\nthe\\nlatest\\nsmar tphone\\nfr om\\nXYZ\\nT ech.\\nThe\\nXYZ\\nT ech\\nX10\\nf eatur es\\na\\nsleek\\ndesign,\\npower ful\\nper formance,\\nand\\nadv anced\\ncamer a\\ncapabilities.\\nWith\\na\\nquad-camer a\\nsetup,\\nincluding\\na\\n108MP\\nmain\\ncamer a\\nand\\na\\n5x\\noptical\\nz oom\\nlens,\\ny ou\\ncan\\ncaptur e\\nstunning\\nphot os\\nand\\nvideos\\nin\\nany\\nlighting\\ncondition.\\nThe\\nX10\\nalso\\nboasts\\na\\nhigh-r esolution\\nAMOLED\\ndispla y ,\\nfast-char ging\\ntechnology ,\\nand\\n5G\\nconnectivity\\nfor\\nseamless\\nbr owsing\\nand\\nstr eaming.'"
|
70 |
-
]
|
71 |
-
},
|
72 |
-
"execution_count": 4,
|
73 |
-
"metadata": {},
|
74 |
-
"output_type": "execute_result"
|
75 |
-
}
|
76 |
-
],
|
77 |
"source": [
|
78 |
"reader = PdfReader(\"data/example.pdf\")\n",
|
79 |
"number_of_pages = len(reader.pages)\n",
|
@@ -84,29 +43,9 @@
|
|
84 |
},
|
85 |
{
|
86 |
"cell_type": "code",
|
87 |
-
"execution_count":
|
88 |
"metadata": {},
|
89 |
-
"outputs": [
|
90 |
-
{
|
91 |
-
"name": "stderr",
|
92 |
-
"output_type": "stream",
|
93 |
-
"text": [
|
94 |
-
"Your min_length=56 must be inferior than your max_length=30.\n",
|
95 |
-
"/Users/noah/spaces/document-summarizer/venv/lib/python3.11/site-packages/transformers/generation/utils.py:1156: UserWarning: Unfeasible length constraints: `min_length` (56) is larger than the maximum possible length (30). Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.\n",
|
96 |
-
" warnings.warn(\n"
|
97 |
-
]
|
98 |
-
},
|
99 |
-
{
|
100 |
-
"data": {
|
101 |
-
"text/plain": [
|
102 |
-
"[{'summary_text': 'The XYZT ech is a high-resolution camera with a range of up to 100 feet. The X10 is equipped with'}]"
|
103 |
-
]
|
104 |
-
},
|
105 |
-
"execution_count": 7,
|
106 |
-
"metadata": {},
|
107 |
-
"output_type": "execute_result"
|
108 |
-
}
|
109 |
-
],
|
110 |
"source": [
|
111 |
"results = summarizer(text, max_length=30, do_sample=False)\n",
|
112 |
"results"
|
@@ -114,20 +53,9 @@
|
|
114 |
},
|
115 |
{
|
116 |
"cell_type": "code",
|
117 |
-
"execution_count":
|
118 |
"metadata": {},
|
119 |
-
"outputs": [
|
120 |
-
{
|
121 |
-
"data": {
|
122 |
-
"text/plain": [
|
123 |
-
"'The XYZT ech is a high-resolution camera with a range of up to 100 feet. The X10 is equipped with'"
|
124 |
-
]
|
125 |
-
},
|
126 |
-
"execution_count": 8,
|
127 |
-
"metadata": {},
|
128 |
-
"output_type": "execute_result"
|
129 |
-
}
|
130 |
-
],
|
131 |
"source": [
|
132 |
"results[0][\"summary_text\"]"
|
133 |
]
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
"metadata": {},
|
7 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
"source": [
|
9 |
"! pip install torch torchvision torchaudio pypdf"
|
10 |
]
|
11 |
},
|
12 |
{
|
13 |
"cell_type": "code",
|
14 |
+
"execution_count": null,
|
15 |
"metadata": {},
|
16 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
"source": [
|
18 |
"from transformers import pipeline\n",
|
19 |
"from pypdf import PdfReader"
|
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
+
"execution_count": null,
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
|
|
30 |
},
|
31 |
{
|
32 |
"cell_type": "code",
|
33 |
+
"execution_count": null,
|
34 |
"metadata": {},
|
35 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
"source": [
|
37 |
"reader = PdfReader(\"data/example.pdf\")\n",
|
38 |
"number_of_pages = len(reader.pages)\n",
|
|
|
43 |
},
|
44 |
{
|
45 |
"cell_type": "code",
|
46 |
+
"execution_count": null,
|
47 |
"metadata": {},
|
48 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
"source": [
|
50 |
"results = summarizer(text, max_length=30, do_sample=False)\n",
|
51 |
"results"
|
|
|
53 |
},
|
54 |
{
|
55 |
"cell_type": "code",
|
56 |
+
"execution_count": null,
|
57 |
"metadata": {},
|
58 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
"source": [
|
60 |
"results[0][\"summary_text\"]"
|
61 |
]
|
app.py
CHANGED
@@ -1,22 +1,52 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
from pypdf import PdfReader
|
|
|
|
|
4 |
|
5 |
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
|
6 |
|
7 |
MAX_PDF_SIZE = 10000000
|
8 |
|
9 |
|
10 |
-
def
|
11 |
-
|
12 |
-
raise gr.Error(f"Please select a PDF to summarize")
|
13 |
-
|
14 |
-
reader = PdfReader(doc)
|
15 |
text = ""
|
16 |
|
17 |
for page in reader.pages:
|
18 |
text += page.extract_text()
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
text_length = len(text)
|
21 |
|
22 |
if text_length > MAX_PDF_SIZE:
|
@@ -36,10 +66,15 @@ app = gr.Interface(
|
|
36 |
[
|
37 |
gr.File(
|
38 |
label="Document to summarize",
|
|
|
39 |
),
|
40 |
],
|
41 |
gr.Textbox(label="Summary"),
|
42 |
-
examples=[
|
|
|
|
|
|
|
|
|
43 |
)
|
44 |
|
45 |
if __name__ == "__main__":
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline
|
3 |
from pypdf import PdfReader
|
4 |
+
import docx
|
5 |
+
import os
|
6 |
|
7 |
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
|
8 |
|
9 |
MAX_PDF_SIZE = 10000000
|
10 |
|
11 |
|
12 |
+
def extract_text_from_pdf(pdf_file):
|
13 |
+
reader = PdfReader(pdf_file)
|
|
|
|
|
|
|
14 |
text = ""
|
15 |
|
16 |
for page in reader.pages:
|
17 |
text += page.extract_text()
|
18 |
|
19 |
+
return text
|
20 |
+
|
21 |
+
|
22 |
+
def extract_text_from_doc(docx_file):
|
23 |
+
doc = docx.Document(docx_file)
|
24 |
+
text = ""
|
25 |
+
for paragraph in doc.paragraphs:
|
26 |
+
text += paragraph.text + "\n"
|
27 |
+
return text
|
28 |
+
|
29 |
+
|
30 |
+
def extract_text_from_txt(txt_file):
|
31 |
+
with open(txt_file, "r", encoding="utf-8") as file:
|
32 |
+
text = file.read()
|
33 |
+
return text
|
34 |
+
|
35 |
+
|
36 |
+
def summarize(doc: str) -> str:
|
37 |
+
if doc is None:
|
38 |
+
raise gr.Error(f"Please select a PDF to summarize")
|
39 |
+
|
40 |
+
file_extension = os.path.splitext(doc)[1]
|
41 |
+
if file_extension == ".pdf":
|
42 |
+
text = extract_text_from_pdf(doc)
|
43 |
+
elif file_extension == ".txt":
|
44 |
+
text = extract_text_from_txt(doc)
|
45 |
+
elif file_extension == ".docx" or file_extension == ".doc":
|
46 |
+
text = extract_text_from_doc(doc)
|
47 |
+
else:
|
48 |
+
raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
|
49 |
+
|
50 |
text_length = len(text)
|
51 |
|
52 |
if text_length > MAX_PDF_SIZE:
|
|
|
66 |
[
|
67 |
gr.File(
|
68 |
label="Document to summarize",
|
69 |
+
file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
|
70 |
),
|
71 |
],
|
72 |
gr.Textbox(label="Summary"),
|
73 |
+
examples=[
|
74 |
+
["data/pd-file-example.pdf"],
|
75 |
+
["data/doc-file-example.docx"],
|
76 |
+
["data/text-file-example.txt"],
|
77 |
+
],
|
78 |
)
|
79 |
|
80 |
if __name__ == "__main__":
|
data/doc-file-example.docx
ADDED
Binary file (292 kB). View file
|
|
data/{example.pdf → pd-file-example.pdf}
RENAMED
File without changes
|
data/text-file-example.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Researchers have discovered a new species of dinosaur in Argentina. The dinosaur, named Bajadasaurus pronuspinax, lived approximately 140 million years ago during the Cretaceous period. It was a herbivore with a long neck and spiky back, similar to the more well-known Stegosaurus. The discovery sheds light on the diversity of dinosaurs in South America during the Cretaceous period.
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ transformers
|
|
2 |
torch
|
3 |
torchvision
|
4 |
torchaudio
|
5 |
-
pypdf
|
|
|
|
2 |
torch
|
3 |
torchvision
|
4 |
torchaudio
|
5 |
+
pypdf
|
6 |
+
python-docx
|