Spaces:
Runtime error
Runtime error
Noah Nsimbe
committed on
Commit
•
b5498e2
1
Parent(s):
c50f96e
update
Browse files
- app.ipynb +88 -11
- app.py +74 -51
- requirements.txt +4 -1
app.ipynb
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
9 |
-
"! pip install torch
|
10 |
]
|
11 |
},
|
12 |
{
|
@@ -15,8 +15,12 @@
|
|
15 |
"metadata": {},
|
16 |
"outputs": [],
|
17 |
"source": [
|
|
|
18 |
"from transformers import pipeline\n",
|
19 |
-
"from pypdf import PdfReader"
|
|
|
|
|
|
|
20 |
]
|
21 |
},
|
22 |
{
|
@@ -25,7 +29,15 @@
|
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
28 |
-
"summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
]
|
30 |
},
|
31 |
{
|
@@ -34,11 +46,61 @@
|
|
34 |
"metadata": {},
|
35 |
"outputs": [],
|
36 |
"source": [
|
37 |
-
"
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
]
|
43 |
},
|
44 |
{
|
@@ -47,8 +109,13 @@
|
|
47 |
"metadata": {},
|
48 |
"outputs": [],
|
49 |
"source": [
|
50 |
-
"
|
51 |
-
"
|
|
|
|
|
|
|
|
|
|
|
52 |
]
|
53 |
},
|
54 |
{
|
@@ -57,7 +124,17 @@
|
|
57 |
"metadata": {},
|
58 |
"outputs": [],
|
59 |
"source": [
|
60 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
]
|
62 |
}
|
63 |
],
|
|
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
9 |
+
"! pip install -U \"transformers[torch]\" pypdf python-docx langdetect sentencepiece sacremoses"
|
10 |
]
|
11 |
},
|
12 |
{
|
|
|
15 |
"metadata": {},
|
16 |
"outputs": [],
|
17 |
"source": [
|
18 |
+
"import gradio as gr\n",
|
19 |
"from transformers import pipeline\n",
|
20 |
+
"from pypdf import PdfReader\n",
|
21 |
+
"import docx\n",
|
22 |
+
"import os\n",
|
23 |
+
"from langdetect import detect"
|
24 |
]
|
25 |
},
|
26 |
{
|
|
|
29 |
"metadata": {},
|
30 |
"outputs": [],
|
31 |
"source": [
|
32 |
+
"summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")\n",
|
33 |
+
"translator_to_french = pipeline(\n",
|
34 |
+
" task=\"translation_en_to_fr\", model=\"Helsinki-NLP/opus-mt-en-fr\"\n",
|
35 |
+
")\n",
|
36 |
+
"translator_to_english = pipeline(\n",
|
37 |
+
" task=\"translation_fr_to_en\", model=\"Helsinki-NLP/opus-mt-fr-en\"\n",
|
38 |
+
")\n",
|
39 |
+
"\n",
|
40 |
+
"MAX_FILE_SIZE = 10000000"
|
41 |
]
|
42 |
},
|
43 |
{
|
|
|
46 |
"metadata": {},
|
47 |
"outputs": [],
|
48 |
"source": [
|
49 |
+
"class TextExtractor:\n",
|
50 |
+
" def __init__(self, doc_location: str):\n",
|
51 |
+
" if doc_location is None:\n",
|
52 |
+
" raise Exception(f\"Please select a PDF to summarize\")\n",
|
53 |
+
" self.doc_location = doc_location\n",
|
54 |
+
"\n",
|
55 |
+
" def extract_text_from_pdf(self):\n",
|
56 |
+
" reader = PdfReader(self.doc_location)\n",
|
57 |
+
" text = \"\"\n",
|
58 |
+
"\n",
|
59 |
+
" for page in reader.pages:\n",
|
60 |
+
" text += page.extract_text()\n",
|
61 |
+
"\n",
|
62 |
+
" return text\n",
|
63 |
+
"\n",
|
64 |
+
" def extract_text_from_doc(self):\n",
|
65 |
+
" doc = docx.Document(self.doc_location)\n",
|
66 |
+
" text = \"\"\n",
|
67 |
+
"\n",
|
68 |
+
" for paragraph in doc.paragraphs:\n",
|
69 |
+
" text += paragraph.text + \"\\n\"\n",
|
70 |
+
" return text\n",
|
71 |
+
"\n",
|
72 |
+
" def extract_text_from_txt(self):\n",
|
73 |
+
" with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
|
74 |
+
" text = file.read()\n",
|
75 |
+
" return text\n",
|
76 |
+
"\n",
|
77 |
+
" def extract_text_from_txt(self):\n",
|
78 |
+
" with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
|
79 |
+
" text = file.read()\n",
|
80 |
+
" return text\n",
|
81 |
+
" \n",
|
82 |
+
" def text_length(self):\n",
|
83 |
+
" words = self.text.split()\n",
|
84 |
+
" num_words = len(words)\n",
|
85 |
+
" return num_words\n",
|
86 |
+
"\n",
|
87 |
+
" def get_text(self) -> str:\n",
|
88 |
+
" file_extension = os.path.splitext(self.doc_location)[1]\n",
|
89 |
+
" if file_extension == \".pdf\":\n",
|
90 |
+
" self.text = self.extract_text_from_pdf()\n",
|
91 |
+
" elif file_extension == \".txt\":\n",
|
92 |
+
" self.text = self.extract_text_from_txt()\n",
|
93 |
+
" elif file_extension == \".docx\" or file_extension == \".doc\":\n",
|
94 |
+
" self.text = self.extract_text_from_doc()\n",
|
95 |
+
" else:\n",
|
96 |
+
" raise gr.Error(f\"We only support .pdf, .txt, .doc and .docx files\")\n",
|
97 |
+
"\n",
|
98 |
+
" if len(self.text) > MAX_FILE_SIZE:\n",
|
99 |
+
" raise gr.Error(\n",
|
100 |
+
" f\"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters.\"\n",
|
101 |
+
" )\n",
|
102 |
+
"\n",
|
103 |
+
" return self.text"
|
104 |
]
|
105 |
},
|
106 |
{
|
|
|
109 |
"metadata": {},
|
110 |
"outputs": [],
|
111 |
"source": [
|
112 |
+
"text_extractor = TextExtractor(\"data/doc-file-example.docx\")\n",
|
113 |
+
"text = text_extractor.get_text()\n",
|
114 |
+
"\n",
|
115 |
+
"text_length = text_extractor.text_length()\n",
|
116 |
+
"summary_length = int(text_length / 2)\n",
|
117 |
+
"\n",
|
118 |
+
"summary = summarizer(text, max_length=summary_length, do_sample=False)[0][\"summary_text\"]"
|
119 |
]
|
120 |
},
|
121 |
{
|
|
|
124 |
"metadata": {},
|
125 |
"outputs": [],
|
126 |
"source": [
|
127 |
+
"summary"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": null,
|
133 |
+
"metadata": {},
|
134 |
+
"outputs": [],
|
135 |
+
"source": [
|
136 |
+
"detected_lang = detect(summary)\n",
|
137 |
+
"detected_lang"
|
138 |
]
|
139 |
}
|
140 |
],
|
app.py
CHANGED
@@ -1,88 +1,111 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import
|
3 |
from pypdf import PdfReader
|
4 |
import docx
|
5 |
import os
|
|
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
model
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
-
def extract_text_from_pdf(
|
15 |
-
|
16 |
-
|
17 |
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
|
22 |
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
def
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
|
40 |
-
if doc is None:
|
41 |
-
raise gr.Error(f"Please select a PDF to summarize")
|
42 |
|
43 |
-
file_extension = os.path.splitext(doc)[1]
|
44 |
-
if file_extension == ".pdf":
|
45 |
-
text = extract_text_from_pdf(doc)
|
46 |
-
elif file_extension == ".txt":
|
47 |
-
text = extract_text_from_txt(doc)
|
48 |
-
elif file_extension == ".docx" or file_extension == ".doc":
|
49 |
-
text = extract_text_from_doc(doc)
|
50 |
-
else:
|
51 |
-
raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
|
52 |
|
53 |
-
text_length = len(text)
|
54 |
|
55 |
-
if text_length > MAX_PDF_SIZE:
|
56 |
-
raise gr.Error(
|
57 |
-
f"Document characters limit exceeded. Your document should not contain more than {MAX_PDF_SIZE} characters"
|
58 |
-
)
|
59 |
|
60 |
-
|
|
|
|
|
61 |
|
62 |
-
|
|
|
63 |
|
64 |
-
|
65 |
-
|
66 |
-
num_beams=4,
|
67 |
-
min_length=30,
|
68 |
-
max_length=summary_length,
|
69 |
-
early_stopping=True,
|
70 |
-
)
|
71 |
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
return summary
|
75 |
|
76 |
|
77 |
app = gr.Interface(
|
78 |
-
summarize,
|
79 |
-
[
|
80 |
gr.File(
|
81 |
label="Document to summarize",
|
82 |
file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
|
83 |
),
|
|
|
|
|
|
|
84 |
],
|
85 |
-
gr.Textbox(label="Summary"),
|
86 |
examples=[
|
87 |
["data/pd-file-example.pdf"],
|
88 |
["data/doc-file-example.docx"],
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import pipeline
|
3 |
from pypdf import PdfReader
|
4 |
import docx
|
5 |
import os
|
6 |
+
from langdetect import detect
|
7 |
|
8 |
+
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
|
9 |
+
translator_to_french = pipeline(
|
10 |
+
task="translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr"
|
11 |
+
)
|
12 |
+
translator_to_english = pipeline(
|
13 |
+
task="translation_fr_to_en", model="Helsinki-NLP/opus-mt-fr-en"
|
14 |
+
)
|
15 |
|
16 |
+
MAX_FILE_SIZE = 10000000
|
17 |
|
18 |
+
class TextExtractor:
    """Extract plain text from a PDF, plain-text, or Word document.

    The document type is chosen from the file extension of ``doc_location``.
    Call :meth:`get_text` to load the text; afterwards it is also available
    as ``self.text`` and :meth:`text_length` reports its word count.
    """

    def __init__(self, doc_location: str):
        """Remember which document to read.

        Args:
            doc_location: Path of the document on disk.

        Raises:
            gr.Error: If ``doc_location`` is ``None`` (no file selected).
        """
        if doc_location is None:
            # gr.Error (rather than a bare Exception) matches the errors
            # raised by get_text below.
            raise gr.Error("Please select a document to summarize")
        self.doc_location = doc_location
        # Defined up front so text_length() is safe to call before get_text().
        self.text = ""

    def extract_text_from_pdf(self):
        """Return the concatenated text of every page of the PDF."""
        reader = PdfReader(self.doc_location)
        # `or ""` is defensive: tolerate a page that yields no text.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def extract_text_from_doc(self):
        """Return the document's paragraphs, each followed by a newline."""
        # NOTE(review): python-docx is documented for .docx files; a legacy
        # binary .doc routed here will probably fail to open - confirm.
        doc = docx.Document(self.doc_location)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    def extract_text_from_txt(self):
        """Return the whole file decoded as UTF-8."""
        with open(self.doc_location, "r", encoding="utf-8") as file:
            return file.read()

    def text_length(self):
        """Return the number of whitespace-separated words in ``self.text``.

        This is 0 until :meth:`get_text` has loaded the document.
        """
        return len(self.text.split())

    def get_text(self) -> str:
        """Read the document, store its text in ``self.text`` and return it.

        Raises:
            gr.Error: If the extension is not supported, or the extracted
                text is longer than ``MAX_FILE_SIZE`` characters.
        """
        # Lower-cased so that e.g. "REPORT.PDF" is accepted as well.
        file_extension = os.path.splitext(self.doc_location)[1].lower()
        if file_extension == ".pdf":
            self.text = self.extract_text_from_pdf()
        elif file_extension == ".txt":
            self.text = self.extract_text_from_txt()
        elif file_extension in (".docx", ".doc"):
            self.text = self.extract_text_from_doc()
        else:
            raise gr.Error("We only support .pdf, .txt, .doc and .docx files")

        if len(self.text) > MAX_FILE_SIZE:
            raise gr.Error(
                f"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters."
            )

        return self.text
|
|
|
|
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
|
|
75 |
|
|
|
|
|
|
|
|
|
76 |
|
77 |
+
def summarize(doc: str, target_language: str) -> str:
    """Summarize a document and optionally translate the summary.

    Args:
        doc: Path of the document to summarize (.pdf, .txt, .doc or .docx).
        target_language: "English" or "French" (case-insensitive), or
            ``None`` to return the summary untranslated.

    Returns:
        The summary. It is translated only when the language detected in it
        is French and English was requested, or the other way round.

    Raises:
        gr.Error: If the document is missing, unsupported, too large, or
            contains no extractable text.
    """
    # Bounds for the generated summary, in tokens. The lower bound keeps
    # max_length above min_length for very short documents; the upper bound
    # is a conservative cap.
    # NOTE(review): confirm against the summarization model's limits.
    min_summary_tokens = 30
    max_summary_tokens = 512

    text_extractor = TextExtractor(doc)
    text = text_extractor.get_text()
    if not text.strip():
        raise gr.Error("No text could be extracted from the document")

    # Aim for roughly half the word count, clamped to the bounds above.
    half_word_count = text_extractor.text_length() // 2
    summary_length = max(
        2 * min_summary_tokens, min(half_word_count, max_summary_tokens)
    )

    summary = summarizer(
        text,
        max_length=summary_length,
        min_length=min_summary_tokens,
        do_sample=False,
        # Cut inputs that exceed the model's maximum input length instead
        # of failing on long documents.
        truncation=True,
    )[0]["summary_text"]

    if target_language is None:
        return summary

    wanted = str(target_language).lower()
    detected_lang = detect(summary)

    if detected_lang == "fr" and wanted == "english":
        summary = translator_to_english(summary)[0]["translation_text"]
    elif detected_lang == "en" and wanted == "french":
        summary = translator_to_french(summary)[0]["translation_text"]

    return summary
|
95 |
|
96 |
|
97 |
app = gr.Interface(
|
98 |
+
fn=summarize,
|
99 |
+
inputs=[
|
100 |
gr.File(
|
101 |
label="Document to summarize",
|
102 |
file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
|
103 |
),
|
104 |
+
gr.Radio(
|
105 |
+
label="Translate summary to", choices=["English", "French"], value="English"
|
106 |
+
),
|
107 |
],
|
108 |
+
outputs=gr.Textbox(label="Summary"),
|
109 |
examples=[
|
110 |
["data/pd-file-example.pdf"],
|
111 |
["data/doc-file-example.docx"],
|
requirements.txt
CHANGED
@@ -3,4 +3,7 @@ torch
|
|
3 |
torchvision
|
4 |
torchaudio
|
5 |
pypdf
|
6 |
-
python-docx
|
|
|
|
|
|
|
|
3 |
torchvision
|
4 |
torchaudio
|
5 |
pypdf
|
6 |
+
python-docx
|
7 |
+
langdetect
|
8 |
+
sentencepiece
|
9 |
+
sacremoses
|