Noah Nsimbe committed on
Commit
b5498e2
1 Parent(s): c50f96e
Files changed (3) hide show
  1. app.ipynb +88 -11
  2. app.py +74 -51
  3. requirements.txt +4 -1
app.ipynb CHANGED
@@ -6,7 +6,7 @@
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
- "! pip install torch torchvision torchaudio pypdf"
10
  ]
11
  },
12
  {
@@ -15,8 +15,12 @@
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
 
18
  "from transformers import pipeline\n",
19
- "from pypdf import PdfReader"
 
 
 
20
  ]
21
  },
22
  {
@@ -25,7 +29,15 @@
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
28
- "summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")"
 
 
 
 
 
 
 
 
29
  ]
30
  },
31
  {
@@ -34,11 +46,61 @@
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
37
- "reader = PdfReader(\"data/example.pdf\")\n",
38
- "number_of_pages = len(reader.pages)\n",
39
- "page = reader.pages[0]\n",
40
- "text = page.extract_text()\n",
41
- "text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  ]
43
  },
44
  {
@@ -47,8 +109,13 @@
47
  "metadata": {},
48
  "outputs": [],
49
  "source": [
50
- "results = summarizer(text, max_length=30, do_sample=False)\n",
51
- "results"
 
 
 
 
 
52
  ]
53
  },
54
  {
@@ -57,7 +124,17 @@
57
  "metadata": {},
58
  "outputs": [],
59
  "source": [
60
- "results[0][\"summary_text\"]"
 
 
 
 
 
 
 
 
 
 
61
  ]
62
  }
63
  ],
 
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
+ "! pip install -U \"transformers[torch]\" pypdf python-docx langdetect sentencepiece sacremoses"
10
  ]
11
  },
12
  {
 
15
  "metadata": {},
16
  "outputs": [],
17
  "source": [
18
+ "import gradio as gr\n",
19
  "from transformers import pipeline\n",
20
+ "from pypdf import PdfReader\n",
21
+ "import docx\n",
22
+ "import os\n",
23
+ "from langdetect import detect"
24
  ]
25
  },
26
  {
 
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
32
+ "summarizer = pipeline(task=\"summarization\", model=\"facebook/bart-large-cnn\")\n",
33
+ "translator_to_french = pipeline(\n",
34
+ " task=\"translation_en_to_fr\", model=\"Helsinki-NLP/opus-mt-en-fr\"\n",
35
+ ")\n",
36
+ "translator_to_english = pipeline(\n",
37
+ " task=\"translation_fr_to_en\", model=\"Helsinki-NLP/opus-mt-fr-en\"\n",
38
+ ")\n",
39
+ "\n",
40
+ "MAX_FILE_SIZE = 10000000"
41
  ]
42
  },
43
  {
 
46
  "metadata": {},
47
  "outputs": [],
48
  "source": [
49
+ "class TextExtractor:\n",
50
+ " def __init__(self, doc_location: str):\n",
51
+ " if doc_location is None:\n",
52
+ " raise Exception(f\"Please select a PDF to summarize\")\n",
53
+ " self.doc_location = doc_location\n",
54
+ "\n",
55
+ " def extract_text_from_pdf(self):\n",
56
+ " reader = PdfReader(self.doc_location)\n",
57
+ " text = \"\"\n",
58
+ "\n",
59
+ " for page in reader.pages:\n",
60
+ " text += page.extract_text()\n",
61
+ "\n",
62
+ " return text\n",
63
+ "\n",
64
+ " def extract_text_from_doc(self):\n",
65
+ " doc = docx.Document(self.doc_location)\n",
66
+ " text = \"\"\n",
67
+ "\n",
68
+ " for paragraph in doc.paragraphs:\n",
69
+ " text += paragraph.text + \"\\n\"\n",
70
+ " return text\n",
71
+ "\n",
72
+ " def extract_text_from_txt(self):\n",
73
+ " with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
74
+ " text = file.read()\n",
75
+ " return text\n",
76
+ "\n",
77
+ " def extract_text_from_txt(self):\n",
78
+ " with open(self.doc_location, \"r\", encoding=\"utf-8\") as file:\n",
79
+ " text = file.read()\n",
80
+ " return text\n",
81
+ " \n",
82
+ " def text_length(self):\n",
83
+ " words = self.text.split()\n",
84
+ " num_words = len(words)\n",
85
+ " return num_words\n",
86
+ "\n",
87
+ " def get_text(self) -> str:\n",
88
+ " file_extension = os.path.splitext(self.doc_location)[1]\n",
89
+ " if file_extension == \".pdf\":\n",
90
+ " self.text = self.extract_text_from_pdf()\n",
91
+ " elif file_extension == \".txt\":\n",
92
+ " self.text = self.extract_text_from_txt()\n",
93
+ " elif file_extension == \".docx\" or file_extension == \".doc\":\n",
94
+ " self.text = self.extract_text_from_doc()\n",
95
+ " else:\n",
96
+ " raise gr.Error(f\"We only support .pdf, .txt, .doc and .docx files\")\n",
97
+ "\n",
98
+ " if len(self.text) > MAX_FILE_SIZE:\n",
99
+ " raise gr.Error(\n",
100
+ " f\"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters.\"\n",
101
+ " )\n",
102
+ "\n",
103
+ " return self.text"
104
  ]
105
  },
106
  {
 
109
  "metadata": {},
110
  "outputs": [],
111
  "source": [
112
+ "text_extractor = TextExtractor(\"data/doc-file-example.docx\")\n",
113
+ "text = text_extractor.get_text()\n",
114
+ "\n",
115
+ "text_length = text_extractor.text_length()\n",
116
+ "summary_length = int(text_length / 2)\n",
117
+ "\n",
118
+ "summary = summarizer(text, max_length=summary_length, do_sample=False)[0][\"summary_text\"]"
119
  ]
120
  },
121
  {
 
124
  "metadata": {},
125
  "outputs": [],
126
  "source": [
127
+ "summary"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "detected_lang = detect(summary)\n",
137
+ "detected_lang"
138
  ]
139
  }
140
  ],
app.py CHANGED
@@ -1,88 +1,111 @@
1
  import gradio as gr
2
- from transformers import BartTokenizer, BartForConditionalGeneration
3
  from pypdf import PdfReader
4
  import docx
5
  import os
 
6
 
7
- model_name = "facebook/bart-large-cnn"
8
- tokenizer = BartTokenizer.from_pretrained(model_name)
9
- model = BartForConditionalGeneration.from_pretrained(model_name)
 
 
 
 
10
 
11
- MAX_PDF_SIZE = 10000000
12
 
 
 
 
 
 
13
 
14
- def extract_text_from_pdf(pdf_file):
15
- reader = PdfReader(pdf_file)
16
- text = ""
17
 
18
- for page in reader.pages:
19
- text += page.extract_text()
20
 
21
- return text
22
 
 
 
 
23
 
24
- def extract_text_from_doc(docx_file):
25
- doc = docx.Document(docx_file)
26
- text = ""
27
 
28
- for paragraph in doc.paragraphs:
29
- text += paragraph.text + "\n"
30
- return text
 
31
 
 
 
 
 
 
 
 
 
 
32
 
33
- def extract_text_from_txt(txt_file):
34
- with open(txt_file, "r", encoding="utf-8") as file:
35
- text = file.read()
36
- return text
 
 
 
 
 
 
37
 
 
 
 
 
38
 
39
- def summarize(doc: str) -> str:
40
- if doc is None:
41
- raise gr.Error(f"Please select a PDF to summarize")
42
 
43
- file_extension = os.path.splitext(doc)[1]
44
- if file_extension == ".pdf":
45
- text = extract_text_from_pdf(doc)
46
- elif file_extension == ".txt":
47
- text = extract_text_from_txt(doc)
48
- elif file_extension == ".docx" or file_extension == ".doc":
49
- text = extract_text_from_doc(doc)
50
- else:
51
- raise gr.Error(f"We only support .pdf, .txt, .doc and .docx files")
52
 
53
- text_length = len(text)
54
 
55
- if text_length > MAX_PDF_SIZE:
56
- raise gr.Error(
57
- f"Document characters limit exceeded. Your document should not contain more than {MAX_PDF_SIZE} characters"
58
- )
59
 
60
- summary_length = int(text_length / 3)
 
 
61
 
62
- inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
 
63
 
64
- results = model.generate(
65
- inputs.input_ids,
66
- num_beams=4,
67
- min_length=30,
68
- max_length=summary_length,
69
- early_stopping=True,
70
- )
71
 
72
- summary = tokenizer.decode(results[0], skip_special_tokens=True)
 
 
 
 
 
73
 
74
  return summary
75
 
76
 
77
  app = gr.Interface(
78
- summarize,
79
- [
80
  gr.File(
81
  label="Document to summarize",
82
  file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
83
  ),
 
 
 
84
  ],
85
- gr.Textbox(label="Summary"),
86
  examples=[
87
  ["data/pd-file-example.pdf"],
88
  ["data/doc-file-example.docx"],
 
1
  import gradio as gr
2
+ from transformers import pipeline
3
  from pypdf import PdfReader
4
  import docx
5
  import os
6
+ from langdetect import detect
7
 
8
# The three Hugging Face pipelines are built once, when the module is
# imported, and then shared by every request.
_SUMMARY_MODEL = "facebook/bart-large-cnn"
_EN_TO_FR_MODEL = "Helsinki-NLP/opus-mt-en-fr"
_FR_TO_EN_MODEL = "Helsinki-NLP/opus-mt-fr-en"

summarizer = pipeline(task="summarization", model=_SUMMARY_MODEL)
translator_to_french = pipeline(task="translation_en_to_fr", model=_EN_TO_FR_MODEL)
translator_to_english = pipeline(task="translation_fr_to_en", model=_FR_TO_EN_MODEL)

# Largest extracted text, counted in characters, that the app will accept.
MAX_FILE_SIZE = 10_000_000
17
 
18
class TextExtractor:
    """Extract plain text from a .pdf, .txt, .doc or .docx document.

    ``get_text`` reads the file and stores the result on ``self.text``;
    ``text_length`` then reports the word count of that stored text.
    """

    def __init__(self, doc_location: str):
        """Remember the path of the document to read.

        Args:
            doc_location: Filesystem path of the document.

        Raises:
            gr.Error: If ``doc_location`` is ``None``.
        """
        if doc_location is None:
            # gr.Error subclasses Exception, so existing ``except Exception``
            # callers keep working, and it matches the errors raised by
            # get_text for the other validation failures.
            raise gr.Error("Please select a document to summarize")
        self.doc_location = doc_location
        # Initialised here so text_length() is safe to call before get_text().
        self.text = ""

    def extract_text_from_pdf(self) -> str:
        """Return the text of every PDF page, concatenated in page order."""
        reader = PdfReader(self.doc_location)
        # ``or ""`` guards against a page for which no text is returned.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def extract_text_from_doc(self) -> str:
        """Return every paragraph of the Word document, one per line."""
        document = docx.Document(self.doc_location)
        return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)

    def extract_text_from_txt(self) -> str:
        """Return the full content of the file decoded as UTF-8."""
        with open(self.doc_location, "r", encoding="utf-8") as file:
            return file.read()

    def text_length(self) -> int:
        """Return the number of whitespace-separated words in ``self.text``.

        Returns 0 until ``get_text`` has populated ``self.text``.
        """
        return len(self.text.split())

    def get_text(self) -> str:
        """Extract, store and return the document's text.

        The extractor is chosen from the file extension, compared
        case-insensitively so that e.g. ``REPORT.PDF`` is accepted.

        Returns:
            The extracted text (also stored on ``self.text``).

        Raises:
            gr.Error: If the extension is unsupported, or the extracted text
                is longer than ``MAX_FILE_SIZE`` characters.
        """
        extension = os.path.splitext(self.doc_location)[1].lower()

        if extension == ".pdf":
            self.text = self.extract_text_from_pdf()
        elif extension == ".txt":
            self.text = self.extract_text_from_txt()
        elif extension in (".docx", ".doc"):
            # NOTE(review): python-docx is documented for .docx; legacy binary
            # .doc files will probably fail inside docx.Document - confirm.
            self.text = self.extract_text_from_doc()
        else:
            raise gr.Error("We only support .pdf, .txt, .doc and .docx files")

        if len(self.text) > MAX_FILE_SIZE:
            raise gr.Error(
                f"Document exceeds the maximum supported size of {MAX_FILE_SIZE} characters."
            )

        return self.text
 
 
73
 
 
 
 
 
 
 
 
 
 
74
 
 
75
 
 
 
 
 
76
 
77
# Bounds, in tokens, for the summary length requested from the summarizer.
# 30 is the minimum the previous implementation of this function used;
# 1024 is the position limit of facebook/bart-large-cnn.
_MIN_SUMMARY_TOKENS = 30
_MAX_SUMMARY_TOKENS = 1024


def summarize(doc: str, target_language: str) -> str:
    """Summarize a document and optionally translate the summary.

    Args:
        doc: Path of the document to summarize (.pdf, .txt, .doc or .docx).
        target_language: "English" or "French" (case-insensitive), or
            ``None`` to return the summary untranslated.

    Returns:
        The summary text. It is translated only when its detected language
        is the opposite one of the English/French pair.

    Raises:
        gr.Error: If the document is missing, unsupported, too large, or
            contains no extractable text.
    """
    extractor = TextExtractor(doc)
    text = extractor.get_text()

    # An empty document (e.g. a scanned PDF with no text layer) would
    # otherwise be fed to the model and to the language detector.
    if not text.strip():
        raise gr.Error("No text could be extracted from this document")

    # Aim for roughly half the document's word count. Words are only a proxy
    # for tokens, so clamp the budget to a range the model can honour: an
    # unclamped value is 0 for tiny files and far too large for big ones.
    word_count = extractor.text_length()
    max_tokens = min(max(word_count // 2, _MIN_SUMMARY_TOKENS), _MAX_SUMMARY_TOKENS)

    summary = summarizer(
        text,
        max_length=max_tokens,
        min_length=_MIN_SUMMARY_TOKENS,
        do_sample=False,
        # Inputs longer than the model's context window are cut off instead
        # of making the pipeline fail.
        truncation=True,
    )[0]["summary_text"]

    if target_language is None:
        return summary

    wanted = str(target_language).lower()
    if wanted not in ("english", "french"):
        return summary

    # Translate only when the summary is in the other language of the pair.
    detected_lang = detect(summary)
    if detected_lang == "fr" and wanted == "english":
        summary = translator_to_english(summary)[0]["translation_text"]
    elif detected_lang == "en" and wanted == "french":
        summary = translator_to_french(summary)[0]["translation_text"]

    return summary
95
 
96
 
97
  app = gr.Interface(
98
+ fn=summarize,
99
+ inputs=[
100
  gr.File(
101
  label="Document to summarize",
102
  file_types=["pdf", "docx", "doc", "txt", "odt", "dot", "dotx"],
103
  ),
104
+ gr.Radio(
105
+ label="Translate summary to", choices=["English", "French"], value="English"
106
+ ),
107
  ],
108
+ outputs=gr.Textbox(label="Summary"),
109
  examples=[
110
  ["data/pd-file-example.pdf"],
111
  ["data/doc-file-example.docx"],
requirements.txt CHANGED
@@ -3,4 +3,7 @@ torch
3
  torchvision
4
  torchaudio
5
  pypdf
6
- python-docx
 
 
 
 
3
  torchvision
4
  torchaudio
5
  pypdf
6
+ python-docx
7
+ langdetect
8
+ sentencepiece
9
+ sacremoses