Spaces:
Sleeping
Sleeping
Ari
committed on
Commit
•
99a5022
1
Parent(s):
85bfbd2
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
-
import nltk
|
4 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
5 |
from fpdf import FPDF
|
6 |
from gtts import gTTS
|
@@ -9,25 +8,21 @@ from docx import Document
|
|
9 |
from reportlab.lib.pagesizes import letter
|
10 |
from reportlab.pdfgen import canvas
|
11 |
|
12 |
-
# Clear any potentially corrupted data and ensure correct download
|
13 |
-
nltk.data.path.append("/home/user/nltk_data")
|
14 |
-
nltk.download('punkt')
|
15 |
-
|
16 |
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
|
17 |
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
|
18 |
|
19 |
-
# Function to split text into chunks
|
20 |
-
def
|
21 |
-
|
22 |
chunks = []
|
23 |
chunk = ""
|
24 |
|
25 |
-
for
|
26 |
-
if len(chunk) + len(
|
27 |
-
chunk +=
|
28 |
else:
|
29 |
chunks.append(chunk.strip())
|
30 |
-
chunk =
|
31 |
|
32 |
if chunk:
|
33 |
chunks.append(chunk.strip())
|
@@ -51,7 +46,7 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
|
|
51 |
pdf.save()
|
52 |
return output_pdf
|
53 |
|
54 |
-
# Main processing function with text chunking
|
55 |
def pdf_to_text(text, PDF, min_length=80):
|
56 |
try:
|
57 |
file_extension = os.path.splitext(PDF.name)[1].lower()
|
@@ -62,7 +57,7 @@ def pdf_to_text(text, PDF, min_length=80):
|
|
62 |
elif file_extension == '.pdf' and text == "":
|
63 |
text = extract_text(PDF.name)
|
64 |
|
65 |
-
chunks =
|
66 |
summarized_text = ""
|
67 |
|
68 |
for chunk in chunks:
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
|
|
3 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
4 |
from fpdf import FPDF
|
5 |
from gtts import gTTS
|
|
|
8 |
from reportlab.lib.pagesizes import letter
|
9 |
from reportlab.pdfgen import canvas
|
10 |
|
|
|
|
|
|
|
|
|
11 |
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
|
12 |
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
|
13 |
|
14 |
+
# Function to split text into chunks based on paragraphs
|
15 |
+
def split_text_by_paragraph(text, max_chunk_size=1024):
|
16 |
+
paragraphs = text.split("\n\n") # Splitting by paragraphs
|
17 |
chunks = []
|
18 |
chunk = ""
|
19 |
|
20 |
+
for paragraph in paragraphs:
|
21 |
+
if len(chunk) + len(paragraph) <= max_chunk_size:
|
22 |
+
chunk += paragraph + "\n\n"
|
23 |
else:
|
24 |
chunks.append(chunk.strip())
|
25 |
+
chunk = paragraph + "\n\n"
|
26 |
|
27 |
if chunk:
|
28 |
chunks.append(chunk.strip())
|
|
|
46 |
pdf.save()
|
47 |
return output_pdf
|
48 |
|
49 |
+
# Main processing function with paragraph-based text chunking
|
50 |
def pdf_to_text(text, PDF, min_length=80):
|
51 |
try:
|
52 |
file_extension = os.path.splitext(PDF.name)[1].lower()
|
|
|
57 |
elif file_extension == '.pdf' and text == "":
|
58 |
text = extract_text(PDF.name)
|
59 |
|
60 |
+
chunks = split_text_by_paragraph(text)
|
61 |
summarized_text = ""
|
62 |
|
63 |
for chunk in chunks:
|