Ari committed on
Commit
99a5022
1 Parent(s): 85bfbd2

Update app.py

Files changed (1)
  1. app.py +9 -14
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import os
-import nltk
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from fpdf import FPDF
 from gtts import gTTS
@@ -9,25 +8,21 @@ from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

-# Clear any potentially corrupted data and ensure correct download
-nltk.data.path.append("/home/user/nltk_data")
-nltk.download('punkt')
-
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

-# Function to split text into chunks
-def split_text(text, max_chunk_size=1024):
-    sentences = nltk.sent_tokenize(text)  # Use NLTK's sentence tokenizer
+# Function to split text into chunks based on paragraphs
+def split_text_by_paragraph(text, max_chunk_size=1024):
+    paragraphs = text.split("\n\n")  # Splitting by paragraphs
     chunks = []
     chunk = ""

-    for sentence in sentences:
-        if len(chunk) + len(sentence) <= max_chunk_size:
-            chunk += sentence + " "
+    for paragraph in paragraphs:
+        if len(chunk) + len(paragraph) <= max_chunk_size:
+            chunk += paragraph + "\n\n"
         else:
             chunks.append(chunk.strip())
-            chunk = sentence + " "
+            chunk = paragraph + "\n\n"

     if chunk:
         chunks.append(chunk.strip())
@@ -51,7 +46,7 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf

-# Main processing function with text chunking
+# Main processing function with paragraph-based text chunking
 def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
@@ -62,7 +57,7 @@ def pdf_to_text(text, PDF, min_length=80):
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)

-        chunks = split_text(text)
+        chunks = split_text_by_paragraph(text)
         summarized_text = ""

         for chunk in chunks:
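
For reference, here is a minimal standalone sketch of the paragraph-based chunker introduced by this commit, plus a small usage example. The function body is taken from the diff above; the trailing return is assumed, since the hunk cuts off before the end of the function, and the sample text, chunk size, and printout are illustrative only, not part of app.py.

# Paragraph-based chunker as introduced in this commit (return line assumed).
def split_text_by_paragraph(text, max_chunk_size=1024):
    paragraphs = text.split("\n\n")  # split on blank lines between paragraphs
    chunks = []
    chunk = ""

    for paragraph in paragraphs:
        if len(chunk) + len(paragraph) <= max_chunk_size:
            chunk += paragraph + "\n\n"
        else:
            chunks.append(chunk.strip())
            chunk = paragraph + "\n\n"

    if chunk:
        chunks.append(chunk.strip())
    return chunks

# Hypothetical usage (not in app.py): a small max_chunk_size makes the
# grouping visible — the first two paragraphs fit in one chunk, the third
# starts a new one.
sample = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
for i, c in enumerate(split_text_by_paragraph(sample, max_chunk_size=40)):
    print(i, repr(c))

Note that a single paragraph longer than max_chunk_size is still kept as one oversized chunk (nothing splits inside a paragraph), and text extracted from a PDF may not contain blank-line paragraph breaks at all, in which case the whole document lands in a single chunk.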