Ari committed on
Commit
ac28e59
·
verified ·
1 Parent(s): 170c2bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -5
app.py CHANGED
@@ -5,6 +5,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from fpdf import FPDF
6
  from gtts import gTTS
7
  from pdfminer.high_level import extract_text
 
8
 
9
  nltk.download('punkt')
10
 
@@ -12,13 +13,35 @@ nltk.download('punkt')
12
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
13
  model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Main processing function
16
  def pdf_to_text(text, PDF, min_length=20):
17
  try:
18
- # Extract text from PDF if no input text provided
19
- if text == "":
 
 
 
 
 
 
 
20
  text = extract_text(PDF.name)
21
-
22
  # Tokenize text
23
  inputs = tokenizer([text], max_length=1024, return_tensors="pt")
24
  min_length = int(min_length)
@@ -48,9 +71,9 @@ def pdf_to_text(text, PDF, min_length=20):
48
  # Gradio interface
49
  iface = gr.Interface(
50
  fn=pdf_to_text,
51
- inputs=[gr.Textbox(label="Input Text"), gr.File(label="Upload PDF"), gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length")],
52
  outputs=[gr.Audio(label="Generated Audio"), gr.Textbox(label="Generated Summary"), gr.File(label="Summary PDF")]
53
  )
54
 
55
  if __name__ == "__main__":
56
- iface.launch()
 
5
  from fpdf import FPDF
6
  from gtts import gTTS
7
  from pdfminer.high_level import extract_text
8
+ from docx import Document
9
 
10
  nltk.download('punkt')
11
 
 
13
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
14
  model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
15
 
16
+ # Function to convert DOCX to PDF
17
+ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
18
+ doc = Document(docx_file)
19
+ full_text = []
20
+ for para in doc.paragraphs:
21
+ full_text.append(para.text)
22
+
23
+ # Create a PDF and write the extracted text
24
+ pdf = FPDF()
25
+ pdf.add_page()
26
+ pdf.set_font("Times", size=12)
27
+ pdf.multi_cell(190, 10, txt="\n".join(full_text), align='C')
28
+ pdf.output(output_pdf)
29
+ return output_pdf
30
+
31
  # Main processing function
32
  def pdf_to_text(text, PDF, min_length=20):
33
  try:
34
+ # Determine whether the input is a PDF or DOCX
35
+ file_extension = os.path.splitext(PDF.name)[1].lower()
36
+
37
+ # If DOCX, first convert it to PDF
38
+ if file_extension == '.docx':
39
+ pdf_file_path = docx_to_pdf(PDF.name) # Convert DOCX to PDF
40
+ text = extract_text(pdf_file_path) # Extract text from the newly created PDF
41
+ # If PDF, extract text from it directly
42
+ elif file_extension == '.pdf' and text == "":
43
  text = extract_text(PDF.name)
44
+
45
  # Tokenize text
46
  inputs = tokenizer([text], max_length=1024, return_tensors="pt")
47
  min_length = int(min_length)
 
71
  # Gradio interface
72
  iface = gr.Interface(
73
  fn=pdf_to_text,
74
+ inputs=[gr.Textbox(label="Input Text"), gr.File(label="Upload PDF or DOCX"), gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length")],
75
  outputs=[gr.Audio(label="Generated Audio"), gr.Textbox(label="Generated Summary"), gr.File(label="Summary PDF")]
76
  )
77
 
78
  if __name__ == "__main__":
79
+ iface.launch()