vilarin committed on
Commit
4d87b14
1 Parent(s): 60221b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -5
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
6
  import os
7
  from threading import Thread
8
 
9
- import fitz
10
  import docx
11
  from pptx import Presentation
12
 
@@ -56,7 +56,7 @@ def extract_text(path):
56
  return open(path, 'r').read()
57
 
58
  def extract_pdf(path):
59
- doc = fitz.open(path)
60
  text = ""
61
  for page in doc:
62
  text += page.get_text()
@@ -82,12 +82,13 @@ def extract_pptx(path):
82
  def mode_load(path):
83
  choice = ""
84
  file_type = path.split(".")[-1]
 
85
  if file_type in ["pdf", "txt", "py", "docx", "pptx", "json", "cpp", "md"]:
86
- if file_type.endswith(".pdf"):
87
  content = extract_pdf(path)
88
- elif file_type.endswith(".docx"):
89
  content = extract_docx(path)
90
- elif file_type.endswith(".pptx"):
91
  content = extract_pptx(path)
92
  else:
93
  content = extract_text(path)
 
6
  import os
7
  from threading import Thread
8
 
9
+ import pymupdf
10
  import docx
11
  from pptx import Presentation
12
 
 
56
  return open(path, 'r').read()
57
 
58
  def extract_pdf(path):
59
+ doc = pymupdf.open(path)
60
  text = ""
61
  for page in doc:
62
  text += page.get_text()
 
82
  def mode_load(path):
83
  choice = ""
84
  file_type = path.split(".")[-1]
85
+ print(file_type)
86
  if file_type in ["pdf", "txt", "py", "docx", "pptx", "json", "cpp", "md"]:
87
+ if file_type.endswith("pdf"):
88
  content = extract_pdf(path)
89
+ elif file_type.endswith("docx"):
90
  content = extract_docx(path)
91
+ elif file_type.endswith("pptx"):
92
  content = extract_pptx(path)
93
  else:
94
  content = extract_text(path)