albhu committed on
Commit
aafe73b
1 Parent(s): 3bb3d19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -6
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import streamlit as st
2
- import pdfplumber
3
  import docx
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
@@ -21,11 +21,7 @@ model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b", trust_rem
21
  def process_document(document_file):
22
  document_text = ""
23
  if document_file.type == "application/pdf":
24
- with pdfplumber.open(document_file) as pdf:
25
- for page in pdf.pages:
26
- text = page.extract_text()
27
- if text:
28
- document_text += text.strip() + "\n\n"
29
  elif document_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
30
  docx_file = docx.Document(document_file)
31
  for paragraph in docx_file.paragraphs:
 
1
  import streamlit as st
2
+ from pdfminer.high_level import extract_text
3
  import docx
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
 
21
  def process_document(document_file):
22
  document_text = ""
23
  if document_file.type == "application/pdf":
24
+ document_text = extract_text(document_file)
 
 
 
 
25
  elif document_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
26
  docx_file = docx.Document(document_file)
27
  for paragraph in docx_file.paragraphs: