Spaces:

albhu
/

legalgeek

Sleeping

albhu committed on Apr 28

Commit

aafe73b

•

1 Parent(s): 3bb3d19

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import streamlit as st
-import pdfplumber
 import docx
 from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -21,11 +21,7 @@ model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b", trust_rem
 def process_document(document_file):
     document_text = ""
     if document_file.type == "application/pdf":
-        with pdfplumber.open(document_file) as pdf:
-            for page in pdf.pages:
-                text = page.extract_text()
-                if text:
-                    document_text += text.strip() + "\n\n"
     elif document_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
         docx_file = docx.Document(document_file)
         for paragraph in docx_file.paragraphs:

 import streamlit as st
+from pdfminer.high_level import extract_text
 import docx
 from transformers import AutoTokenizer, AutoModelForCausalLM
 def process_document(document_file):
     document_text = ""
     if document_file.type == "application/pdf":
+        document_text = extract_text(document_file)
     elif document_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
         docx_file = docx.Document(document_file)
         for paragraph in docx_file.paragraphs: