Spaces:

ignaciaginting
/

extract_from_doc

Build error

ignaciaginting committed on May 5

Commit

b39b068

verified ·

1 Parent(s): e205139

update

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,17 +1,27 @@
 import gradio as gr
-import os
 from huggingface_hub import snapshot_download
-from pdf_extract_kit import extract_text  # Assuming this function exists in the toolkit
-# Ensure the model is downloaded
-model_dir = "./PDF-Extract-Kit-1.0"
-if not os.path.exists(model_dir):
-    snapshot_download(repo_id='opendatalab/PDF-Extract-Kit-1.0', local_dir=model_dir, max_workers=20)
-def process_pdf(file):
-    # Implement your PDF processing logic here using PDF-Extract-Kit
-    extracted_text = extract_text(file.name, model_dir=model_dir)
-    return extracted_text
-iface = gr.Interface(fn=process_pdf, inputs=gr.File(type="binary"), outputs="text", title="PDF Extractor")
-iface.launch()

 import gradio as gr
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
 from huggingface_hub import snapshot_download
+import os
+# Step 1: Download the model
+model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0')
+# Step 2: Initialize pipeline
+pipe = pipeline(
+    task=Tasks.document_segmentation,
+    model=model_dir
+)
+# Step 3: Define inference function
+def extract_info_from_pdf(pdf_file):
+    result = pipe({'file': pdf_file.name})
+    return str(result)
+# Step 4: Gradio UI
+gr.Interface(
+    fn=extract_info_from_pdf,
+    inputs=gr.File(type="binary", label="Upload PDF"),
+    outputs="text",
+    title="PDF Extractor (PDF-Extract-Kit)"
+).launch()