Spaces:

Gangadhar123
/

pdf_ocr_extraction_1

Sleeping

Gangadhar123 committed on Jun 6

Commit

fa12524

verified ·

1 Parent(s): 88ace73

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,17 +4,18 @@ import fitz  # PyMuPDF
 import pytesseract
 from PIL import Image
 import gradio as gr
-from transformers import pipeline
 from extraction_service import ExtractionService
 # Load field extraction config
 extractor = ExtractionService("fields_config.json")
-# Load the TinyLlama chat pipeline (text-generation with instruction-tuning)
 chat_pipeline = pipeline(
     "text-generation",
-    model="TinyLlama/TinyLlama-1.1B-Chat",
-    device=-1,  # CPU; change to 0 if you have a GPU
     max_length=512,
     do_sample=True,
     temperature=0.7,
@@ -71,17 +72,21 @@ def ask_question(question):
         return "Upload and extract a PDF first."
     context = json.dumps(extracted_data_store["fields"], indent=2)
-    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
     try:
-        results = chat_pipeline(prompt)
-        answer = results[0]["generated_text"].split("Answer:")[-1].strip()
         return answer
     except Exception as e:
         return f"Model inference error: {str(e)}"
 with gr.Blocks() as demo:
-    gr.Markdown("## 🛡️ Insurance PDF Extractor & Q&A (TinyLlama 1.1B Chat)")
     with gr.Row():
         pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
         extract_btn = gr.Button("Extract")

 import pytesseract
 from PIL import Image
 import gradio as gr
+import os
 from extraction_service import ExtractionService
+from transformers import pipeline
 # Load field extraction config
 extractor = ExtractionService("fields_config.json")
+# Initialize Hugging Face text-generation pipeline with Dolly v2 3B
 chat_pipeline = pipeline(
     "text-generation",
+    model="databricks/dolly-v2-3b",
+    device=-1,  # CPU; set device=0 for GPU if available
     max_length=512,
     do_sample=True,
     temperature=0.7,
         return "Upload and extract a PDF first."
     context = json.dumps(extracted_data_store["fields"], indent=2)
+    prompt = f"You are an assistant for insurance data.\nExtracted data:\n{context}\n\nQuestion: {question}\nAnswer:"
     try:
+        responses = chat_pipeline(prompt, max_length=512, do_sample=True, temperature=0.7, top_p=0.9)
+        answer = responses[0]["generated_text"]
+        # Extract answer text after the question (remove prompt)
+        if prompt in answer:
+            answer = answer.split(prompt,1)[1].strip()
         return answer
     except Exception as e:
         return f"Model inference error: {str(e)}"
 with gr.Blocks() as demo:
+    gr.Markdown("## 🛡️ Insurance PDF Extractor & Q&A using Dolly-v2-3B")
     with gr.Row():
         pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
         extract_btn = gr.Button("Extract")