Update app.py
app.py CHANGED

@@ -7,15 +7,6 @@ import gradio as gr
 from transformers import pipeline
 from extraction_service import ExtractionService
 
-# Initialize the HF pipeline with smaller faster model
-model_name = "declare-lab/flan-alpaca-small"
-pipe = pipeline(
-    "text2text-generation",
-    model=model_name,
-    tokenizer=model_name,
-    device=-1  # CPU
-)
-
 # Load field extraction config
 extractor = ExtractionService("fields_config.json")
 
@@ -36,6 +27,15 @@ def extract_text_from_pdf(pdf_stream: io.BytesIO) -> str:
     except Exception as e:
         return f"Error processing PDF: {str(e)}"
 
+# Initialize Hugging Face pipeline with a small public model
+model_name = "google/flan-t5-small"
+pipe = pipeline(
+    "text2text-generation",
+    model=model_name,
+    tokenizer=model_name,
+    device=-1  # Use CPU; change to 0 if you want GPU
+)
+
 # Store extracted content
 extracted_data_store = {"raw_text": "", "fields": {}}
 
@@ -69,18 +69,17 @@ def ask_question(question):
         return "Upload and extract a PDF first."
 
     context = json.dumps(extracted_data_store["fields"], indent=2)
-    prompt = f"
-
+    prompt = f"Context: {context}\nQuestion: {question}"
+
     try:
-        # Using HF pipeline generate method for text generation
         result = pipe(prompt, max_length=256, do_sample=False)
-        answer = result[0][
+        answer = result[0]["generated_text"]
         return answer.strip()
     except Exception as e:
-        return f"
+        return f"Model inference error: {str(e)}"
 
 with gr.Blocks() as demo:
-    gr.Markdown("## 🛡️ Insurance PDF Extractor & Q&A (
+    gr.Markdown("## 🛡️ Insurance PDF Extractor & Q&A (Using google/flan-t5-small)")
     with gr.Row():
         pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
         extract_btn = gr.Button("Extract")
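For reference, here is a minimal standalone sketch of the Q&A path this commit switches to: the same text2text-generation pipeline with google/flan-t5-small, the new "Context: ... Question: ..." prompt, and the "generated_text" lookup, run outside Gradio. The sample fields dict stands in for extracted_data_store["fields"] and is invented for illustration.

import json

from transformers import pipeline

# Same initialization as the added lines 30-37: small public model on CPU
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    tokenizer="google/flan-t5-small",
    device=-1,  # CPU; set to 0 for the first GPU
)

# Invented stand-in for extracted_data_store["fields"]
fields = {"policy_number": "PN-12345", "insured_name": "Jane Doe"}

context = json.dumps(fields, indent=2)
prompt = f"Context: {context}\nQuestion: What is the policy number?"

result = pipe(prompt, max_length=256, do_sample=False)
print(result[0]["generated_text"].strip())

Because the pipeline is created at module level, the model is downloaded and loaded once when the Space wakes up rather than on every question, and device=-1 keeps inference on CPU, which suits a Space without a GPU.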
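The diff cuts off at extract_btn before any event wiring, so the following is a hypothetical sketch of how a Blocks app like this one typically connects its buttons to handlers. The component names mirror those visible in the diff, but handle_extract, the output components, and the ask button are assumptions, not part of this commit.

import gradio as gr

# Hypothetical wiring sketch; the event hookups are not shown in this commit.
def handle_extract(pdf_file):
    # Placeholder: in app.py this would call extract_text_from_pdf()
    # and the ExtractionService, then fill extracted_data_store.
    return "extracted fields would appear here"

def ask_question(question):
    # Placeholder for the real ask_question() shown in the diff.
    return f"answer to: {question}"

with gr.Blocks() as demo:
    gr.Markdown("## 🛡️ Insurance PDF Extractor & Q&A")
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        extract_btn = gr.Button("Extract")
    fields_out = gr.Textbox(label="Extracted fields")
    question_in = gr.Textbox(label="Question")
    answer_out = gr.Textbox(label="Answer")
    ask_btn = gr.Button("Ask")

    # Button events: fn receives the input component's value,
    # and its return value populates the output component.
    extract_btn.click(fn=handle_extract, inputs=pdf_input, outputs=fields_out)
    ask_btn.click(fn=ask_question, inputs=question_in, outputs=answer_out)

demo.launch()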