|
|
""" |
|
|
VisionOCRAgent for SPARKNET |
|
|
|
|
|
Handles OCR and document vision tasks using Ollama's llava model. |
|
|
Extracts text from images, PDFs, diagrams, and complex documents. |
|
|
""" |
|
|
|
|
|
import base64 |
|
|
from pathlib import Path |
|
|
from typing import Optional, Dict, Any |
|
|
from loguru import logger |
|
|
from langchain_ollama import ChatOllama |
|
|
from langchain_core.messages import HumanMessage |
|
|
|
|
|
class VisionOCRAgent:
    """
    Specialized agent for vision-based OCR tasks.

    Wraps an Ollama-hosted vision-language model (llava by default) and
    exposes task-specific helpers: plain OCR, diagram analysis, table
    extraction, patent-page analysis, and handwriting transcription.
    All vision calls share a single code path (``_query_vision``) so the
    encode/message/invoke plumbing is defined in exactly one place.
    """

    def __init__(self, model_name: str = "llava:7b", base_url: str = "http://localhost:11434"):
        """
        Initialize VisionOCRAgent.

        Args:
            model_name: Ollama vision model to use (default: llava:7b)
            base_url: Ollama service URL
        """
        self.model_name = model_name
        self.base_url = base_url

        # Low temperature: OCR/transcription should be as deterministic as possible.
        self.vision_llm = ChatOllama(
            model=model_name,
            base_url=base_url,
            temperature=0.1,
        )

        logger.info(f"Initialized VisionOCRAgent with model: {model_name}")

    def _encode_image(self, image_path: str) -> str:
        """
        Encode an image file to base64 for llava.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    async def _query_vision(self, prompt: str, image_path: str) -> str:
        """
        Send one prompt + one image to the vision model; return the raw text reply.

        Shared plumbing for every public task method. Centralizing it here
        means the message payload format exists in a single place instead of
        being copy-pasted per task.

        Args:
            prompt: Instruction text for the model
            image_path: Path to the image to attach

        Returns:
            The model's text response content.
        """
        image_data = self._encode_image(image_path)

        # NOTE(review): the data URL always declares image/jpeg, even for PNG
        # or TIFF inputs. llava via Ollama appears tolerant of the mismatch,
        # but confirm if non-JPEG inputs ever misbehave.
        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{image_data}"
                }
            ]
        )

        response = await self.vision_llm.ainvoke([message])
        return response.content

    async def extract_text_from_image(
        self,
        image_path: str,
        preserve_formatting: bool = True
    ) -> str:
        """
        Extract text from an image using the vision model.

        Args:
            image_path: Path to image file
            preserve_formatting: Whether to preserve document structure

        Returns:
            Extracted text content

        Raises:
            Exception: Re-raised from the underlying model call after logging.
        """
        logger.info(f"📷 Extracting text from: {image_path}")

        try:
            if preserve_formatting:
                prompt = """Extract all text from this image, preserving the original formatting and structure.

Maintain:
- Paragraph breaks and line spacing
- Bullet points and numbered lists
- Section headings and hierarchy
- Table structures if present

Return only the extracted text, formatted as closely as possible to the original."""
            else:
                prompt = "Extract all text from this image. Return only the text content without any additional commentary."

            extracted_text = await self._query_vision(prompt, image_path)

            logger.success(f"✅ Extracted {len(extracted_text)} characters from {Path(image_path).name}")
            return extracted_text

        except Exception as e:
            logger.error(f"Failed to extract text from {image_path}: {e}")
            raise

    async def analyze_diagram(self, image_path: str) -> Dict[str, Any]:
        """
        Analyze technical diagrams, flowcharts, and schematics.

        Args:
            image_path: Path to diagram image

        Returns:
            Dictionary with keys ``diagram_type``, ``analysis``, and ``source``.

        Raises:
            Exception: Re-raised from the underlying model call after logging.
        """
        logger.info(f"📊 Analyzing diagram: {image_path}")

        try:
            prompt = """Analyze this technical diagram in detail. Provide:

1. Type of diagram (flowchart, circuit, organizational chart, etc.)
2. Main components and elements
3. All text labels and annotations
4. Connections and relationships between elements
5. Overall purpose and meaning

Format your response as structured text."""

            analysis = await self._query_vision(prompt, image_path)

            logger.success(f"✅ Analyzed diagram: {Path(image_path).name}")

            return {
                "diagram_type": "technical_diagram",
                "analysis": analysis,
                "source": image_path
            }

        except Exception as e:
            logger.error(f"Failed to analyze diagram {image_path}: {e}")
            raise

    async def extract_table_data(self, image_path: str) -> str:
        """
        Extract data from tables in images.

        Args:
            image_path: Path to image containing table

        Returns:
            Table data in markdown format

        Raises:
            Exception: Re-raised from the underlying model call after logging.
        """
        logger.info(f"📋 Extracting table from: {image_path}")

        try:
            prompt = """Extract the table data from this image.

Format the output as a Markdown table with proper alignment:
- Use | for column separators
- Use | --- | for header separator
- Maintain proper column alignment
- Include all rows and columns

Example format:
| Header 1 | Header 2 | Header 3 |
| --- | --- | --- |
| Data 1 | Data 2 | Data 3 |

Return ONLY the table, no additional text."""

            table_markdown = await self._query_vision(prompt, image_path)

            logger.success(f"✅ Extracted table from {Path(image_path).name}")
            return table_markdown

        except Exception as e:
            logger.error(f"Failed to extract table from {image_path}: {e}")
            raise

    async def analyze_patent_page(self, image_path: str) -> Dict[str, Any]:
        """
        Specialized analysis for patent document pages.

        Args:
            image_path: Path to patent page image

        Returns:
            Dictionary with keys ``page_content``, ``source``, and ``type``.

        Raises:
            Exception: Re-raised from the underlying model call after logging.
        """
        logger.info(f"📄 Analyzing patent page: {image_path}")

        try:
            prompt = """Analyze this patent document page. Extract:

1. Patent number or application number (if visible)
2. Title or heading
3. All body text (claims, descriptions, specifications)
4. Figure numbers and captions
5. Any diagrams or technical drawings descriptions
6. Inventor names and assignee information (if visible)
7. Dates (filing date, publication date, etc.)

Preserve the structure and formatting. Return comprehensive extracted content."""

            analysis = await self._query_vision(prompt, image_path)

            logger.success(f"✅ Analyzed patent page: {Path(image_path).name}")

            return {
                "page_content": analysis,
                "source": image_path,
                "type": "patent_page"
            }

        except Exception as e:
            logger.error(f"Failed to analyze patent page {image_path}: {e}")
            raise

    async def identify_handwriting(self, image_path: str) -> str:
        """
        Extract handwritten text from images.

        Args:
            image_path: Path to image with handwritten content

        Returns:
            Extracted handwritten text

        Raises:
            Exception: Re-raised from the underlying model call after logging.
        """
        logger.info(f"✍️ Extracting handwriting from: {image_path}")

        try:
            prompt = """This image contains handwritten text. Please:

1. Carefully read all handwritten content
2. Transcribe the text exactly as written
3. Indicate [unclear] for illegible portions
4. Preserve line breaks and spacing
5. Note any annotations or margin notes

Return only the transcribed text."""

            handwriting = await self._query_vision(prompt, image_path)

            logger.success(f"✅ Extracted handwriting from {Path(image_path).name}")
            return handwriting

        except Exception as e:
            logger.error(f"Failed to extract handwriting from {image_path}: {e}")
            raise

    def is_available(self) -> bool:
        """
        Check if the configured vision model is available on the Ollama server.

        Returns:
            True if the model is available, False otherwise (including when
            the server cannot be reached or times out).
        """
        try:
            # Local import keeps `requests` an optional dependency for
            # callers that never probe availability.
            import requests
            # Timeout added so an unreachable server fails fast instead of
            # hanging this call indefinitely; failure path returns False below.
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            if response.status_code == 200:
                models = response.json().get("models", [])
                return any(self.model_name in model.get("name", "") for model in models)
            return False
        except Exception as e:
            logger.warning(f"Could not check model availability: {e}")
            return False
|
|
|