# minar09's picture
# Create main.py
# a639170 verified
# raw
# history blame
# 3.92 kB
import os
import json
import time
import logging
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
from mineru import Mineru, Layout, Table
from sentence_transformers import SentenceTransformer
from llama_cpp import Llama
from fastapi.encoders import jsonable_encoder
# Configure the root logger to emit INFO and above, then create the logger
# used by everything in this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class ProductSpec:
    """Specification of a single product extracted from a PDF.

    Only ``name`` is required; every other field defaults to ``None`` when it
    is not supplied.
    """

    name: str
    description: Optional[str] = None
    price: Optional[float] = None
    # Both container fields default to None (not to an empty container), so
    # they are annotated as Optional to match what callers can observe.
    attributes: Optional[Dict[str, str]] = None
    tables: Optional[List[Dict]] = None

    def to_dict(self) -> Dict:
        """Return this spec converted by FastAPI's ``jsonable_encoder``."""
        return jsonable_encoder(self)
class PDFProcessor:
    """Pipeline that turns a PDF catalog into product specs and tables.

    Layout extraction is delegated to MinerU. Each text block of the layout is
    then sent to a local quantized LLM (loaded through ``llama_cpp``) that is
    prompted to answer with a JSON description of the product.
    """

    def __init__(self):
        """Load the MinerU extractor, the embedding model and the LLM."""
        self.mineru = Mineru()
        # NOTE(review): emb_model is loaded here but no method of this class
        # uses it — confirm whether it is still needed.
        self.emb_model = SentenceTransformer('all-MiniLM-L6-v2')
        # os.cpu_count() may return None, and "count - 1" is 0 on a
        # single-core host; always request at least one thread.
        n_threads = max(1, (os.cpu_count() or 1) - 1)
        # Initialize quantized LLM (using deepseek-1.3b).
        # Any non-empty USE_GPU value (even "0") enables GPU offloading.
        self.llm = Llama(
            model_path="models/deepseek-1.3b-q5_k_m.gguf",
            n_ctx=2048,
            n_threads=n_threads,
            n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
        )

    def extract_layout(self, pdf_path: str) -> List["Layout"]:
        """Extract structured layout using MinerU.

        NOTE(review): the annotation promises a list, but ``process_pdf``
        reads ``.tables`` and ``.text_blocks`` directly from the result —
        confirm the real return type of ``Mineru.process_pdf``.
        """
        return self.mineru.process_pdf(pdf_path)

    def process_tables(self, tables: List["Table"]) -> List[Dict]:
        """Convert MinerU tables to plain dicts.

        Each table becomes a dict with the keys ``page``, ``cells``,
        ``header`` and ``content``, copied from the table's ``page_number``,
        ``cells``, ``headers`` and ``content`` attributes.
        """
        return [
            {
                "page": table.page_number,
                "cells": table.cells,
                "header": table.headers,
                "content": table.content,
            }
            for table in tables
        ]

    def generate_query_prompt(self, text: str) -> str:
        """Build the extraction prompt for one block of text.

        The prompt embeds *text* and asks for a JSON object with the keys
        ``name``, ``description``, ``price`` and ``attributes``.
        """
        return (
            "Extract product specifications from this text:\n"
            f"{text}\n"
            "Return JSON format:\n"
            "{\n"
            '"name": "product name",\n'
            '"description": "product description",\n'
            '"price": numeric_price,\n'
            '"attributes": { "key": "value" }\n'
            "}"
        )

    @staticmethod
    def _extract_json_object(response) -> Dict:
        """Decode the first JSON object embedded in *response*.

        Decoding starts at the first ``{`` and stops at the end of that
        object, so explanatory text after the JSON (even text that contains
        braces) does not break parsing.

        Raises:
            ValueError: if *response* is not a string, contains no ``{``, or
                the text starting at the first ``{`` is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        if not isinstance(response, str):
            raise ValueError(
                f"expected a text response, got {type(response).__name__}"
            )
        json_start = response.find('{')
        if json_start == -1:
            raise ValueError("no JSON object found in response")
        # raw_decode parses one complete value and ignores whatever follows.
        data, _ = json.JSONDecoder().raw_decode(response[json_start:])
        return data

    def parse_response(self, response: str) -> Optional["ProductSpec"]:
        """Parse an LLM answer into a ``ProductSpec``.

        Returns ``None`` (after logging a warning) when no JSON object can be
        decoded from *response*. A missing or null ``name`` becomes ``''`` and
        missing or null ``attributes`` become ``{}``.
        """
        try:
            data = self._extract_json_object(response)
        except ValueError as e:
            logger.warning(f"Parse error: {e}")
            return None
        return ProductSpec(
            name=data.get('name') or '',
            description=data.get('description'),
            price=data.get('price'),
            attributes=data.get('attributes') or {},
        )

    def process_pdf(self, pdf_path: str) -> Dict:
        """Run the full pipeline on one PDF.

        Returns a dict with two keys: ``products`` (one encoded spec per text
        block whose LLM answer could be parsed) and ``tables`` (all tables
        from the layout). Every product also carries the full table list.
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        # Extract structured content
        layout = self.extract_layout(pdf_path)
        tables = self.process_tables(layout.tables)

        # Process text blocks
        products = []
        for block in layout.text_blocks:
            text = block.text
            # An empty block has nothing to extract; skip the LLM call.
            if not text or not str(text).strip():
                continue
            prompt = self.generate_query_prompt(text)
            response = self.llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=512,
            )
            content = response['choices'][0]['message']['content']
            product = self.parse_response(content)
            if product is not None:
                product.tables = tables
                products.append(product.to_dict())

        elapsed = time.perf_counter() - start_time
        logger.info("Processed %d products in %.2fs", len(products), elapsed)
        return {"products": products, "tables": tables}
def process_pdf_catalog(pdf_path: str) -> tuple:
    """Process one PDF catalog and report the outcome as a status message.

    Returns:
        ``(result, message)``. On success ``result`` is the dict returned by
        ``PDFProcessor.process_pdf``; on any failure it is ``{}`` and the
        message is ``"Error processing PDF"``.
    """
    try:
        # Built inside the try block so that a failure while loading the
        # models is reported through the same error tuple as a processing
        # failure instead of escaping to the caller.
        # NOTE(review): a new PDFProcessor (and its models) is created on
        # every call — consider reusing one instance if this is called often.
        processor = PDFProcessor()
        result = processor.process_pdf(pdf_path)
    except Exception as e:
        # Top-level boundary: log with traceback and return the error tuple.
        logger.exception(f"Processing failed: {e}")
        return {}, "Error processing PDF"
    return result, "Processing completed successfully!"