# File: llm_processor.py

import os
import json

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Model Configuration
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"

llm = None
def load_llm_model():
    """Downloads and loads the GGUF model from Hugging Face."""
    global llm
    try:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise EnvironmentError("HF_TOKEN environment variable not found.")

        print(f"Downloading model {MODEL_FILE}...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, token=hf_token)

        print("Loading GGUF model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,       # context window size in tokens
            n_threads=2,      # CPU threads used for inference
            n_gpu_layers=0,   # 0 = run entirely on CPU
            verbose=False
        )
        print("GGUF model loaded successfully.")
    except Exception as e:
        print(f"Fatal error loading LLM: {e}")
        llm = None
def generate_json_from_text(ocr_text: str) -> dict:
    """
    Takes raw OCR text and uses the LLM to convert it into a structured JSON object.
    """
    if not llm:
        raise RuntimeError("LLM is not available.")

    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema. Follow these rules strictly:
- Output ONLY the JSON object, with no additional text, markdown, or backticks.
- Interpret OCR errors logically and correct them (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa'; treat '*' and other such artifacts as typos to strip, not as distinct item names).
- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
- Format invoice_date as DD-MM-YYYY; infer the full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse it with the seller's address).
- For items, parse lines matching the 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
- Be precise and fast; focus only on relevant data.

**JSON Schema:**
{{
  "invoice_number": "string or null",
  "invoice_date": "DD-MM-YYYY or null",
  "seller": "string or null",
  "invoice_to": "string or null",
  "items": [
    {{ "description": "string", "quantity": "integer or null", "rate": "float or null", "total": "float or null" }}
  ],
  "subtotal": "float or null",
  "tax_amount": "float or null",
  "grand_total": "float or null"
}}

**OCR Text:**
{ocr_text}
"""
    output = llm(
        prompt,
        max_tokens=1024,   # increased for longer JSON output
        temperature=0.5,   # slightly higher for better reasoning
        top_p=0.9,
        stop=["<|endoftext|>", "</s>"],
        echo=False
    )
    generated_text = output["choices"][0]["text"].strip()
    try:
        # Extract the first top-level JSON object from the model output.
        start_idx = generated_text.find("{")
        end_idx = generated_text.rfind("}") + 1
        # rfind() returns -1 when '}' is missing, so end_idx is 0 in that case;
        # require end_idx > start_idx rather than comparing against -1.
        if start_idx != -1 and end_idx > start_idx:
            json_str = generated_text[start_idx:end_idx]
            json_data = json.loads(json_str)
            return json_data
        else:
            raise json.JSONDecodeError("No JSON object found.", generated_text, 0)
    except json.JSONDecodeError:
        # Fallback: return a structured error alongside the original OCR text
        return {
            "error": "LLM failed to generate valid JSON.",
            "raw_output": generated_text,
            "cleaned_ocr_text": ocr_text
        }
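

# Illustrative usage sketch (an assumption, not part of the original file): load the
# model once, then feed raw OCR text through generate_json_from_text(). The sample
# receipt below is hypothetical and only reuses strings mentioned in the prompt rules;
# running it requires HF_TOKEN to be set so the model can be downloaded.
if __name__ == "__main__":
    load_llm_model()
    sample_ocr_text = (
        "SHANTHI HOTEL CATERERS\n"
        "Bill #:128998    Date: 17/02/19\n"
        "Masala-Dosa*   2   45.00   90.00\n"
        "Cofee          1   20.00   20.00\n"
        "CGT 13.94   SGT 13.94\n"
        "Net Amount 137.88\n"
    )
    result = generate_json_from_text(sample_ocr_text)
    print(json.dumps(result, indent=2))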