# File: llm_processor.py

import os
import json

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Model Configuration
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"

llm = None
def load_llm_model():
    """Downloads and loads the GGUF model from Hugging Face."""
    global llm
    try:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise EnvironmentError("HF_TOKEN environment variable not found.")

        print(f"Downloading model {MODEL_FILE}...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, token=hf_token)

        print("Loading GGUF model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,       # context window size in tokens
            n_threads=2,      # CPU threads used for inference
            n_gpu_layers=0,   # 0 = run entirely on CPU
            verbose=False
        )
        print("GGUF model loaded successfully.")
    except Exception as e:
        print(f"Fatal error loading LLM: {e}")
        llm = None
def generate_json_from_text(ocr_text: str) -> dict:
    """
    Takes raw OCR text and uses the LLM to convert it into a structured JSON object.
    """
    if not llm:
        raise RuntimeError("LLM is not available.")

    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema. Follow these rules strictly:
- Output ONLY the JSON object, with no additional text, markdown, or backticks.
- Interpret OCR errors logically and correct them (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa'; treat '*' and other such artifacts as typos to strip, not as distinct item names).
- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
- Format invoice_date as DD-MM-YYYY; infer the full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse it with the seller's address).
- For items, parse lines matching the 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
- Be precise and fast; focus only on relevant data.

**JSON Schema:**
{{
  "invoice_number": "string or null",
  "invoice_date": "DD-MM-YYYY or null",
  "seller": "string or null",
  "invoice_to": "string or null",
  "items": [
    {{ "description": "string", "quantity": "integer or null", "rate": "float or null", "total": "float or null" }}
  ],
  "subtotal": "float or null",
  "tax_amount": "float or null",
  "grand_total": "float or null"
}}

**OCR Text:**
{ocr_text}
"""
    output = llm(
        prompt,
        max_tokens=1024,   # increased for longer JSON output
        temperature=0.5,   # slightly higher for better reasoning
        top_p=0.9,
        stop=["<|endoftext|>", "</s>"],
        echo=False
    )
    generated_text = output["choices"][0]["text"].strip()
    try:
        # Extract the first top-level JSON object from the model output.
        start_idx = generated_text.find("{")
        end_idx = generated_text.rfind("}") + 1
        # rfind() returns -1 when '}' is missing, so end_idx is 0 in that case;
        # require end_idx > start_idx rather than comparing against -1.
        if start_idx != -1 and end_idx > start_idx:
            json_str = generated_text[start_idx:end_idx]
            json_data = json.loads(json_str)
            return json_data
        else:
            raise json.JSONDecodeError("No JSON object found.", generated_text, 0)
    except json.JSONDecodeError:
        # Fallback: return a structured error alongside the original OCR text
        return {
            "error": "LLM failed to generate valid JSON.",
            "raw_output": generated_text,
            "cleaned_ocr_text": ocr_text
        }
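

# Illustrative usage sketch (an assumption, not part of the original file): load the
# model once, then feed raw OCR text through generate_json_from_text(). The sample
# receipt below is hypothetical and only reuses strings mentioned in the prompt rules;
# running it requires HF_TOKEN to be set so the model can be downloaded.
if __name__ == "__main__":
    load_llm_model()
    sample_ocr_text = (
        "SHANTHI HOTEL CATERERS\n"
        "Bill #:128998    Date: 17/02/19\n"
        "Masala-Dosa*   2   45.00   90.00\n"
        "Cofee          1   20.00   20.00\n"
        "CGT 13.94   SGT 13.94\n"
        "Net Amount 137.88\n"
    )
    result = generate_json_from_text(sample_ocr_text)
    print(json.dumps(result, indent=2))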