Spaces:

sunnysharma20
/

PDFReader

Build error

App Files Files Community

PDFReader / backend.py

sunnysharma20

Update backend.py

fc022bb verified about 1 year ago

raw

history blame contribute delete

6.65 kB

	import os
	import re
	from typing import List, Dict, Optional

	import pandas as pd
	from langchain.prompts import PromptTemplate
	from langchain_openai import OpenAI
	from pdfminer.high_level import extract_text # Changed from pypdf to pdfminer
	from ratelimit import limits, sleep_and_retry


	# OpenAI credentials.
	# Optionally replace the empty string with your actual API key. Prefer
	# leaving it empty and supplying OPENAI_API_KEY through the environment
	# (for example a deployment secret) so the key is never committed.
	api_key = ""

	if api_key:
	    # A hard-coded key was provided: export it for the OpenAI client.
	    os.environ["OPENAI_API_KEY"] = api_key
	else:
	    # No hard-coded key: keep whatever the environment already provides
	    # instead of overwriting it with an empty string.
	    api_key = os.environ.get("OPENAI_API_KEY", "")


	class InvoicePipeline:
	    """Extract structured invoice fields from PDF files with an OpenAI LLM.

	    For every path given to the constructor, ``run`` extracts the PDF text
	    with pdfminer, sends it to the LLM through a rate-limited call, parses
	    the pipe-separated reply and collects one row per successfully parsed
	    invoice into a pandas DataFrame.
	    """

	    # Output columns, in the exact order the prompt asks the model to use.
	    _COLUMNS = (
	        "Invoice ID",
	        "DESCRIPTION",
	        "Issue Data",
	        "UNIT PRICE",
	        "AMOUNT",
	        "Bill For",
	        "From",
	        "Terms",
	    )

	    # Field delimiter in the LLM reply: a pipe, optionally preceded by a
	    # backslash (models sometimes echo a markdown-escaped pipe).
	    _DELIMITER = re.compile(r"\\?\|")

	    def __init__(self, paths, calls_per_minute: int = 60):
	        """Set up the LLM client, the prompt and the rate limiter.

	        Args:
	            paths: Iterable of PDF file paths to process.
	            calls_per_minute: Maximum number of LLM calls this pipeline
	                makes per minute. Adjust to your OpenAI plan's RPM limit.
	        """
	        # This is your file path
	        self._paths = paths
	        # This is your LLM (GPT)
	        self._llm = OpenAI()
	        # This is prompt
	        self._prompt_template = self._get_default_prompt_template()

	        # Rate limiting configuration (adjust based on your OpenAI account limits)
	        self.calls_per_minute = calls_per_minute
	        self.one_minute = 60

	        # Build the limiter from the attributes above so the configuration
	        # is actually honoured. When the limit is hit, sleep_and_retry
	        # waits for the current period to end and retries the call.
	        self._limited_llm_call = sleep_and_retry(
	            limits(calls=self.calls_per_minute, period=self.one_minute)(
	                self._call_llm
	            )
	        )

	    def _call_llm(self, prompt: str) -> str:
	        """Send an already formatted prompt to the LLM and return its text."""
	        return self._llm.invoke(prompt)

	    # Apply rate limiting to the LLM call
	    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> Optional[str]:
	        """Query the LLM for the invoice fields, respecting the rate limit.

	        Args:
	            raw_data: Text extracted from one PDF.

	        Returns:
	            The raw LLM reply, or None if formatting the prompt or calling
	            the API raised an exception (the error is printed).
	        """
	        try:
	            prompt = self._prompt_template.format(pages=raw_data)
	            return self._limited_llm_call(prompt)
	        except Exception as e:
	            print(f"Error during OpenAI API call: {e}")
	            return None

	    # This function runs the whole extraction and produces a dataframe for us
	    def run(self) -> pd.DataFrame:
	        """Process every configured PDF and return the extracted rows.

	        Files whose text extraction yields nothing, whose LLM call fails or
	        whose reply cannot be parsed are skipped with a printed message.

	        Returns:
	            A DataFrame with one string column per entry of ``_COLUMNS``
	            and one row per successfully parsed invoice.
	        """
	        rows = []

	        for path in self._paths:
	            raw_text = self._get_raw_text_from_pdf(path)
	            print(f"Extracted Text: {raw_text}") # Added line for debugging
	            # Whitespace-only output (e.g. an image-only PDF) carries no
	            # information, so do not spend an LLM call on it.
	            if not raw_text or not raw_text.strip():
	                print(f"Skipping file {path} due to empty extraction")
	                continue

	            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
	            if not llm_resp: # None (or empty) means the API call failed
	                print(f"Skipping file due to rate limit or API error: {path}")
	                continue

	            data = self._parse_response(llm_resp)
	            if not data: # Only keep rows that parsed successfully
	                print(f"Skipping file {path} due to parsing failure.")
	                continue

	            rows.append(data)

	        # Build the frame once instead of concatenating inside the loop.
	        return pd.DataFrame(rows, columns=list(self._COLUMNS), dtype="str")

	    def _get_default_prompt_template(self) -> "PromptTemplate":
	        """Build the prompt asking for the eight fields, pipe-separated."""
	        # Assembled line by line so the prompt text does not depend on how
	        # this source file happens to be indented.
	        template = "\n".join([
	            "You are an expert invoice data extractor. Analyze the following text and extract the specified fields. Return the results in a structured, easily parseable format.",
	            "",
	            "Here are the extraction requirements:",
	            "",
	            "1. Invoice ID: The unique identifier for the invoice.",
	            "2. DESCRIPTION: A brief description of the product or service provided.",
	            "3. Issue Data: The date the invoice was issued.",
	            "4. UNIT PRICE: The price per unit of the product or service.",
	            "5. AMOUNT: The total amount due for the line item.",
	            "6. Bill For: The entity or individual being billed.",
	            "7. From: The name of the company issuing the invoice.",
	            '8. Terms: The payment terms (e.g., "Net 30 days").',
	            "",
	            "Important Instructions:",
	            "* Return a single line containing only the extracted values. Do NOT include any introductory text, conversational elements, or explanations.",
	            '* Enclose each value in double quotes. If a value is not found or is not applicable return "N/A".',
	            "* Do NOT include currency symbols (e.g., $, €, £).",
	            "* Separate each extracted value with a pipe symbol (`|`).",
	            "* The order of the extracted values MUST be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms",
	            "",
	            "Example:",
	            '"12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"',
	            "",
	            "Here is the text to analyze:",
	            "{pages}",
	            "",
	        ])
	        prompt_template = PromptTemplate(input_variables=["pages"], template=template)
	        return prompt_template

	    # We will try to extract the text from the PDF to a normal variable.
	    def _get_raw_text_from_pdf(self, path: str) -> str:
	        """Extract the text of a PDF using pdfminer.

	        Returns:
	            The extracted text, or an empty string if extraction raised
	            (the error is printed).
	        """
	        try:
	            return extract_text(path) # Use pdfminer
	        except Exception as e:
	            print(f"Error extracting text from PDF using pdfminer: {e}")
	            return "" # Return empty string on failure

	    def _extract_data_from_llm(self, raw_data: str) -> str:
	        """Query the LLM without rate limiting; exceptions propagate."""
	        return self._call_llm(self._prompt_template.format(pages=raw_data))

	    def _parse_response(self, response: str) -> Dict[str, str]:
	        """Parse the pipe-separated LLM reply into a field dictionary.

	        Both ``|`` and the escaped form ``\\|`` are accepted as delimiter.
	        Surrounding whitespace and all double quotes are removed from
	        every value.

	        Args:
	            response: Raw text returned by the LLM.

	        Returns:
	            A dict keyed by the names in ``_COLUMNS``, or an empty dict
	            when the reply does not contain exactly one value per column
	            or cannot be processed.
	        """
	        try:
	            # Split the response on the pipe symbol
	            values = self._DELIMITER.split(response.strip())
	            if len(values) != len(self._COLUMNS): # Ensure we have all expected values
	                print(f"Warning: Unexpected number of values in response: {len(values)}. Response: {response}")
	                return {} # Return empty dictionary

	            return {
	                column: value.strip().replace('"', '')
	                for column, value in zip(self._COLUMNS, values)
	            }

	        except Exception as e:
	            print(f"Error parsing LLM response: {e}. Response: {response}")
	            return {} # Return empty dictionary on parsing failure