# PDFReader / backend.py
# sunnysharma20's picture
# Update backend.py
# fc022bb verified
import os
import re
from typing import Dict, List, Optional

import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pdfminer.high_level import extract_text  # Changed from pypdf to pdfminer
from ratelimit import limits, sleep_and_retry
# Replace with your actual API key, or leave it empty to rely on an
# OPENAI_API_KEY that is already exported in the environment (preferred:
# keys should not be committed to source control).
api_key = ""
# Only export the key when one was actually provided. Writing the empty
# placeholder unconditionally would overwrite a valid OPENAI_API_KEY that the
# user has already set in the environment.
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
class InvoicePipeline:
    """Extract structured invoice fields from PDF files with an OpenAI LLM.

    Each PDF is converted to text with pdfminer, the text is sent to the LLM
    with a prompt that asks for eight pipe-separated values, and every
    successfully parsed answer becomes one row of the DataFrame returned by
    :meth:`run`.
    """

    # Output columns, in the exact order the prompt asks the model to emit.
    # NOTE(review): "Issue Data" looks like a typo for "Issue Date"; it is
    # kept unchanged because callers may index the result by this name.
    _COLUMNS = (
        "Invoice ID",
        "DESCRIPTION",
        "Issue Data",
        "UNIT PRICE",
        "AMOUNT",
        "Bill For",
        "From",
        "Terms",
    )

    # Rate limiting configuration (adjust based on your OpenAI account
    # limits). These feed the decorator on
    # `_extract_data_from_llm_with_rate_limit`, which is evaluated once when
    # the class is created, so the limit is shared by all instances.
    CALLS_PER_MINUTE = 60
    ONE_MINUTE = 60

    # Matches a pipe that is followed by an even number of double quotes,
    # i.e. a pipe that is not inside a quoted value.
    _PIPE_OUTSIDE_QUOTES = re.compile(r'\|(?=(?:[^"]*"[^"]*")*[^"]*$)')

    def __init__(self, paths):
        """Store the files to process and set up the LLM and the prompt.

        Args:
            paths: Iterable of PDF locations; each item is passed to
                pdfminer's ``extract_text``. A single ``str`` or
                ``os.PathLike`` is also accepted and treated as one file
                (iterating a bare string would otherwise yield characters).
        """
        if isinstance(paths, (str, os.PathLike)):
            paths = [paths]
        # Materialize once so `run` can be called repeatedly even when a
        # one-shot iterator was supplied.
        self._paths = list(paths)
        # This is your LLM (GPT)
        self._llm = OpenAI()
        # This is the prompt
        self._prompt_template = self._get_default_prompt_template()
        # Kept as instance attributes for backward compatibility. They mirror
        # the class-level values actually used by the rate-limit decorator;
        # changing them on an instance does not alter the applied limit.
        self.calls_per_minute = self.CALLS_PER_MINUTE
        self.one_minute = self.ONE_MINUTE

    @sleep_and_retry
    @limits(calls=CALLS_PER_MINUTE, period=ONE_MINUTE)  # Calls/minute
    def _extract_data_from_llm_with_rate_limit(self, raw_data: str) -> Optional[str]:
        """Query the LLM under the configured rate limit.

        Args:
            raw_data: Text extracted from one PDF.

        Returns:
            The LLM's answer, or ``None`` when the API call raised. The error
            is printed so one failing file does not abort the whole batch.
        """
        try:
            return self._extract_data_from_llm(raw_data)
        except Exception as e:  # best-effort per file: report and move on
            print(f"Error during OpenAI API call: {e}")
            return None

    def run(self) -> pd.DataFrame:
        """Process every configured PDF and collect the extracted fields.

        Files are skipped (with a printed message) when no text could be
        extracted, when the extracted text is only whitespace, when the LLM
        call fails, or when the answer cannot be parsed.

        Returns:
            A DataFrame with the eight invoice columns and one row per
            successfully processed file; empty when nothing succeeded.
        """
        # Rows are accumulated in a list and the frame is built once at the
        # end, instead of concatenating onto an (initially empty) DataFrame
        # on every iteration.
        rows: List[Dict[str, str]] = []
        for path in self._paths:
            raw_text = self._get_raw_text_from_pdf(path)
            print(f"Extracted Text: {raw_text}")  # Added line for debugging
            # Whitespace-only output (e.g. just form feeds) carries nothing
            # to extract, so do not spend an API call on it.
            if not raw_text or not raw_text.strip():
                print(f"Skipping file {path} due to empty extraction")
                continue
            llm_resp = self._extract_data_from_llm_with_rate_limit(raw_text)
            if not llm_resp:  # None/empty response from the LLM call
                print(f"Skipping file due to rate limit or API error: {path}")
                continue
            data = self._parse_response(llm_resp)
            if not data:  # Only keep rows whose parsing was successful
                print(f"Skipping file {path} due to parsing failure.")
                continue
            rows.append(data)
        # All columns are kept as strings to accommodate invoice number
        # formats and non-integer amounts.
        return pd.DataFrame(rows, columns=list(self._COLUMNS)).astype("str")

    def _get_default_prompt_template(self) -> PromptTemplate:
        """Build the prompt that asks for the eight pipe-separated fields.

        Returns:
            A ``PromptTemplate`` with a single input variable, ``pages``,
            which receives the text extracted from the PDF.
        """
        template = """You are an expert invoice data extractor. Analyze the following text and extract the specified fields. Return the results in a *structured, easily parseable format*.
Here are the extraction requirements:
1. **Invoice ID:** The unique identifier for the invoice.
2. **DESCRIPTION:** A brief description of the product or service provided.
3. **Issue Data:** The date the invoice was issued.
4. **UNIT PRICE:** The price per unit of the product or service.
5. **AMOUNT:** The total amount due for the line item.
6. **Bill For:** The entity or individual being billed.
7. **From:** The name of the company issuing the invoice.
8. **Terms:** The payment terms (e.g., "Net 30 days").
*Important Instructions*:
* Return a single line containing only the extracted values. Do *NOT* include any introductory text, conversational elements, or explanations.
* Enclose *each value* in double quotes. If a value is not found or is not applicable return "N/A".
* Do *NOT* include currency symbols (e.g., $, €, £).
* Separate each extracted value with a pipe symbol (`|`).
* The order of the extracted values *MUST* be: Invoice ID | DESCRIPTION | Issue Data | UNIT PRICE | AMOUNT | Bill For | From | Terms
Example:
"12345" | "Consulting Services" | "2023-11-15" | "100.00" | "1000.00" | "Acme Corp" | "XYZ Consulting" | "Net 30 days"
Here is the text to analyze:
{pages}
"""
        prompt_template = PromptTemplate(input_variables=["pages"], template=template)
        return prompt_template

    def _get_raw_text_from_pdf(self, path: str) -> str:
        """Extract the text of one PDF using pdfminer.

        Args:
            path: Location of the PDF, passed straight to ``extract_text``.

        Returns:
            The extracted text, or an empty string when extraction raised
            (the error is printed).
        """
        try:
            return extract_text(path)  # Use pdfminer
        except Exception as e:  # best-effort per file: report and move on
            print(f"Error extracting text from PDF using pdfminer: {e}")
            return ""  # Return empty string on failure

    def _extract_data_from_llm(self, raw_data: str) -> str:
        """Query the LLM once, without rate limiting or error handling.

        Args:
            raw_data: Text extracted from one PDF; substituted for ``pages``
                in the prompt template.

        Returns:
            The LLM's answer. Any exception from the client propagates.
        """
        prompt = self._prompt_template.format(pages=raw_data)
        # `invoke` is used rather than calling the LLM object directly,
        # because LangChain has deprecated the direct-call form.
        return self._llm.invoke(prompt)

    def _parse_response(self, response: str) -> Dict[str, str]:
        """Parse the pipe-separated LLM answer into a column -> value dict.

        The answer is first split on pipes that are outside double quotes, so
        a quoted value may itself contain ``|``. If that does not yield the
        expected number of fields (for example because of an unbalanced
        quote), a plain split on every pipe is tried as a fallback.

        Args:
            response: Raw text returned by the LLM.

        Returns:
            A dict keyed by the eight column names with whitespace and double
            quotes removed from each value, or an empty dict (after printing
            a diagnostic) when the answer cannot be parsed.
        """
        if not isinstance(response, str):
            print(
                f"Error parsing LLM response: expected str, got "
                f"{type(response).__name__}. Response: {response}"
            )
            return {}  # Return empty dictionary on parsing failure

        text = response.strip()
        expected = len(self._COLUMNS)

        values = self._PIPE_OUTSIDE_QUOTES.split(text)
        if len(values) != expected:
            # Fall back to the simple split so answers with stray or
            # unbalanced quotes are still accepted when the count matches.
            values = text.split("|")
        if len(values) != expected:  # Ensure we have all expected values
            print(f"Warning: Unexpected number of values in response: {len(values)}. Response: {response}")
            return {}  # Return empty dictionary

        return {
            column: value.strip().replace('"', '')
            for column, value in zip(self._COLUMNS, values)
        }