# Spaces: Sleeping  (page-status banner picked up when this file was captured; not part of the module)
import os
import json
import re
from typing import Optional, Dict, Union, BinaryIO

import requests
from google import genai
from google.genai import types
from langchain_core.tools import tool

from application.utils.logger import get_logger
from application.services.gemini_api_service import upload_file
from application.services.mongo_db_service import store_document
from application.schemas.response_schema import GEMINI_GHG_PARAMETERS
# Module-wide logger obtained from the project's logging helper.
logger = get_logger()

# Shared Gemini client, created once at import time. The API key is read from
# the GEMINI_API_KEY environment variable (os.getenv yields None when unset).
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Model identifier passed to client.models.generate_content.
MODEL = "gemini-2.0-flash"
# Instruction text sent to the model together with the uploaded PDF. The
# output structure itself is constrained separately through the
# `response_schema` supplied in the request config; this text only describes
# how to locate and select the values.
PROMPT = (
"""You are a PDF parsing agent specialized in extracting structured sustainability data from a company's Sustainability, ESG, or Corporate Responsibility Report in PDF format.
Your task is to extract Greenhouse Gas (GHG) Protocol, Environmental (CSRD), Materiality, Net Zero Interventions, and ESG (Environmental, Social, Governance) Data with high accuracy and consistency for downstream processing.
### Instructions:
1. **Schema Adherence**: Strictly follow the provided schema for output structure. Ensure every field in the schema is populated with either extracted data or a placeholder.
2. **Data Sources**: Extract data from all relevant sections of the PDF, including:
- Narrative text
- Tables
- Infographics, charts, or visual elements (interpret labels, captions, or legends to extract numerical or textual data)
- Footnotes or appendices
3. **Infographic Handling**: For infographics, prioritize:
- Text labels or annotations within the graphic
- Captions or descriptions near the infographic
- Legends or keys that clarify values
- If values are ambiguous, cross-reference with narrative text or tables discussing similar metrics.
4. **Year and Scope**: Identify the reporting year and scope (e.g., global, regional) for each metric. If not explicitly stated, infer from the report's context (e.g., '2023 Sustainability Report' implies 2023 data).
5. **Edge Cases**:
- If data is missing, use placeholders as specified in the schema.
- If multiple values exist for a field (e.g., emissions for different years), select the most recent year unless otherwise specified in the schema.
### Output Requirements:
- Return a JSON object adhering to the schema.
- Ensure all fields are populated, using placeholders for missing data.
- Include a 'notes' field in the output for any assumptions, estimations, or conflicts encountered during extraction.
### Task:
- Parse the PDF thoroughly to extract all relevant data.
- Ensure consistency in units, years, and terminology across the output.
- Handle infographics with care, prioritizing textual data and flagging estimates.
- Provide a complete, schema-compliant JSON output with notes for any ambiguities or assumptions.
"""
)
def extract_emission_data_as_json(file_input: Union[BinaryIO, bytes, str]) -> Optional[Dict]:
    """
    Extract emission-related ESG data from a PDF using the Gemini API.

    The input is handed to ``upload_file``; the returned file handle is sent to
    the model together with ``PROMPT`` and a JSON response schema
    (``GEMINI_GHG_PARAMETERS``). When the reply parses as JSON, its
    "Greenhouse Gas (GHG) Protocol Parameters" section is passed to
    ``store_document`` under the reply's "Company Name" (or "Unknown Company"
    when that key is absent) before the parsed reply is returned.

    Args:
        file_input (Union[BinaryIO, bytes, str]):
            The input file to process. Can be a file object, byte stream, or
            local file path; it is forwarded unchanged to ``upload_file``.

    Returns:
        Optional[Dict]:
            - The parsed JSON reply when parsing and storing both succeed.
            - ``{"raw_response": <text>}`` when the reply is not valid JSON.
            - ``None`` when any other step fails (upload, model call, reading
              the reply, or storing the document).

    Notes:
        - This function does not raise: every exception is logged with its
          traceback and converted into a ``None`` return value.
        - Token usage (input, output, total) is logged when the response
          carries usage metadata.
    """
    try:
        uploaded_file = upload_file(file=file_input)
        response = client.models.generate_content(
            model=MODEL,
            contents=[uploaded_file, PROMPT],
            config={
                'response_mime_type': 'application/json',
                'response_schema': GEMINI_GHG_PARAMETERS,
                'temperature': 0.0,
            },
        )

        # The attribute can exist and still be None; checking with hasattr()
        # alone would let the logging below raise and discard a good response.
        usage = getattr(response, 'usage_metadata', None)
        if usage is not None:
            logger.info(f"Input tokens: {usage.prompt_token_count}")
            logger.info(f"Output tokens: {usage.candidates_token_count}")
            logger.info(f"Total tokens: {usage.total_token_count}")
        else:
            logger.info("Token usage metadata not available in response")
        logger.info("[Gemini] Response received.")

        # Only the parse itself is guarded by the JSONDecodeError handler, so a
        # decode error raised while storing is not mistaken for bad model output.
        try:
            result = json.loads(response.text)
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON, returning raw response.")
            return {"raw_response": response.text}

        file_name = result.get('Company Name', 'Unknown Company')
        document = {
            "Greenhouse Gas (GHG) Protocol Parameters": result.get(
                'Greenhouse Gas (GHG) Protocol Parameters'
            )
        }
        # A storage failure propagates to the outer handler and yields None,
        # matching the previous behaviour.
        store_document(file_name, document)
        return result
    except Exception:
        logger.exception("Error during ESG data extraction.")
        return None