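# NOTE: the four lines below look like repository-browser metadata that was
# captured together with the file; they are kept verbatim as comments so the
# module can be parsed.
# PDFExtractor / pages /multiple_pdf_extractor.py
# Vela
# enhanced graph
# 75115cd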
import streamlit as st
import os
from application.schemas.response_schema import (
GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
GEMINI_ENVIRONMENT_PARAMETERS, GEMINI_SOCIAL_PARAMETERS,
GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
GEMINI_NET_ZERO_INTERVENTION_PARAMETERS
)
from application.services import gemini_api_service, streamlit_function
from application.utils import logger
# Rebind the imported `logger` module name to the logger object returned by
# its get_logger(); after this line `logger` no longer refers to the module.
logger = logger.get_logger()

# Page setup is delegated to the project helper and runs before any other
# Streamlit call in this script.
# NOTE(review): presumably wraps st.set_page_config — confirm before reordering.
streamlit_function.config_homepage()
st.title("Sustainability Report Analyzer")
st.write("Upload your sustainability report PDF and generate insights using Gemini models.")

# Model identifiers offered in the "Select Gemini Model" selector below.
AVAILABLE_MODELS = [
    "gemini-1.5-pro-latest",
    "gemini-2.0-flash",
    "gemini-1.5-flash",
    "gemini-2.5-pro-exp-03-25"
]

# Maps a human-readable label to the response schema requested for it.
# The label is also passed as the `column` argument of the Excel export.
# Only the GHG schema is currently active; the other entries are disabled.
RESPONSE_SCHEMAS = {
    "Greenhouse Gas (GHG) Protocol Parameters": GEMINI_GHG_PARAMETERS,
    # "Environmental Parameters (CSRD)": GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
    # "Environmental Parameters": GEMINI_ENVIRONMENT_PARAMETERS,
    # "Social Parameters": GEMINI_SOCIAL_PARAMETERS,
    # "Governance Parameters": GEMINI_GOVERNANCE_PARAMETERS,
    # "Materiality Parameters": GEMINI_MATERIALITY_PARAMETERS,
    # "Net Zero Intervention Parameters": GEMINI_NET_ZERO_INTERVENTION_PARAMETERS,
}
# --- Model selection and upload ---------------------------------------------
selected_model = st.selectbox("Select Gemini Model", options=AVAILABLE_MODELS)
uploaded_files = streamlit_function.upload_file(
    "pdf", label="📤 Upload Sustainability Report PDF"
)

# Keep the uploads in session state so they are still available on reruns
# where the upload helper returns nothing.
if "uploaded_files" not in st.session_state:
    st.session_state.uploaded_files = []
if uploaded_files:
    st.session_state.uploaded_files = uploaded_files

# --- Per-file extraction, result display and download -----------------------
if st.session_state.uploaded_files:
    # Files are laid out round-robin across three columns.
    columns = st.columns(3)

    for i, pdf_file in enumerate(st.session_state.uploaded_files):
        with columns[i % 3]:
            # Base name (without ".pdf") is reused as the Excel file name.
            file_name = pdf_file.name.removesuffix(".pdf")
            st.write(f"📄 **File {i+1}:** `{pdf_file.name}`")
            extract_btn = st.button(
                f"Extract Data from File {i+1}", key=f"extract_{i}"
            )

            # Results are cached per (model, file position).
            # NOTE(review): the key uses the position, not the file name, so
            # results from a previous upload in the same slot can reappear —
            # confirm whether other pages read this key before changing it.
            result_key = f"{selected_model}_result_file_{i+1}"

            if extract_btn:
                with st.spinner(
                    f"Extracting data from `{pdf_file.name}` using `{selected_model}`..."
                ):
                    try:
                        all_results = {}
                        # One extraction call and one Excel export per schema.
                        for label, schema in RESPONSE_SCHEMAS.items():
                            result = gemini_api_service.extract_emissions_data_as_json(
                                "gemini", selected_model, pdf_file, schema
                            )
                            streamlit_function.export_results_to_excel(
                                result,
                                sheet_name=selected_model,
                                filename=file_name,
                                column=label,
                            )
                            all_results[label] = result
                        # Only stored when every schema succeeded.
                        st.session_state[result_key] = all_results
                        st.success("Data extraction complete.")
                    except Exception as e:
                        # Top-level UI boundary: log the cause and show a
                        # generic message instead of crashing the page.
                        logger.error(f"Extraction failed: {e}")
                        st.error("Failed to extract data.")

            if st.session_state.get(result_key):
                st.write(f"🧾 **Extracted Metrics for File {i+1}:**")
                st.json(st.session_state[result_key])

                # NOTE(review): the download is offered only once results exist
                # for this file/model — confirm this nesting is intended.
                file_path = f"data/{file_name}.xlsx"
                if os.path.exists(file_path):
                    with open(file_path, "rb") as file:
                        st.download_button(
                            label="Download Excel File",
                            data=file,
                            file_name=f"{file_name}.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                            # Explicit key: two uploads with the same name would
                            # otherwise yield identical widgets and a
                            # duplicate-widget error.
                            key=f"download_{i}",
                        )
# --- Earlier version of this page, kept commented out for reference only (inactive) ---
# import streamlit as st
# from application.schemas.response_schema import GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS, GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS, GEMINI_NET_ZERO_INTERVENTION_PARAMETERS
# from application.services import streamlit_function, gemini_model
# from application.utils import logger
# import test
# logger = logger.get_logger()
# streamlit_function.config_homepage()
# st.title("Sustainability Report Analyzer")
# st.write("Upload your sustainability report PDF and generate insights using different models.")
# MODEL = ["gemini-1.5-pro-latest", "gemini-2.0-flash", "gemini-1.5-flash", "gemini-2.5-pro-exp-03-25"]
# MODEL_1 = "gemini-1.5-pro-latest"
# MODEL_2 = "gemini-2.0-flash"
# MODEL_3 = "gemini-1.5-flash"
# API_1 = "gemini"
# API_2 = "gemini"
# API_3 = "gemini"
# response_schema = [ GEMINI_GHG_PARAMETERS, GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD,
# GEMINI_ENVIRONMENT_PARAMETERS,GEMINI_SOCIAL_PARAMETERS,
# GEMINI_GOVERNANCE_PARAMETERS, GEMINI_MATERIALITY_PARAMETERS,
# GEMINI_NET_ZERO_INTERVENTION_PARAMETERS]
# if "uploaded_files" not in st.session_state:
# st.session_state.uploaded_files = []
# MODEL = st.selectbox(
# "Select Model",
# options=MODEL,
# index=0,
# )
# uploaded_files = streamlit_function.upload_file("pdf", label="Upload Sustainability Report PDF")
# if uploaded_files:
# st.session_state.uploaded_files = uploaded_files
# if st.session_state.uploaded_files:
# columns = st.columns([5, 5, 5], gap="small")
# for i, col in enumerate(columns):
# if i < len(st.session_state.uploaded_files):
# pdf_file = st.session_state.uploaded_files[i]
# file_name = pdf_file.name.removesuffix(".pdf")
# result_key = f"{MODEL}_result_file_{i+1}"
# with col:
# st.write(f"**File {i+1}:** `{pdf_file.name}`")
# if st.button(f"Extract Data from File {i+1}", key=f"extract_btn_{i}"):
# with st.spinner(f"Extracting data from File {i+1} using {MODEL}..."):
# for schema in response_schema:
# result = gemini_model.extract_emissions_data_as_json(API_1, MODEL, pdf_file, schema)
# if schema == GEMINI_GHG_PARAMETERS:
# column = "Greenhouse Gas (GHG) Protocol Parameters"
# elif schema == GEMINI_ENVIRONMENTAL_PARAMETERS_CSRD:
# column = "Environmental Parameters (CSRD)"
# elif schema == GEMINI_ENVIRONMENT_PARAMETERS:
# column = "Environmental Parameters"
# elif schema == GEMINI_SOCIAL_PARAMETERS:
# column = "Social Parameters"
# elif schema == GEMINI_GOVERNANCE_PARAMETERS:
# column = "Governance Parameters"
# elif schema == GEMINI_MATERIALITY_PARAMETERS:
# column = "Materiality Parameters"
# elif schema == GEMINI_NET_ZERO_INTERVENTION_PARAMETERS:
# column = "Net Zero Intervention Parameters"
# else:
# column = None
# test.export_results_to_excel(result, sheet_name=MODEL, filename=file_name, column=column )
# st.session_state[result_key] = result
# if st.session_state.get(result_key):
# st.write(f"**Extracted Metrics for File {i+1}:**")
# st.json(st.session_state[result_key])