Ronan
feat: add new filters
dd6a24d
raw
history blame
No virus
2.94 kB
import base64
import logging
from pathlib import Path
from typing import Any
import pandas as pd
import streamlit as st
from pypdf import PdfReader
def get_pdf_iframe(pdf_to_process: str) -> str:
base64_pdf = base64.b64encode(Path(pdf_to_process).read_bytes()).decode("utf-8")
pdf_display = f"""
<iframe src="data:application/pdf;base64,{base64_pdf}
" width="100%" height="1000px" type="application/pdf"></iframe>
"""
return pdf_display
def set_algorithm_name(my_key: str) -> None:
st.session_state["algorithm_name"] = st.session_state[my_key]
@st.cache_data
def to_csv_file(df: pd.DataFrame) -> bytes:
# Populate the columns with the metadata, if available
# They may not be available if the user skipped the metadata page
# by not clicking on Submit
if "metadata" in st.session_state:
df = df.assign(company=st.session_state["metadata"]["company_name"])
df = df.assign(sector=st.session_state["metadata"]["sector"])
df = df.assign(year=st.session_state["metadata"]["year"])
df = df.assign(currency=st.session_state["metadata"]["currency"])
df = df.assign(unit=st.session_state["metadata"]["unit"])
df = df.assign(headquarter=st.session_state["metadata"]["headquarter"])
else:
df = df.assign(company="")
df = df.assign(sector="")
df = df.assign(year="")
df = df.assign(currency="")
df = df.assign(unit="")
df = df.assign(headquarter="")
return df.to_csv(index=False).encode("utf-8")
def set_state(key: Any, value: Any) -> None:
"""
Sets the session_state[key] to value.
key can be a list to reach nested values.
Ex: ["key1", "key2"] to reach session_state["key1"]["key2"] value.
"""
if isinstance(key, list):
key_list = key
nested_key_string = "session_state"
nested_value = st.session_state
for k in key_list[:-1]:
try:
nested_key_string += f"['{k}']"
nested_value = nested_value[k]
except KeyError as e:
raise KeyError(f"{nested_key_string} does not exist") from e
nested_value[key_list[-1]] = value
else:
st.session_state[key] = value
def generate_assets() -> None:
assets = {
"pagefilter": {},
"table_extractors": [],
}
# Filtering the pages
st.session_state["proc"].page_filter(
st.session_state["working_file_pdf"].name,
assets,
)
logging.info(f"Assets : {assets}")
if len(assets["pagefilter"]["selected_pages"]) == 0:
# No page has been automatically selected by the page filter
# Hence, we display the full pdf, letting the user select the pages
number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
assets["pagefilter"]["selected_pages"] = list(range(number_pages))
st.session_state["assets"] = assets