# ClinFly / utilities / convert.py
# GERNET Enody
# Multiple pdf file correction
# 6348126 unverified
import json
import streamlit as st
from .web_utilities import st_cache_data_if, supported_cache
from pdf2image import convert_from_bytes, convert_from_path
import pytesseract
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df(df):
    """Serialize *df* as UTF-8 encoded, tab-separated text with a header row.

    Rows in which every cell is missing are dropped before export, and the
    DataFrame index is not written.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        The tab-separated content as ``bytes``.
    """
    non_empty_rows = df.dropna(how="all")
    tsv_text = non_empty_rows.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_df_no_header(df):
    """Serialize *df* as UTF-8 encoded, tab-separated text without a header.

    Rows in which every cell is missing are dropped before export, and the
    DataFrame index is not written.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        The tab-separated content as ``bytes``, with no column-name row.
    """
    non_empty_rows = df.dropna(how="all")
    # ``header`` is documented as a bool (or a list of aliases); pass an
    # explicit False rather than relying on None being treated as falsy.
    tsv_text = non_empty_rows.to_csv(sep="\t", index=False, header=False)
    return tsv_text.encode("utf-8")
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_json(df):
    """Serialize the phenotype rows of *df* as a JSON ``features`` document.

    Only rows that have both an ``"HPO ID"`` and a ``"Phenotype name"`` are
    exported. Each such row becomes one entry of the form
    ``{"id": ..., "observed": "yes", "label": ..., "type": "phenotype"}``.

    Args:
        df: A pandas DataFrame containing the columns ``"HPO ID"`` and
            ``"Phenotype name"``.

    Returns:
        A JSON string ``{"features": [...]}``; the list is empty when no row
        has both values.

    Raises:
        KeyError: If either required column is missing from *df*.
    """
    # Build the output from the filtered rows, not from the raw frame:
    # a row with a missing ID or label would otherwise be serialized with
    # a NaN value, which json.dumps writes as the non-standard token `NaN`.
    complete_rows = df.dropna(subset=["HPO ID", "Phenotype name"])

    # orient="records" does not require a unique index, unlike
    # orient="index", so frames with repeated index labels still convert.
    records = complete_rows[["HPO ID", "Phenotype name"]].to_dict(
        orient="records"
    )
    features = [
        {
            "id": record["HPO ID"],
            "observed": "yes",
            "label": record["Phenotype name"],
            "type": "phenotype",
        }
        for record in records
    ]
    return json.dumps({"features": features})
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_list_phenogenius(df):
    """Return the HPO IDs of *df* as one comma-separated string.

    Only rows that have both an ``"HPO ID"`` and a ``"Phenotype name"`` are
    considered.

    Args:
        df: A pandas DataFrame containing the columns ``"HPO ID"`` and
            ``"Phenotype name"``.

    Returns:
        The HPO IDs joined with ``","``, or the message
        ``"No HPO in letters."`` when no row has both values.
    """
    valid_rows = df.dropna(subset=["HPO ID", "Phenotype name"])

    # Guard clause: nothing usable was found.
    if len(valid_rows) == 0:
        return "No HPO in letters."

    hpo_ids = valid_rows["HPO ID"].to_list()
    return ",".join(hpo_ids)
@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
def convert_pdf_to_text(file):
    """Extract the text of a PDF by rendering its pages and running OCR.

    Args:
        file: Either the raw PDF content as ``bytes`` or a value accepted by
            ``pdf2image.convert_from_path`` (any non-``bytes`` input is
            forwarded to it).

    Returns:
        The text recognized on each rendered image, in order, joined with a
        single space.
    """
    # Choose the pdf2image loader matching the input kind.
    render_pages = (
        convert_from_bytes if isinstance(file, bytes) else convert_from_path
    )
    page_images = render_pages(file)

    return " ".join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )