Spaces:

kyauy
/

ClinFly

Running

GERNET Enody

Multiple pdf file correction

6348126 unverified about 1 year ago

1.95 kB

	import json
	import streamlit as st
	from .web_utilities import st_cache_data_if, supported_cache
	from pdf2image import convert_from_bytes, convert_from_path
	import pytesseract


	@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
	def convert_df(df):
	return df.dropna(how="all").to_csv(sep="\t", index=False).encode("utf-8")


	@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
	def convert_df_no_header(df):
	return (
	df.dropna(how="all").to_csv(sep="\t", index=False, header=None).encode("utf-8")
	)


	@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
	def convert_json(df):
	dict_return = {"features": []}
	df_check = df.dropna(subset=["HPO ID", "Phenotype name"])
	if len(df_check) > 0:
	df_dict_list = df[["HPO ID", "Phenotype name"]].to_dict(orient="index")
	for key, value in df_dict_list.items():
	dict_return["features"].append(
	{
	"id": value["HPO ID"],
	"observed": "yes",
	"label": value["Phenotype name"],
	"type": "phenotype",
	}
	)
	return json.dumps(dict_return)
	else:
	return json.dumps(dict_return)


	@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
	def convert_list_phenogenius(df):
	df_check = df.dropna(subset=["HPO ID", "Phenotype name"])
	if len(df_check) > 0:
	return ",".join(df_check["HPO ID"].to_list())
	else:
	return "No HPO in letters."

	@st_cache_data_if(supported_cache, max_entries=10, ttl=3600)
	def convert_pdf_to_text(file):
	if isinstance(file, bytes):
	images = convert_from_bytes(file)
	else:
	images = convert_from_path(file)
	extraction = []
	for img in images:
	text = pytesseract.image_to_string(img)
	extraction.append(text)

	return " ".join(extraction)