import io
import os
import pandas as pd
import streamlit as st

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents.base import Document
from langchain_text_splitters import TokenTextSplitter
from stqdm import stqdm
from tempfile import NamedTemporaryFile

from utils import *
from process import Process
from validate import Validation

# In-memory buffer for the Excel workbook generated at the end
buffer = io.BytesIO()

st.set_page_config(page_title="NutriGenMe Paper Extractor")
st.title("NutriGenMe - Paper Extractor")

st.markdown(f"Start extraction process at {datetime.now().strftime('%H:%M')}", unsafe_allow_html=True)
# Load Documents: PyPDFLoader yields one Document per PDF page
loader = PyPDFLoader(pdf.name)
pages = loader.load()

# Full text of the paper; built outside the `if` below so the manual rsID
# search further down also works when chunking is disabled
passage = '\n'.join([page.page_content for page in pages])

chunk_size = 120000
chunk_overlap = 0
docs = pages

# Split Documents
if chunk_option:
    docs = [Document(passage)]
    docs[0].metadata = {'source': pages[0].metadata['source']}
    chunk_size = chunk_option
    chunk_overlap = int(0.25 * chunk_size)
text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
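# Token-based splitting; the 25% overlap above helps keep entity mentions
# that straddle a chunk boundary intact in at least one chunk.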
chunks = text_splitter.split_documents(docs)
# Run entity extraction (text) and table extraction in parallel. Both
# futures are submitted before either .result() call; calling .result()
# directly on each submit() would serialise the two tasks.
process = Process(model)
with ThreadPoolExecutor() as executor:
    future_text = executor.submit(process.get_entity, (chunks, 'alls'))
    future_table = executor.submit(process.get_table, pdf.name)
    result_text = future_text.result()
    result_table = future_table.result()
# Manually search the full passage for rsIDs the model may have missed
result_text = process.get_rsid(result_text, passage)
# Combine two results
result_text['Source'] = 'Text'
result_table['Source'] = 'Table'
dataframe = pd.concat([result_table, result_text], ignore_index=True)
# Validate Result
st.markdown(f"Start Validation process at {datetime.now().strftime('%H:%M')}
", unsafe_allow_html=True)
validation = Validation(model_val)
df, df_clean = validation.validate(dataframe, passage, api)
df.drop_duplicates(subset=['Genes', 'rsID'], ignore_index=True, inplace=True)
# Integrate with Database
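# `integrate` comes from the star import of utils above; it is assumed to
# align the validated rows with the NutriGenMe database and append the
# matching database entries.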
df_final = integrate(df)
st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
st.divider()
st.write(f"Extracted **{len(df)}** rows with database alignment of **{len(df_final) - len(df)}** rows")
st.dataframe(df_final)
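
# Export all three stages (validated + database, cleaned, original) to one
# in-memory Excel workbook; the `with` block saves and closes the writer on
# exit, so no explicit close is needed.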
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
    df_final.to_excel(writer, sheet_name='Validated + Database')
    df_clean.to_excel(writer, sheet_name='Cleaned')
    dataframe.to_excel(writer, sheet_name='Original')
st.download_button(
    label="Save Result",
    data=buffer,
    file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
    mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)