# NutriGenMe - Paper Extractor: Streamlit entry script.
# Loads an uploaded PDF, extracts gene/SNP/disease entities via LLM calls,
# validates the result, and offers an Excel download.

# --- stdlib ---
import io
import os
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from tempfile import NamedTemporaryFile

# --- third-party ---
import pandas as pd
import streamlit as st
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents.base import Document
from langchain_text_splitters import TokenTextSplitter
from stqdm import stqdm

# --- local ---
from process import Process
from validate import Validation

# In-memory buffer that later receives the generated Excel workbook.
buffer = io.BytesIO()

# NOTE(review): a bare `st.cache_data()` call is a no-op — it is a decorator
# factory and the returned decorator is discarded. Confirm intent or remove.
st.cache_data()

st.set_page_config(page_title="NutriGenMe Paper Extractor")
st.title("NutriGenMe - Paper Extractor")

# Show the current time. Reconstructed: the original paste dropped the `f`
# prefix and split the string literal across lines — TODO confirm the exact
# surrounding HTML/markup against version control.
st.markdown(f"{datetime.now().strftime('%H:%M')}", unsafe_allow_html=True)
# Load Documents
# `pdf` is a temporary file created earlier from the uploaded PDF (defined
# outside this section — presumably a NamedTemporaryFile; verify in caller).
pages = PyPDFLoader(pdf.name).load()

# Defaults used when no chunking option is selected: treat the whole paper
# as a single oversized chunk with no overlap.
chunk_size = 120000
chunk_overlap = 0
docs = pages
# Split Documents
# (Indentation of the `if` body was lost in the paste and is restored here.)
if chunk_option:
    # Merge all pages into one document so the token splitter can produce
    # evenly sized chunks that cross page boundaries.
    docs = [Document('\n'.join(page.page_content for page in pages))]
    docs[0].metadata = {'source': pages[0].metadata['source']}
    chunk_size = chunk_option
    # 25% overlap between consecutive chunks to preserve context.
    chunk_overlap = int(0.25 * chunk_size)

text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
chunks = text_splitter.split_documents(docs)
# Start extraction process in parallel
# (Indentation of the `with` body was lost in the paste and is restored here.)
process = Process(model)
with ThreadPoolExecutor() as executor:
    # NOTE(review): each task receives a single tuple argument
    # (chunks, entity_type) — confirm Process.get_entity expects a tuple
    # rather than two positional arguments.
    future_gsd = executor.submit(process.get_entity, (chunks, 'gsd'))
    future_summ = executor.submit(process.get_entity, (chunks, 'summ'))
    future_all = executor.submit(process.get_entity, (chunks, 'all'))
    # Only the first chunk is used for the one-shot metadata extraction.
    future_one = executor.submit(process.get_entity_one, [c.page_content for c in chunks[:1]])
    future_table = executor.submit(process.get_table, pdf.name)

    # Block until every task finishes; .result() re-raises worker exceptions.
    result_gsd = future_gsd.result()
    result_summ = future_summ.result()
    result = future_all.result()
    result_one = future_one.result()
    res_gene, res_snp, res_dis = future_table.result()
# Combine Result: table-derived entities come first, then the chunk-derived
# ones from the 'gsd' extraction pass.
result['Genes'] = res_gene + result_gsd['Genes']
result['SNPs'] = res_snp + result_gsd['SNPs']
result['Diseases'] = res_dis + result_gsd['Diseases']
result['Conclusion'] = result_summ
# Merge the single-chunk fields (title, authors, ...) into the main result.
result.update(result_one)

# Guarantee at least one row even when no genes were extracted.
if not result['Genes']:
    result['Genes'] = ['']

# Adjust Genes, SNPs, Diseases: pad the three entity lists with '' (and
# truncate — temporary handling, as in the original) to a common row count.
num_rows = max(len(result['Genes']), len(result['SNPs']), len(result['Diseases']))
for k in ('Genes', 'SNPs', 'Diseases'):
    result[k] = result[k][:num_rows] + [''] * max(0, num_rows - len(result[k]))
# Arrange Column: broadcast scalar fields (title, authors, ...) across all
# rows so every column has num_rows entries, then fix the column order.
result = {
    key: value if isinstance(value, list) else [value] * num_rows
    for key, value in result.items()
}
dataframe = pd.DataFrame(result)
dataframe = dataframe[[
    'Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name',
    'Publication Year', 'Population', 'Sample Size', 'Study Methodology',
    'Study Level', 'Conclusion',
]]
# Drop rows whose Genes cell is empty. The original's second
# `reset_index(drop=True, inplace=True)` was redundant (the index is already
# reset on the previous line) and has been removed.
dataframe = dataframe[dataframe['Genes'].astype(bool)].reset_index(drop=True)
# Validate Result
# Reconstructed f-string: the original paste dropped the `f` prefix and split
# the literal across lines — TODO confirm any surrounding HTML markup.
st.markdown(
    f"Start Validation process at {datetime.now().strftime('%H:%M')}",
    unsafe_allow_html=True,
)
validation = Validation(model_val)
# Returns: LLM-cleaned frame, non-LLM-cleaned frame, and base-cleaned frame.
df, df_no_llm, df_clean = validation.validate(dataframe, api)
df.drop_duplicates(['Genes', 'SNPs'], inplace=True)
# `start_time` is set before this section (outside the visible chunk).
st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
st.dataframe(df)
# Write all result variants into one Excel workbook held in `buffer`.
# NOTE(review): the paste lost the original indentation; the 'Result Cleaned'
# and 'Original' sheets are assumed unconditional — confirm against history.
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
    if api:
        df.to_excel(writer, sheet_name='Result Cleaned API LLM')
        df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
    else:
        df.to_excel(writer, sheet_name='Result Cleaned LLM')
    df_clean.to_excel(writer, sheet_name='Result Cleaned')
    dataframe.to_excel(writer, sheet_name='Original')
    # The explicit writer.close() was removed: the context manager already
    # closes the writer on exit, and closing twice is redundant.
# Offer the finished workbook for download; the filename encodes the source
# PDF name, the chunk size, and the (truncated) model identifiers.
st.download_button(
    label="Save Result",
    data=buffer,
    # NOTE(review): uses `uploaded_file.name` while earlier code uses
    # `pdf.name` — presumably `uploaded_file` is the Streamlit upload widget
    # value defined outside this section; confirm both refer to the same PDF.
    file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
    mime='application/vnd.ms-excel'
)