import io
import os
import pandas as pd
import streamlit as st
from datetime import datetime
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents.base import Document
from langchain_text_splitters import TokenTextSplitter
from process import get_entity, get_entity_one, get_table, validate
from tempfile import NamedTemporaryFile
from stqdm import stqdm
from threading import Thread
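

# Thread subclass that captures the return value of its target callable,
# so each extraction task's result can be collected after join().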
class CustomThread(Thread):
    def __init__(self, func, chunk):
        super().__init__()
        self.func = func
        self.chunk = chunk
        self.result = ''

    def run(self):
        self.result = self.func(self.chunk)

buffer = io.BytesIO()
st.cache_data()
st.set_page_config(page_title="NutriGenMe Paper Extractor")
st.title("NutriGenMe - Paper Extraction")
st.markdown("""
In its latest version, the app is equipped to extract essential information from papers, including tables in both horizontal and vertical orientations, images, and text-only content.
""", unsafe_allow_html=True)
uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
col1, col2 = st.columns(2)
with col1:
    chunk_option = st.selectbox(
        'Token amounts per process:',
        (24000, 16000, 8000), key='token'
    )
    chunk_overlap = 0
with col2:
    model = st.selectbox(
        'Model selection: (UNDER DEVELOPMENT)',
        # context window sizes: 128000, 32768, 1048576
        ('gpt-4-turbo', 'llama-3-sonar-large-32k-chat', 'gemini-1.5-pro-latest'), key='model'
    )
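
# Extraction pipeline: runs once at least one PDF is uploaded and the "Get Result" button is pressed.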
if uploaded_files:
    journals = []
    parseButtonHV = st.button("Get Result", key='table_HV')

    if parseButtonHV:
        with st.status("Extraction in progress ...", expanded=True) as status:
            start_time = datetime.now()

            for uploaded_file in stqdm(uploaded_files):
                with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
                    pdf.write(uploaded_file.getbuffer())
                    loader = PyPDFLoader(pdf.name)
                    pages = loader.load()
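
                    # Default: keep the loaded pages with a very large chunk size; when a token
                    # amount is selected, merge all pages into one document and re-chunk it with
                    # a 25% overlap between consecutive chunks.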
                    chunk_size = 120000
                    chunk_overlap = 0
                    docs = pages

                    if chunk_option:
                        docs = [Document('\n'.join([page.page_content for page in pages]))]
                        docs[0].metadata = {'source': pages[0].metadata['source']}
                        chunk_size = chunk_option
                        chunk_overlap = int(0.25 * chunk_size)
                    text_splitter = TokenTextSplitter.from_tiktoken_encoder(
                        chunk_size=chunk_size, chunk_overlap=chunk_overlap
                    )
                    chunks = text_splitter.split_documents(docs)
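
                    # Run the five extraction tasks in parallel: get_entity in 'gsd', 'summ' and
                    # 'all' modes, get_entity_one on the first chunk, and get_table on the PDF file.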
                    threads = []
                    threads.append(CustomThread(get_entity, (chunks, 'gsd')))
                    threads.append(CustomThread(get_entity, (chunks, 'summ')))
                    threads.append(CustomThread(get_entity, (chunks, 'all')))
                    threads.append(CustomThread(get_entity_one, [c.page_content for c in chunks[:1]]))
                    threads.append(CustomThread(get_table, pdf.name))

                    for t in threads:
                        t.start()
                    for t in threads:
                        t.join()

                    result_gsd = threads[0].result
                    result_summ = threads[1].result
                    result = threads[2].result
                    result_one = threads[3].result
                    res_gene, res_snp, res_dis = threads[4].result
                    # Combine table-derived and text-derived entities into a single result dict
                    result['Genes'] = res_gene + result_gsd['Genes']
                    result['SNPs'] = res_snp + result_gsd['SNPs']
                    result['Diseases'] = res_dis + result_gsd['Diseases']
                    result['Conclusion'] = result_summ
                    for k in result_one.keys():
                        result[k] = result_one[k]

                    if len(result['Genes']) == 0:
                        result['Genes'] = ['']
                    num_rows = max(len(result['Genes']), len(result['SNPs']), len(result['Diseases']))
                    # Pad Genes, SNPs and Diseases so each has num_rows entries
                    for k in ['Genes', 'SNPs', 'Diseases']:
                        while len(result[k]) < num_rows:
                            result[k].append('')
                        # Temporary handling: truncate anything longer than num_rows
                        result[k] = result[k][:num_rows]

                    # Broadcast single-valued fields (title, authors, ...) to one entry per row
                    result = {key: value if isinstance(value, list) else [value] * num_rows for key, value in result.items()}
                    dataframe = pd.DataFrame(result)
                    dataframe = dataframe[[
                        'Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year',
                        'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion'
                    ]]
                    dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
                    dataframe.reset_index(drop=True, inplace=True)

                    cleaned_df, cleaned_llm_df = validate(dataframe)

                    end_time = datetime.now()
                    st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
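
        # Show the validated table and write both the validated and the original extraction
        # to an in-memory Excel workbook offered for download below.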
        st.dataframe(cleaned_df)
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            # cleaned_llm_df.to_excel(writer, sheet_name='Result with LLM')
            cleaned_df.to_excel(writer, sheet_name='Result')
            dataframe.to_excel(writer, sheet_name='Original')
        st.download_button(
            label="Save Result",
            data=buffer,
            file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}.xlsx",
            mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )