fadliaulawi committed
Commit ff62661
1 Parent(s): 8fe9391

Implement multiple files and zip

Files changed (2)
  1. app.py +102 -82
  2. utils.py +5 -3
app.py CHANGED
@@ -1,6 +1,7 @@
import io
import pandas as pd
import streamlit as st
+ import zipfile

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
@@ -63,85 +64,104 @@ if api:

st.divider()
st.markdown("<h4>Process</h4>", unsafe_allow_html=True)
- uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
-
- if uploaded_files:
- submit = st.button("Get Result", key='submit')
-
- if uploaded_files and submit:
-
- with st.status("Extraction in progress ...", expanded=True) as status:
- for uploaded_file in stqdm(uploaded_files):
- start_time = datetime.now()
- with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
-
- pdf.write(uploaded_file.getbuffer())
- st.markdown(f"Start Extraction process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
-
- # Load Documents
- loader = PyPDFLoader(pdf.name)
- pages = loader.load()
-
- chunk_size = 120000
- chunk_overlap = 0
- docs = pages
-
- # Split Documents
- if chunk_option:
- passage = '\n'.join([page.page_content for page in pages])
- docs = [Document(passage)]
- docs[0].metadata = {'source': pages[0].metadata['source']}
-
- chunk_size = chunk_option
- chunk_overlap = int(0.25 * chunk_size)
-
- text_splitter = TokenTextSplitter.from_tiktoken_encoder(
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
- )
- chunks = text_splitter.split_documents(docs)
-
- # Start extraction process in parallel
- process = Process(model)
- with ThreadPoolExecutor() as executor:
- result_text = executor.submit(process.get_entity, (chunks, 'alls')).result()
- result_table = executor.submit(process.get_table, pdf.name).result()
- result_rsid = executor.submit(process.get_rsid, passage).result()
-
- # Manually search for rsID
- result_text = pd.concat([result_text, result_rsid]).fillna('').reset_index(drop=True)
-
- # Combine two results
- result_text['Source'] = 'Text'
- result_table['Source'] = 'Table'
- dataframe = pd.concat([result_table, result_text], ignore_index=True)
- dataframe.reset_index(drop=True, inplace=True)
-
- # Validate Result
- st.markdown(f"Start Validation process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
- validation = Validation(model_val)
- df, df_clean = validation.validate(dataframe, passage, api)
- df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
-
- # Integrate with Database
- df_final = integrate(df)
-
- st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
- st.divider()
- st.write(f"Extracted **{len(df)}** rows with database alignment of **{len(df_final) - len(df)}** rows")
- st.dataframe(df_final)
-
- with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
- df_final.to_excel(writer, sheet_name='Validated + Database')
- df_clean.to_excel(writer, sheet_name='Cleaned')
- dataframe.to_excel(writer, sheet_name='Original')
- writer.close()
-
- st.markdown(
- create_download_link(
- "application/vnd.ms-excel",
- buffer.getvalue(),
- f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
- "Save Result"
- ),
- unsafe_allow_html=True
- )
+
+ # Set chunks
+ chunk_size = chunk_option
+ chunk_overlap = int(0.25 * chunk_size)
+
+ # Uploading form
+ form = st.form(key="files")
+ uploaded_files = form.file_uploader(label='Upload Paper(s) here', accept_multiple_files=True)
+ submit = form.form_submit_button("Get Result")
+ if not uploaded_files or not submit:
+ exit()
+
+ # Loop through uploaded files
+ buffers = []
+ for pdf in stqdm(uploaded_files):
+ file_name = pdf.name
+ with st.expander(f"{file_name}", expanded=True):
+ start_time = datetime.now()
+ st.markdown(f"Start Extraction process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
+
+ with NamedTemporaryFile(dir='.', suffix=".pdf") as file:
+ file.write(pdf.getbuffer())
+
+ # Load Documents
+ loader = PyPDFLoader(file.name)
+ pages = loader.load()
+ passage = '\n'.join([page.page_content for page in pages])
+
+ # Split text into chunks
+ docs = [Document(passage)]
+ text_splitter = TokenTextSplitter.from_tiktoken_encoder(
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap
+ )
+ chunks = text_splitter.split_documents(docs)
+
+ # Start extraction process in parallel
+ process = Process(model)
+ with ThreadPoolExecutor() as executor:
+ result_text = executor.submit(process.get_entity, (chunks, 'alls')).result()
+ result_table = executor.submit(process.get_table, file.name).result()
+ result_rsid = executor.submit(process.get_rsid, passage).result()
+
+ # Manually search for rsID
+ result_text = pd.concat([result_text, result_rsid]).fillna('').reset_index(drop=True)
+
+ # Combine two results
+ result_text['Source'] = 'Text'
+ result_table['Source'] = 'Table'
+ dataframe = pd.concat([result_table, result_text], ignore_index=True)
+ dataframe.reset_index(drop=True, inplace=True)
+
+ # Validate Result
+ st.markdown(f"Start Validation process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
+ validation = Validation(model_val)
+ df, df_clean = validation.validate(dataframe, passage, api)
+ df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
+
+ # Integrate with Database
+ df_final = integrate(df)
+
+ st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+ st.divider()
+ st.write(f"Extracted **{len(df)}** rows with database alignment of **{len(df_final) - len(df)}** rows")
+ st.dataframe(df_final)
+
+ # Save to Excel
+ output_name = f"{file_name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx"
+ with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+ df_final.to_excel(writer, sheet_name='Validated + Database')
+ df_clean.to_excel(writer, sheet_name='Cleaned')
+ dataframe.to_excel(writer, sheet_name='Original')
+ writer.close()
+
+ st.markdown(
+ create_download_link(
+ "application/vnd.ms-excel",
+ buffer.getvalue(),
+ output_name,
+ "Save Result"
+ ),
+ unsafe_allow_html=True
+ )
+
+ buffers.append((buffer, output_name))
+
+ # Zip all results
+ zip_buffer = io.BytesIO()
+ for buffer, output_name in buffers:
+ with zipfile.ZipFile(zip_buffer, 'a') as zip_file:
+ zip_file.writestr(output_name, buffer.getvalue())
+
+ # Download all results
+ st.markdown(
+ create_download_link(
+ "application/octet-stream",
+ zip_buffer.getvalue(),
+ "extracted-results.zip",
+ "Download All Results"
+ ),
+ unsafe_allow_html=True
+ )
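
Note on the zip step: each paper's Excel workbook is kept in an in-memory buffer and the buffers are then bundled into one archive for the combined download. Below is a minimal standalone sketch of that bundling, assuming `buffers` is a list of `(io.BytesIO, filename)` pairs as built in the loop above; the Streamlit and extraction pieces are omitted, and the archive is opened once in `'w'` mode instead of being reopened per entry as in the committed loop.

import io
import zipfile

def bundle_results(buffers):
    """Pack (io.BytesIO, filename) pairs into a single in-memory zip archive."""
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for buffer, output_name in buffers:
            # Each per-paper workbook becomes one member of the archive.
            zip_file.writestr(output_name, buffer.getvalue())
    return zip_buffer.getvalue()

The resulting bytes can then be passed to the download helper the same way the commit passes `zip_buffer.getvalue()` to `create_download_link`.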
utils.py CHANGED
@@ -63,15 +63,17 @@ def generate_raw_files():

# Load Raw GWAS files
if os.path.exists(gwas_path):
- gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+ gwas = pd.read_csv(gwas_path, delimiter='\t', dtype=str)
else:
data = requests.get(raw_url).content.decode('utf-8')
- gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+ gwas = pd.read_csv(StringIO(data), delimiter='\t', dtype=str)
+
+ gwas = gwas[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']].copy()

# Load Genes and SNPs from GWAS
gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
- gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
+ gwas_gene_rsid.loc[:, 'MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())

# Generate Genes and SNPs mapping
ground_truth = defaultdict(list)
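
Note on the utils.py change: reading the catalog with `dtype=str` keeps every column as plain strings (the GWAS file mixes numeric and text values within columns), and the explicit `.copy()` together with assignment through `.loc` sidesteps pandas' chained-assignment (SettingWithCopy) warning when modifying a sliced frame. A small self-contained illustration of the same pattern, using a made-up three-row frame in place of the real GWAS catalog:

import pandas as pd

# Hypothetical stand-in for the GWAS catalog slice; the real file is tab-separated.
gwas = pd.DataFrame({'MAPPED_GENE': ['fto, irx3', None, 'tcf7l2'],
                     'SNPS': ['rs9939609', 'rs1421085', 'rs7903146']})

gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']].copy()
gwas_gene_rsid.dropna(inplace=True, ignore_index=True)

# Assign through .loc on the full axis instead of chaining indexers on a view.
gwas_gene_rsid.loc[:, 'MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(
    lambda x: x.replace(' ', '').upper()
)
print(gwas_gene_rsid)  # MAPPED_GENE rows become 'FTO,IRX3' and 'TCF7L2'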