fadliaulawi committed
Commit ff62661
1 Parent(s): 8fe9391

Implement multiple files and zip

Files changed (2)
  1. app.py +102 -82
  2. utils.py +5 -3
app.py CHANGED
@@ -1,6 +1,7 @@
import io
import pandas as pd
import streamlit as st
+ import zipfile

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
@@ -63,85 +64,104 @@ if api:

st.divider()
st.markdown("<h4>Process</h4>", unsafe_allow_html=True)
- uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_multiple_files=True)
-
- if uploaded_files:
- submit = st.button("Get Result", key='submit')
-
- if uploaded_files and submit:
-
- with st.status("Extraction in progress ...", expanded=True) as status:
- for uploaded_file in stqdm(uploaded_files):
- start_time = datetime.now()
- with NamedTemporaryFile(dir='.', suffix=".pdf") as pdf:
-
- pdf.write(uploaded_file.getbuffer())
- st.markdown(f"Start Extraction process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
-
- # Load Documents
- loader = PyPDFLoader(pdf.name)
- pages = loader.load()
-
- chunk_size = 120000
- chunk_overlap = 0
- docs = pages
-
- # Split Documents
- if chunk_option:
- passage = '\n'.join([page.page_content for page in pages])
- docs = [Document(passage)]
- docs[0].metadata = {'source': pages[0].metadata['source']}
-
- chunk_size = chunk_option
- chunk_overlap = int(0.25 * chunk_size)
-
- text_splitter = TokenTextSplitter.from_tiktoken_encoder(
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
- )
- chunks = text_splitter.split_documents(docs)
-
- # Start extraction process in parallel
- process = Process(model)
- with ThreadPoolExecutor() as executor:
- result_text = executor.submit(process.get_entity, (chunks, 'alls')).result()
- result_table = executor.submit(process.get_table, pdf.name).result()
- result_rsid = executor.submit(process.get_rsid, passage).result()
-
- # Manually search for rsID
- result_text = pd.concat([result_text, result_rsid]).fillna('').reset_index(drop=True)
-
- # Combine two results
- result_text['Source'] = 'Text'
- result_table['Source'] = 'Table'
- dataframe = pd.concat([result_table, result_text], ignore_index=True)
- dataframe.reset_index(drop=True, inplace=True)
-
- # Validate Result
- st.markdown(f"Start Validation process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
- validation = Validation(model_val)
- df, df_clean = validation.validate(dataframe, passage, api)
- df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
-
- # Integrate with Database
- df_final = integrate(df)
-
- st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
- st.divider()
- st.write(f"Extracted **{len(df)}** rows with database alignment of **{len(df_final) - len(df)}** rows")
- st.dataframe(df_final)
-
- with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
- df_final.to_excel(writer, sheet_name='Validated + Database')
- df_clean.to_excel(writer, sheet_name='Cleaned')
- dataframe.to_excel(writer, sheet_name='Original')
- writer.close()
-
- st.markdown(
- create_download_link(
- "application/vnd.ms-excel",
- buffer.getvalue(),
- f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
- "Save Result"
- ),
- unsafe_allow_html=True
- )
+
+ # Set chunks
+ chunk_size = chunk_option
+ chunk_overlap = int(0.25 * chunk_size)
+
+ # Uploading form
+ form = st.form(key="files")
+ uploaded_files = form.file_uploader(label='Upload Paper(s) here', accept_multiple_files=True)
+ submit = form.form_submit_button("Get Result")
+ if not uploaded_files or not submit:
+ exit()
+
+ # Loop through uploaded files
+ buffers = []
+ for pdf in stqdm(uploaded_files):
+ file_name = pdf.name
+ with st.expander(f"{file_name}", expanded=True):
+ start_time = datetime.now()
+ st.markdown(f"Start Extraction process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
+
+ with NamedTemporaryFile(dir='.', suffix=".pdf") as file:
+ file.write(pdf.getbuffer())
+
+ # Load Documents
+ loader = PyPDFLoader(file.name)
+ pages = loader.load()
+ passage = '\n'.join([page.page_content for page in pages])
+
+ # Split text into chunks
+ docs = [Document(passage)]
+ text_splitter = TokenTextSplitter.from_tiktoken_encoder(
+ chunk_size=chunk_size, chunk_overlap=chunk_overlap
+ )
+ chunks = text_splitter.split_documents(docs)
+
+ # Start extraction process in parallel
+ process = Process(model)
+ with ThreadPoolExecutor() as executor:
+ result_text = executor.submit(process.get_entity, (chunks, 'alls')).result()
+ result_table = executor.submit(process.get_table, file.name).result()
+ result_rsid = executor.submit(process.get_rsid, passage).result()
+
+ # Manually search for rsID
+ result_text = pd.concat([result_text, result_rsid]).fillna('').reset_index(drop=True)
+
+ # Combine two results
+ result_text['Source'] = 'Text'
+ result_table['Source'] = 'Table'
+ dataframe = pd.concat([result_table, result_text], ignore_index=True)
+ dataframe.reset_index(drop=True, inplace=True)
+
+ # Validate Result
+ st.markdown(f"Start Validation process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
+ validation = Validation(model_val)
+ df, df_clean = validation.validate(dataframe, passage, api)
+ df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
+
+ # Integrate with Database
+ df_final = integrate(df)
+
+ st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+ st.divider()
+ st.write(f"Extracted **{len(df)}** rows with database alignment of **{len(df_final) - len(df)}** rows")
+ st.dataframe(df_final)
+
+ # Save to Excel
+ output_name = f"{file_name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx"
+ with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+ df_final.to_excel(writer, sheet_name='Validated + Database')
+ df_clean.to_excel(writer, sheet_name='Cleaned')
+ dataframe.to_excel(writer, sheet_name='Original')
+ writer.close()
+
+ st.markdown(
+ create_download_link(
+ "application/vnd.ms-excel",
+ buffer.getvalue(),
+ output_name,
+ "Save Result"
+ ),
+ unsafe_allow_html=True
+ )
+
+ buffers.append((buffer, output_name))
+
+ # Zip all results
+ zip_buffer = io.BytesIO()
+ for buffer, output_name in buffers:
+ with zipfile.ZipFile(zip_buffer, 'a') as zip_file:
+ zip_file.writestr(output_name, buffer.getvalue())
+
+ # Download all results
+ st.markdown(
+ create_download_link(
+ "application/octet-stream",
+ zip_buffer.getvalue(),
+ "extracted-results.zip",
+ "Download All Results"
+ ),
+ unsafe_allow_html=True
+ )
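
Note on the zip step: each paper's Excel workbook is kept in an in-memory buffer and the buffers are then bundled into one archive for the combined download. Below is a minimal standalone sketch of that bundling, assuming `buffers` is a list of `(io.BytesIO, filename)` pairs as built in the loop above; the Streamlit and extraction pieces are omitted, and the archive is opened once in `'w'` mode instead of being reopened per entry as in the committed loop.

import io
import zipfile

def bundle_results(buffers):
    """Pack (io.BytesIO, filename) pairs into a single in-memory zip archive."""
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for buffer, output_name in buffers:
            # Each per-paper workbook becomes one member of the archive.
            zip_file.writestr(output_name, buffer.getvalue())
    return zip_buffer.getvalue()

The resulting bytes can then be passed to the download helper the same way the commit passes `zip_buffer.getvalue()` to `create_download_link`.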
utils.py CHANGED
@@ -63,15 +63,17 @@ def generate_raw_files():

# Load Raw GWAS files
if os.path.exists(gwas_path):
- gwas = pd.read_csv(gwas_path, delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+ gwas = pd.read_csv(gwas_path, delimiter='\t', dtype=str)
else:
data = requests.get(raw_url).content.decode('utf-8')
- gwas = pd.read_csv(StringIO(data), delimiter='\t')[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']]
+ gwas = pd.read_csv(StringIO(data), delimiter='\t', dtype=str)
+
+ gwas = gwas[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']].copy()

# Load Genes and SNPs from GWAS
gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
- gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
+ gwas_gene_rsid.loc[:, 'MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())

# Generate Genes and SNPs mapping
ground_truth = defaultdict(list)
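
Note on the utils.py change: reading the catalog with `dtype=str` keeps every column as plain strings (the GWAS file mixes numeric and text values within columns), and the explicit `.copy()` together with assignment through `.loc` sidesteps pandas' chained-assignment (SettingWithCopy) warning when modifying a sliced frame. A small self-contained illustration of the same pattern, using a made-up three-row frame in place of the real GWAS catalog:

import pandas as pd

# Hypothetical stand-in for the GWAS catalog slice; the real file is tab-separated.
gwas = pd.DataFrame({'MAPPED_GENE': ['fto, irx3', None, 'tcf7l2'],
                     'SNPS': ['rs9939609', 'rs1421085', 'rs7903146']})

gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']].copy()
gwas_gene_rsid.dropna(inplace=True, ignore_index=True)

# Assign through .loc on the full axis instead of chaining indexers on a view.
gwas_gene_rsid.loc[:, 'MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(
    lambda x: x.replace(' ', '').upper()
)
print(gwas_gene_rsid)  # MAPPED_GENE rows become 'FTO,IRX3' and 'TCF7L2'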