fadliaulawi committed
Commit ff62661 · 1 Parent(s): 8fe9391

Implement multiple files and zip
app.py CHANGED
@@ -1,6 +1,7 @@
 import io
 import pandas as pd
 import streamlit as st
+import zipfile
 
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
@@ -63,85 +64,104 @@ if api:
 
     st.divider()
     st.markdown("<h4>Process</h4>", unsafe_allow_html=True)
-    (82 lines of the previous single-file flow removed; their content is not shown in the rendered diff)
+
+    # Set chunks
+    chunk_size = chunk_option
+    chunk_overlap = int(0.25 * chunk_size)
+
+    # Uploading form
+    form = st.form(key="files")
+    uploaded_files = form.file_uploader(label='Upload Paper(s) here', accept_multiple_files=True)
+    submit = form.form_submit_button("Get Result")
+    if not uploaded_files or not submit:
+        exit()
+
+    # Loop through uploaded files
+    buffers = []
+    for pdf in stqdm(uploaded_files):
+        file_name = pdf.name
+        with st.expander(f"{file_name}", expanded=True):
+            start_time = datetime.now()
+            st.markdown(f"Start Extraction process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
+
+            with NamedTemporaryFile(dir='.', suffix=".pdf") as file:
+                file.write(pdf.getbuffer())
+
+                # Load Documents
+                loader = PyPDFLoader(file.name)
+                pages = loader.load()
+                passage = '\n'.join([page.page_content for page in pages])
+
+                # Split text into chunks
+                docs = [Document(passage)]
+                text_splitter = TokenTextSplitter.from_tiktoken_encoder(
+                    chunk_size=chunk_size, chunk_overlap=chunk_overlap
+                )
+                chunks = text_splitter.split_documents(docs)
+
+                # Start extraction process in parallel
+                process = Process(model)
+                with ThreadPoolExecutor() as executor:
+                    result_text = executor.submit(process.get_entity, (chunks, 'alls')).result()
+                    result_table = executor.submit(process.get_table, file.name).result()
+                    result_rsid = executor.submit(process.get_rsid, passage).result()
+
+                # Manually search for rsID
+                result_text = pd.concat([result_text, result_rsid]).fillna('').reset_index(drop=True)
+
+                # Combine two results
+                result_text['Source'] = 'Text'
+                result_table['Source'] = 'Table'
+                dataframe = pd.concat([result_table, result_text], ignore_index=True)
+                dataframe.reset_index(drop=True, inplace=True)
+
+                # Validate Result
+                st.markdown(f"Start Validation process at <code>{datetime.now().strftime('%H:%M')}</code>", unsafe_allow_html=True)
+                validation = Validation(model_val)
+                df, df_clean = validation.validate(dataframe, passage, api)
+                df.drop_duplicates(['Genes', 'rsID'], ignore_index=True, inplace=True)
+
+                # Integrate with Database
+                df_final = integrate(df)
+
+                st.write("Success in ", round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
+                st.divider()
+                st.write(f"Extracted **{len(df)}** rows with database alignment of **{len(df_final) - len(df)}** rows")
+                st.dataframe(df_final)
+
+            # Save to Excel
+            output_name = f"{file_name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx"
+            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
+                df_final.to_excel(writer, sheet_name='Validated + Database')
+                df_clean.to_excel(writer, sheet_name='Cleaned')
+                dataframe.to_excel(writer, sheet_name='Original')
+                writer.close()
+
+            st.markdown(
+                create_download_link(
+                    "application/vnd.ms-excel",
+                    buffer.getvalue(),
+                    output_name,
+                    "Save Result"
+                ),
+                unsafe_allow_html=True
+            )
+
+            buffers.append((buffer, output_name))
+
+    # Zip all results
+    zip_buffer = io.BytesIO()
+    for buffer, output_name in buffers:
+        with zipfile.ZipFile(zip_buffer, 'a') as zip_file:
+            zip_file.writestr(output_name, buffer.getvalue())
+
+    # Download all results
+    st.markdown(
+        create_download_link(
+            "application/octet-stream",
+            zip_buffer.getvalue(),
+            "extracted-results.zip",
+            "Download All Results"
+        ),
+        unsafe_allow_html=True
+    )
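One behavioural note on the executor block above: each `executor.submit(...).result()` call waits for that future before the next `submit` runs, so the three extraction stages execute back-to-back despite the "in parallel" comment. If overlap is wanted, the futures have to be submitted first and collected afterwards. A minimal sketch with stub stages (the stubs are placeholders, not the repo's Process methods):

from concurrent.futures import ThreadPoolExecutor

# Stand-ins for process.get_entity / process.get_table / process.get_rsid.
def get_entity(args): return f"entities from {args}"
def get_table(path): return f"tables from {path}"
def get_rsid(text): return f"rsIDs from {text}"

with ThreadPoolExecutor() as executor:
    # Submit all three tasks before waiting, so they can overlap.
    future_text = executor.submit(get_entity, ('chunks', 'alls'))
    future_table = executor.submit(get_table, 'paper.pdf')
    future_rsid = executor.submit(get_rsid, 'passage text')

    # Only now block for results; submit(...).result() inline would
    # finish each task before the next one is even submitted.
    result_text = future_text.result()
    result_table = future_table.result()
    result_rsid = future_rsid.result()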
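The headline change is the zip step: each paper's Excel output stays in memory as an io.BytesIO buffer and is packed into one archive with zipfile.writestr. create_download_link is defined elsewhere in the repo; judging from the call sites it takes a MIME type, raw bytes, a filename, and a link label, and a common Streamlit idiom for such a helper is a base64 data-URI anchor. A self-contained sketch of the whole pattern (the helper body here is an assumption, not the repo's actual code):

import base64
import io
import zipfile

def create_download_link(mime_type, data, filename, label):
    # Assumed implementation: encode the payload as a base64 data URI,
    # which the app would render via st.markdown(..., unsafe_allow_html=True).
    b64 = base64.b64encode(data).decode()
    return f'<a href="data:{mime_type};base64,{b64}" download="{filename}">{label}</a>'

# Stand-ins for the per-paper Excel buffers the app collects in `buffers`.
buffers = [
    (io.BytesIO(b'workbook one'), 'paper_one.xlsx'),
    (io.BytesIO(b'workbook two'), 'paper_two.xlsx'),
]

# Pack every buffer into one in-memory archive. Opening the ZipFile once
# and looping inside is equivalent to the commit's reopen-per-iteration
# loop, since mode 'a' keeps appending to the same underlying buffer.
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'a') as zip_file:
    for buffer, output_name in buffers:
        zip_file.writestr(output_name, buffer.getvalue())

link = create_download_link('application/octet-stream', zip_buffer.getvalue(),
                            'extracted-results.zip', 'Download All Results')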
utils.py CHANGED
@@ -63,15 +63,17 @@ def generate_raw_files():
 
     # Load Raw GWAS files
     if os.path.exists(gwas_path):
-        gwas = pd.read_csv(gwas_path, delimiter='\t')
+        gwas = pd.read_csv(gwas_path, delimiter='\t', dtype=str)
     else:
         data = requests.get(raw_url).content.decode('utf-8')
-        gwas = pd.read_csv(StringIO(data), delimiter='\t')
+        gwas = pd.read_csv(StringIO(data), delimiter='\t', dtype=str)
+
+    gwas = gwas[['DISEASE/TRAIT', 'CHR_ID', 'MAPPED_GENE', 'SNPS', 'P-VALUE', 'OR or BETA']].copy()
 
     # Load Genes and SNPs from GWAS
     gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']]
     gwas_gene_rsid.dropna(inplace=True, ignore_index=True)
-    gwas_gene_rsid['MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
+    gwas_gene_rsid.loc[:, 'MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(lambda x: x.replace(' ', '').upper())
 
     # Generate Genes and SNPs mapping
     ground_truth = defaultdict(list)
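Both utils.py changes are pandas hygiene: dtype=str stops read_csv from inferring mixed dtypes across the large GWAS catalog TSV, and writing through .loc on an explicit copy avoids the SettingWithCopyWarning that the old chained assignment could trigger. A toy illustration of both points (the rows below are made-up data, not the GWAS catalog):

import pandas as pd
from io import StringIO

# Made-up rows shaped like the columns utils.py keeps.
tsv = "MAPPED_GENE\tSNPS\nbrca1 - tp53\trs123\nAPOE\trs429358\n"

# dtype=str: every column comes back as strings, so numeric-looking
# columns (e.g. CHR_ID, P-VALUE) never end up with mixed dtypes.
gwas = pd.read_csv(StringIO(tsv), delimiter='\t', dtype=str)

# Slice with .copy(), then write through .loc: the assignment target is
# unambiguous, so pandas raises no SettingWithCopyWarning.
gwas_gene_rsid = gwas[['MAPPED_GENE', 'SNPS']].copy()
gwas_gene_rsid.loc[:, 'MAPPED_GENE'] = gwas_gene_rsid['MAPPED_GENE'].apply(
    lambda x: x.replace(' ', '').upper()
)
print(gwas_gene_rsid)  # MAPPED_GENE becomes 'BRCA1-TP53', 'APOE'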