fadliaulawi committed · Commit e79e408 · 1 Parent(s): e347585

Add time tracking for each process
app.py CHANGED
@@ -142,6 +142,7 @@ if uploaded_files:
 # st.write(pdf.name)
 L = []
 # Entity Extraction
+start_time_ext = time.time()
 st.write("β Extracting Entities ...")
 bytes_data = uploaded_file.read()
 journal = Journal(uploaded_file.name, bytes_data)
@@ -181,12 +182,14 @@ if uploaded_files:
 chunkdf.append(df)
 
 concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
-st.write("β Entities Extraction Done ..")
+st.write("β Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
 time.sleep(0.1)
+start_time_summ = time.time()
 st.write("β Generating Summary ...")
 summary = get_summ(pdf.name)
-st.write("β Generating Summary Done ..")
+st.write("β Generating Summary Done ..", round((time.time() - start_time_summ) / 60, 2), "minutes")
 time.sleep(0.1)
+start_time_tab = time.time()
 st.write("β Table Extraction in progress ...")
 # Table Extraction
 # L = []
@@ -569,7 +572,7 @@ if uploaded_files:
 else:
     L.append(row)
 
-st.write("β Table Extraction Done ...")
+st.write("β Table Extraction Done ...", round((time.time() - start_time_summ) / 60, 2), "minutes")
 status.update(label="Gene and SNPs succesfully collected.")
 L = [{key: ''.join(['' if item == 'Unknow' else item for item in value]) for key, value in d.items()} for d in L]
 L = [{key: ''.join(['Not Available' if item == '' else item for item in value]) for key, value in d.items()} for d in L]
@@ -624,6 +627,7 @@ if uploaded_files:
 with NamedTemporaryFile(dir='.', suffix=".pdf") as rotated_pdf:
     pdf_writer.write(rotated_pdf.name)
     # Entity Extraction
+    start_time_ext = time.time()
     st.write("β Extracting Entities ...")
     bytes_data = uploaded_file.read()
     journal = Journal(uploaded_file.name, bytes_data)
@@ -654,12 +658,14 @@ if uploaded_files:
 chunkdf.append(df)
 
 concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
-st.write("β Entities Extraction Done ..")
+st.write("β Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
 time.sleep(0.1)
+start_time_summ = time.time()
 st.write("β Generating Summary ...")
 summary = get_summ(pdf.name)
-st.write("β Generating Summary Done ..")
+st.write("β Generating Summary Done ..", round((time.time() - start_time_summ) / 60, 2), "minutes")
 time.sleep(0.1)
+start_time_tab = time.time()
 st.write("β Table Extraction in progress ...")
 
 # Table Extraction
@@ -930,53 +936,12 @@ if uploaded_files:
         })
     else:
         L.append(row)
-
-
-
-
-
-
-        'Title' : concat['title'][0],
-        'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
-        'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
-        'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
-        'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
-        'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
-        'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
-        'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
-        'Recommendation' : summary,
-    }
-    }
-    if not row['SNPs'].startswith("rs"):
-        row.update({
-            'SNPs' : "-"
-        })
-    else:
-        L.append(row)
-except KeyError:
-    row = literal_eval(output_list[2]['result'].split('\n')[i])
-    row = {**row, **{
-        'Title' : concat['title'][0],
-        'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
-        'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
-        'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
-        'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
-        'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
-        'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
-        'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
-        'Recommendation' : summary,
-    }
-    }
-    if not row['SNPs'].startswith("rs"):
-        row.update({
-            'SNPs' : "-"
-        })
-    else:
-        L.append(row)
-except ValueError:
-    if type(output_list[2]['result'].split('\n')[i]) is dict:
-        row = output_list[2]['result'].split('\n')[i]
-        row = {**row, **{
+# 3
+for i in range(len(output_list[2]['result'].split('\n'))):
+    if output_list[2]['result'].split('\n')[i] != "":
+        try:
+            row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
+            row = {**row, **{
                 'Title' : concat['title'][0],
                 'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
                 'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
@@ -987,15 +952,15 @@ if uploaded_files:
                 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
                 'Recommendation' : summary,
             }
-
-
-
-
-
-
-
-
-row = literal_eval(
+            }
+            if not row['SNPs'].startswith("rs"):
+                row.update({
+                    'SNPs' : "-"
+                })
+            else:
+                L.append(row)
+        except KeyError:
+            row = literal_eval(output_list[2]['result'].split('\n')[i])
             row = {**row, **{
                 'Title' : concat['title'][0],
                 'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
@@ -1014,7 +979,49 @@ if uploaded_files:
                 })
             else:
                 L.append(row)
-
+        except ValueError:
+            if type(output_list[2]['result'].split('\n')[i]) is dict:
+                row = output_list[2]['result'].split('\n')[i]
+                row = {**row, **{
+                    'Title' : concat['title'][0],
+                    'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
+                    'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
+                    'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
+                    'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
+                    'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
+                    'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
+                    'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
+                    'Recommendation' : summary,
+                }
+                }
+                if not row['SNPs'].startswith("rs"):
+                    row.update({
+                        'SNPs' : "-"
+                    })
+                else:
+                    L.append(row)
+        except SyntaxError:
+            row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+            row = {**row, **{
+                'Title' : concat['title'][0],
+                'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
+                'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
+                'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
+                'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
+                'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
+                'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
+                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
+                'Recommendation' : summary,
+            }
+            }
+            if not row['SNPs'].startswith("rs"):
+                row.update({
+                    'SNPs' : "-"
+                })
+            else:
+                L.append(row)
+
+st.write("β Table Extraction Done", round((time.time() - start_time_summ) / 60, 2), "minutes")
 status.update(label="Gene and SNPs succesfully collected.")
 L = [{key: ''.join(['' if item == 'Unknow' else item for item in value]) for key, value in d.items()} for d in L]
 L = [{key: ''.join(['Not Available' if item == '' else item for item in value]) for key, value in d.items()} for d in L]
@@ -1048,6 +1055,7 @@ if uploaded_files:
 pdf.write(uploaded_file.getbuffer())
 
 # Entity Extraction
+start_time_ext = time.time()
 st.write("β Extracting Entities ...")
 bytes_data = uploaded_file.read()
 journal = Journal(uploaded_file.name, bytes_data)
@@ -1068,8 +1076,9 @@ if uploaded_files:
 chunkdf.append(df)
 
 concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
-st.write("β Entities Extraction Done ..")
+st.write("β Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
 time.sleep(0.1)
+start_time_summ = time.time()
 st.write("β Generating Summary ...")
 
 if 'SNPs' in list(concat.columns):
@@ -1081,7 +1090,7 @@ if uploaded_files:
 
 summary = get_summ(pdf.name)
 time.sleep(0.1)
-st.write("β Generating Summary Done...")
+st.write("β Generating Summary Done...", round((time.time() - start_time_summ) / 60, 2), "minutes")
 for i in range(len(concat)):
     if (len(concat['genes_locus'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
         for g in concat['genes_locus'][i].split(','):
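
The pattern this commit adds is the same at every stage: capture time.time() into a start_time_* variable before the stage begins, then append round((time.time() - start) / 60, 2) and "minutes" to the completion message. One detail worth noting: start_time_tab is assigned but never read in this diff, and both "Table Extraction Done" messages measure from start_time_summ, so the reported table-extraction time also spans the summary stage. A minimal sketch of how the repetition could be folded into one helper (timed_step is a hypothetical name, not part of this commit; it assumes only time.time() and st.write(), which app.py already uses):

import time
from contextlib import contextmanager

import streamlit as st

@contextmanager
def timed_step(label):
    # Announce the stage, run its body, then report elapsed minutes,
    # mirroring round((time.time() - start) / 60, 2) from the diff above.
    start = time.time()
    st.write(f"{label} ...")
    try:
        yield
    finally:
        st.write(f"{label} Done ..", round((time.time() - start) / 60, 2), "minutes")

Hypothetical usage, with get_summ and pdf as defined in app.py:

with timed_step("Generating Summary"):
    summary = get_summ(pdf.name)

Each stage would then report its own elapsed time, which also rules out a start_time_summ/start_time_tab mix-up.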
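For reference, the block restored in the @@ -930,53 hunk iterates over the newline-separated model output in output_list[2]['result'], parses each non-empty line with ast.literal_eval, and falls back across exception types: KeyError when the line is a bare dict (indexing it with [0] raises KeyError), and ValueError or SyntaxError when the line is not a clean Python literal. A condensed sketch of that parse-and-merge logic (parse_rows and metadata are hypothetical names standing in for the inline loop and the Title/Authors/... fields assembled from concat):

from ast import literal_eval

def parse_rows(result, metadata):
    rows = []
    for line in result.split('\n'):
        if line == "":
            continue
        try:
            row = literal_eval(line)[0]    # usual case: a one-element list of dicts
        except KeyError:
            row = literal_eval(line)       # bare dict: indexing it with [0] raised KeyError
        except (ValueError, SyntaxError):
            # Not a parseable Python literal; app.py handles these branches with
            # further fallbacks, which this sketch condenses to a skip.
            continue
        row = {**row, **metadata}          # merge in Title, Authors, Publisher Name, ...
        if not row.get('SNPs', '').startswith("rs"):
            row['SNPs'] = "-"              # as in app.py, such rows are not appended
        else:
            rows.append(row)
    return rows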