fadliaulawi committed
Commit e79e408 · 1 Parent(s): e347585

Add time tracking for each process
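
Each stage (entity extraction, summary generation, table extraction) now records a start timestamp with time.time() and appends the elapsed minutes to its completion message. A minimal sketch of the pattern, assuming Streamlit's st.write; the timed_stage helper is hypothetical and not in app.py (the commit inlines the same logic at each stage instead):

    import time
    import streamlit as st

    def timed_stage(label, work):
        # Record the start, run the stage, then report elapsed minutes,
        # mirroring the inline start_time_* / round(... / 60, 2) pattern below.
        start = time.time()
        st.write(f"☑ {label} ...")
        result = work()
        st.write(f"☑ {label} Done ..", round((time.time() - start) / 60, 2), "minutes")
        return result

    # e.g. summary = timed_stage("Generating Summary", lambda: get_summ(pdf.name))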

Files changed (1): app.py (+73 -64)
app.py CHANGED
@@ -142,6 +142,7 @@ if uploaded_files:
  # st.write(pdf.name)
  L = []
  # Entity Extraction
+ start_time_ext = time.time()
  st.write("☑ Extracting Entities ...")
  bytes_data = uploaded_file.read()
  journal = Journal(uploaded_file.name, bytes_data)
@@ -181,12 +182,14 @@ if uploaded_files:
  chunkdf.append(df)

  concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
- st.write("☑ Entities Extraction Done ..")
+ st.write("☑ Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
  time.sleep(0.1)
+ start_time_summ = time.time()
  st.write("☑ Generating Summary ...")
  summary = get_summ(pdf.name)
- st.write("☑ Generating Summary Done ..")
+ st.write("☑ Generating Summary Done ..", round((time.time() - start_time_summ) / 60, 2), "minutes")
  time.sleep(0.1)
+ start_time_tab = time.time()
  st.write("☑ Table Extraction in progress ...")
  # Table Extraction
  # L = []
@@ -569,7 +572,7 @@ if uploaded_files:
  else:
  L.append(row)

- st.write("☑ Table Extraction Done ...")
+ st.write("☑ Table Extraction Done ...", round((time.time() - start_time_tab) / 60, 2), "minutes")
  status.update(label="Gene and SNPs successfully collected.")
  L = [{key: ''.join(['' if item == 'Unknow' else item for item in value]) for key, value in d.items()} for d in L]
  L = [{key: ''.join(['Not Available' if item == '' else item for item in value]) for key, value in d.items()} for d in L]
@@ -624,6 +627,7 @@ if uploaded_files:
  with NamedTemporaryFile(dir='.', suffix=".pdf") as rotated_pdf:
  pdf_writer.write(rotated_pdf.name)
  # Entity Extraction
+ start_time_ext = time.time()
  st.write("☑ Extracting Entities ...")
  bytes_data = uploaded_file.read()
  journal = Journal(uploaded_file.name, bytes_data)
@@ -654,12 +658,14 @@ if uploaded_files:
  chunkdf.append(df)

  concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
- st.write("☑ Entities Extraction Done ..")
+ st.write("☑ Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
  time.sleep(0.1)
+ start_time_summ = time.time()
  st.write("☑ Generating Summary ...")
  summary = get_summ(pdf.name)
- st.write("☑ Generating Summary Done ..")
+ st.write("☑ Generating Summary Done ..", round((time.time() - start_time_summ) / 60, 2), "minutes")
  time.sleep(0.1)
+ start_time_tab = time.time()
  st.write("☑ Table Extraction in progress ...")

  # Table Extraction
@@ -930,53 +936,12 @@ if uploaded_files:
  })
  else:
  L.append(row)
- # 3
- for i in range(len(output_list[2]['result'].split('\n'))):
- if output_list[2]['result'].split('\n')[i] != "":
- try:
- row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
- row = {**row, **{
- 'Title' : concat['title'][0],
- 'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
- 'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
- 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
- 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
- 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
- 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
- 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
- 'Recommendation' : summary,
- }
- }
- if not row['SNPs'].startswith("rs"):
- row.update({
- 'SNPs' : "-"
- })
- else:
- L.append(row)
- except KeyError:
- row = literal_eval(output_list[2]['result'].split('\n')[i])
- row = {**row, **{
- 'Title' : concat['title'][0],
- 'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
- 'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
- 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
- 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
- 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
- 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
- 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
- 'Recommendation' : summary,
- }
- }
- if not row['SNPs'].startswith("rs"):
- row.update({
- 'SNPs' : "-"
- })
- else:
- L.append(row)
- except ValueError:
- if type(output_list[2]['result'].split('\n')[i]) is dict:
- row = output_list[2]['result'].split('\n')[i]
- row = {**row, **{
+ # 3
+ for i in range(len(output_list[2]['result'].split('\n'))):
+ if output_list[2]['result'].split('\n')[i] != "":
+ try:
+ row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
+ row = {**row, **{
  'Title' : concat['title'][0],
  'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
  'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
@@ -987,15 +952,15 @@ if uploaded_files:
  'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
  'Recommendation' : summary,
  }
- }
- if not row['SNPs'].startswith("rs"):
- row.update({
- 'SNPs' : "-"
- })
- else:
- L.append(row)
- except SyntaxError:
- row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+ }
+ if not row['SNPs'].startswith("rs"):
+ row.update({
+ 'SNPs' : "-"
+ })
+ else:
+ L.append(row)
+ except KeyError:
+ row = literal_eval(output_list[2]['result'].split('\n')[i])
  row = {**row, **{
  'Title' : concat['title'][0],
  'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
@@ -1014,7 +979,49 @@ if uploaded_files:
  })
  else:
  L.append(row)
- st.write("☑ Table Extraction Done")
+ except ValueError:
+ if type(output_list[2]['result'].split('\n')[i]) is dict:
+ row = output_list[2]['result'].split('\n')[i]
+ row = {**row, **{
+ 'Title' : concat['title'][0],
+ 'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
+ 'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
+ 'Recommendation' : summary,
+ }
+ }
+ if not row['SNPs'].startswith("rs"):
+ row.update({
+ 'SNPs' : "-"
+ })
+ else:
+ L.append(row)
+ except SyntaxError:
+ row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+ row = {**row, **{
+ 'Title' : concat['title'][0],
+ 'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
+ 'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
+ 'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
+ 'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
+ 'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
+ 'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
+ 'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
+ 'Recommendation' : summary,
+ }
+ }
+ if not row['SNPs'].startswith("rs"):
+ row.update({
+ 'SNPs' : "-"
+ })
+ else:
+ L.append(row)
+
+ st.write("☑ Table Extraction Done", round((time.time() - start_time_tab) / 60, 2), "minutes")
  status.update(label="Gene and SNPs successfully collected.")
  L = [{key: ''.join(['' if item == 'Unknow' else item for item in value]) for key, value in d.items()} for d in L]
  L = [{key: ''.join(['Not Available' if item == '' else item for item in value]) for key, value in d.items()} for d in L]
@@ -1048,6 +1055,7 @@ if uploaded_files:
  pdf.write(uploaded_file.getbuffer())

  # Entity Extraction
+ start_time_ext = time.time()
  st.write("☑ Extracting Entities ...")
  bytes_data = uploaded_file.read()
  journal = Journal(uploaded_file.name, bytes_data)
@@ -1068,8 +1076,9 @@ if uploaded_files:
  chunkdf.append(df)

  concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
- st.write("☑ Entities Extraction Done ..")
+ st.write("☑ Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
  time.sleep(0.1)
+ start_time_summ = time.time()
  st.write("☑ Generating Summary ...")

  if 'SNPs' in list(concat.columns):
@@ -1081,7 +1090,7 @@ if uploaded_files:

  summary = get_summ(pdf.name)
  time.sleep(0.1)
- st.write("☑ Generating Summary Done...")
+ st.write("☑ Generating Summary Done...", round((time.time() - start_time_summ) / 60, 2), "minutes")
  for i in range(len(concat)):
  if (len(concat['genes_locus'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
  for g in concat['genes_locus'][i].split(','):
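
Note on the large try/except block that the diff re-flows: each non-empty line of output_list[2]['result'] is parsed with ast.literal_eval, with a KeyError fallback for lines that evaluate to a bare dict rather than a one-element list, and ValueError/SyntaxError fallbacks for lines that fail to parse; a row is only kept when its 'SNPs' value starts with "rs". A condensed sketch of that parsing logic, where parse_result_line is hypothetical and not in app.py (it replaces the KeyError retry with an isinstance check):

    from ast import literal_eval

    def parse_result_line(line):
        # Each non-empty model-output line should evaluate to a dict, or to a
        # one-element list containing a dict; anything unparseable is skipped.
        if not line:
            return None
        try:
            parsed = literal_eval(line)
        except (ValueError, SyntaxError):
            return None
        if isinstance(parsed, list) and parsed:
            return parsed[0]
        return parsed if isinstance(parsed, dict) else None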