Seetha committed on
Commit 51cb4ac
1 Parent(s): 5106269

Update app.py

Files changed (1): app.py (+116 −66)
app.py CHANGED
@@ -10,6 +10,7 @@ from transformers import AutoTokenizer, DistilBertTokenizerFast
 from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
 import numpy as np
 import pandas as pd
+import torch
 import json
 import sys
 import os
@@ -31,6 +32,7 @@ import json
 import re
 import numpy as np
 import pandas as pd
+import re
 import nltk
 nltk.download("punkt")
 #stemmer = nltk.SnowballStemmer("english")
@@ -56,9 +58,9 @@ from sklearn.feature_extraction.text import CountVectorizer
 #from urllib.request import urlopen
 #from tabulate import tabulate
 import csv
-# import gdown
-# import zipfile
-# import wget
+#import gdown
+import zipfile
+import wget
 import pdfplumber
 import pathlib
 import shutil
@@ -66,6 +68,9 @@ import webbrowser
 from streamlit.components.v1 import html
 import streamlit.components.v1 as components
 from PyPDF2 import PdfReader
+from git import Repo
+import io
+


 #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -81,17 +86,20 @@ def main():
     k=2
     seed = 1
     k1= 5
-
-    uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
     text_list = []
     causal_sents = []

-    reader = PdfReader(uploaded_file)
+    try:
+        uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
+        st.stop()
+    except:
+        st.write("Upload a pdf file...")

-    for page in reader.pages:
-        text = page.extract_text()
-        text_list.append(text)
-
+    if uploaded_file is not None:
+        reader = PdfReader(uploaded_file)
+        for page in reader.pages:
+            text = page.extract_text()
+            text_list.append(text)
     text_list_final = [x.replace('\n', '') for x in text_list]
     text_list_final = re.sub('"', '', str(text_list_final))

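A note on the new upload guard: as committed, `st.stop()` sits inside the `try` and runs on every execution, and the bare `except` fires only on a real exception, not on a missing upload, so the script halts before the `if uploaded_file is not None:` block is reached. A minimal sketch of the conventional Streamlit pattern, not the committed code, relying on `st.file_uploader` returning `None` until a file is supplied:

```python
# Sketch: guard on the None return value instead of try/except,
# and stop only when nothing has been uploaded yet.
uploaded_file = st.sidebar.file_uploader("Choose a file", type="pdf")
if uploaded_file is None:
    st.write("Upload a pdf file...")
    st.stop()  # end this run; Streamlit reruns the script after an upload
reader = PdfReader(uploaded_file)
```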
@@ -103,8 +111,9 @@ def main():
     result2 = re.sub(r'[^\w\s]','',result1)
     result.append(result2)

-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    model_path = "checkpoint-2850"
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") #bert-base-uncased
+
+    model_path = "checkpoint2850"

     model = AutoModelForSequenceClassification.from_pretrained(model_path,id2label={0:'non-causal',1:'causal'})

@@ -117,7 +126,10 @@ def main():

     model_name = "distilbert-base-cased"
     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
-    model_path1 = "DistilBertforTokenclassification"
+
+
+
+    model_path1 = "DistilBertForTokeClassification"

     model = DistilBertForTokenClassification.from_pretrained(model_path1) #len(unique_tags),, num_labels= 7, , id2label={0:'CT',1:'E',2:'C',3:'O'}
     pipe = pipeline('ner', model=model, tokenizer=tokenizer,aggregation_strategy='simple') #grouped_entities=True
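Both classifier paths in the two hunks above now point at local checkpoint directories, so `from_pretrained` will look for a `config.json` plus weights under exactly those folder names; the renames to `checkpoint2850` and `DistilBertForTokeClassification` (note the "Toke" spelling) only work if the directories in the Space match character for character. A hedged sketch of a load-time check, not part of the commit:

```python
import os

# Sketch: fail fast with a clear message if a checkpoint folder is
# missing or misspelled (paths as committed, "Toke" spelling included).
for path in ("checkpoint2850", "DistilBertForTokeClassification"):
    if not os.path.isdir(path):
        raise FileNotFoundError(f"Model checkpoint directory not found: {path}")
```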
@@ -162,9 +174,9 @@ def main():

     final_list = pd.DataFrame(
         {'Id': sent_id,
-         'Full sentence': sentence_pred,
+         'Full_sentence': sentence_pred,
          'Component': class_list,
-         'cause/effect': entity_list,
+         'CauseOrEffect': entity_list,
          'Label_level1': level0,
          'Label_level2': pred_val
         })
@@ -174,7 +186,7 @@ def main():


     final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]
-
+
     li = []
     uni = final_list1['Id'].unique()
     for i in uni:
@@ -186,17 +198,23 @@ def main():
     li_pan = pd.DataFrame(out,columns=['Id'])
     df3 = pd.merge(final_list1, li_pan[['Id']], on='Id', how='left', indicator=True) \
             .query("_merge == 'left_only'") \
-            .drop('_merge',1)
-
-    df = df3.groupby(['Id','Full sentence','cause/effect', 'Label_level1', 'Label_level2'])['Component'].apply(', '.join).reset_index()
-
-    df["cause/effect"].replace({"C": "cause", "E": "effect"}, inplace=True)
-    df_final = df[df['cause/effect'] != 'CT']
+            .drop("_merge",axis=1)
+
+    df = df3.groupby(['Id','Full_sentence','CauseOrEffect', 'Label_level1', 'Label_level2'])['Component'].apply(', '.join).reset_index()
+    #st.write(df)
+    df["CauseOrEffect"].replace({"C": "cause", "E": "effect"}, inplace=True)
+    df_final = df[df['CauseOrEffect'] != 'CT']
     df['New string'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
-    df_final = df_final.drop('Component',1)
+
+    df_final = df_final.drop("Component",axis=1)
     df_final.insert(2, "Component", df['New string'], True)

-    df_final.to_csv('predictions.csv')
+    df_final.to_csv('/app/ima-pipeline-streamlit/predictions.csv')
+
+    # buffer = io.BytesIO()
+    # with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+    #     df_final.to_excel(writer, sheet_name="Sheet1", index=False)
+    # writer.close()

     count_NP_NP = 0
     count_NP_investor = 0
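The switch from `.drop('_merge',1)` to `.drop("_merge",axis=1)` (and likewise for `Component`) tracks a pandas API change: passing `axis` positionally to `DataFrame.drop` was deprecated in the 1.x line and removed in pandas 2.0, where everything after the labels is keyword-only.

```python
# Positional axis was deprecated in pandas 1.x and rejected by 2.x:
df3 = df3.drop("_merge", axis=1)   # works on all recent versions
# df3 = df3.drop("_merge", 1)      # TypeError on pandas >= 2.0
```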
@@ -229,8 +247,8 @@ def main():
     count_soc_society = 0
     for i in range(0,df_final['Id'].max()):
         j = df_final.loc[df_final['Id'] == i]
-        cause_tab = j.loc[j['cause/effect'] == 'cause']
-        effect_tab = j.loc[j['cause/effect'] == 'effect']
+        cause_tab = j.loc[j['CauseOrEffect'] == 'cause']
+        effect_tab = j.loc[j['CauseOrEffect'] == 'effect']
         cause_coun_NP = (cause_tab.Label_level2 == 'Non-performance').sum()
         effect_coun_NP = (effect_tab.Label_level2 == 'Non-performance').sum()

@@ -428,9 +446,13 @@ def main():
     # 'Society': [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]},
     # index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'])

-    df_tab.to_csv('final_data.csv')
-
-    df = pd.read_csv('final_data.csv', index_col=0)
+    df_tab.to_csv('/app/ima-pipeline-streamlit/final_data.csv')
+
+    buffer = io.BytesIO()
+    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+        df_tab.to_excel(writer,sheet_name="Sheet1",index=False)
+    writer.close()
+    df = pd.read_csv('/app/ima-pipeline-streamlit/final_data.csv', index_col=0)

     # Convert to JSON format
     json_data = []
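On the new in-memory Excel export: the `with` block already closes the writer on exit, so the trailing `writer.close()` is redundant, and the step that is easy to miss is rewinding the buffer before it is handed to a download button. A minimal sketch under those assumptions:

```python
# Sketch: let the context manager close the writer, then rewind the
# buffer so the download button reads the workbook from the start.
buffer = io.BytesIO()
with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
    df_tab.to_excel(writer, sheet_name="Sheet1", index=False)
buffer.seek(0)
```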
@@ -443,11 +465,11 @@ def main():
         })

     # Write JSON to file
-    with open('smalljson.json', 'w') as f:
+    with open('/app/ima-pipeline-streamlit/ch.json', 'w') as f:
         json.dump(json_data, f)

-    csv_file = "predictions.csv"
-    json_file = "ch.json"
+    csv_file = "/app/ima-pipeline-streamlit/predictions.csv"
+    json_file = "/app/ima-pipeline-streamlit/smalljson.json"

     # Open the CSV file and read the data
     with open(csv_file, "r") as f:
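A path mismatch worth flagging in this hunk: the JSON is now dumped to `ch.json`, while `json_file` points at `smalljson.json` (the old code had the reverse pairing), so whatever later reads `json_file` will not see the data just written. A sketch that names the target once; the `JSON_PATH` constant is hypothetical:

```python
# Sketch with a hypothetical JSON_PATH constant so the dump and any
# later read refer to the same file.
JSON_PATH = "/app/ima-pipeline-streamlit/ch.json"
with open(JSON_PATH, "w") as f:
    json.dump(json_data, f)
json_file = JSON_PATH
```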
@@ -477,45 +499,73 @@ def main():
     csv2 = convert_df(df_tab.astype(str))

     with st.container():
+
+
+
         st.download_button(label="Download the detailed result table",data=csv1,file_name='results.csv',mime='text/csv')
-        st.download_button(label="Download the result table",data=csv2,file_name='final_data.csv',mime='text/csv')
+        # st.download_button(label="Download the result table",data=csv2,file_name='final_data.csv',mime='text/csv')
+
+        st.download_button(label="Download the detailed result table",data=buffer,file_name="df_final.xlsx",mime="application/vnd.ms-excel")
+        st.download_button(label="Download the result table",data=buffer,file_name="df_tab.xlsx",mime="application/vnd.ms-excel")
+
+    # repo_dir = 'IMA-pipeline-streamlit'
+    # repo = Repo(repo_dir)
+    # file_list = [
+    #     '/app/ima-pipeline-streamlit/results.csv',
+    #     '/app/ima-pipeline-streamlit/final_data.csv'
+    # ]
+    # commit_message = 'Add the generated files to Github'
+    # repo.index.add(file_list)
+    # repo.index.commit(commit_message)
+    # origin = repo.remote('origin')
+    # origin.push()

     # # LINK TO THE CSS FILE
-    # def tree_css(file_name):
-    #     with open('/Users/seetha/Downloads/tree.css')as f:
-    #         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
-    #
-    # def div_css(file_name):
-    #     with open('/Users/seetha/Downloads/div.css')as f:
-    #         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
-    #
-    # def side_css(file_name):
-    #     with open('/Users/seetha/Downloads/side.css')as f:
-    #         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
-    #
-    # tree_css('tree.css')
-    # div_css('div.css')
-    # side_css('side.css')
-
-    STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
-    CSS_PATH = (STREAMLIT_STATIC_PATH / "css1")
-    if not CSS_PATH.is_dir():
-        CSS_PATH.mkdir()
-
-    css_file = CSS_PATH / "tree.css"
-    css_file1 = CSS_PATH / "div.css"
-    css_file2 = CSS_PATH / "side.css"
-    jso_file = CSS_PATH / "smalljson.json"
-    if not css_file.exists():
-        shutil.copy("tree.css", css_file)
-        shutil.copy("div.css", css_file1)
-        shutil.copy("side.css", css_file2)
-        shutil.copy("smalljson.json", jso_file)
-
+    def tree_css(file_name):
+        with open('tree.css')as f:
+            st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+
+    def div_css(file_name):
+        with open('div.css')as f:
+            st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+
+    def side_css(file_name):
+        with open('side.css')as f:
+            st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+
+    tree_css('tree.css')
+    div_css('div.css')
+    side_css('side.css')
+    # STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
+    # CSS_PATH = (STREAMLIT_STATIC_PATH / "css1")
+    # if not CSS_PATH.is_dir():
+    #     CSS_PATH.mkdir()
+
+    # css_file = CSS_PATH / "tree.css"
+    # css_file1 = CSS_PATH / "div.css"
+    # css_file2 = CSS_PATH / "side.css"
+    # #jso_file = CSS_PATH / "smalljson.json"
+    # if not css_file.exists():
+    #     shutil.copy("tree.css", css_file)
+    #     shutil.copy("div.css", css_file1)
+    #     shutil.copy("side.css", css_file2)
+    #     shutil.copy("smalljson.json", jso_file)
+    STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
+    CSS_PATH = (STREAMLIT_STATIC_PATH / "assets/css")
+    if not CSS_PATH.is_dir():
+        CSS_PATH.mkdir()
+
+    css_file = CSS_PATH / "tree.css"
+    css_file1 = CSS_PATH / "div.css"
+    css_file2 = CSS_PATH / "side.css"
+    if not css_file.exists():
+        shutil.copy("assets/css/tree.css", css_file)
+        shutil.copy("assets/css/div.css", css_file1)
+        shutil.copy("assets/css/side.css", css_file2)
     HtmlFile = open("index.html", 'r', encoding='utf-8')
-    source_code = HtmlFile.read()
+    source_code = HtmlFile.read()
     #print(source_code)
-    components.html(source_code)
+    components.html(source_code)
     # # Define your javascript
     # my_js = """
     #     alert("Hello World");
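Two things stand out in the final hunk's download buttons: both new buttons pass the same `buffer`, which was filled with `df_tab` earlier (the per-table export for `df_final` is still commented out in an earlier hunk), so the button named `df_final.xlsx` serves `df_tab`'s bytes; and the first xlsx button reuses the label of the CSV button above it. Also, `application/vnd.ms-excel` is the legacy `.xls` type; `.xlsx` has its own MIME type. A sketch with a hypothetical second buffer, `final_buffer`, holding the `df_final` workbook:

```python
XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

# Sketch: one buffer per table (final_buffer is hypothetical) and
# distinct labels so the two buttons are distinguishable.
st.download_button(label="Download the detailed result table (xlsx)",
                   data=final_buffer, file_name="df_final.xlsx", mime=XLSX_MIME)
st.download_button(label="Download the result table (xlsx)",
                   data=buffer, file_name="df_tab.xlsx", mime=XLSX_MIME)
```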
 
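One caveat on the CSS relocation in that same hunk: `CSS_PATH` now nests two levels under Streamlit's static directory (`assets/css`), and `Path.mkdir()` without `parents=True` raises `FileNotFoundError` when the intermediate `assets` folder does not yet exist; writing into the installed package's static folder also assumes it is writable. A sketch of the defensive variant:

```python
import pathlib
import shutil
import streamlit as st

# Sketch: create intermediate directories and tolerate reruns.
STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / "static"
CSS_PATH = STREAMLIT_STATIC_PATH / "assets" / "css"
CSS_PATH.mkdir(parents=True, exist_ok=True)
for name in ("tree.css", "div.css", "side.css"):
    shutil.copy(f"assets/css/{name}", CSS_PATH / name)
```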