|
|
|
import requests |
|
import streamlit as st |
|
from sklearn.model_selection import StratifiedKFold |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.model_selection import KFold |
|
|
|
from transformers import AutoTokenizer, DistilBertTokenizerFast |
|
|
|
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments |
|
import numpy as np |
|
import pandas as pd |
|
import torch |
|
import json |
|
import sys |
|
import os |
|
from sklearn.metrics import classification_report |
|
from pandas import read_csv |
|
from sklearn.linear_model import LogisticRegression |
|
import sklearn.model_selection |
|
from sklearn.feature_extraction.text import TfidfTransformer |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.pipeline import Pipeline, FeatureUnion |
|
import math |
|
from sklearn.metrics import accuracy_score |
|
from sklearn.metrics import precision_recall_fscore_support |
|
from sklearn.model_selection import train_test_split |
|
import json |
|
import re |
|
import numpy as np |
|
import pandas as pd |
|
import re |
|
import nltk |
|
nltk.download("punkt") |
|
import string |
|
from sklearn.model_selection import train_test_split |
|
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoConfig |
|
import torch |
|
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler |
|
import itertools |
|
import json |
|
import glob |
|
from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer |
|
from transformers import pipeline |
|
import pickle |
|
import urllib.request |
|
import csv |
|
import pdfplumber |
|
import pathlib |
|
import shutil |
|
import webbrowser |
|
from streamlit.components.v1 import html |
|
import streamlit.components.v1 as components |
|
from PyPDF2 import PdfReader |
|
from huggingface_hub import HfApi |
|
import io |
|
from datasets import load_dataset |
|
|
|
import huggingface_hub |
|
from huggingface_hub import Repository |
|
from datetime import datetime |
|
import pathlib as Path |
|
from requests import get |
|
import urllib.request |
|
import gradio as gr |
|
from gradio import inputs, outputs |
|
from datasets import load_dataset |
|
|
|
# Streaming handle on the shared Hub dataset; main() maps over it and pushes
# the result back to the same repository.
dataset = load_dataset(
    'Seetha/Visualization',
    streaming=True,
)

# Tabular view built from the 'train' split of that dataset.
df = pd.DataFrame.from_dict(dataset['train'])
|
|
|
|
|
# Stakeholder categories, in the row/column order of the cause -> effect table.
_STAKEHOLDER_LABELS = ['Non-performance', 'Investors', 'Customers', 'Employees', 'Society']

# Class index predicted by the pickled component classifier -> level-2 label.
_LEVEL2_LABELS = {
    0: 'Customers',
    1: 'Employees',
    2: 'Investors',
    3: 'Non-performance',
    4: 'Society',
    5: 'Unclassified',
}

# Keyword rules that override the classifier's level-2 label. Order matters:
# the first group with a matching substring wins.
_KEYWORD_OVERRIDES = (
    ('Customers', ('customers', 'client', 'consumer', 'user')),
    ('Investors', ('investor', 'finance', 'shareholder', 'stockholder', 'owners')),
    ('Employees', ('employee', 'worker', 'staff')),
    # 'social responsib' is a prefix match (responsible / responsibility); the
    # previous literal 'social responsib*' could never match as a substring.
    ('Society', ('society', 'societal', 'social responsib', 'social performance', 'community')),
)


def _read_sentences(pdf_source):
    """Return lower-cased, punctuation-free sentences extracted from a PDF.

    ``pdf_source`` is anything ``PdfReader`` accepts (an uploaded file object
    or a path).
    """
    reader = PdfReader(pdf_source)
    # `or ''` keeps a page with no extractable text from breaking the join.
    # NOTE(review): line breaks are removed without inserting a space, which can
    # glue words together across lines - kept as in the original; confirm.
    pages = [(page.extract_text() or '').replace('\n', '') for page in reader.pages]
    # Join the page texts instead of tokenizing the Python repr of the list,
    # which leaked brackets, quotes and escape sequences into the sentences.
    text = re.sub('"', '', ' '.join(pages))
    return [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in nltk.sent_tokenize(text)]


def _select_causal_sentences(sentences):
    """Return the sentences the sequence classifier labels as 'causal'."""
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "checkpoint-2850", id2label={0: 'non-causal', 1: 'causal'}
    )
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    causal = []
    for sentence in sentences:
        for prediction in classifier(sentence):
            if prediction['label'] == 'causal':
                causal.append(sentence)
    return causal


def _extract_components(causal_sentences):
    """Run the token-classification model over each causal sentence.

    Returns three parallel lists: the source sentence, the extracted span text
    and the span's entity group, one entry per span found.
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
    model = DistilBertForTokenClassification.from_pretrained("DistilBertforTokenclassification")
    tagger = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple')

    sentence_pred, class_list, entity_list = [], [], []
    for sentence in causal_sentences:
        for span in tagger(sentence):
            sentence_pred.append(sentence)
            class_list.append(span['word'])
            entity_list.append(span['entity_group'])
    return sentence_pred, class_list, entity_list


def _classify_components(components):
    """Label each component string; return ``(level1_labels, level2_labels)``."""
    # The pickles are model artefacts shipped next to the app. pickle executes
    # arbitrary code on load, so these paths must never point at untrusted files.
    with open('Checkpoint-classification.sav', 'rb') as model_file:
        classifier = pickle.load(model_file)
    with open('vectorizefile_classification.pickle', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)

    predicted = classifier.predict(vectorizer.transform(components))

    # Class 3 is 'Non-performance'; every other class counts as 'Performance'.
    # NOTE(review): level 1 is derived from the raw prediction and is not
    # revisited after the keyword overrides below - confirm that is intended.
    level1 = ['Non-Performance' if label == 3 else 'Performance' for label in predicted]
    level2 = [_LEVEL2_LABELS[label] for label in predicted]

    for position, component in enumerate(components):
        for label, keywords in _KEYWORD_OVERRIDES:
            if any(keyword in component for keyword in keywords):
                level2[position] = label
                break
    return level1, level2


def _build_link_table(components):
    """Count cause -> effect stakeholder links, sentence by sentence.

    For every sentence (``Id``) and every (cause label, effect label) pair that
    both occur in it, the larger of the two occurrence counts is added to the
    cell ``[cause label, effect label]``. Rows are causes, columns are effects.
    """
    totals = {cause: dict.fromkeys(_STAKEHOLDER_LABELS, 0) for cause in _STAKEHOLDER_LABELS}

    # groupby visits every sentence id, including the last one, and simply
    # yields nothing for an empty frame.
    for _, sentence_rows in components.groupby('Id'):
        labels = sentence_rows['Labellevel2']
        roles = sentence_rows['causeOrEffect']
        cause_counts = labels[roles == 'cause'].value_counts()
        effect_counts = labels[roles == 'effect'].value_counts()

        for cause in _STAKEHOLDER_LABELS:
            n_cause = int(cause_counts.get(cause, 0))
            if n_cause == 0:
                continue
            for effect in _STAKEHOLDER_LABELS:
                n_effect = int(effect_counts.get(effect, 0))
                if n_effect:
                    totals[cause][effect] += max(n_cause, n_effect)

    return pd.DataFrame(
        [[totals[cause][effect] for effect in _STAKEHOLDER_LABELS] for cause in _STAKEHOLDER_LABELS],
        index=_STAKEHOLDER_LABELS,
        columns=_STAKEHOLDER_LABELS,
        dtype=object,
    )


def main():
    """Streamlit page: turn an uploaded PDF into cause/effect stakeholder tables.

    Steps: read the PDF, keep the sentences classified as causal, extract
    cause/effect spans, label each span with a stakeholder category, count
    cause -> effect links per category pair, then publish the results (Excel
    download, ``predictions.csv``, ``detailedResults.json`` and a push of the
    module-level ``dataset`` to the Hub).
    """
    st.title("Text to Causal Knowledge Graph")
    st.sidebar.title("Please upload your text documents in one file here:")

    uploaded_file = None
    try:
        uploaded_file = st.sidebar.file_uploader("Choose a file", type="pdf")
    except Exception:
        st.error("Please upload your own PDF to be analyzed")
        # Fall back to the bundled sample. Keep it as a path: the reader is
        # constructed later, and wrapping a PdfReader in PdfReader fails.
        uploaded_file = 'sample_anno.pdf'

    if uploaded_file is None:
        st.error("Please upload your own PDF to be analyzed")
        st.stop()

    sentences = _read_sentences(uploaded_file)
    causal_sents = _select_causal_sentences(sentences)
    sentence_pred, class_list, entity_list = _extract_components(causal_sents)

    if not class_list:
        # The component classifier cannot predict on zero samples.
        st.warning("No causal statements were found in the uploaded document.")
        st.stop()

    level0, pred_val = _classify_components(class_list)

    # One integer id per distinct sentence, in order of first appearance.
    sent_id, _ = pd.factorize(pd.Series(sentence_pred))
    final_list = pd.DataFrame(
        {
            'Id': sent_id,
            'Fullsentence': sentence_pred,
            'Component': class_list,
            'causeOrEffect': entity_list,
            'Labellevel1': level0,
            'Labellevel2': pred_val,
        }
    )

    # Append a following '##' word-piece row to the row before it, then drop
    # the word-piece rows themselves.
    next_component = final_list['Component'].shift(-1)
    is_continued = next_component.str.startswith('##', na=False)
    final_list.loc[is_continued, 'Component'] += ' ' + next_component[is_continued]
    # .copy() so the assignments below act on an independent frame rather than
    # on a filtered slice (chained in-place edits are no-ops under pandas
    # Copy-on-Write).
    final_list = final_list[~final_list['Component'].astype(str).str.startswith('##')].copy()

    final_list['causeOrEffect'] = final_list['causeOrEffect'].replace({'C': 'cause', 'E': 'effect'})
    df_final = final_list[final_list['causeOrEffect'] != 'CT'].copy()
    df_final['Component'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)

    # The detailed sheet only lists components made of more than one word.
    df_final1 = df_final[df_final['Component'].str.split().str.len().gt(1)]

    df_tab = _build_link_table(df_final)

    buffer = io.BytesIO()
    # The context manager saves and closes the workbook on exit.
    # NOTE(review): index=False omits the cause (row) labels from the count
    # sheet - kept as in the original; confirm consumers do not need them.
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        df_tab.to_excel(writer, sheet_name="count_result", index=False)
        df_final1.to_excel(writer, sheet_name="Detailed_results", index=False)

    links = [
        {'source': cause, 'target': effect, 'value': int(df_tab.loc[cause, effect])}
        for cause in df_tab.index
        for effect in df_tab.columns
    ]
    level2_df = pd.DataFrame(links)

    # NOTE(review): every example receives the complete link columns, and
    # `dataset` was loaded in streaming mode - confirm map()/push_to_hub behave
    # as intended for that dataset type.
    updated_dataset = dataset.map(
        lambda example: {
            'new_value': level2_df['value'],
            'new_source': level2_df['source'],
            'new_target': level2_df['target'],
        },
        remove_columns=['value', 'source', 'target'],
    )
    updated_dataset.push_to_hub('Seetha/Visualization')

    csv_file = "predictions.csv"
    json_file = "detailedResults.json"

    # The JSON export is produced from the CSV round trip, so every value in it
    # is a string and the index column appears under the empty key.
    df_final1.to_csv(csv_file)
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        data_list = [dict(row) for row in csv.DictReader(f)]
    with open(json_file, 'w', encoding="utf-8") as fi:
        fi.write(json.dumps(data_list))

    with st.container():
        st.download_button(
            label="Download the result table",
            data=buffer.getvalue(),
            file_name="t2cg_outputs.xlsx",
            mime="application/vnd.ms-excel",
        )
|
|
|
# Script entry point: run the Streamlit page when executed directly.
if __name__ == "__main__":
    main()
|
|