# Hugging Face Space page header captured with this file (status: Sleeping).
# import all packages | |
import requests | |
import streamlit as st | |
from sklearn.model_selection import StratifiedKFold | |
from sklearn.model_selection import train_test_split | |
from sklearn.model_selection import KFold | |
# tokenizer | |
from transformers import AutoTokenizer, DistilBertTokenizerFast | |
# sequence tagging model + training-related | |
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments | |
import numpy as np | |
import pandas as pd | |
import torch | |
import json | |
import sys | |
import os | |
from sklearn.metrics import classification_report | |
from pandas import read_csv | |
from sklearn.linear_model import LogisticRegression | |
import sklearn.model_selection | |
from sklearn.feature_extraction.text import TfidfTransformer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
import math | |
from sklearn.metrics import accuracy_score | |
from sklearn.metrics import precision_recall_fscore_support | |
from sklearn.model_selection import train_test_split | |
import json | |
import re | |
import numpy as np | |
import pandas as pd | |
import re | |
import nltk | |
nltk.download("punkt") | |
import string | |
from sklearn.model_selection import train_test_split | |
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoConfig | |
import torch | |
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler | |
import itertools | |
import json | |
import glob | |
from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer | |
from transformers import pipeline | |
import pickle | |
import urllib.request | |
import csv | |
import pdfplumber | |
import pathlib | |
import shutil | |
import webbrowser | |
from streamlit.components.v1 import html | |
import streamlit.components.v1 as components | |
from PyPDF2 import PdfReader | |
from huggingface_hub import HfApi | |
import io | |
from datasets import load_dataset | |
import huggingface_hub | |
from huggingface_hub import Repository | |
from datetime import datetime | |
import pathlib as Path | |
# Location of the Hugging Face dataset repo file that holds the level-2
# cause -> effect link data.
DATASET_REPO_URL = "https://huggingface.co/datasets/Seetha/visual_files/raw/main"
DATA_FILENAME = "level2.json"
# URLs always use "/" as separator, so join explicitly rather than with
# os.path.join, whose separator depends on the host platform.
DATA_FILE = f"{DATASET_REPO_URL}/{DATA_FILENAME}"
# NOTE: this file's imports bind the name `Path` to the pathlib *module*
# (`import pathlib as Path`), so calling `Path(...)` raised
# "TypeError: 'module' object is not callable" at import time.  A filesystem
# path type is also the wrong tool for a URL (it collapses "https://"), so the
# address is kept as a plain string.
feedback_file = f"https://huggingface.co/datasets/Seetha/visual_files/{DATA_FILENAME}"
st.write(feedback_file)
# Access token for the Hugging Face Hub; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Stakeholder categories, in the row/column order of the cause -> effect table.
_CATEGORIES = ('Non-performance', 'Investors', 'Customers', 'Employees', 'Society')

# Class index produced by the pickled level-2 classifier -> category label.
_LEVEL2_LABELS = {
    0: 'Customers',
    1: 'Employees',
    2: 'Investors',
    3: 'Non-performance',
    4: 'Society',
    5: 'Unclassified',
}

# Keyword rules that override the classifier; groups are tried in order and
# the first group with a matching substring wins.
_KEYWORD_OVERRIDES = (
    ('Customers', ('customers', 'client', 'consumer', 'user')),
    ('Investors', ('investor', 'finance', 'shareholder', 'stockholder', 'owners')),
    ('Employees', ('employee', 'worker', 'staff')),
    ('Society', ('society', 'societal', 'social responsib', 'social performance', 'community')),
)

_DATASET_REPO_ID = "Seetha/visual_files"
_LEVEL2_JSON = "level2.json"


def _clean_sentence(sentence):
    """Lower-case *sentence* and drop every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', sentence.lower())


def _apply_keyword_overrides(components, labels):
    """Return a copy of *labels* where components containing a stakeholder keyword get that stakeholder's label."""
    result = list(labels)
    for pos, text in enumerate(components):
        for label, keywords in _KEYWORD_OVERRIDES:
            if any(keyword in text for keyword in keywords):
                result[pos] = label
                break
    return result


def _merge_wordpieces(frame):
    """Append a following '##' continuation piece to its preceding component and drop the '##' rows."""
    merged = frame.copy()
    following = merged['Component'].shift(-1)
    is_continued = following.str.startswith('##', na=False)
    merged.loc[is_continued, 'Component'] += ' ' + following[is_continued]
    return merged[~merged['Component'].astype(str).str.startswith('##')]


def _keep_sentences_with_effect(frame):
    """Keep only the rows of sentences (by 'Id') that contain at least one 'E' (effect) component."""
    # Renumber rows first so the surviving index labels are positions within
    # the de-duplicated component table.
    frame = frame.reset_index(drop=True)
    effect_ids = frame.loc[frame['causeOrEffect'] == 'E', 'Id'].unique()
    return frame[frame['Id'].isin(effect_ids)].copy()


def _count_cause_effect_pairs(frame):
    """Build the category x category table of cause -> effect co-occurrences.

    For every sentence ('Id') and every (cause category, effect category)
    pair that both occur in it, the larger of the two occurrence counts is
    added to cell [cause, effect].  Rows are causes, columns are effects.
    """
    totals = {cause: dict.fromkeys(_CATEGORIES, 0) for cause in _CATEGORIES}
    # groupby visits every sentence id that is present, including the largest
    # one, and simply yields nothing for an empty frame.
    for _, rows in frame.groupby('Id'):
        causes = rows.loc[rows['causeOrEffect'] == 'cause', 'Labellevel2'].value_counts()
        effects = rows.loc[rows['causeOrEffect'] == 'effect', 'Labellevel2'].value_counts()
        for cause in _CATEGORIES:
            n_cause = int(causes.get(cause, 0))
            if n_cause == 0:
                continue
            for effect in _CATEGORIES:
                n_effect = int(effects.get(effect, 0))
                if n_effect > 0:
                    totals[cause][effect] += max(n_cause, n_effect)
    return pd.DataFrame(
        [[totals[cause][effect] for effect in _CATEGORIES] for cause in _CATEGORIES],
        index=list(_CATEGORIES),
        columns=list(_CATEGORIES),
    )


def _matrix_to_links(matrix):
    """Flatten the cause x effect table into a list of {'source', 'target', 'value'} dicts."""
    return [
        {'source': row, 'target': col, 'value': int(matrix.loc[row, col])}
        for row in matrix.index
        for col in matrix.columns
    ]


def _publish_to_dataset(local_path):
    """Upload *local_path* to the dataset repo when an HF_TOKEN is configured; do nothing otherwise."""
    token = os.environ.get("HF_TOKEN")
    if not token:
        return
    HfApi().upload_file(
        path_or_fileobj=local_path,
        path_in_repo=os.path.basename(local_path),
        repo_id=_DATASET_REPO_ID,
        repo_type="dataset",
        token=token,
    )


def main():
    """Run the Streamlit app: PDF -> causal sentences -> cause/effect components -> result files.

    Steps, in order:
      1. read the uploaded PDF and split its text into cleaned sentences;
      2. keep the sentences the sequence classifier labels 'causal';
      3. tag cause/effect components in those sentences with the token classifier;
      4. classify every component into a stakeholder category (pickled model
         plus keyword overrides);
      5. tabulate cause -> effect category pairs and write the Excel download,
         ``level2.json``, ``predictions.csv`` and ``detailedResults.json``.

    Stops the script run (``st.stop()``) when no PDF is uploaded or when no
    cause/effect component is found.
    """
    st.title("Text to Causal Knowledge Graph")
    st.sidebar.title("Please upload your text documents in one file here:")

    try:
        uploaded_file = st.sidebar.file_uploader("Choose a file", type="pdf")
    except Exception:
        # Fall back to the bundled sample.  PdfReader takes the path itself;
        # wrapping an already-built PdfReader in another one fails.
        uploaded_file = 'sample_anno.pdf'
        st.error("Please upload your own PDF to be analyzed")
    if uploaded_file is None:
        st.error("Please upload your own PDF to be analyzed")
        st.stop()

    # --- 1. text extraction -------------------------------------------------
    reader = PdfReader(uploaded_file)
    # NOTE(review): newlines are removed without inserting a space, which can
    # glue the last word of a line to the first word of the next; kept as-is
    # because the models were presumably used with this preprocessing.
    pages = [(page.extract_text() or '').replace('\n', '') for page in reader.pages]
    # Join the pages as text; str(list) would leak list-repr brackets, quotes
    # and escape sequences into the document.
    document = ' '.join(pages).replace('"', '')
    sentences = [_clean_sentence(raw) for raw in nltk.sent_tokenize(document)]
    sentences = [sent for sent in sentences if sent.strip()]

    # --- 2. causal sentence detection ---------------------------------------
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "checkpoint-2850", id2label={0: 'non-causal', 1: 'causal'}
    )
    causal_pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
    causal_sents = []
    for sent in sentences:
        for lab in causal_pipe(sent):
            if lab['label'] == 'causal':
                causal_sents.append(sent)

    # --- 3. cause / effect component tagging --------------------------------
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
    model = DistilBertForTokenClassification.from_pretrained("DistilBertforTokenclassification")
    ner_pipe = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple')
    sentence_pred = []
    class_list = []
    entity_list = []
    for sent in causal_sents:
        for span in ner_pipe(sent):
            sentence_pred.append(sent)
            class_list.append(span['word'])
            entity_list.append(span['entity_group'])
    if not class_list:
        # Nothing to classify or tabulate; the steps below need at least one row.
        st.warning("No cause/effect components were found in the uploaded document.")
        st.stop()

    # --- 4. stakeholder classification --------------------------------------
    # The pickles are artefacts shipped with the app; never point these paths
    # at untrusted files, since unpickling executes arbitrary code.
    with open('Checkpoint-classification.sav', 'rb') as handle:
        loaded_model = pickle.load(handle)
    with open('vectorizefile_classification.pickle', 'rb') as handle:
        loaded_vectorizer = pickle.load(handle)
    predicted = loaded_model.predict(loaded_vectorizer.transform(class_list))
    # NOTE(review): level 1 is derived from the raw model output and is not
    # revisited when a keyword rule changes the level-2 label below.
    level0 = ['Non-Performance' if label == 3 else 'Performance' for label in predicted]
    pred_val = _apply_keyword_overrides(class_list, [_LEVEL2_LABELS[label] for label in predicted])

    sent_id, _ = pd.factorize(sentence_pred)
    final_list = pd.DataFrame(
        {'Id': sent_id,
         'Fullsentence': sentence_pred,
         'Component': class_list,
         'causeOrEffect': entity_list,
         'Labellevel1': level0,
         'Labellevel2': pred_val
         })
    final_list1 = _merge_wordpieces(final_list)
    df3 = _keep_sentences_with_effect(final_list1)
    # Assign the result back instead of a chained inplace replace, which does
    # not modify the frame when pandas copy-on-write is active.
    df3['causeOrEffect'] = df3['causeOrEffect'].replace({"C": "cause", "E": "effect"})
    df_final = df3[df3['causeOrEffect'] != 'CT'].copy()
    df_final['Component'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
    # The detailed sheet only lists components made of more than one word.
    df_final1 = df_final[df_final['Component'].str.split().str.len().gt(1)]

    # --- 5. tabulation and outputs ------------------------------------------
    df_tab = _count_cause_effect_pairs(df_final)

    buffer = io.BytesIO()
    # The context manager closes the workbook; an extra writer.close() inside
    # the block would close it twice.
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        # Keep the index: it holds the cause category of every row.
        df_tab.to_excel(writer, sheet_name="count_result")
        df_final1.to_excel(writer, sheet_name="Detailed_results", index=False)

    # open() cannot write to an https URL, so the link data is written locally
    # and then pushed to the dataset repo through the Hub API.
    with open(_LEVEL2_JSON, 'w', encoding='utf-8') as handle:
        json.dump(_matrix_to_links(df_tab), handle)
    try:
        _publish_to_dataset(_LEVEL2_JSON)
    except Exception as err:
        # A failed upload must not discard the results computed above.
        st.warning(f"Could not upload {_LEVEL2_JSON} to the dataset repo: {err}")

    # detailedResults.json mirrors predictions.csv row by row (all values as
    # strings, the index under the empty-string key).
    df_final1.to_csv('predictions.csv', encoding='utf-8')
    with open('predictions.csv', 'r', encoding='utf-8', newline='') as handle:
        data_list = [dict(row) for row in csv.DictReader(handle)]
    with open('detailedResults.json', 'w', encoding='utf-8') as handle:
        json.dump(data_list, handle)

    with st.container():
        st.download_button(label="Download the result table", data=buffer.getvalue(), file_name="t2cg_outputs.xlsx", mime="application/vnd.ms-excel")
# Start the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()