File size: 6,174 Bytes
10f176f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from predict import run_prediction
from io import StringIO
import json
import gradio as gr
import spacy
from spacy import displacy
from transformers import AutoTokenizer, AutoModelForTokenClassification,RobertaTokenizer,pipeline
import torch
import nltk
from nltk.tokenize import sent_tokenize
from fin_readability_sustainability import BERTClass, do_predict
import pandas as pd
import en_core_web_sm
from fincat_utils import extract_context_words
from fincat_utils import bert_embedding_extract
import pickle
lr_clf = pickle.load(open("lr_clf_FiNCAT.pickle",'rb'))
nlp = en_core_web_sm.load()
nltk.download('punkt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#SUSTAINABILITY STARTS
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
model_sustain = BERTClass(2, "sustanability")
model_sustain.to(device)
model_sustain.load_state_dict(torch.load('sustainability_model.bin', map_location=device)['model_state_dict'])
def get_sustainability(text):
df = pd.DataFrame({'sentence':sent_tokenize(text)})
actual_predictions_sustainability = do_predict(model_sustain, tokenizer_sus, df)
highlight = []
for sent, prob in zip(df['sentence'].values, actual_predictions_sustainability[1]):
if prob>=4.384316:
highlight.append((sent, 'non-sustainable'))
elif prob<=1.423736:
highlight.append((sent, 'sustainable'))
else:
highlight.append((sent, '-'))
return highlight
#SUSTAINABILITY ENDS
#CLAIM STARTS
def score_fincat(txt):
li = []
highlight = []
txt = " " + txt + " "
k = ''
for word in txt.split():
if any(char.isdigit() for char in word):
if word[-1] in ['.', ',', ';', ":", "-", "!", "?", ")", '"', "'"]:
k = word[-1]
word = word[:-1]
st = txt.find(" " + word + k + " ")+1
k = ''
ed = st + len(word)
x = {'paragraph' : txt, 'offset_start':st, 'offset_end':ed}
context_text = extract_context_words(x)
features = bert_embedding_extract(context_text, word)
if(features[0]=='None'):
highlight.append(('None', ' '))
return highlight
prediction = lr_clf.predict(features.reshape(1, 768))
prediction_probability = '{:.4f}'.format(round(lr_clf.predict_proba(features.reshape(1, 768))[:,1][0], 4))
highlight.append((word, ' In-claim' if prediction==1 else 'Out-of-Claim'))
# li.append([word,' In-claim' if prediction==1 else 'Out-of-Claim', prediction_probability])
else:
highlight.append((word, ' '))
#headers = ['numeral', 'prediction', 'probability']
#dff = pd.DataFrame(li)
# dff.columns = headers
return highlight
##Summarization
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
def summarize_text(text):
resp = summarizer(text)
stext = resp[0]['summary_text']
return stext
def split_in_sentences(text):
doc = nlp(text)
return [str(sent).strip() for sent in doc.sents]
def make_spans(text,results):
results_list = []
for i in range(len(results)):
results_list.append(results[i]['label'])
facts_spans = []
facts_spans = list(zip(split_in_sentences(text),results_list))
return facts_spans
##Forward Looking Statement
fls_model = pipeline("text-classification", model="yiyanghkust/finbert-fls", tokenizer="yiyanghkust/finbert-fls")
def fls(text):
results = fls_model(split_in_sentences(text))
return make_spans(text,results)
##Company Extraction
ner=pipeline('ner',model='Jean-Baptiste/camembert-ner-with-dates',tokenizer='Jean-Baptiste/camembert-ner-with-dates', aggregation_strategy="simple")
def fin_ner(text):
replaced_spans = ner(text)
new_spans=[]
for item in replaced_spans:
item['entity']=item['entity_group']
del item['entity_group']
new_spans.append(item)
return {"text": text, "entities": new_spans}
#CUAD STARTS
def load_questions():
questions = []
with open('questions.txt') as f:
questions = f.readlines()
return questions
def load_questions_short():
questions_short = []
with open('questionshort.txt') as f:
questions_short = f.readlines()
return questions_short
questions = load_questions()
questions_short = load_questions_short()
def quad(query,file):
with open(file.name) as f:
paragraph = f.read()
questions = load_questions()
questions_short = load_questions_short()
if (not len(paragraph)==0) and not (len(query)==0):
print('getting predictions')
predictions = run_prediction([query], paragraph, 'marshmellow77/roberta-base-cuad',n_best_size=5)
answer = ""
if predictions['0'] == "":
answer = 'No answer found in document'
else:
with open("nbest.json") as jf:
data = json.load(jf)
for i in range(1):
raw_answer=data['0'][i]['text']
answer += f"{data['0'][i]['text']} -- \n"
answer += f"Probability: {round(data['0'][i]['probability']*100,1)}%\n\n"
#summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
#resp = summarizer(answer)
#stext = resp[0]['summary_text']
# highlight,dff=score_fincat(answer)
return answer,summarize_text(answer),fin_ner(answer),score_fincat(answer),get_sustainability(answer),fls(answer)
# b6 = gr.Button("Get Sustainability")
#b6.click(get_sustainability, inputs = text, outputs = gr.HighlightedText())
#iface = gr.Interface(fn=get_sustainability, inputs="textbox", title="CONBERT",description="SUSTAINABILITY TOOL", outputs=gr.HighlightedText(), allow_flagging="never")
#iface.launch()
iface = gr.Interface(fn=quad, inputs=[gr.Dropdown(choices=questions,label='SEARCH QUERY'),gr.inputs.File(label='TXT FILE')], title="CONBERT",description="SUSTAINABILITY TOOL",article='Article', outputs=[gr.outputs.Textbox(label='Answer'),gr.outputs.Textbox(label='Summary'),gr.HighlightedText(label='NER'),gr.HighlightedText(label='CLAIM'),gr.HighlightedText(label='SUSTAINABILITY'),gr.HighlightedText(label='FLS')], allow_flagging="never")
iface.launch() |