Spaces:

Shredder
/

CONBERT

Runtime error

File size: 6,172 Bytes

3090b4b
 
 
1f0c01c
3090b4b
 
 
c2f789d
70b186b
 
c2f789d
10fdf8a
3090b4b
75d092f
 
 
 
70b186b
3090b4b
 
c2f789d
 
3090b4b
c2f789d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3090b4b
c2f789d
75d092f
 
 
 
 
 
 
 
 
 
 
68e73b8
75d092f
 
 
 
 
e395983
e9e58e0
 
 
75d092f
 
 
8a6156f
75d092f
e9e58e0
 
 
 
8a6156f
75d092f
 
55caca8
 
3090b4b
 
 
 
c2f789d
4390c00
 
 
 
22a8fb2
 
 
 
 
 
 
3090b4b
55caca8
3090b4b
 
 
 
 
55caca8
3090b4b
 
a5a2a2e
 
 
 
 
 
3090b4b
 
 
 
 
 
 
 
c2f789d
3090b4b
 
 
 
 
 
a5a2a2e
 
8027117
9c74623
8027117
3090b4b
 
7c39280
3090b4b
 
 
 
 
 
 
 
 
 
7574f0c
3090b4b
55caca8
 
 
 
40670fd
7574f0c
3090b4b
 
10fdf8a
 
c2f789d
 
3090b4b
 
 
f3c0dcb
55caca8
 
c2f789d

from predict import run_prediction
from io import StringIO
import json
import gradio as gr
import spacy
from spacy import displacy
from transformers import AutoTokenizer, AutoModelForTokenClassification,RobertaTokenizer,pipeline
import torch
import nltk
from nltk.tokenize import sent_tokenize
from fin_readability_sustainability import BERTClass, do_predict
import pandas as pd
import en_core_web_sm
from fincat_utils import extract_context_words
from fincat_utils import bert_embedding_extract
import pickle
lr_clf = pickle.load(open("lr_clf_FiNCAT.pickle",'rb'))

nlp = en_core_web_sm.load()
nltk.download('punkt')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#SUSTAINABILITY STARTS
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
model_sustain = BERTClass(2, "sustanability")
model_sustain.to(device)
model_sustain.load_state_dict(torch.load('sustainability_model.bin', map_location=device)['model_state_dict'])

def get_sustainability(text):
  df = pd.DataFrame({'sentence':sent_tokenize(text)})
  actual_predictions_sustainability = do_predict(model_sustain, tokenizer_sus, df)
  highlight = []
  for sent, prob in zip(df['sentence'].values, actual_predictions_sustainability[1]):
    if prob>=4.384316:
      highlight.append((sent, 'non-sustainable'))
    elif prob<=1.423736:
      highlight.append((sent, 'sustainable'))
    else:
      highlight.append((sent, '-'))
  return highlight
#SUSTAINABILITY ENDS

#CLAIM STARTS
def score_fincat(txt):
  li = []
  highlight = []
  txt = " " + txt + " "
  k = ''
  for word in txt.split():
    if any(char.isdigit() for char in word):
      if word[-1] in ['.', ',', ';', ":", "-", "!", "?", ")", '"', "'"]:
        k = word[-1]
        word = word[:-1]
      st = txt.find(" " + word + k + " ")+1
      k = ''
      ed = st + len(word)
      x = {'paragraph' : txt, 'offset_start':st, 'offset_end':ed}
      context_text = extract_context_words(x)
      features = bert_embedding_extract(context_text, word)
      if(features[0]=='None'):
          continue
          #highlight.append((txt, '    '))
          #return highlight
      prediction = lr_clf.predict(features.reshape(1, 768))
      prediction_probability = '{:.4f}'.format(round(lr_clf.predict_proba(features.reshape(1, 768))[:,1][0], 4))
      highlight.append((word, '    In-claim' if prediction==1 else 'Out-of-Claim'))
     # li.append([word,'    In-claim' if prediction==1 else 'Out-of-Claim', prediction_probability])
    else:
       continue
     # highlight.append((word, '    '))
  if(len(highlight)<1):
      highlight.append((txt,'  '))   
  return highlight


##Summarization
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY") 
def summarize_text(text):
    resp = summarizer(text)
    stext = resp[0]['summary_text']
    return stext


def split_in_sentences(text):
    doc = nlp(text)
    return [str(sent).strip() for sent in doc.sents]
def make_spans(text,results):
    results_list = []
    for i in range(len(results)):
        results_list.append(results[i]['label'])
    facts_spans = []
    facts_spans = list(zip(split_in_sentences(text),results_list))
    return facts_spans    
##Forward Looking Statement
fls_model = pipeline("text-classification", model="yiyanghkust/finbert-fls", tokenizer="yiyanghkust/finbert-fls")
def fls(text):
    results = fls_model(split_in_sentences(text))
    return make_spans(text,results) 
    
##Company Extraction
ner=pipeline('ner',model='Jean-Baptiste/camembert-ner-with-dates',tokenizer='Jean-Baptiste/camembert-ner-with-dates', aggregation_strategy="simple")
def fin_ner(text):
    replaced_spans = ner(text)
    new_spans=[]
    for item in replaced_spans:
        item['entity']=item['entity_group']
        del item['entity_group']
        new_spans.append(item)
    return {"text": text, "entities": new_spans}
    
     
#CUAD STARTS    
def load_questions():
    questions = []
    with open('questions.txt') as f:
        questions = f.readlines()
    return questions


def load_questions_short():
    questions_short = []
    with open('questionshort.txt') as f:
        questions_short = f.readlines()
    return questions_short
questions = load_questions()
questions_short = load_questions_short()
def quad(query,file):
    with open(file.name) as f:
        paragraph = f.read()
    questions = load_questions()
    questions_short = load_questions_short()
    if (not len(paragraph)==0) and not (len(query)==0):
        print('getting predictions')
    predictions = run_prediction([query], paragraph, 'marshmellow77/roberta-base-cuad',n_best_size=5)
    answer = ""
    if predictions['0'] == "":
        answer = 'No answer found in document'
    else:
        with open("nbest.json") as jf:
            data = json.load(jf)
            for i in range(1):
                raw_answer=data['0'][i]['text']
                answer += f"{data['0'][i]['text']} -- \n"
                answer += f"Probability: {round(data['0'][i]['probability']*100,1)}%\n\n"
    #summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
    #resp = summarizer(answer)
    #stext = resp[0]['summary_text']
    
   # highlight,dff=score_fincat(answer)
    return answer,summarize_text(answer),fin_ner(answer),score_fincat(answer),get_sustainability(answer),fls(answer)    
                
                   
# b6 = gr.Button("Get Sustainability")
              #b6.click(get_sustainability, inputs = text, outputs = gr.HighlightedText())
              
              
#iface = gr.Interface(fn=get_sustainability, inputs="textbox", title="CONBERT",description="SUSTAINABILITY TOOL", outputs=gr.HighlightedText(), allow_flagging="never")
#iface.launch()

iface = gr.Interface(fn=quad, inputs=[gr.Dropdown(choices=questions_short,label='SEARCH QUERY'),gr.inputs.File(label='TXT FILE')], title="CONBERT",description="SUSTAINABILITY TOOL",article='Article', outputs=[gr.outputs.Textbox(label='Answer'),gr.outputs.Textbox(label='Summary'),gr.HighlightedText(label='NER'),gr.HighlightedText(label='CLAIM'),gr.HighlightedText(label='SUSTAINABILITY'),gr.HighlightedText(label='FLS')], allow_flagging="never")


iface.launch()