File size: 13,297 Bytes
8499c35
 
 
 
 
 
 
 
8ea204b
8499c35
 
 
35a0403
77b71a6
be67fcf
 
8d7b496
d401ad6
6155281
 
8499c35
 
6155281
8499c35
 
8d6cc8d
cedea8d
 
 
8499c35
 
 
 
 
 
 
f03dbdb
b7ef881
8499c35
 
377fd6b
 
 
 
a31e67a
1ad3fab
377fd6b
 
1ad3fab
 
 
 
 
8499c35
e429024
 
 
 
 
 
 
 
 
 
 
e7363fe
 
e429024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4a68cf
e429024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8499c35
 
 
 
 
 
f67f4fa
8499c35
 
 
 
 
 
 
a31e67a
8499c35
34b166f
8499c35
 
a31e67a
8499c35
34b166f
8499c35
 
 
 
 
f4b6788
8499c35
 
 
7a5728d
93ef2af
 
 
 
 
 
 
 
 
6a1d689
93ef2af
6a1d689
93ef2af
8499c35
7a5728d
 
 
8499c35
 
 
 
 
 
7a5728d
 
 
f4b6788
7a5728d
8499c35
7a5728d
 
 
 
 
 
 
 
 
 
 
 
 
8499c35
 
7a5728d
 
 
f4b6788
8499c35
 
7a5728d
8499c35
f4b6788
7a5728d
aba1152
8499c35
7a5728d
8499c35
7a5728d
8499c35
 
 
 
 
 
7a5728d
8499c35
 
 
 
 
 
 
 
 
 
7a5728d
 
8499c35
 
 
 
 
 
 
 
edd60a3
 
8499c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
040ddf0
8499c35
 
 
 
 
 
edd60a3
 
8499c35
 
 
edd60a3
 
8499c35
 
 
 
 
 
 
 
 
 
 
 
 
 
c2ce126
 
8499c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edd60a3
8499c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edd60a3
 
8499c35
 
 
 
 
 
 
 
 
 
 
edd60a3
 
 
f919f8f
7a5728d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import whisper
import os
from pytube import YouTube
import pandas as pd
import plotly_express as px
import nltk
import plotly.graph_objects as go
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import streamlit as st
import en_core_web_lg
import validators
import re
import itertools
import numpy as np
from bs4 import BeautifulSoup   
import base64, time
from annotated_text import annotated_text

# Download the NLTK 'punkt' resource when this module is imported
# (presumably required by sent_tokenize below -- confirm).
nltk.download('punkt')


from nltk import sent_tokenize

# Timestamp (day-month-year, hour-minute-second) captured once at import time;
# summary_downloader embeds it in the download filename.
time_str = time.strftime("%d%m%Y-%H%M%S")
# HTML container with a single ``{}`` placeholder; highlight_entities fills it
# through ``str.format``.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; 
margin-bottom: 2.5rem">{}</div> """

@st.experimental_singleton(suppress_st_warning=True)
def load_models():
    '''Build the Hugging Face pipelines and the cross-encoder used by this module.

    The function is wrapped in ``st.experimental_singleton`` so Streamlit can
    reuse the returned objects instead of rebuilding them.

    Returns:
        A 4-tuple ``(sent_pipe, sum_pipe, ner_pipe, cross_encoder)``: the
        text-classification pipeline, the summarization pipeline, the NER
        pipeline and the ``CrossEncoder`` instance, in that order.
    '''
    sentiment_checkpoint = "nickmuchi/quantized-optimum-finbert-tone"
    ner_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
    summarizer_checkpoint = "facebook/bart-large-cnn"

    # Sentiment: ONNX Runtime sequence-classification model plus its tokenizer.
    sentiment_classifier = pipeline(
        "text-classification",
        model=ORTModelForSequenceClassification.from_pretrained(sentiment_checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(sentiment_checkpoint),
    )

    # Summarization: the pipeline resolves model and tokenizer from the name.
    summarizer = pipeline(
        "summarization",
        model=summarizer_checkpoint,
        tokenizer=summarizer_checkpoint,
        clean_up_tokenization_spaces=True,
    )

    # Named-entity recognition with grouped entities.
    entity_tagger = pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained(ner_checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(ner_checkpoint),
        grouped_entities=True,
    )

    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    return sentiment_classifier, summarizer, entity_tagger, reranker

@st.experimental_singleton(suppress_st_warning=True)
def load_asr_model(asr_model_name):
    '''Load the Whisper speech-recognition model called ``asr_model_name``.

    Returns whatever ``whisper.load_model`` returns for that name.
    '''
    return whisper.load_model(asr_model_name)
        
@st.experimental_singleton(suppress_st_warning=True)
def load_sbert(model_name):
    '''Instantiate a ``SentenceTransformer`` for ``model_name`` and return it.'''
    return SentenceTransformer(model_name)

@st.experimental_memo(suppress_st_warning=True)
def embed_text(query, corpus, embedding_model, top_k=2):
    '''Embed text and generate semantic search scores, then re-rank the hits.

    Args:
        query: The question to search for.
        corpus: Indexable collection of passage strings to search.
        embedding_model: Name of the embedding model. It only selects how the
            query and passages are formatted (e5 prefixes, instructor
            instructions, or no change); the encoding itself always uses the
            module-level ``sbert`` model.
        top_k: Number of passages to retrieve before re-ranking. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        The hits from ``util.semantic_search`` for the query, each extended
        with a ``'cross-score'`` entry from the module-level ``cross_encoder``
        and sorted by that score, highest first. An empty corpus yields ``[]``.
    '''
    # Nothing to embed or rank.
    if len(corpus) == 0:
        return []

    # If model is e5 then apply prefixes to query and passage
    if embedding_model == 'intfloat/e5-base':
        search_input = 'query: ' + query
        passages_emb = ['passage: ' + sentence for sentence in corpus]

    # Instructor models take [instruction, text, 0] triples.
    elif embedding_model == 'hkunlp/instructor-base':
        search_input = [['Represent the Financial question; Input: ', query, 0]]
        passages_emb = [['Represent the Financial statement for retrieval; Input: ', sentence, 0] for sentence in corpus]

    else:
        search_input = query
        passages_emb = corpus

    # Embed corpus and question, and move both to the CPU before searching.
    corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True).cpu()
    question_embedding = sbert.encode(search_input, convert_to_tensor=True).cpu()

    # Calculate similarity scores and keep the hits for the first (only) query.
    hits = util.semantic_search(question_embedding, corpus_embedding, top_k=top_k)[0]

    # ##### Re-Ranking #####
    # The cross-encoder scores plain [question, passage] text pairs. The
    # instructor search input is a nested list, so the raw query is used there.
    # The passage is always the full corpus entry: the previous instructor
    # branch indexed into the passage string and passed a single character.
    if embedding_model == 'hkunlp/instructor-base':
        rerank_query = query
    else:
        rerank_query = search_input
    cross_inp = [[rerank_query, corpus[hit['corpus_id']]] for hit in hits]

    cross_scores = cross_encoder.predict(cross_inp)

    # Attach each cross-encoder score to its hit.
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Sort results by the cross-encoder scores, best first.
    return sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
    
@st.experimental_singleton(suppress_st_warning=True)
def get_spacy():
    '''Load the ``en_core_web_lg`` spaCy pipeline and return it.'''
    return en_core_web_lg.load()
    
@st.experimental_memo(suppress_st_warning=True)
def inference(link, upload, _asr_model):
    '''Convert Youtube video or Audio upload to text

    Args:
        link: Candidate YouTube URL. It is used only when ``validators.url``
            accepts it, and takes priority over ``upload``.
        upload: Audio input handed straight to the model's ``transcribe``
            method when ``link`` is not a valid URL.
            NOTE(review): confirm callers pass a type ``transcribe`` accepts.
        _asr_model: Object exposing ``transcribe(audio, task=..., language=...)``
            and returning a mapping with a ``'text'`` entry.

    Returns:
        ``(transcript, title)``. For a link the title is the video title; for
        an upload it is the fixed string "Transcribed Earnings Audio".
        ``None`` when ``link`` is not a valid URL and ``upload`` is falsy.
    '''
    if validators.url(link):
        yt = YouTube(link)
        title = yt.title
        # Download the first audio-only stream to a fixed local filename.
        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
        results = _asr_model.transcribe(path, task='transcribe', language='en')

        return results['text'], title

    if upload:
        # Fixed typo: this branch previously called ``trasncribe``.
        results = _asr_model.transcribe(upload, task='transcribe', language='en')

        return results['text'], "Transcribed Earnings Audio"

    # Neither a valid link nor an upload was provided.
    return None
      
@st.experimental_memo(suppress_st_warning=True)
def sentiment_pipe(earnings_text):
    '''Classify the sentiment of ``earnings_text`` passage by passage.

    The text is split with ``chunk_long_text`` (threshold 150, window size 1,
    stride 1) and the resulting passages are fed to the module-level
    ``sent_pipe``.

    Returns:
        ``(predictions, passages)``: the ``sent_pipe`` output and the passages
        it was computed on.
    '''
    passages = chunk_long_text(earnings_text, 150, 1, 1)
    predictions = sent_pipe(passages)

    return predictions, passages

@st.experimental_memo(suppress_st_warning=True)
def summarize_text(text_to_summarize,max_len,min_len):
    '''Summarize text with the module-level ``sum_pipe`` pipeline.

    Args:
        text_to_summarize: Input forwarded unchanged to ``sum_pipe``.
        max_len: Forwarded as ``max_length``.
        min_len: Forwarded as ``min_length``.

    Returns:
        The ``'summary_text'`` of every pipeline result, joined with single
        spaces into one string.
    '''
    generation_options = {
        'max_length': max_len,
        'min_length': min_len,
        'clean_up_tokenization_spaces': True,
        'no_repeat_ngram_size': 4,
        'encoder_no_repeat_ngram_size': 3,
        'repetition_penalty': 3.5,
        'num_beams': 4,
        'early_stopping': True,
    }
    outputs = sum_pipe(text_to_summarize, **generation_options)

    return ' '.join(output['summary_text'] for output in outputs)
     
@st.experimental_memo(suppress_st_warning=True)
def clean_text(text):
    '''Strip non-ASCII characters, URLs, mentions and hashtags from ``text``.

    Every removed span is replaced by one space, and runs of two or more
    whitespace characters are collapsed to one space as the last step.
    '''
    # Drop every character that cannot be encoded as ASCII.
    cleaned = text.encode("ascii", "ignore").decode()

    # Applied in this order; each match becomes a single space.
    patterns = (
        r"https*\S+",  # urls
        r"@\S+",       # mentions
        r"#\S+",       # hashtags
        r"\s{2,}",     # repeated whitespace
    )
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)

    return cleaned
       
@st.experimental_memo(suppress_st_warning=True)
def chunk_long_text(text,threshold,window_size=3,stride=2):
    '''Preprocess text and chunk for semantic search and sentiment analysis

    Args:
        text: Text to split into sentences with ``sent_tokenize``.
        threshold: Word limit per piece; must be positive. A sentence with
            fewer words is kept as is, any other sentence is cut into
            consecutive pieces of at most ``threshold`` words.
        window_size: Number of consecutive pieces joined into one passage.
        stride: Step, in pieces, between the starts of successive passages.

    Returns:
        List of passages, each the space-joined pieces of one window.
    '''
    pieces = []

    # Limit the length of each sentence to a threshold
    for sentence in sent_tokenize(text):
        words = sentence.split()
        if len(words) < threshold:
            pieces.append(sentence)
        else:
            # Step over the actual word count. The previous upper bound of
            # num*threshold+1 produced an extra empty piece whenever the
            # sentence length was an exact multiple of the threshold.
            pieces.extend(
                ' '.join(words[start:start + threshold])
                for start in range(0, len(words), threshold)
            )

    # Combine pieces into windows of size window_size; slicing past the end
    # simply yields a shorter final window.
    return [
        " ".join(pieces[start_idx:start_idx + window_size])
        for start_idx in range(0, len(pieces), stride)
    ]
    
@st.experimental_memo(suppress_st_warning=True)
def chunk_and_preprocess_text(text,thresh=500):
    '''Group the sentences of ``text`` into chunks for summarization.

    Sentences are split on single spaces and appended to the current chunk
    while the chunk stays at or below ``thresh`` space-separated words;
    otherwise a new chunk is started. A single sentence longer than
    ``thresh`` still forms its own chunk.

    Returns:
        List of chunk strings, each with its words joined by single spaces.
    '''
    word_groups = []

    for sentence in sent_tokenize(text):
        words = sentence.split(" ")
        fits_in_last = bool(word_groups) and len(word_groups[-1]) + len(words) <= thresh
        if fits_in_last:
            word_groups[-1].extend(words)
        else:
            word_groups.append(words)

    return [" ".join(group) for group in word_groups]

    
def summary_downloader(raw_text):
    '''Render a heading and an HTML download link for ``raw_text`` in Streamlit.

    The text is base64-encoded into a ``data:`` URI; the suggested filename
    contains the module-level ``time_str`` timestamp.
    '''
    encoded = base64.b64encode(raw_text.encode()).decode()
    download_name = f"new_text_file_{time_str}_.txt"
    link_html = f'<a href="data:file/txt;base64,{encoded}" download="{download_name}">Click to Download!!</a>'

    st.markdown("#### Download Summary as a File ###")
    # unsafe_allow_html is needed so the anchor tag is rendered as HTML.
    st.markdown(link_html, unsafe_allow_html=True)

@st.experimental_memo(suppress_st_warning=True)
def get_all_entities_per_sentence(text):
    '''Collect named entities for every sentence of ``text``.

    ``text`` is joined with the empty string and parsed by the module-level
    spaCy pipeline ``nlp``. For each sentence the result holds the spaCy
    entities first, followed by the ``"word"`` of every entity reported by
    the module-level ``ner_pipe`` for that sentence.

    Returns:
        List with one list of entity strings per sentence.
    '''
    parsed = nlp(''.join(text))

    entities_by_sentence = []
    for sentence in parsed.sents:
        # spaCy entities
        spacy_entities = [str(entity) for entity in sentence.ents]
        # XLM-RoBERTa entities
        xlm_entities = [str(found["word"]) for found in ner_pipe(str(sentence))]
        entities_by_sentence.append(spacy_entities + xlm_entities)

    return entities_by_sentence
 
@st.experimental_memo(suppress_st_warning=True)
def get_all_entities(text):
    '''Return every entity of ``text`` as one flat list, in sentence order.'''
    return [
        entity
        for sentence_entities in get_all_entities_per_sentence(text)
        for entity in sentence_entities
    ]

@st.experimental_memo(suppress_st_warning=True)
def get_and_compare_entities(article_content,summary_output):
    '''Split the summary's entities into those found in the article and those not.

    A summary entity counts as matched when, ignoring case, it is a substring
    of some article entity, or when the inner product of its ``sbert``
    embedding with the embedding of some article entity exceeds 0.9.

    Within each result list, duplicates are dropped and any entity that is a
    case-insensitive substring of a different entity in the same list is
    removed.

    Args:
        article_content: Source text whose entities serve as the reference.
        summary_output: Summary text whose entities are checked.

    Returns:
        ``(matched_entities, unmatched_entities)``, both lists of strings in
        order of first appearance in the summary.
    '''
    entities_article = list(itertools.chain.from_iterable(
        get_all_entities_per_sentence(article_content)))
    entities_summary = list(itertools.chain.from_iterable(
        get_all_entities_per_sentence(summary_output)))

    # Lower-case the article entities once instead of once per comparison.
    article_lowered = [entity.lower() for entity in entities_article]

    # Each distinct string is encoded at most once. Previously both sides of
    # every (summary, article) pair were re-encoded on each comparison.
    embedding_cache = {}

    def _embed(entity_text):
        # Return the cached sbert embedding for entity_text, encoding on first use.
        if entity_text not in embedding_cache:
            embedding_cache[entity_text] = sbert.encode(entity_text, show_progress_bar=False)
        return embedding_cache[entity_text]

    def _drop_contained(entities):
        # Keep only entities that are not a substring of a different entity.
        return [
            entity for entity in entities
            if not any(
                entity != other and entity.lower() in other.lower()
                for other in entities
            )
        ]

    matched_entities = []
    unmatched_entities = []
    # dict.fromkeys removes duplicates while keeping first-appearance order.
    for entity in dict.fromkeys(entities_summary):
        needle = entity.lower()
        if any(needle in candidate for candidate in article_lowered):
            matched_entities.append(entity)
        elif any(np.inner(_embed(entity), _embed(art_entity)) > 0.9
                 for art_entity in entities_article):
            matched_entities.append(entity)
        else:
            unmatched_entities.append(entity)

    return _drop_contained(matched_entities), _drop_contained(unmatched_entities)

@st.experimental_memo(suppress_st_warning=True)
def highlight_entities(article_content,summary_output):
    '''Wrap the summary's entities in coloured ``<mark>`` tags.

    Entities that ``get_and_compare_entities`` reports as matched get the
    green background, unmatched ones the red background.

    Args:
        article_content: Source text the summary is compared against.
        summary_output: Summary text to annotate.

    Returns:
        ``HTML_WRAPPER`` formatted with the annotated summary after it has
        been parsed by BeautifulSoup's ``html.parser``.
    '''
    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)

    def _mark(text, entity, opening_tag):
        # Wrap every occurrence of entity in text with opening_tag ... </mark>.
        if not entity:
            # An empty pattern would match at every position.
            return text
        # re.escape keeps characters such as "(", "." or "+" inside an entity
        # from being read as regex syntax. The negative lookahead is the
        # original one; it appears intended to skip text inside the rgb(...)
        # values of marks inserted earlier.
        pattern = re.escape(entity) + r'(?![^rgb\(]*\))'
        replacement = opening_tag + entity + markdown_end
        # A callable replacement is inserted verbatim, so backslashes in the
        # entity are not interpreted as group references.
        return re.sub(pattern, lambda _match: replacement, text)

    for entity in matched_entities:
        summary_output = _mark(summary_output, entity, markdown_start_green)

    for entity in unmatched_entities:
        summary_output = _mark(summary_output, entity, markdown_start_red)

    soup = BeautifulSoup(summary_output, features="html.parser")

    return HTML_WRAPPER.format(soup)
    
    
def display_df_as_table(model,top_k,score='score',passages=None):
    '''Display the df with text and scores as a table

    Args:
        model: Sequence of hit mappings; each needs a ``'corpus_id'`` entry
            and an entry under the key named by ``score``.
        top_k: Number of leading hits to include.
        score: Key of the score to show (for example ``'score'`` or
            ``'cross-score'``).
        passages: Indexable collection mapping ``corpus_id`` to passage text.
            When omitted, a module-level ``passages`` is used, which is what
            the function relied on implicitly before this parameter existed.

    Returns:
        DataFrame with columns ``'Score'`` (rounded to 2 decimals) and
        ``'Text'``, one row per included hit.

    Raises:
        NameError: ``passages`` was not given and no module-level
            ``passages`` exists.
    '''
    if passages is None:
        try:
            passages = globals()['passages']
        except KeyError:
            raise NameError(
                "display_df_as_table needs 'passages': pass it as an argument "
                "or define a module-level 'passages'"
            ) from None

    rows = [(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]]
    df = pd.DataFrame(rows, columns=['Score','Text'])
    df['Score'] = round(df['Score'],2)

    return df

      
def make_spans(text,results):
    '''Pair each sentence of ``text`` with the label predicted for it.

    Args:
        text: Text that is split into sentences with ``sent_tokenize``.
        results: Iterable of mappings with a ``'label'`` entry, expected in
            the same order as the sentences.

    Returns:
        List of ``(sentence, label)`` tuples, truncated to the shorter of the
        two sequences.
    '''
    labels = [result['label'] for result in results]
    # The module imports ``sent_tokenize``; the previous ``sent_tokenizer``
    # name was never defined.
    return list(zip(sent_tokenize(text), labels))

##Fiscal Sentiment by Sentence
def fin_ext(text):
    '''Classify every sentence of ``text`` and return ``(sentence, label)`` pairs.'''
    # NOTE(review): this used to call ``remote_clx``, which is not defined in
    # this module. ``sent_pipe`` is the module's text-classification pipeline
    # and is assumed to be the intended classifier -- confirm.
    results = sent_pipe(sent_tokenize(text))
    return make_spans(text,results)
    
# Load the shared models when the module is imported; the functions above read
# these module-level names (nlp, sent_pipe, sum_pipe, ner_pipe, cross_encoder,
# sbert) at call time.
nlp = get_spacy()    
sent_pipe, sum_pipe, ner_pipe, cross_encoder  = load_models()
sbert = load_sbert('all-MiniLM-L12-v2')