nickmuchi committed on
Commit
2b5cb3b
•
1 Parent(s): 5d8a2a4

Create new file

Files changed (1)
app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
import whisper
import os
import re          # used below in preprocess_plain_text
import validators  # used below in inference to detect YouTube URLs
from pytube import YouTube
import pandas as pd
import plotly_express as px
import nltk
import plotly.graph_objects as go
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import streamlit as st

nltk.download('punkt')

from nltk import sent_tokenize

st.set_page_config(
    page_title="Home",
    page_icon="📞",
)

auth_token = os.environ.get("auth_token")

@st.experimental_singleton()
def load_models():
    '''Load and cache the ASR, quantized FinBERT-tone and cross-encoder models'''
    asr_model = whisper.load_model("small")
    q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
    q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    return asr_model, q_model, q_tokenizer, cross_encoder

asr_model, q_model, q_tokenizer, cross_encoder = load_models()

@st.experimental_memo(suppress_st_warning=True)
def inference(link, upload):
    '''Convert a YouTube video or an uploaded audio file to text'''

    if validators.url(link):

        yt = YouTube(link)
        title = yt.title
        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
        # transcribe() takes decoding options as keyword arguments; the original
        # built a bare whisper.DecodingOptions object that was never passed in
        results = asr_model.transcribe(path, without_timestamps=True)

        return results, title

    elif upload:
        results = asr_model.transcribe(upload)

        return results, "Transcribed Earnings Audio"

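# A minimal usage sketch (hypothetical URL, not part of the original file):
#   results, title = inference("https://www.youtube.com/watch?v=<video_id>", None)
#   st.write(title)
#   st.write(results['text'])  # whisper returns a dict with the transcript under 'text'
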
@st.experimental_memo(suppress_st_warning=True)
def sentiment_pipe(earnings_text):
    '''Determine the sentiment of each sentence in the text'''

    remote_clx = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)

    earnings_sentiment = remote_clx(sent_tokenize(earnings_text))

    return earnings_sentiment

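# Sketch of the expected output shape: the pipeline returns one dict per
# sentence, e.g. [{'label': 'Positive', 'score': 0.98}, ...]; the label names
# (Positive/Negative/Neutral) are assumed from the underlying FinBERT-tone model.
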
def preprocess_plain_text(text, window_size=3):
    '''Preprocess text for semantic search'''

    text = text.encode("ascii", "ignore").decode()  # strip non-ascii characters
    text = re.sub(r"https*\S+", " ", text)  # urls
    text = re.sub(r"@\S+", " ", text)  # mentions
    text = re.sub(r"#\S+", " ", text)  # hashtags
    text = re.sub(r"\s{2,}", " ", text)  # extra whitespace
    #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text)  # special characters except .,!?

    # break into lines and remove leading and trailing space on each
    lines = [line.strip() for line in text.splitlines()]

    # break multi-headlines (separated by double spaces) into a line each
    chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]

    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # split the text into paragraphs and every paragraph into sentences;
    # the cleaning above leaves no blank lines, so in practice the whole
    # transcript is treated as a single paragraph
    paragraphs = []
    for paragraph in text.replace('\n', ' ').split("\n\n"):
        if len(paragraph.strip()) > 0:
            paragraphs.append(sent_tokenize(paragraph.strip()))

    # combine up to window_size sentences into a passage: smaller values lose
    # cross-sentence context, larger values keep more context but yield longer passages
    passages = []
    for paragraph in paragraphs:
        for start_idx in range(0, len(paragraph), window_size):
            end_idx = min(start_idx + window_size, len(paragraph))
            passages.append(" ".join(paragraph[start_idx:end_idx]))

    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
    print(f"Passages: {len(passages)}")

    return passages

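# Worked example of the window_size=3 chunking (illustrative input):
#   a paragraph of five sentences ["S1.", "S2.", "S3.", "S4.", "S5."]
#   yields the passages ["S1. S2. S3.", "S4. S5."]
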
def display_df_as_table(model, top_k, score='score'):
    '''Display the hits with scores and text as a table'''

    # note: relies on a module-level `passages` list (e.g. the output of preprocess_plain_text)
    df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
    df['Score'] = round(df['Score'], 2)

    return df

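# Hypothetical usage with sentence-transformers semantic search (the hit list,
# query and corpus embeddings are assumptions, not defined in this file):
#   hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
#   st.table(display_df_as_table(hits[0], top_k=5))
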
def make_spans(text, results):
    '''Pair each sentence with its predicted sentiment label'''
    results_list = []
    for i in range(len(results)):
        results_list.append(results[i]['label'])
    facts_spans = list(zip(sent_tokenize(text), results_list))
    return facts_spans

## Fiscal Sentiment by Sentence
def fin_ext(text):
    # reuse sentiment_pipe here: the original referenced an undefined
    # remote_clx and sent_tokenizer at module scope
    results = sentiment_pipe(text)
    return make_spans(text, results)
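
# End-to-end sketch tying the pieces together (hypothetical link, for illustration only):
#   results, title = inference(link, None)
#   spans = fin_ext(results['text'])                    # [(sentence, label), ...]
#   passages = preprocess_plain_text(results['text'])   # chunks for semantic search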