Update functions.py
functions.py CHANGED (+35 -41)
@@ -102,68 +102,61 @@ def sentiment_pipe(earnings_text):
     earnings_sentiment = sent_pipe(earnings_sentences)

     return earnings_sentiment, earnings_sentences
-
+
 @st.experimental_memo(suppress_st_warning=True)
-def
-    '''
-
+def clean_text(text):
+    '''Clean all text'''
+
     text = text.encode("ascii", "ignore").decode() # unicode
     text = re.sub(r"https*\S+", " ", text) # url
     text = re.sub(r"@\S+", " ", text) # mentions
     text = re.sub(r"#\S+", " ", text) # hastags
     text = re.sub(r"\s{2,}", " ", text) # over spaces
-    #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text) # special characters except .,!?

-
-
+    return text
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_long_text(text,threshold,window_size=3):
+    '''Preprocess text and chunk for semantic search and sentiment analysis'''

-    #
-
+    #Convert cleaned text into sentences
+    sentences = sent_tokenize(text)

-
-
+    out = []
+
+    #Limit the length of each sentence to a threshold
+    for chunk in sentences:
+        if len(chunk.split()) < threshold:
+            out.append(chunk)
+        else:
+            words = chunk.split()
+            num = int(len(words)/threshold)
+            for i in range(0,num*threshold+1,threshold):
+                out.append(' '.join(words[i:threshold+i]))

-    ## We split this article into paragraphs and then every paragraph into sentences
-    paragraphs = []
-    for paragraph in text.replace('\n',' ').split("\n\n"):
-        if len(paragraph.strip()) > 0:
-            paragraphs.append(sent_tokenize(paragraph.strip()))
-
-    #We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
-    #Smaller value: Context from other sentences might get lost
-    #Lager values: More context from the paragraph remains, but results are longer
-    window_size = window_size
     passages = []
-
+
+    #Combine sentences into a window of size window_size
+    for paragraph in [out]:
         for start_idx in range(0, len(paragraph), window_size):
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
-
-    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
-    print(f"Passages: {len(passages)}")
-
+
     return passages
-
-@st.experimental_memo(suppress_st_warning=True)
-def chunk_and_preprocess_text(text):
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):

-    """Chunk text longer than
+    """Chunk text longer than n tokens for summarization"""

-
-    text = re.sub(r"https*\S+", " ", text) # url
-    text = re.sub(r"@\S+", " ", text) # mentions
-    text = re.sub(r"#\S+", " ", text) # hastags
-    text = re.sub(r"\s{2,}", " ", text) # over spaces
-
-    article = nlp(text)
-    sentences = [i.text for i in list(article.sents)]
+    sentences = sent_tokenize(text)

     current_chunk = 0
     chunks = []

     for sentence in sentences:
         if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(" ")) <=
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
                 chunks[current_chunk].extend(sentence.split(" "))
             else:
                 current_chunk += 1
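For reference, a minimal standalone sketch of the windowing behaviour the new chunk_long_text introduces: cleaned text is split into sentences with NLTK's sent_tokenize and every window_size consecutive sentences are joined into one passage. The sample text is made up, the Streamlit memo decorator is omitted, and the NLTK punkt data is assumed to be downloadable; this is an illustration, not the repository function itself.

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # tokenizer data used by sent_tokenize

text = ("Revenue grew 12% year over year. Margins expanded on lower input costs. "
        "Management raised full-year guidance. Capital expenditure is unchanged.")

# Join every 3 consecutive sentences into one passage (the window_size=3 default).
sentences = sent_tokenize(text)
window_size = 3
passages = [" ".join(sentences[i:i + window_size])
            for i in range(0, len(sentences), window_size)]
print(passages)  # two passages: sentences 1-3, then sentence 4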
@@ -174,7 +167,8 @@ def chunk_and_preprocess_text(text):
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])

-    return chunks
+    return chunks
+

 def summary_downloader(raw_text):

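To make the new thresh parameter concrete, here is a simplified sketch of the same greedy word-count chunking used by chunk_and_preprocess_text, with a hypothetical sample and the threshold lowered to 10 words so the split is visible (the real function defaults to thresh=500):

from nltk.tokenize import sent_tokenize

def chunk_by_words(text, thresh=10):
    # Greedily pack whole sentences into chunks of at most `thresh` words.
    chunks, current = [], []
    for sentence in sent_tokenize(text):
        words = sentence.split(" ")
        if current and len(current) + len(words) > thresh:
            chunks.append(" ".join(current))
            current = []
        current.extend(words)
    if current:
        chunks.append(" ".join(current))
    return chunks

sample = ("The quarter beat expectations. Guidance was raised again. "
          "Free cash flow doubled. The dividend is unchanged.")
print(chunk_by_words(sample))  # two chunks of whole sentences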
@@ -318,4 +312,4 @@ def fin_ext(text):

 nlp = get_spacy()
 sent_pipe, sum_pipe, ner_pipe, cross_encoder = load_models()
-sbert = load_sbert('all-MiniLM-
+sbert = load_sbert('all-MiniLM-L12-v2')
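The final hunk changes the SentenceTransformer checkpoint passed to the app's load_sbert helper to 'all-MiniLM-L12-v2'. The helper's body is not shown in this diff, so the following is only an assumed sketch of loading that checkpoint directly with the sentence-transformers library and using it for semantic search over passages:

from sentence_transformers import SentenceTransformer, util

# Assumption: load_sbert wraps a plain SentenceTransformer(...) call.
sbert = SentenceTransformer("all-MiniLM-L12-v2")

passages = ["Revenue grew 12% year over year.", "Capex guidance is unchanged."]
query = "How did sales develop?"

passage_emb = sbert.encode(passages, convert_to_tensor=True)
query_emb = sbert.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_emb, passage_emb, top_k=1)
print(hits)  # best-matching passage index and cosine score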