nickmuchi committed on
Commit 7a5728d
Parent: e369230

Update functions.py

Files changed (1)
  1. functions.py: +35 -41
functions.py CHANGED
@@ -102,68 +102,61 @@ def sentiment_pipe(earnings_text):
     earnings_sentiment = sent_pipe(earnings_sentences)
 
     return earnings_sentiment, earnings_sentences
-
+
 @st.experimental_memo(suppress_st_warning=True)
-def preprocess_plain_text(text,window_size=3):
-    '''Preprocess text for semantic search'''
-
+def clean_text(text):
+    '''Clean all text'''
+
     text = text.encode("ascii", "ignore").decode() # unicode
     text = re.sub(r"https*\S+", " ", text) # url
     text = re.sub(r"@\S+", " ", text) # mentions
     text = re.sub(r"#\S+", " ", text) # hastags
     text = re.sub(r"\s{2,}", " ", text) # over spaces
-    #text = re.sub("[^.,!?%$A-Za-z0-9]+", " ", text) # special characters except .,!?
 
-    #break into lines and remove leading and trailing space on each
-    lines = [line.strip() for line in text.splitlines()]
+    return text
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_long_text(text,threshold,window_size=3):
+    '''Preprocess text and chunk for semantic search and sentiment analysis'''
 
-    # #break multi-headlines into a line each
-    chunks = [phrase.strip() for line in lines for phrase in line.split(" ")]
+    #Convert cleaned text into sentences
+    sentences = sent_tokenize(text)
 
-    # # drop blank lines
-    text = '\n'.join(chunk for chunk in chunks if chunk)
+    out = []
+
+    #Limit the length of each sentence to a threshold
+    for chunk in sentences:
+        if len(chunk.split()) < threshold:
+            out.append(chunk)
+        else:
+            words = chunk.split()
+            num = int(len(words)/threshold)
+            for i in range(0,num*threshold+1,threshold):
+                out.append(' '.join(words[i:threshold+i]))
 
-    ## We split this article into paragraphs and then every paragraph into sentences
-    paragraphs = []
-    for paragraph in text.replace('\n',' ').split("\n\n"):
-        if len(paragraph.strip()) > 0:
-            paragraphs.append(sent_tokenize(paragraph.strip()))
-
-    #We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
-    #Smaller value: Context from other sentences might get lost
-    #Lager values: More context from the paragraph remains, but results are longer
-    window_size = window_size
     passages = []
-    for paragraph in paragraphs:
+
+    #Combine sentences into a window of size window_size
+    for paragraph in [out]:
         for start_idx in range(0, len(paragraph), window_size):
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
-
-    print(f"Sentences: {sum([len(p) for p in paragraphs])}")
-    print(f"Passages: {len(passages)}")
-
+
     return passages
-
-@st.experimental_memo(suppress_st_warning=True)
-def chunk_and_preprocess_text(text):
+
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):
 
-    """Chunk text longer than 500 tokens"""
+    """Chunk text longer than n tokens for summarization"""
 
-    text = text.encode("ascii", "ignore").decode() # unicode
-    text = re.sub(r"https*\S+", " ", text) # url
-    text = re.sub(r"@\S+", " ", text) # mentions
-    text = re.sub(r"#\S+", " ", text) # hastags
-    text = re.sub(r"\s{2,}", " ", text) # over spaces
-
-    article = nlp(text)
-    sentences = [i.text for i in list(article.sents)]
+    sentences = sent_tokenize(text)
 
     current_chunk = 0
     chunks = []
 
     for sentence in sentences:
         if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
                 chunks[current_chunk].extend(sentence.split(" "))
             else:
                 current_chunk += 1
@@ -174,7 +167,8 @@ def chunk_and_preprocess_text(text):
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
 
-    return chunks
+    return chunks
+
 
 def summary_downloader(raw_text):
 
@@ -318,4 +312,4 @@ def fin_ext(text):
 
 nlp = get_spacy()
 sent_pipe, sum_pipe, ner_pipe, cross_encoder = load_models()
-sbert = load_sbert('all-MiniLM-L6-v2')
+sbert = load_sbert('all-MiniLM-L12-v2')
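
For readers who want to try the new chunking behaviour outside the Streamlit app, below is a simplified standalone sketch of the clean_text and chunk_long_text flow added in this commit. It omits the @st.experimental_memo caching, uses a _sketch suffix to make clear these are not the committed functions, replaces the num/range arithmetic with a plain range loop, and assumes NLTK's punkt tokenizer data is installed; the sample text and parameter values are illustrative only.

import re

from nltk.tokenize import sent_tokenize  # requires the NLTK 'punkt' data (nltk.download('punkt'))


def clean_text_sketch(text):
    '''Strip non-ASCII characters, URLs, mentions, hashtags and repeated spaces.'''
    text = text.encode("ascii", "ignore").decode()
    text = re.sub(r"https*\S+", " ", text)
    text = re.sub(r"@\S+", " ", text)
    text = re.sub(r"#\S+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text


def chunk_long_text_sketch(text, threshold=100, window_size=3):
    '''Cap each sentence at `threshold` words, then group consecutive
    sentences into passages of `window_size` sentences.'''
    sentences = sent_tokenize(text)

    # Split any sentence of `threshold` or more words into threshold-sized pieces.
    out = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) < threshold:
            out.append(sentence)
        else:
            # Same effect as the committed num/range arithmetic, without the
            # possible empty trailing slice.
            for i in range(0, len(words), threshold):
                out.append(' '.join(words[i:i + threshold]))

    # Combine consecutive sentences into passages of window_size sentences.
    passages = []
    for start_idx in range(0, len(out), window_size):
        end_idx = min(start_idx + window_size, len(out))
        passages.append(" ".join(out[start_idx:end_idx]))
    return passages


if __name__ == "__main__":
    raw = ("Revenue grew 12% year over year!   Details: https://example.com "
           "Margins expanded across segments. Guidance for Q4 was raised. "
           "@analyst #earnings")
    print(chunk_long_text_sketch(clean_text_sketch(raw), threshold=100, window_size=2))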