Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Files Community

nickmuchi committed on Oct 6, 2022

Commit

741aa8b

•

1 Parent(s): ecca100

Update pages/3_Earnings_Semantic_Search_🔎_.py

Browse files

Files changed (1) hide show

pages/3_Earnings_Semantic_Search_🔎_.py +57 -50

pages/3_Earnings_Semantic_Search_🔎_.py CHANGED Viewed

@@ -18,57 +18,64 @@ if "sen_df" not in st.session_state:
 if "earnings_passages" not in st.session_state:
     st.session_state["earnings_passages"] = ''
-if any(st.session_state["sen_df"]) or st.session_state["earnings_passages"]:
-    ## Save to a dataframe for ease of visualization
-    sen_df = st.session_state['sen_df']
-    passages = preprocess_plain_text(st.session_state['earnings_passages'],window_size=window_size)
-    ##### Sematic Search #####
-    # Encode the query using the bi-encoder and find potentially relevant passages
-    corpus_embeddings = sbert.encode(passages, convert_to_tensor=True, show_progress_bar=True)
-    question_embedding = sbert.encode(search_input, convert_to_tensor=True)
-    question_embedding = question_embedding.cpu()
-    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
-    hits = hits[0]  # Get the hits for the first query
-    ##### Re-Ranking #####
-    # Now, score all retrieved passages with the cross_encoder
-    cross_inp = [[search_input, passages[hit['corpus_id']]] for hit in hits]
-    cross_scores = cross_encoder.predict(cross_inp)
-    # Sort results by the cross-encoder scores
-    for idx in range(len(cross_scores)):
-        hits[idx]['cross-score'] = cross_scores[idx]
-    # Output of top-3 hits from re-ranker
-    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
-    score='cross-score'
-    df = pd.DataFrame([(hit[score],passages[hit['corpus_id']]) for hit in hits[0:int(top_k)]],columns=['Score','Text'])
-    df['Score'] = round(df['Score'],2)
-    print(f'Test: {df}')
-    def gen_annotated_text(para):
-        tag_list = []
-        for i in sent_tokenize(para):
-            label = sen_df.loc[sen_df['text']==i, 'label'].values[0]
-            if label == 'Negative':
-                tag_list.append((i,label,'#faa'))
-            elif label == 'Positive':
-                tag_list.append((i,label,'#afa'))
-            else:
-                tag_list.append((i,label,'#fea'))
-        return tag_list
-    text_to_annotate = [gen_annotated_text(para) for para in df.Text.tolist()]
-    for i in text_to_annotate:
-        annotated_text(i)
-else:
-    st.write('Please ensure you have entered the YouTube URL or uploaded the Earnings Call file')

 if "earnings_passages" not in st.session_state:
     st.session_state["earnings_passages"] = ''
+if search_input is not None:
+    if any(st.session_state["sen_df"]) or st.session_state["earnings_passages"]:
+        ## Save to a dataframe for ease of visualization
+        sen_df = st.session_state['sen_df']
+        passages = preprocess_plain_text(st.session_state['earnings_passages'],window_size=window_size)
+        ##### Semantic Search #####
+        # Encode the query using the bi-encoder and find potentially relevant passages
+        corpus_embeddings = sbert.encode(passages, convert_to_tensor=True, show_progress_bar=True)
+        question_embedding = sbert.encode(search_input, convert_to_tensor=True)
+        question_embedding = question_embedding.cpu()
+        hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
+        hits = hits[0]  # Get the hits for the first query
+        ##### Re-Ranking #####
+        # Now, score all retrieved passages with the cross_encoder
+        cross_inp = [[search_input, passages[hit['corpus_id']]] for hit in hits]
+        cross_scores = cross_encoder.predict(cross_inp)
+        # Sort results by the cross-encoder scores
+        for idx in range(len(cross_scores)):
+            hits[idx]['cross-score'] = cross_scores[idx]
+        # Output of top-3 hits from re-ranker
+        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+        score='cross-score'
+        df = pd.DataFrame([(hit[score],passages[hit['corpus_id']]) for hit in hits[0:int(top_k)]],columns=['Score','Text'])
+        df['Score'] = round(df['Score'],2)
+        print(f'Test: {df}')
+        def gen_annotated_text(para):
+            tag_list = []
+            for i in sent_tokenize(para):
+                label = sen_df.loc[sen_df['text']==i, 'label'].values[0]
+                if label == 'Negative':
+                    tag_list.append((i,label,'#faa'))
+                elif label == 'Positive':
+                    tag_list.append((i,label,'#afa'))
+                else:
+                    tag_list.append((i,label,'#fea'))
+            return tag_list
+        text_to_annotate = [gen_annotated_text(para) for para in df.Text.tolist()]
+        first,second = text_to_annotate[0],text_to_annotate[-1]
+        with st.container():
+            annotate_text(*first)
+        with st.container():
+            annotate_text(*second)
+    else:
+        st.write('Please ensure you have entered the YouTube URL or uploaded the Earnings Call file')