Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Community

nickmuchi committed on Feb 5, 2023

Commit

e1f6c5c

1 Parent(s): e232116

Update functions.py

Browse files

Files changed (1) hide show

functions.py +4 -95

functions.py CHANGED Viewed

@@ -119,15 +119,6 @@ def load_asr_model(asr_model_name):
     asr_model = whisper.load_model(asr_model_name)
     return asr_model
-# @st.experimental_singleton(suppress_st_warning=True)
-# def load_sbert(model_name):
-#     if 'hkunlp' in model_name:
-#         sbert = INSTRUCTOR(model_name)
-#     else:
-#         sbert = SentenceTransformer(model_name)
-#     return sbert
 @st.experimental_singleton(suppress_st_warning=True)
 def process_corpus(corpus, tok, title, embeddings, chunk_size=200, overlap=50):
@@ -185,7 +176,7 @@ def embed_text(query,corpus,title,embedding_model,emb_tok,chain_type='stuff'):
     docs = [d[0] for d in docs]
-    if chain_type == 'stuff':
         PROMPT = PromptTemplate(template=template,
                                 input_variables=["summaries", "question"],
@@ -200,7 +191,7 @@ def embed_text(query,corpus,title,embedding_model,emb_tok,chain_type='stuff'):
         return answer['output_text']
-    elif chain_type == 'refine':
         initial_qa_prompt = PromptTemplate(
     input_variables=["context_str", "question"], template=initial_qa_template
@@ -211,62 +202,6 @@ def embed_text(query,corpus,title,embedding_model,emb_tok,chain_type='stuff'):
         return answer['output_text']
-# @st.experimental_memo(suppress_st_warning=True)
-# def embed_text(query,corpus,embedding_model):
-#     '''Embed text and generate semantic search scores'''
-#     #If model is e5 then apply prefixes to query and passage
-#     if embedding_model == 'intfloat/e5-base':
-#         search_input = 'query: '+ query
-#         passages_emb = ['passage: ' + sentence for sentence in corpus]
-#     elif embedding_model == 'hkunlp/instructor-base':
-#         search_input = [['Represent the Financial question for retrieving supporting paragraphs: ', query]]
-#         passages_emb = [['Represent the Financial paragraph for retrieval: ',sentence] for sentence in corpus]
-#     else:
-#         search_input = query
-#         passages_emb = corpus
-#     #Embed corpus and question
-#     corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True)
-#     question_embedding = sbert.encode(search_input, convert_to_tensor=True)
-#     question_embedding = question_embedding.cpu()
-#     corpus_embedding = corpus_embedding.cpu()
-#     # #Calculate similarity scores and rank
-#     hits = util.semantic_search(question_embedding, corpus_embedding, top_k=2)
-#     hits = hits[0]  # Get the hits for the first query
-#     # ##### Re-Ranking #####
-#     # Now, score all retrieved passages with the cross_encoder
-#     cross_inp = [[search_input, corpus[hit['corpus_id']]] for hit in hits]
-#     if embedding_model == 'hkunlp/instructor-base':
-#         result = []
-#         for sublist in cross_inp:
-#             question = sublist[0][0][1]
-#             document = sublist[1][1]
-#             result.append([question, document])
-#         cross_inp = result
-#     cross_scores = cross_encoder.predict(cross_inp)
-#     # Sort results by the cross-encoder scores
-#     for idx in range(len(cross_scores)):
-#         hits[idx]['cross-score'] = cross_scores[idx]
-#     # Output of top-3 hits from re-ranker
-#     # st.markdown("\n-------------------------\n")
-#     # st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
-#     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
-#     return hits
 @st.experimental_singleton(suppress_st_warning=True)
 def get_spacy():
     nlp = en_core_web_lg.load()
@@ -350,32 +285,7 @@ def chunk_long_text(text,threshold,window_size=3,stride=2):
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
-    return passages
-@st.experimental_memo(suppress_st_warning=True)
-def chunk_and_preprocess_text(text,thresh=500):
-    """Chunk text longer than n tokens for summarization"""
-    sentences = sent_tokenize(text)
-    current_chunk = 0
-    chunks = []
-    for sentence in sentences:
-        if len(chunks) == current_chunk + 1:
-            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
-                chunks[current_chunk].extend(sentence.split(" "))
-            else:
-                current_chunk += 1
-                chunks.append(sentence.split(" "))
-        else:
-            chunks.append(sentence.split(" "))
-    for chunk_id in range(len(chunks)):
-        chunks[chunk_id] = " ".join(chunks[chunk_id])
-    return chunks
 def summary_downloader(raw_text):
@@ -830,5 +740,4 @@ def save_network_html(kb, filename="network.html"):
 nlp = get_spacy()
-sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer  = load_models()
-sbert = load_sbert('all-MiniLM-L12-v2')

     asr_model = whisper.load_model(asr_model_name)
     return asr_model
 @st.experimental_singleton(suppress_st_warning=True)
 def process_corpus(corpus, tok, title, embeddings, chunk_size=200, overlap=50):
     docs = [d[0] for d in docs]
+    if chain_type == 'Normal':
         PROMPT = PromptTemplate(template=template,
                                 input_variables=["summaries", "question"],
         return answer['output_text']
+    elif chain_type == 'Refined':
         initial_qa_prompt = PromptTemplate(
     input_variables=["context_str", "question"], template=initial_qa_template
         return answer['output_text']
 @st.experimental_singleton(suppress_st_warning=True)
 def get_spacy():
     nlp = en_core_web_lg.load()
             end_idx = min(start_idx+window_size, len(paragraph))
             passages.append(" ".join(paragraph[start_idx:end_idx]))
+    return passages
 def summary_downloader(raw_text):
 nlp = get_spacy()
+sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer  = load_models()