nickmuchi committed
Commit 74f896e
1 Parent(s): 08e6e30

Update functions.py

Files changed (1):
  functions.py  +62 -56
functions.py CHANGED
@@ -94,6 +94,8 @@ initial_qa_template = (
     "answer the question: {question}\n.\n"
     )
 
+
+
 @st.experimental_singleton(suppress_st_warning=True)
 def load_models():
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
@@ -116,15 +118,27 @@ def load_asr_model(asr_model_name):
 
     return asr_model
 
-@st.experimental_singleton(suppress_st_warning=True)
-def load_sbert(model_name):
-    if 'hkunlp' in model_name:
-        sbert = INSTRUCTOR(model_name)
-    else:
-        sbert = SentenceTransformer(model_name)
+# @st.experimental_singleton(suppress_st_warning=True)
+# def load_sbert(model_name):
+#     if 'hkunlp' in model_name:
+#         sbert = INSTRUCTOR(model_name)
+#     else:
+#         sbert = SentenceTransformer(model_name)
+
+#     return sbert
+
+@st.experimental_singleton(suppress_st_warning=True)
+def process_corpus(corpus, tok, chunk_size=200, overlap=50):
 
-    return sbert
+    pinecone.init(api_key="2d1e8029-2d84-4724-9f7c-a4f0f5ae908a", environment="us-west1-gcp")
+
+    tokenizer = tok
+    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=overlap,separator='. ')
+
+    texts = text_splitter.split_text(corpus)
 
+    return texts
+
 @st.experimental_memo(suppress_st_warning=True)
 def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
 
@@ -142,8 +156,6 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
 
     embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
 
-
-
     docsearch = Pinecone.from_texts(
         corpus,
         embeddings,
@@ -177,61 +189,61 @@ def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
 
     return hits
 
-@st.experimental_memo(suppress_st_warning=True)
-def embed_text(query,corpus,embedding_model):
+# @st.experimental_memo(suppress_st_warning=True)
+# def embed_text(query,corpus,embedding_model):
 
-    '''Embed text and generate semantic search scores'''
+#     '''Embed text and generate semantic search scores'''
 
-    #If model is e5 then apply prefixes to query and passage
-    if embedding_model == 'intfloat/e5-base':
-        search_input = 'query: '+ query
-        passages_emb = ['passage: ' + sentence for sentence in corpus]
+#     #If model is e5 then apply prefixes to query and passage
+#     if embedding_model == 'intfloat/e5-base':
+#         search_input = 'query: '+ query
+#         passages_emb = ['passage: ' + sentence for sentence in corpus]
 
-    elif embedding_model == 'hkunlp/instructor-base':
-        search_input = [['Represent the Financial question for retrieving supporting paragraphs: ', query]]
-        passages_emb = [['Represent the Financial paragraph for retrieval: ',sentence] for sentence in corpus]
+#     elif embedding_model == 'hkunlp/instructor-base':
+#         search_input = [['Represent the Financial question for retrieving supporting paragraphs: ', query]]
+#         passages_emb = [['Represent the Financial paragraph for retrieval: ',sentence] for sentence in corpus]
 
-    else:
-        search_input = query
-        passages_emb = corpus
+#     else:
+#         search_input = query
+#         passages_emb = corpus
 
 
-    #Embed corpus and question
-    corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True)
-    question_embedding = sbert.encode(search_input, convert_to_tensor=True)
-    question_embedding = question_embedding.cpu()
-    corpus_embedding = corpus_embedding.cpu()
+#     #Embed corpus and question
+#     corpus_embedding = sbert.encode(passages_emb, convert_to_tensor=True)
+#     question_embedding = sbert.encode(search_input, convert_to_tensor=True)
+#     question_embedding = question_embedding.cpu()
+#     corpus_embedding = corpus_embedding.cpu()
 
-    # #Calculate similarity scores and rank
-    hits = util.semantic_search(question_embedding, corpus_embedding, top_k=2)
-    hits = hits[0] # Get the hits for the first query
+#     # #Calculate similarity scores and rank
+#     hits = util.semantic_search(question_embedding, corpus_embedding, top_k=2)
+#     hits = hits[0] # Get the hits for the first query
 
-    # ##### Re-Ranking #####
-    # Now, score all retrieved passages with the cross_encoder
-    cross_inp = [[search_input, corpus[hit['corpus_id']]] for hit in hits]
+#     # ##### Re-Ranking #####
+#     # Now, score all retrieved passages with the cross_encoder
+#     cross_inp = [[search_input, corpus[hit['corpus_id']]] for hit in hits]
 
-    if embedding_model == 'hkunlp/instructor-base':
-        result = []
+#     if embedding_model == 'hkunlp/instructor-base':
+#         result = []
 
-        for sublist in cross_inp:
-            question = sublist[0][0][1]
-            document = sublist[1][1]
-            result.append([question, document])
+#         for sublist in cross_inp:
+#             question = sublist[0][0][1]
+#             document = sublist[1][1]
+#             result.append([question, document])
 
-        cross_inp = result
+#         cross_inp = result
 
-    cross_scores = cross_encoder.predict(cross_inp)
+#     cross_scores = cross_encoder.predict(cross_inp)
 
-    # Sort results by the cross-encoder scores
-    for idx in range(len(cross_scores)):
-        hits[idx]['cross-score'] = cross_scores[idx]
+#     # Sort results by the cross-encoder scores
+#     for idx in range(len(cross_scores)):
+#         hits[idx]['cross-score'] = cross_scores[idx]
 
-    # Output of top-3 hits from re-ranker
-    # st.markdown("\n-------------------------\n")
-    # st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
-    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+#     # Output of top-3 hits from re-ranker
+#     # st.markdown("\n-------------------------\n")
+#     # st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
+#     hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
 
-    return hits
+#     return hits
 
 @st.experimental_singleton(suppress_st_warning=True)
 def get_spacy():
@@ -366,12 +378,6 @@ def get_all_entities_per_sentence(text):
         for entity in sentence.ents:
             entities_this_sentence.append(str(entity))
 
-        # FLAIR ENTITIES (CURRENTLY NOT USED)
-        # sentence_entities = Sentence(str(sentence))
-        # tagger.predict(sentence_entities)
-        # for entity in sentence_entities.get_spans('ner'):
-        #     entities_this_sentence.append(entity.text)
-
         # XLM ENTITIES
         entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
         for entity in entities_xlm:
@@ -802,5 +808,5 @@ def save_network_html(kb, filename="network.html"):
 
 
 nlp = get_spacy()
-sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer = load_models()
+sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer = load_models()
 sbert = load_sbert('all-MiniLM-L12-v2')
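
For orientation, a minimal usage sketch (not part of this commit) of how the new process_corpus helper and the extra emb_tokenizer now returned by load_models() could feed the existing embed_text pipeline. Only the function signatures come from the diff above; the transcript text, index title, question, and embedding-model choice below are made-up placeholders, and the snippet assumes it runs alongside the definitions in functions.py.

    # Hypothetical wiring of the helpers defined in functions.py (placeholder values).
    sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer = load_models()

    # Placeholder transcript; in the app this text would come from the ASR step.
    transcript = "The company reported record revenue. Margins improved despite higher input costs."

    # Chunk the transcript with the embedding tokenizer (defaults match the new process_corpus signature).
    chunks = process_corpus(transcript, emb_tokenizer, chunk_size=200, overlap=50)

    # Index the chunks in Pinecone and answer a question over them.
    # 'all-MiniLM-L12-v2' mirrors the sbert model loaded above; the title and query are examples only.
    hits = embed_text(
        query="What did management say about margins?",
        corpus=chunks,
        title="example-earnings-call",
        embedding_model="all-MiniLM-L12-v2",
        chain_type="stuff",
    )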