nickmuchi committed
Commit 08e6e30
1 Parent(s): d3132eb

Update functions.py

Files changed (1)
  1. functions.py +117 -2
functions.py CHANGED
@@ -21,7 +21,16 @@ import pickle, math
 import wikipedia
 from pyvis.network import Network
 import torch
-from InstructorEmbedding import INSTRUCTOR
+from langchain.docstore.document import Document
+from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
+from langchain.vectorstores import Pinecone
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.llms import OpenAI
+from langchain import VectorDBQA
+from langchain.chains.question_answering import load_qa_chain
+from langchain.prompts import PromptTemplate
+from langchain.prompts.base import RegexParser
 
 nltk.download('punkt')
 
@@ -32,6 +41,59 @@ time_str = time.strftime("%d%m%Y-%H%M%S")
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
 margin-bottom: 2.5rem">{}</div> """
 
+#Stuff Chain Type Prompt template
+output_parser = RegexParser(
+    regex=r"(.*?)\nScore: (.*)",
+    output_keys=["answer", "score"],
+)
+
+template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
+If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ALWAYS return a "SOURCES" part in your answer.
+
+In addition to giving an answer, also return a score of how fully it answered the user's question. This should be in the following format:
+
+Question: [question here]
+Helpful Answer: [answer here]
+Score: [score between 0 and 100]
+
+Begin!
+
+Context:
+---------
+{summaries}
+---------
+Question: {question}
+Helpful Answer:"""
+
+#Refine Chain Type Prompt Template
+refine_prompt_template = (
+    "The original question is as follows: {question}\n"
+    "We have provided an existing answer: {existing_answer}\n"
+    "We have the opportunity to refine the existing answer "
+    "(only if needed) with some more context below.\n"
+    "------------\n"
+    "{context_str}\n"
+    "------------\n"
+    "Given the new context, refine the original answer to better "
+    "answer the question. "
+    "If the context isn't useful, return the original answer."
+)
+refine_prompt = PromptTemplate(
+    input_variables=["question", "existing_answer", "context_str"],
+    template=refine_prompt_template,
+)
+
+
+initial_qa_template = (
+    "Context information is below. \n"
+    "---------------------\n"
+    "{context_str}"
+    "\n---------------------\n"
+    "Given the context information and not prior knowledge, "
+    "answer the question: {question}\n"
+)
+
 @st.experimental_singleton(suppress_st_warning=True)
 def load_models():
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
@@ -40,12 +102,13 @@ def load_models():
     kg_tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
+    emb_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-xl')
    sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
     sum_pipe = pipeline("summarization",model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn",clean_up_tokenization_spaces=True)
     ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
     cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1') #cross-encoder/ms-marco-MiniLM-L-12-v2
 
-    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer
+    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer
 
 @st.experimental_singleton(suppress_st_warning=True)
 def load_asr_model(asr_model_name):
@@ -62,6 +125,66 @@ def load_sbert(model_name):
 
     return sbert
 
+@st.experimental_memo(suppress_st_warning=True)
+def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
+
+    '''Embed text and generate semantic search scores'''
+
+    index_id = "earnings-embeddings"
+
+    # Instructor models take task-specific instructions; plain sentence-transformers models do not
+    if 'hkunlp' in embedding_model:
+
+        embeddings = HuggingFaceInstructEmbeddings(model_name=f'hkunlp/{embedding_model}',
+                                                   query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
+                                                   embed_instruction='Represent the Financial paragraph for retrieval: ')
+
+    else:
+
+        embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
+
+    docsearch = Pinecone.from_texts(
+        corpus,
+        embeddings,
+        index_name=index_id,
+        namespace=f'{title}-earnings',
+        metadatas=[{'source': i} for i in range(len(corpus))]  # was len(texts): `texts` is undefined here
+    )
+
+    docs = docsearch.similarity_search_with_score(query, k=3, namespace=f'{title}-earnings')
+
+    docs = [d[0] for d in docs]
+
+    if chain_type == 'stuff':
+
+        PROMPT = PromptTemplate(template=template,
+                                input_variables=["summaries", "question"],
+                                output_parser=output_parser)
+
+        chain = load_qa_with_sources_chain(OpenAI(temperature=0),
+                                           chain_type="stuff",
+                                           prompt=PROMPT,
+                                           )
+
+        answer = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
+
+        return answer['output_text']
+
+    elif chain_type == 'refine':
+
+        # This branch originally returned an undefined `hits`; completed here
+        # (a plausible reading, not the committed code) with the refine QA
+        # chain driven by the two prompts defined above.
+        initial_qa_prompt = PromptTemplate(input_variables=["context_str", "question"],
+                                           template=initial_qa_template)
+
+        chain = load_qa_chain(OpenAI(temperature=0), chain_type="refine",
+                              question_prompt=initial_qa_prompt,
+                              refine_prompt=refine_prompt)
+
+        answer = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
+
+        return answer['output_text']
+
 @st.experimental_memo(suppress_st_warning=True)
 def embed_text(query,corpus,embedding_model):
 
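For reference, a minimal sketch of how the new embed_text might be driven. This is not part of the commit: it assumes Pinecone credentials and the "earnings-embeddings" index already exist, that OPENAI_API_KEY is set for the chains' LLM, and the transcript chunks, title, and environment string are illustrative placeholders.

import os
import pinecone

# Assumed setup (not shown in this diff): the pre-v3 pinecone client that
# LangChain's Pinecone vectorstore of this era expects.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-east1-gcp")

# Illustrative transcript chunks; the app would produce these with its own splitter.
transcript_chunks = [
    "Revenue grew 12% year over year, driven by subscription services.",
    "Operating margin expanded to 28% on continued cost discipline.",
]

# 'stuff' chain: one prompt over the top-k retrieved chunks, with the answer
# and score split out by the RegexParser attached to the prompt.
answer = embed_text(query="How did revenue perform?",
                    corpus=transcript_chunks,
                    title="ACME-Q4-2022",
                    embedding_model="all-mpnet-base-v2",  # routed to sentence-transformers/...
                    chain_type="stuff")

# 'refine' chain: drafts an answer from the first chunk, then refines it
# against each remaining chunk in turn.
refined = embed_text(query="How did revenue perform?",
                     corpus=transcript_chunks,
                     title="ACME-Q4-2022",
                     embedding_model="all-mpnet-base-v2",
                     chain_type="refine")

Note that the trailing context of the last hunk still shows the module's older embed_text(query,corpus,embedding_model); since the later definition wins in Python, this sketch assumes the rest of the diff (not shown on this page) renames or removes one of the two.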