nickmuchi committed
Commit 8e2eef3
Parent: 5dfeae8

Update functions.py

Files changed (1)
  1. functions.py +9 -21
functions.py CHANGED
@@ -23,7 +23,7 @@ from pyvis.network import Network
 import torch
 from langchain.docstore.document import Document
 from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
-from langchain.vectorstores import Pinecone
+from langchain.vectorstores import FAISS
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.llms import OpenAI
@@ -43,8 +43,6 @@ time_str = time.strftime("%d%m%Y-%H%M%S")
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
 margin-bottom: 2.5rem">{}</div> """
 
-index_id = "earnings-embeddings"
-
 #Stuff Chain Type Prompt template
 output_parser = RegexParser(
     regex=r"(.*?)\nScore: (.*)",
@@ -125,25 +123,17 @@ def load_asr_model(asr_model_name):
     return asr_model
 
 @st.experimental_singleton(suppress_st_warning=True)
-def process_corpus(corpus, _tok, title, _embeddings, chunk_size=200, overlap=50):
+def process_corpus(corpus, _tokenizer, title, embedding_model, chunk_size=200, overlap=50):
 
     '''Process text for Semantic Search'''
 
-    pinecone.init(api_key=OPEN_AI_KEY, environment="us-west1-gcp")
-
-    tokenizer = _tok
-    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer,chunk_size=chunk_size,chunk_overlap=overlap,separator='. ')
+    text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(_tokenizer,chunk_size=chunk_size,chunk_overlap=overlap)
 
     texts = text_splitter.split_text(corpus)
 
-    docsearch = Pinecone.from_texts(
-        texts,
-        _embeddings,
-        index_name = "earnings-embeddings",
-        namespace = f'{title}-earnings',
-        metadatas = [
-            {'source':i} for i in range(len(texts))]
-    )
+    embeddings = gen_embeddings(embedding_model)
+
+    docsearch = FAISS.from_texts(texts, embeddings)
 
     return docsearch
 
@@ -165,17 +155,15 @@ def gen_embeddings(embedding_model):
     return embeddings
 
 @st.experimental_memo(suppress_st_warning=True)
-def embed_text(query,corpus,title,embedding_model,_emb_tok,chain_type='stuff'):
+def embed_text(query,corpus,title,embedding_model,_emb_tok,_chain_type='Normal'):
 
     '''Embed text and generate semantic search scores'''
 
     title = title.split()[0].lower()
-
-    embeddings = gen_embeddings(embedding_model)
 
-    docsearch = process_corpus(corpus,_emb_tok,title, embeddings)
+    docsearch = process_corpus(corpus,_emb_tok,title, embedding_model)
 
-    docs = docsearch.similarity_search_with_score(query, k=3, namespace = f'{title}-earnings')
+    docs = docsearch.similarity_search_with_score(query, k=3)
 
     print(docs)
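
For context, here is a minimal, self-contained sketch of the retrieval path after this commit: chunk the transcript with a tokenizer-aware splitter, embed the chunks, build a local FAISS index in memory, and query it. Unlike the removed Pinecone path, no pinecone.init() call, API key, named index, or per-title namespace is needed. The model names, sample text, and query below are illustrative assumptions, not values taken from functions.py.

# Sketch only: model names, sample corpus, and query are placeholders.
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

corpus = "The company reported record revenue. Operating margins expanded."

# Split on token counts so chunks fit the embedding model's context window
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=200, chunk_overlap=50
)
texts = text_splitter.split_text(corpus)

# Embed and index locally; FAISS.from_texts builds the index in process
# memory, which is why the commit can also drop the index_id constant
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
docsearch = FAISS.from_texts(texts, embeddings)

# Retrieve the k closest chunks together with their scores
docs = docsearch.similarity_search_with_score("What happened to margins?", k=3)
for doc, score in docs:
    print(round(score, 3), doc.page_content[:80])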
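
One behavioral note on the scores: with LangChain's default FAISS index, similarity_search_with_score returns an L2 distance, so lower values mean closer matches, whereas a Pinecone index queried with a similarity metric ranks higher-is-better. Any downstream logic that thresholds or sorts on the score should be checked against the new convention. Continuing from the sketch above:

# L2 distance: the best match has the smallest score, so sort ascending;
# a Pinecone similarity score would have been sorted descending instead.
best_first = sorted(docs, key=lambda pair: pair[1])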