awinml committed
Commit bd9fae2
1 Parent(s): 51f6159

Upload 17 files (#22)

- Upload 17 files (52d1b3fcb1e89cfbdbf432eb2049850470450ed4)

Files changed (6):
  1. app.py +34 -7
  2. requirements.txt +1 -1
  3. utils/models.py +60 -13
  4. utils/nltkmodules.py +3 -2
  5. utils/retriever.py +120 -47
  6. utils/vector_index.py +13 -1
app.py CHANGED
@@ -1,5 +1,5 @@
 import re
-
+import numpy as np
 import openai
 import streamlit_scrollable_textbox as stx
 
@@ -8,23 +8,27 @@ import streamlit as st
 
 st.set_page_config(layout="wide")  # isort: split
 
+from utils import nltkmodules
 from utils.entity_extraction import (
     clean_entities,
+    extract_keywords,
     extract_quarter_year,
     extract_ticker_spacy,
     format_entities_flan_alpaca,
     generate_alpaca_ner_prompt,
-    extract_keywords
 )
 from utils.models import (
     generate_entities_flan_alpaca_checkpoint,
     generate_entities_flan_alpaca_inference_api,
     generate_text_flan_t5,
-    get_data,
     get_alpaca_model,
+    get_data,
     get_flan_alpaca_xl_model,
     get_flan_t5_model,
     get_instructor_embedding_model,
+    get_instructor_embedding_model_api,
+    get_bm25_model,
+    preprocess_text,
     get_mpnet_embedding_model,
     get_sgpt_embedding_model,
     get_spacy_model,
@@ -55,6 +59,7 @@ from utils.retriever import (
     sentence_id_combine,
     text_lookup,
     year_quarter_range,
+    get_bm25_search_hits,
 )
 from utils.transcript_retrieval import retrieve_transcript
 from utils.vector_index import (
@@ -62,7 +67,6 @@ from utils.vector_index import (
     create_sparse_embeddings,
     hybrid_score_norm,
 )
-from utils import nltkmodules
 
 st.title("Question Answering on Earnings Call Transcripts")
 
@@ -75,6 +79,8 @@ col1, col2 = st.columns([3, 3], gap="medium")
 
 
 with st.sidebar:
+    use_bm25 = st.checkbox("Use BM25 for filtering results")
+
     ner_choice = st.selectbox("Select NER Model", ["Spacy", "Alpaca"])
     document_type = st.selectbox(
         "Select Query Type", ["Single-Document", "Multi-Document"]
@@ -85,6 +91,18 @@ with st.sidebar:
         ["Single-Company", "Compare Companies"],
     )
 
+
+corpus, bm25 = get_bm25_model(data)
+
+tokenized_query = preprocess_text(query_text).split()
+sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]
+indices_hits = get_bm25_search_hits(corpus, sparse_scores, 50)
+
+if use_bm25 == True:
+    indices = indices_hits
+else:
+    indices = None
+
 if ner_choice == "Spacy":
     ner_model = get_spacy_model()
 
@@ -305,7 +323,7 @@ elif encoder_model == "Instructor":
     )
     pinecone_index_name = "week13-instructor-xl"
    pinecone_index = pinecone.Index(pinecone_index_name)
-    retriever_model = get_instructor_embedding_model()
+    retriever_model = get_instructor_embedding_model_api()
     instruction = (
         "Represent the financial question for retrieving supporting documents:"
     )
@@ -318,7 +336,7 @@ elif encoder_model == "Hybrid Instructor - SPLADE":
     )
     pinecone_index_name = "week13-splade-instructor-xl"
     pinecone_index = pinecone.Index(pinecone_index_name)
-    retriever_model = get_instructor_embedding_model()
+    retriever_model = get_instructor_embedding_model_api()
     (
         sparse_retriever_model,
         sparse_retriever_tokenizer,
@@ -382,6 +400,7 @@ if document_type == "Single-Document":
     dense_query_embedding, sparse_query_embedding = hybrid_score_norm(
         dense_query_embedding, sparse_query_embedding, 0.3
     )
+
     query_results = query_pinecone_sparse(
         dense_query_embedding,
         sparse_query_embedding,
@@ -392,6 +411,7 @@ if document_type == "Single-Document":
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
 
@@ -413,6 +433,7 @@ if document_type == "Single-Document":
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
 
@@ -459,6 +480,7 @@ else:
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -490,6 +512,7 @@ else:
         ticker,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -535,6 +558,7 @@ else:
         ticker_first,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -557,6 +581,7 @@ else:
         ticker_second,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -591,6 +616,7 @@ else:
         ticker_first,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -612,6 +638,7 @@ else:
         ticker_second,
         participant_type,
         keywords,
+        indices,
         threshold,
     )
     results_list = sentence_id_combine(
@@ -778,7 +805,7 @@ if decoder_model == "GPT-J":
     )
     submitted = st.form_submit_button("Submit")
 
-tab1, tab2 = st.tabs(["Retrived Text", "Retrieved Documents"])
+tab1, tab2 = st.tabs(["Retrieved Text", "Retrieved Documents"])
 
 
 with tab1:
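
Taken together, the app.py changes bolt a BM25 pre-filter onto the retrieval flow: the query is preprocessed and scored against every transcript, and the top-ranked document ids are handed to the Pinecone query functions as an extra metadata filter. A minimal sketch of that flow, assuming the helpers introduced in this commit and the app's existing data, query_text, and use_bm25 values:

    import numpy as np

    corpus, bm25 = get_bm25_model(data)  # BM25Plus over the preprocessed transcripts

    # Rank all documents against the query, best BM25 score first
    tokenized_query = preprocess_text(query_text).split()
    sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]

    # Keep the top-ranked document ids when the sidebar checkbox is on;
    # indices=None leaves the Pinecone metadata filter unchanged
    indices = get_bm25_search_hits(corpus, sparse_scores, 50) if use_bm25 else None

query_pinecone and query_pinecone_sparse then translate a non-None indices into an extra "index": {"$in": indices} clause in the Pinecone filter (see utils/retriever.py below).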
requirements.txt CHANGED
@@ -14,4 +14,4 @@ streamlit-scrollable-textbox
 openai
 InstructorEmbedding
 gradio_client
-
+rank_bm25
utils/models.py CHANGED
@@ -20,26 +20,59 @@ from transformers import (
     T5Tokenizer,
     pipeline,
 )
-
-import pinecone
+from rank_bm25 import BM25Okapi, BM25L, BM25Plus
+import numpy as np
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+import re
 import streamlit as st
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_data():
     data = pd.read_csv("earnings_calls_cleaned_metadata.csv")
     return data
 
 
+# Preprocessing for BM25
+
+
+def tokenizer(
+    string, reg="[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"
+):
+    regex = reg
+    string = string.replace("-", " ")
+    return " ".join(re.findall(regex, string))
+
+
+def preprocess_text(text):
+    # Convert to lowercase
+    text = text.lower()
+    # Tokenize the text
+    tokens = word_tokenize(text)
+    # Remove stop words
+    stop_words = set(stopwords.words("english"))
+    tokens = [token for token in tokens if token not in stop_words]
+    # Stem the tokens
+    porter_stemmer = PorterStemmer()
+    tokens = [porter_stemmer.stem(token) for token in tokens]
+    # Join the tokens back into a single string
+    preprocessed_text = " ".join(tokens)
+    preprocessed_text = tokenizer(preprocessed_text)
+
+    return preprocessed_text
+
+
 # Initialize Spacy Model
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_spacy_model():
     return spacy.load("en_core_web_trf")
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_flan_alpaca_xl_model():
     model = AutoModelForSeq2SeqLM.from_pretrained(
         "/home/user/app/models/flan-alpaca-xl/"
@@ -53,19 +86,19 @@ def get_flan_alpaca_xl_model():
 # Initialize models from HuggingFace
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_t5_model():
     return pipeline("summarization", model="t5-small", tokenizer="t5-small")
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_flan_t5_model():
     tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
     model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
     return model, tokenizer
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_mpnet_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -75,7 +108,7 @@ def get_mpnet_embedding_model():
     return model
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_splade_sparse_embedding_model():
     model_sparse = "naver/splade-cocondenser-ensembledistil"
     # check device
@@ -87,7 +120,7 @@ def get_splade_sparse_embedding_model():
     return model_sparse, tokenizer
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_sgpt_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -97,20 +130,34 @@ def get_sgpt_embedding_model():
     return model
 
 
-@st.experimental_singleton
+@st.cache_resource
 def get_instructor_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = INSTRUCTOR("hkunlp/instructor-xl")
     return model
 
+@st.cache_resource
+def get_instructor_embedding_model_api():
+    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
+    return client
+
 
-@st.experimental_singleton
+@st.cache_resource
 def get_alpaca_model():
     client = Client("https://awinml-alpaca-cpp.hf.space")
     return client
 
 
-@st.experimental_memo
+@st.cache_resource
+def get_bm25_model(data):
+    corpus = data.Text.tolist()
+    corpus_clean = [preprocess_text(x) for x in corpus]
+    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
+    bm25 = BM25Plus(tokenized_corpus)
+    return corpus, bm25
+
+
+@st.cache_resource
 def save_key(api_key):
     return api_key

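The new BM25 helpers are plain functions, so they can be exercised outside the Streamlit app. A small sketch on a hypothetical two-row DataFrame (the real app passes the frame returned by get_data(), which must carry a Text column):

    import pandas as pd

    df = pd.DataFrame({"Text": [
        "Revenue grew 12% in the third quarter.",
        "Gross margins were flat year over year.",
    ]})
    corpus, bm25 = get_bm25_model(df)  # corpus keeps the raw, unprocessed text

    # The query must pass through the same lowercase/stopword/stem pipeline
    # as the corpus so that tokens actually match
    query_tokens = preprocess_text("How much did revenue grow?").split()
    print(bm25.get_scores(query_tokens))  # one BM25Plus score per document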
utils/nltkmodules.py CHANGED
@@ -1,4 +1,5 @@
 import nltk
 
-nltk.download('wordnet')
-nltk.download('punkt')
+nltk.download("wordnet")
+nltk.download("punkt")
+nltk.download("stopwords")
utils/retriever.py CHANGED
@@ -1,6 +1,16 @@
-def query_pinecone_sparse(
+def get_bm25_search_hits(corpus, sparse_scores, top_n=50):
+    bm25_search = []
+    indices = []
+    for idx in sparse_scores:
+        if len(bm25_search) <= top_n:
+            bm25_search.append(corpus[idx])
+            indices.append(idx)
+    indices = [int(x) for x in indices]
+    return indices
+
+
+def query_pinecone(
     dense_vec,
-    sparse_vec,
     top_k,
     index,
     year,
@@ -8,6 +18,7 @@ def query_pinecone_sparse(
     ticker,
     participant_type,
     keywords=None,
+    indices=None,
     threshold=0.25,
 ):
     if participant_type == "Company Speaker":
@@ -16,68 +27,126 @@ def query_pinecone_sparse(
         participant = "Question"
 
     # Create filter dictionary based on keywords
-    filter_dict = [{'Keywords': word} for word in keywords]
+    filter_dict = [{"Keywords": word} for word in keywords]
 
     if year == "All":
         if quarter == "All":
+            if indices != None:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                        "index": {"$in": indices},
+                    },
+                    include_metadata=True,
+                )
+            else:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                    },
+                    include_metadata=True,
+                )
+        else:
+            if indices != None:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$eq": quarter},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                        "index": {"$in": indices},
+                    },
+                    include_metadata=True,
+                )
+            else:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$eq": quarter},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                    },
+                    include_metadata=True,
+                )
+    else:
+        # search pinecone index for context passage with the answer
+        if indices != None:
             xc = index.query(
                 vector=dense_vec,
-                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
-                    "Year": {
-                        "$in": [
-                            int("2020"),
-                            int("2019"),
-                            int("2018"),
-                            int("2017"),
-                            int("2016"),
-                        ]
-                    },
-                    "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                    "Year": int(year),
+                    "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
+                    "index": {"$in": indices},
                 },
                 include_metadata=True,
             )
         else:
             xc = index.query(
                 vector=dense_vec,
-                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
-                    "Year": {
-                        "$in": [
-                            int("2020"),
-                            int("2019"),
-                            int("2018"),
-                            int("2017"),
-                            int("2016"),
-                        ]
-                    },
+                    "Year": int(year),
                     "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
-    else:
-        # search pinecone index for context passage with the answer
-        xc = index.query(
-            vector=dense_vec,
-            sparse_vector=sparse_vec,
-            top_k=top_k,
-            filter={
-                "Year": int(year),
-                "Quarter": {"$eq": quarter},
-                "Ticker": {"$eq": ticker},
-                "QA_Flag": {"$eq": participant},
-                "Keywords": {"$in": keywords}
-            },
-            include_metadata=True,
-        )
     # filter the context passages based on the score threshold
     filtered_matches = []
     for match in xc["matches"]:
@@ -87,8 +156,9 @@ def query_pinecone_sparse(
     return xc
 
 
-def query_pinecone(
+def query_pinecone_sparse(
     dense_vec,
+    sparse_vec,
     top_k,
     index,
     year,
@@ -96,6 +166,7 @@ def query_pinecone(
     ticker,
     participant_type,
     keywords=None,
+    indices=None,
     threshold=0.25,
 ):
     if participant_type == "Company Speaker":
@@ -104,13 +175,13 @@ def query_pinecone(
         participant = "Question"
 
     # Create filter dictionary based on keywords
-    filter_dict = [{'Keywords': word} for word in keywords]
-
+    filter_dict = [{"Keywords": word} for word in keywords]
 
     if year == "All":
         if quarter == "All":
             xc = index.query(
                 vector=dense_vec,
+                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -125,13 +196,14 @@ def query_pinecone(
                     "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
         else:
             xc = index.query(
                 vector=dense_vec,
+                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -146,7 +218,7 @@ def query_pinecone(
                     "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
@@ -154,13 +226,14 @@ def query_pinecone(
         # search pinecone index for context passage with the answer
         xc = index.query(
             vector=dense_vec,
+            sparse_vector=sparse_vec,
             top_k=top_k,
             filter={
                 "Year": int(year),
                 "Quarter": {"$eq": quarter},
                 "Ticker": {"$eq": ticker},
                 "QA_Flag": {"$eq": participant},
-                "Keywords": {"$in": keywords}
+                "Keywords": {"$in": keywords},
             },
             include_metadata=True,
         )
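
get_bm25_search_hits just truncates the ranked id list to plain Python ints; note that the <= comparison keeps top_n + 1 hits, and the loop keeps iterating over the full array after the cap is reached. A sketch with the same return value, shown only to make the intent explicit (not the committed code):

    def get_bm25_search_hits_sketch(corpus, sparse_scores, top_n=50):
        # sparse_scores is already sorted best-first by the caller
        # (np.argsort(...)[::-1] in app.py); corpus is unused for the
        # return value -- the committed version collects the matching
        # text into a side list and discards it
        return [int(idx) for idx in sparse_scores[: top_n + 1]]

These ids only filter anything if each vector in the Pinecone index was upserted with a matching integer "index" metadata field, which is what the "index": {"$in": indices} clauses above assume.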
utils/vector_index.py CHANGED
@@ -1,11 +1,23 @@
 import torch
+import json
+import numpy as np
 
 
 def create_dense_embeddings(query, model, instruction=None):
     if instruction == None:
         dense_emb = model.encode([query]).tolist()
     else:
-        dense_emb = model.encode([[instruction, query]]).tolist()
+        # Fetching embedding from API for Instructor
+        json_output_embedding = model.predict(
+            instruction,
+            query,
+            api_name="/predict",
+        )
+
+        json_file = open(json_output_embedding, "r")
+        json_dict = json.load(json_file)
+        dense_array = np.array(json_dict["data"], dtype=np.float64)
+        dense_emb = dense_array.tolist()
     return dense_emb
 
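
In the new Instructor path, model is the gradio_client.Client returned by get_instructor_embedding_model_api, and predict evidently returns the path of a JSON file whose "data" key holds the embedding. A standalone sketch of that round-trip (the endpoint's output format is assumed from the code above; a with block also closes the file handle that the committed code leaves open):

    import json

    import numpy as np
    from gradio_client import Client

    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
    json_path = client.predict(
        "Represent the financial question for retrieving supporting documents:",
        "What drove revenue growth this quarter?",
        api_name="/predict",
    )
    with open(json_path) as f:  # close the handle when done
        dense_emb = np.array(json.load(f)["data"], dtype=np.float64).tolist()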