Jorge Henao committed on
Commit
9e6217b
1 Parent(s): 9735086

pinecone fixes

Browse files
Files changed (4) hide show
  1. app_pinecode.py +1 -1
  2. config.py +1 -1
  3. pinecode_quieries.py +16 -12
  4. requirements.txt +4 -4
app_pinecode.py CHANGED
@@ -58,7 +58,7 @@ if __name__ == "__main__":
58
  # Every form must have a submit button.
59
  submitted = st.form_submit_button("Buscar")
60
 
61
- results = search("que es el adres", retriever_top_k=5, reader_top_k=3)
62
 
63
  # on submit we execute search
64
  if(submitted):
 
58
  # Every form must have a submit button.
59
  submitted = st.form_submit_button("Buscar")
60
 
61
+ #results = search("que es el adres", retriever_top_k=5, reader_top_k=3)
62
 
63
  # on submit we execute search
64
  if(submitted):
config.py CHANGED
@@ -2,7 +2,7 @@ class Config():
2
  es_host = "saimon-askwdemocracy.es.us-central1.gcp.cloud.es.io"
3
  es_user = "elastic"
4
  es_password = "53f2a7a9-ea9d-4fd2-a8bc-f471b67f0262"
5
- proposals_index = "semantic-text-search"
6
  reader_model_name_or_path = "deepset/xlm-roberta-base-squad2-distilled"
7
  #reader_model_name_or_path = "deepset/xlm-roberta-base-squad2"
8
  use_gpu = True
 
2
  es_host = "saimon-askwdemocracy.es.us-central1.gcp.cloud.es.io"
3
  es_user = "elastic"
4
  es_password = "53f2a7a9-ea9d-4fd2-a8bc-f471b67f0262"
5
+ proposals_index = "docsreloaded"
6
  reader_model_name_or_path = "deepset/xlm-roberta-base-squad2-distilled"
7
  #reader_model_name_or_path = "deepset/xlm-roberta-base-squad2"
8
  use_gpu = True
pinecode_quieries.py CHANGED
@@ -46,6 +46,9 @@ class PineconeRetriever(BaseComponent):
46
  self.index = pinecone.Index(index_name)
47
 
48
  def run(self, query: str, top_k: Optional[int]):
 
 
 
49
  # process the inputs
50
  vector_embeddings = self.sts_model.encode(query).tolist()
51
  response = self.index.query([vector_embeddings], top_k=top_k, include_metadata=True)
@@ -57,7 +60,7 @@ class PineconeRetriever(BaseComponent):
57
  'source': d["metadata"]['source']
58
  }
59
  )
60
- for d in response["matches"]
61
  ]
62
  output = {"documents": docs, "query": query}
63
  return output, "output_1"
@@ -84,36 +87,37 @@ class PinecodeProposalQueries(DocumentQueries):
84
  self.reader = reader
85
 
86
  #pinecone.init(api_key=es_password, environment="us-east1-gcp")
87
- index_name = "semantic-text-search"
88
 
89
  self.document_store = PineconeDocumentStore(
90
  api_key=es_password,
91
  environment = "us-east1-gcp",
92
  index=index_name,
93
  similarity="cosine",
94
- embedding_dim=384
 
95
  )
96
  self.pipe = Pipeline()
97
  pinecone_retriever = PineconeRetriever("sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
98
  es_password, "us-east1-gcp",
99
  index_name)
100
- self.pipe.add_node(component=pinecone_retriever, name="Retriever", inputs=["Query"])
101
- self.pipe.add_node(component=self.reader, name="Reader", inputs=["Retriever"])
102
 
103
  # #self.retriever = BM25Retriever(document_store = self.document_store)
104
- # self.retriever = EmbeddingRetriever(
105
- # document_store=self.document_store,
106
- # #embedding_model="multi-qa-distilbert-dot-v1",
107
- # embedding_model = "sentence-transformers/msmarco-MiniLM-L6-cos-v5",
108
- # model_format="sentence_transformers"
109
- # )
110
 
111
  # retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
112
 
113
 
114
 
115
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
116
- #self.pipe = ExtractiveQAPipeline (reader = self.reader, retriever = self.retriever)
117
  #self.pipe = DocumentSearchPipeline(self.retriever)
118
 
119
  def search_by_query(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None) :
 
46
  self.index = pinecone.Index(index_name)
47
 
48
  def run(self, query: str, top_k: Optional[int]):
49
+ #TODO: problema de versiones del cliente de pinecone. En esta versión retorna una colección de results con los matches.
50
+ #TODO2: el contenido no está dentro de los metadatos, ni en ningún lado. ¿Normal? Workaround: guardarlo en otro metadato.
51
+ #TODO3: hacer consulta por id, pastelarlo desde el colab, que sí retorna?
52
  # process the inputs
53
  vector_embeddings = self.sts_model.encode(query).tolist()
54
  response = self.index.query([vector_embeddings], top_k=top_k, include_metadata=True)
 
60
  'source': d["metadata"]['source']
61
  }
62
  )
63
+ for d in response['matches']
64
  ]
65
  output = {"documents": docs, "query": query}
66
  return output, "output_1"
 
87
  self.reader = reader
88
 
89
  #pinecone.init(api_key=es_password, environment="us-east1-gcp")
90
+ index_name = es_index
91
 
92
  self.document_store = PineconeDocumentStore(
93
  api_key=es_password,
94
  environment = "us-east1-gcp",
95
  index=index_name,
96
  similarity="cosine",
97
+ embedding_dim=384,
98
+ metadata_config = {"indexed": ["title", "content"]}
99
  )
100
  self.pipe = Pipeline()
101
  pinecone_retriever = PineconeRetriever("sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
102
  es_password, "us-east1-gcp",
103
  index_name)
104
+ #self.pipe.add_node(component=pinecone_retriever, name="Retriever", inputs=["Query"])
105
+ #self.pipe.add_node(component=self.reader, name="Reader", inputs=["Retriever"])
106
 
107
  # #self.retriever = BM25Retriever(document_store = self.document_store)
108
+ self.retriever = EmbeddingRetriever(
109
+ document_store=self.document_store,
110
+ #embedding_model="multi-qa-distilbert-dot-v1",
111
+ embedding_model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
112
+ model_format="sentence_transformers"
113
+ )
114
 
115
  # retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
116
 
117
 
118
 
119
  #self.document_store.update_embeddings(self.retriever, update_existing_embeddings=False)
120
+ self.pipe = ExtractiveQAPipeline (reader = self.reader, retriever = self.retriever)
121
  #self.pipe = DocumentSearchPipeline(self.retriever)
122
 
123
  def search_by_query(self, query : str, retriever_top_k: int, reader_top_k: int, es_index: str = None) :
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- transformers==4.19.2
2
- torch==1.10.2
3
- #farm-haystack==1.5.0
4
- farm-haystack[pinecone]==1.5.0
 
1
+ sentence-transformers
2
+ farm-haystack[pinecone]
3
+ pinecone-client
4
+ streamlit