import os
import cohere
from typing import List
import pinecone
from easygoogletranslate import EasyGoogleTranslate
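# Cross-lingual semantic search over Wikipedia passages: user queries are embedded with
# Cohere's multilingual model, matched against vectors stored in a Pinecone index, and the
# retrieved documents can be translated to English with Google Translate.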
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
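# Cohere multilingual embedding model and the name of the Pinecone index holding the wiki embeddings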
MODEL_NAME = "multilingual-22-12"
COLLECTION = "wiki-embed"
# create the Cohere client and the translator; the Pinecone index is initialised below
cohere_client = cohere.Client(COHERE_API_KEY)
translator = EasyGoogleTranslate()
def init_pinecone():
    pinecone.init(api_key=PINECONE_API_KEY,
                  environment=PINECONE_ENV)
    index = pinecone.Index(COLLECTION)
    return index

index = init_pinecone()
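# embed the raw query text with Cohere's multilingual model; returns the embedding and the original query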
def embed_user_query(user_query):
    embeddings = cohere_client.embed(
        texts=[user_query],
        model=MODEL_NAME,
    )
    query_embedding = embeddings.embeddings[0]
    return query_embedding, user_query
def search_wiki_for_query(
    query_embedding,
    num_results=3,
    languages=[],
):
    language_mapping = {
        "English": "en",
        "Yoruba": "yo",
        "Igbo": "ig",
        "Hausa": "ha",
    }
    # restrict the search to documents whose `lang` metadata matches one of the requested languages
    query_results = index.query(
        top_k=num_results,
        include_metadata=True,
        vector=query_embedding,
        filter={
            'lang': {'$in': [language_mapping[lang] for lang in languages]}
        }
    )
    metadata = [record["metadata"] for record in query_results["matches"]]
    return metadata
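# return the matching documents as "title\ntext" snippets followed by their source URLs;
# the `text_match` argument is currently unused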
def cross_lingual_document_search(
    user_input: str, num_results: int, languages, text_match
) -> List:
    # create an embedding for the input query
    query_embedding, _ = embed_user_query(user_input)
    # retrieve search results
    metadata = search_wiki_for_query(
        query_embedding,
        num_results,
        languages,
    )
    results = [result['title'] + "\n" + result['text'] for result in metadata]
    url_list = [result['url'] + "\n\n" for result in metadata]
    return results + url_list
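# return only the source URLs of the matches, padded with empty strings so the caller
# always receives exactly num_results entries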
def document_source(
    user_input: str, num_results: int, languages, text_match
) -> List:
    query_embedding, _ = embed_user_query(user_input)
    # retrieve search results
    metadata = search_wiki_for_query(
        query_embedding,
        num_results,
        languages,
    )
    results = [result['url'] for result in metadata]
    if num_results > len(results):
        remaining_inputs = num_results - len(results)
        for _ in range(remaining_inputs):
            results.append("")
    return results
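# translate a retrieved document to English; the text is truncated to its first 4800
# whitespace-separated words, presumably to keep the request within the translator's size limit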
def translate_text(doc):
    doc = " ".join(doc.split()[:4800])
    result = translator.translate(doc, target_language='en')
    return result
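# placeholder for translating the full set of search results (not implemented yet)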
def translate_search_result():
    pass
if __name__ == "__main__":
    # example of querying the index directly:
    # query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
    # result = search_wiki_for_query(query_embedding, languages=["English"])
    # for item in result:
    #     print(item["url"])
    result = cross_lingual_document_search(
        "Who is the president of Nigeria",
        num_results=3,
        languages=["Yoruba"],
        text_match=False,
    )
    print(result, len(result))