File size: 3,510 Bytes
fb2b8cf
 
 
f440070
dd7fea7
 
fb2b8cf
7c91199
 
 
e41891c
fb2b8cf
 
 
 
 
 
 
dd7fea7
 
c73d053
 
 
 
 
 
e41891c
 
 
 
 
c73d053
fb2b8cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f440070
fb2b8cf
 
 
 
 
 
 
 
cf1f09e
f440070
fb2b8cf
 
f440070
e41891c
f440070
 
 
 
 
fb2b8cf
f440070
 
 
 
fb2b8cf
 
 
e41891c
fb2b8cf
 
 
e41891c
fb2b8cf
f440070
fb2b8cf
 
 
 
f440070
 
31dada8
f440070
31dada8
da2a66c
fb2b8cf
da2a66c
fb2b8cf
 
 
 
 
 
 
f440070
fb2b8cf
 
 
 
f440070
 
 
 
 
fb2b8cf
f440070
fb2b8cf
f440070
fb2b8cf
dd7fea7
 
 
 
fb2b8cf
31dada8
 
 
fb2b8cf
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import cohere
from typing import List
import pinecone
from easygoogletranslate import EasyGoogleTranslate


# Service credentials are read from the environment; os.environ.get yields
# None for any variable that is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")


# Cohere embedding model passed to cohere_client.embed, and the name of the
# Pinecone index opened by init_pinecone.
MODEL_NAME = "multilingual-22-12"
COLLECTION = "wiki-embed"

# create the cohere client (the pinecone index is set up by init_pinecone)
cohere_client = cohere.Client(COHERE_API_KEY)

# translation client used by translate_text
translator = EasyGoogleTranslate()

def init_pinecone():
    """Initialise the Pinecone client and return the index named ``COLLECTION``.

    The API key and environment come from the module-level
    ``PINECONE_API_KEY`` and ``PINECONE_ENV`` values.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    return pinecone.Index(COLLECTION)


    

# Shared index handle, created once when the module is imported;
# search_wiki_for_query reads it as a module-level global.
index = init_pinecone()


def embed_user_query(user_query):
    """Embed a single query string with the Cohere model ``MODEL_NAME``.

    Returns a ``(query_embedding, user_query)`` tuple: the first embedding of
    the response, and the query text passed through unchanged.
    """
    response = cohere_client.embed(texts=[user_query], model=MODEL_NAME)
    return response.embeddings[0], user_query


def search_wiki_for_query(
    query_embedding,
    num_results=3,
    languages=None,
):
    """Query the module-level Pinecone ``index`` for matches to an embedding.

    Args:
        query_embedding: Query vector, e.g. the first element returned by
            ``embed_user_query``.
        num_results: Maximum number of matches to request (sent as ``top_k``).
        languages: Iterable of language names to restrict the search to.
            Supported names are "English", "Yoruba", "Igbo" and "Hausa"
            (the legacy spelling "Hause" is still accepted). ``None`` or an
            empty iterable applies no language filter.

    Returns:
        A list with the ``metadata`` entry of every returned match, in the
        order the index returned them.

    Raises:
        KeyError: If ``languages`` contains a name that is not supported.
    """
    language_mapping = {
        "English": "en",
        "Yoruba": "yo",
        "Igbo": "ig",
        "Hausa": "ha",
        # Legacy misspelling, kept so existing callers keep working.
        "Hause": "ha",
    }

    query_kwargs = {
        # Honour the caller's limit instead of a hard-coded value.
        "top_k": int(num_results),
        "include_metadata": True,
        "vector": query_embedding,
    }

    # Only attach a filter when at least one language was requested; an
    # empty `$in` list would otherwise exclude every document.
    lang_codes = [language_mapping[lang] for lang in (languages or [])]
    if lang_codes:
        query_kwargs["filter"] = {"lang": {"$in": lang_codes}}

    query_results = index.query(**query_kwargs)

    return [record["metadata"] for record in query_results["matches"]]


def cross_lingual_document_search(
    user_input: str, num_results: int, languages, text_match
) -> List:
    """Search the index for documents related to ``user_input``.

    Args:
        user_input: Free-text query to embed and search for.
        num_results: Number of results forwarded to ``search_wiki_for_query``.
        languages: Language names forwarded to ``search_wiki_for_query``.
        text_match: Accepted for interface compatibility; not used here.

    Returns:
        A single flat list: first one ``"<title>\\n<text>"`` string per
        match, followed by one ``"<url>\\n\\n"`` string per match (so the
        list is twice as long as the number of matches).
    """
    # create an embedding for the input query
    query_embedding, _ = embed_user_query(user_input)

    # retrieve search results
    metadata = search_wiki_for_query(
        query_embedding,
        num_results,
        languages,
    )

    results = [result['title'] + "\n" + result['text'] for result in metadata]
    url_list = [result['url'] + "\n\n" for result in metadata]

    return results + url_list

def document_source(
    user_input: str, num_results: int, languages, text_match
) -> List:
    """Return the source URLs of the documents matching ``user_input``.

    Args:
        user_input: Free-text query to embed and search for.
        num_results: Number of results forwarded to ``search_wiki_for_query``;
            also the minimum length of the returned list.
        languages: Language names forwarded to ``search_wiki_for_query``.
        text_match: Accepted for interface compatibility; not used here.

    Returns:
        The ``url`` of every match. When fewer than ``num_results`` matches
        come back, the list is padded with empty strings up to
        ``num_results`` entries.
    """
    query_embedding, _ = embed_user_query(user_input)

    # retrieve search results
    metadata = search_wiki_for_query(
        query_embedding,
        num_results,
        languages,
    )

    results = [result['url'] for result in metadata]

    # Pad with blanks so callers always receive at least num_results items.
    missing = num_results - len(results)
    if missing > 0:
        results.extend([""] * missing)

    return results

def translate_text(doc):
    """Translate ``doc`` into English after capping its length.

    The text is split on whitespace, only the first 4800 pieces are kept and
    re-joined with single spaces, and the result is passed to the
    module-level ``translator`` with ``target_language='en'``.
    """
    # NOTE(review): the cap counts whitespace-separated words, not
    # characters — confirm this matches the translator's input limit.
    words = doc.split()
    truncated = " ".join(words[:4800])
    return translator.translate(truncated, target_language='en')

def translate_search_result():
    """Placeholder with no behaviour yet; always returns ``None``."""
    return None

if __name__ == "__main__":
    # Manual smoke test: run one sample search restricted to Yoruba and
    # print the returned list together with its length.
    sample_query = "Who is the president of Nigeria"
    hits = cross_lingual_document_search(
        sample_query,
        num_results=3,
        languages=["Yoruba"],
        text_match=False,
    )
    print(hits, len(hits))