paulokewunmi commited on
Commit
f440070
1 Parent(s): ffa8f17

Change vector db to pinecone

Browse files
Files changed (5) hide show
  1. app.py +2 -1
  2. requirements.txt +1 -2
  3. src/document_utils_v2.py +151 -0
  4. src/wiki_search.py +37 -79
  5. src/wiki_search_v2.py +162 -0
app.py CHANGED
@@ -65,13 +65,14 @@ with gr.Blocks(theme=custom_theme) as demo:
65
  "Hausa",
66
  ],
67
  label="Filter results based on language",
 
68
  )
69
 
70
  with gr.Row():
71
  with gr.Column():
72
  user_query = gr.Text(
73
  label="Enter query here",
74
- placeholder="Search through all your documents",
75
  )
76
 
77
  num_search_results = gr.Slider(
 
65
  "Hausa",
66
  ],
67
  label="Filter results based on language",
68
+ value = "Yoruba"
69
  )
70
 
71
  with gr.Row():
72
  with gr.Column():
73
  user_query = gr.Text(
74
  label="Enter query here",
75
+ placeholder="Search through all your study materials",
76
  )
77
 
78
  num_search_results = gr.Slider(
requirements.txt CHANGED
@@ -1,6 +1,5 @@
1
  cohere
2
- qdrant_client==0.11.0
3
  gradio
4
  langchain
5
  black
6
- python-dotenv
 
1
  cohere
 
2
  gradio
3
  langchain
4
  black
5
+ "pinecone-client[grpc]"
src/document_utils_v2.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ import pandas as pd
5
+ from typing import List
6
+
7
+ import cohere
8
+ from langchain.embeddings.cohere import CohereEmbeddings
9
+ from langchain.llms import Cohere
10
+ from langchain.prompts import PromptTemplate
11
+ from langchain.vectorstores import Qdrant
12
+ from langchain.chains.question_answering import load_qa_chain
13
+
14
+ sys.path.append(os.path.abspath('..'))
15
+
16
+ from src.constants import SUMMARIZATION_MODEL, EXAMPLES_FILE_PATH
17
+
18
+
19
+
20
+ QDRANT_HOST = os.environ.get("QDRANT_HOST")
21
+ QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
22
+ COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
23
+
24
+
25
def replace_text(text):
    """Strip a leading "The answer is " prefix from *text*, if present."""
    prefix = "The answer is "
    return text.replace(prefix, "", 1) if text.startswith(prefix) else text
29
+
30
+
31
def summarize(
    document: str,
    summary_length: str,
    summary_format: str,
    extractiveness: str = "high",
    temperature: float = 0.6,
) -> str:
    """Summarize *document* with Cohere's summarize endpoint.

    Args:
        document: Text supplied by the user to be summarized.
        summary_length: Desired summary size: 'short', 'medium' or 'long'.
        summary_format: Output layout, either 'paragraph' or 'bullets'.
        extractiveness: How closely the summary should stick to the original
            wording: 'low', 'medium' or 'high'.
        temperature: Sampling randomness; lower values give more predictable
            output, higher values more creative output.

    Returns:
        The summary text produced by the model named in SUMMARIZATION_MODEL.
    """
    client = cohere.Client(COHERE_API_KEY)
    response = client.summarize(
        text=document,
        length=summary_length,
        format=summary_format,
        model=SUMMARIZATION_MODEL,
        extractiveness=extractiveness,
        temperature=temperature,
    )
    return response.summary
66
+
67
+
68
def question_answer(input_document: str, history: List) -> str:
    """
    Generates an appropriate answer for the question asked by the user based on the input document.
    Args:
        input_document (`str`):
            The document given by the user for which summary must be generated.
        history (`List[List[str,str]]`):
            A list made up of pairs of input question asked by the user & corresponding generated answers. It is used to keep track of the history of the chat between the user and the model.
    Returns:
        answer (`str`):
            The generated answer corresponding to the input question and document received from the user.
    """
    context = input_document
    # The last element of the `history` list contains the most recent question asked by the user whose answer needs to be generated.
    question = history[-1][0]
    word_list = context.split()
    # Chunk the document into 256-word windows so each piece fits the embedding model.
    # texts = [context[k : k + 256] for k in range(0, len(context.split()), 256)]
    texts = [" ".join(word_list[k : k + 256]) for k in range(0, len(word_list), 256)]

    # print(texts)

    # Embed the chunks and index them in Qdrant so relevant ones can be retrieved.
    embeddings = CohereEmbeddings(
        model="multilingual-22-12", cohere_api_key=COHERE_API_KEY
    )
    context_index = Qdrant.from_texts(
        texts, embeddings, url=QDRANT_HOST, api_key=QDRANT_API_KEY
    )

    prompt_template = """Text: {context}
    Question: {question}
    Answer the question based on the text provided. If the text doesn't contain the answer, reply that the answer is not available."""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    # Generate the answer given the context
    chain = load_qa_chain(
        Cohere(
            model="command-xlarge-nightly", temperature=0, cohere_api_key=COHERE_API_KEY
        ),
        chain_type="stuff",
        prompt=PROMPT,
    )
    # Retrieve only the chunks most similar to the question, then run the QA chain on them.
    relevant_context = context_index.similarity_search(question)
    answer = chain.run(input_documents=relevant_context, question=question)
    # Clean up model boilerplate before returning the answer to the UI.
    answer = answer.replace("\n", "").replace("Answer:", "")
    answer = replace_text(answer)
    return answer
117
+
118
def generate_questions(input_document: str) -> str:
    """Ask the Cohere generation model for five comprehension questions about *input_document*."""
    prompt = (
        "Give me 5 different questions to test understanding of the following "
        f"text provided. Here's the provided text: {input_document}. Now what is Questions 1 to 5 ?:"
    )
    client = cohere.Client(COHERE_API_KEY)
    response = client.generate(
        prompt=prompt,
        max_tokens=200,
        temperature=0.55,
    )
    return response.generations[0].text
127
+
128
+
129
def _load_example(row_index: int):
    """Return the (document, sample question) pair stored at *row_index* of the examples CSV."""
    examples_df = pd.read_csv(EXAMPLES_FILE_PATH)
    return examples_df["doc"].iloc[row_index], examples_df["question"].iloc[row_index]


def load_science():
    """Load the science example document and its sample question (examples row 0)."""
    return _load_example(0)


def load_history():
    """Load the history example document and its sample question (examples row 1)."""
    return _load_example(1)
141
+
142
+
143
if __name__ == "__main__":
    # Smoke-test the QA helper against a local sample document.
    with open("sample_text.txt", "r", encoding="utf-8") as file:
        text = file.read()
    # summary = summarize(text, summary_length="short", summary_format="bullets")
    # print(summary)

    # `question_answer` expects a chat history: a list of [question, answer]
    # pairs, and reads the latest question from history[-1][0]. Passing a bare
    # list of strings would make it index into the string itself ("W").
    answer = question_answer(text, [["Whats photosynthesis", None]])
    print(answer)
src/wiki_search.py CHANGED
@@ -1,14 +1,10 @@
1
  import os
2
  import cohere
3
  from typing import List
 
4
 
5
- from qdrant_client import QdrantClient
6
- from qdrant_client import models
7
-
8
-
9
- # load environment variables
10
- QDRANT_HOST = os.environ.get("QDRANT_HOST")
11
- QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
12
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
13
 
14
  MODEL_NAME = "multilingual-22-12"
@@ -17,12 +13,6 @@ COLLECTION = "wiki-embed"
17
  # create qdrant and cohere client
18
  cohere_client = cohere.Client(COHERE_API_KEY)
19
 
20
- qdrant_client = QdrantClient(
21
- host=QDRANT_HOST,
22
- api_key=QDRANT_API_KEY,
23
- port = 443,
24
- )
25
-
26
  def embed_user_query(user_query):
27
 
28
  embeddings = cohere_client.embed(
@@ -36,10 +26,13 @@ def embed_user_query(user_query):
36
  def search_wiki_for_query(
37
  query_embedding,
38
  num_results = 3,
39
- user_query= "",
40
  languages = [],
41
- match_text = None,
42
  ):
 
 
 
 
 
43
  filters = []
44
 
45
  language_mapping = {
@@ -49,78 +42,45 @@ def search_wiki_for_query(
49
  "Hause": "ha",
50
  }
51
 
 
 
52
  # prepare filters to narrow down search results
53
  # if the `match_text` list is not empty then create filter to find exact matching text in the documents
54
- if match_text:
55
- filters.append(
56
- models.FieldCondition(
57
- key="text",
58
- match=models.MatchText(text=user_query),
59
- )
60
- )
61
-
62
- # filter documents based on language before performing search:
63
- if languages:
64
- for lang in languages:
65
- filters.append(
66
- models.FieldCondition(
67
- key="lang",
68
- match=models.MatchValue(
69
- value=language_mapping[lang],
70
- ),
71
- )
72
- )
73
-
74
- # perform search and get results
75
- results = qdrant_client.search(
76
- collection_name=COLLECTION,
77
- query_filter=models.Filter(should=filters),
78
- search_params=models.SearchParams(hnsw_ef=128, exact=False),
79
- query_vector=query_embedding,
80
- limit=num_results,
81
  )
82
- return results
 
 
 
83
 
84
 
85
  def cross_lingual_document_search(
86
  user_input: str, num_results: int, languages, text_match
87
  ) -> List:
88
- """
89
- Wrapper function for performing search on the collection of documents for the given user query.
90
- Prepares query embedding, retrieves search results, checks if expected number of search results are being returned.
91
- Args:
92
- user_input (`str`):
93
- The user input based on which search will be performed.
94
- num_results (`str`):
95
- The number of expected search results.
96
- languages (`str`):
97
- The list of languages based on which search results must be filtered.
98
- text_match (`str`):
99
- A field based on which it is decided whether to perform full-text-match while performing search.
100
- Returns:
101
- final_results (`List[str]`):
102
- A list containing the final search results corresponding to the given user input.
103
- """
104
  # create an embedding for the input query
105
  query_embedding, _ = embed_user_query(user_input)
106
 
107
  # retrieve search results
108
- result = search_wiki_for_query(
109
  query_embedding,
110
  num_results,
111
- user_input,
112
  languages,
113
- text_match,
114
  )
115
- final_results = [result[i].payload["text"] for i in range(len(result))]
116
-
117
- # check if number of search results obtained (i.e. `final_results`) is matching with number of expected search results i.e. `num_results`
118
- if num_results > len(final_results):
119
- remaining_inputs = num_results - len(final_results)
120
  for input in range(remaining_inputs):
121
- final_results.append("")
122
 
123
- return final_results
124
 
125
  def document_source(
126
  user_input: str, num_results: int, languages, text_match
@@ -128,22 +88,20 @@ def document_source(
128
  query_embedding, _ = embed_user_query(user_input)
129
 
130
  # retrieve search results
131
- result = search_wiki_for_query(
132
  query_embedding,
133
  num_results,
134
- user_input,
135
  languages,
136
- text_match,
137
  )
138
- sources = [result[i].payload["url"] for i in range(len(result))]
139
-
140
- # check if number of search results obtained (i.e. `final_results`) is matching with number of expected search results i.e. `num_results`
141
- if num_results > len(sources):
142
- remaining_inputs = num_results - len(sources)
143
  for input in range(remaining_inputs):
144
- sources.append("")
145
 
146
- return sources
147
 
148
 
149
  def translate_search_result():
 
1
  import os
2
  import cohere
3
  from typing import List
4
+ import pinecone
5
 
6
+ PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
7
+ PINECONE_ENV = os.environ.get("PINECONE_ENV")
 
 
 
 
 
8
  COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
9
 
10
  MODEL_NAME = "multilingual-22-12"
 
13
  # create qdrant and cohere client
14
  cohere_client = cohere.Client(COHERE_API_KEY)
15
 
 
 
 
 
 
 
16
  def embed_user_query(user_query):
17
 
18
  embeddings = cohere_client.embed(
 
26
def search_wiki_for_query(
    query_embedding,
    num_results=3,
    languages=None,
):
    """Query the Pinecone wiki index for documents similar to *query_embedding*.

    Args:
        query_embedding: Embedding vector of the user's query.
        num_results: Number of matches to request (``top_k``).
        languages: Optional list of UI language labels; when non-empty the
            search is restricted to those languages via a metadata filter.

    Returns:
        A list of metadata dicts, one per match.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    index = pinecone.GRPCIndex(COLLECTION)

    # UI label -> wiki language code. "Hausa" matches the label offered in the
    # app's language filter; the legacy "Hause" spelling is kept so older
    # callers don't break.
    language_mapping = {
        "English": "en",
        "Yoruba": "yo",
        "Igbo": "ig",
        "Hausa": "ha",
        "Hause": "ha",
    }

    # Only constrain by language when the caller actually selected some:
    # an empty `$in` list would match no documents at all.
    metadata_filter = None
    if languages:
        metadata_filter = {
            "lang": {"$in": [language_mapping[lang] for lang in languages]}
        }

    # Single query; honour the caller's requested result count instead of a
    # hard-coded top_k, and don't issue a duplicate throwaway request.
    query_results = index.query(
        vector=query_embedding,
        top_k=num_results,
        include_metadata=True,
        filter=metadata_filter,
    )

    return [record["metadata"] for record in query_results["matches"]]
61
 
62
 
63
def cross_lingual_document_search(
    user_input: str, num_results: int, languages, text_match
) -> List:
    """Search the wiki index for *user_input* and return exactly *num_results* entries.

    Each entry is the match's title and text joined by a newline; the list is
    padded with empty strings when fewer matches come back. ``text_match`` is
    accepted for interface compatibility but is not used by this backend.
    """
    # create an embedding for the input query
    embedding, _ = embed_user_query(user_input)

    # retrieve search results
    hits = search_wiki_for_query(
        embedding,
        num_results,
        languages,
    )

    formatted = ["\n".join((hit["title"], hit["text"])) for hit in hits]

    # Pad so the UI always receives the number of slots it asked for.
    shortfall = max(0, num_results - len(formatted))
    return formatted + [""] * shortfall
84
 
85
def document_source(
    user_input: str, num_results: int, languages, text_match
) -> List:
    """Return the source URLs of the top matches for *user_input*.

    The list is padded with empty strings up to *num_results* entries.
    ``text_match`` is accepted for interface compatibility but unused here.
    """
    # create an embedding for the input query
    embedding, _ = embed_user_query(user_input)

    # retrieve search results
    hits = search_wiki_for_query(
        embedding,
        num_results,
        languages,
    )

    urls = [hit["url"] for hit in hits]

    # Pad so callers always get `num_results` entries back.
    shortfall = max(0, num_results - len(urls))
    return urls + [""] * shortfall
105
 
106
 
107
  def translate_search_result():
src/wiki_search_v2.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cohere
3
+ from typing import List
4
+
5
+ from qdrant_client import QdrantClient
6
+ from qdrant_client import models
7
+
8
+
9
+ # load environment variables
10
+ QDRANT_HOST = os.environ.get("QDRANT_HOST")
11
+ QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
12
+ COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
13
+
14
+ MODEL_NAME = "multilingual-22-12"
15
+ COLLECTION = "wiki-embed"
16
+
17
+ # create qdrant and cohere client
18
+ cohere_client = cohere.Client(COHERE_API_KEY)
19
+
20
+ qdrant_client = QdrantClient(
21
+ host=QDRANT_HOST,
22
+ api_key=QDRANT_API_KEY,
23
+ port = 443,
24
+ )
25
+
26
def embed_user_query(user_query):
    """Embed *user_query* with Cohere's multilingual embedding model.

    Returns:
        A tuple of (embedding vector, the original query string).
    """
    embeddings = cohere_client.embed(
        texts=[user_query],
        model=MODEL_NAME,
    )
    # embed() was called with a single text, so the first (only) vector is ours.
    query_embedding = embeddings.embeddings[0]
    return query_embedding, user_query
34
+
35
+
36
def search_wiki_for_query(
    query_embedding,
    num_results=3,
    user_query="",
    languages=None,
    match_text=None,
):
    """Search the Qdrant wiki collection for documents similar to *query_embedding*.

    Args:
        query_embedding: Embedding vector of the user's query.
        num_results: Maximum number of results to return.
        user_query: Raw query text, used for full-text matching when
            *match_text* is truthy.
        languages: Optional list of UI language labels used to filter results.
        match_text: When truthy, add an exact-text-match condition on the
            document body.

    Returns:
        The raw Qdrant search results (scored points with payloads).
    """
    filters = []

    # UI label -> wiki language code. Both the "Hausa" spelling used by the
    # app's language filter and the legacy "Hause" key are accepted, so a
    # "Hausa" selection no longer raises KeyError.
    language_mapping = {
        "English": "en",
        "Yoruba": "yo",
        "Igbo": "ig",
        "Hausa": "ha",
        "Hause": "ha",
    }

    # prepare filters to narrow down search results
    # if `match_text` is truthy then create a filter to find exact matching text in the documents
    if match_text:
        filters.append(
            models.FieldCondition(
                key="text",
                match=models.MatchText(text=user_query),
            )
        )

    # filter documents based on language before performing search
    if languages:
        for lang in languages:
            filters.append(
                models.FieldCondition(
                    key="lang",
                    match=models.MatchValue(
                        value=language_mapping[lang],
                    ),
                )
            )

    # perform search and get results
    results = qdrant_client.search(
        collection_name=COLLECTION,
        query_filter=models.Filter(should=filters),
        search_params=models.SearchParams(hnsw_ef=128, exact=False),
        query_vector=query_embedding,
        limit=num_results,
    )
    return results
83
+
84
+
85
def cross_lingual_document_search(
    user_input: str, num_results: int, languages, text_match
) -> List:
    """Run a cross-lingual search over the wiki collection for *user_input*.

    Embeds the query, retrieves matches (optionally filtered by *languages*
    and exact-text matching via *text_match*), and returns the matched
    document texts, padded with empty strings so the result always has
    exactly *num_results* entries.

    Args:
        user_input: The user's search query.
        num_results: Number of entries the caller expects back.
        languages: List of UI language labels to filter results by.
        text_match: Whether to require an exact text match during search.

    Returns:
        A list of *num_results* document texts (possibly padded with "").
    """
    # Embed the query once; the helper also echoes the query string back.
    embedding, query = embed_user_query(user_input)

    hits = search_wiki_for_query(
        embedding,
        num_results,
        query,
        languages,
        text_match,
    )
    texts = [hit.payload["text"] for hit in hits]

    # Pad with empty strings so callers always get `num_results` entries.
    texts.extend("" for _ in range(num_results - len(texts)))

    return texts
124
+
125
def document_source(
    user_input: str, num_results: int, languages, text_match
) -> List:
    """Return the source URLs for the top wiki matches of *user_input*.

    Mirrors ``cross_lingual_document_search`` but extracts each match's
    ``url`` payload field instead of its text. The list is padded with empty
    strings so it always contains exactly *num_results* entries.
    """
    embedding, query = embed_user_query(user_input)

    hits = search_wiki_for_query(
        embedding,
        num_results,
        query,
        languages,
        text_match,
    )
    urls = [hit.payload["url"] for hit in hits]

    # Pad with empty strings so callers always get `num_results` entries.
    urls.extend("" for _ in range(num_results - len(urls)))

    return urls
147
+
148
+
149
def translate_search_result():
    """Translate a search result into the user's language.

    Not implemented yet — placeholder kept so the module's public API is
    stable for callers that already reference it.
    """
    pass
151
+
152
if __name__ == "__main__":
    # Manual smoke test: run a cross-lingual search and print the padded results.
    # query_embedding, user_query = embed_user_query("Who is the president of Nigeria")
    # result = search_wiki_for_query(query_embedding,user_query=user_query)

    # for item in result:
    #     print(item.payload["url"])
    result = cross_lingual_document_search("Who is the president of Nigeria",
        num_results=3,
        languages=["Yoruba"],
        text_match=False)
    print(result, len(result))