# -*- coding: utf-8 -*-
"""multilingual_Semantic_Search.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Wg8tD1NJqY0lnvSnsZQhB66pAvxSu65h

# Multilingual Semantic Search

Language models give computers the ability to search by meaning and go beyond
searching by matching keywords. This capability is called semantic search.

![Searching an archive using sentence embeddings](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/basic-semantic-search-overview.png?3)

In this notebook, we'll build a simple semantic search engine. The
applications of semantic search go beyond building a web search engine. They
can empower a private search engine for internal documents or records. It can
also be used to power features like StackOverflow's "similar questions"
feature.

1. Get the archive of texts
2. [Embed](https://docs.cohere.ai/embed-reference/) the archive
3. Search using an index and nearest neighbor search
4. Visualize the archive based on the embeddings
"""

# Dependencies: Cohere for embeddings, umap-learn to reduce embeddings to
# 2 dimensions, Altair for visualization, Annoy for approximate nearest
# neighbor search, Gradio for the demo UI.
# Shell magics are not valid Python, so the install command is kept as a
# comment; run it once in a notebook cell (prefixed with `!`) or a shell:
#   pip install cohere umap-learn altair annoy datasets tqdm gradio
# NOTE: do not `pip install umap` -- `import umap` below is expected to come
# from the `umap-learn` distribution; the PyPI project named `umap` is a
# different package.

"""Get your Cohere API key by [signing up here](https://os.cohere.ai/register).
Provide it through the `COHERE_API_KEY` environment variable, or type it at
the prompt below."""

#@title Import libraries (Run this cell to execute required code) {display-mode: "form"}

import os
import re
import warnings
from getpass import getpass

import altair as alt
import cohere
import numpy as np
import pandas as pd
import umap
from annoy import AnnoyIndex
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

# Embedding model shared by the archive and by user queries. Both must use
# the same model, otherwise their vectors are not comparable.
EMBED_MODEL = "multilingual-22-12"

# Spreadsheet holding the news archive (Colab upload location).
DATA_PATH = "/content/news_articles_dataset.xlsx"

"""You'll need your API key for this next cell. [Sign up to Cohere](https://os.cohere.ai/) and get one if you haven't yet."""

# Never commit an API key to source control. The key is read from the
# environment, falling back to an interactive (non-echoing) prompt.
# SECURITY: an earlier revision of this file embedded a literal key; that key
# must be treated as leaked and revoked in the Cohere dashboard.
api_key = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")

# Create the Cohere client used for every embedding request below.
co = cohere.Client(api_key)

"""## 1. Get The Archive of News Articles

We'll load a spreadsheet of news articles and combine each article's title and
body into a single text field that will be embedded.
"""

# Load the archive into a pandas dataframe and preview it.
df = pd.read_excel(DATA_PATH)
df.head()

df.columns

# Combine the title and body columns into one text per article.
# NOTE: the 'Title ' column name really does end with a space.
# astype(str) keeps the join from failing on empty / non-string cells.
cols = ['Title ', 'News']
df['text'] = df[cols].apply(lambda row: ' \n '.join(row.values.astype(str)), axis=1)

df['text'].head()

"""## 2. Embed the archive

The next step is to embed the text of the articles.

![embedding archive texts](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/semantic-search-embed-text-archive.png)

Getting a thousand embeddings of this length should take about fifteen seconds.
"""

# Get the embeddings. truncate="LEFT" asks the API to shorten over-long
# inputs instead of rejecting them.
embeds = co.embed(texts=list(df['text']),
                  model=EMBED_MODEL,
                  truncate="LEFT").embeddings

# Check the dimensions of the embeddings
embeds = np.array(embeds)
print(embeds.shape)

print(embeds)

print(df['text'][0])

print(embeds[0])

print(embeds.shape)

"""## 3. Search using an index and nearest neighbor search

![Building the search index from the embeddings](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/semantic-search-index.png)

Let's now use [Annoy](https://github.com/spotify/annoy) to build an index that
stores the embeddings in a way that is optimized for fast search. This
approach scales well to a large number of texts (other options include
[Faiss](https://github.com/facebookresearch/faiss),
[ScaNN](https://github.com/google-research/google-research/tree/master/scann),
and [PyNNDescent](https://github.com/lmcinnes/pynndescent)).

After building the index, we can use it to retrieve the nearest neighbors
either of existing articles (section 3.1), or of new queries that we embed
(section 3.2).
"""

# Create the search index, pass the size of embedding
search_index = AnnoyIndex(embeds.shape[1], 'angular')
print(search_index)

# Add all the vectors to the search index; the item id is the row position,
# so ids returned by the index can be used directly with df.iloc.
for i, vector in enumerate(embeds):
    search_index.add_item(i, vector)

print(search_index)
search_index.build(10)  # 10 trees
search_index.save('test.ann')

"""### 3.1. Find the neighbors of an example from the dataset

If we're only interested in measuring the distance between the texts in the
dataset (no outside queries), a simple way is to calculate the distance
between every pair of embeddings we have.
"""

# Choose an example (we'll retrieve others similar to it)
example_id = 5

# Retrieve nearest neighbors as a pair: (item ids, distances)
similar_item_ids = search_index.get_nns_by_item(example_id, 10,
                                                include_distances=True)

# Format and print the text and distances. The example itself is removed from
# its own neighbor list; errors="ignore" covers the case where the
# approximate search did not return it.
results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['text'],
                             'distance': similar_item_ids[1]}
                       ).drop(example_id, errors="ignore")

print(f"Question:'{df.iloc[example_id]['text']}'\nNearest neighbors:")
results

"""### 3.2. Find the neighbors of a user query

We're not limited to searching using existing items. If we get a query, we can
embed it and find its nearest neighbors from the dataset.
"""

# Example queries:
#   "skin care ayurveda"
#   "how much money did skin care ayurveda raise"
#   "semelso wife arrest"
#   "avatar 2 movie collection"
#   "బాలయ్య మాస్ ట్రీట్"


def multilingual_semantic_search(query, top_k=10):
    """Return the archive entries closest in meaning to *query* as text.

    The query is embedded with the same model as the archive and looked up in
    the module-level Annoy ``search_index``; the matching rows of ``df`` are
    rendered as one display string, which is also printed.

    Args:
        query: Free-text search query.
        top_k: Number of neighbors to retrieve from the index.

    Returns:
        A string with one "Title / Short News" entry per retrieved row, in
        the order returned by the index. A blank query yields an empty string
        without contacting the embedding service.
    """
    if not query or not query.strip():
        return ""

    # Get the query's embedding
    query_embed = co.embed(texts=[query],
                           model=EMBED_MODEL,
                           truncate="LEFT").embeddings

    # Retrieve the nearest neighbors (row positions in df)
    neighbor_ids = search_index.get_nns_by_vector(query_embed[0], top_k)
    hits = df.iloc[neighbor_ids]

    # f-strings tolerate non-string cells (e.g. NaN) that "+" would reject.
    entries = [
        f"Title: {title} \n Short News: {news}\n\n"
        for title, news in zip(hits['Title '], hits['News'])
    ]
    # The single leading space matches the output of earlier revisions.
    response = " " + "".join(entries)

    print(response)
    return response


multilingual_semantic_search("is messi the best footballer of all time?")

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("🌍 This app uses a multilingual semantic model from COhere to 🚀 revolutionize the media and news industry in multilingual markets like India, allowing anyone to track 📰 regional news in real-time without the need for translation or understanding of other regional languages. 🙌")
    name = gr.Textbox(label="*Semantic search enable! Search for a news...")
    output = gr.Textbox(label="Semantic search results")
    greet_btn = gr.Button("Search")
    # NOTE(review): this is a plain assignment and is not passed to any
    # Gradio call, so it does not appear to style the app -- confirm intent.
    theme = "darkpeach"
    greet_btn.click(fn=multilingual_semantic_search, inputs=name, outputs=output)

demo.launch()

"""## 4. Visualizing the archive

Finally, let's plot out all the texts onto a 2D chart so you're able to
visualize the semantic similarities of this dataset!
"""

#@title Plot the archive {display-mode: "form"}

# UMAP reduces the embeddings to 2 dimensions that we can plot
reducer = umap.UMAP(n_neighbors=20)
umap_embeds = reducer.fit_transform(embeds)

# Prepare the data to plot and interactive visualization
# using Altair
df_explore = pd.DataFrame(data={'text': df['text']})
df_explore['x'] = umap_embeds[:, 0]
df_explore['y'] = umap_embeds[:, 1]

# Plot; zero=False lets each axis fit the data instead of including 0.
chart = alt.Chart(df_explore).mark_circle(size=60).encode(
    x=alt.X('x', scale=alt.Scale(zero=False)),
    y=alt.Y('y', scale=alt.Scale(zero=False)),
    tooltip=['text'],
).properties(
    width=700,
    height=400,
)
chart.interactive()

"""Hover over the points to read the text. Do you see some of the patterns in
clustered points? Similar articles, or articles about similar topics?

This concludes this introductory guide to semantic search using sentence
embeddings. As you continue the path of building a search product additional
considerations arise (like dealing with long texts, or finetuning to better
improve the embeddings for a specific use case).

We can’t wait to see what you start building! Share your projects or find
support at [community.cohere.ai](https://community.cohere.ai).
"""