# kaledarshan's picture
# Update app.py
# 3fa6c66
# raw
# history blame contribute delete
# No virus
# 9.67 kB
# -*- coding: utf-8 -*-
"""multilingual_Semantic_Search.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Wg8tD1NJqY0lnvSnsZQhB66pAvxSu65h
# Multilingual Semantic Search
Language models give computers the ability to search by meaning and go beyond searching by matching keywords. This capability is called semantic search.
![Searching an archive using sentence embeddings](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/basic-semantic-search-overview.png?3)
In this notebook, we'll build a simple semantic search engine. The applications of semantic search go beyond building a web search engine. They can empower a private search engine for internal documents or records. It can also be used to power features like StackOverflow's "similar questions" feature.
1. Get the archive of questions
2. [Embed](https://docs.cohere.ai/embed-reference/) the archive
3. Search using an index and nearest neighbor search
4. Visualize the archive based on the embeddings
"""
# Install Cohere for embeddings, Umap to reduce embeddings to 2 dimensions,
# Altair for visualization, Annoy for approximate nearest neighbor search
#!pip install cohere umap-learn altair annoy datasets tqdm
"""Get your Cohere API key by [signing up here](https://os.cohere.ai/register). Paste it in the cell below."""
#pip install umap
#@title Import libraries (Run this cell to execute required code) {display-mode: "form"}
import os
import re
import warnings

import altair as alt
import cohere
import numpy as np
import pandas as pd
import umap
from annoy import AnnoyIndex
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)
"""You'll need your API key for this next cell. [Sign up to Cohere](https://os.cohere.ai/) and get one if you haven't yet."""
# Read the Cohere API key from the environment instead of hard-coding it.
# NOTE(security): a previous revision embedded a live API key in source;
# that key should be revoked and the secret supplied via the
# COHERE_API_KEY environment variable (e.g. a Space secret).
api_key = os.environ.get("COHERE_API_KEY", "")
# Create the Cohere client used for all embed calls below.
co = cohere.Client(api_key)
"""## 1. Get The Archive of Questions
We'll use the [trec](https://www.tensorflow.org/datasets/catalog/trec) dataset which is made up of questions and their categories.
"""
# Load the news-article archive from a local Excel file.
# (The original Colab notebook pulled the "trec" dataset via `datasets`;
# this app searches a curated multilingual news spreadsheet instead.)
df = pd.read_excel("news_articles_dataset.xlsx")
df.head()
df.columns
# Build a single searchable "text" field per row by joining the title and
# body columns with a newline separator.
# NOTE: the column name 'Title ' carries a trailing space — it must match
# the spreadsheet's header exactly; do not "fix" it here.
cols = ['Title ', 'News']
df['text'] = df[cols].apply(lambda row: ' \n '.join(row.values.astype(str)), axis=1)
df['text'].head()
"""## 2. Embed the archive
The next step is to embed the text of the questions.
![embedding archive texts](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/semantic-search-embed-text-archive.png)
To get a thousand embeddings of this length should take about fifteen seconds.
"""
# Embed every archive document with Cohere's multilingual model; inputs
# longer than the model limit are truncated from the left.
embeds = np.array(
    co.embed(
        texts=df['text'].tolist(),
        model="multilingual-22-12",
        truncate="LEFT",
    ).embeddings
)
# Sanity-check the embedding matrix and peek at the first document/vector.
print(embeds.shape)
print(embeds)
print(df['text'][0])
print(embeds[0])
print(embeds.shape)
"""## 3. Search using an index and nearest neighbor search
![Building the search index from the embeddings](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/semantic-search-index.png)
Let's now use [Annoy](https://github.com/spotify/annoy) to build an index that stores the embeddings in a way that is optimized for fast search. This approach scales well to a large number of texts (other options include [Faiss](https://github.com/facebookresearch/faiss), [ScaNN](https://github.com/google-research/google-research/tree/master/scann), and [PyNNDescent](https://github.com/lmcinnes/pynndescent)).
After building the index, we can use it to retrieve the nearest neighbors either of existing questions (section 3.1), or of new questions that we embed (section 3.2).
"""
# Build an Annoy index over the embeddings for fast approximate
# nearest-neighbour search ('angular' metric ~ cosine distance).
search_index = AnnoyIndex(embeds.shape[1], 'angular')
print(search_index)
# Register every embedding vector under its row index.
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)
print(search_index)
search_index.build(10)  # 10 trees
search_index.save('test.ann')
"""### 3.1. Find the neighbors of an example from the dataset
If we're only interested in measuring the distance between the questions in the dataset (no outside queries), a simple way is to calculate the distance between every pair of embeddings we have.
"""
# Pick one document from the archive and retrieve the ten items most
# similar to it (the query item itself is among them).
example_id = 5
similar_item_ids = search_index.get_nns_by_item(
    example_id, 10, include_distances=True)
neighbour_rows, neighbour_distances = similar_item_ids
# Tabulate neighbour texts with their distances, dropping the query row.
results = pd.DataFrame(
    data={'texts': df.iloc[neighbour_rows]['text'],
          'distance': neighbour_distances},
).drop(example_id)
print(f"Question:'{df.iloc[example_id]['text']}'\nNearest neighbors:")
results
"""### 3.2. Find the neighbors of a user query
We're not limited to searching using existing items. If we get a query, we can embed it and find its nearest neighbors from the dataset.
"""
# query = "skin care ayurveda"
# query = "how much money did skin care ayurveda raise"
# query = "semelso wife arrest"
# query = "avatar 2 movie collection"
# query = "బాలయ్య మాస్ ట్రీట్"
def multilingual_semantic_search(query):
    """Embed *query* and return the ten nearest news articles as text.

    Parameters
    ----------
    query : str
        Free-text query in any language supported by the
        "multilingual-22-12" Cohere embedding model.

    Returns
    -------
    str
        Human-readable block of "Title: ... / Short News: ..." entries,
        one per retrieved article, separated by blank lines.
    """
    # Embed the query with the same model used for the archive so both
    # vectors live in the same space.
    query_embed = co.embed(texts=[query],
                           model="multilingual-22-12",
                           truncate="LEFT").embeddings
    # Approximate nearest-neighbour lookup in the prebuilt Annoy index.
    item_ids, _distances = search_index.get_nns_by_vector(
        query_embed[0], 10, include_distances=True)
    # Assemble one text entry per hit; str.join avoids the quadratic `+=`
    # concatenation the original used, and the dead `response = {}` /
    # unused `results` DataFrame are removed.
    # NOTE: 'Title ' (trailing space) matches the spreadsheet's column
    # header exactly.
    response = "\n\n".join(
        "Title: " + df.iloc[i]['Title '] + " \n "
        + "Short News: " + df.iloc[i]['News']
        for i in item_ids
    ) + "\n\n"
    print(response)
    return response
# Smoke-test the search pipeline once at startup.
multilingual_semantic_search("is messi the best footballer of all time?")

import gradio as gr

# Minimal Gradio front end: a query box, a search button, and a results box.
# (The dead `theme="darkpeach"` assignment was removed — it was a plain
# local variable and had no effect on the Blocks layout.)
with gr.Blocks() as demo:
    gr.Markdown("🌍 This app uses a multilingual semantic model from Cohere to 🚀 revolutionize the media and news industry in multilingual markets like India, allowing anyone to track 📰 regional news in real-time without the need for translation or understanding of other regional languages. 🙌")
    name = gr.Textbox(label="*Semantic search enable! Search for a news...")
    output = gr.Textbox(label="Semantic search results")
    greet_btn = gr.Button("Search")
    # Wire the button to the search function: input textbox -> output textbox.
    greet_btn.click(fn=multilingual_semantic_search, inputs=name, outputs=output)

demo.launch()
"""## 4. Visualizing the archive
Finally, let's plot out all the questions onto a 2D chart so you're able to visualize the semantic similarities of this dataset!
"""
#@title Plot the archive {display-mode: "form"}
# Project the high-dimensional embeddings down to 2-D with UMAP so the
# whole archive can be drawn on a flat chart.
reducer = umap.UMAP(n_neighbors=20)
umap_embeds = reducer.fit_transform(embeds)
# Collect each document's text plus its 2-D coordinates for Altair.
df_explore = pd.DataFrame(data={'text': df['text']})
df_explore['x'] = umap_embeds[:, 0]
df_explore['y'] = umap_embeds[:, 1]
# Interactive scatter plot; hovering a point reveals the document text.
chart = alt.Chart(df_explore).mark_circle(size=60).encode(
    x=alt.X('x', scale=alt.Scale(zero=False)),
    y=alt.Y('y', scale=alt.Scale(zero=False)),
    tooltip=['text'],
).properties(
    width=700,
    height=400,
)
chart.interactive()
"""Hover over the points to read the text. Do you see some of the patterns in clustered points? Similar questions, or questions asking about similar topics?
This concludes this introductory guide to semantic search using sentence embeddings. As you continue the path of building a search product additional considerations arise (like dealing with long texts, or finetuning to better improve the embeddings for a specific use case).
We can’t wait to see what you start building! Share your projects or find support at [community.cohere.ai](https://community.cohere.ai).
"""