kaledarshan committed · Commit 6d364c2 · Parent(s): 4d01d7c

Create app.py

app.py ADDED
# -*- coding: utf-8 -*-
"""multilingual_Semantic_Search.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Wg8tD1NJqY0lnvSnsZQhB66pAvxSu65h

# Multilingual Semantic Search
Language models give computers the ability to search by meaning, going beyond matching keywords. This capability is called semantic search.

![Searching an archive using sentence embeddings](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/basic-semantic-search-overview.png?3)

In this notebook, we'll build a simple semantic search engine. The applications of semantic search go beyond building a web search engine: it can power a private search engine for internal documents or records, or a feature like StackOverflow's "similar questions".

1. Get the archive of questions
2. [Embed](https://docs.cohere.ai/embed-reference/) the archive
3. Search using an index and nearest neighbor search
4. Visualize the archive based on the embeddings
"""

# Install Cohere for embeddings, UMAP to reduce embeddings to 2 dimensions,
# Altair for visualization, Annoy for approximate nearest neighbor search
# (openpyxl is needed by pandas.read_excel for .xlsx files)
#!pip install cohere umap-learn altair annoy datasets tqdm openpyxl

"""Get your Cohere API key by [signing up here](https://os.cohere.ai/register). Set it as the COHERE_API_KEY environment variable (see below)."""

#@title Import libraries (Run this cell to execute required code) {display-mode: "form"}

import os

import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

"""You'll need your API key for this next cell. [Sign up to Cohere](https://os.cohere.ai/) and get one if you haven't yet."""

# Read the API key from the environment instead of hardcoding it in source
# (the original pasted a live key inline; never share a key publicly).
# On Hugging Face Spaces, set COHERE_API_KEY as a repository secret.
api_key = os.environ.get("COHERE_API_KEY")

# Create a Cohere client (keys are issued at os.cohere.ai)
co = cohere.Client(api_key)

"""## 1. Get the Archive of Questions
The original notebook used the [trec](https://www.tensorflow.org/datasets/catalog/trec) dataset, which is made up of questions and their categories; here we load a spreadsheet of news articles instead.
"""

# # Get dataset
# dataset = load_dataset("trec", split="train")

# # Import into a pandas dataframe, take only the first 1000 rows
# df = pd.DataFrame(dataset)[:1000]

# # Preview the data to ensure it has loaded correctly
# df.head(10)

# Load the news dataset. The "/content/..." path is Colab-specific; adjust it
# to wherever news_articles_dataset.xlsx lives in your environment.
# On read errors, see:
# https://www.shanelynn.ie/pandas-csv-error-error-tokenizing-data-c-error-eof-inside-string-starting-at-line/
df = pd.read_excel("/content/news_articles_dataset.xlsx")

df.head()

df.columns
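
# Note: the column names used below ('Title ' with a trailing space, and
# 'News') are assumptions about this particular spreadsheet. A quick sanity
# check before relying on them:
expected_cols = ['Title ', 'News']
missing = [c for c in expected_cols if c not in df.columns]
assert not missing, f"Missing expected columns: {missing} (found: {list(df.columns)})"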

# Combine the title and body columns into a single searchable text field
cols = ['Title ', 'News']
df['text'] = df[cols].apply(lambda row: ' \n '.join(row.values.astype(str)), axis=1)
df['text'].head()

"""## 2. Embed the archive
The next step is to embed the text of the articles.

![embedding archive texts](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/semantic-search-embed-text-archive.png)

Getting a thousand embeddings of this length should take about fifteen seconds.
"""

# Get the embeddings
embeds = co.embed(texts=list(df['text']),
                  model="multilingual-22-12",
                  truncate="LEFT").embeddings

# Check the dimensions of the embeddings
embeds = np.array(embeds)
print(embeds.shape)
print(df['text'][0])
print(embeds[0])
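
# For archives much larger than this one, embedding in batches keeps request
# sizes manageable. A minimal sketch (the batch size of 96 is an arbitrary
# choice, not a documented limit):
# batch_size = 96
# all_embeds = []
# for start in tqdm(range(0, len(df), batch_size)):
#     batch = list(df['text'][start:start + batch_size])
#     all_embeds.extend(co.embed(texts=batch,
#                                model="multilingual-22-12",
#                                truncate="LEFT").embeddings)
# embeds = np.array(all_embeds)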

"""## 3. Search using an index and nearest neighbor search
![Building the search index from the embeddings](https://github.com/cohere-ai/notebooks/raw/main/notebooks/images/semantic-search-index.png)
Let's now use [Annoy](https://github.com/spotify/annoy) to build an index that stores the embeddings in a way that is optimized for fast search. This approach scales well to a large number of texts (other options include [Faiss](https://github.com/facebookresearch/faiss), [ScaNN](https://github.com/google-research/google-research/tree/master/scann), and [PyNNDescent](https://github.com/lmcinnes/pynndescent)).

After building the index, we can use it to retrieve the nearest neighbors either of existing questions (section 3.1), or of new questions that we embed (section 3.2).
"""

# Create the search index, passing the size of the embeddings
search_index = AnnoyIndex(embeds.shape[1], 'angular')

# Add all the vectors to the search index
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i])

# More trees give higher precision at the cost of a larger index
search_index.build(10)  # 10 trees
search_index.save('test.ann')
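
# The saved index can be reloaded later (e.g. on app restart) without
# re-embedding the archive; the dimensionality and metric must match
# what was used to build it:
loaded_index = AnnoyIndex(embeds.shape[1], 'angular')
loaded_index.load('test.ann')  # memory-maps the saved file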

"""### 3.1. Find the neighbors of an example from the dataset
If we're only interested in measuring the distances between the texts already in the dataset (no outside queries), a simple way is to calculate the similarity between every pair of embeddings we have, as sketched below; the Annoy index handles the same job more scalably.
"""
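
# A minimal pairwise version with scikit-learn (this is what the
# cosine_similarity import above is for). It's fine at this scale but
# O(n^2) in time and memory, which is why we use the Annoy index instead.
pairwise_similarities = cosine_similarity(embeds)
print(pairwise_similarities.shape)  # (n_texts, n_texts)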

# Choose an example (we'll retrieve others similar to it)
example_id = 5

# Retrieve nearest neighbors
similar_item_ids = search_index.get_nns_by_item(example_id, 10,
                                                include_distances=True)
# Format and print the text and distances
results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['text'],
                             'distance': similar_item_ids[1]}).drop(example_id)

print(f"Text:'{df.iloc[example_id]['text']}'\nNearest neighbors:")
print(results)

"""### 3.2. Find the neighbors of a user query
We're not limited to searching using existing items. If we get a query, we can embed it and find its nearest neighbors in the dataset.
"""

# Example queries:
# query = "skin care ayurveda"
# query = "how much money did skin care ayurveda raise"
# query = "semelso wife arrest"
# query = "avatar 2 movie collection"
# query = "బాలయ్య మాస్ ట్రీట్"  # (Telugu: "Balayya mass treat")

def multilingual_semantic_search(query):
    # e.g. query = "is messi the best footballer of all time?"

    # Get the query's embedding
    query_embed = co.embed(texts=[query],
                           model="multilingual-22-12",
                           truncate="LEFT").embeddings

    # Retrieve the nearest neighbors
    similar_item_ids = search_index.get_nns_by_vector(query_embed[0], 10,
                                                      include_distances=True)

    # Format the results as a dataframe (handy for inspection)
    results = pd.DataFrame(data={'title': df.iloc[similar_item_ids[0]]['Title '],
                                 'news': df.iloc[similar_item_ids[0]]['News'],
                                 'distance': similar_item_ids[1]})

    # A structured (dict/JSON) response would also work:
    # response = {i: {"title": df.iloc[i]['Title '], "news": df.iloc[i]['News']}
    #             for i in similar_item_ids[0]}

    # Build a plain-text response for the UI; str() guards against
    # non-string cells in the spreadsheet
    response = ""
    for i in similar_item_ids[0]:
        response += ("Title: " + str(df.iloc[i]['Title ']) + " \n "
                     + "Short News: " + str(df.iloc[i]['News']) + "\n\n")

    print(response)
    return response

multilingual_semantic_search("is messi the best footballer of all time?")

#!pip install gradio
import gradio as gr

# A one-liner UI would also work:
# demo = gr.Interface(fn=multilingual_semantic_search, inputs="text", outputs="text")

with gr.Blocks() as demo:
    gr.Markdown("🌍 This app uses a multilingual semantic model from Cohere to 🚀 revolutionize the media and news industry in multilingual markets like India, allowing anyone to track 📰 regional news in real time without needing to translate or understand other regional languages. 🙌")
    query_box = gr.Textbox(label="Semantic search enabled! Search for news...")
    output = gr.Textbox(label="Semantic search results")
    search_btn = gr.Button("Search")
    # The original assigned theme="darkpeach" here, which has no effect:
    # a bare variable inside Blocks configures nothing.
    search_btn.click(fn=multilingual_semantic_search, inputs=query_box, outputs=output)

demo.launch()
"""## 4. Visualizing the archive
Finally, let's plot all the texts on a 2D chart so you can visualize the semantic similarities in this dataset!
"""

#@title Plot the archive {display-mode: "form"}

# UMAP reduces the embeddings to 2 dimensions that we can plot
reducer = umap.UMAP(n_neighbors=20)
umap_embeds = reducer.fit_transform(embeds)

# Prepare the data for an interactive Altair visualization
df_explore = pd.DataFrame(data={'text': df['text']})
df_explore['x'] = umap_embeds[:, 0]
df_explore['y'] = umap_embeds[:, 1]

# Plot
chart = alt.Chart(df_explore).mark_circle(size=60).encode(
    x=alt.X('x', scale=alt.Scale(zero=False)),
    y=alt.Y('y', scale=alt.Scale(zero=False)),
    tooltip=['text']
).properties(
    width=700,
    height=400
)
chart.interactive()
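
# In a plain script the chart isn't displayed automatically (it renders
# inline in notebooks). One option, assuming an HTML file is acceptable
# output, is to save it:
# chart.interactive().save('archive_map.html')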

"""Hover over the points to read the text. Do you see patterns in the clustered points, such as similar questions or questions about similar topics?

This concludes this introductory guide to semantic search using sentence embeddings. As you continue building a search product, additional considerations arise (like dealing with long texts, or fine-tuning the embeddings for a specific use case).

We can't wait to see what you start building! Share your projects or find support at [community.cohere.ai](https://community.cohere.ai).
"""