Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*- | |
"""Task22.ipynb | |
Automatically generated by Colab. | |
Original file is located at | |
https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx | |
# Task 2 | |
- Raghad Al-Rasheed | |
- Fawwaz Alsheikhi | |
using the E5 model as the embedding model and translated dataset from huggingface | |
""" | |
"""## Downloading the Embedding model""" | |
from sentence_transformers import SentenceTransformer | |
import nltk | |
nltk.download('punkt') | |
from nltk.tokenize import word_tokenize | |
import math | |
from scipy import spatial | |
model = SentenceTransformer("intfloat/multilingual-e5-large") | |
"""## Downloading Translated data from english to arabic""" | |
from datasets import load_dataset | |
ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en",split="train") | |
import pandas as pd | |
df = pd.DataFrame(ds['translation']) | |
df['ar'] | |
df['ar'][0] | |
"""### Extracting the first 10000 rows out of the data""" | |
df=df.head(10000) | |
df['ar'].shape | |
documents =[doc for doc in df['ar']] | |
documents[9999] | |
"""## Embedding the sentences by rows""" | |
embeddings = model.encode(documents) | |
from sentence_transformers import SentenceTransformer | |
import nltk | |
nltk.download('punkt') | |
from nltk.tokenize import word_tokenize | |
import math | |
from scipy import spatial | |
import scipy | |
def semantic_search(query, embeddings, documents): | |
query_embedding = model.encode(query) | |
document_embeddings = embeddings | |
scores = [scipy.spatial.distance.cosine(query_embedding, doc) for doc in document_embeddings] | |
ls1 = list() | |
for i, score in enumerate(scores): | |
ls1.append([documents[i],score]) | |
print(scores.index(min(scores))) | |
most_similar_doc = documents[scores.index(min(scores))] | |
print("Most similar document", most_similar_doc) | |
return ls1 | |
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.",embeddings, documents) | |
documents[999] | |
"""### Extracting top three related sentences""" | |
ranked = sorted(output, key=lambda x: x[1]) | |
ranked[:3] | |
df | |
"""## using english with arabic to see the semantic search of multilangual model""" | |
df['ar'] | |
df['en'] | |
df_ar = df['ar'].tolist()[:5000] | |
df_en = df['en'].tolist()[:5000] | |
combined_list = df_ar + df_en | |
print(len(combined_list)) | |
embeddings1 = model.encode(combined_list) | |
def semantic_search(query): | |
query_embedding = model.encode(query) | |
document_embeddings = embeddings1 | |
scores = [scipy.spatial.distance.cosine(query_embedding, doc) for doc in document_embeddings] | |
ls1 = list() | |
for i, score in enumerate(scores): | |
ls1.append([combined_list[i],score]) | |
print(scores.index(min(scores))) | |
most_similar_doc = combined_list[scores.index(min(scores))] | |
print("Most similar document", most_similar_doc) | |
return ls1 | |
output = semantic_search("لذهب بعشرة آلاف دولار؟") | |
ranked = sorted(output, key=lambda x: x[1]) | |
ranked[:3] | |
import gradio as gr | |
demo = gr.Interface(fn=semantic_search,inputs = ["text"], outputs=["text", "text", "text"]) | |
if __name__ == "__main__": | |
demo.launch() | |