Spaces:
Configuration error
Configuration error
File size: 3,610 Bytes
5b81931 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
#from dotenv import load_dotenv
from annoy import AnnoyIndex
import pandas as pd
import numpy as np
import cohere
import os
import plotly.express as px
import umap
import plotly.graph_objects as go
def get_key():
key = "7rMjNpj7LLTNlAcoR1Sc6cH23aURrBQoMPi9vzam"
#load_dotenv()
return key
def import_ds():
newsfiles = ['amharic','hausa','swahili','yoruba','igbo']
df_am = pd.read_csv(f'{newsfiles[0]}.csv')
df_am = df_am.sample(frac=0.5)
#df_en = pd.read_csv(f'{newsfiles[1]}.csv')
#df_en = df_en.sample(frac=0.3)
df_hs = pd.read_csv(f'{newsfiles[1]}.csv')
df_hs = df_hs.sample(frac=0.5)
df_sw = pd.read_csv(f'{newsfiles[2]}.csv')
df_sw = df_sw.sample(frac=0.5)
df_yr = pd.read_csv(f'{newsfiles[3]}.csv')
df_yr = df_yr.sample(frac=0.5)
df_ig = pd.read_csv(f'{newsfiles[4]}.csv')
df_ig = df_ig.sample(frac=0.5)
df_news = pd.concat([df_am,df_hs,df_sw,df_yr,df_ig],axis=0)
df_news = df_news.sample(frac = 1)
df_news = df_news[df_news['title'].notna()]
df_news = df_news.drop_duplicates("title")
df_news = df_news.sample(500)
return df_news
def getEmbeddings(co,df):
df['text'] = df['title'] + df['summary']
df = df.drop(['title','id','summary'],axis=1)
embeds = co.embed(texts=list(df['text']),model="multilingual-22-12",truncate="RIGHT").embeddings
embeds = np.array(embeds)
return embeds
def semantic_search(emb,indexfile):
emb = np.array(emb)
search_index = AnnoyIndex(emb.shape[1], 'angular')
print(emb.shape[1])
for i in range(len(emb)):
search_index.add_item(i, emb[i])
search_index.build(10)
search_index.save(indexfile)
def get_query_embed(co, query):
query_embed = co.embed(texts=[query],
model='multilingual-22-12',
truncate='right').embeddings
return np.array(query_embed)
def getClosestNeighbours(indexfile,query_embed,neighbours=15):
search_index = AnnoyIndex(768, 'angular')
search_index.load(indexfile)
# Retrieve the nearest neighbors
similar_item_ids = search_index.get_nns_by_vector(query_embed[0],neighbours,
include_distances=True)
return similar_item_ids
def display_news(df,similar_item_ids):
# Format the results
#print(similar_item_ids)
results = pd.DataFrame(data={'title': df.iloc[similar_item_ids[0]]['title'],
'url': df.iloc[similar_item_ids[0]]['url'],
'summary': df.iloc[similar_item_ids[0]]['summary']})
#'distance': similar_item_ids[1]})
results.reset_index(drop=True, inplace=True)
return results
def getUMAPEmbed(embeds):
# Map the nearest embeddings to 2d
reducer = umap.UMAP(n_neighbors=20)
return reducer.fit_transform(embeds)
def plot2DChart(df, umap_embeds, clusters=None):
if clusters is None:
clusters = {}
df_viz = pd.DataFrame(data={'url': df['url'], 'title': df['title']})
df_viz['x'] = umap_embeds[:, 0]
df_viz['y'] = umap_embeds[:, 1]
#print(df_explore)
# Plot
fig = px.scatter(df_viz, x='x', y='y', hover_data=['title'])
fig.data = fig.data[::-1]
return fig
if __name__ == '__main__':
key = get_key()
co = cohere.Client(key)
df_news = import_ds()
embed = process(co,df_news)
semantic_search(embed)
getClosestNeighbours(df_news) |