Spaces:

subashdvorak
/

trygithubactions

Sleeping

App Files Files Community

trygithubactions / src /genai /ideation_agent /utils /tools.py

subashpoudel

Next commit

46178b9 5 months ago

raw

history blame contribute delete

2.81 kB


	from langchain_core.tools import tool
	from .state import QueryFormatter
	import pandas as pd
	import numpy as np
	import ast
	import faiss
	import tiktoken
	from src.genai.utils.models_loader import embedding_model
	from src.genai.utils.data_loader import caption_index , caption_df, ideas_index , ideas_df
	from src.genai.utils.utils import clean_text


	class Retrieval:
	    """Look up influencer captions and content ideas similar to a business.

	    The business details are embedded once at construction time and the
	    resulting L2-normalised query vector is reused by every search method.

	    NOTE(review): the score threshold in ``influencers_data`` treats the
	    values returned by ``caption_index.search`` as similarities (higher is
	    better), which presumably means the indexes are inner-product indexes
	    built on normalised vectors -- confirm against ``data_loader``.
	    """

	    def __init__(self, business_details):
	        """Embed ``business_details`` into a normalised (1, dim) float32 query.

	        Args:
	            business_details: Any object describing the business; it is
	                converted with ``str()`` before being embedded.
	        """
	        self.business_details = business_details
	        embedding = embedding_model.embed_query(str(business_details))
	        self.query_embedding = np.array(embedding).reshape(1, -1).astype('float32')
	        # normalize_L2 works in place on the array created above.
	        faiss.normalize_L2(self.query_embedding)

	    @staticmethod
	    def _valid_hits(indices, scores, min_score=None):
	        """Pair up search results, dropping padding and low-scoring hits.

	        FAISS fills the label array with -1 when it finds fewer than ``k``
	        neighbours; passing such a label to ``DataFrame.iloc`` would silently
	        select the *last* row, so those entries are removed here.

	        Args:
	            indices: One row of labels returned by ``index.search``.
	            scores: The matching row of scores.
	            min_score: Optional lower bound (inclusive) on the score.

	        Returns:
	            A list of ``(row_position, score)`` tuples in the original order.
	        """
	        return [
	            (int(idx), float(score))
	            for idx, score in zip(indices, scores)
	            if idx >= 0 and (min_score is None or score >= min_score)
	        ]

	    @staticmethod
	    def _format_influencer(rank, row):
	        """Return the three description lines for one influencer row."""
	        return [
	            f"[{rank}]. The influencer name is: {row['username']} — Likes: {row['likesCount']}, Comments: {row['commentCount']}",
	            f"The branding or promotion done is:\n{row['visible_texts_or_brandings']}",
	            f"The details of product or service is:\n{row['product_or_service_details']}"
	        ]

	    @staticmethod
	    def _format_idea(rank, row):
	        """Return the single-line description for one idea row."""
	        return [
	            f"Idea [{rank}]: {row['idea']}\n",
	        ]

	    def influencers_data(self, similarity_threshold=0.35, max_tokens=100) -> str:
	        """Describe influencers whose captions match the business.

	        Every caption is scored against the query; those scoring at least
	        ``similarity_threshold`` are formatted, cleaned with ``clean_text``
	        and the result is truncated to ``max_tokens`` tokens of the
	        ``gpt-4o-mini`` encoding.

	        Args:
	            similarity_threshold: Minimum score (inclusive) for a caption to
	                be kept. Defaults to the previously hard-coded 0.35.
	            max_tokens: Maximum number of tokens kept in the returned text.
	                Defaults to the previously hard-coded 100.

	        Returns:
	            The trimmed description text, or ``"No influencers found."`` when
	            nothing matches (including when there are no captions at all).
	        """
	        top_k = len(caption_df)
	        if top_k == 0:
	            # FAISS rejects k == 0, and there is nothing to search anyway.
	            return "No influencers found."

	        distances, indices = caption_index.search(self.query_embedding, top_k)
	        selected = self._valid_hits(indices[0], distances[0], similarity_threshold)

	        if not selected:
	            return "No influencers found."

	        # === Format results ===
	        outer_list = [
	            self._format_influencer(rank, caption_df.iloc[idx])
	            for rank, (idx, _sim) in enumerate(selected, 1)
	        ]

	        cleaned_response = clean_text(str(outer_list))
	        encoding = tiktoken.encoding_for_model('gpt-4o-mini')
	        tokens = encoding.encode(cleaned_response)
	        # Keep only the leading tokens so the text stays within budget.
	        return encoding.decode(tokens[:max_tokens])

	    def imdb_ideas(self, top_k=4) -> str:
	        """Return the ideas closest to the business as cleaned text.

	        Args:
	            top_k: Number of neighbours requested from ``ideas_index``.
	                Defaults to the previously hard-coded 4.

	        Returns:
	            ``clean_text`` applied to the string form of the formatted ideas.
	            Fewer than ``top_k`` ideas are listed when the index returns
	            padding entries instead of real neighbours.
	        """
	        distances, indices = ideas_index.search(self.query_embedding, top_k)
	        hits = self._valid_hits(indices[0], distances[0])

	        outer_list = [
	            self._format_idea(rank, ideas_df.iloc[idx])
	            for rank, (idx, _sim) in enumerate(hits, 1)
	        ]

	        cleaned_response = clean_text(str(outer_list))
	        return str(cleaned_response)