Spaces:

cdy3870
/

Fetch_App

Running

Fetch_App / offer_pipeline.py

Calvin

final touches

f94a42e 11 months ago

6.45 kB

	import streamlit as st
	from transformers import pipeline
	import pickle
	import os
	import pandas as pd
	import ast
	import string
	import re
	from sentence_transformers import SentenceTransformer, util

	st.set_page_config(
	    page_title="Offer Recommender",
	    layout="wide",
	)


	@st.cache_resource
	def _load_models():
	    '''
	    Build the zero-shot classifier and the sentence encoder once and cache them

	    Streamlit re-executes this script on every user interaction; without
	    st.cache_resource both models would be constructed again on each rerun.

	    Returns:
	        classifier : transformers zero-shot-classification pipeline
	        encoder (SentenceTransformer) : sentence embedding model
	    '''
	    classifier = pipeline(task="zero-shot-classification", model="valhalla/distilbart-mnli-12-3")
	    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	    return classifier, encoder


	# Download and cache models (module-level names are used by the functions below)
	pipe, model = _load_models()

	# Directory of csv files
	dire = "DS_NLP_search_data"

	# Use Streamlit caching to load data once
	@st.cache_data
	def get_processed_offers():
	    '''
	    Read the processed (zero-shot categorized) offers file and cache the result

	    Returns:
	        processed_offers (pd.DataFrame) : contents of processed_offers.csv with
	            every CATEGORY cell parsed by ast.literal_eval
	    '''
	    csv_path = os.path.join(dire, "processed_offers.csv")
	    processed_offers = pd.read_csv(csv_path)

	    # CATEGORY is read as text; turn each cell back into the Python literal it spells
	    parsed_categories = processed_offers["CATEGORY"].apply(ast.literal_eval)
	    processed_offers["CATEGORY"] = parsed_categories

	    return processed_offers


	@st.cache_data
	def get_categories_data():
	    '''
	    Read the raw category file and cache the result

	    Returns:
	        (pd.DataFrame) : contents of categories.csv, unmodified
	    '''
	    csv_path = os.path.join(dire, "categories.csv")

	    return pd.read_csv(csv_path)


	@st.cache_data
	def get_offers_data():
	    '''
	    Read the raw offers file and cache the result

	    Returns:
	        (pd.DataFrame) : contents of offer_retailer.csv, unmodified
	    '''
	    csv_path = os.path.join(dire, "offer_retailer.csv")

	    return pd.read_csv(csv_path)


	@st.cache_data
	def get_categories(cats_):
	    '''
	    Collect the distinct parent-category names and cache the result

	    Parameters:
	        cats_ (pd.DataFrame) : raw categories data with an IS_CHILD_CATEGORY_TO column

	    Returns:
	        categories (List) : unique IS_CHILD_CATEGORY_TO values, in order of first
	            appearance, with "Mature" left out
	    '''
	    excluded = ("Mature",)
	    unique_values = cats_["IS_CHILD_CATEGORY_TO"].unique()
	    categories = [value for value in unique_values if value not in excluded]

	    return categories


	def check_in_offer(search_str, offer_rets):
	    '''
	    Find the offers that contain the search text as whole word(s)

	    The search text and every offer are normalised the same way (lower-cased,
	    punctuation removed, split on whitespace) and then compared word by word,
	    so "target" matches "Spend $10 at Target!" while "tar" does not. A search
	    made of several words must appear as consecutive words in the offer.

	    Parameters:
	        search_str (string) : user text input
	        offer_rets (pd.DataFrame) : raw offer data with an "OFFER" column

	    Returns:
	        df (pd.DataFrame) : one "OFFER" column holding the matching offers in
	            their original text and input order; empty when nothing matches
	            or when the search text contains no word characters
	    '''

	    strip_punctuation = str.maketrans('', '', string.punctuation)

	    def _words(text):
	        # One normalisation for both sides, otherwise a query such as
	        # "Jerry's" could never equal the cleaned offer word "jerrys"
	        cleaned = text.lower().translate(strip_punctuation)
	        cleaned = re.sub(r'[^a-zA-Z0-9 \n\.]', '', cleaned)
	        return cleaned.split()

	    query = _words(search_str)
	    width = len(query)

	    offers = []
	    # A query without words must not match; the old split(" ") produced empty
	    # tokens for double spaces, which an empty query then "found"
	    if width:
	        for offer_str in offer_rets["OFFER"]:
	            if not isinstance(offer_str, str):
	                # Missing values (NaN) have no text to search
	                continue
	            words = _words(offer_str)
	            # Slide a window of the query's length over the offer's words
	            if any(words[i:i + width] == query for i in range(len(words) - width + 1)):
	                offers.append(offer_str)

	    df = pd.DataFrame({"OFFER": offers})

	    return df


	def is_retailer(search_str, threshold=0.5):
	    '''
	    Determine if the text input is highly likely to be a retailer

	    The text is classified against the labels "brand", "retailer" and "item";
	    it counts as a retailer only when "retailer" is the first label returned
	    by the classifier and its score is above the threshold.

	    Parameters:
	        search_str (string) : user text input
	        threshold (float) : minimum score the "retailer" label must exceed

	    Returns:
	        is_ret (boolean) : true if retailer, false otherwise (always false
	            for empty or whitespace-only input)
	    '''

	    # Nothing to classify: skip the model call instead of scoring an empty string
	    if not search_str or not search_str.strip():
	        return False

	    processed_search_str = search_str.lower().capitalize()
	    labels = pipe(
	        processed_search_str,
	        candidate_labels=["brand", "retailer", "item"],
	    )

	    # Index 0 is treated as the classifier's top-ranked label
	    top_label = labels["labels"][0]
	    top_score = labels["scores"][0]
	    is_ret = bool(top_label == "retailer" and top_score > threshold)

	    return is_ret


	def perform_cat_inference(search_str, categories, cats, processed_offers, n_parents=3, n_children=3):
	    '''
	    Perform zero shot learning twice and return the offers relevant to the child categories

	    First the text is scored against the parent categories; the product
	    categories belonging to the best parents are then scored in a second
	    pass, and offers tagged with any of the best product categories are kept.

	    Parameters:
	        search_str (string) : user text input
	        categories (List) : parent category names used as candidate labels
	        cats (pd.DataFrame) : raw category data with IS_CHILD_CATEGORY_TO and
	            PRODUCT_CATEGORY columns
	        processed_offers (pd.DataFrame) : processed offer data with OFFER and
	            CATEGORY columns (CATEGORY holding an iterable of category names)
	        n_parents (int) : how many top parent categories to expand (default 3)
	        n_children (int) : how many top child categories select offers (default 3)

	    Returns:
	        offers (pd.DataFrame) : relevant offers; reset_index() keeps the original
	            row labels in an "index" column next to "OFFER"
	        labels (dict) : parent categories and their probability scores
	        labels_2 (dict) : child categories and their probability scores
	    '''

	    labels = pipe(
	        search_str,
	        candidate_labels=categories,
	    )
	    top_parents = labels["labels"][:n_parents]

	    # Product categories that sit under any of the best-scoring parents
	    under_top_parents = cats[cats["IS_CHILD_CATEGORY_TO"].isin(top_parents)]
	    filtered_cats = list(under_top_parents["PRODUCT_CATEGORY"].unique())

	    labels_2 = pipe(
	        search_str,
	        candidate_labels=filtered_cats,
	    )

	    # Build the lookup set once rather than once per offer row
	    top_labels = set(labels_2["labels"][:n_children])
	    is_relevant = processed_offers["CATEGORY"].apply(
	        lambda offer_cats: bool(top_labels.intersection(offer_cats))
	    )
	    offers = processed_offers[is_relevant]["OFFER"].reset_index()

	    return offers, labels, labels_2


	def sort_by_similarity(search_str, related_offers, top_k=20):
	    '''
	    Use sentence embeddings to evaluate the similarity of relevant offers to the text input

	    Parameters:
	        search_str (string) : user text input
	        related_offers (pd.DataFrame) : relevant offers, read from its "OFFER" column
	        top_k (int) : maximum number of offers to return (default 20)

	    Returns:
	        df (pd.DataFrame) : columns "OFFER" and "scores" (cosine similarity to the
	            text input), most similar first, duplicates removed, at most top_k rows
	    '''

	    # De-duplicate while keeping first-seen order; skip non-text cells (e.g. NaN)
	    offers = [offer for offer in dict.fromkeys(related_offers["OFFER"]) if isinstance(offer, str)]
	    if not offers:
	        # Nothing to rank, so avoid touching the model at all
	        return pd.DataFrame({"OFFER": [], "scores": []})

	    embedding_1 = model.encode(search_str, convert_to_tensor=True)
	    # Encode every offer in one batched call instead of one model call per offer
	    offer_embeddings = model.encode(offers, convert_to_tensor=True)

	    # Result has one row (the query) and one column per offer
	    scores = util.pytorch_cos_sim(embedding_1, offer_embeddings)[0].tolist()

	    ranked = sorted(zip(offers, scores), key=lambda pair: pair[1], reverse=True)[:top_k]
	    df = pd.DataFrame({
	        "OFFER": [offer for offer, _ in ranked],
	        "scores": [score for _, score in ranked],
	    })

	    return df


	def main():
	    '''
	    Draw the search page and, once Search is pressed, list the matching offers

	    Offers containing the search text are shown first. The remaining offers are
	    then ranked by embedding similarity: against every offer when the text is
	    judged to be a retailer, otherwise only against offers picked by category
	    inference (whose parent/child scores are displayed as well).
	    '''
	    # Layout plus the cached data sets
	    col_1, col_2, _ = st.columns(3)
	    search_str = col_1.text_input("Enter a retailer, brand, or category").capitalize()
	    processed_offers = get_processed_offers()
	    cats = get_categories_data()
	    offer_rets = get_offers_data()
	    categories = get_categories(cats)

	    if not col_1.button("Search", type="primary"):
	        return

	    def show_offers(frame):
	        # Table when there are rows, placeholder text otherwise
	        if len(frame):
	            col_2.table(frame)
	        else:
	            col_2.write("None found")

	    def top_five(result):
	        # First five label/score pairs of a classifier result
	        return pd.DataFrame({"labels": result["labels"][:5], "scores": result["scores"][:5]})

	    # Offers where the text is directly in it
	    retail = is_retailer(search_str)
	    direct_offers = check_in_offer(search_str, offer_rets)
	    col_2.write("Directly related offers")
	    show_offers(direct_offers)

	    already_shown = list(direct_offers["OFFER"])

	    if not retail:
	        # Not a retailer: narrow the candidates with zero shot category inference
	        candidates, labels_1, labels_2 = perform_cat_inference(search_str, categories, cats, processed_offers)
	        related_offers = candidates[~candidates["OFFER"].isin(already_shown)]

	        col_2.write("Parent categories probabilities")
	        col_2.table(top_five(labels_1))
	        col_2.write("Child categories probabilities")
	        col_2.table(top_five(labels_2))
	    else:
	        # Retailer: every offer not already listed is compared by sentence embedding
	        related_offers = offer_rets[~offer_rets["OFFER"].isin(already_shown)]

	    col_2.write("Other related offers")
	    sorted_offers = sort_by_similarity(search_str, related_offers)
	    show_offers(sorted_offers)


	# Run the app when the file is executed as a script
	if __name__ == "__main__":
	    main()