Spaces:

cdy3870
/

Fetch_App

Running

File size: 6,453 Bytes

14e17ef
 
 
 
 
 
 
 
ee79db1
14e17ef
 
 
 
 
 
f94a42e
 
ee79db1
 
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
14e17ef
 
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
 
14e17ef
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
 
14e17ef
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
f94a42e
14e17ef
 
f94a42e
14e17ef
f94a42e
 
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
 
f94a42e
14e17ef
 
 
f94a42e
14e17ef
 
f94a42e
14e17ef
f94a42e
 
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
 
f94a42e
 
 
 
14e17ef
 
f94a42e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
 
 
 
 
 
 
 
 
 
f94a42e
ee79db1
f94a42e
 
 
 
 
 
 
 
 
 
 
ee79db1
 
 
 
 
 
 
 
 
 
f94a42e
ee79db1
 
f94a42e
14e17ef
f94a42e
14e17ef
f94a42e
14e17ef
 
 
 
 
f94a42e
 
14e17ef
 
ee79db1
f94a42e
 
 
 
 
14e17ef
 
f94a42e
ee79db1
14e17ef
f94a42e
14e17ef
ee79db1
 
f94a42e
14e17ef
f94a42e
14e17ef
ee79db1
 
 
14e17ef
f94a42e
 
 
 
 
 
14e17ef

import streamlit as st
from transformers import pipeline
import pickle
import os
import pandas as pd
import ast
import string
import re
from sentence_transformers import SentenceTransformer, util

st.set_page_config(
	page_title="Offer Recommender",
	layout="wide"
)


@st.cache_resource
def _load_models():
	'''
	Build the zero-shot classifier and the sentence-embedding model once

	Streamlit re-executes this script on every interaction, so the models are
	cached as resources instead of being rebuilt on each rerun.

	Returns:
		classifier (Pipeline) : zero-shot classification pipeline
		embedder (SentenceTransformer) : sentence embedding model
	'''
	classifier = pipeline(task="zero-shot-classification", model="valhalla/distilbart-mnli-12-3")
	embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	return classifier, embedder


# Download and cache models
pipe, model = _load_models()

# Directory of csv files
dire = "DS_NLP_search_data"

# Use Streamlit caching to load data once
@st.cache_data
def get_processed_offers():
	'''
	Read the processed offers csv written by the exploration notebook and cache it

	Returns:
		processed_offers (pd.DataFrame) : zero-shot categorized offers, with CATEGORY parsed from its literal text
	'''
	csv_path = os.path.join(dire, "processed_offers.csv")
	frame = pd.read_csv(csv_path)

	# CATEGORY is read as text; turn each cell back into the Python literal it spells out
	parsed_categories = frame["CATEGORY"].apply(ast.literal_eval)
	frame["CATEGORY"] = parsed_categories

	return frame


@st.cache_data
def get_categories_data():
	'''
	Read categories.csv from the data directory and cache it

	Returns:
		cats (pd.DataFrame) : raw category data
	'''
	return pd.read_csv(os.path.join(dire, "categories.csv"))


@st.cache_data
def get_offers_data():
	'''
	Read offer_retailer.csv from the data directory and cache it

	Returns:
		offers (pd.DataFrame) : raw offers data
	'''
	csv_path = os.path.join(dire, "offer_retailer.csv")

	return pd.read_csv(csv_path)


@st.cache_data
def get_categories(cats_):
	'''
	Collect the distinct IS_CHILD_CATEGORY_TO values, leaving out "Mature", and cache them

	Parameters:
		cats_ (pd.DataFrame) : raw categories data

	Returns:
		categories (List) : unique IS_CHILD_CATEGORY_TO values without "Mature"
	'''
	excluded = ("Mature",)
	distinct_values = cats_["IS_CHILD_CATEGORY_TO"].unique()

	return [value for value in distinct_values if value not in excluded]


# Built once: deletes every ASCII punctuation character in a single pass
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Anything that is not a lowercase letter, digit or whitespace (applied after lowercasing)
_NON_ALNUM = re.compile(r'[^a-z0-9\s]')


def _offer_tokens(text):
	'''
	Lowercase the text, drop punctuation and other non-alphanumeric characters, and split it into words
	'''
	without_punct = text.lower().translate(_PUNCTUATION_TABLE)

	return _NON_ALNUM.sub('', without_punct).split()


def check_in_offer(search_str, offer_rets):
	'''
	Determine if the input text is directly in the offer with basic word matching

	The query is normalised exactly like the offers (lowercased, punctuation removed),
	and matches when its words appear as consecutive whole words of the offer. A blank
	query matches nothing, and offers that are not strings (e.g. missing values) are skipped.

	Parameters:
		search_str (string) : user text input
		offer_rets (pd.DataFrame) : raw offer data with an "OFFER" column

	Returns:
		df (pd.DataFrame) : offers containing the text input, in a single "OFFER" column
	'''

	query_tokens = _offer_tokens(search_str)
	if not query_tokens:
		return pd.DataFrame({"OFFER": []})

	width = len(query_tokens)
	offers = []
	for offer_str in offer_rets["OFFER"]:
		if not isinstance(offer_str, str):
			continue

		tokens = _offer_tokens(offer_str)
		# Slide a window the size of the query over the offer's words
		if any(tokens[start:start + width] == query_tokens for start in range(len(tokens) - width + 1)):
			offers.append(offer_str)

	return pd.DataFrame({"OFFER": offers})


def is_retailer(search_str, threshold=0.5):
	'''
	Determine if the text input is highly likely to be a retailer

	The text is scored by the zero-shot pipeline against the labels "brand",
	"retailer" and "item"; it counts as a retailer when the first returned label
	is "retailer" and its score exceeds the threshold.

	Parameters:
		search_str (string) : user text input
		threshold (float) : probability threshold

	Returns:
		is_ret (boolean) : true if retailer, false otherwise (always false for blank input)
	'''

	query = search_str.strip()
	if not query:
		# The zero-shot pipeline raises on an empty sequence, and blank text names no retailer
		return False

	# str.capitalize already lowercases everything after the first character
	result = pipe(query.capitalize(), candidate_labels=["brand", "retailer", "item"])
	top_label = result["labels"][0]
	top_score = result["scores"][0]

	return bool(top_label == "retailer" and top_score > threshold)


def perform_cat_inference(search_str, categories, cats, processed_offers):
	'''
	Run the zero-shot pipeline twice and return the offers tagged with the resulting categories

	Parameters:
		search_str (string) : user text input
		categories (List) : candidate labels for the first pass
		cats (pd.DataFrame) : raw category data
		processed_offers (pd.DataFrame) : processed offer data

	Returns:
		offers (pd.DataFrame) : relevant offers
		labels (dict) : first-pass pipeline output (labels and scores)
		labels_2 (dict) : second-pass pipeline output (labels and scores)
	'''

	# First pass: score the text against the caller-supplied categories
	labels = pipe(search_str, candidate_labels=categories)
	leading_parents = labels["labels"][:3]

	# Second pass: only the PRODUCT_CATEGORY values filed under the first three returned labels
	# (presumably the pipeline returns labels sorted by descending score - confirm)
	under_leading = cats["IS_CHILD_CATEGORY_TO"].isin(leading_parents)
	filtered_cats = list(cats.loc[under_leading, "PRODUCT_CATEGORY"].unique())
	labels_2 = pipe(search_str, candidate_labels=filtered_cats)

	# Keep offers whose CATEGORY shares at least one entry with the first three second-pass labels
	wanted = set(labels_2["labels"][:3])
	shares_label = processed_offers["CATEGORY"].apply(lambda tags: not wanted.isdisjoint(tags))
	offers = processed_offers[shares_label]["OFFER"].reset_index()

	return offers, labels, labels_2


def sort_by_similarity(search_str, related_offers, top_n=20):
	'''
	Use sentence embeddings to evaluate the similarity of relevant offers to the text input

	All offers are encoded in a single batched call rather than one model call per offer.
	Duplicate offer texts are scored once, keeping their first position.

	Parameters:
		search_str (string) : user text input
		related_offers (pd.DataFrame) : relevant offers discovered by zero shot learning
		top_n (int) : maximum number of offers to return

	Returns:
		df (pd.DataFrame) : the top_n most similar offers ("OFFER") and their cosine similarity ("scores"), best first
	'''

	# dict.fromkeys drops repeated offers while preserving first-seen order
	unique_offers = list(dict.fromkeys(related_offers["OFFER"]))
	if not unique_offers:
		# Nothing to rank: skip the model entirely
		return pd.DataFrame({"OFFER": [], "scores": []})

	query_embedding = model.encode(search_str, convert_to_tensor=True)
	offer_embeddings = model.encode(unique_offers, convert_to_tensor=True)

	# One row of similarities: the query against every offer
	similarities = util.pytorch_cos_sim(query_embedding, offer_embeddings)[0].tolist()

	# sorted is stable, so offers with equal scores keep their original order
	ranked = sorted(zip(unique_offers, similarities), key=lambda pair: pair[1], reverse=True)[:top_n]

	return pd.DataFrame({
		"OFFER": [offer for offer, _ in ranked],
		"scores": [score for _, score in ranked],
	})


def main():
	'''
	Render the search box and, once Search is pressed, show direct matches and related offers

	Direct matches come from word matching on the raw offers. Related offers are
	ranked by sentence similarity: against every remaining offer when the text
	looks like a retailer, otherwise against the offers selected by zero-shot
	category inference.
	'''
	col_1, col_2, _ = st.columns(3)
	search_str = col_1.text_input("Enter a retailer, brand, or category").strip().capitalize()

	# Load and cache data
	processed_offers = get_processed_offers()
	cats = get_categories_data()
	offer_rets = get_offers_data()
	categories = get_categories(cats)

	if not col_1.button("Search", type="primary"):
		return

	if not search_str:
		# The zero-shot pipeline raises on empty text, so stop before any model is called
		col_2.warning("Please enter a search term")
		return

	# Check offers where the text is directly in it
	retail = is_retailer(search_str)
	direct_offers = check_in_offer(search_str, offer_rets)
	col_2.write("Directly related offers")

	if direct_offers.empty:
		col_2.write("None found")
	else:
		col_2.table(direct_offers)

	if retail:
		# If retail, we directly compare every offer using sentence embeddings
		related_offers = offer_rets[~offer_rets["OFFER"].isin(direct_offers["OFFER"])]
	else:
		# Otherwise, we use zero shot learning with processed offers to narrow down our search
		related_offers, labels_1, labels_2 = perform_cat_inference(search_str, categories, cats, processed_offers)
		related_offers = related_offers[~related_offers["OFFER"].isin(direct_offers["OFFER"])]

		col_2.write("Parent categories probabilities")
		col_2.table(pd.DataFrame({"labels": labels_1["labels"][:5], "scores": labels_1["scores"][:5]}))
		col_2.write("Child categories probabilities")
		col_2.table(pd.DataFrame({"labels": labels_2["labels"][:5], "scores": labels_2["scores"][:5]}))

	col_2.write("Other related offers")
	sorted_offers = sort_by_similarity(search_str, related_offers)

	if sorted_offers.empty:
		col_2.write("None found")
	else:
		col_2.table(sorted_offers)


if __name__ == "__main__":
	main()