Spaces:

Startup-Exchange
/

TalentLLM

No application file

File size: 12,805 Bytes

37d3a3b

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Redis LangChain OpenAI eCommerce Chatbot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5-h_nDGp3Kdf",
    "outputId": "94191443-3844-4c1d-a26f-7619d976a55b",
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/usr/bin/zsh: /home/green/miniconda3/lib/libtinfo.so.6: no version information available (required by /usr/bin/zsh)\n",
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Install requirements\n",
    "!pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/usr/bin/zsh: /home/green/miniconda3/lib/libtinfo.so.6: no version information available (required by /usr/bin/zsh)\n",
      "/home/linuxbrew/.linuxbrew/opt/python@3.11/lib/python3.11/site-packages/gdown/cli.py:126: FutureWarning: Option `--id` was deprecated in version 4.3.1 and will be removed in 5.0. You don't need to pass it anymore to use a file ID.\n",
      "  warnings.warn(\n",
      "Downloading...\n",
      "From (uriginal): https://drive.google.com/uc?id=1tHWB6u3yQCuAgOYc-DxtZ8Mru3uV5_lj\n",
      "From (redirected): https://drive.google.com/uc?id=1tHWB6u3yQCuAgOYc-DxtZ8Mru3uV5_lj&confirm=t&uuid=f678b48d-4f3e-44f9-bf60-03ca828cb67c\n",
      "To: /home/green/code/gatech/ai_atl/inital_work/product_data.csv\n",
      "100%|████████████████████████████████████████| 225M/225M [00:09<00:00, 24.0MB/s]\n"
     ]
    }
   ],
   "source": [
    "# Download the dataset\n",
    "!gdown --id 1tHWB6u3yQCuAgOYc-DxtZ8Mru3uV5_lj"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "MAX_TEXT_LENGTH=512\n",
    "\n",
    "def auto_truncate(val):\n",
    "    \"\"\"Truncate the given text.\"\"\"\n",
    "    return val[:MAX_TEXT_LENGTH]\n",
    "\n",
    "# Load Product data and truncate long text fields\n",
    "all_prods_df = pd.read_csv(\"product_data.csv\", converters={\n",
    "    'bullet_point': auto_truncate,\n",
    "    'item_keywords': auto_truncate,\n",
    "    'item_name': auto_truncate\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 669
    },
    "id": "00_n4VWH7FoB",
    "outputId": "f26daa8c-4af9-4def-d5ab-3197777fe2f9",
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Contruct a primary key from item ID and domain name\n",
    "all_prods_df['primary_key'] = (\n",
    "    all_prods_df['item_id'] + '-' + all_prods_df['domain_name']\n",
    ")\n",
    "# Replace empty strings with None and drop\n",
    "all_prods_df['item_keywords'].replace('', None, inplace=True)\n",
    "all_prods_df.dropna(subset=['item_keywords'], inplace=True)\n",
    "\n",
    "# Reset pandas dataframe index\n",
    "all_prods_df.reset_index(drop=True, inplace=True)\n",
    "\n",
    "all_prods_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Num products to use (subset)\n",
    "NUMBER_PRODUCTS = 2500  \n",
    "\n",
    "# Get the first 1000 products with non-empty item keywords\n",
    "product_metadata = ( \n",
    "    all_prods_df\n",
    "     .head(NUMBER_PRODUCTS)\n",
    "     .to_dict(orient='index')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Iw7rlppY8f3a",
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Check one of the products\n",
    "product_metadata[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up Redis as a vector db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.vectorstores.redis import Redis as RedisVectorStore\n",
    "\n",
    "# data that will be embedded and converted to vectors\n",
    "texts = [\n",
    "    v['item_name'] for k, v in product_metadata.items()\n",
    "]\n",
    "\n",
    "# product metadata that we'll store along our vectors\n",
    "metadatas = list(product_metadata.values())\n",
    "\n",
    "# we will use OpenAI as our embeddings provider\n",
    "embedding = OpenAIEmbeddings()\n",
    "\n",
    "# name of the Redis search index to create\n",
    "index_name = \"products\"\n",
    "\n",
    "# assumes you have a redis stack server running on within your docker compose network\n",
    "redis_url = \"redis://redis:6379\"\n",
    "\n",
    "# create and load redis with documents\n",
    "vectorstore = RedisVectorStore.from_texts(\n",
    "    texts=texts,\n",
    "    metadatas=metadatas,\n",
    "    embedding=embedding,\n",
    "    index_name=index_name,\n",
    "    redis_url=redis_url\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the ChatBot with ConversationalRetrieverChain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from langchain.callbacks.base import CallbackManager\n",
    "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
    "from langchain.chains import (\n",
    "    ConversationalRetrievalChain,\n",
    "    LLMChain\n",
    ")\n",
    "from langchain.chains.question_answering import load_qa_chain\n",
    "from langchain.llms import OpenAI\n",
    "from langchain.prompts.prompt import PromptTemplate\n",
    "\n",
    "template = \"\"\"Given the following chat history and a follow up question, rephrase the follow up input question to be a standalone question.\n",
    "Or end the conversation if it seems like it's done.\n",
    "\n",
    "Chat History:\\\"\"\"\n",
    "{chat_history}\n",
    "\\\"\"\"\n",
    "\n",
    "Follow Up Input: \\\"\"\"\n",
    "{question}\n",
    "\\\"\"\"\n",
    "\n",
    "Standalone question:\"\"\"\n",
    "\n",
    "condense_question_prompt = PromptTemplate.from_template(template)\n",
    "\n",
    "template = \"\"\"You are a friendly, conversational retail shopping assistant. Use the following context including product names, descriptions, and keywords to show the shopper whats available, help find what they want, and answer any questions.\n",
    "It's ok if you don't know the answer.\n",
    "\n",
    "Context:\\\"\"\"\n",
    "{context}\n",
    "\\\"\"\"\n",
    "\n",
    "Question:\\\"\n",
    "\\\"\"\"\n",
    "\n",
    "Helpful Answer:\"\"\"\n",
    "\n",
    "qa_prompt= PromptTemplate.from_template(template)\n",
    "\n",
    "\n",
    "# define two LLM models from OpenAI\n",
    "llm = OpenAI(temperature=0)\n",
    "\n",
    "streaming_llm = OpenAI(\n",
    "    streaming=True,\n",
    "    callback_manager=CallbackManager([\n",
    "        StreamingStdOutCallbackHandler()]),\n",
    "    verbose=True,\n",
    "    temperature=0.2,\n",
    "    max_tokens=150\n",
    ")\n",
    "\n",
    "# use the LLM Chain to create a question creation chain\n",
    "question_generator = LLMChain(\n",
    "    llm=llm,\n",
    "    prompt=condense_question_prompt\n",
    ")\n",
    "\n",
    "# use the streaming LLM to create a question answering chain\n",
    "doc_chain = load_qa_chain(\n",
    "    llm=streaming_llm,\n",
    "    chain_type=\"stuff\",\n",
    "    prompt=qa_prompt\n",
    ")\n",
    "\n",
    "\n",
    "chatbot = ConversationalRetrievalChain(\n",
    "    retriever=vectorstore.as_retriever(),\n",
    "    combine_docs_chain=doc_chain,\n",
    "    question_generator=question_generator\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# create a chat history buffer\n",
    "chat_history = []\n",
    "\n",
    "# gather user input for the first question to kick off the bot\n",
    "question = input(\"Hi! What are you looking for today?\")\n",
    "\n",
    "# keep the bot running in a loop to simulate a conversation\n",
    "while True:\n",
    "    result = chatbot(\n",
    "        {\"question\": question, \"chat_history\": chat_history}\n",
    "    )\n",
    "    print(\"\\n\")\n",
    "    chat_history.append((result[\"question\"], result[\"answer\"]))\n",
    "    question = input()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Customize your chains for even better performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "from langchain.schema import BaseRetriever\n",
    "from langchain.vectorstores import VectorStore\n",
    "from langchain.schema import Document\n",
    "from pydantic import BaseModel\n",
    "\n",
    "\n",
    "class RedisProductRetriever(BaseRetriever, BaseModel):\n",
    "    vectorstore: VectorStore\n",
    "\n",
    "    class Config:\n",
    "        \n",
    "        arbitrary_types_allowed = True\n",
    "\n",
    "    def combine_metadata(self, doc) -> str:\n",
    "        metadata = doc.metadata\n",
    "        return (\n",
    "            \"Item Name: \" + metadata[\"item_name\"] + \". \" +\n",
    "            \"Item Description: \" + metadata[\"bullet_point\"] + \". \" +\n",
    "            \"Item Keywords: \" + metadata[\"item_keywords\"] + \".\"\n",
    "        )\n",
    "\n",
    "    def get_relevant_documents(self, query):\n",
    "        docs = []\n",
    "        for doc in self.vectorstore.similarity_search(query):\n",
    "            content = self.combine_metadata(doc)\n",
    "            docs.append(Document(\n",
    "                page_content=content,\n",
    "                metadata=doc.metadata\n",
    "            ))\n",
    "        return docs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup ChatBot with new retriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "redis_product_retriever = RedisProductRetriever(vectorstore=vectorstore)\n",
    "\n",
    "chatbot = ConversationalRetrievalChain(\n",
    "    retriever=redis_product_retriever,\n",
    "    combine_docs_chain=doc_chain,\n",
    "    question_generator=question_generator\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Retry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# create a chat history buffer\n",
    "chat_history = []\n",
    "\n",
    "# gather user input for the first question to kick off the bot\n",
    "question = input(\"Hi! What are you looking for today?\")\n",
    "\n",
    "# keep the bot running in a loop to simulate a conversation\n",
    "while True:\n",
    "    result = chatbot(\n",
    "        {\"question\": question, \"chat_history\": chat_history}\n",
    "    )\n",
    "    print(\"\\n\")\n",
    "    chat_history.append((result[\"question\"], result[\"answer\"]))\n",
    "    question = input()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}