{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create vecdb - notebook\n",
    "\n",
    "Loads the transcript at `https://lexfridman.com/sam-altman-2-transcript/`, splits it into overlapping chunks, and stores the chunks in two persisted Chroma collections: one embedded through the Together API (`vecdb_test`) and one embedded locally with Hugging Face sentence-transformers (`vecdb_hf_test`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from langchain_community.document_loaders import WebBaseLoader\n",
    "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from langchain_together.embeddings import TogetherEmbeddings\n",
    "\n",
    "# Settings a reader might want to tune, gathered in one place.\n",
    "SOURCE_URL = \"https://lexfridman.com/sam-altman-2-transcript/\"\n",
    "EMBEDDING_MODEL = \"togethercomputer/m2-bert-80M-2k-retrieval\"\n",
    "CHUNK_SIZE = 2000\n",
    "CHUNK_OVERLAP = 250\n",
    "\n",
    "# Load\n",
    "loader = WebBaseLoader(SOURCE_URL)\n",
    "data = loader.load()\n",
    "\n",
    "# Split\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=CHUNK_SIZE,\n",
    "    chunk_overlap=CHUNK_OVERLAP,\n",
    ")\n",
    "all_splits = text_splitter.split_documents(data)\n",
    "\n",
    "# Stable per-chunk ids: passing these to Chroma makes a re-run overwrite the\n",
    "# same records in the persisted collection instead of appending a second copy\n",
    "# of every chunk under fresh random ids.\n",
    "chunk_ids = [f\"{SOURCE_URL}#chunk-{i}\" for i in range(len(all_splits))]\n",
    "\n",
    "# Together embeddings. The key is not passed explicitly below; the client is\n",
    "# expected to pick TOGETHER_API_KEY up from the environment, so load .env and\n",
    "# fail early with a clear message when it is missing. Loading and splitting\n",
    "# happen above so the Hugging Face section can still be used without a key.\n",
    "load_dotenv()\n",
    "together_api_key = os.getenv(\"TOGETHER_API_KEY\")\n",
    "if not together_api_key:\n",
    "    raise RuntimeError(\n",
    "        \"TOGETHER_API_KEY is not set; export it or add it to a .env file.\"\n",
    "    )\n",
    "\n",
    "embeddings = TogetherEmbeddings(model=EMBEDDING_MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add to vectorDB (Together embeddings), persisted under ./vecdb_test\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=all_splits,\n",
    "    embedding=embeddings,\n",
    "    ids=chunk_ids,\n",
    "    collection_name=\"rag-chroma\",\n",
    "    persist_directory=\"vecdb_test\",\n",
    ")\n",
    "retriever = vectorstore.as_retriever()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hugging Face transformers embeddings\n",
    "\n",
    "A more involved but \"free\" way of creating embeddings: the model runs locally instead of going through the Together API.\n",
    "\n",
    "You will need to install:\n",
    "```\n",
    "sentence-transformers\n",
    "einops\n",
    "opt_einsum\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/miniconda3/envs/langcorn/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "No sentence-transformers model found with name togethercomputer/m2-bert-80M-2k-retrieval. Creating a new one with MEAN pooling.\n",
      "You are using a model of type m2_bert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-- Bidirectional: True\n",
      "-- Using Long Conv Residual: True\n",
      "-- Hyena w: 10\n",
      "-- Hyena w mod: 1\n",
      "-- Hyena filter order: 128\n",
      "-- Hyena filter dropout: 0.2\n",
      "-- Hyena filter wd: 0.1\n",
      "-- Hyena filter emb dim: 5\n",
      "-- Hyena filter lr: 0.001\n",
      "-- Hyena filter lr pos emb: 1e-05\n"
     ]
    }
   ],
   "source": [
    "# Same model as the Together-hosted embeddings above, loaded on the local CPU.\n",
    "model_name = EMBEDDING_MODEL\n",
    "# NOTE(review): trust_remote_code=True allows code shipped in the model\n",
    "# repository to execute locally; presumably needed for the custom m2_bert\n",
    "# architecture. Only keep it for repositories you trust.\n",
    "model_kwargs = {'device': 'cpu', 'trust_remote_code': True}\n",
    "encode_kwargs = {'normalize_embeddings': False}\n",
    "hf = HuggingFaceEmbeddings(\n",
    "    model_name=model_name,\n",
    "    model_kwargs=model_kwargs,\n",
    "    encode_kwargs=encode_kwargs,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add to vectorDB (Hugging Face embeddings), persisted under ./vecdb_hf_test\n",
    "# NOTE: this rebinds `vectorstore` and `retriever` to the Hugging Face-backed\n",
    "# store; the Together-backed objects created earlier are no longer reachable\n",
    "# through these names.\n",
    "vectorstore = Chroma.from_documents(\n",
    "    documents=all_splits,\n",
    "    embedding=hf,\n",
    "    ids=chunk_ids,\n",
    "    collection_name=\"rag-chroma\",\n",
    "    persist_directory=\"vecdb_hf_test\",\n",
    ")\n",
    "retriever = vectorstore.as_retriever()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langcorn",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}