!pip install -q langchain torch transformers sentence-transformers datasets faiss-cpu

from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first two entries
data[:2]

# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the text you want to split; split it into documents using the text splitter.
docs = text_splitter.split_documents(data)

# Define the path to the pre-trained embedding model you want to use
modelPath = "sentence-transformers/all-MiniLM-L6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device': 'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,        # Provide the pre-trained model's path
    model_kwargs=model_kwargs,   # Pass the model configuration options
    encode_kwargs=encode_kwargs  # Pass the encoding options
)
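With the embedding model initialized, a quick sanity check and the next step of the pipeline might look like the sketch below: embed a sample string to verify the model loads, then build the FAISS index from the chunked documents (the imports above already bring in FAISS). The sample text and the example question are illustrative placeholders, not values from the dataset.

# Embed a sample string to confirm the embedding model works;
# the result is a plain Python list of floats (384 dimensions for all-MiniLM-L6-v2).
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(len(query_result))  # expected: 384

# Build the FAISS vector store from the chunked documents and the embeddings.
db = FAISS.from_documents(docs, embeddings)

# Run a similarity search to confirm retrieval works end to end.
# The question here is a hypothetical example.
question = "What is cheesemaking?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)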