# RAG pipeline demo: LangChain + Hugging Face + FAISS (notebook export).
# (The original scraped page header read "Spaces: Runtime error" — residue
# from the hosting page, not part of the code.)
!pip install -q langchain | |
!pip install -q torch | |
!pip install -q transformers | |
!pip install -q sentence-transformers | |
!pip install -q datasets | |
!pip install -q faiss-cpu | |
# Document loading, splitting, embedding, retrieval, and generation components.
# Fixed: trailing " | |" scraping artifacts removed; the duplicated
# `AutoTokenizer` import (it appeared on two lines) merged into one.
# NOTE(review): in newer LangChain releases these classes live under
# langchain_community.* — confirm against the pinned langchain version.
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
# Load the Dolly 15k instruction dataset from the Hugging Face Hub,
# using its "context" column as the document text.
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # column that becomes Document.page_content

# Create a loader instance and download/parse the dataset (network I/O).
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

# Peek at the first two documents. Bare expression: this only renders
# output in a notebook cell; it has no effect in a plain script.
data[:2]
# Split the loaded documents into overlapping chunks for embedding:
# 1000 characters per chunk with a 150-character overlap so that
# sentences straddling a boundary appear in both neighboring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the loaded documents; produce the chunked document list.
docs = text_splitter.split_documents(data)
# Configuration for the sentence-transformer embedding model.
# NOTE(review): the canonical Hub repo id is "all-MiniLM-L6-v2" (capital L);
# the lowercase variant used here resolves today — confirm before changing.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Run the encoder on CPU (no GPU assumed in this environment).
model_kwargs = {'device': 'cpu'}

# Keep raw (unnormalized) embedding vectors; set True for cosine-ready output.
encode_kwargs = {'normalize_embeddings': False}
# Initialize an instance of HuggingFaceEmbeddings with the specified parameters | |
embeddings = HuggingFaceEmbeddings( | |
model_name=modelPath, # Provide the pre-trained model's path | |
model_kwargs=model_kwargs, # Pass the model configuration options | |
encode_kwargs=encode_kwargs # Pass the encoding options |