# Spaces: Runtime error
# import packages | |
__import__('pysqlite3') | |
import sys | |
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3') | |
from sentence_transformers import SentenceTransformer | |
import chromadb | |
from datasets import load_dataset | |
from gpt4all import GPT4All | |
# Embedding vector | |
class VectorStore:
    """ChromaDB collection paired with a sentence-embedding model.

    Documents are embedded with the
    ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1`` model and stored in a
    collection obtained from a default ``chromadb.Client()``.
    """

    # Dataset columns concatenated, in this order, into one document per row.
    TEXT_COLUMNS = (
        'title_cleaned',
        'recipe_new',
        'meal_type',
        'allergy_type',
        'ingredients_alternatives',
    )

    def __init__(self, collection_name):
        """Load the embedding model and open the collection *collection_name*.

        The collection is fetched if the client already has one with this
        name and created otherwise, so constructing the store a second time
        with the same name does not fail on the existing collection.
        """
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.get_or_create_collection(name=collection_name)

    def populate_vectors(self, dataset, limit=5000, batch_size=256):
        """Embed rows of ``dataset['train']`` and add them to the collection.

        Each document is the space-separated concatenation of the columns in
        ``TEXT_COLUMNS`` followed by one trailing space; its id is the row
        index rendered as a string.

        Args:
            dataset: Mapping with a ``'train'`` entry whose columns are
                accessible by name and sliceable.
            limit: Maximum number of leading rows to index; ``None`` indexes
                every row.
            batch_size: Number of documents encoded and added per call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        train_split = dataset['train']
        columns = [train_split[name][:limit] for name in self.TEXT_COLUMNS]
        texts = [
            " ".join(f"{value}" for value in row) + " "
            for row in zip(*columns)
        ]

        # Encode and insert in batches instead of one call per document.
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            embeddings = self.embedding_model.encode(batch).tolist()
            self.collection.add(
                embeddings=embeddings,
                documents=batch,
                ids=[str(i) for i in range(start, start + len(batch))],
            )

    def search_context(self, query, n_results=1):
        """Query the collection with the embedding of *query*.

        Args:
            query: Text to embed with the store's embedding model.
            n_results: Number of results requested from the collection.

        Returns:
            Whatever ``collection.query`` returns for the query embedding.
        """
        query_embeddings = self.embedding_model.encode(query).tolist()
        return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)
# Fetch the recipe dataset hosted on Hugging Face.
# Dataset card: https://huggingface.co/datasets/Thefoodprocessor/recipe_new_with_features_full
dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full')

# Build the vector store and index the dataset's documents in it.
vector_store = VectorStore(collection_name="embedding_vector")
vector_store.populate_vectors(dataset=dataset)
# Load the GPT4All chat model (Meta-Llama-3-8B-Instruct, Q4_0 build).
# GPT4All overview and model information: https://gpt4all.io/index.html
model_name = 'Meta-Llama-3-8B-Instruct.Q4_0.gguf'  # .gguf denotes a quantized model file
# Directory passed to GPT4All as model_path; presumably the model is
# downloaded there once and loaded from it on later runs — confirm.
model_path = "gpt4all"
# NOTE(review): device="cuda" presumably requires a CUDA-capable GPU on the host — confirm.
model = GPT4All(
    model_name=model_name,
    model_path=model_path,
    device="cuda",
)