Ankitajadhav committed on
Commit
5ecd97e
1 Parent(s): 94005ba

Update app.py

Files changed (1)
  1. app.py +33 -21
app.py CHANGED
@@ -19,33 +19,45 @@ class VectorStore:
         self.collection = self.chroma_client.create_collection(name=collection_name)

     # Method to populate the vector store with embeddings from a dataset
-    def populate_vectors(self, dataset):
-        # Select the text columns to concatenate
-        title = dataset['train']['title_cleaned'][:2500]  # Limiting to 2,500 examples for the demo
-        recipe = dataset['train']['recipe_new'][:2500]
-        meal_type = dataset['train']['meal_type'][:2500]
-        allergy = dataset['train']['allergy_type'][:2500]
-        ingredients_alternative = dataset['train']['ingredients_alternatives'][:2500]
-
-        # Concatenate the text from the selected columns
-        texts = [f"{tit} {rep} {meal} {alle} {ingr}" for tit, rep, meal, alle, ingr in zip(title, recipe, meal_type, allergy, ingredients_alternative)]
-        for i, item in enumerate(texts):
-            embeddings = self.embedding_model.encode(item).tolist()
-            self.collection.add(embeddings=[embeddings], documents=[item], ids=[str(i)])
-
-    # Method to search the ChromaDB collection for relevant context based on a query
+    def populate_vectors(self, dataset, batch_size=100):
+        # Use dataset streaming instead of loading everything into memory
+        dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train', streaming=True)
+
+        # Process in batches
+        texts = []
+
+        for i, example in enumerate(dataset):
+            title = example['title_cleaned']
+            recipe = example['recipe_new']
+            meal_type = example['meal_type']
+            allergy = example['allergy_type']
+            ingredients_alternative = example['ingredients_alternatives']
+
+            # Concatenate the text from the columns
+            text = f"{title} {recipe} {meal_type} {allergy} {ingredients_alternative}"
+            texts.append(text)
+
+            # Process the batch once batch_size examples have accumulated
+            if (i + 1) % batch_size == 0:
+                self._process_batch(texts, i)
+                texts = []
+
+        # Process the remaining texts
+        if texts:
+            self._process_batch(texts, i)
+
+    def _process_batch(self, texts, batch_start_idx):
+        embeddings = self.embedding_model.encode(texts, batch_size=len(texts)).tolist()
+        for j, embedding in enumerate(embeddings):
+            self.collection.add(embeddings=[embedding], documents=[texts[j]], ids=[str(batch_start_idx + j)])
+
     def search_context(self, query, n_results=1):
         query_embeddings = self.embedding_model.encode(query).tolist()
         return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)

-
-# importing dataset hosted on huggingface
-# dataset details - https://huggingface.co/datasets/Thefoodprocessor/recipe_new_with_features_full
-dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full')
-
 # create a vector embedding
 vector_store = VectorStore("embedding_vector")
-vector_store.populate_vectors(dataset)
+vector_store.populate_vectors(dataset=None)


 # Load the model and tokenizer
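
For readers following the diff, here is a minimal usage sketch of the updated VectorStore API. It is not part of the commit: it assumes the unchanged parts of app.py (the embedding model, Chroma client, and collection set up in __init__, plus the model loading that follows this hunk), and the query string is illustrative only.

# Minimal usage sketch (assumption: VectorStore.__init__ sets up
# self.chroma_client, self.embedding_model, and self.collection as in app.py).
store = VectorStore("embedding_vector")

# In this revision populate_vectors streams the dataset internally,
# so the dataset argument is unused and can be passed as None.
store.populate_vectors(dataset=None, batch_size=100)

# Retrieve the closest recipe document for an illustrative query string.
results = store.search_context("nut-free vegan dinner ideas", n_results=1)
print(results["documents"])  # Chroma's query result includes the matched documents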