Carlos Rosas committed on
Commit
9d45bed
·
verified ·
1 Parent(s): 8b9eee8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +38 -4
README.md CHANGED
@@ -66,16 +66,18 @@ This answer will also contain the excerpts of the documents used and the ID of t
66
 
67
  ### Example Usage
68
 
69
- In this example, we will be using LanceDB for the retrieval part. You can use your preferred embedding model to create the embeddings together with LanceDB and add them to the database, and its hybrid search feature allows us to combine vector search with keyword search for better retrieval.
70
 
71
  ```python
72
  import lancedb
73
  from vllm import LLM, SamplingParams
74
  import pandas as pd
 
 
 
75
 
76
  # Initialize LanceDB
77
  db = lancedb.connect("lancedb_data")
78
- table = db.open_table("education")
79
 
80
  # We will create some fictitious education documents to add to the database
81
  documents = [
@@ -96,8 +98,40 @@ documents = [
96
  }
97
  ]
98
 
99
- # Add documents to LanceDB
100
- table.add(documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  # Load the model
103
  model_name = "PleIAs/Cassandre-RAG"
 
66
 
67
  ### Example Usage
68
 
69
+ In this example, we will be using BGE for the embeddings and LanceDB for the retrieval part. You can use your preferred embedding model to create the embeddings and add them to the database; LanceDB's hybrid search feature allows us to combine vector search with keyword search for better retrieval.
70
 
71
  ```python
72
  import lancedb
73
  from vllm import LLM, SamplingParams
74
  import pandas as pd
75
+ import lancedb
76
+ from lancedb.pydantic import LanceModel, Vector
77
+ from lancedb.embeddings import get_registry
78
 
79
  # Initialize LanceDB
80
  db = lancedb.connect("lancedb_data")
 
81
 
82
  # We will create some fictitious education documents to add to the database
83
  documents = [
 
98
  }
99
  ]
100
 
101
+ # Load the BGE embedding model and define the document schema for LanceDB
102
+ model = get_registry().get("sentence-transformers").create(name="BAAI/bge-m3", device="cuda")
103
+
104
+ class Documents(LanceModel):
105
+ main_title: str
106
+ text: str = model.SourceField()
107
+ hash: str
108
+ vector: Vector(model.ndims()) = model.VectorField()
109
+
110
+ #Create table
111
+ table = db.create_table("example", schema=Documents, mode="overwrite")
112
+
113
+ def process_batch(batch):
114
+ processed_documents = []
115
+ for item in batch:
116
+ try:
117
+ processed_documents.append({
118
+ "hash": item.get("hash", ""),
119
+ "main_title": item.get("main_title", ""),
120
+ "text": item.get("text", "")
121
+ # Add any other fields you want to include
122
+ })
123
+ except Exception as e:
124
+ print(f"Error processing item: {item}")
125
+ print(f"Error message: {str(e)}")
126
+ return processed_documents
127
+
128
+ # Process and add documents in batches
129
+ batch_size = 2 # Adjust as needed
130
+ for i in tqdm(range(0, len(documents), batch_size)):
131
+ batch = documents[i:i+batch_size]
132
+ processed_batch = process_batch(batch)
133
+ if processed_batch: # Only add if the batch is not empty
134
+ table.add(processed_batch)
135
 
136
  # Load the model
137
  model_name = "PleIAs/Cassandre-RAG"