Carlos Rosas
committed on
Update README.md
Browse files
README.md
CHANGED
@@ -66,16 +66,18 @@ This answer will also contain the excerpts of the documents used and the ID of t
|
|
66 |
|
67 |
### Example Usage
|
68 |
|
69 |
-
In this example, we will be using LanceDB for the retrieval part. You can use your preferred embedding model to create the embeddings.
|
70 |
|
71 |
```python
|
72 |
import lancedb
|
73 |
from vllm import LLM, SamplingParams
|
74 |
import pandas as pd
|
|
|
|
|
|
|
75 |
|
76 |
# Initialize LanceDB
|
77 |
db = lancedb.connect("lancedb_data")
|
78 |
-
table = db.open_table("education")
|
79 |
|
80 |
# We will create some fictitious education documents to add to the database
|
81 |
documents = [
|
@@ -96,8 +98,40 @@ documents = [
|
|
96 |
}
|
97 |
]
|
98 |
|
99 |
-
#
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
# Load the model
|
103 |
model_name = "PleIAs/Cassandre-RAG"
|
|
|
66 |
|
67 |
### Example Usage
|
68 |
|
69 |
+
In this example, we will be using BGE for the embeddings and LanceDB for the retrieval part. You can use your preferred embedding model to create the embeddings and add them to the database. LanceDB's hybrid search feature allows us to combine vector search with keyword search for better retrieval.
|
70 |
|
71 |
```python
|
72 |
import lancedb
|
73 |
from vllm import LLM, SamplingParams
|
74 |
import pandas as pd
|
75 |
+
import lancedb
|
76 |
+
from lancedb.pydantic import LanceModel, Vector
|
77 |
+
from lancedb.embeddings import get_registry
|
78 |
|
79 |
# Initialize LanceDB
|
80 |
db = lancedb.connect("lancedb_data")
|
|
|
81 |
|
82 |
# We will create some fictitious education documents to add to the database
|
83 |
documents = [
|
|
|
98 |
}
|
99 |
]
|
100 |
|
101 |
+
# Load the BGE embedding model and define the LanceDB table schema that uses it
|
102 |
+
model = get_registry().get("sentence-transformers").create(name="BAAI/bge-m3", device="cuda")
|
103 |
+
|
104 |
+
class Documents(LanceModel):
    """Row schema for the LanceDB table that stores the example documents.

    The embedding function held in ``model`` is attached to two fields:
    ``text`` is declared as its source field and ``vector`` as its vector
    field, sized with ``model.ndims()``.
    """

    # Title of the document the excerpt belongs to.
    main_title: str
    # Passage text; declared as the embedding function's source field.
    text: str = model.SourceField()
    # Document identifier. The name is a table column, so it is kept even
    # though it shadows the ``hash`` builtin inside the class body.
    hash: str
    # Embedding column; its width comes from the embedding model.
    vector: Vector(model.ndims()) = model.VectorField()
|
109 |
+
|
110 |
+
#Create table
|
111 |
+
table = db.create_table("example", schema=Documents, mode="overwrite")
|
112 |
+
|
113 |
+
def process_batch(batch):
    """Convert raw document dicts into rows ready to be added to the table.

    Only the ``hash``, ``main_title`` and ``text`` keys are kept; any key
    that is missing from an item defaults to an empty string.

    Args:
        batch: Iterable of dict-like documents.

    Returns:
        A list with one ``{"hash", "main_title", "text"}`` dict per item
        that could be processed. Items that are not dict-like are reported
        on stdout and skipped, so one bad record does not abort the batch.
    """
    processed_documents = []
    for item in batch:
        try:
            record = {
                "hash": item.get("hash", ""),
                "main_title": item.get("main_title", ""),
                "text": item.get("text", ""),
                # Add any other fields you want to include
            }
        except (AttributeError, TypeError) as exc:
            # Raised when the item has no usable ``.get`` (e.g. None or a
            # list); report and skip it instead of failing the whole batch.
            print(f"Error processing item: {item}")
            print(f"Error message: {exc}")
            continue
        processed_documents.append(record)
    return processed_documents
|
127 |
+
|
128 |
+
# Process and add documents in batches
# tqdm only draws a progress bar. It was never imported in this example, so
# import it here and fall back to a pass-through iterator when the package is
# not installed, which keeps the snippet runnable either way.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable):
        return iterable

batch_size = 2  # Adjust as needed
for start in tqdm(range(0, len(documents), batch_size)):
    batch = documents[start:start + batch_size]
    processed_batch = process_batch(batch)
    if processed_batch:  # Only add if the batch is not empty
        table.add(processed_batch)
|
135 |
|
136 |
# Load the model
|
137 |
model_name = "PleIAs/Cassandre-RAG"
|