datbkpro committed on
Commit
8cc3acd
·
verified ·
1 Parent(s): b846ef6

Update core/rag_system.py

Browse files
Files changed (1) hide show
  1. core/rag_system.py +84 -36
core/rag_system.py CHANGED
@@ -70,68 +70,116 @@ class EnhancedRAGSystem:
70
  self.add_documents(english_data, english_metadatas)
71
 
72
  def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
73
- """Thêm documents vào database với embedding phù hợp"""
 
 
74
  if not documents:
 
75
  return
76
 
77
  # Ensure metadatas has the same length as documents
78
  if metadatas is None:
79
  metadatas = [{} for _ in documents]
 
80
  elif len(metadatas) != len(documents):
81
- # Extend or truncate metadatas to match documents length
82
- if len(metadatas) < len(documents):
83
- metadatas = metadatas + [{} for _ in range(len(documents) - len(metadatas))]
84
- else:
85
- metadatas = metadatas[:len(documents)]
 
 
 
 
86
 
87
- # Detect language for each document and create embeddings accordingly
88
- new_embeddings_list = []
89
  valid_documents = []
90
  valid_metadatas = []
91
 
92
  for i, doc in enumerate(documents):
93
- if not doc or len(doc.strip()) == 0:
94
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- language = metadatas[i].get('language', 'vi')
97
- embedding_model = self.multilingual_manager.get_embedding_model(language)
98
-
99
- if embedding_model is not None:
100
- try:
101
- # Create embedding for this document
102
- doc_embedding = embedding_model.encode([doc])
103
- new_embeddings_list.append(doc_embedding[0])
104
- valid_documents.append(doc)
105
- valid_metadatas.append(metadatas[i])
106
 
107
- except Exception as e:
108
- print(f"❌ Lỗi tạo embedding cho document {i}: {e}")
 
 
 
 
 
109
 
110
- if not valid_documents:
 
 
 
111
  return
112
 
113
- # Convert list of embeddings to numpy array
114
- new_embeddings = np.array(new_embeddings_list)
 
 
 
 
 
115
 
116
- # Handle dimension mismatch
117
- if self.embeddings is not None and self.embeddings.shape[1] != new_embeddings.shape[1]:
118
- print(f"⚠️ Phát hiện dimension mismatch ({self.embeddings.shape[1]} vs {new_embeddings.shape[1]}), tạo index mới...")
119
- self.embeddings = None
120
- self.index = None
121
 
122
- # Update embeddings
123
  if self.embeddings is None:
 
124
  self.embeddings = new_embeddings
125
- self.current_dimension = new_embeddings.shape[1]
 
 
126
  else:
127
- self.embeddings = np.vstack([self.embeddings, new_embeddings])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  # Update FAISS index
130
  self._update_faiss_index()
131
 
132
- self.documents.extend(valid_documents)
133
- self.metadatas.extend(valid_metadatas)
134
- print(f" Đã thêm {len(valid_documents)} documents vào RAG database")
135
 
136
  def _update_faiss_index(self):
137
  """Cập nhật FAISS index với embeddings hiện tại"""
 
70
  self.add_documents(english_data, english_metadatas)
71
 
72
  def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
73
+ """Thêm documents vào database - ĐÃ SỬA LỖI"""
74
+ print(f"🔄 RAG System: Bắt đầu thêm {len(documents)} documents...")
75
+
76
  if not documents:
77
+ print("❌ RAG System: Không có documents để thêm")
78
  return
79
 
80
  # Ensure metadatas has the same length as documents
81
  if metadatas is None:
82
  metadatas = [{} for _ in documents]
83
+ print("📝 Tạo metadata mặc định")
84
  elif len(metadatas) != len(documents):
85
+ print(f"⚠️ Metadata length mismatch: {len(metadatas)} vs {len(documents)}")
86
+ # Fix metadata length
87
+ new_metadatas = []
88
+ for i in range(len(documents)):
89
+ if i < len(metadatas):
90
+ new_metadatas.append(metadatas[i])
91
+ else:
92
+ new_metadatas.append({"source": "upload", "language": "vi"})
93
+ metadatas = new_metadatas
94
 
95
+ # Filter valid documents
 
96
  valid_documents = []
97
  valid_metadatas = []
98
 
99
  for i, doc in enumerate(documents):
100
+ if doc and isinstance(doc, str) and len(doc.strip()) > 5: # At least 5 characters
101
+ valid_documents.append(doc.strip())
102
+ valid_metadatas.append(metadatas[i] if i < len(metadatas) else {})
103
+ else:
104
+ print(f"⚠️ Bỏ qua document {i}: không hợp lệ")
105
+
106
+ print(f"📊 Documents hợp lệ: {len(valid_documents)}/{len(documents)}")
107
+
108
+ if not valid_documents:
109
+ print("❌ Không có documents hợp lệ để thêm")
110
+ return
111
+
112
+ # Create embeddings
113
+ new_embeddings_list = []
114
+ successful_embeddings = 0
115
+
116
+ for i, doc in enumerate(valid_documents):
117
+ try:
118
+ language = valid_metadatas[i].get('language', 'vi')
119
+ embedding_model = self.multilingual_manager.get_embedding_model(language)
120
 
121
+ if embedding_model is None:
122
+ print(f"⚠️ Không có embedding model cho document {i}")
123
+ continue
 
 
 
 
 
 
 
124
 
125
+ # Create embedding
126
+ doc_embedding = embedding_model.encode([doc])
127
+ new_embeddings_list.append(doc_embedding[0])
128
+ successful_embeddings += 1
129
+
130
+ except Exception as e:
131
+ print(f"❌ Lỗi embedding document {i}: {e}")
132
 
133
+ print(f"📊 Embeddings thành công: {successful_embeddings}/{len(valid_documents)}")
134
+
135
+ if not new_embeddings_list:
136
+ print("❌ Không tạo được embeddings nào")
137
  return
138
 
139
+ # Convert to numpy array
140
+ try:
141
+ new_embeddings = np.array(new_embeddings_list)
142
+ print(f"✅ Embedding matrix shape: {new_embeddings.shape}")
143
+ except Exception as e:
144
+ print(f"❌ Lỗi tạo embedding matrix: {e}")
145
+ return
146
 
147
+ # Handle existing embeddings
148
+ old_doc_count = len(self.documents)
 
 
 
149
 
 
150
  if self.embeddings is None:
151
+ # First time initialization
152
  self.embeddings = new_embeddings
153
+ self.documents = valid_documents
154
+ self.metadatas = valid_metadatas
155
+ print("✅ Khởi tạo RAG system lần đầu")
156
  else:
157
+ # Append to existing
158
+ try:
159
+ # Check dimension compatibility
160
+ if self.embeddings.shape[1] != new_embeddings.shape[1]:
161
+ print(f"⚠️ Dimension mismatch: {self.embeddings.shape[1]} vs {new_embeddings.shape[1]}")
162
+ print("🔄 Tạo system mới do dimension không khớp")
163
+ self.embeddings = new_embeddings
164
+ self.documents = valid_documents
165
+ self.metadatas = valid_metadatas
166
+ else:
167
+ # Compatible dimensions, append
168
+ self.embeddings = np.vstack([self.embeddings, new_embeddings])
169
+ self.documents.extend(valid_documents)
170
+ self.metadatas.extend(valid_metadatas)
171
+ print("✅ Đã thêm vào system hiện có")
172
+
173
+ except Exception as e:
174
+ print(f"❌ Lỗi khi thêm vào system: {e}")
175
+ return
176
 
177
  # Update FAISS index
178
  self._update_faiss_index()
179
 
180
+ new_doc_count = len(self.documents)
181
+ print(f"🎉 THÀNH CÔNG: Đã thêm {new_doc_count - old_doc_count} documents mới")
182
+ print(f"📊 Tổng documents: {new_doc_count}")
183
 
184
  def _update_faiss_index(self):
185
  """Cập nhật FAISS index với embeddings hiện tại"""