kn29 committed on
Commit
b2d82e1
·
verified ·
1 Parent(s): 00dbf94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -4
app.py CHANGED
@@ -311,21 +311,34 @@ async def process_document_pipeline(
311
  logger.info(f"🧩 Creating chunks and embeddings for session {session_id} using {HF_MODEL_ID}")
312
  chunks = chunk_text_hierarchical(text, filename)
313
 
 
 
314
  # Create embeddings and store chunks
315
  chunks_to_store = []
316
- for chunk in chunks:
 
 
 
 
 
 
317
  # Create embedding
318
- embedding = create_embedding(chunk['text'])
 
 
 
 
319
 
 
320
  chunk_doc = {
321
  "session_id": session_id,
322
  "chunk_id": chunk['id'],
323
- "text": chunk['text'],
324
  "title": chunk['title'],
325
  "section_type": chunk['section_type'],
326
  "importance_score": chunk['importance_score'],
327
  "entities": chunk['entities'],
328
- "embedding": embedding.tolist(), # Convert numpy array to list
329
  "created_at": datetime.utcnow()
330
  }
331
  chunks_to_store.append(chunk_doc)
@@ -333,6 +346,9 @@ async def process_document_pipeline(
333
  # Batch insert chunks
334
  if chunks_to_store:
335
  await db.chunks.insert_many(chunks_to_store)
 
 
 
336
 
337
  # Update session as completed
338
  await db.sessions.update_one(
 
311
  logger.info(f"🧩 Creating chunks and embeddings for session {session_id} using {HF_MODEL_ID}")
312
  chunks = chunk_text_hierarchical(text, filename)
313
 
314
+ logger.info(f"📊 Created {len(chunks)} chunks from document")
315
+
316
  # Create embeddings and store chunks
317
  chunks_to_store = []
318
+ for i, chunk in enumerate(chunks):
319
+ # Validate chunk has text
320
+ chunk_text = chunk.get('text', '').strip()
321
+ if not chunk_text:
322
+ logger.warning(f"⚠️ Skipping chunk {i} - no text content")
323
+ continue
324
+
325
  # Create embedding
326
+ try:
327
+ embedding = create_embedding(chunk_text)
328
+ except Exception as e:
329
+ logger.error(f"❌ Failed to create embedding for chunk {i}: {e}")
330
+ continue
331
 
332
+ # FIXED: Use 'content' field instead of 'text'
333
  chunk_doc = {
334
  "session_id": session_id,
335
  "chunk_id": chunk['id'],
336
+ "content": chunk_text, # Changed from 'text' to 'content'
337
  "title": chunk['title'],
338
  "section_type": chunk['section_type'],
339
  "importance_score": chunk['importance_score'],
340
  "entities": chunk['entities'],
341
+ "embedding": embedding.tolist(),
342
  "created_at": datetime.utcnow()
343
  }
344
  chunks_to_store.append(chunk_doc)
 
346
  # Batch insert chunks
347
  if chunks_to_store:
348
  await db.chunks.insert_many(chunks_to_store)
349
+ logger.info(f"✅ Stored {len(chunks_to_store)} chunks with embeddings")
350
+ else:
351
+ raise Exception("No valid chunks created from document")
352
 
353
  # Update session as completed
354
  await db.sessions.update_one(