File size: 2,469 Bytes
0d96daf
 
 
 
 
89397a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d96daf
 
 
 
 
89397a4
 
 
 
 
 
 
 
 
 
0d96daf
 
 
89397a4
0d96daf
 
 
 
 
 
89397a4
0d96daf
 
89397a4
 
 
 
 
 
 
 
 
0d96daf
 
 
 
 
89397a4
 
0d96daf
 
89397a4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import sys
import logging
from pathlib import Path

# Set up logging for the Hugging Face environment
def setup_logging():
    """Configure root logging for either a Hugging Face Space or a local run.

    When the ``SPACE_ID`` environment variable is set to a non-empty value
    (treated here as "running on Hugging Face"), records go to the console
    only. Otherwise they go to the console and to ``embed_data.log``.
    """
    # The console handler is always present.
    log_handlers = [logging.StreamHandler()]

    if not os.getenv("SPACE_ID"):
        # Local run: additionally persist the log to a file.
        log_handlers.append(
            logging.FileHandler("embed_data.log", encoding='utf-8')
        )

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=log_handlers,
    )

# Configure the root logger as soon as this module is loaded, then create the
# module-level logger used by setup_data().
setup_logging()
logger = logging.getLogger(__name__)

def setup_data(data_dir: str = "data", min_count: int = 50) -> None:
    """Embed the data directory on startup unless enough embeddings exist.

    Args:
        data_dir: Directory passed to ``embed_all_data``. Defaults to
            ``"data"``, the value that used to be hard-coded.
        min_count: Embedding runs only while the embedding model reports
            fewer entries than this. Defaults to 50, the previously
            hard-coded value (deliberately low, for testing).

    Returns:
        None. Any ``Exception`` raised along the way is logged with its
        traceback instead of being propagated to the caller.
    """
    try:
        logger.info("Starting data setup process...")

        # Use is_dir() rather than exists(): a regular file that happens to
        # carry the directory's name cannot be embedded either.
        data_path = Path(data_dir)
        if not data_path.is_dir():
            logger.error("Data directory %s not found!", data_dir)
            return

        # Imported lazily so this happens after logging has been configured.
        logger.info("Importing embedding modules...")
        from core.embedding_model import get_embedding_model

        # Find out how many embeddings are already stored.
        logger.info("Checking existing embeddings...")
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()
        logger.info("Current embeddings count: %s", current_count)

        # Guard clause: nothing to do when the store is already populated.
        if current_count >= min_count:
            logger.info("Data already embedded, skipping...")
            return

        logger.info("Starting data embedding process...")

        # Imported only when embedding is actually needed.
        from scripts.embed_data import embed_all_data

        # Pass a plain string, as the original call did.
        embed_all_data(str(data_path), force=False)

        # Re-read the count to report the outcome.
        final_count = embedding_model.count()
        logger.info("Embedding completed! Final count: %s", final_count)

    except Exception as exc:
        # Startup boundary: keep the best-effort behaviour (never raise), but
        # let logging attach the traceback instead of formatting it by hand.
        logger.exception("Error in setup_data: %s", exc)

# Run the data setup when this file is executed as a script.
if __name__ == "__main__":
    setup_data()