Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Mining Script: Vietnamese Food Nutrition Database | |
| Processes Vietnamese food CSV into ChromaDB for NutritionAgent | |
| """ | |
| import sys | |
| import pandas as pd | |
| from pathlib import Path | |
def _build_food_record(idx, row):
    """Turn one CSV row into the (id, document, metadata) triple stored in ChromaDB.

    Args:
        idx: Index label of the row in the DataFrame; used to derive the record id.
        row: Row exposing the columns ``name_vi``, ``name_en``, ``calories``,
            ``protein_g``, ``carbs_g``, ``fat_g``, ``fiber_g`` and ``category``.

    Returns:
        A tuple ``(record_id, document_text, metadata)``.

    Raises:
        ValueError: If ``calories`` cannot be converted to ``int`` (e.g. NaN).
    """
    # The label is Vietnamese for "Dish"; it had been stored as mis-decoded
    # bytes, which pollutes the text that gets embedded and persisted.
    document = "\n".join([
        f"Món ăn: {row['name_vi']} ({row['name_en']})",
        f"Calories: {row['calories']} kcal",
        f"Protein: {row['protein_g']}g",
        f"Carbohydrates: {row['carbs_g']}g",
        f"Fat: {row['fat_g']}g",
        f"Fiber: {row['fiber_g']}g",
        f"Category: {row['category']}",
    ])
    metadata = {
        'name_vi': row['name_vi'],
        'name_en': row['name_en'],
        # Cast to a plain int so the metadata value is not a numpy scalar.
        'calories': int(row['calories']),
        'category': row['category'],
        'source': 'vietnamese_food_db',
    }
    return f"food_{idx}", document, metadata


def process_vietnamese_nutrition():
    """Process the Vietnamese food nutrition CSV into a persistent ChromaDB.

    Reads ``data_mining/datasets/vietnamese_food_nutrition.csv`` (calling
    ``vn_food_db.vn_food_db()`` first when the file is missing), embeds one
    text document per food with the ``keepitreal/vietnamese-sbert`` model and
    writes the records to the ``vietnamese_nutrition`` collection under
    ``data_mining/output/vietnamese_nutrition_chroma``.

    Records are written with ``upsert`` so that running the script again
    refreshes existing ``food_<index>`` entries instead of colliding with them.

    Returns:
        bool: ``True`` when every batch was written; ``False`` when a
        dependency is missing or any other error occurs (the error is printed,
        never raised).
    """
    try:
        # Imported lazily so a missing optional dependency is reported with an
        # install hint instead of crashing at module import time.
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("π Processing Vietnamese Food Nutrition Database...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_food_nutrition.csv")
        if not csv_path.exists():
            print("β CSV not found. Creating it first...")
            import vn_food_db
            vn_food_db.vn_food_db()

        df = pd.read_csv(csv_path)
        total = len(df)
        print(f"π Loaded: {total} Vietnamese foods")

        # Initialize
        print("π€ Loading embedding model...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)
        chroma_path = output_dir / "vietnamese_nutrition_chroma"

        client = chromadb.PersistentClient(path=str(chroma_path))
        collection = client.get_or_create_collection(
            name="vietnamese_nutrition",
            metadata={"description": "Vietnamese Food Nutrition Database"},
        )

        # Process foods
        print("π¦ Creating ChromaDB...")
        batch_size = 20
        for start in range(0, total, batch_size):
            batch = df.iloc[start:start + batch_size]

            # A slice taken inside range(0, total, batch_size) is never empty,
            # so unpacking the transposed records is safe.
            records = [_build_food_record(idx, row) for idx, row in batch.iterrows()]
            ids, texts, metadatas = (list(column) for column in zip(*records))

            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)

            # upsert (not add): the store is persistent and the ids are
            # deterministic, so a second run must overwrite, not duplicate.
            collection.upsert(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas,
            )
            print(f" Processed {min(start + batch_size, total)}/{total} foods...")

        print("\nβ Vietnamese Nutrition ChromaDB created!")
        print(f" Output: {chroma_path}")
        print(f" Records: {total} foods")
        return True

    except ImportError as e:
        print(f"β Missing library: {e}")
        print(" Install: pip install sentence-transformers chromadb pandas")
        return False
    except Exception as e:
        # Top-level boundary of the script: report the failure with a
        # traceback and signal it through the return value.
        print(f"β Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the mining pipeline between console banners.

    Returns:
        The result of ``process_vietnamese_nutrition()``: truthy on success,
        falsy on failure.
    """
    rule = "=" * 60

    print(rule)
    print("Vietnamese Food Nutrition Database Mining")
    print(rule)

    outcome = process_vietnamese_nutrition()

    # Failure path first, so the success banner is the fall-through case.
    if not outcome:
        print("\nβ FAILED!")
        return outcome

    print("\n" + rule)
    print("β SUCCESS! Vietnamese nutrition data ready for RAG")
    print(rule)
    return outcome
if __name__ == "__main__":
    # The process exit status mirrors the pipeline result: 0 on success, 1 on failure.
    sys.exit(0 if main() else 1)