DawnC committed on
Commit
6a3bd1f
·
verified ·
1 Parent(s): 72d88d1

Upload 22 files

Browse files

Create Pixcribe Project

app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from PIL import Image
4
+ import spaces
5
+
6
+ from pixcribe_pipeline import PixcribePipeline
7
+ from ui_manager import UIManager
8
+
9
# Initialize Pipeline and UI Manager
# Built once at import time so every Gradio request reuses the already-loaded
# models instead of paying the 60-90 s load cost per call.
print("Initializing Pixcribe...")
print("⏳ Loading models (this may take 60-90 seconds on first run)...")
pipeline = PixcribePipeline(yolo_variant='l')  # 'l' (balanced) matches the UI's default Detection Mode
ui_manager = UIManager()  # supplies the custom CSS and HTML-formatting helpers used below
print("✅ All models loaded successfully!")
15
+
16
@spaces.GPU(duration=180)
def process_wrapper(image, yolo_variant, caption_language):
    """Run the full Pixcribe pipeline on one image and format the results.

    Args:
        image: PIL image from the Gradio upload widget (None when nothing
            has been uploaded yet).
        yolo_variant: YOLO size key chosen in the UI ('m', 'l' or 'x').
        caption_language: caption output language, 'zh' or 'en'.

    Returns:
        (annotated_image, captions_html) on success, (None, error_html)
        on missing input or failure.

    GPU-backed stages (hence the @spaces.GPU decorator): YOLOv11 detection,
    OpenCLIP ViT-H/14, EasyOCR, Places365 scene analysis and Qwen2.5-VL-7B
    caption generation — roughly 2-3 seconds total on an L4 GPU.
    """
    if image is None:
        return None, "<div style='color: #E74C3C; padding: 24px; text-align: center;'>Please upload an image</div>"

    try:
        # Platform is fixed to Instagram in the current UI.
        results = pipeline.process_image(image, 'instagram', yolo_variant, caption_language)

        if results is None:
            return None, "<div style='color: #E74C3C; padding: 24px; text-align: center;'>Processing failed. Check terminal logs for details.</div>"

    except Exception as e:
        import traceback

        trace_text = traceback.format_exc()
        divider = "=" * 60
        print(divider)
        print("ERROR DETAILS:")
        print(trace_text)
        print(divider)

        # Surface the full traceback to the user inside a collapsible panel.
        error_html = f"""
        <div style='background: #FADBD8; border: 2px solid #E74C3C; border-radius: 20px; padding: 28px; margin: 16px 0;'>
            <h3 style='color: #C0392B; margin-top: 0; font-size: 22px;'>❌ Processing Error</h3>
            <p style='color: #E74C3C; font-weight: bold; font-size: 17px; margin-bottom: 16px;'>{str(e)}</p>
            <details style='margin-top: 12px;'>
                <summary style='cursor: pointer; color: #C0392B; font-weight: bold; font-size: 16px;'>View Full Error Trace</summary>
                <pre style='background: white; padding: 16px; border-radius: 12px; overflow-x: auto; font-size: 13px; color: #2C3E50; margin-top: 12px;'>{trace_text}</pre>
            </details>
        </div>
        """
        return None, error_html

    # Success path: annotated image (falls back to the original upload when
    # the pipeline produced no overlay) plus the rendered caption HTML.
    annotated = results.get('visualized_image', image)
    rendered_captions = ui_manager.format_captions_with_copy(results['captions'])
    return annotated, rendered_captions
69
+
70
# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(css=ui_manager.custom_css, title="Pixcribe - AI Social Media Captions") as app:

    # Page header plus the "models take a while to load" notice banner.
    ui_manager.create_header()
    ui_manager.create_info_banner()

    # Top row: image upload on the left, detection overlay on the right.
    with gr.Row(elem_classes="main-row"):
        with gr.Column(scale=1):
            with gr.Group(elem_classes="upload-card"):
                uploaded_image = gr.Image(
                    type="pil",
                    label="Upload Image",
                    elem_classes="upload-area",
                )

        with gr.Column(scale=1):
            with gr.Group(elem_classes="results-card"):
                gr.Markdown("### Detected Objects", elem_classes="section-title")
                detection_preview = gr.Image(
                    label="",
                    elem_classes="image-container",
                )

    # Settings: caption language and YOLO variant, side by side (full width).
    with gr.Group(elem_classes="settings-container"):
        gr.Markdown("### Settings", elem_classes="section-title-left")

        with gr.Row(elem_classes="settings-row"):
            language_choice = gr.Radio(
                choices=[
                    ('繁體中文', 'zh'),
                    ('English', 'en'),
                ],
                value='en',
                label="Caption Language",
                elem_classes="radio-group-inline",
            )

            detector_choice = gr.Radio(
                choices=[
                    ('Fast (m)', 'm'),
                    ('Balanced (l)', 'l'),
                    ('Accurate (x)', 'x'),
                ],
                value='l',
                label="Detection Mode",
                elem_classes="radio-group-inline",
            )

    # Single centred action button.
    with gr.Row(elem_classes="button-row"):
        generate_button = gr.Button(
            "Generate Captions",
            variant="primary",
            elem_classes="generate-button",
        )

    # Static note about processing latency.
    gr.HTML("""
    <div style="text-align: center; margin-top: 16px; color: #7F8C8D; font-size: 14px;">
        <span style="opacity: 0.8;">⚡ Please be patient - AI processing may take some time</span>
    </div>
    """)

    # Caption results panel, populated by process_wrapper (full width).
    with gr.Group(elem_classes="caption-results-container"):
        gr.Markdown("### 📝 Generated Captions", elem_classes="section-title")
        caption_panel = gr.HTML(
            label="",
            elem_id="caption-results",
        )

    ui_manager.create_footer()

    # Wire the button: (image, detector variant, language) -> (overlay, captions).
    generate_button.click(
        fn=process_wrapper,
        inputs=[uploaded_image, detector_choice, language_choice],
        outputs=[detection_preview, caption_panel],
    )

if __name__ == "__main__":
    app.launch(share=True)
brand_detection_optimizer.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ from PIL import Image
4
+ from typing import Dict, List, Tuple
5
+ import numpy as np
6
+
7
class BrandDetectionOptimizer:
    """Balance brand-detection speed against accuracy.

    Cheap pre-screening (OCR text matching plus a coarse CLIP category pass)
    prunes the brand list so the expensive per-brand deep detection only runs
    on plausible candidates.
    """

    def __init__(self, clip_manager, ocr_manager, prompt_library):
        # Injected collaborators; this class owns no models of its own.
        self.clip_manager = clip_manager
        self.ocr_manager = ocr_manager
        self.prompt_library = prompt_library

    def quick_brand_prescreening(self, image) -> List[str]:
        """Return the brands plausibly present in *image*.

        Combines three signals, from strongest to weakest:
        1. OCR alias matching, 2. coarse CLIP category classification,
        3. a small default set when neither signal fires.

        Returns:
            List of brand names worth running deep detection on.
        """
        candidates = set()

        # --- Pass 1: OCR scan (fastest and most reliable signal). ---
        ocr_hits = self.ocr_manager.extract_text(image, use_brand_preprocessing=True)
        for hit in ocr_hits:
            token = hit['text'].upper()

            # Skip tiny fragments that would match almost anything.
            if len(token) < 2:
                continue

            for name, info in self.prompt_library.get_all_brands().items():
                for alias in (a.upper() for a in info.get('aliases', [])):
                    if alias == token:
                        candidates.add(name)
                        break
                    # Partial overlap only counts for aliases of length >= 3
                    # and with a high enough length ratio in either direction.
                    if len(alias) >= 3 and (
                        (alias in token and len(alias) / len(token) > 0.6)
                        or (token in alias and len(token) / len(alias) > 0.6)
                    ):
                        candidates.add(name)
                        break

        # --- Pass 2: coarse visual category classification via CLIP. ---
        category_prompts = {
            'luxury': 'luxury brand product with monogram pattern and leather details',
            'sportswear': 'sportswear brand product with athletic logo and swoosh design',
            'tech': 'technology brand product with minimalist design and metal finish',
            'automotive': 'luxury car brand with distinctive grille and emblem',
            'watches': 'luxury watch with distinctive dial and brand logo',
            'fashion': 'fashion brand product with signature pattern or logo'
        }

        scores = self.clip_manager.classify_zero_shot(
            image, list(category_prompts.values())
        )

        # Keep only the two highest-scoring categories.
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:2]

        # Invert the prompt map so scored prompt text leads back to its category.
        prompt_to_category = {text: cat for cat, text in category_prompts.items()}

        for prompt_text, score in ranked:
            # Threshold raised from 0.15 to 0.30 to cut false positives.
            if score > 0.30:
                category = prompt_to_category[prompt_text]
                # Every brand in a confidently-matched category stays in play.
                candidates.update(
                    self.prompt_library.get_brands_by_category(category).keys()
                )

        # --- Pass 3: fallback when neither signal produced any candidate. ---
        # Not hard-coding results — just a sensible default of visually
        # distinctive, common brands when there is no evidence at all.
        if not candidates:
            candidates.update(['Louis Vuitton', 'Gucci', 'Nike'])

        # No count cap here; downstream quality filtering handles pruning.
        return list(candidates)

    def smart_region_selection(self, image,
                               saliency_regions: List[Dict]) -> List[Tuple[int, int, int, int]]:
        """Pick image regions worth scanning for brands.

        Replaces an exhaustive grid scan with: top saliency regions
        (padded), the image centre, and finally the full frame as a
        last resort.

        Args:
            image: image object exposing a PIL-style ``.size`` (width, height).
            saliency_regions: saliency detector output, each with a ``bbox``.

        Returns:
            List of (x1, y1, x2, y2) boxes to scan.
        """
        width, height = image.size
        picks = []

        # Strategy 1: top-3 salient regions, padded for surrounding context.
        for region in (saliency_regions or [])[:3]:
            bbox = region.get('bbox')
            if not bbox:
                continue
            x1, y1, x2, y2 = bbox
            pad = 20
            x1, y1 = max(0, x1 - pad), max(0, y1 - pad)
            x2, y2 = min(width, x2 + pad), min(height, y2 + pad)
            # Discard regions too small to contain a recognisable logo.
            if (x2 - x1) > 100 and (y2 - y1) > 100:
                picks.append((x1, y1, x2, y2))

        # Strategy 2: the centre crop — brands are usually framed centrally.
        cx, cy = width // 2, height // 2
        span = min(width, height) // 2
        picks.append((
            max(0, cx - span // 2),
            max(0, cy - span // 2),
            min(width, cx + span // 2),
            min(height, cy + span // 2),
        ))

        # Strategy 3: full frame when nothing else was selected.
        if not picks:
            picks.append((0, 0, width, height))

        return picks

    def compute_brand_confidence_boost(self, brand_name: str,
                                       ocr_results: List[Dict],
                                       base_confidence: float) -> float:
        """Boost a brand's visual confidence using OCR evidence.

        An exact alias hit adds up to 0.40 (scaled by OCR confidence); a
        partial hit up to 0.25. The result is capped at 0.95.

        Args:
            brand_name: brand to look up in the prompt library.
            ocr_results: OCR items with ``text`` and ``confidence`` keys.
            base_confidence: confidence from visual matching alone.

        Returns:
            The boosted confidence score.
        """
        info = self.prompt_library.get_brand_prompts(brand_name)
        if not info:
            return base_confidence

        aliases = [a.upper() for a in info.get('aliases', [])]

        best_boost = 0.0
        for item in ocr_results:
            token = item['text'].upper()
            ocr_conf = item['confidence']

            for alias in aliases:
                if alias == token:
                    # Exact match: strongest evidence, up to +0.40.
                    best_boost = max(best_boost, 0.40 * ocr_conf)
                elif (alias in token or token in alias) and len(alias) > 2:
                    # Partial overlap (short aliases excluded to avoid noise).
                    best_boost = max(best_boost, 0.25 * ocr_conf)

        # Never let the boosted score exceed 0.95.
        return min(base_confidence + best_boost, 0.95)
186
+
187
# Module-load confirmation, matching this project's notebook-style progress messages.
print("✓ BrandDetectionOptimizer (performance and accuracy optimizer) defined")
brand_prompts.py ADDED
@@ -0,0 +1,970 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List, Optional
3
+
4
+ class BrandPrompts:
5
+ """
6
+ 品牌視覺特徵與多模態識別資料庫
7
+ 提供品牌的視覺線索、OpenCLIP prompts、Hashtags
8
+ """
9
+
10
+ def __init__(self):
11
+ """初始化品牌資料庫"""
12
+
13
+ self.brand_prompts = {
14
+ # ===== 奢侈品牌 Luxury Brands =====
15
+ 'luxury': {
16
+ "Louis Vuitton": {
17
+ "strong_cues": [
18
+ "LV monogram pattern with interlocking L and V letters on brown canvas",
19
+ "Brown canvas with golden hardware and leather trim showing Louis Vuitton signature",
20
+ "Damier checkerboard pattern in brown and tan showing LV design",
21
+ "Monogram flower motifs with LV initials repeated across surface"
22
+ ],
23
+ "weak_cues": [
24
+ "Luxury handbag with structured silhouette and top handles",
25
+ "Premium leather goods with golden metal accents",
26
+ "Designer bag with classic proportions and elegant hardware"
27
+ ],
28
+ "region_contexts": ["bag_panel", "luggage_surface", "wallet_front", "accessory_detail"],
29
+ "openclip_prompts": {
30
+ "bag_panel": [
31
+ "Louis Vuitton monogram canvas handbag with leather trim and brass hardware",
32
+ "LV brown monogram pattern on luxury bag with golden clasps",
33
+ "Designer handbag showing Louis Vuitton signature canvas and leather details"
34
+ ],
35
+ "luggage_surface": [
36
+ "Louis Vuitton monogram luggage with brown canvas and leather corners",
37
+ "LV travel bag showing iconic monogram pattern with metallic accents",
38
+ "Luxury suitcase with Louis Vuitton canvas and protective leather trim"
39
+ ],
40
+ "wallet_front": [
41
+ "Louis Vuitton monogram wallet with compact folding design",
42
+ "LV small leather good showing monogram canvas and card slots",
43
+ "Designer wallet with Louis Vuitton pattern and golden hardware"
44
+ ]
45
+ },
46
+ "aliases": ["LV", "Louis Vuitton Monogram", "VUITTON", "LOUIS VUITTON"],
47
+ "hashtags": {
48
+ "zh": ["LV", "路易威登", "奢侈品", "精品包", "時尚"],
49
+ "en": ["LouisVuitton", "LV", "LuxuryFashion", "DesignerBag", "Luxury"]
50
+ },
51
+ "visual_distinctive": True,
52
+ "text_prominent": False
53
+ },
54
+
55
+ "Gucci": {
56
+ "strong_cues": [
57
+ "Interlocking double G logo in gold or silver metal with heart shape design",
58
+ "GG logo in shiny gold brass hardware on black quilted leather",
59
+ "Green and red web stripe on beige or brown canvas background",
60
+ "GG monogram pattern repeated across fabric or leather surface",
61
+ "Chevron matelassé quilted leather with V-shaped stitching pattern",
62
+ "Heart-shaped double G logo with antique gold finish",
63
+ "Bamboo handle detail on handbag with curved shape and metal hardware"
64
+ ],
65
+ "weak_cues": [
66
+ "Luxury fashion item with bold logo placement and premium materials",
67
+ "Designer accessory with distinctive hardware and Italian branding",
68
+ "High-end quilted leather goods with geometric stitching pattern",
69
+ "Black leather handbag with gold chain strap and structured silhouette",
70
+ "Luxury bag with chevron quilting and metallic hardware accents"
71
+ ],
72
+ "region_contexts": ["bag_front", "bag_panel", "belt_buckle", "shoe_detail", "accessory_surface", "logo_area"],
73
+ "openclip_prompts": {
74
+ "bag_front": [
75
+ "Gucci Marmont handbag with heart-shaped GG logo in antique gold on quilted black leather",
76
+ "Designer bag showing Gucci chevron matelassé quilted pattern with gold GG hardware",
77
+ "Luxury handbag with double G heart logo and V-shaped quilting on black leather",
78
+ "Gucci bag with interlocking GG logo web stripe and canvas texture",
79
+ "Black quilted leather Gucci bag with shiny gold double G emblem and chain strap",
80
+ "Gucci Marmont camera bag with chevron quilted leather and gold hardware",
81
+ "Designer handbag featuring Gucci signature GG logo with geometric quilting pattern"
82
+ ],
83
+ "bag_panel": [
84
+ "Gucci matelassé quilted leather surface with chevron V-pattern stitching",
85
+ "Black quilted leather panel with Gucci heart-shaped GG logo in center",
86
+ "Luxury leather with geometric quilting showing Gucci craftsmanship and gold hardware",
87
+ "Chevron stitched leather surface with interlocking GG emblem in antique gold",
88
+ "Gucci quilted pattern with V-shaped chevron design and metallic logo placement"
89
+ ],
90
+ "belt_buckle": [
91
+ "Gucci belt with large interlocking GG buckle in polished gold metal",
92
+ "Designer belt showing double G logo buckle with black or brown leather strap",
93
+ "Luxury belt with Gucci signature GG buckle in brass finish and Italian leather",
94
+ "Gucci GG Marmont belt with textured double G buckle and leather band"
95
+ ],
96
+ "logo_area": [
97
+ "Close-up of Gucci interlocking GG logo in gold metal with heart shape",
98
+ "Gucci double G emblem in antique gold brass on black leather background",
99
+ "Heart-shaped GG logo with metallic gold finish showing Gucci branding",
100
+ "Shiny gold Gucci GG hardware on quilted matelassé leather surface"
101
+ ]
102
+ },
103
+ "aliases": ["GG", "GUCCI", "Gucci Marmont"],
104
+ "hashtags": {
105
+ "zh": ["Gucci", "古馳", "奢侈品", "精品", "義大利時尚", "Marmont"],
106
+ "en": ["Gucci", "LuxuryFashion", "DesignerBrand", "ItalianFashion", "GG", "GucciMarmont"]
107
+ },
108
+ "visual_distinctive": True,
109
+ "text_prominent": False
110
+ },
111
+
112
+ "Chanel": {
113
+ "strong_cues": [
114
+ "Interlocking double C logo in metal or quilted leather",
115
+ "Quilted diamond pattern leather with chain strap",
116
+ "Black and white color scheme with gold or silver chain",
117
+ "Camellia flower motif as decorative element"
118
+ ],
119
+ "weak_cues": [
120
+ "Elegant quilted leather handbag with chain details",
121
+ "Luxury fashion item with classic French design",
122
+ "Designer accessory with sophisticated minimalist styling"
123
+ ],
124
+ "region_contexts": ["bag_flap", "jewelry_detail", "perfume_bottle", "clothing_label"],
125
+ "openclip_prompts": {
126
+ "bag_flap": [
127
+ "Chanel quilted bag with interlocking CC logo and chain strap",
128
+ "Classic flap bag showing Chanel diamond quilting and gold hardware",
129
+ "Luxury handbag with Chanel CC closure and leather chain"
130
+ ]
131
+ },
132
+ "aliases": ["CC", "CHANEL"],
133
+ "hashtags": {
134
+ "zh": ["Chanel", "香奈兒", "奢侈品", "精品包", "法國時尚"],
135
+ "en": ["Chanel", "ChanelBag", "Luxury", "FrenchFashion", "ClassicBag"]
136
+ },
137
+ "visual_distinctive": True,
138
+ "text_prominent": False
139
+ },
140
+
141
+ "Hermès": {
142
+ "strong_cues": [
143
+ "Orange box or shopping bag with brown ribbon",
144
+ "Birkin or Kelly bag with distinctive silhouette and hardware",
145
+ "H logo belt buckle in polished metal",
146
+ "Saddle stitching on leather goods with equestrian heritage"
147
+ ],
148
+ "weak_cues": [
149
+ "Ultra-luxury leather handbag with exceptional craftsmanship",
150
+ "Designer accessory with understated elegance and premium materials",
151
+ "High-end fashion item with classic proportions and hardware"
152
+ ],
153
+ "region_contexts": ["bag_structure", "belt_buckle", "scarf_pattern", "packaging"],
154
+ "openclip_prompts": {
155
+ "bag_structure": [
156
+ "Hermès Birkin bag with structured leather and golden padlock",
157
+ "Luxury handbag showing Hermès Kelly bag silhouette with turnlock",
158
+ "Designer bag with Hermès craftsmanship and distinctive hardware"
159
+ ]
160
+ },
161
+ "aliases": ["HERMES", "HERMÈS", "BIRKIN", "KELLY"],
162
+ "hashtags": {
163
+ "zh": ["Hermès", "愛馬仕", "柏金包", "奢侈品", "頂級精品"],
164
+ "en": ["Hermes", "Birkin", "KellyBag", "Luxury", "UltraLuxury"]
165
+ },
166
+ "visual_distinctive": True,
167
+ "text_prominent": False
168
+ },
169
+
170
+ "Prada": {
171
+ "strong_cues": [
172
+ "Triangular metal logo plate with Prada Milano text",
173
+ "Saffiano leather with crosshatch texture pattern",
174
+ "Black nylon bag with triangular logo badge",
175
+ "Minimalist design with subtle branding placement"
176
+ ],
177
+ "weak_cues": [
178
+ "Italian luxury handbag with clean modern lines",
179
+ "Designer accessory with understated contemporary styling",
180
+ "High-end leather goods with minimalist aesthetic"
181
+ ],
182
+ "region_contexts": ["bag_front", "wallet_surface", "shoe_heel", "clothing_tag"],
183
+ "openclip_prompts": {
184
+ "bag_front": [
185
+ "Prada bag with triangular metal logo and saffiano leather",
186
+ "Designer handbag showing Prada Milano badge with textured leather",
187
+ "Luxury bag with Prada signature triangle and minimalist design"
188
+ ]
189
+ },
190
+ "aliases": ["PRADA", "MILANO"],
191
+ "hashtags": {
192
+ "zh": ["Prada", "普拉達", "奢侈品", "義大利精品", "時尚"],
193
+ "en": ["Prada", "ItalianLuxury", "DesignerBag", "LuxuryFashion", "Minimalist"]
194
+ },
195
+ "visual_distinctive": True,
196
+ "text_prominent": False
197
+ }
198
+ },
199
+
200
+ # ===== 運動品牌 Sportswear Brands =====
201
+ 'sportswear': {
202
+ "Nike": {
203
+ "strong_cues": [
204
+ "Swoosh logo in black white or colored variations",
205
+ "Just Do It slogan text accompanying swoosh",
206
+ "Air Jordan jumpman silhouette logo",
207
+ "Nike Air branding on shoe midsole or tongue"
208
+ ],
209
+ "weak_cues": [
210
+ "Athletic footwear with sporty performance design",
211
+ "Sportswear with moisture-wicking technical fabric",
212
+ "Running shoe with cushioned midsole and branded details"
213
+ ],
214
+ "region_contexts": ["shoe_side", "apparel_chest", "equipment_surface", "logo_placement"],
215
+ "openclip_prompts": {
216
+ "shoe_side": [
217
+ "Nike sneaker with swoosh logo on side panel",
218
+ "Athletic shoe showing Nike branding and Air technology",
219
+ "Running shoe with Nike swoosh and performance design"
220
+ ],
221
+ "apparel_chest": [
222
+ "Nike athletic wear with swoosh logo on chest",
223
+ "Sports apparel showing Nike branding and technical fabric",
224
+ "Performance clothing with Nike swoosh and Just Do It text"
225
+ ]
226
+ },
227
+ "aliases": ["NIKE", "JUST DO IT", "swoosh"],
228
+ "hashtags": {
229
+ "zh": ["Nike", "耐吉", "運動", "球鞋", "運動品牌"],
230
+ "en": ["Nike", "JustDoIt", "Sneakers", "Athletic", "Sportswear"]
231
+ },
232
+ "visual_distinctive": True,
233
+ "text_prominent": True
234
+ },
235
+
236
+ "Adidas": {
237
+ "strong_cues": [
238
+ "Three stripes design on side of shoes or apparel",
239
+ "Trefoil logo with three-leaf clover design",
240
+ "Performance logo with three bars forming mountain shape",
241
+ "Boost technology branding on shoe midsole"
242
+ ],
243
+ "weak_cues": [
244
+ "Athletic footwear with three-stripe design element",
245
+ "Sportswear with retro or performance styling",
246
+ "Running shoe with distinctive midsole technology"
247
+ ],
248
+ "region_contexts": ["shoe_side", "apparel_sleeve", "equipment_detail", "logo_area"],
249
+ "openclip_prompts": {
250
+ "shoe_side": [
251
+ "Adidas sneaker with three stripes on side panel",
252
+ "Athletic shoe showing Adidas branding and Boost sole",
253
+ "Sports footwear with Adidas three-stripe design"
254
+ ]
255
+ },
256
+ "aliases": ["ADIDAS", "ORIGINALS", "three stripes"],
257
+ "hashtags": {
258
+ "zh": ["Adidas", "愛迪達", "三條線", "運動", "球鞋"],
259
+ "en": ["Adidas", "ThreeStripes", "Sneakers", "Sportswear", "Athletic"]
260
+ },
261
+ "visual_distinctive": True,
262
+ "text_prominent": True
263
+ },
264
+
265
+ "Puma": {
266
+ "strong_cues": [
267
+ "Leaping puma cat logo in silhouette form",
268
+ "Puma wordmark text in distinctive font",
269
+ "Formstrip design on side of shoes",
270
+ "Cat logo combined with Puma text branding"
271
+ ],
272
+ "weak_cues": [
273
+ "Athletic footwear with sleek performance design",
274
+ "Sportswear with modern styling and branding",
275
+ "Running shoe with lightweight construction"
276
+ ],
277
+ "region_contexts": ["shoe_side", "apparel_detail", "equipment_logo"],
278
+ "openclip_prompts": {
279
+ "shoe_side": [
280
+ "Puma sneaker with cat logo and formstrip design",
281
+ "Athletic shoe showing Puma branding on side",
282
+ "Sports footwear with Puma leaping cat emblem"
283
+ ]
284
+ },
285
+ "aliases": ["PUMA"],
286
+ "hashtags": {
287
+ "zh": ["Puma", "彪馬", "運動品牌", "球鞋"],
288
+ "en": ["Puma", "Sneakers", "Athletic", "Sportswear"]
289
+ },
290
+ "visual_distinctive": True,
291
+ "text_prominent": True
292
+ },
293
+
294
+ "Under Armour": {
295
+ "strong_cues": [
296
+ "Interlocking UA logo design",
297
+ "HeatGear or ColdGear technology branding",
298
+ "Under Armour wordmark in athletic font",
299
+ "Performance fabric with visible texture pattern"
300
+ ],
301
+ "weak_cues": [
302
+ "Athletic apparel with technical performance features",
303
+ "Sportswear with moisture management technology",
304
+ "Training gear with modern athletic design"
305
+ ],
306
+ "region_contexts": ["apparel_chest", "shoe_detail", "equipment_surface"],
307
+ "openclip_prompts": {
308
+ "apparel_chest": [
309
+ "Under Armour shirt with UA logo on chest",
310
+ "Athletic wear showing Under Armour branding and HeatGear",
311
+ "Performance apparel with Under Armour logo and technical fabric"
312
+ ]
313
+ },
314
+ "aliases": ["UA", "UNDER ARMOUR"],
315
+ "hashtags": {
316
+ "zh": ["UnderArmour", "安德瑪", "運動服飾", "訓練裝備"],
317
+ "en": ["UnderArmour", "UA", "Athletic", "PerformanceGear", "Training"]
318
+ },
319
+ "visual_distinctive": False,
320
+ "text_prominent": True
321
+ }
322
+ },
323
+
324
+ # ===== 科技品牌 Tech Brands =====
325
+ 'tech': {
326
+ "Apple": {
327
+ "strong_cues": [
328
+ "Bitten apple logo in silver white or black",
329
+ "Minimalist aluminum or glass device design",
330
+ "iPhone with distinctive notch or dynamic island",
331
+ "MacBook with glowing apple logo on lid"
332
+ ],
333
+ "weak_cues": [
334
+ "Sleek electronic device with premium materials",
335
+ "Smartphone with edge-to-edge display design",
336
+ "Laptop with thin profile and minimal branding"
337
+ ],
338
+ "region_contexts": ["device_back", "laptop_lid", "packaging", "product_front"],
339
+ "openclip_prompts": {
340
+ "device_back": [
341
+ "iPhone back with apple logo and camera array",
342
+ "Apple device showing bitten apple emblem and glass back",
343
+ "Smartphone with Apple branding and premium finish"
344
+ ],
345
+ "laptop_lid": [
346
+ "MacBook with glowing apple logo on aluminum lid",
347
+ "Apple laptop showing minimalist design and apple emblem",
348
+ "Premium notebook with Apple branding and sleek profile"
349
+ ]
350
+ },
351
+ "aliases": ["APPLE", "IPHONE", "IPAD", "MACBOOK", "apple logo"],
352
+ "hashtags": {
353
+ "zh": ["Apple", "蘋果", "iPhone", "科技", "蘋果產品"],
354
+ "en": ["Apple", "iPhone", "MacBook", "Tech", "iOS"]
355
+ },
356
+ "visual_distinctive": True,
357
+ "text_prominent": False
358
+ },
359
+
360
+ "Samsung": {
361
+ "strong_cues": [
362
+ "Samsung wordmark logo in blue or white",
363
+ "Galaxy branding on smartphone",
364
+ "Curved edge display on premium devices",
365
+ "S Pen stylus with Samsung device"
366
+ ],
367
+ "weak_cues": [
368
+ "Android smartphone with large display",
369
+ "Electronic device with modern design",
370
+ "Tech product with screen and branding"
371
+ ],
372
+ "region_contexts": ["device_front", "product_back", "packaging"],
373
+ "openclip_prompts": {
374
+ "device_front": [
375
+ "Samsung Galaxy phone with curved display and minimal bezels",
376
+ "Smartphone showing Samsung branding and screen",
377
+ "Android device with Samsung logo and modern design"
378
+ ]
379
+ },
380
+ "aliases": ["SAMSUNG", "Galaxy"],
381
+ "hashtags": {
382
+ "zh": ["Samsung", "三星", "Galaxy", "安卓", "科技"],
383
+ "en": ["Samsung", "Galaxy", "Android", "Tech", "Smartphone"]
384
+ },
385
+ "visual_distinctive": False,
386
+ "text_prominent": True
387
+ },
388
+
389
+ "Microsoft": {
390
+ "strong_cues": [
391
+ "Four-colored square window logo",
392
+ "Surface branding on devices",
393
+ "Windows logo on keyboard or device",
394
+ "Xbox green logo on gaming products"
395
+ ],
396
+ "weak_cues": [
397
+ "Premium laptop or tablet device",
398
+ "Gaming console or controller",
399
+ "Computer hardware with modern design"
400
+ ],
401
+ "region_contexts": ["device_surface", "keyboard_area", "product_branding"],
402
+ "openclip_prompts": {
403
+ "device_surface": [
404
+ "Microsoft Surface laptop with logo and premium build",
405
+ "Device showing Microsoft branding and sleek design",
406
+ "Surface product with distinctive kickstand and logo"
407
+ ]
408
+ },
409
+ "aliases": ["MICROSOFT", "Surface", "Windows"],
410
+ "hashtags": {
411
+ "zh": ["Microsoft", "微軟", "Surface", "科技", "Windows"],
412
+ "en": ["Microsoft", "Surface", "Windows", "Tech", "Xbox"]
413
+ },
414
+ "visual_distinctive": False,
415
+ "text_prominent": True
416
+ }
417
+ },
418
+
419
+ # ===== 汽車品牌 Automotive Brands =====
420
+ 'automotive': {
421
+ "Mercedes-Benz": {
422
+ "strong_cues": [
423
+ "Three-pointed star logo in circle",
424
+ "Mercedes-Benz wordmark on vehicle",
425
+ "Large star emblem on front grille",
426
+ "Hood ornament with standing star"
427
+ ],
428
+ "weak_cues": [
429
+ "Luxury vehicle with premium design",
430
+ "Car with elegant styling and badge",
431
+ "Automobile with refined details"
432
+ ],
433
+ "region_contexts": ["front_grille", "hood_ornament", "wheel_center", "badge"],
434
+ "openclip_prompts": {
435
+ "front_grille": [
436
+ "Mercedes-Benz front with three-pointed star on grille",
437
+ "Luxury car showing Mercedes logo and elegant grille design",
438
+ "Vehicle with Mercedes-Benz star emblem and premium styling"
439
+ ]
440
+ },
441
+ "aliases": ["Mercedes", "Benz", "MB", "MERCEDES-BENZ"],
442
+ "hashtags": {
443
+ "zh": ["Mercedes", "賓士", "豪華車", "汽車", "德國車"],
444
+ "en": ["Mercedes", "Benz", "LuxuryCar", "German", "Automotive"]
445
+ },
446
+ "visual_distinctive": True,
447
+ "text_prominent": False
448
+ },
449
+
450
+ "BMW": {
451
+ "strong_cues": [
452
+ "Blue and white roundel logo with BMW letters",
453
+ "Kidney grille design on front",
454
+ "Hofmeister kink in rear window design",
455
+ "BMW M badge for performance models"
456
+ ],
457
+ "weak_cues": [
458
+ "Luxury sports sedan with dynamic styling",
459
+ "Premium vehicle with distinctive design",
460
+ "Car with performance-oriented features"
461
+ ],
462
+ "region_contexts": ["front_badge", "wheel_center", "rear_emblem"],
463
+ "openclip_prompts": {
464
+ "front_badge": [
465
+ "BMW front with blue and white roundel and kidney grille",
466
+ "Luxury car showing BMW logo and distinctive grille design",
467
+ "Vehicle with BMW emblem and sporty styling"
468
+ ]
469
+ },
470
+ "aliases": ["BMW"],
471
+ "hashtags": {
472
+ "zh": ["BMW", "寶馬", "豪華車", "德國車", "性能車"],
473
+ "en": ["BMW", "LuxuryCar", "German", "Performance", "Ultimate Driving Machine"]
474
+ },
475
+ "visual_distinctive": True,
476
+ "text_prominent": False
477
+ },
478
+
479
+ "Tesla": {
480
+ "strong_cues": [
481
+ "T-shaped logo resembling cross-section of electric motor",
482
+ "Tesla wordmark on vehicle",
483
+ "Minimalist design with flush door handles",
484
+ "Large touchscreen display in interior"
485
+ ],
486
+ "weak_cues": [
487
+ "Electric vehicle with modern design",
488
+ "Car with clean aerodynamic styling",
489
+ "Automobile with minimal exterior branding"
490
+ ],
491
+ "region_contexts": ["front_badge", "rear_emblem", "wheel_center"],
492
+ "openclip_prompts": {
493
+ "front_badge": [
494
+ "Tesla front with T logo and minimalist design",
495
+ "Electric vehicle showing Tesla branding and clean styling",
496
+ "Car with Tesla emblem and aerodynamic profile"
497
+ ]
498
+ },
499
+ "aliases": ["TESLA"],
500
+ "hashtags": {
501
+ "zh": ["Tesla", "特斯拉", "電動車", "科技", "環保"],
502
+ "en": ["Tesla", "ElectricVehicle", "EV", "Tech", "Sustainable"]
503
+ },
504
+ "visual_distinctive": True,
505
+ "text_prominent": False
506
+ }
507
+ },
508
+
509
+ # ===== 鐘錶品牌 Watch Brands =====
510
+ 'watches': {
511
+ "Rolex": {
512
+ "strong_cues": [
513
+ "Crown logo at 12 o'clock position",
514
+ "Rolex wordmark on dial with Oyster Perpetual text",
515
+ "Cyclops date magnifier on crystal",
516
+ "Jubilee or Oyster bracelet design"
517
+ ],
518
+ "weak_cues": [
519
+ "Luxury watch with metal bracelet",
520
+ "Timepiece with classic round case",
521
+ "Wristwatch with premium finish"
522
+ ],
523
+ "region_contexts": ["watch_dial", "bracelet_clasp", "case_side"],
524
+ "openclip_prompts": {
525
+ "watch_dial": [
526
+ "Rolex watch dial with crown logo and Oyster Perpetual text",
527
+ "Luxury timepiece showing Rolex branding and date window",
528
+ "Wristwatch with Rolex crown emblem and classic design"
529
+ ]
530
+ },
531
+ "aliases": ["ROLEX", "OYSTER PERPETUAL"],
532
+ "hashtags": {
533
+ "zh": ["Rolex", "勞力士", "手錶", "奢華", "瑞士錶"],
534
+ "en": ["Rolex", "LuxuryWatch", "Swiss", "Timepiece", "OysterPerpetual"]
535
+ },
536
+ "visual_distinctive": True,
537
+ "text_prominent": True
538
+ },
539
+
540
+ "Omega": {
541
+ "strong_cues": [
542
+ "Omega symbol Ω on dial or case",
543
+ "Seamaster or Speedmaster model branding",
544
+ "Co-Axial escapement text on dial",
545
+ "Distinctive bracelet or strap design"
546
+ ],
547
+ "weak_cues": [
548
+ "Swiss luxury watch with sporty design",
549
+ "Timepiece with professional appearance",
550
+ "Wristwatch with precision craftsmanship"
551
+ ],
552
+ "region_contexts": ["watch_dial", "case_back", "bracelet"],
553
+ "openclip_prompts": {
554
+ "watch_dial": [
555
+ "Omega watch dial with Ω symbol and Seamaster branding",
556
+ "Luxury timepiece showing Omega logo and Co-Axial text",
557
+ "Wristwatch with Omega emblem and professional design"
558
+ ]
559
+ },
560
+ "aliases": ["OMEGA", "Ω"],
561
+ "hashtags": {
562
+ "zh": ["Omega", "歐米茄", "手錶", "瑞士錶", "奢華"],
563
+ "en": ["Omega", "Seamaster", "Speedmaster", "SwissWatch", "Luxury"]
564
+ },
565
+ "visual_distinctive": True,
566
+ "text_prominent": True
567
+ }
568
+ },
569
+
570
+ # ===== 時尚品牌 Fashion Brands =====
571
+ 'fashion': {
572
+ "Zara": {
573
+ "strong_cues": [
574
+ "Zara wordmark in sans-serif font",
575
+ "Minimalist clothing tag design",
576
+ "Fast fashion styling with current trends",
577
+ "Zara logo on shopping bag or packaging"
578
+ ],
579
+ "weak_cues": [
580
+ "Contemporary fashion apparel",
581
+ "Trendy clothing with modern cut",
582
+ "Affordable fashion item"
583
+ ],
584
+ "region_contexts": ["clothing_tag", "shopping_bag", "label"],
585
+ "openclip_prompts": {
586
+ "clothing_tag": [
587
+ "Zara clothing tag with brand logo",
588
+ "Fashion item showing Zara label",
589
+ "Apparel with Zara branding"
590
+ ]
591
+ },
592
+ "aliases": ["ZARA"],
593
+ "hashtags": {
594
+ "zh": ["Zara", "時尚", "快時尚", "穿搭"],
595
+ "en": ["Zara", "Fashion", "FastFashion", "Style", "OOTD"]
596
+ },
597
+ "visual_distinctive": False,
598
+ "text_prominent": True
599
+ },
600
+
601
+ "H&M": {
602
+ "strong_cues": [
603
+ "H&M logo in red and white",
604
+ "Hennes & Mauritz full brand name",
605
+ "Conscious collection labeling",
606
+ "Distinctive red shopping bag"
607
+ ],
608
+ "weak_cues": [
609
+ "Affordable fashion clothing",
610
+ "Casual apparel with trendy design",
611
+ "Fast fashion item"
612
+ ],
613
+ "region_contexts": ["clothing_tag", "label", "shopping_bag"],
614
+ "openclip_prompts": {
615
+ "clothing_tag": [
616
+ "H&M clothing tag with red and white logo",
617
+ "Fashion item showing H&M branding",
618
+ "Apparel with Hennes & Mauritz label"
619
+ ]
620
+ },
621
+ "aliases": ["HM", "H&M", "HENNES", "MAURITZ"],
622
+ "hashtags": {
623
+ "zh": ["HM", "時尚", "快時尚", "平價時尚"],
624
+ "en": ["HM", "Fashion", "FastFashion", "Style", "AffordableFashion"]
625
+ },
626
+ "visual_distinctive": False,
627
+ "text_prominent": True
628
+ },
629
+
630
+ "Ralph Lauren": {
631
+ "strong_cues": [
632
+ "Polo player on horse logo",
633
+ "Polo Ralph Lauren text branding",
634
+ "Preppy American style clothing",
635
+ "Polo shirt with collar and logo"
636
+ ],
637
+ "weak_cues": [
638
+ "Classic American fashion item",
639
+ "Preppy styled clothing",
640
+ "Casual wear with logo detail"
641
+ ],
642
+ "region_contexts": ["shirt_chest", "clothing_tag", "logo_placement"],
643
+ "openclip_prompts": {
644
+ "shirt_chest": [
645
+ "Polo shirt with Ralph Lauren polo player logo",
646
+ "Casual wear showing Polo Ralph Lauren emblem",
647
+ "Apparel with Ralph Lauren polo player branding"
648
+ ]
649
+ },
650
+ "aliases": ["Polo", "RALPH LAUREN", "RL"],
651
+ "hashtags": {
652
+ "zh": ["RalphLauren", "Polo", "美式風格", "經典時尚"],
653
+ "en": ["RalphLauren", "Polo", "AmericanStyle", "Preppy", "Classic"]
654
+ },
655
+ "visual_distinctive": True,
656
+ "text_prominent": True
657
+ },
658
+
659
+ "Tommy Hilfiger": {
660
+ "strong_cues": [
661
+ "Red white and blue flag logo",
662
+ "Tommy Hilfiger wordmark text",
663
+ "Preppy American sportswear styling",
664
+ "Flag emblem on clothing"
665
+ ],
666
+ "weak_cues": [
667
+ "Casual American fashion",
668
+ "Sporty preppy clothing",
669
+ "Logo-embellished apparel"
670
+ ],
671
+ "region_contexts": ["clothing_chest", "tag", "logo_area"],
672
+ "openclip_prompts": {
673
+ "clothing_chest": [
674
+ "Tommy Hilfiger apparel with flag logo",
675
+ "Casual wear showing Tommy Hilfiger branding",
676
+ "Clothing with red white blue Tommy emblem"
677
+ ]
678
+ },
679
+ "hashtags": {
680
+ "zh": ["TommyHilfiger", "美式休閒", "時尚", "經典"],
681
+ "en": ["TommyHilfiger", "American", "Preppy", "Fashion", "Classic"]
682
+ }
683
+ },
684
+
685
+ "Uniqlo": {
686
+ "strong_cues": [
687
+ "Uniqlo wordmark in red and white",
688
+ "LifeWear philosophy branding",
689
+ "Minimalist Japanese design aesthetic",
690
+ "HeatTech or AIRism technology labels"
691
+ ],
692
+ "weak_cues": [
693
+ "Simple functional clothing",
694
+ "Basic casual apparel",
695
+ "Affordable everyday wear"
696
+ ],
697
+ "region_contexts": ["clothing_tag", "label", "shopping_bag"],
698
+ "openclip_prompts": {
699
+ "clothing_tag": [
700
+ "Uniqlo clothing tag with brand logo",
701
+ "Apparel showing Uniqlo LifeWear branding",
702
+ "Clothing with Uniqlo label and technology marker"
703
+ ]
704
+ },
705
+ "hashtags": {
706
+ "zh": ["Uniqlo", "優衣庫", "日系", "簡約", "基本款"],
707
+ "en": ["Uniqlo", "LifeWear", "Japanese", "Minimalist", "Basics"]
708
+ }
709
+ },
710
+
711
+ "Gap": {
712
+ "strong_cues": [
713
+ "Gap logo in blue square",
714
+ "Classic American casual styling",
715
+ "Denim and khaki product focus",
716
+ "Gap wordmark on tags"
717
+ ],
718
+ "weak_cues": [
719
+ "Casual American clothing",
720
+ "Basic everyday apparel",
721
+ "Classic wardrobe staples"
722
+ ],
723
+ "region_contexts": ["clothing_tag", "label", "logo_placement"],
724
+ "openclip_prompts": {
725
+ "clothing_tag": [
726
+ "Gap clothing tag with blue logo",
727
+ "Apparel showing Gap branding",
728
+ "Casual wear with Gap label"
729
+ ]
730
+ },
731
+ "hashtags": {
732
+ "zh": ["Gap", "美式休閒", "經典", "基本款"],
733
+ "en": ["Gap", "American", "Casual", "Classic", "Everyday"]
734
+ }
735
+ },
736
+
737
+ "Lacoste": {
738
+ "strong_cues": [
739
+ "Green crocodile logo",
740
+ "Polo shirt with crocodile emblem",
741
+ "French sportswear styling",
742
+ "Crocodile on left chest area"
743
+ ],
744
+ "weak_cues": [
745
+ "Tennis-inspired fashion",
746
+ "Sporty casual clothing",
747
+ "Preppy athletic wear"
748
+ ],
749
+ "region_contexts": ["shirt_chest", "clothing_detail", "logo_area"],
750
+ "openclip_prompts": {
751
+ "shirt_chest": [
752
+ "Lacoste polo shirt with green crocodile logo",
753
+ "Sportswear showing Lacoste emblem on chest",
754
+ "Tennis apparel with Lacoste crocodile branding"
755
+ ]
756
+ },
757
+ "hashtags": {
758
+ "zh": ["Lacoste", "鱷魚", "法國", "網球", "運動時尚"],
759
+ "en": ["Lacoste", "Crocodile", "French", "Tennis", "Sporty"]
760
+ }
761
+ },
762
+
763
+ "Calvin Klein": {
764
+ "strong_cues": [
765
+ "CK logo or Calvin Klein wordmark",
766
+ "Minimalist modern design aesthetic",
767
+ "Monochromatic color schemes",
768
+ "Underwear waistband with CK logo"
769
+ ],
770
+ "weak_cues": [
771
+ "Contemporary minimalist fashion",
772
+ "Modern casual clothing",
773
+ "Designer basics"
774
+ ],
775
+ "region_contexts": ["clothing_tag", "waistband", "logo_area"],
776
+ "openclip_prompts": {
777
+ "clothing_tag": [
778
+ "Calvin Klein clothing with CK logo",
779
+ "Apparel showing Calvin Klein minimalist branding",
780
+ "Fashion item with CK monogram"
781
+ ]
782
+ },
783
+ "hashtags": {
784
+ "zh": ["CalvinKlein", "CK", "簡約", "美式時尚", "現代"],
785
+ "en": ["CalvinKlein", "CK", "Minimalist", "Modern", "Designer"]
786
+ }
787
+ },
788
+
789
+ "Levi's": {
790
+ "strong_cues": [
791
+ "Red tab on back pocket of jeans",
792
+ "Two horse leather patch on waistband",
793
+ "501 or other style number branding",
794
+ "Arcuate stitching pattern on back pockets"
795
+ ],
796
+ "weak_cues": [
797
+ "Classic denim jeans",
798
+ "American workwear styling",
799
+ "Vintage-inspired casual wear"
800
+ ],
801
+ "region_contexts": ["jeans_pocket", "waistband_patch", "back_detail"],
802
+ "openclip_prompts": {
803
+ "jeans_pocket": [
804
+ "Levi's jeans with red tab on back pocket",
805
+ "Denim showing Levi's two horse patch and arcuate stitching",
806
+ "Jeans with Levi's 501 branding and classic details"
807
+ ]
808
+ },
809
+ "hashtags": {
810
+ "zh": ["Levis", "李維斯", "牛仔褲", "丹寧", "美式"],
811
+ "en": ["Levis", "Denim", "Jeans", "American", "501"]
812
+ }
813
+ },
814
+
815
+ "The North Face": {
816
+ "strong_cues": [
817
+ "Half dome logo design",
818
+ "The North Face wordmark",
819
+ "Outdoor technical gear styling",
820
+ "Logo patch on jacket or backpack"
821
+ ],
822
+ "weak_cues": [
823
+ "Outdoor athletic apparel",
824
+ "Technical outdoor gear",
825
+ "Adventure clothing"
826
+ ],
827
+ "region_contexts": ["jacket_chest", "backpack_front", "apparel_sleeve"],
828
+ "openclip_prompts": {
829
+ "jacket_chest": [
830
+ "The North Face jacket with half dome logo",
831
+ "Outdoor apparel showing North Face branding",
832
+ "Technical gear with The North Face emblem"
833
+ ]
834
+ },
835
+ "hashtags": {
836
+ "zh": ["TheNorthFace", "北臉", "戶外", "機能", "登山"],
837
+ "en": ["TheNorthFace", "Outdoor", "Adventure", "Technical", "Hiking"]
838
+ }
839
+ }
840
+ }
841
+ }
842
+
843
+ print(f"✓ Brand Prompts initialized with {self._count_brands()} brands across {len(self.brand_prompts)} categories")
844
+
845
+ def _count_brands(self) -> int:
846
+ """計算總品牌數量"""
847
+ total = 0
848
+ for category in self.brand_prompts.values():
849
+ total += len(category)
850
+ return total
851
+
852
+ def get_prompts(self, brand_name: str) -> Optional[Dict]:
853
+ """
854
+ 取得特定品牌的完整 prompt 資料
855
+
856
+ Args:
857
+ brand_name: 品牌名稱
858
+
859
+ Returns:
860
+ 品牌資料字典,若不存在則返回 None
861
+ """
862
+ for category in self.brand_prompts.values():
863
+ if brand_name in category:
864
+ result = category[brand_name].copy()
865
+ result['category'] = self.get_brand_category(brand_name)
866
+ return result
867
+ return None
868
+
869
+ def get_brand_category(self, brand_name: str) -> str:
870
+ """
871
+ 取得品牌類別
872
+
873
+ Args:
874
+ brand_name: 品牌名稱
875
+
876
+ Returns:
877
+ 品牌類別(luxury, sportswear, tech, etc.)
878
+ """
879
+ for category_name, brands in self.brand_prompts.items():
880
+ if brand_name in brands:
881
+ return category_name
882
+ return 'unknown'
883
+
884
+ def get_all_brands(self) -> Dict:
885
+ """
886
+ 取得所有品牌的扁平化字典
887
+
888
+ Returns:
889
+ 扁平化的品牌字典 {brand_name: brand_data}
890
+ """
891
+ flat_brands = {}
892
+ for category_name, brands in self.brand_prompts.items():
893
+ for brand_name, brand_data in brands.items():
894
+ brand_data_copy = brand_data.copy()
895
+ brand_data_copy['category'] = category_name
896
+ flat_brands[brand_name] = brand_data_copy
897
+ return flat_brands
898
+
899
+ def get_brands_by_category(self, category: str) -> Dict:
900
+ """
901
+ 取得特定類別的所有品牌
902
+
903
+ Args:
904
+ category: 類別名稱
905
+
906
+ Returns:
907
+ 該類別的品牌字典
908
+ """
909
+ return self.brand_prompts.get(category, {})
910
+
911
+ def search_brand_by_alias(self, alias: str) -> Optional[str]:
912
+ """
913
+ 根據別名搜尋品牌名稱(模糊匹配)
914
+
915
+ Args:
916
+ alias: 品牌別名或簡稱
917
+
918
+ Returns:
919
+ 品牌正式名稱,若找不到則返回 None
920
+ """
921
+ alias_lower = alias.lower()
922
+
923
+ # 簡單的別名映射
924
+ alias_map = {
925
+ 'lv': 'Louis Vuitton',
926
+ 'ck': 'Calvin Klein',
927
+ 'tnf': 'The North Face',
928
+ 'ua': 'Under Armour',
929
+ 'hm': 'H&M'
930
+ }
931
+
932
+ if alias_lower in alias_map:
933
+ return alias_map[alias_lower]
934
+
935
+ # 模糊匹配品牌名稱
936
+ for brand_name in self.get_all_brands().keys():
937
+ if alias_lower in brand_name.lower():
938
+ return brand_name
939
+
940
+ return None
941
+
942
+ def get_hashtags(self, brand_name: str, language: str = 'zh') -> List[str]:
943
+ """
944
+ 取得品牌的 hashtags
945
+
946
+ Args:
947
+ brand_name: 品牌名稱
948
+ language: 語言 ('zh', 'en', 或 'zh-en')
949
+
950
+ Returns:
951
+ Hashtag 列表
952
+ """
953
+ brand_data = self.get_prompts(brand_name)
954
+ if not brand_data:
955
+ return []
956
+
957
+ hashtags = brand_data.get('hashtags', {})
958
+
959
+ if language == 'zh':
960
+ return hashtags.get('zh', [])
961
+ elif language == 'en':
962
+ return hashtags.get('en', [])
963
+ elif language == 'zh-en' or language == 'both':
964
+ zh_tags = hashtags.get('zh', [])
965
+ en_tags = hashtags.get('en', [])
966
+ return zh_tags + en_tags
967
+ else:
968
+ return hashtags.get('zh', [])
969
+
970
+ print("✓ BrandPrompts defined")
brand_recognition_manager.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ from PIL import Image
4
+ from typing import Dict, List, Tuple
5
+ from rapidfuzz import fuzz
6
+ from prompt_library_manager import PromptLibraryManager
7
+ from brand_detection_optimizer import BrandDetectionOptimizer
8
+
9
+ class BrandRecognitionManager:
10
+ """Multi-modal brand recognition with detailed prompts (Visual + Text)"""
11
+
12
+ def __init__(self, clip_manager, ocr_manager, prompt_library=None):
13
+ self.clip_manager = clip_manager
14
+ self.ocr_manager = ocr_manager
15
+ self.prompt_library = prompt_library
16
+ self.flat_brands = prompt_library.get_all_brands()
17
+
18
+ # Initialize optimizer for smart brand detection
19
+ self.optimizer = BrandDetectionOptimizer(clip_manager, ocr_manager, prompt_library)
20
+
21
+ print(f"✓ Brand Recognition Manager loaded with {len(self.flat_brands)} brands (with optimizer)")
22
+
23
    def recognize_brand(self, image_region: Image.Image, full_image: Image.Image,
                       region_bbox: List[int] = None) -> List[Tuple[str, float, List[int]]]:
        """Recognize brands using detailed context-aware prompts

        Fusion pipeline: classify the crop's context, score every brand
        visually with context-specific CLIP prompts, refine via multi-scale
        matching, fuzzy-match OCR text, then fuse the signals with per-brand
        adaptive weights.

        Args:
            image_region: Cropped region containing potential brand
            full_image: Full image for OCR
            region_bbox: Bounding box [x1, y1, x2, y2] for visualization

        Returns:
            List of (brand_name, confidence, bbox) tuples
        """

        # Step 1: Classify region context (e.g. bag_panel, shoe_side)
        region_context = self._classify_region_context(image_region)
        print(f" [DEBUG] Region context classified as: {region_context}")

        # Step 2: Use context-specific OpenCLIP prompts
        brand_scores = {}

        for brand_name, brand_info in self.flat_brands.items():
            # Get best matching context for this brand
            best_context = self._match_region_to_brand_context(region_context, brand_info['region_contexts'])

            if best_context and best_context in brand_info['openclip_prompts']:
                # Use context-specific prompts
                prompts = brand_info['openclip_prompts'][best_context]
                visual_scores = self.clip_manager.classify_zero_shot(image_region, prompts)

                # Average scores from all prompts
                avg_score = sum(visual_scores.values()) / len(visual_scores) if visual_scores else 0.0
            else:
                # Fallback to strong cues
                prompts = brand_info['strong_cues'][:5] # Top 5 strong cues
                visual_scores = self.clip_manager.classify_zero_shot(image_region, prompts)
                avg_score = sum(visual_scores.values()) / len(visual_scores) if visual_scores else 0.0

            brand_scores[brand_name] = avg_score

        # Step 2.5: Multi-scale visual matching for better robustness
        # (takes the max score across 0.8x/1.0x/1.2x resizes)
        brand_scores = self._multi_scale_visual_matching(image_region, brand_scores)

        # Step 3: OCR text matching with brand-optimized preprocessing
        # NOTE: OCR runs on the *full* image, not the crop, so wordmarks
        # outside the region can still contribute to a brand's score.
        ocr_results = self.ocr_manager.extract_text(full_image, use_brand_preprocessing=True)
        text_matches = self._fuzzy_text_matching(ocr_results)

        print(f" [DEBUG] OCR found {len(ocr_results)} text regions")
        if text_matches:
            print(f" [DEBUG] OCR brand matches: {text_matches}")

        # Step 4: Adaptive weighted fusion (dynamic weights per brand)
        final_scores = {}
        for brand_name in self.flat_brands.keys():
            visual_score = brand_scores.get(brand_name, 0.0)
            # text_matches maps brand -> (fuzzy_ratio, ocr_confidence)
            text_score, ocr_conf = text_matches.get(brand_name, (0.0, 0.0))

            # Calculate adaptive weights based on brand characteristics
            visual_weight, text_weight, ocr_weight = self._calculate_adaptive_weights(
                brand_name, visual_score, text_score, ocr_conf
            )

            # Weighted fusion with adaptive weights; visual score passes
            # through a sigmoid (_scale_visual) before weighting.
            final_score = (
                visual_weight * self._scale_visual(visual_score) +
                text_weight * text_score +
                ocr_weight * ocr_conf
            )
            final_scores[brand_name] = final_score

        sorted_scores = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:5]
        print(f" [DEBUG] Top 5 brand scores:")
        for brand, score in sorted_scores:
            print(f" {brand}: {score:.4f} (visual={brand_scores.get(brand, 0):.4f}, text={text_matches.get(brand, (0, 0))[0]:.4f})")

        # Return confident matches with bounding boxes.
        # Every brand above the (deliberately lenient) 0.10 threshold is
        # reported, all sharing the same region_bbox.
        confident_brands = []
        for brand_name, score in final_scores.items():
            if score > 0.10:
                confident_brands.append((brand_name, score, region_bbox))
                print(f" [DEBUG] ✓ Brand detected: {brand_name} (confidence: {score:.4f})")

        confident_brands.sort(key=lambda x: x[1], reverse=True)

        if not confident_brands:
            print(f" [DEBUG] ✗ No brands passed threshold 0.10")

        return confident_brands
110
+
111
+ def _classify_region_context(self, image_region: Image.Image) -> str:
112
+ """Classify what type of region this is (bag_panel, shoe_side, etc.)"""
113
+ context_labels = [
114
+ 'bag panel with pattern',
115
+ 'luggage surface with branding',
116
+ 'luxury trunk with monogram pattern',
117
+ 'vintage travel trunk with hardware',
118
+ 'shoe side view',
119
+ 'device back cover',
120
+ 'apparel chest area',
121
+ 'belt buckle',
122
+ 'storefront sign',
123
+ 'product tag or label',
124
+ 'wallet surface',
125
+ 'perfume bottle',
126
+ 'watch dial or face',
127
+ 'car front grille',
128
+ 'laptop lid'
129
+ ]
130
+
131
+ scores = self.clip_manager.classify_zero_shot(image_region, context_labels)
132
+
133
+ # Map to simplified contexts
134
+ context_mapping = {
135
+ 'bag panel with pattern': 'bag_panel',
136
+ 'luggage surface with branding': 'luggage_surface',
137
+ 'luxury trunk with monogram pattern': 'trunk_body',
138
+ 'vintage travel trunk with hardware': 'trunk_body',
139
+ 'shoe side view': 'shoe_side',
140
+ 'device back cover': 'device_back',
141
+ 'apparel chest area': 'apparel_chest',
142
+ 'belt buckle': 'belt_buckle',
143
+ 'storefront sign': 'storefront',
144
+ 'product tag or label': 'product_tag',
145
+ 'wallet surface': 'wallet',
146
+ 'perfume bottle': 'perfume_bottle',
147
+ 'watch dial or face': 'watch_dial',
148
+ 'car front grille': 'car_front',
149
+ 'laptop lid': 'laptop_lid'
150
+ }
151
+
152
+ top_context = max(scores.items(), key=lambda x: x[1])[0]
153
+ return context_mapping.get(top_context, 'unknown')
154
+
155
+ def _match_region_to_brand_context(self, region_context: str, brand_contexts: List[str]) -> str:
156
+ """Match detected region context to brand's available contexts"""
157
+ if region_context in brand_contexts:
158
+ return region_context
159
+ # Fuzzy matching
160
+ for brand_context in brand_contexts:
161
+ if region_context.split('_')[0] in brand_context:
162
+ return brand_context
163
+ return None
164
+
165
+ def _fuzzy_text_matching(self, ocr_results: List[Dict]) -> Dict[str, Tuple[float, float]]:
166
+ """Fuzzy text matching using brand aliases (optimized for logo text)"""
167
+ matches = {}
168
+
169
+ for ocr_item in ocr_results:
170
+ text = ocr_item['text']
171
+ conf = ocr_item['confidence']
172
+
173
+ for brand_name, brand_info in self.flat_brands.items():
174
+ # Check all aliases
175
+ all_names = [brand_name] + brand_info.get('aliases', [])
176
+
177
+ for alias in all_names:
178
+ ratio = fuzz.ratio(text, alias) / 100.0
179
+ if ratio > 0.70: # Lowered threshold for better recall
180
+ if brand_name not in matches or ratio > matches[brand_name][0]:
181
+ matches[brand_name] = (ratio, conf)
182
+
183
+ return matches
184
+
185
+ def _scale_visual(self, score: float) -> float:
186
+ """Scale visual score using sigmoid"""
187
+ return 1 / (1 + math.exp(-10 * (score - 0.5)))
188
+
189
+ def _calculate_adaptive_weights(self, brand_name: str, visual_score: float,
190
+ text_score: float, ocr_conf: float) -> tuple:
191
+ """
192
+ Calculate adaptive weights based on brand characteristics and signal strengths
193
+
194
+ Args:
195
+ brand_name: Name of the brand
196
+ visual_score: Visual similarity score
197
+ text_score: Text matching score
198
+ ocr_conf: OCR confidence
199
+
200
+ Returns:
201
+ Tuple of (visual_weight, text_weight, ocr_weight)
202
+ """
203
+ brand_info = self.prompt_library.get_brand_prompts(brand_name)
204
+
205
+ if not brand_info:
206
+ # Default balanced weights
207
+ return 0.50, 0.30, 0.20
208
+
209
+ # Base weights based on brand characteristics
210
+ if brand_info.get('visual_distinctive', False):
211
+ # Visually distinctive brands (LV, Burberry)
212
+ visual_weight = 0.65
213
+ text_weight = 0.20
214
+ ocr_weight = 0.15
215
+ elif brand_info.get('text_prominent', False):
216
+ # Text-prominent brands (Nike, Adidas)
217
+ visual_weight = 0.30
218
+ text_weight = 0.30
219
+ ocr_weight = 0.40
220
+ else:
221
+ # Balanced for general brands
222
+ visual_weight = 0.50
223
+ text_weight = 0.30
224
+ ocr_weight = 0.20
225
+
226
+ # Dynamic adjustment based on signal strength
227
+ # If visual signal is very strong, boost its weight
228
+ if visual_score > 0.7:
229
+ boost = 0.10
230
+ visual_weight += boost
231
+ text_weight -= boost * 0.5
232
+ ocr_weight -= boost * 0.5
233
+
234
+ # If OCR has very high confidence, boost its weight
235
+ if ocr_conf > 0.85:
236
+ boost = 0.10
237
+ ocr_weight += boost
238
+ visual_weight -= boost * 0.6
239
+ text_weight -= boost * 0.4
240
+
241
+ # If text match is very strong, boost its weight
242
+ if text_score > 0.80:
243
+ boost = 0.08
244
+ text_weight += boost
245
+ visual_weight -= boost * 0.5
246
+ ocr_weight -= boost * 0.5
247
+
248
+ # Normalize weights to sum to 1
249
+ total = visual_weight + text_weight + ocr_weight
250
+ return visual_weight / total, text_weight / total, ocr_weight / total
251
+
252
    def _multi_scale_visual_matching(self, image_region: Image.Image,
                                   initial_scores: Dict[str, float]) -> Dict[str, float]:
        """
        Apply multi-scale matching to improve robustness

        Re-scores every brand at 0.8x / 1.0x / 1.2x resizes of the crop and
        keeps the maximum score per brand across the scales that ran.

        Args:
            image_region: Image region to analyze
            initial_scores: Initial brand scores from single-scale matching

        Returns:
            Updated brand scores with multi-scale matching
        """
        scales = [0.8, 1.0, 1.2]  # Three scales
        multi_scale_scores = {brand: [] for brand in initial_scores.keys()}

        for scale in scales:
            # Resize image
            new_width = int(image_region.width * scale)
            new_height = int(image_region.height * scale)

            # Ensure minimum size; tiny crops are skipped entirely at 0.8x.
            if new_width < 50 or new_height < 50:
                continue

            try:
                scaled_img = image_region.resize((new_width, new_height), Image.Resampling.LANCZOS)

                # Re-run classification on each brand's prompts
                for brand_name, brand_info in self.flat_brands.items():
                    # Get context-specific prompts
                    # NOTE(review): the region context is hard-wired to
                    # 'bag_panel' here rather than reusing the context
                    # classified in recognize_brand -- confirm intent.
                    best_context = self._match_region_to_brand_context(
                        'bag_panel', # Default context, ideally should be passed as parameter
                        brand_info.get('region_contexts', [])
                    )

                    if best_context and best_context in brand_info.get('openclip_prompts', {}):
                        prompts = brand_info['openclip_prompts'][best_context]
                        visual_scores = self.clip_manager.classify_zero_shot(scaled_img, prompts)
                        avg_score = sum(visual_scores.values()) / len(visual_scores) if visual_scores else 0.0
                    else:
                        prompts = brand_info.get('strong_cues', [])[:3]
                        visual_scores = self.clip_manager.classify_zero_shot(scaled_img, prompts)
                        avg_score = sum(visual_scores.values()) / len(visual_scores) if visual_scores else 0.0

                    multi_scale_scores[brand_name].append(avg_score)

            except Exception as e:
                # Skip this scale if error occurs
                # NOTE(review): broad catch -- any CLIP failure here is
                # silently swallowed and the scale is dropped.
                continue

        # Aggregate multi-scale scores (use max score across scales)
        final_scores = {}
        for brand_name, scores in multi_scale_scores.items():
            if scores:
                final_scores[brand_name] = max(scores)
            else:
                # No scale ran (e.g. crop too small): keep the single-scale score.
                final_scores[brand_name] = initial_scores.get(brand_name, 0.0)

        return final_scores
311
+
312
+ def scan_full_image_for_brands(self, full_image: Image.Image,
313
+ exclude_bboxes: List[List[int]] = None,
314
+ saliency_regions: List[Dict] = None) -> List[Tuple[str, float, List[int]]]:
315
+ """
316
+ 智能全圖品牌掃描 - 性能優化版本
317
+ 使用預篩選和智能區域選擇大幅減少檢測時間
318
+
319
+ Args:
320
+ full_image: PIL Image (full image)
321
+ exclude_bboxes: List of bboxes to exclude (already detected)
322
+ saliency_regions: Saliency detection results for smart region selection
323
+
324
+ Returns:
325
+ List of (brand_name, confidence, bbox) tuples
326
+ """
327
+ if exclude_bboxes is None:
328
+ exclude_bboxes = []
329
+
330
+ detected_brands = {} # brand_name -> (confidence, bbox)
331
+ img_width, img_height = full_image.size
332
+
333
+ # OPTIMIZATION 1: 快速品牌預篩選
334
+ likely_brands = self.optimizer.quick_brand_prescreening(full_image)
335
+ print(f" Quick prescreening found {len(likely_brands)} potential brands")
336
+
337
+ # OPTIMIZATION 2: 智能區域選擇(只掃描有意義的區域)
338
+ regions_to_scan = self.optimizer.smart_region_selection(full_image, saliency_regions or [])
339
+ print(f" Scanning {len(regions_to_scan)} intelligent regions")
340
+
341
+ # 掃描選定的區域
342
+ for region_bbox in regions_to_scan:
343
+ x1, y1, x2, y2 = region_bbox
344
+
345
+ # 跳過已檢測區域
346
+ if self._bbox_overlap(list(region_bbox), exclude_bboxes):
347
+ continue
348
+
349
+ # 提取區域
350
+ region = full_image.crop(region_bbox)
351
+
352
+ # 只檢測預篩選的品牌(而非所有20+品牌)
353
+ for brand_name in likely_brands:
354
+ brand_info = self.flat_brands.get(brand_name)
355
+ if not brand_info:
356
+ continue
357
+
358
+ # only use strong_cues
359
+ strong_cues = brand_info.get('strong_cues', [])[:5] # Top 5
360
+ if not strong_cues:
361
+ continue
362
+
363
+ visual_scores = self.clip_manager.classify_zero_shot(region, strong_cues)
364
+ avg_score = sum(visual_scores.values()) / len(visual_scores) if visual_scores else 0.0
365
+
366
+ # OCR 增強
367
+ ocr_results = self.ocr_manager.extract_text(full_image, use_brand_preprocessing=True)
368
+ boosted_score = self.optimizer.compute_brand_confidence_boost(
369
+ brand_name, ocr_results, avg_score
370
+ )
371
+
372
+ # 極度寬鬆的閾值以最大化檢測率
373
+ if boosted_score > 0.08: # 降低到 0.08
374
+ # 更新最佳結果
375
+ if brand_name not in detected_brands or boosted_score > detected_brands[brand_name][0]:
376
+ detected_brands[brand_name] = (boosted_score, list(region_bbox))
377
+
378
+ # 轉換為列表格式
379
+ final_brands = [
380
+ (brand_name, confidence, bbox)
381
+ for brand_name, (confidence, bbox) in detected_brands.items()
382
+ ]
383
+
384
+ # 按信心度排序
385
+ final_brands.sort(key=lambda x: x[1], reverse=True)
386
+
387
+ return final_brands[:5] # 返回前5個
388
+
389
+ def _bbox_overlap(self, bbox1: List[int], bbox_list: List[List[int]]) -> bool:
390
+ """Check if bbox1 overlaps significantly with any bbox in bbox_list"""
391
+ if not bbox_list:
392
+ return False
393
+
394
+ x1_1, y1_1, x2_1, y2_1 = bbox1
395
+
396
+ for bbox2 in bbox_list:
397
+ if bbox2 is None:
398
+ continue
399
+
400
+ x1_2, y1_2, x2_2, y2_2 = bbox2
401
+
402
+ # Calculate intersection
403
+ x_left = max(x1_1, x1_2)
404
+ y_top = max(y1_1, y1_2)
405
+ x_right = min(x2_1, x2_2)
406
+ y_bottom = min(y2_1, y2_2)
407
+
408
+ if x_right < x_left or y_bottom < y_top:
409
+ continue
410
+
411
+ intersection_area = (x_right - x_left) * (y_bottom - y_top)
412
+ bbox1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
413
+
414
+ # 如果重疊超過 30%,視為重疊
415
+ if intersection_area / bbox1_area > 0.3:
416
+ return True
417
+
418
+ return False
419
+
420
+ print("✓ BrandRecognitionManager (with full-image scan for commercial use) defined")
brand_verification_manager.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import re
4
+ from PIL import Image
5
+ from typing import List, Dict, Tuple
6
+ from datetime import datetime
7
+ from caption_generation_manager import CaptionGenerationManager
8
+
9
+ class BrandVerificationManager:
10
+ """VLM-based brand verification and three-way voting system"""
11
+
12
def __init__(self, caption_generator: CaptionGenerationManager = None):
    """Initialize the verification manager.

    Args:
        caption_generator: CaptionGenerationManager instance for VLM access.
            When None, a new instance is constructed here (this loads the
            VLM weights, so passing a shared instance avoids a second load).
    """
    if caption_generator is None:
        caption_generator = CaptionGenerationManager()

    self.caption_generator = caption_generator

    # Confidence mapping for VLM responses: maps the qualitative confidence
    # words the verification prompt asks the VLM to emit onto the numeric
    # scores consumed by three_way_voting().
    self.confidence_map = {
        'high': 0.9,
        'medium': 0.7,
        'low': 0.5,
        'very high': 0.95,
        'very low': 0.3
    }

    print("✓ Brand Verification Manager initialized with VLM")
32
+
33
def verify_brands(self, image: Image.Image, detected_brands: List[Tuple[str, float, list]]) -> Dict:
    """
    Use VLM to verify detected brands.

    Builds a JSON-answer prompt listing the top detections and asks the
    vision-language model to confirm, reject, or extend them.

    Args:
        image: PIL Image
        detected_brands: List of (brand_name, confidence, bbox) tuples

    Returns:
        Dictionary with verification results ('verified_brands',
        'false_positives', 'additional_brands'; on empty input also
        'confidence'). On VLM failure, falls back to echoing the input
        detections at 'medium' confidence.
    """
    if not detected_brands:
        return {
            'verified_brands': [],
            'false_positives': [],
            'additional_brands': [],
            'confidence': 0.0
        }

    # Construct verification prompt — only the top 3 detections are listed
    # to keep the prompt focused.
    brand_list = ', '.join([f"{brand[0]} (confidence: {brand[1]:.2f})"
                            for brand in detected_brands[:3]])  # Top 3 brands

    # NOTE: the doubled braces {{ }} below are literal braces in the
    # f-string, showing the VLM the expected JSON schema.
    verification_prompt = f"""Analyze this image carefully. Our computer vision system detected the following brands: {brand_list}.

Please verify each brand identification:

1. Are these brand identifications correct based on visible logos, patterns, text, or distinctive features?
2. If incorrect, what brands do you actually see (if any)?
3. Describe the visual evidence (logo shape, text, pattern, color scheme, hardware) that supports your conclusion.

Respond in JSON format:
{{
    "verified_brands": [
        {{"name": "Brand Name", "confidence": "high/medium/low", "evidence": "description of visual evidence"}}
    ],
    "false_positives": ["brand names that were incorrectly detected"],
    "additional_brands": ["brands we missed but you can see"]
}}

IMPORTANT: Only include brands you can clearly identify with visual evidence. If unsure, use "low" confidence."""

    # Generate VLM response and parse it (JSON first, rule-based fallback).
    try:
        response = self._generate_vlm_response(image, verification_prompt)
        parsed_result = self._parse_verification_response(response)
        return parsed_result

    except Exception as e:
        print(f"VLM verification error: {e}")
        # Fallback to original detections: verification must never make the
        # pipeline worse than having no verifier at all.
        return {
            'verified_brands': [
                {'name': brand[0], 'confidence': 'medium', 'evidence': 'VLM verification failed'}
                for brand in detected_brands
            ],
            'false_positives': [],
            'additional_brands': []
        }
92
+
93
def three_way_voting(self, openclip_brands: List[Tuple], ocr_brands: Dict,
                     vlm_result: Dict) -> List[Tuple[str, float, list]]:
    """
    Combine OpenCLIP, OCR and VLM brand evidence by weighted voting.

    Each detector casts votes per brand (the VLM counts double, as the most
    reliable source) and contributes a weighted confidence score. Brands the
    VLM flags as false positives lose two votes; multi-source agreement
    earns a 15% confidence boost, capped at 0.95. Only brands above 0.30
    final confidence are returned.

    Args:
        openclip_brands: List of (brand_name, confidence, bbox) from OpenCLIP.
        ocr_brands: Dict of {brand_name: (text_score, ocr_conf)} from OCR.
        vlm_result: Verification result dict from the VLM.

    Returns:
        List of (brand_name, final_confidence, bbox) tuples, sorted by
        confidence in descending order.
    """
    # Per-source weights for the weighted confidence average.
    source_weights = {'vlm': 1.0, 'openclip': 0.6, 'ocr': 0.4}

    tally = {}      # brand -> {'votes': int, 'sources': list, 'bbox': list|None}
    evidence = {}   # brand -> [(source, weighted_score), ...]

    def register(brand, source, score, n_votes, box=None):
        """Record one detector's vote and score for a brand."""
        entry = tally.setdefault(brand, {'votes': 0, 'sources': [], 'bbox': box})
        entry['votes'] += n_votes
        entry['sources'].append(source)
        evidence.setdefault(brand, []).append((source, score))

    # Vote 1: OpenCLIP (scores discounted to 0.8x).
    for name, conf, box in openclip_brands:
        register(name, 'openclip', conf * 0.8, 1, box)

    # Vote 2: OCR (average of text match and OCR confidence, discounted 0.7x).
    for name, (text_score, ocr_conf) in ocr_brands.items():
        register(name, 'ocr', (text_score + ocr_conf) / 2 * 0.7, 1)

    # Vote 3: VLM — double weight, most reliable source.
    for info in vlm_result.get('verified_brands', []):
        level = info.get('confidence', 'medium')
        register(info['name'], 'vlm',
                 self.confidence_map.get(level.lower(), 0.7), 2)

    # Brands the VLM explicitly rejected lose two votes (floored at zero).
    for bogus in vlm_result.get('false_positives', []):
        if bogus in tally:
            tally[bogus]['votes'] = max(0, tally[bogus]['votes'] - 2)

    ranked = []
    for name, entry in tally.items():
        if entry['votes'] <= 0:
            continue  # voted out (e.g. VLM false positive)

        scores = evidence.get(name, [])
        if not scores:
            continue

        # Weighted average of per-source scores.
        weight_total = sum(source_weights.get(src, 0.4) for src, _ in scores)
        weighted_sum = sum(sc * source_weights.get(src, 0.4) for src, sc in scores)
        final_conf = weighted_sum / weight_total if weight_total > 0 else 0.0

        # Agreement bonus: two or more votes earn a 15% boost.
        if entry['votes'] >= 2:
            final_conf *= 1.15

        final_conf = min(final_conf, 0.95)

        # Drop low-confidence results entirely.
        if final_conf > 0.30:
            ranked.append((name, final_conf, entry['bbox']))

    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked
195
+
196
def extract_visual_evidence(self, image: Image.Image, brand_name: str) -> Dict:
    """
    Extract detailed visual evidence for an identified brand.

    Asks the VLM to describe the concrete visual features (logo, text,
    patterns, colors, product design) that justify the identification.

    Args:
        image: PIL Image
        brand_name: Identified brand name

    Returns:
        Dictionary with 'brand', 'evidence_description' and an ISO-8601
        'timestamp'. Never raises: VLM failures are reported inside
        'evidence_description' instead.
    """
    evidence_prompt = f"""You identified {brand_name} in this image. Please describe the specific visual evidence:

1. Logo appearance: Describe the logo's shape, style, color, and exact location in the image
2. Text elements: What text did you see? (exact wording, font style, placement)
3. Distinctive patterns: Any signature patterns, textures, or design elements
4. Color scheme: Brand-specific colors used
5. Product features: Distinctive product design characteristics

Be specific and detailed. Focus on objective visual features."""

    try:
        evidence_description = self._generate_vlm_response(image, evidence_prompt)

        return {
            'brand': brand_name,
            'evidence_description': evidence_description,
            'timestamp': datetime.now().isoformat()
        }

    except Exception as e:
        # Degrade gracefully: surface the failure in the payload rather
        # than propagating an exception to the caller.
        return {
            'brand': brand_name,
            'evidence_description': f"Evidence extraction failed: {str(e)}",
            'timestamp': datetime.now().isoformat()
        }
232
+
233
def _generate_vlm_response(self, image: Image.Image, prompt: str) -> str:
    """
    Generate a VLM response for the given image and prompt.

    Runs one chat turn through the shared CaptionGenerationManager's
    processor/model pair (Qwen-VL style API) with a low temperature so the
    answer stays factual rather than creative.

    Args:
        image: PIL Image
        prompt: Text prompt

    Returns:
        Decoded VLM response string (prompt tokens stripped).
    """
    # Imported lazily so this module can load without qwen_vl_utils installed.
    from qwen_vl_utils import process_vision_info

    # Single-turn chat message in the Qwen-VL content format.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt}
        ]
    }]

    text = self.caption_generator.processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_inputs, video_inputs = process_vision_info(messages)
    inputs = self.caption_generator.processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(self.caption_generator.model.device)

    # Generate with low temperature for factual responses (verification,
    # not creative captioning).
    generation_config = {
        'temperature': 0.3,  # Low temperature for factual verification
        'top_p': 0.9,
        'max_new_tokens': 300,
        'repetition_penalty': 1.1
    }

    generated_ids = self.caption_generator.model.generate(
        **inputs,
        **generation_config
    )

    # Trim the echoed input tokens so only newly generated tokens remain.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    output_text = self.caption_generator.processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    return output_text
292
+
293
+ def _parse_verification_response(self, response: str) -> Dict:
294
+ """
295
+ Parse VLM verification response
296
+
297
+ Args:
298
+ response: VLM response string
299
+
300
+ Returns:
301
+ Parsed dictionary
302
+ """
303
+ try:
304
+ # Try to extract JSON from response
305
+ json_match = re.search(r'\{.*\}', response, re.DOTALL)
306
+ if json_match:
307
+ result = json.loads(json_match.group())
308
+ return result
309
+ except json.JSONDecodeError:
310
+ pass
311
+
312
+ # Fallback: rule-based parsing
313
+ return self._rule_based_parse(response)
314
+
315
+ def _rule_based_parse(self, response: str) -> Dict:
316
+ """
317
+ Fallback rule-based parsing if JSON fails
318
+
319
+ Args:
320
+ response: VLM response string
321
+
322
+ Returns:
323
+ Parsed dictionary
324
+ """
325
+ result = {
326
+ 'verified_brands': [],
327
+ 'false_positives': [],
328
+ 'additional_brands': []
329
+ }
330
+
331
+ # Simple pattern matching
332
+ lines = response.lower().split('\n')
333
+
334
+ for line in lines:
335
+ # Look for brand names mentioned with positive sentiment
336
+ if any(word in line for word in ['correct', 'yes', 'visible', 'see', 'identified']):
337
+ # Extract potential brand names (capitalize words)
338
+ words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', response)
339
+ for word in words:
340
+ if len(word) > 2: # Avoid short words
341
+ result['verified_brands'].append({
342
+ 'name': word,
343
+ 'confidence': 'medium',
344
+ 'evidence': 'Extracted from VLM response'
345
+ })
346
+
347
+ return result
348
+
349
+ print("✓ BrandVerificationManager (VLM verification and voting) defined")
brand_visualization_manager.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image, ImageDraw, ImageFont
5
+ from typing import List, Tuple, Dict
6
+
7
class BrandVisualizationManager:
    """Visualize detected brands with bounding boxes and labels (like YOLO)"""

    def __init__(self):
        """Initialize visualization manager"""
        # Color palette for different brand categories (BGR-friendly RGB
        # triples; drawing below happens on an OpenCV BGR image).
        self.colors = {
            'luxury': (218, 165, 32),       # Gold
            'sportswear': (0, 191, 255),    # Deep Sky Blue
            'tech': (169, 169, 169),        # Dark Gray
            'automotive': (220, 20, 60),    # Crimson
            'fashion': (186, 85, 211),      # Medium Orchid
            'watches': (184, 134, 11),      # Dark Goldenrod
            'default': (0, 255, 0)          # Green
        }

        print("✓ Brand Visualization Manager initialized")

    def draw_brand_detections(self, image: Image.Image, brand_detections: List[Dict],
                              show_confidence: bool = True) -> Image.Image:
        """Draw bounding boxes and labels for detected brands

        Args:
            image: PIL Image
            brand_detections: List of dicts with keys: 'name', 'confidence', 'bbox', 'category'
            show_confidence: Whether to show confidence scores

        Returns:
            Image with drawn bounding boxes (the input image is returned
            unchanged when there is nothing to draw)
        """
        if not brand_detections:
            return image

        # Convert PIL (RGB) to OpenCV (BGR) format for drawing.
        img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        for detection in brand_detections:
            brand_name = detection.get('name', 'Unknown')
            confidence = detection.get('confidence', 0.0)
            bbox = detection.get('bbox')
            category = detection.get('category', 'default')

            # Detections without a localized bbox (e.g. OCR/VLM-only hits)
            # cannot be drawn.
            if bbox is None:
                continue

            x1, y1, x2, y2 = bbox
            color = self.colors.get(category, self.colors['default'])

            # Draw bounding box
            cv2.rectangle(img_cv, (int(x1), int(y1)), (int(x2), int(y2)), color, 3)

            # Prepare label text
            if show_confidence:
                label = f"{brand_name} {confidence:.2f}"
            else:
                label = brand_name

            # Calculate text size so the background rectangle fits the label.
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 0.7
            thickness = 2
            (text_width, text_height), baseline = cv2.getTextSize(label, font, font_scale, thickness)

            # Draw filled label background above the box's top-left corner.
            # NOTE(review): this can extend past the image top when y1 is
            # small — OpenCV clips it silently, but the label may be cut off.
            cv2.rectangle(img_cv,
                          (int(x1), int(y1) - text_height - 10),
                          (int(x1) + text_width + 10, int(y1)),
                          color, -1)

            # Draw label text in white on the colored background.
            cv2.putText(img_cv, label,
                        (int(x1) + 5, int(y1) - 5),
                        font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)

        # Convert back to PIL (RGB).
        img_pil = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
        return img_pil

    def format_brand_list(self, brand_detections: List[Dict]) -> str:
        """Format brand detections as readable text

        Args:
            brand_detections: List of brand detection dicts

        Returns:
            Comma-separated string of "Name (confidence)" entries, or a
            placeholder message when the list is empty
        """
        if not brand_detections:
            return "No brands identified"

        formatted = []
        for detection in brand_detections:
            brand_name = detection.get('name', 'Unknown')
            confidence = detection.get('confidence', 0.0)
            # Category tag intentionally omitted to keep the output concise.

            formatted.append(f"{brand_name} ({confidence:.2f})")

        return ", ".join(formatted)
106
+
107
+ print("✓ BrandVisualizationManager defined")
caption_generation_manager.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForImageTextToText, AutoProcessor
3
+ from qwen_vl_utils import process_vision_info
4
+ from PIL import Image
5
+ from typing import List, Dict
6
+ import json
7
+ from opencc import OpenCC
8
+ import warnings
9
+
10
+ class CaptionGenerationManager:
11
+ """Caption generation using Vision-Language Models (supports Qwen2.5-VL, Qwen3-VL, etc.)"""
12
+
13
+ def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
14
+ """
15
+ Args:
16
+ model_name: Vision-Language model name, e.g.:
17
+ - "Qwen/Qwen2.5-VL-7B-Instruct" (default)
18
+ - "Qwen/Qwen3-VL-8B-Instruct" (2025 latest)
19
+ """
20
+ print(f"Loading Vision-Language Model: {model_name}...")
21
+
22
+ # Suppress processor warning
23
+ warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
24
+
25
+ # Use Auto* classes for flexibility (supports Qwen2.5-VL, Qwen3-VL, etc.)
26
+ self.processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
27
+ self.model = AutoModelForImageTextToText.from_pretrained(
28
+ model_name,
29
+ dtype=torch.bfloat16, # Changed from torch_dtype to dtype
30
+ device_map="auto"
31
+ )
32
+
33
+ # Simplified Chinese to Traditional Chinese converter
34
+ self.cc = OpenCC('s2t') # Simplified to Traditional
35
+
36
+ self.generation_config = {
37
+ 'temperature': 0.7,
38
+ 'top_p': 0.9,
39
+ 'max_new_tokens': 300, # Increased from 200 to prevent truncation
40
+ 'repetition_penalty': 1.1
41
+ }
42
+
43
+ # Platform-specific templates
44
+ self.platform_templates = {
45
+ 'instagram': {
46
+ 'style': 'storytelling, aesthetic',
47
+ 'emoji_count': '2-3',
48
+ 'hashtag_count': '8-10',
49
+ 'min_length': 120, # Increased for richer content
50
+ 'max_length': 220, # Allow more detailed descriptions
51
+ 'features': ['call-to-action', 'question', 'relatable']
52
+ },
53
+ 'tiktok': {
54
+ 'style': 'brief, punchy',
55
+ 'emoji_count': '1-2',
56
+ 'hashtag_count': '5-8',
57
+ 'min_length': 60,
58
+ 'max_length': 120,
59
+ 'features': ['trending', 'POV', 'relatable']
60
+ },
61
+ 'xiaohongshu': {
62
+ 'style': 'structured, informative, detailed',
63
+ 'emoji_count': '5-8',
64
+ 'hashtag_count': '8-12',
65
+ 'min_length': 180,
66
+ 'max_length': 500,
67
+ 'features': ['tips', 'bullets', 'sharing-tone']
68
+ }
69
+ }
70
+
71
+ print(f"✓ {model_name.split('/')[-1]} loaded successfully (using Auto* classes for flexibility)")
72
+
73
+ def construct_prompt(self, analysis_results: Dict, platform: str = 'instagram', language: str = 'zh') -> str:
74
+ """Construct prompt with language support ensuring consistency
75
+
76
+ Args:
77
+ language: 'zh' (Traditional Chinese), 'en' (English), 'zh-en' (Bilingual)
78
+ """
79
+ platform_config = self.platform_templates.get(platform, self.platform_templates['instagram'])
80
+
81
+ # Language-specific instructions
82
+ language_instructions = {
83
+ 'zh': '請使用繁體中文生成標題和標籤。語言要自然流暢,符合華語社群媒體的表達習慣。避免使用簡體字。當偵測到品牌時,必須在標題中提及品牌名稱。',
84
+ 'en': '''🚨 CRITICAL LANGUAGE REQUIREMENT 🚨
85
+ Generate captions and hashtags EXCLUSIVELY in English.
86
+ - NEVER use Chinese characters (Traditional or Simplified)
87
+ - NEVER mix languages
88
+ - Use natural, engaging language suitable for international social media
89
+ - When brands are detected, mention them naturally in English
90
+ - All text output must be 100% English only
91
+ This is MANDATORY and NON-NEGOTIABLE.''',
92
+ 'zh-en': '''生成雙語內容:標題使用繁體中文,同時提供英文翻譯。標籤混合使用中英文以擴大觸及範圍。當偵測到品牌時,必須在標題中提及品牌名稱。
93
+
94
+ 🚨 重要:雙語一致性要求 🚨
95
+ - 中文和英文必須表達相同的核心意義
96
+ - 允許表達方式的差異(形容詞、語法不同)
97
+ - 但整體訊息、語氣、品牌提及必須一致
98
+ - 兩種語言都要朝同一方向詮釋內容'''
99
+ }
100
+
101
+ system_instruction = f"""You are a professional social media content strategist.
102
+
103
+ {language_instructions.get(language, language_instructions['zh'])}
104
+
105
+ Target platform: {platform}
106
+ Content style: Authentic, creative, and optimized for engagement.
107
+
108
+ CRITICAL RULE: Never include hashtags (symbols starting with #) in the caption text. Hashtags must only appear in the separate 'hashtags' array."""
109
+
110
+ # Extract analysis context
111
+ objects = analysis_results.get('detections', [])
112
+ brands = analysis_results.get('brands', [])
113
+ scene_info = analysis_results.get('scene_analysis', {})
114
+ composition = analysis_results.get('composition', {})
115
+
116
+ # FIXED: Get fused lighting from scene_info (it's been updated by DetectionFusionManager)
117
+ lighting = scene_info.get('lighting', {}).get('top', 'natural light')
118
+ lighting_confidence = scene_info.get('lighting', {}).get('confidence', 0.7)
119
+
120
+ # Provide explicit Chinese translations to ensure consistency
121
+ lighting_translations_zh = {
122
+ 'soft diffused light': '柔和漫射光',
123
+ 'overcast atmosphere': '陰天氛圍',
124
+ 'natural daylight': '自然日光',
125
+ 'warm ambient light': '溫暖環境光',
126
+ 'evening light': '傍晚光線',
127
+ 'bright sunlight': '明亮陽光',
128
+ 'golden hour': '金黃時刻',
129
+ 'blue hour': '藍調時刻'
130
+ }
131
+
132
+ # Get appropriate lighting description based on language
133
+ if language == 'zh':
134
+ lighting_zh = lighting_translations_zh.get(lighting, lighting)
135
+ lighting_display = lighting_zh
136
+ else:
137
+ # For English and bilingual, use English only
138
+ lighting_display = lighting
139
+ lighting_zh = lighting
140
+
141
+ objects_str = ', '.join([obj['class_name'] for obj in objects[:10]])
142
+
143
+ # CRITICAL: Emphasize brands EXTREMELY prominently - repeat multiple times
144
+ if brands:
145
+ brands_list = [b[0] for b in brands[:5]]
146
+ brands_str = ', '.join(brands_list)
147
+ brand_emphasis = f"""
148
+
149
+ 🚨 CRITICAL BRAND REQUIREMENT 🚨
150
+ The following brands were POSITIVELY IDENTIFIED in this image: {brands_str}
151
+
152
+ YOU ABSOLUTELY MUST:
153
+ 1. Mention the brand name "{brands_list[0]}" explicitly in the FIRST sentence
154
+ 2. Use the exact brand name - do not use generic terms like "bag" or "accessory" without the brand
155
+ 3. Write naturally as if you're excited to share this {brands_list[0]} item
156
+ 4. Example: "在傍晚光線下,這款{brands_list[0]}經典黑色菱格紋皮革包..." (CORRECT)
157
+ 5. NOT acceptable: "在傍晚光線下,這款經典黑色菱格紋皮革包..." (WRONG - missing brand name!)
158
+
159
+ THIS IS MANDATORY - The caption will be rejected if it doesn't mention {brands_str}.
160
+ """
161
+ else:
162
+ brands_str = 'None detected'
163
+ brand_emphasis = ""
164
+
165
+ # Enhanced scene description
166
+ urban_scene = scene_info.get('urban', {}).get('top', 'unknown')
167
+ mood = scene_info.get('mood', {}).get('top', 'neutral')
168
+ comp_type = composition.get('composition_type', 'standard')
169
+
170
+ context = f"""
171
+ Analyze this image and generate an engaging, DETAILED social media caption with rich visual descriptions.
172
+
173
+ **Visual Elements (Describe in Detail):**
174
+ - Detected objects: {objects_str}
175
+ - Scene composition: {comp_type}
176
+ - Urban environment: {urban_scene}
177
+ - **IMPORTANT**: Include specific details about:
178
+ * Materials (leather, metal, fabric, canvas, etc.)
179
+ * Colors (use descriptive terms: jet black, antique gold, midnight blue, etc.)
180
+ * Textures (quilted, smooth, matte, glossy, metallic, etc.)
181
+ * Design features (stitching patterns, hardware, logos, emblems, etc.)
182
+ * Reflections and lighting effects on surfaces
183
+
184
+ **Atmosphere:**
185
+ - Lighting (analyzed with Places365 + CV): {lighting_display} (confidence: {lighting_confidence:.2f})
186
+ - Mood: {mood}
187
+
188
+ **Brand Detection:**
189
+ - Identified brands: {brands_str}{brand_emphasis}
190
+
191
+ **Caption Structure (Required - BE SPECIFIC AND DETAILED):**
192
+ 1. Opening hook - Most striking visual element with SPECIFIC details (1-2 sentences)
193
+ {f"- 🚨 MANDATORY: Start with the BRAND NAME '{brands_list[0]}' in the FIRST sentence!" if brands else ""}
194
+ {f"- Example (CORRECT): '這款{brands_list[0]}經典黑色菱格紋皮革包...'" if brands else ""}
195
+ {f"- Example (WRONG): '這款經典黑色菱格紋皮革包...' (missing {brands_list[0]}!)" if brands else ""}
196
+ - Be SPECIFIC: Include material, color, design features WITH the brand name
197
+
198
+ 2. Visual details - Describe materials, textures, colors, and design elements (2-3 sentences)
199
+ - Be SPECIFIC: mention quilting patterns, metal finishes, chain details, logo placements
200
+ - Describe how light interacts with materials (reflections on leather, gleam of metal)
201
+ - MUST use the EXACT lighting description: "{lighting_display}"
202
+
203
+ 3. Atmospheric context - How lighting and mood create the scene's character (1-2 sentences)
204
+ - Connect lighting to the overall visual impact
205
+ - Describe depth, shadows, contrasts
206
+
207
+ 4. Emotional connection & Engagement - How this resonates with viewers + call-to-action (1 sentence)
208
+
209
+ **Content Requirements:**
210
+ - Minimum information: 3-4 specific visual details per caption
211
+ - Include material types, color descriptions, design features
212
+ - Describe how lighting affects the appearance
213
+ - Make it vivid and immersive
214
+
215
+ Platform style: {platform_config['style']}
216
+ """
217
+
218
+ # Language-specific examples with DETAILED visual descriptions AND BRAND NAMES
219
+ if language == 'zh':
220
+ brand_name_zh = brands_list[0] if brands else "Gucci" # Use detected brand or example
221
+ example_correct = f"""正確範例 - 詳細描述 + 品牌提及 (繁體中文):
222
+ "在{lighting_zh}的映襯下,這款{brand_name_zh}經典黑色菱格紋皮革包展現奢華質感,V字形縫線在柔軟小牛皮上勾勒出精緻的幾何圖案,復古金色雙G標誌在深色背景中熠熠生輝。金屬鏈條肩帶反射著{lighting_zh},增添層次感與立體效果。皮革表面細膩的光澤與霧面質地形成迷人對比,每個細節都彰顯義大利工藝的極致追求。這樣的{brand_name_zh}單品不只是配件,更是品味與格調的完美詮釋。你的衣櫃裡有哪件經典單品?✨🖤"
223
+
224
+ 注意:品牌名稱 "{brand_name_zh}" 出現在第一句!這是正確的做法。
225
+
226
+ CRITICAL:
227
+ - 必須包含材質描述(皮革、金屬等)
228
+ - 必須包含顏色細節(黑色、復古金色等)
229
+ - 必須包含設計特點(縫線、標誌、鏈條等)
230
+ - 必須使用"{lighting_zh}"來描述光線
231
+ """
232
+ elif language == 'en':
233
+ brand_name_en = brands_list[0] if brands else "Gucci" # Use detected brand or example
234
+ example_correct = f"""CORRECT EXAMPLE - Detailed Description + Brand Mention (ENGLISH ONLY - NO CHINESE):
235
+ "Under the {lighting}, this {brand_name_en} classic black quilted leather bag showcases luxurious craftsmanship. V-shaped stitching traces intricate geometric patterns across supple calfskin, while the antique gold double-G logo gleams against the dark backdrop. The metal chain strap catches and reflects the {lighting}, adding dimension and depth to the piece. The leather surface presents a captivating contrast between fine sheen and matte texture, with every detail exemplifying Italian artisanship at its finest. This {brand_name_en} piece isn't just an accessory – it's a perfect expression of taste and sophistication. What's your timeless wardrobe essential? ✨🖤"
236
+
237
+ NOTE: Brand name "{brand_name_en}" appears in the FIRST sentence! This is the correct approach.
238
+
239
+ 🚨 ABSOLUTE REQUIREMENT FOR ENGLISH MODE 🚨
240
+ - Output must be 100% ENGLISH - zero Chinese characters allowed
241
+ - MUST include material descriptions (leather, metal, etc.)
242
+ - MUST include color details (black, antique gold, etc.)
243
+ - MUST include design features (stitching, logo, chain, etc.)
244
+ - MUST use "{lighting}" to describe the lighting
245
+ - NO Chinese characters anywhere in the output
246
+ """
247
+ else: # zh-en bilingual
248
+ brand_name_en = brands_list[0] if brands else "Gucci"
249
+ example_correct = f"""BILINGUAL EXAMPLE - 雙語範例:
250
+ Caption in Traditional Chinese, with English hashtags support.
251
+ (Details omitted for brevity)
252
+ """
253
+
254
+ # Language-specific hashtag instructions
255
+ if language == 'zh':
256
+ hashtag_instruction = """
257
+ 【CRITICAL HASHTAG REQUIREMENT - 繁體中文】:
258
+ - ALL hashtags MUST be in Traditional Chinese (繁體中文)
259
+ - NEVER use English hashtags when language is 繁體中文
260
+ - Examples of CORRECT hashtags: ["時尚包包", "奢華風格", "皮革工藝", "精品配件"]
261
+ - Examples of WRONG hashtags: ["FashionBlogger", "LuxuryLifestyle"] - DO NOT USE THESE
262
+ """
263
+ elif language == 'en':
264
+ hashtag_instruction = """
265
+ 【CRITICAL HASHTAG REQUIREMENT - English】:
266
+ - ALL hashtags MUST be in English
267
+ - NEVER use Chinese characters in hashtags
268
+ - Examples of CORRECT hashtags: ["FashionBlogger", "LuxuryLifestyle", "LeatherCraft"]
269
+ """
270
+ else: # zh-en
271
+ hashtag_instruction = """
272
+ 【CRITICAL HASHTAG REQUIREMENT - Bilingual】:
273
+ - Hashtags should MIX Traditional Chinese and English
274
+ - First half in Chinese, second half in English
275
+ - Example: ["時尚包包", "奢華風格", "FashionBlogger", "LuxuryLifestyle"]
276
+ """
277
+
278
+ output_format = f"""
279
+ Generate output in JSON format:
280
+ {{
281
+ "caption": "string (minimum {platform_config['min_length']} chars, maximum {platform_config['max_length']} chars, engaging and descriptive)",
282
+ "hashtags": ["tag1", "tag2", ...] ({platform_config['hashtag_count']} relevant hashtags),
283
+ "tone": "casual|professional|playful",
284
+ "platform": "{platform}"
285
+ }}
286
+
287
+ {hashtag_instruction}
288
+
289
+ STRICT REQUIREMENTS:
290
+ 1. Caption length: {platform_config['min_length']}-{platform_config['max_length']} characters
291
+ 2. 🚨 EMOJI REQUIREMENT 🚨 - MUST use EXACTLY {platform_config['emoji_count']} emojis naturally integrated into caption text
292
+ - Professional style: 1-2 emojis (e.g., ✨💼🌟)
293
+ - Creative style: 2-3 emojis (e.g., 🎨✨💫🌙)
294
+ - Authentic style: 2-3 emojis (e.g., 💖👜✨🖤)
295
+ - Place emojis naturally within or at end of sentences
296
+ 3. Caption must be pure descriptive text only - absolutely NO hashtags allowed
297
+ 4. 🚨 CALL-TO-ACTION REQUIREMENT 🚨 - MUST include an engaging question or CTA at the end
298
+ - Professional: Brief professional question (e.g., "What's your go-to piece?")
299
+ - Creative: Thought-provoking question (e.g., "How does this speak to you?")
300
+ - Authentic: Personal question (e.g., "What's your favorite timeless accessory?")
301
+ 5. Write 3-4 complete sentences following the structure above
302
+ 6. Be specific and vivid - describe what you see in detail
303
+ 7. 【CRITICAL】 MUST use the EXACT lighting description: "{lighting_display}"
304
+ - DO NOT substitute with similar terms
305
+ - DO NOT use "金黃時刻" if the lighting is "{lighting_zh if language == 'zh' else lighting}"
306
+ - DO NOT invent your own lighting description
307
+ 8. 🚨 HASHTAG REQUIREMENT 🚨 - Generate {platform_config['hashtag_count']} relevant hashtags
308
+ - Hashtags go ONLY in the 'hashtags' array, NEVER in the caption text
309
+ - Mix of broad and specific tags
310
+ - Include brand name as hashtag if detected
311
+ 9. {"🚨 CRITICAL BRAND REQUIREMENT 🚨 - The brand name '" + brands_list[0] + "' MUST appear in the FIRST sentence of your caption. This is MANDATORY and NON-NEGOTIABLE. Example: " + ("'這款" + brands_list[0] + "經典黑色...'" if language == 'zh' else "'This " + brands_list[0] + " classic black...'") if brands else "No brands detected to mention"}
312
+ 10. {"🚨 LANGUAGE REQUIREMENT 🚨 - Output must be 100% ENGLISH ONLY. NO Chinese characters allowed anywhere." if language == 'en' else ""}
313
+
314
+ WRONG EXAMPLE (DO NOT DO THIS):
315
+ "Lost in the city's towering skyscrapers 🏙️✨ | #UrbanVibes #CityLife"
316
+
317
+ {example_correct}
318
+ """
319
+
320
+ full_prompt = f"{system_instruction}\n\n{context}\n\n{output_format}"
321
+ return full_prompt
322
+
323
def generate_captions(self, analysis_results: Dict, image: Image.Image,
                      platform: str = 'instagram', language: str = 'zh') -> List[Dict]:
    """Generate 3 captions with distinct styles: Professional, Creative, Authentic.

    Args:
        analysis_results: Aggregated vision-analysis output. Only the 'brands'
            entry (a list whose items' first element is the brand name) is read
            directly here; the rest is consumed by construct_prompt().
        image: PIL image forwarded to the vision-language model.
        platform: Platform key forwarded to construct_prompt() and the fallback.
        language: 'zh' or 'zh-en' triggers Simplified->Traditional conversion of
            the parsed caption; any other value leaves the text as generated.

    Returns:
        A list with one parsed caption dict per style that produced valid JSON,
        or a single fallback caption when every generation failed to parse.
    """

    # Extract brands for style instructions (top 3 brand names at most)
    brands_in_image = analysis_results.get('brands', [])
    brand_names = [b[0] for b in brands_in_image[:3]] if brands_in_image else []
    brand_mention_requirement = f" CRITICAL: Mention {', '.join(brand_names)} brand(s) naturally in the caption." if brand_names else ""

    # Define 3 distinct styles. Each carries its own sampling temperature;
    # 'length_modifier' is not consumed anywhere in this method (informational only).
    styles = [
        {
            'name': 'professional',
            'temp': 0.6,
            'instruction': f'Professional style: Concise, elegant, sophisticated. Focus on quality and craftsmanship. Use refined language.{brand_mention_requirement}',
            'length_modifier': 0.8  # Shorter, more concise
        },
        {
            'name': 'creative',
            'temp': 0.7,
            'instruction': f'Creative style: Artistic, expressive, imaginative. Use vivid metaphors and sensory descriptions. Balance detail with flair.{brand_mention_requirement}',
            'length_modifier': 1.0  # Medium length
        },
        {
            'name': 'authentic',
            'temp': 0.8,
            'instruction': f'Authentic style: Personal, detailed, storytelling. Share rich observations and genuine feelings. Most descriptive and engaging.{brand_mention_requirement}',
            'length_modifier': 1.2  # Longer, more detailed
        }
    ]

    variations = []

    for style in styles:
        # Build style-specific prompt on top of the shared base prompt
        base_prompt = self.construct_prompt(analysis_results, platform, language)

        # Add style instruction so the three outputs are clearly differentiated
        style_prompt = f"""{base_prompt}

**STYLE REQUIREMENT FOR THIS CAPTION:**
{style['instruction']}

Adjust tone to be clearly '{style['name']}' - this should be noticeably different from other styles."""

        # Qwen2.5-VL chat format: one user turn containing the image + text prompt
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": style_prompt}
            ]
        }]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # process_vision_info is a project helper (qwen-vl-utils style) that
        # extracts image/video tensors from the chat messages.
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )

        if torch.cuda.is_available():
            inputs = inputs.to("cuda")

        # Generate with style-specific temperature; copy so the shared
        # generation_config is not mutated between styles.
        config = self.generation_config.copy()
        config['temperature'] = style['temp']

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, **config)

        # Strip the prompt tokens so only the newly generated tokens are decoded
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        # Parse the model's JSON block; a failed parse silently skips this style
        parsed = self._parse_json_output(output_text)
        if parsed:
            # Force the correct tone regardless of what the model reported
            parsed['tone'] = style['name']

            # Remove any hashtags that leaked into caption
            if 'caption' in parsed:
                parsed['caption'] = self._remove_hashtags_from_caption(parsed['caption'])

            # Convert Simplified Chinese to Traditional if language is 'zh'
            if language == 'zh' or language == 'zh-en':
                parsed = self._convert_to_traditional(parsed)

            variations.append(parsed)

    return variations if variations else [self._get_fallback_caption(platform, language)]
425
+
426
+ def _remove_hashtags_from_caption(self, caption: str) -> str:
427
+ """Remove any hashtags, pipes, and debug info that leaked into caption text"""
428
+ import re
429
+
430
+ # CRITICAL FIX: Remove pipe symbol and everything after it (debug info)
431
+ # Example: "Text 🕰️🌉 | SoftDiffusedLight" -> "Text 🕰️🌉"
432
+ if '|' in caption:
433
+ caption = caption.split('|')[0].strip()
434
+
435
+ # Remove hashtags (words starting with #)
436
+ caption = re.sub(r'#\w+', '', caption)
437
+ caption = re.sub(r'#[\u4e00-\u9fff]+', '', caption) # Remove Chinese hashtags
438
+
439
+ # Remove standalone weird text patterns (like "BLACKBELT")
440
+ # If there's a suspicious all-caps word at the end without context, remove it
441
+ words = caption.split()
442
+ if len(words) > 0:
443
+ last_word = words[-1].strip('✨💎👗🌟💫🖤')
444
+ # If last "word" is all caps and doesn't look like a normal sentence word, remove it
445
+ if last_word.isupper() and len(last_word) > 3 and not any(char in last_word for char in '.,!?'):
446
+ caption = ' '.join(words[:-1])
447
+
448
+ # Remove excessive emojis at the end (more than 3)
449
+ emoji_pattern = r'[\U0001F300-\U0001F9FF]{4,}$'
450
+ caption = re.sub(emoji_pattern, '', caption)
451
+
452
+ # Remove multiple spaces
453
+ caption = re.sub(r'\s+', ' ', caption)
454
+
455
+ # Remove trailing/leading whitespace
456
+ caption = caption.strip()
457
+
458
+ # Final cleanup: if caption ends with weird patterns like "✨X 👗💎", clean it
459
+ if re.search(r'[✨💎👗🌟💫🖤]{2,}\s*$', caption):
460
+ caption = re.sub(r'[✨💎👗🌟💫🖤\s]+$', '', caption).strip()
461
+
462
+ return caption
463
+
464
+ def _convert_to_traditional(self, caption: Dict) -> Dict:
465
+ """Convert Simplified Chinese to Traditional Chinese"""
466
+ if 'caption' in caption:
467
+ caption['caption'] = self.cc.convert(caption['caption'])
468
+ return caption
469
+
470
+ def _parse_json_output(self, text: str) -> Dict:
471
+ """Parse JSON output"""
472
+ try:
473
+ start = text.find('{')
474
+ end = text.rfind('}') + 1
475
+ if start != -1 and end > start:
476
+ json_str = text[start:end]
477
+ return json.loads(json_str)
478
+ except:
479
+ pass
480
+ return None
481
+
482
+ def _get_fallback_caption(self, platform: str, language: str) -> Dict:
483
+ """Fallback caption"""
484
+ if language == 'en':
485
+ return {
486
+ 'caption': 'Every moment tells a story worth sharing. The world around us is filled with beauty waiting to be discovered. Take a pause and appreciate the details that make life extraordinary. What caught your eye today? ✨',
487
+ 'hashtags': ['photography', 'daily', 'lifestyle', 'moment', 'capture'],
488
+ 'tone': 'casual',
489
+ 'platform': platform
490
+ }
491
+ else:
492
+ return {
493
+ 'caption': '每個瞬間都值得被記錄與分享。生活中充滿了等待被發現的美好細節。停下腳步,用心感受周遭的一切。今天什麼畫面觸動了你的心?✨',
494
+ 'hashtags': ['攝影', '日常', '生活', '瞬間', '分享'],
495
+ 'tone': 'casual',
496
+ 'platform': platform
497
+ }
498
+
499
+ print("✓ CaptionGenerationManager (with Auto* classes for flexible model support) defined")
detection_fusion_manager.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ import numpy as np
3
+
4
class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion.

    Combines YOLO detections with CLIP-classified salient regions, ranks them
    by an attention heuristic, analyzes composition, and fuses CV/Places365
    lighting analysis with CLIP's lighting prediction.
    """

    def __init__(self, clip_manager):
        # clip_manager must expose classify_hierarchical(image) -> dict with
        # 'top_prediction' and 'confidence' (used in fuse_detections)
        self.clip_manager = clip_manager

    def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict:
        """Intelligently fuse CV+Places365 lighting with CLIP scene understanding.

        Strategy (in priority order):
          1. CV confidence > 0.85 -> trust CV outright ('cv_dominant')
          2. CV and CLIP descriptions semantically agree -> keep CV's wording,
             boost confidence ('consensus')
          3. Otherwise weight by confidence; CV wins above 0.6 weight
             ('cv_weighted'), else fall back to a generalized feature-based
             description ('generalized').

        Returns a dict with the fused lighting type, capped confidence, both
        source predictions/confidences, and which fusion path was taken.
        """

        cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light')
        cv_confidence = cv_lighting.get('confidence', 0.7)
        cv_features = cv_lighting.get('cv_features', {})

        # Get CLIP's lighting prediction
        clip_lighting_data = clip_scene.get('lighting', {})
        clip_lighting_type = clip_lighting_data.get('top', 'natural light')
        clip_confidence = clip_lighting_data.get('confidence', 0.5)

        # Intelligent fusion strategy:
        # 1. If CV has high confidence (>0.85), trust it
        # 2. If CV and CLIP semantically agree, boost confidence
        # 3. Otherwise, weighted average based on confidence

        if cv_confidence > 0.85:
            # High confidence from CV+Places365
            final_lighting = cv_lighting_type
            final_confidence = cv_confidence
            fusion_method = 'cv_dominant'

        elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type):
            # Semantic agreement between CV and CLIP
            final_lighting = cv_lighting_type  # Prefer CV's specific description
            # Boost confidence when both agree (1.15x, capped at 0.95)
            final_confidence = min(cv_confidence * 1.15, 0.95)
            fusion_method = 'consensus'

        else:
            # Weighted fusion based on confidence
            # NOTE(review): assumes cv_confidence + clip_confidence > 0;
            # zero-confidence inputs would divide by zero — confirm upstream
            # guarantees non-zero confidences.
            cv_weight = cv_confidence / (cv_confidence + clip_confidence)
            clip_weight = 1.0 - cv_weight

            # If CV weight is higher, use CV result
            if cv_weight > 0.6:
                final_lighting = cv_lighting_type
                final_confidence = cv_confidence * 0.9  # penalize disagreement
                fusion_method = 'cv_weighted'
            else:
                # Use more generic description when uncertain
                final_lighting = self._generalize_lighting_description(
                    cv_lighting_type, clip_lighting_type, cv_features
                )
                final_confidence = (cv_confidence * cv_weight + clip_confidence * clip_weight) * 0.85
                fusion_method = 'generalized'

        return {
            'lighting_type': final_lighting,
            'confidence': min(final_confidence, 0.95),
            'cv_analysis': cv_lighting_type,
            'clip_prediction': clip_lighting_type,
            'fusion_method': fusion_method,
            'cv_confidence': cv_confidence,
            'clip_confidence': clip_confidence
        }

    def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool:
        """Check if two lighting descriptions are semantically similar.

        True when both descriptions contain a word from the same semantic
        group, or when they share at least one literal word.
        """
        # Define semantic similarity groups
        similarity_groups = [
            {'soft', 'diffused', 'overcast', 'cloudy'},
            {'bright', 'sunny', 'sunlight', 'clear'},
            {'warm', 'golden', 'amber', 'evening'},
            {'natural', 'daylight', 'outdoor'},
            {'cool', 'blue', 'twilight'},
        ]

        cv_words = set(cv_type.lower().split())
        clip_words = set(clip_type.lower().split())

        # Check if both descriptions share words from same semantic group
        for group in similarity_groups:
            cv_match = cv_words & group
            clip_match = clip_words & group
            if cv_match and clip_match:
                return True

        # Direct word overlap (e.g. both contain "light")
        common_words = cv_words & clip_words
        return len(common_words) >= 1

    def _generalize_lighting_description(self, cv_type: str, clip_type: str,
                                         cv_features: Dict) -> str:
        """Generate a generalized lighting description when CV and CLIP disagree.

        Maps raw CV features (brightness 0-255, contrast, color temperature
        ratio) to one of three safe generic descriptions. cv_type/clip_type
        are accepted for interface symmetry but not consulted here.
        """

        brightness = cv_features.get('brightness', 128)
        contrast = cv_features.get('contrast', 50)
        color_temp = cv_features.get('color_temp', 1.0)

        # Use feature-based generalization (not hard thresholds)
        brightness_norm = brightness / 255.0
        contrast_norm = min(contrast / 100.0, 1.0)

        # Decision tree based on physical features
        if contrast_norm < 0.5:
            # Low contrast
            if color_temp < 1.0:
                return 'soft diffused light'
            else:
                return 'warm ambient light'
        elif brightness_norm > 0.7:
            # High brightness
            return 'natural daylight'
        elif color_temp > 1.1:
            # Warm temperature
            return 'warm ambient light'
        else:
            # Default safe description
            return 'soft diffused light'

    def analyze_composition(self, image, detections: List[Dict]) -> Dict:
        """Analyze image composition from detection bounding boxes.

        Classifies the frame by the ratio of taller-than-wide boxes; `image`
        is accepted but not used in the current implementation.
        """
        if not detections:
            return {'composition_type': 'empty', 'vertical_ratio': 0.0}

        # Calculate vertical element ratio (boxes taller than wide)
        vertical_objects = [
            d for d in detections
            if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0])
        ]
        vertical_ratio = len(vertical_objects) / max(len(detections), 1)

        # Determine composition type from the vertical ratio thresholds
        if vertical_ratio > 0.6:
            composition_type = 'urban canyon'
        elif vertical_ratio > 0.4:
            composition_type = 'vertical emphasis'
        else:
            composition_type = 'standard street view'

        return {
            'composition_type': composition_type,
            'vertical_ratio': vertical_ratio,
            'vertical_objects_count': len(vertical_objects),
            'total_objects': len(detections)
        }

    def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict],
                        scene_info: Dict, image=None, cv_lighting: Dict = None) -> Dict:
        """Fuse all detection results with intelligent lighting fusion.

        Args:
            yolo_results: YOLO detections (each with 'bbox', 'confidence', ...).
            unknown_regions: Salient regions with an 'image' crop to classify
                via OpenCLIP; entries without 'image' are skipped.
            scene_info: CLIP scene analysis; its 'lighting' entry is replaced
                in place when cv_lighting is provided.
            image: Optional full image, forwarded to analyze_composition.
            cv_lighting: Optional CV/Places365 lighting dict to fuse.

        Returns:
            Dict with ranked 'detections' (top-15 + high-confidence brands),
            updated 'scene_info', 'composition', and 'total_objects'.
        """
        all_detections = []

        # Process YOLO detections with attention scores
        for det in yolo_results:
            attention_score = self._calculate_attention_score(det)
            det['attention_score'] = attention_score  # mutates input dicts
            all_detections.append(det)

        # Classify unknown regions using OpenCLIP
        for region in unknown_regions:
            if 'image' not in region:
                continue

            classification = self.clip_manager.classify_hierarchical(region['image'])

            detection = {
                'class_name': classification['top_prediction'],
                'bbox': region['bbox'],
                'confidence': classification.get('confidence', 0.5),
                'attention_score': region.get('saliency_score', 0.5),
                'source': 'openclip'
            }
            all_detections.append(detection)

        # Sort by attention score, highest first
        ranked_detections = sorted(
            all_detections,
            key=lambda x: x['attention_score'],
            reverse=True
        )

        # Filter top 15; beyond that, keep only confident branded detections.
        # NOTE(review): the `break` stops scanning at the first non-brand item
        # past slot 15, so branded detections ranked below it are dropped —
        # confirm whether `continue` was intended instead.
        filtered = []
        for det in ranked_detections:
            if len(filtered) >= 15:
                if det.get('brand') and det.get('brand_confidence', 0) > 0.45:
                    filtered.append(det)
                else:
                    break
            else:
                filtered.append(det)

        # Analyze composition (skipped when no image is supplied)
        composition = self.analyze_composition(image, filtered) if image else {}

        # Intelligent lighting fusion
        if cv_lighting:
            fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info)
            # Update scene_info with fused lighting (in-place mutation)
            scene_info['lighting'] = {
                'top': fused_lighting['lighting_type'],
                'confidence': fused_lighting['confidence'],
                'fusion_details': fused_lighting
            }

        return {
            'detections': filtered,
            'scene_info': scene_info,
            'composition': composition,
            'total_objects': len(all_detections)
        }

    def _calculate_attention_score(self, detection: Dict) -> float:
        """Calculate attention score based on position, size, and confidence.

        Weighted blend: 0.3 * position + 0.3 * size + 0.4 * confidence.
        """
        bbox = detection['bbox']
        x1, y1, x2, y2 = bbox

        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2

        # Heuristic: x2 > 100 appears to mean "bbox is in pixel coordinates"
        # (vs normalized 0-1) — TODO confirm with the detector's output format.
        if x2 > 100:
            position_score = 0.5  # neutral score; center math only valid for normalized coords
        else:
            # Closer to image center (0.5, 0.5) -> higher score
            position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5))

        area = abs((x2 - x1) * (y2 - y1))
        if x2 > 100:
            # Normalize pixel-space area; assumes roughly 1000x1000 images —
            # NOTE(review): confirm this matches actual input resolution.
            area = area / (1000 * 1000)
        size_score = min(area, 0.5)  # cap so huge boxes don't dominate

        conf_score = detection.get('confidence', 0.5)

        attention = (
            0.3 * position_score +
            0.3 * size_score +
            0.4 * conf_score
        )

        return attention
240
+ return attention
241
+
242
+ print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")
image_processor_manager.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ from typing import Tuple, Optional, Union
6
+ import torchvision.transforms as transforms
7
+
8
class ImageProcessorManager:
    """Image validation, preprocessing and format standardization.

    Provides loaders and per-model preprocessing entry points (YOLO, CLIP,
    Qwen2.5-VL) plus aspect-ratio-preserving resizing.
    """

    def __init__(self):
        self.supported_formats = ['JPEG', 'PNG', 'WEBP', 'JPG']
        self.min_resolution = (224, 224)

        # OpenAI-CLIP normalization constants; 336x336 bicubic resize
        clip_mean = [0.48145466, 0.4578275, 0.40821073]
        clip_std = [0.26862954, 0.26130258, 0.27577711]
        self.clip_transform = transforms.Compose([
            transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=clip_mean, std=clip_std)
        ])

    def load_image(self, file_path: Union[str, Image.Image]) -> Image.Image:
        """Load an image from a path (or accept a PIL image directly),
        normalize it to RGB, and validate the minimum resolution.

        Raises:
            ValueError: if the file cannot be opened or is below min_resolution.
        """
        if not isinstance(file_path, Image.Image):
            try:
                image = Image.open(file_path)
            except Exception as e:
                raise ValueError(f"Failed to load image: {e}")
        else:
            image = file_path

        # Normalize mode (handles RGBA, grayscale, palette, ...)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Reject images smaller than the minimum on either axis
        width, height = image.size
        if width < self.min_resolution[0] or height < self.min_resolution[1]:
            raise ValueError(f"Image resolution too low, minimum required: {self.min_resolution}")

        return image

    def preprocess_for_yolo(self, image: Image.Image) -> np.ndarray:
        """Preprocess image for YOLO (keep original format as an ndarray)."""
        return np.array(image)

    def preprocess_for_clip(self, image: Image.Image) -> torch.Tensor:
        """Preprocess image for CLIP (336x336, CLIP normalization)."""
        return self.clip_transform(image)

    def preprocess_for_qwen(self, image: Image.Image) -> Image.Image:
        """Preprocess image for Qwen2.5-VL (model handles dynamic resolution)."""
        return image

    def resize_with_aspect_ratio(self, image: Image.Image, max_size: int = 1024) -> Image.Image:
        """Resize so the longest side is at most max_size, preserving aspect
        ratio; returns the image unchanged when already small enough."""
        width, height = image.size
        longest = max(width, height)
        if longest > max_size:
            scale = max_size / longest
            if width > height:
                target = (max_size, int(height * scale))
            else:
                target = (int(width * scale), max_size)
            image = image.resize(target, Image.Resampling.LANCZOS)
        return image
69
+
70
+ print("✓ ImageProcessorManager defined")
landmark_prompts.py ADDED
@@ -0,0 +1,1030 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List, Optional
3
+
4
+ class LandmarkPrompts:
5
+ """
6
+ 世界地標視覺描述與 Hashtag 資料庫
7
+ 提供 20 個世界知名地標的詳細資料
8
+ """
9
+
10
+ def __init__(self):
11
+ """初始化地標資料庫"""
12
+
13
+ self.landmarks = {
14
+ # ===== 歐洲 Europe =====
15
+ "Big Ben": {
16
+ "name": "Big Ben",
17
+ "official_name": "Elizabeth Tower",
18
+ "location": {
19
+ "city": "London",
20
+ "country": "United Kingdom",
21
+ "region": "Westminster",
22
+ "continent": "Europe"
23
+ },
24
+ "visual_cues": {
25
+ "iconic_view": [
26
+ "Gothic Revival clock tower with four ornate clock faces rising beside Westminster Palace and Thames River",
27
+ "Tall Victorian tower with intricate stone detailing golden clock faces and pointed spire against London sky",
28
+ "Famous clock tower landmark showing detailed Gothic architecture with Palace of Westminster backdrop",
29
+ "Majestic bell tower with elaborate Victorian Gothic design overlooking Westminster Bridge"
30
+ ],
31
+ "architectural_details": [
32
+ "Ornate clock faces with Roman numerals surrounded by decorative Gothic stonework and gilded details",
33
+ "Victorian Gothic Revival architecture featuring pointed arches flying buttresses and limestone facade",
34
+ "Detailed carved stonework showing Gothic tracery pinnacles and decorative moldings on tower exterior",
35
+ "Cast iron and gold leaf clock mechanisms visible within ornamental Gothic Revival tower framework"
36
+ ],
37
+ "contextual_view": [
38
+ "Clock tower rising above Westminster Bridge with red double-decker buses and Thames River in foreground",
39
+ "Big Ben silhouetted against dramatic London sunset with Westminster Palace and river reflections",
40
+ "Famous landmark viewed from Parliament Square with traffic pedestrians and London Eye in distance",
41
+ "Tower seen through tree branches in nearby park with Westminster Abbey and government buildings visible"
42
+ ],
43
+ "seasonal_lighting": [
44
+ "Tower illuminated at night with golden clock faces glowing against dark sky creating iconic London scene",
45
+ "Soft morning light highlighting limestone details as mist rises from Thames creating atmospheric mood",
46
+ "Dramatic storm clouds gathering behind tower with contrasting sunlight illuminating Gothic stonework",
47
+ "Winter scene with tower emerging from fog as streetlights reflect on wet Westminster Bridge pavement"
48
+ ]
49
+ },
50
+ "hashtags": {
51
+ "zh": ["大笨鐘", "倫敦地標", "西敏寺", "泰晤士河", "英國旅遊", "倫敦", "英國"],
52
+ "en": ["BigBen", "London", "Westminster", "Thames", "UKTravel", "LondonLandmarks", "ElizabethTower"]
53
+ },
54
+ "cultural_info": {
55
+ "built_year": 1859,
56
+ "architect": "Augustus Pugin",
57
+ "architectural_style": "Gothic Revival"
58
+ }
59
+ },
60
+
61
+ "Eiffel Tower": {
62
+ "name": "Eiffel Tower",
63
+ "official_name": "La Tour Eiffel",
64
+ "location": {
65
+ "city": "Paris",
66
+ "country": "France",
67
+ "region": "Champ de Mars",
68
+ "continent": "Europe"
69
+ },
70
+ "visual_cues": {
71
+ "iconic_view": [
72
+ "Iconic iron lattice tower rising 330 meters above Champ de Mars with distinctive tapering silhouette",
73
+ "Wrought iron structure with three observation levels showing intricate lattice framework against Paris sky",
74
+ "Famous Parisian landmark with characteristic brown paint and elegant art nouveau iron lattice design",
75
+ "Monumental tower structure displaying puddle iron construction with four massive arched base legs"
76
+ ],
77
+ "architectural_details": [
78
+ "Intricate wrought iron lattice work showing 18000 metallic parts joined by 2.5 million rivets",
79
+ "Distinctive curved base arches with elevator shafts and lattice framework creating transparent appearance",
80
+ "Observation deck platforms with iron railings providing panoramic views across Paris rooftops",
81
+ "Antique elevators and iron staircases winding through lattice structure between three viewing levels"
82
+ ],
83
+ "contextual_view": [
84
+ "Tower framed by Trocadéro fountains with reflecting pools and Parisian cityscape in background",
85
+ "Eiffel Tower viewed from Seine River with tourist boats and bridges in romantic Parisian setting",
86
+ "Landmark rising above Champ de Mars gardens with visitors and green lawns in foreground",
87
+ "Tower seen from Montparnasse showing Paris rooftops Sacré-Cœur and urban landscape panorama"
88
+ ],
89
+ "seasonal_lighting": [
90
+ "Tower illuminated at night with golden lights creating magical sparkling effect every hour",
91
+ "Sunset silhouette with tower's iron structure outlined against orange and pink Paris sky",
92
+ "Cherry blossoms framing tower in spring with soft natural light on iron lattice",
93
+ "Winter scene with tower emerging from clouds as snow dusts Champ de Mars gardens"
94
+ ]
95
+ },
96
+ "hashtags": {
97
+ "zh": ["艾菲爾鐵塔", "巴黎鐵塔", "巴黎地標", "法國旅遊", "巴黎", "鐵塔"],
98
+ "en": ["EiffelTower", "Paris", "ParisLandmark", "TourEiffel", "France", "ParisTravel"]
99
+ },
100
+ "cultural_info": {
101
+ "built_year": 1889,
102
+ "architect": "Gustave Eiffel",
103
+ "architectural_style": "Structural Expressionism"
104
+ }
105
+ },
106
+
107
+ "Colosseum": {
108
+ "name": "Colosseum",
109
+ "official_name": "Flavian Amphitheatre",
110
+ "location": {
111
+ "city": "Rome",
112
+ "country": "Italy",
113
+ "region": "Lazio",
114
+ "continent": "Europe"
115
+ },
116
+ "visual_cues": {
117
+ "iconic_view": [
118
+ "Ancient Roman amphitheater with massive oval structure showing three tiers of arches in weathered stone",
119
+ "Iconic ruined arena with partially collapsed walls revealing internal chambers and underground passages",
120
+ "Monumental stone amphitheater displaying Roman engineering with distinctive arched facade and columns",
121
+ "Historic gladiatorial arena showing travertine limestone construction with Doric Ionic and Corinthian orders"
122
+ ],
123
+ "architectural_details": [
124
+ "Three stories of arches supported by columns showing progression of classical orders from ground to top",
125
+ "Weathered travertine blocks and brick revealing ancient construction techniques and earthquake damage",
126
+ "Hypogeum underground chambers visible through arena floor showing complex staging machinery areas",
127
+ "Massive exterior wall with remaining arches brackets and column fragments from original four-story height"
128
+ ],
129
+ "contextual_view": [
130
+ "Colosseum rising above Roman Forum with ancient temples columns and ruins in surrounding area",
131
+ "Amphitheater viewed from Palatine Hill showing relationship to Imperial Palace and Roman landscape",
132
+ "Monument surrounded by modern Rome with traffic tourists and urban development contrasting ancient stone",
133
+ "Arena illuminated at dusk with Constantine's Arch and Roman ruins visible in archaeological park"
134
+ ],
135
+ "seasonal_lighting": [
136
+ "Golden hour light warming travertine stone with dramatic shadows emphasizing architectural depth",
137
+ "Night illumination creating dramatic effect on ancient arches with warm amber lighting",
138
+ "Overcast sky providing even light showing weathering patterns and stone texture details",
139
+ "Bright midday sun creating strong contrast between light and shadow in deep archways"
140
+ ]
141
+ },
142
+ "hashtags": {
143
+ "zh": ["羅馬競技場", "古羅馬", "羅馬", "義大利旅遊", "古蹟", "世界遺產"],
144
+ "en": ["Colosseum", "Rome", "AncientRome", "Italy", "Roman", "WorldHeritage"]
145
+ },
146
+ "cultural_info": {
147
+ "built_year": 80,
148
+ "architect": "Emperor Vespasian",
149
+ "architectural_style": "Ancient Roman"
150
+ }
151
+ },
152
+
153
+ "Sagrada Familia": {
154
+ "name": "Sagrada Familia",
155
+ "official_name": "Basílica de la Sagrada Família",
156
+ "location": {
157
+ "city": "Barcelona",
158
+ "country": "Spain",
159
+ "region": "Catalonia",
160
+ "continent": "Europe"
161
+ },
162
+ "visual_cues": {
163
+ "iconic_view": [
164
+ "Extraordinary basilica with soaring organic towers showing Gaudí's distinctive naturalistic Gothic design",
165
+ "Unfinished cathedral with multiple spires featuring intricate stone carving and colorful mosaic details",
166
+ "Fantastical church architecture combining Gothic and Art Nouveau with nature-inspired sculptural forms",
167
+ "Massive religious monument with elaborate facades showing biblical scenes in highly detailed stonework"
168
+ ],
169
+ "architectural_details": [
170
+ "Organic columns branching like trees supporting vaulted ceiling with natural light filtering through",
171
+ "Nativity facade with detailed sculptural groups showing biblical narratives in stone",
172
+ "Colorful stained glass windows creating rainbow light effects throughout cathedral interior",
173
+ "Hyperboloid structures and ruled surfaces demonstrating Gaudí's mathematical geometric approach"
174
+ ],
175
+ "contextual_view": [
176
+ "Basilica towers rising above Barcelona cityscape with Mediterranean architecture and urban landscape",
177
+ "Church viewed from Plaça de Gaudí with reflecting pool mirroring elaborate facades",
178
+ "Construction cranes visible around towers showing ongoing building work on Gaudí's vision",
179
+ "Interior forest of columns with visitors experiencing spectacular light and space"
180
+ ],
181
+ "seasonal_lighting": [
182
+ "Sunset light streaming through stained glass creating vibrant color patterns on stone columns",
183
+ "Night illumination highlighting intricate facade details with dramatic architectural lighting",
184
+ "Morning light revealing texture and depth of carved stone with soft shadows",
185
+ "Bright Mediterranean sun emphasizing colorful mosaic work on tower exteriors"
186
+ ]
187
+ },
188
+ "hashtags": {
189
+ "zh": ["聖家堂", "巴塞隆納", "高第建築", "西班牙旅遊", "世界遺產", "教堂"],
190
+ "en": ["SagradaFamilia", "Barcelona", "Gaudi", "Spain", "Cathedral", "Architecture"]
191
+ },
192
+ "cultural_info": {
193
+ "built_year": 1882,
194
+ "architect": "Antoni Gaudí",
195
+ "architectural_style": "Catalan Modernism"
196
+ }
197
+ },
198
+
199
+ "Brandenburg Gate": {
200
+ "name": "Brandenburg Gate",
201
+ "official_name": "Brandenburger Tor",
202
+ "location": {
203
+ "city": "Berlin",
204
+ "country": "Germany",
205
+ "region": "Mitte",
206
+ "continent": "Europe"
207
+ },
208
+ "visual_cues": {
209
+ "iconic_view": [
210
+ "Neoclassical triumphal arch with twelve Doric columns supporting entablature and Quadriga sculpture",
211
+ "Monumental city gate with goddess of victory chariot crowning sandstone classical structure",
212
+ "Historic gateway showing Greek Revival architecture with columned portico and sculptural decoration",
213
+ "Famous Berlin landmark with symmetrical design and copper Quadriga statue against sky"
214
+ ],
215
+ "architectural_details": [
216
+ "Twelve Doric columns arranged in six pairs creating five passageways through gate structure",
217
+ "Quadriga sculpture showing goddess Victoria in four-horse chariot with Prussian eagle and Iron Cross",
218
+ "Sandstone construction with classical Greek proportions and restrained decorative elements",
219
+ "Relief sculptures in metopes showing mythological scenes and Prussian military symbolism"
220
+ ],
221
+ "contextual_view": [
222
+ "Gate standing at Pariser Platz with modern buildings and historic square surrounding monument",
223
+ "Brandenburg Gate viewed down Unter den Linden boulevard with linden trees and embassies",
224
+ "Monument at edge of Tiergarten park showing relationship to green space and city",
225
+ "Gate illuminated with Reichstag building and government district visible in background"
226
+ ],
227
+ "seasonal_lighting": [
228
+ "Dramatic night lighting in various colors for events creating stunning visual effects",
229
+ "Soft morning light highlighting sandstone texture and classical architectural details",
230
+ "Sunset silhouette with Quadriga outlined against colorful Berlin sky",
231
+ "Winter scene with gate surrounded by Christmas market lights and seasonal decorations"
232
+ ]
233
+ },
234
+ "hashtags": {
235
+ "zh": ["布蘭登堡門", "柏林", "德國旅遊", "歷史建築", "柏林地標"],
236
+ "en": ["BrandenburgGate", "Berlin", "Germany", "BerlinLandmark", "GermanHistory"]
237
+ },
238
+ "cultural_info": {
239
+ "built_year": 1791,
240
+ "architect": "Carl Gotthard Langhans",
241
+ "architectural_style": "Neoclassicism"
242
+ }
243
+ },
244
+
245
+ # ===== 亞洲 Asia =====
246
+ "Tokyo Tower": {
247
+ "name": "Tokyo Tower",
248
+ "official_name": "東京タワー",
249
+ "location": {
250
+ "city": "Tokyo",
251
+ "country": "Japan",
252
+ "region": "Minato",
253
+ "continent": "Asia"
254
+ },
255
+ "visual_cues": {
256
+ "iconic_view": [
257
+ "Red and white lattice steel tower inspired by Eiffel Tower rising 333 meters above Tokyo",
258
+ "Iconic communication tower with distinctive orange and white paint showing two observation decks",
259
+ "Tall broadcasting tower with lattice framework and observation platforms overlooking Tokyo cityscape",
260
+ "Famous Japanese landmark tower with red-orange color scheme and tapering lattice structure"
261
+ ],
262
+ "architectural_details": [
263
+ "Steel lattice framework painted international orange and white for aviation safety",
264
+ "Two observation decks at 150m and 250m heights with panoramic windows and viewing platforms",
265
+ "Four massive support legs with elevators and emergency stairs running through lattice structure",
266
+ "Broadcasting antennas and equipment at tower top with decorative lighting systems"
267
+ ],
268
+ "contextual_view": [
269
+ "Tower rising above Shiba Park with traditional temple buildings and modern Tokyo skyscrapers",
270
+ "Tokyo Tower viewed from Roppongi Hills with Mount Fuji visible in distant background",
271
+ "Landmark tower dominating skyline with Rainbow Bridge and Tokyo Bay in view",
272
+ "Tower surrounded by cherry blossoms in spring with pink petals and urban landscape"
273
+ ],
274
+ "seasonal_lighting": [
275
+ "Tower illuminated at night in orange creating warm glow against Tokyo night sky",
276
+ "Special lighting displays in various colors for holidays and events creating festive atmosphere",
277
+ "Sunset view with tower silhouetted against orange and pink sky",
278
+ "Winter illumination with tower and surrounding trees decorated with seasonal lights"
279
+ ]
280
+ },
281
+ "hashtags": {
282
+ "zh": ["東京鐵塔", "東京", "日本旅遊", "東京地標", "日本"],
283
+ "en": ["TokyoTower", "Tokyo", "Japan", "TokyoLandmark", "JapanTravel"]
284
+ },
285
+ "cultural_info": {
286
+ "built_year": 1958,
287
+ "architect": "Tachū Naitō",
288
+ "architectural_style": "Lattice Tower"
289
+ }
290
+ },
291
+
292
+ "Taipei 101": {
293
+ "name": "Taipei 101",
294
+ "official_name": "台北101",
295
+ "location": {
296
+ "city": "Taipei",
297
+ "country": "Taiwan",
298
+ "region": "Xinyi District",
299
+ "continent": "Asia"
300
+ },
301
+ "visual_cues": {
302
+ "iconic_view": [
303
+ "Massive skyscraper with bamboo-inspired segmented design rising 508 meters above Taipei",
304
+ "101-story tower with distinctive eight-segment structure and traditional Chinese architectural elements",
305
+ "Iconic green-glass building with pagoda-like tiers showing postmodern Asian design",
306
+ "Supertall skyscraper with gold-tinted windows and traditional motifs in modern interpretation"
307
+ ],
308
+ "architectural_details": [
309
+ "Eight eight-story modules stacked vertically representing prosperity in Chinese numerology",
310
+ "Traditional ruyi ornaments at corners of each section adding cultural architectural elements",
311
+ "Massive tuned mass damper sphere visible to visitors providing earthquake protection",
312
+ "Double-deck elevators with pressurization system ascending at world-record speeds"
313
+ ],
314
+ "contextual_view": [
315
+ "Tower dominating Taipei skyline with Elephant Mountain and lush green hills in background",
316
+ "Building viewed from Xiangshan with city sprawl and mountains creating dramatic setting",
317
+ "Taipei 101 rising above Xinyi shopping district with modern urban development below",
318
+ "Tower illuminated against night sky with busy streets and city lights surrounding base"
319
+ ],
320
+ "seasonal_lighting": [
321
+ "New Year's Eve fireworks display launched from building creating spectacular light show",
322
+ "LED lighting system displaying colors for holidays and special occasions",
323
+ "Sunset illumination with building's glass reflecting golden and orange tones",
324
+ "Night view with tower lit in green and gold standing out against dark sky"
325
+ ]
326
+ },
327
+ "hashtags": {
328
+ "zh": ["台北101", "台北", "台灣", "台北地標", "摩天大樓", "台灣旅遊"],
329
+ "en": ["Taipei101", "Taipei", "Taiwan", "TaipeiLandmark", "Skyscraper", "TaiwanTravel"]
330
+ },
331
+ "cultural_info": {
332
+ "built_year": 2004,
333
+ "architect": "C.Y. Lee & Partners",
334
+ "architectural_style": "Postmodern"
335
+ }
336
+ },
337
+
338
+ "Burj Khalifa": {
339
+ "name": "Burj Khalifa",
340
+ "official_name": "برج خليفة",
341
+ "location": {
342
+ "city": "Dubai",
343
+ "country": "United Arab Emirates",
344
+ "region": "Downtown Dubai",
345
+ "continent": "Asia"
346
+ },
347
+ "visual_cues": {
348
+ "iconic_view": [
349
+ "World's tallest building at 828 meters with Y-shaped floor plan and sleek tapering design",
350
+ "Supertall skyscraper with reflective glass facade and setback design inspired by desert flower",
351
+ "Iconic needle-like tower piercing clouds with distinctive spire and observation decks",
352
+ "Neo-futurist architecture with Islamic geometric patterns in modern glass and steel construction"
353
+ ],
354
+ "architectural_details": [
355
+ "Buttressed core structural system with wings extending from central hexagonal hub",
356
+ "Reflective glazing with aluminum and textured stainless steel spandrel panels",
357
+ "Observation decks on 124th 125th and 148th floors offering panoramic views",
358
+ "Spire adding 200 meters to height with communication equipment and decorative elements"
359
+ ],
360
+ "contextual_view": [
361
+ "Tower rising from Downtown Dubai with Dubai Mall fountain show and urban development below",
362
+ "Building dominating skyline with Persian Gulf and Palm Jumeirah visible in distance",
363
+ "Burj Khalifa viewed from desert showing contrast between modern architecture and natural landscape",
364
+ "Tower at center of Dubai's business district with surrounding high-rises and infrastructure"
365
+ ],
366
+ "seasonal_lighting": [
367
+ "LED light show on facade creating dynamic patterns and colors for celebrations",
368
+ "Night illumination with tower glowing against dark sky as city lights spread below",
369
+ "Sunset view with building's glass reflecting orange and gold desert light",
370
+ "New Year's Eve spectacular with building covered in coordinated light and firework display"
371
+ ]
372
+ },
373
+ "hashtags": {
374
+ "zh": ["哈里發塔", "杜拜", "阿聯酋", "世界最高", "摩天大樓", "杜拜旅遊"],
375
+ "en": ["BurjKhalifa", "Dubai", "UAE", "WorldsTallest", "Skyscraper", "DubaiTravel"]
376
+ },
377
+ "cultural_info": {
378
+ "built_year": 2010,
379
+ "architect": "Adrian Smith (SOM)",
380
+ "architectural_style": "Neo-futurism"
381
+ }
382
+ },
383
+
384
+ "Petronas Towers": {
385
+ "name": "Petronas Towers",
386
+ "official_name": "Menara Berkembar Petronas",
387
+ "location": {
388
+ "city": "Kuala Lumpur",
389
+ "country": "Malaysia",
390
+ "region": "KLCC",
391
+ "continent": "Asia"
392
+ },
393
+ "visual_cues": {
394
+ "iconic_view": [
395
+ "Twin skyscrapers with Islamic-inspired design connected by sky bridge at 452 meters height",
396
+ "Matching 88-story towers with distinctive postmodern style and geometric floor plans",
397
+ "Iconic twin towers with stainless steel and glass facades showing eight-pointed star motif",
398
+ "Symmetrical tower pair with sky bridge and spires creating recognizable Kuala Lumpur silhouette"
399
+ ],
400
+ "architectural_details": [
401
+ "Floor plan based on Islamic geometric patterns with two interlocking squares creating eight-pointed star",
402
+ "Stainless steel and glass curtain wall with Islamic art-inspired design elements",
403
+ "Double-deck sky bridge on 41st and 42nd floors connecting towers at 170 meters height",
404
+ "Pinnacles adding 73 meters to height with Islamic architectural styling and lighting"
405
+ ],
406
+ "contextual_view": [
407
+ "Towers dominating KLCC Park with fountain lake and green space in foreground",
408
+ "Twin buildings viewed from KL Tower showing relationship to city and surrounding jungle hills",
409
+ "Petronas Towers as centerpiece of business district with modern urban development",
410
+ "Towers reflecting in KLCC Park water features with tropical landscaping and city backdrop"
411
+ ],
412
+ "seasonal_lighting": [
413
+ "Towers illuminated at night with synchronized lighting creating mirror image effect",
414
+ "Special lighting displays for Malaysian holidays in national colors",
415
+ "Blue hour with towers glowing against twilight sky as city lights emerge",
416
+ "Dramatic storm clouds behind towers with lightning and architectural lighting contrast"
417
+ ]
418
+ },
419
+ "hashtags": {
420
+ "zh": ["雙子星大樓", "吉隆坡", "馬來西亞", "雙峰塔", "吉隆坡地標"],
421
+ "en": ["PetronasTowers", "KualaLumpur", "Malaysia", "TwinTowers", "KLCC"]
422
+ },
423
+ "cultural_info": {
424
+ "built_year": 1998,
425
+ "architect": "César Pelli",
426
+ "architectural_style": "Postmodern Islamic"
427
+ }
428
+ },
429
+
430
+ "Forbidden City": {
431
+ "name": "Forbidden City",
432
+ "official_name": "故宮",
433
+ "location": {
434
+ "city": "Beijing",
435
+ "country": "China",
436
+ "region": "Dongcheng District",
437
+ "continent": "Asia"
438
+ },
439
+ "visual_cues": {
440
+ "iconic_view": [
441
+ "Massive imperial palace complex with yellow-glazed roof tiles and red walls showing traditional Chinese architecture",
442
+ "Ancient palace with multiple courtyards ceremonial halls and gates in classical Chinese design",
443
+ "Historic royal residence with distinctive golden roofs and vermilion walls in orthogonal layout",
444
+ "Imperial complex with 980 buildings showing Ming and Qing dynasty architectural grandeur"
445
+ ],
446
+ "architectural_details": [
447
+ "Yellow glazed roof tiles symbolizing imperial authority with elaborate ceramic figurine decorations",
448
+ "Vermilion walls and columns with golden door studs arranged in traditional Chinese numerical symbolism",
449
+ "Marble terraces and balustrades with dragon and phoenix carved relief decorations",
450
+ "Wooden architecture using traditional dougong bracket system without nails in construction"
451
+ ],
452
+ "contextual_view": [
453
+ "Palace viewed through Meridian Gate with vast courtyard and Hall of Supreme Harmony beyond",
454
+ "Forbidden City from Jingshan Park showing complete palace layout and Beijing cityscape",
455
+ "Palace moat and walls with modern Beijing visible in background showing old and new contrast",
456
+ "Interior courtyard with tourists and traditional architecture under blue Beijing sky"
457
+ ],
458
+ "seasonal_lighting": [
459
+ "Winter snow covering golden roofs creating dramatic color contrast with white and gold",
460
+ "Autumn light warming red walls with traditional Chinese architecture in clear air",
461
+ "Sunset illuminating yellow roof tiles with golden hour light creating magical atmosphere",
462
+ "Night opening events with palace buildings subtly illuminated showing architectural details"
463
+ ]
464
+ },
465
+ "hashtags": {
466
+ "zh": ["故宮", "北京", "紫禁城", "中國", "古蹟", "世界遺產"],
467
+ "en": ["ForbiddenCity", "Beijing", "China", "ImperialPalace", "WorldHeritage", "Palace"]
468
+ },
469
+ "cultural_info": {
470
+ "built_year": 1420,
471
+ "architect": "Kuai Xiang",
472
+ "architectural_style": "Traditional Chinese"
473
+ }
474
+ },
475
+
476
+ # ===== 美洲 Americas =====
477
+ "Statue of Liberty": {
478
+ "name": "Statue of Liberty",
479
+ "official_name": "Liberty Enlightening the World",
480
+ "location": {
481
+ "city": "New York",
482
+ "country": "United States",
483
+ "region": "Liberty Island",
484
+ "continent": "North America"
485
+ },
486
+ "visual_cues": {
487
+ "iconic_view": [
488
+ "Colossal neoclassical sculpture with copper patina holding torch aloft on Liberty Island",
489
+ "Famous statue with crown and tablet showing robed female figure representing Libertas",
490
+ "Iconic green copper statue on pedestal with torch raised and broken chains at feet",
491
+ "Monument with seven-ray crown tablet and torch symbolizing freedom and democracy"
492
+ ],
493
+ "architectural_details": [
494
+ "Copper skin with green patina over iron framework designed by Gustave Eiffel",
495
+ "Crown with seven rays representing seven continents and seas with 25 windows",
496
+ "Tablet inscribed with July 4 1776 in Roman numerals held in left hand",
497
+ "Broken shackles and chains at feet symbolizing freedom from oppression"
498
+ ],
499
+ "contextual_view": [
500
+ "Statue viewed from Battery Park with New York Harbor and Manhattan skyline behind",
501
+ "Liberty Island with statue and star-shaped Fort Wood pedestal from aerial view",
502
+ "Statue with Staten Island Ferry passing in foreground and Ellis Island nearby",
503
+ "Sunset silhouette with statue outlined against orange sky and New York City lights"
504
+ ],
505
+ "seasonal_lighting": [
506
+ "Statue illuminated at night with dramatic uplighting showing sculptural details",
507
+ "Golden hour light warming copper patina with soft shadows on draped clothing",
508
+ "Fourth of July fireworks surrounding statue with patriotic celebration",
509
+ "Misty morning with statue emerging from harbor fog creating mystical atmosphere"
510
+ ]
511
+ },
512
+ "hashtags": {
513
+ "zh": ["自由女神", "紐約", "美國", "紐約地標", "自由女神像"],
514
+ "en": ["StatueOfLiberty", "NewYork", "NYC", "Liberty", "USA", "America"]
515
+ },
516
+ "cultural_info": {
517
+ "built_year": 1886,
518
+ "architect": "Frédéric Auguste Bartholdi",
519
+ "architectural_style": "Neoclassicism"
520
+ }
521
+ },
522
+
523
+ "Golden Gate Bridge": {
524
+ "name": "Golden Gate Bridge",
525
+ "official_name": "Golden Gate Bridge",
526
+ "location": {
527
+ "city": "San Francisco",
528
+ "country": "United States",
529
+ "region": "California",
530
+ "continent": "North America"
531
+ },
532
+ "visual_cues": {
533
+ "iconic_view": [
534
+ "Suspension bridge with distinctive International Orange color spanning Golden Gate strait",
535
+ "Art Deco bridge with two towers and cables connecting San Francisco to Marin County",
536
+ "Famous orange bridge with 1.7-mile span over blue Pacific waters and hills beyond",
537
+ "Iconic suspension structure with tall towers and sweeping cables against San Francisco Bay"
538
+ ],
539
+ "architectural_details": [
540
+ "Art Deco towers rising 227 meters above water with distinctive vertical ribbing",
541
+ "Main suspension cables made of 27000 wires in distinctive orange color",
542
+ "Deck structure with six traffic lanes suspended from vertical cables",
543
+ "Art Deco design elements including tower portals and lighting fixtures in period style"
544
+ ],
545
+ "contextual_view": [
546
+ "Bridge viewed from Marin Headlands with San Francisco skyline and bay in background",
547
+ "Golden Gate from Baker Beach with bridge spanning across water to northern hills",
548
+ "Bridge emerging from famous fog with towers visible above marine layer",
549
+ "Aerial view showing complete span connecting two peninsulas across Golden Gate strait"
550
+ ],
551
+ "seasonal_lighting": [
552
+ "Sunset with bridge silhouetted against orange and purple sky over Pacific Ocean",
553
+ "Bridge partially obscured by fog creating mysterious atmospheric effect",
554
+ "Blue hour with bridge illuminated and city lights twinkling in background",
555
+ "Clear day with International Orange color vibrant against blue sky and water"
556
+ ]
557
+ },
558
+ "hashtags": {
559
+ "zh": ["金門大橋", "舊金山", "美國", "三藩市", "加州"],
560
+ "en": ["GoldenGateBridge", "SanFrancisco", "SF", "California", "USA", "Bridge"]
561
+ },
562
+ "cultural_info": {
563
+ "built_year": 1937,
564
+ "architect": "Joseph Strauss",
565
+ "architectural_style": "Art Deco"
566
+ }
567
+ },
568
+
569
+ "Christ the Redeemer": {
570
+ "name": "Christ the Redeemer",
571
+ "official_name": "Cristo Redentor",
572
+ "location": {
573
+ "city": "Rio de Janeiro",
574
+ "country": "Brazil",
575
+ "region": "Corcovado Mountain",
576
+ "continent": "South America"
577
+ },
578
+ "visual_cues": {
579
+ "iconic_view": [
580
+ "Massive Art Deco statue of Jesus Christ with outstretched arms atop Corcovado mountain",
581
+ "Colossal soapstone and concrete sculpture overlooking Rio with arms spanning 28 meters",
582
+ "Iconic statue at 30 meters height standing on 8-meter pedestal above rainforest",
583
+ "Monument with distinctive silhouette of Christ figure blessing city from mountain peak"
584
+ ],
585
+ "architectural_details": [
586
+ "Reinforced concrete and soapstone construction with Art Deco styling",
587
+ "Triangular mosaic tiles covering exterior in whitish soapstone material",
588
+ "Internal chapel at pedestal base with access stairs and elevator system",
589
+ "Outstretched arms forming cross shape with detailed hands and robed figure"
590
+ ],
591
+ "contextual_view": [
592
+ "Statue viewed from Sugarloaf Mountain with Guanabara Bay and Rio sprawl below",
593
+ "Christ overlooking Copacabana and Ipanema beaches with Atlantic Ocean beyond",
594
+ "Monument surrounded by Tijuca Forest with lush tropical vegetation on mountain",
595
+ "Aerial view showing statue's position above city with both ocean and mountains visible"
596
+ ],
597
+ "seasonal_lighting": [
598
+ "Statue illuminated at night with dramatic lighting visible across Rio",
599
+ "Sunset silhouette with statue outlined against orange sky above darkening city",
600
+ "Stormy weather with lightning behind statue creating dramatic atmosphere",
601
+ "Special event lighting in various colors for holidays and celebrations"
602
+ ]
603
+ },
604
+ "hashtags": {
605
+ "zh": ["基督像", "里約熱內盧", "巴西", "救世基督像", "世界新七大奇蹟"],
606
+ "en": ["ChristTheRedeemer", "Rio", "Brazil", "RioDeJaneiro", "CristoRedentor"]
607
+ },
608
+ "cultural_info": {
609
+ "built_year": 1931,
610
+ "architect": "Paul Landowski",
611
+ "architectural_style": "Art Deco"
612
+ }
613
+ },
614
+
615
+ "CN Tower": {
616
+ "name": "CN Tower",
617
+ "official_name": "Canadian National Tower",
618
+ "location": {
619
+ "city": "Toronto",
620
+ "country": "Canada",
621
+ "region": "Ontario",
622
+ "continent": "North America"
623
+ },
624
+ "visual_cues": {
625
+ "iconic_view": [
626
+ "Concrete communication tower at 553 meters with distinctive pod and antenna spire",
627
+ "Iconic Toronto landmark with observation deck pod and long concrete shaft",
628
+ "Tall broadcasting tower with revolving restaurant and glass floor observation area",
629
+ "Slender concrete tower dominating Toronto skyline with characteristic Y-shaped floor plan"
630
+ ],
631
+ "architectural_details": [
632
+ "Hexagonal concrete shaft with three support legs forming Y-shaped base",
633
+ "SkyPod observation level with indoor and outdoor viewing areas at 447 meters",
634
+ "Glass floor section allowing visitors to look straight down to ground",
635
+ "Revolving restaurant completing 360-degree rotation every 72 minutes"
636
+ ],
637
+ "contextual_view": [
638
+ "Tower rising above Toronto skyline with Lake Ontario and city sprawl visible",
639
+ "CN Tower viewed from Toronto Islands with waterfront and downtown core",
640
+ "Tower dominating cityscape with Rogers Centre stadium and financial district nearby",
641
+ "Landmark visible from throughout Greater Toronto Area as defining skyline element"
642
+ ],
643
+ "seasonal_lighting": [
644
+ "Tower illuminated at night in various colors for events and causes",
645
+ "Canada Day celebration with tower lit in red and white national colors",
646
+ "Sunset with tower silhouetted against colorful sky over Lake Ontario",
647
+ "Winter scene with tower emerging from snow and city lights below"
648
+ ]
649
+ },
650
+ "hashtags": {
651
+ "zh": ["CN塔", "多倫多", "加拿大", "多倫多地標", "加拿大國家電視塔"],
652
+ "en": ["CNTower", "Toronto", "Canada", "TorontoLandmark", "YYZ"]
653
+ },
654
+ "cultural_info": {
655
+ "built_year": 1976,
656
+ "architect": "John Andrews",
657
+ "architectural_style": "Modern"
658
+ }
659
+ },
660
+
661
+ # ===== 大洋洲與其他 Oceania & Others =====
662
+ "Sydney Opera House": {
663
+ "name": "Sydney Opera House",
664
+ "official_name": "Sydney Opera House",
665
+ "location": {
666
+ "city": "Sydney",
667
+ "country": "Australia",
668
+ "region": "Bennelong Point",
669
+ "continent": "Oceania"
670
+ },
671
+ "visual_cues": {
672
+ "iconic_view": [
673
+ "Expressionist modern design with distinctive white shell-shaped roof sails on harbor peninsula",
674
+ "Multiple shell structures covered in white and cream tiles rising from water's edge",
675
+ "Iconic performance venue with overlapping concrete shells creating sail-like silhouette",
676
+ "Modernist architecture with innovative roof design of interlocking vaulted shells"
677
+ ],
678
+ "architectural_details": [
679
+ "Precast concrete ribs covered with 1056006 white and cream Swedish tiles",
680
+ "Shell structures based on spherical geometry creating self-supporting roof sections",
681
+ "Glass curtain walls filling spaces between shells and podium below",
682
+ "Multiple performance halls including Concert Hall and Joan Sutherland Theatre within shells"
683
+ ],
684
+ "contextual_view": [
685
+ "Opera House on Bennelong Point with Sydney Harbour Bridge in background",
686
+ "Building viewed from Circular Quay with harbor ferries and city skyline",
687
+ "Opera House at sunset with sails reflecting golden light over harbor waters",
688
+ "Aerial view showing building's position on peninsula with Royal Botanic Gardens adjacent"
689
+ ],
690
+ "seasonal_lighting": [
691
+ "Vivid Sydney festival with colorful projections on shell surfaces",
692
+ "Sunset illuminating white tiles with warm light and harbor reflections",
693
+ "Night lighting highlighting architectural forms against dark harbor",
694
+ "New Year's Eve with fireworks from Harbour Bridge framing Opera House"
695
+ ]
696
+ },
697
+ "hashtags": {
698
+ "zh": ["雪梨歌劇院", "雪梨", "澳洲", "澳大利亞", "世界遺產"],
699
+ "en": ["SydneyOperaHouse", "Sydney", "Australia", "OperaHouse", "WorldHeritage"]
700
+ },
701
+ "cultural_info": {
702
+ "built_year": 1973,
703
+ "architect": "Jørn Utzon",
704
+ "architectural_style": "Expressionist Modernism"
705
+ }
706
+ },
707
+
708
+ "Taj Mahal": {
709
+ "name": "Taj Mahal",
710
+ "official_name": "ताज महल",
711
+ "location": {
712
+ "city": "Agra",
713
+ "country": "India",
714
+ "region": "Uttar Pradesh",
715
+ "continent": "Asia"
716
+ },
717
+ "visual_cues": {
718
+ "iconic_view": [
719
+ "White marble mausoleum with central dome and four minarets in Mughal architecture style",
720
+ "Ivory-white marble structure with perfect symmetry reflected in long rectangular pool",
721
+ "Iconic domed monument with intricate inlay work and Islamic calligraphy decorations",
722
+ "Majestic tomb complex with main building flanked by symmetrical mosque and guest house"
723
+ ],
724
+ "architectural_details": [
725
+ "Central dome rising 35 meters surrounded by four smaller chattri domes",
726
+ "Pietra dura inlay work with semi-precious stones creating floral patterns",
727
+ "Four minarets at corners standing 40 meters high with tilted design for earthquake safety",
728
+ "Calligraphic inscriptions from Quran decorating entrance archways in black marble"
729
+ ],
730
+ "contextual_view": [
731
+ "Taj Mahal viewed through main gateway with frame creating first impression",
732
+ "Monument reflected in Yamuna River during calm conditions with gardens in foreground",
733
+ "Taj from Mehtab Bagh garden across river showing rear view and riverbank",
734
+ "Complex with charbagh Persian garden layout leading to mausoleum platform"
735
+ ],
736
+ "seasonal_lighting": [
737
+ "Sunrise with monument glowing pink and orange in soft morning light",
738
+ "Full moon night viewing with white marble luminous under moonlight",
739
+ "Sunset creating warm golden tones on marble with long shadows",
740
+ "Misty morning with Taj emerging from fog over Yamuna River"
741
+ ]
742
+ },
743
+ "hashtags": {
744
+ "zh": ["泰姬陵", "印度", "阿格拉", "世界遺產", "世界奇蹟"],
745
+ "en": ["TajMahal", "India", "Agra", "WorldHeritage", "Monument", "Mausoleum"]
746
+ },
747
+ "cultural_info": {
748
+ "built_year": 1653,
749
+ "architect": "Ustad Ahmad Lahauri",
750
+ "architectural_style": "Mughal"
751
+ }
752
+ },
753
+
754
+ "Pyramids of Giza": {
755
+ "name": "Pyramids of Giza",
756
+ "official_name": "أهرامات الجيزة",
757
+ "location": {
758
+ "city": "Giza",
759
+ "country": "Egypt",
760
+ "region": "Greater Cairo",
761
+ "continent": "Africa"
762
+ },
763
+ "visual_cues": {
764
+ "iconic_view": [
765
+ "Three ancient pyramids rising from desert plateau with Great Pyramid as largest structure",
766
+ "Massive limestone pyramids with Great Sphinx in foreground on Giza Plateau",
767
+ "Ancient Egyptian royal tombs with precise geometric forms against desert sky",
768
+ "Monumental pyramids showing weathered limestone blocks and missing outer casing"
769
+ ],
770
+ "architectural_details": [
771
+ "Great Pyramid originally 146 meters with 2.3 million limestone blocks",
772
+ "Precise alignment to cardinal directions with astronomical significance",
773
+ "Internal chambers and passages including King's Chamber and Grand Gallery",
774
+ "Remaining casing stones at apex showing original smooth white limestone covering"
775
+ ],
776
+ "contextual_view": [
777
+ "Pyramids with Great Sphinx in foreground and Cairo urban sprawl in background",
778
+ "Three pyramids aligned with smaller queens pyramids and ancient cemetery",
779
+ "Desert landscape with pyramids and camel riders providing scale",
780
+ "Aerial view showing pyramid complex relationship to Nile River and modern city"
781
+ ],
782
+ "seasonal_lighting": [
783
+ "Sound and light show with colorful illumination on pyramid faces at night",
784
+ "Sunrise with pyramids silhouetted against orange desert sky",
785
+ "Harsh midday sun creating strong shadows and highlighting weathered stone",
786
+ "Golden hour light warming limestone with dramatic shadows emphasizing geometry"
787
+ ]
788
+ },
789
+ "hashtags": {
790
+ "zh": ["金字塔", "埃及", "吉薩", "古埃及", "世界奇蹟", "人面獅身像"],
791
+ "en": ["Pyramids", "Egypt", "Giza", "GreatPyramid", "AncientEgypt", "Sphinx"]
792
+ },
793
+ "cultural_info": {
794
+ "built_year": -2560,
795
+ "architect": "Hemiunu",
796
+ "architectural_style": "Ancient Egyptian"
797
+ }
798
+ },
799
+
800
+ "Machu Picchu": {
801
+ "name": "Machu Picchu",
802
+ "official_name": "Machu Picchu",
803
+ "location": {
804
+ "city": "Cusco Region",
805
+ "country": "Peru",
806
+ "region": "Urubamba Province",
807
+ "continent": "South America"
808
+ },
809
+ "visual_cues": {
810
+ "iconic_view": [
811
+ "Ancient Incan citadel on mountain ridge with terraced structures and Huayna Picchu peak behind",
812
+ "Stone ruins at 2430 meters altitude with dramatic mountain setting and cloud forest",
813
+ "Archaeological site with precisely fitted stone walls temples and agricultural terraces",
814
+ "Lost city with iconic postcard view showing complete site with Wayna Picchu mountain"
815
+ ],
816
+ "architectural_details": [
817
+ "Dry-stone construction with precisely cut granite blocks without mortar",
818
+ "Agricultural terraces with sophisticated drainage systems on steep slopes",
819
+ "Temple of the Sun with curved wall and astronomical alignment features",
820
+ "Intihuatana ritual stone showing Incan astronomical and agricultural knowledge"
821
+ ],
822
+ "contextual_view": [
823
+ "Citadel viewed from Sun Gate after completing Inca Trail with morning light",
824
+ "Site from Huayna Picchu summit showing complete layout and surrounding mountains",
825
+ "Machu Picchu with Urubamba River valley and cloud forest below",
826
+ "Ruins with llamas grazing among ancient structures creating iconic Andean scene"
827
+ ],
828
+ "seasonal_lighting": [
829
+ "Sunrise illuminating ruins with first light as mist clears from valleys",
830
+ "Dramatic clouds surrounding peaks with ruins emerging from mountain fog",
831
+ "Afternoon light creating shadows that emphasize stone wall construction details",
832
+ "Rainy season with lush green terraces and dramatic cloud formations"
833
+ ]
834
+ },
835
+ "hashtags": {
836
+ "zh": ["馬丘比丘", "秘魯", "印加", "世界遺產", "失落之城"],
837
+ "en": ["MachuPicchu", "Peru", "Inca", "WorldHeritage", "LostCity", "Cusco"]
838
+ },
839
+ "cultural_info": {
840
+ "built_year": 1450,
841
+ "architect": "Pachacuti Inca Yupanqui",
842
+ "architectural_style": "Inca"
843
+ }
844
+ },
845
+
846
+ "Petra": {
847
+ "name": "Petra",
848
+ "official_name": "البتراء",
849
+ "location": {
850
+ "city": "Ma'an Governorate",
851
+ "country": "Jordan",
852
+ "region": "Wadi Musa",
853
+ "continent": "Asia"
854
+ },
855
+ "visual_cues": {
856
+ "iconic_view": [
857
+ "Rose-red sandstone Treasury building carved into cliff face with Hellenistic facade",
858
+ "Al-Khazneh temple with elaborate columns and sculptures in pink Nabataean rock",
859
+ "Ancient city carved from rock with dramatic facade revealed through narrow Siq canyon",
860
+ "Monumental rock-cut architecture with classical design in desert landscape"
861
+ ],
862
+ "architectural_details": [
863
+ "Hellenistic facade with Corinthian columns and ornate sculptural decorations",
864
+ "Rock-cut construction showing Nabataean engineering carved directly from sandstone cliff",
865
+ "Rose-red to pink sandstone with natural color variations in rock layers",
866
+ "Urn monument crowning upper level with classical Greek architectural influences"
867
+ ],
868
+ "contextual_view": [
869
+ "Treasury viewed through narrow opening of Siq canyon creating dramatic reveal",
870
+ "Petra archaeological park with multiple rock-cut structures and Roman amphitheater",
871
+ "Site in desert landscape with Bedouin presence and arid mountain scenery",
872
+ "Monastery building requiring climb up ancient steps with panoramic desert views"
873
+ ],
874
+ "seasonal_lighting": [
875
+ "Morning light illuminating Treasury facade with warm glow on rose-red stone",
876
+ "Petra by Night with Treasury lit by candlelight creating magical atmosphere",
877
+ "Harsh midday sun emphasizing color variations and carved details in rock",
878
+ "Late afternoon shadows creating depth and emphasizing architectural relief"
879
+ ]
880
+ },
881
+ "hashtags": {
882
+ "zh": ["佩特拉", "約旦", "玫瑰城", "世界遺產", "世界新七大奇蹟"],
883
+ "en": ["Petra", "Jordan", "Treasury", "AlKhazneh", "WorldHeritage", "RoseCity"]
884
+ },
885
+ "cultural_info": {
886
+ "built_year": -312,
887
+ "architect": "Nabataeans",
888
+ "architectural_style": "Nabataean"
889
+ }
890
+ },
891
+
892
+ "Stonehenge": {
893
+ "name": "Stonehenge",
894
+ "official_name": "Stonehenge",
895
+ "location": {
896
+ "city": "Wiltshire",
897
+ "country": "United Kingdom",
898
+ "region": "Salisbury Plain",
899
+ "continent": "Europe"
900
+ },
901
+ "visual_cues": {
902
+ "iconic_view": [
903
+ "Prehistoric monument with massive standing stones arranged in circular pattern on plain",
904
+ "Ancient stone circle with trilithons and sarsen stones in open landscape",
905
+ "Neolithic structure with distinctive stone archways and circular earthwork setting",
906
+ "Mysterious megalithic monument with bluestones and sarsen stones against sky"
907
+ ],
908
+ "architectural_details": [
909
+ "Sarsen stone trilithons with horizontal lintels connected by mortise and tenon joints",
910
+ "Bluestone arrangement within larger sarsen circle showing different stone types",
911
+ "Heel Stone and Avenue aligned to summer solstice sunrise",
912
+ "Weathered surfaces showing 5000 years of exposure to English weather"
913
+ ],
914
+ "contextual_view": [
915
+ "Stone circle in pastoral English landscape with sheep grazing on Salisbury Plain",
916
+ "Monument from distance showing relationship to surrounding earthworks and barrows",
917
+ "Stonehenge with visitors for scale showing massive size of individual stones",
918
+ "Site from aerial view revealing circular formation and astronomical alignments"
919
+ ],
920
+ "seasonal_lighting": [
921
+ "Summer solstice sunrise with sun aligned through stones as crowds gather",
922
+ "Winter solstice sunset creating dramatic silhouettes of standing stones",
923
+ "Moody overcast conditions with stones against dramatic English sky",
924
+ "Misty morning with stones emerging from fog creating mystical atmosphere"
925
+ ]
926
+ },
927
+ "hashtags": {
928
+ "zh": ["巨石陣", "英國", "史前遺跡", "世界遺產", "威爾特郡"],
929
+ "en": ["Stonehenge", "England", "UK", "Prehistoric", "WorldHeritage", "Wiltshire"]
930
+ },
931
+ "cultural_info": {
932
+ "built_year": -3000,
933
+ "architect": "Unknown (Neolithic peoples)",
934
+ "architectural_style": "Prehistoric"
935
+ }
936
+ }
937
+ }
938
+
939
+ print(f"✓ Landmark Prompts initialized with {len(self.landmarks)} world landmarks")
940
+
941
+ def get_prompts(self, landmark_name: str) -> Optional[Dict]:
942
+ """
943
+ 取得特定地標的完整 prompt 資料
944
+
945
+ Args:
946
+ landmark_name: 地標名稱
947
+
948
+ Returns:
949
+ 地標資料字典,若不存在則返回 None
950
+ """
951
+ return self.landmarks.get(landmark_name)
952
+
953
+ def get_all_landmarks(self) -> Dict:
954
+ """取得所有地標資料"""
955
+ return self.landmarks
956
+
957
+ def search_by_location(self, city: str = None, country: str = None) -> List[str]:
958
+ """
959
+ 根據地理位置搜尋地標
960
+
961
+ Args:
962
+ city: 城市名稱
963
+ country: 國家名稱
964
+
965
+ Returns:
966
+ 符合條件的地標名稱列表
967
+ """
968
+ results = []
969
+ for landmark_name, data in self.landmarks.items():
970
+ location = data.get('location', {})
971
+
972
+ if city and country:
973
+ if location.get('city') == city and location.get('country') == country:
974
+ results.append(landmark_name)
975
+ elif city:
976
+ if location.get('city') == city:
977
+ results.append(landmark_name)
978
+ elif country:
979
+ if location.get('country') == country:
980
+ results.append(landmark_name)
981
+
982
+ return results
983
+
984
+ def get_visual_prompts(self, landmark_name: str, context: str = 'iconic_view') -> List[str]:
985
+ """
986
+ 取得地標的視覺描述 prompts
987
+
988
+ Args:
989
+ landmark_name: 地標名稱
990
+ context: 情境類型 ('iconic_view', 'architectural_details', 'contextual_view', 'seasonal_lighting')
991
+
992
+ Returns:
993
+ 視覺描述列表
994
+ """
995
+ landmark = self.landmarks.get(landmark_name)
996
+ if not landmark:
997
+ return []
998
+
999
+ visual_cues = landmark.get('visual_cues', {})
1000
+ return visual_cues.get(context, [])
1001
+
1002
+ def get_hashtags(self, landmark_name: str, language: str = 'zh') -> List[str]:
1003
+ """
1004
+ 取得地標的 hashtags
1005
+
1006
+ Args:
1007
+ landmark_name: 地標名稱
1008
+ language: 語言 ('zh', 'en', 或 'zh-en')
1009
+
1010
+ Returns:
1011
+ Hashtag 列表
1012
+ """
1013
+ landmark = self.landmarks.get(landmark_name)
1014
+ if not landmark:
1015
+ return []
1016
+
1017
+ hashtags = landmark.get('hashtags', {})
1018
+
1019
+ if language == 'zh':
1020
+ return hashtags.get('zh', [])
1021
+ elif language == 'en':
1022
+ return hashtags.get('en', [])
1023
+ elif language == 'zh-en' or language == 'both':
1024
+ zh_tags = hashtags.get('zh', [])
1025
+ en_tags = hashtags.get('en', [])
1026
+ return zh_tags + en_tags
1027
+ else:
1028
+ return hashtags.get('zh', [])
1029
+
1030
+ print("✓ LandmarkPrompts defined")
lighting_analysis_manager.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from PIL import Image
6
+ from typing import Dict, Tuple
7
+ import torchvision.models as models
8
+ import torchvision.transforms as transforms
9
+
10
class LightingAnalysisManager:
    """Advanced lighting analysis using Places365 scene recognition + CV features.

    Fuses a Places365 ResNet18 scene classifier with hand-crafted OpenCV
    statistics (brightness, color temperature, contrast, shadow ratio,
    gradients) to label a PIL image with a lighting condition and a
    confidence score.
    """

    def __init__(self):
        print("Initializing Lighting Analysis Manager with Places365...")

        # Places365 ResNet18 (falls back to CPU when no GPU is present).
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._load_places365_model()

        # CV feature weights (Places365 gets higher weight).
        # NOTE(review): these weights are informational only — the scoring
        # methods below use their own inline coefficients; confirm intent.
        self.feature_weights = {
            'places365': 0.50,  # Primary weight to Places365
            'brightness': 0.15,
            'color_temp': 0.15,
            'contrast': 0.08,
            'gradient': 0.05,  # Auxiliary features
            'laplacian': 0.04,
            'color_variation': 0.03
        }

        print("✓ Lighting Analysis Manager initialized with Places365 + advanced CV features")

    def _load_places365_model(self):
        """Load Places365 ResNet18 for scene attributes.

        Falls back to ImageNet weights when the Places365 checkpoint cannot
        be downloaded, and to CV-only mode (self.places_model = None) when
        model construction fails entirely.
        """
        try:
            # Use ResNet18 pretrained on Places365
            model = models.resnet18(weights=None)
            model.fc = nn.Linear(model.fc.in_features, 365)

            # Load Places365 weights (if available, otherwise use ImageNet as fallback)
            try:
                checkpoint_url = 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar'
                checkpoint = torch.hub.load_state_dict_from_url(
                    checkpoint_url,
                    map_location=self.device,
                    progress=False
                )
                # Checkpoint was saved from a DataParallel model; strip the
                # 'module.' prefix so keys match a plain ResNet18.
                state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
                model.load_state_dict(state_dict)
                print(" Loaded Places365 ResNet18 weights")
            except Exception:
                # Fix: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit raised during the download.
                print(" Using ImageNet pretrained ResNet18 (fallback)")
                model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

            model = model.to(self.device)
            model.eval()
            self.places_model = model

            # Image preprocessing for Places365
            self.places_transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]
                )
            ])

            # Scene categories related to lighting
            self.lighting_scenes = {
                'sunny': ['street', 'downtown', 'plaza', 'park', 'field'],
                'overcast': ['alley', 'covered_bridge', 'corridor'],
                'indoor': ['lobby', 'office', 'museum', 'restaurant'],
                'evening': ['street', 'downtown', 'plaza'],
                'natural': ['park', 'forest', 'mountain', 'coast']
            }

        except Exception as e:
            print(f" Warning: Places365 loading failed ({e}), using CV-only mode")
            self.places_model = None

    def analyze_lighting(self, image: Image.Image) -> Dict:
        """Comprehensive lighting analysis using Places365 + CV.

        Args:
            image: Input PIL image (RGB assumed).

        Returns:
            Dict with 'lighting_type' (label), 'confidence' (float in
            [0, 1]), plus the intermediate 'cv_features' and 'scene_info'.
        """
        # 1. CV-based physical features (including advanced features)
        cv_features = self._extract_cv_features(image)

        # 2. Places365 scene understanding (if available)
        scene_info = self._analyze_scene_places365(image)

        # 3. Determine lighting condition (adaptive with auxiliary features)
        lighting_condition, confidence = self._determine_lighting_adaptive(
            cv_features, scene_info
        )

        return {
            'lighting_type': lighting_condition,
            'confidence': confidence,
            'cv_features': cv_features,
            'scene_info': scene_info
        }

    def _extract_cv_features(self, image: Image.Image) -> Dict:
        """Extract CV-based features including advanced gradient and color analysis."""
        img_array = np.array(image)
        img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        # Basic Features (Primary)
        # Brightness (LAB L-channel)
        lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
        brightness = float(np.mean(lab[:, :, 0]))

        # Color temperature (R/B ratio; >1 warm, <1 cool)
        b_mean = np.mean(img_bgr[:, :, 0])
        r_mean = np.mean(img_bgr[:, :, 2])
        color_temp = float(r_mean / (b_mean + 1e-6))

        # Contrast (std of grayscale)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        contrast = float(np.std(gray))

        # Shadow ratio: fraction of pixels darker than gray level 80
        _, shadow_mask = cv2.threshold(gray, 80, 255, cv2.THRESH_BINARY_INV)
        shadow_ratio = float(np.sum(shadow_mask > 0) / shadow_mask.size)

        # Advanced Features
        # 1. First derivative: Sobel gradient magnitude (edge strength)
        # Strong gradients suggest directional lighting, weak suggest diffused
        sobelx = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
        sobely = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
        gradient_magnitude = np.sqrt(sobelx**2 + sobely**2)
        gradient_strength = float(np.mean(gradient_magnitude))

        # 2. Second derivative: Laplacian variance (lighting change detection)
        # High variance indicates complex lighting with many transitions
        laplacian = cv2.Laplacian(gray, cv2.CV_64F)
        laplacian_var = float(np.var(laplacian))

        # 3. Color difference in LAB space (color uniformity)
        # Low variation suggests overcast/diffused, high suggests mixed lighting
        a_std = float(np.std(lab[:, :, 1]))  # a* channel (green-red)
        b_std = float(np.std(lab[:, :, 2]))  # b* channel (blue-yellow)
        color_variation = (a_std + b_std) / 2

        return {
            # Primary features
            'brightness': brightness,
            'color_temp': color_temp,
            'contrast': contrast,
            'shadow_ratio': shadow_ratio,
            # Advanced auxiliary features (to assist Places365)
            'gradient_strength': gradient_strength,
            'laplacian_variance': laplacian_var,
            'color_variation': color_variation
        }

    def _analyze_scene_places365(self, image: Image.Image) -> Dict:
        """Analyze scene using Places365; returns 'unknown' on failure."""
        if self.places_model is None:
            return {'scene_category': 'unknown', 'confidence': 0.0}

        try:
            with torch.no_grad():
                img_tensor = self.places_transform(image).unsqueeze(0).to(self.device)
                logits = self.places_model(img_tensor)
                probs = torch.nn.functional.softmax(logits, dim=1)

                # Get top prediction
                top_prob, top_idx = torch.max(probs, 1)

                # Simple scene categories
                # Using index ranges for common outdoor scenes
                is_outdoor = top_idx.item() < 200  # Rough heuristic

                return {
                    'scene_category': 'outdoor' if is_outdoor else 'indoor',
                    'confidence': float(top_prob.item()),
                    'scene_idx': int(top_idx.item())
                }
        except Exception as e:
            print(f" Places365 inference failed: {e}")
            return {'scene_category': 'unknown', 'confidence': 0.0}

    def _detect_indoor_scene(self, cv_features: Dict, scene_info: Dict) -> bool:
        """
        Detect if scene is indoor or outdoor using multiple signals

        Args:
            cv_features: Computer vision features
            scene_info: Places365 scene information

        Returns:
            True if indoor, False if outdoor
        """
        indoor_score = 0.0

        # Signal 1: Places365 scene category (strongest signal)
        if scene_info.get('scene_category') == 'indoor':
            indoor_score += 0.5
        elif scene_info.get('scene_category') == 'outdoor':
            indoor_score -= 0.3

        # Signal 2: Brightness patterns
        # Indoor scenes typically have controlled brightness (not too bright, not too dark)
        brightness = cv_features['brightness']
        if 60 < brightness < 220:  # Relaxed range to cover more indoor scenes
            indoor_score += 0.15
        elif brightness > 230:  # Very bright suggests outdoor
            indoor_score -= 0.2

        # Signal 3: Low gradient suggests controlled/diffused indoor lighting
        gradient = cv_features['gradient_strength']
        if gradient < 20:  # Relaxed threshold so more indoor scenes qualify
            indoor_score += 0.15

        # Signal 4: Low laplacian variance suggests smooth indoor lighting
        laplacian = cv_features['laplacian_variance']
        if laplacian < 400:  # Relaxed threshold to include more indoor scenes
            indoor_score += 0.10

        # Signal 5: Shadow ratio - indoor scenes have less harsh shadows
        shadow_ratio = cv_features['shadow_ratio']
        if shadow_ratio < 0.25:  # Relaxed threshold to include more indoor scenes
            indoor_score += 0.10
        elif shadow_ratio > 0.5:  # Strong shadows suggest outdoor sunlight
            indoor_score -= 0.15

        # Threshold: indoor if score > 0.15 (lowered so indoor is chosen more readily)
        return indoor_score > 0.15

    def _determine_indoor_lighting(self, cv_features: Dict) -> Tuple[str, float]:
        """
        Determine lighting type for indoor scenes

        Returns indoor-specific lighting types with confidence
        """
        brightness = cv_features['brightness']
        color_temp = cv_features['color_temp']
        contrast = cv_features['contrast']
        shadow_ratio = cv_features['shadow_ratio']
        gradient = cv_features['gradient_strength']
        laplacian = cv_features['laplacian_variance']

        # Normalize features to [0, 1]
        brightness_norm = min(brightness / 255.0, 1.0)
        contrast_norm = min(contrast / 100.0, 1.0)
        gradient_norm = min(gradient / 50.0, 1.0)
        laplacian_norm = min(laplacian / 1000.0, 1.0)

        scores = {}

        # Studio/Product Lighting
        # Very controlled, bright, minimal shadows, low gradient
        studio_score = (
            0.35 * (1.0 if brightness_norm > 0.6 else 0.5) +  # Bright
            0.25 * (1.0 - shadow_ratio) +  # Minimal shadows
            0.20 * (1.0 - gradient_norm) +  # Smooth, even
            0.15 * (1.0 - laplacian_norm) +  # Very smooth
            0.05 * (1.0 - abs(color_temp - 1.0))  # Neutral temp
        )
        scores['studio lighting'] = studio_score

        # Indoor Natural Light (window light)
        # Medium-bright, some contrast, neutral to warm temp
        natural_indoor_score = (
            0.30 * (1.0 if 0.5 < brightness_norm < 0.8 else 0.5) +  # Medium-bright
            0.25 * min(contrast_norm, 0.6) +  # Some contrast
            0.20 * (1.0 if color_temp > 0.95 else 0.5) +  # Neutral to warm
            0.15 * min(gradient_norm, 0.5) +  # Some direction
            0.10 * (1.0 if shadow_ratio < 0.3 else 0.5)  # Some shadows
        )
        scores['indoor natural light'] = natural_indoor_score

        # Warm Artificial Lighting
        # Warm color temp, medium brightness, soft
        warm_artificial_score = (
            0.35 * (1.0 if color_temp > 1.1 else 0.3) +  # Warm temp
            0.25 * (1.0 - abs(brightness_norm - 0.5)) +  # Medium brightness
            0.20 * (1.0 - gradient_norm) +  # Soft
            0.15 * (1.0 - shadow_ratio) +  # Minimal shadows
            0.05 * (1.0 - laplacian_norm)  # Smooth
        )
        scores['warm artificial lighting'] = warm_artificial_score

        # Cool Artificial Lighting
        # Cool/neutral temp, medium-bright
        cool_artificial_score = (
            0.35 * (1.0 if color_temp < 1.05 else 0.4) +  # Cool/neutral temp
            0.25 * (1.0 if brightness_norm > 0.5 else 0.5) +  # Medium-bright
            0.20 * (1.0 - gradient_norm) +  # Smooth
            0.15 * (1.0 - shadow_ratio) +  # Minimal shadows
            0.05 * (1.0 - laplacian_norm)  # Even
        )
        scores['cool artificial lighting'] = cool_artificial_score

        # Soft Indoor Lighting
        # Low contrast, diffused, medium brightness
        soft_indoor_score = (
            0.30 * (1.0 - abs(brightness_norm - 0.5)) +  # Medium brightness
            0.30 * (1.0 - contrast_norm) +  # Low contrast
            0.20 * (1.0 - gradient_norm) +  # Very soft
            0.15 * (1.0 - shadow_ratio) +  # Minimal shadows
            0.05 * (1.0 - laplacian_norm)  # Smooth
        )
        scores['soft indoor lighting'] = soft_indoor_score

        # Dramatic Indoor Lighting
        # High contrast, directional, some shadows
        dramatic_score = (
            0.35 * contrast_norm +  # High contrast
            0.25 * gradient_norm +  # Directional
            0.20 * shadow_ratio +  # Shadows present
            0.15 * laplacian_norm +  # Sharp transitions
            0.05 * (1.0 if brightness_norm < 0.6 else 0.5)  # Can be darker
        )
        scores['dramatic indoor lighting'] = dramatic_score

        # Get best match
        best_condition = max(scores.items(), key=lambda x: x[1])

        # Confidence grows with the gap between the top two scores
        sorted_scores = sorted(scores.values(), reverse=True)
        if len(sorted_scores) > 1:
            score_gap = sorted_scores[0] - sorted_scores[1]
            confidence = min(0.7 + score_gap * 0.3, 0.95)
        else:
            confidence = 0.7

        return best_condition[0], confidence

    def _determine_lighting_adaptive(self, cv_features: Dict, scene_info: Dict) -> Tuple[str, float]:
        """Determine lighting using adaptive thresholds with indoor/outdoor detection."""

        # Extract all features
        brightness = cv_features['brightness']
        color_temp = cv_features['color_temp']
        contrast = cv_features['contrast']
        shadow = cv_features['shadow_ratio']
        gradient = cv_features['gradient_strength']
        laplacian = cv_features['laplacian_variance']
        color_var = cv_features['color_variation']

        # Detect indoor vs outdoor first; indoor scenes use the dedicated
        # indoor lighting vocabulary.
        is_indoor = self._detect_indoor_scene(cv_features, scene_info)
        if is_indoor:
            return self._determine_indoor_lighting(cv_features)
        # Otherwise fall through to the outdoor/general scoring below.

        # Normalize features to 0-1 scale
        brightness_norm = min(brightness / 255.0, 1.0)
        contrast_norm = min(contrast / 100.0, 1.0)
        gradient_norm = min(gradient / 50.0, 1.0)  # Typical range 0-50
        laplacian_norm = min(laplacian / 1000.0, 1.0)  # Typical range 0-1000
        color_var_norm = min(color_var / 50.0, 1.0)  # Typical range 0-50

        # Adaptive scoring (Places365 dominant, CV features assist)
        scores = {}

        # Soft diffused light
        # Characteristics: medium brightness, low contrast, neutral temp
        # Auxiliary: low gradient (no strong edges), low laplacian (smooth transitions)
        diffuse_score = (
            0.40 * (1.0 - abs(brightness_norm - 0.5)) +  # Medium brightness
            0.25 * (1.0 - contrast_norm) +  # Low contrast
            0.20 * (1.0 - abs(color_temp - 1.0)) +  # Neutral temp
            0.08 * (1.0 - gradient_norm) +  # Weak edges (diffused)
            0.05 * (1.0 - laplacian_norm) +  # Smooth transitions
            0.02 * (1.0 - color_var_norm)  # Uniform color
        )
        scores['soft diffused light'] = diffuse_score

        # Natural daylight
        # Characteristics: bright, moderate contrast
        # Auxiliary: moderate gradient, moderate color variation
        daylight_score = (
            0.40 * brightness_norm +  # Bright
            0.25 * min(contrast_norm, 0.7) +  # Moderate contrast
            0.20 * (1.0 - abs(color_temp - 1.0)) +  # Neutral temp
            0.08 * min(gradient_norm, 0.6) +  # Moderate edges
            0.05 * min(laplacian_norm, 0.6) +  # Some detail
            0.02 * min(color_var_norm, 0.5)  # Some color variation
        )
        scores['natural daylight'] = daylight_score

        # Overcast atmosphere
        # Characteristics: medium-low brightness, very low contrast, cool temp, minimal shadow
        # Auxiliary: very low gradient (flat), low laplacian, low color variation
        overcast_score = (
            0.35 * (1.0 - abs(brightness_norm - 0.45)) +  # Medium-low brightness
            0.25 * (1.0 - contrast_norm) +  # Very low contrast
            0.15 * (1.0 if color_temp < 1.05 else 0.5) +  # Cool temp
            0.10 * (1.0 - shadow) +  # Minimal shadows
            0.08 * (1.0 - gradient_norm) +  # Flat appearance
            0.05 * (1.0 - laplacian_norm) +  # Smooth lighting
            0.02 * (1.0 - color_var_norm)  # Uniform color
        )
        scores['overcast atmosphere'] = overcast_score

        # Warm ambient light
        # Characteristics: medium brightness, warm temp
        # Auxiliary: moderate gradient, warm color bias
        warm_score = (
            0.40 * (1.0 - abs(brightness_norm - 0.5)) +  # Medium brightness
            0.30 * (1.0 if color_temp > 1.1 else 0.5) +  # Warm temp
            0.15 * min(contrast_norm, 0.6) +  # Moderate contrast
            0.08 * min(gradient_norm, 0.5) +  # Soft edges
            0.05 * min(laplacian_norm, 0.5) +  # Soft transitions
            0.02 * color_var_norm  # Some color variation (warmth)
        )
        scores['warm ambient light'] = warm_score

        # Evening light
        # Characteristics: medium-low brightness, warm temp, medium contrast
        # Auxiliary: moderate gradient (directional), some color variation
        evening_score = (
            0.35 * (1.0 if brightness_norm < 0.6 else 0.5) +  # Lower brightness
            0.30 * (1.0 if color_temp > 1.05 else 0.5) +  # Slightly warm
            0.20 * contrast_norm +  # Some contrast
            0.08 * min(gradient_norm, 0.7) +  # Directional light
            0.05 * laplacian_norm +  # Detail present
            0.02 * color_var_norm  # Color variation
        )
        scores['evening light'] = evening_score

        # Bright sunlight
        # Characteristics: high brightness, high contrast, strong shadows
        # Auxiliary: high gradient (strong edges), high laplacian (sharp transitions)
        sunlight_score = (
            0.40 * (1.0 if brightness_norm > 0.7 else 0.3) +  # High brightness
            0.25 * contrast_norm +  # High contrast
            0.15 * shadow +  # Strong shadows
            0.10 * gradient_norm +  # Strong edges
            0.08 * laplacian_norm +  # Sharp detail
            0.02 * color_var_norm  # Color variation
        )
        scores['bright sunlight'] = sunlight_score

        # Get top scoring condition
        best_condition = max(scores.items(), key=lambda x: x[1])

        # Calculate confidence based on score separation
        sorted_scores = sorted(scores.values(), reverse=True)
        if len(sorted_scores) > 1:
            score_gap = sorted_scores[0] - sorted_scores[1]
            confidence = min(0.7 + score_gap * 0.3, 0.95)
        else:
            confidence = 0.7

        return best_condition[0], confidence
452
+
453
+ print("✓ LightingAnalysisManager (with Places365 + advanced CV features) defined")
ocr_engine_manager.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import easyocr
3
+ import numpy as np
4
+ import cv2
5
+ from PIL import Image
6
+ from typing import List, Dict
7
+ import re
8
+
9
class OCREngineManager:
    """Text extraction using EasyOCR with brand-optimized preprocessing.

    Wraps an easyocr.Reader configured for English + Traditional Chinese and
    offers an optional OpenCV preprocessing path tuned for brand logos
    (metallic/reflective text on products).
    """

    def __init__(self):
        # Instantiating easyocr.Reader downloads/loads detection and
        # recognition models, so this is slow on first run.
        print("Loading EasyOCR (English + Traditional Chinese)...")

        # Try GPU first, fallback to CPU if GPU fails
        try:
            if torch.cuda.is_available():
                print(" Attempting GPU initialization...")
                self.reader = easyocr.Reader(['en', 'ch_tra'], gpu=True)
                print(" ✓ EasyOCR loaded with GPU")
            else:
                print(" CUDA not available, using CPU...")
                self.reader = easyocr.Reader(['en', 'ch_tra'], gpu=False)
                print(" ✓ EasyOCR loaded with CPU")
        except Exception as e:
            # GPU init can fail even when CUDA reports available (driver or
            # memory issues); a CPU reader is always the safe fallback.
            print(f" ⚠️ GPU initialization failed: {e}")
            print(" Falling back to CPU...")
            self.reader = easyocr.Reader(['en', 'ch_tra'], gpu=False)
            print(" ✓ EasyOCR loaded with CPU (fallback)")

        print("✓ EasyOCR loaded")

    def extract_text(self, image: Image.Image, use_brand_preprocessing: bool = False) -> List[Dict]:
        """Extract text from image with optional brand-optimized preprocessing.

        Args:
            image: Source PIL image (or cropped region).
            use_brand_preprocessing: When True, run preprocess_for_brand_ocr()
                first and use more permissive readtext thresholds so small or
                low-contrast logo text is picked up.

        Returns:
            List of dicts with keys 'bbox', 'text' (cleaned, uppercased),
            'confidence', and 'raw_text' (as returned by EasyOCR).
        """
        if use_brand_preprocessing:
            # Apply brand-optimized preprocessing
            processed_image = self.preprocess_for_brand_ocr(image)
            img_array = np.array(processed_image)
        else:
            img_array = np.array(image)

        # Use more aggressive settings for brand detection
        if use_brand_preprocessing:
            results = self.reader.readtext(
                img_array,
                detail=1,
                paragraph=False,
                min_size=10,  # Lower to catch small brand text
                text_threshold=0.5,  # Lower threshold for brand logos
                link_threshold=0.3,
                contrast_ths=0.1,  # Lower to handle metallic/reflective text
                adjust_contrast=0.8  # Enhance contrast for logos
            )
        else:
            results = self.reader.readtext(
                img_array,
                detail=1,
                paragraph=False,
                min_size=20,
                text_threshold=0.7,
                link_threshold=0.4
            )

        structured_results = []
        for bbox, text, confidence in results:
            structured_results.append({
                'bbox': bbox,
                'text': self.clean_and_normalize(text),
                'confidence': confidence,
                'raw_text': text
            })

        return structured_results

    def clean_and_normalize(self, text: str) -> str:
        """Strip punctuation, collapse whitespace, and uppercase.

        CJK characters in the U+4E00..U+9FFF range are preserved so
        Traditional Chinese text survives the cleanup.
        """
        # Keep Traditional Chinese characters
        text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
        text = ' '.join(text.split())
        return text.upper()

    def preprocess_for_brand_ocr(self, image_region: Image.Image) -> Image.Image:
        """
        Preprocess image for brand OCR recognition.
        Optimizes for detecting brand logos and text on products (especially metallic logos).

        Pipeline (order matters): grayscale -> CLAHE -> denoise -> adaptive
        threshold -> morphological close -> sharpen.

        Args:
            image_region: PIL Image (typically a cropped region)

        Returns:
            Preprocessed PIL Image (single-channel binary-ish result)
        """
        # Convert to numpy array
        img_array = np.array(image_region)

        # Convert to grayscale (skip when already single-channel)
        if len(img_array.shape) == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        # Increased clipLimit for metallic logos (2.0 → 3.0)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Denoise (slightly reduced strength to preserve logo edges)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, h=8, templateWindowSize=7, searchWindowSize=21)

        # Adaptive thresholding to handle varying lighting
        # Adjusted blockSize for better logo detection (11 → 15)
        binary = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 15, 2
        )

        # Morphological operations to connect broken characters
        # Slightly larger kernel for logo text (2x2 → 3x3)
        kernel = np.ones((3, 3), np.uint8)
        morph = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

        # Sharpen to enhance edges (increased center weight 9 → 11)
        # NOTE(review): kernel sums to 3, so output is also brightened —
        # appears intentional per the tuning comment; confirm if revisited.
        kernel_sharp = np.array([[-1, -1, -1], [-1, 11, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(morph, -1, kernel_sharp)

        # Convert back to PIL Image
        return Image.fromarray(sharpened)
128
+
129
+ print("✓ OCREngineManager (with brand OCR preprocessing) defined")
openclip_semantic_manager.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import open_clip
4
+ from PIL import Image
5
+ from typing import List, Dict
6
+ import numpy as np
7
+
8
class OpenCLIPSemanticManager:
    """Zero-shot classification and visual feature extraction with enhanced scene understanding.

    Wraps OpenCLIP ViT-H/14: image/text encoding, zero-shot and hierarchical
    (coarse -> fine) classification, and scene analysis over pre-cached
    prompt vocabularies.
    """

    # Softmax temperature applied to cosine similarities (x / 0.01 == x * 100).
    # Previously a magic number repeated in every classification method.
    _TEMPERATURE = 0.01

    def __init__(self):
        print("Loading OpenCLIP ViT-H/14 model...")
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            'ViT-H-14',
            pretrained='laion2b_s32b_b79k'
        )
        self.tokenizer = open_clip.get_tokenizer('ViT-H-14')

        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.eval()

        # Enhanced scene vocabularies (tokenized verbatim, no prompt prefix)
        self.scene_vocabularies = {
            'urban': [
                'city canyon with tall buildings',
                'downtown street with skyscrapers',
                'urban corridor between buildings',
                'busy city intersection',
                'metropolitan avenue'
            ],
            'lighting': [
                'overcast cloudy day',
                'bright sunny day',
                'golden hour warm glow',
                'blue hour twilight',
                'harsh midday sun',
                'soft diffused light',
                'dramatic evening light',
                'moody overcast atmosphere'
            ],
            'mood': [
                'bustling and energetic',
                'calm and contemplative',
                'dramatic and imposing',
                'intimate and cozy',
                'vibrant and lively'
            ]
        }

        # Hierarchical vocabularies: coarse categories, then per-domain labels
        self.coarse_labels = [
            'furniture', 'musical instrument', 'artwork',
            'appliance', 'decoration', 'tool', 'electronic device',
            'clothing', 'accessory', 'food', 'plant'
        ]

        self.domain_vocabularies = {
            'musical instrument': [
                'acoustic guitar', 'electric guitar', 'bass guitar',
                'classical guitar', 'ukulele', 'violin', 'cello',
                'piano', 'keyboard', 'drums', 'saxophone', 'trumpet'
            ],
            'furniture': [
                'chair', 'sofa', 'table', 'desk', 'shelf',
                'cabinet', 'bed', 'stool', 'bench', 'wardrobe'
            ],
            'electronic device': [
                'smartphone', 'laptop', 'tablet', 'camera',
                'headphones', 'speaker', 'monitor', 'keyboard', 'mouse'
            ],
            'clothing': [
                'shirt', 'pants', 'dress', 'jacket', 'coat',
                'sweater', 'skirt', 'jeans', 'hoodie'
            ],
            'accessory': [
                'watch', 'sunglasses', 'hat', 'scarf', 'belt',
                'bag', 'wallet', 'jewelry', 'tie'
            ]
        }

        self.text_features_cache = {}
        self._cache_text_features()

        print("✓ OpenCLIP loaded with enhanced scene understanding")

    def _cache_text_features(self):
        """Pre-compute and cache L2-normalized text features for all vocabularies."""
        with torch.no_grad():
            # Cache coarse labels
            prompts = [f"a photo of {label}" for label in self.coarse_labels]
            text = self.tokenizer(prompts)
            if torch.cuda.is_available():
                text = text.cuda()
            self.text_features_cache['coarse'] = self.model.encode_text(text)
            self.text_features_cache['coarse'] /= self.text_features_cache['coarse'].norm(dim=-1, keepdim=True)

            # Cache domain vocabularies
            for domain, labels in self.domain_vocabularies.items():
                prompts = [f"a photo of {label}" for label in labels]
                text = self.tokenizer(prompts)
                if torch.cuda.is_available():
                    text = text.cuda()
                features = self.model.encode_text(text)
                features /= features.norm(dim=-1, keepdim=True)
                self.text_features_cache[domain] = features

            # Cache scene vocabularies (used verbatim, without a prompt prefix)
            for scene_type, labels in self.scene_vocabularies.items():
                text = self.tokenizer(labels)
                if torch.cuda.is_available():
                    text = text.cuda()
                features = self.model.encode_text(text)
                features /= features.norm(dim=-1, keepdim=True)
                self.text_features_cache[f'scene_{scene_type}'] = features

    def _label_scores(self, image_features: torch.Tensor, text_features: torch.Tensor,
                      labels: List[str]) -> Dict[str, float]:
        """Temperature-scaled softmax over image-text similarities.

        Returns {label: probability}.  Consolidates the pattern that was
        previously duplicated across analyze_scene, classify_zero_shot and
        classify_hierarchical.
        """
        similarity = (image_features @ text_features.T) / self._TEMPERATURE
        probs = similarity.softmax(dim=-1)
        return {label: float(probs[0, i].cpu()) for i, label in enumerate(labels)}

    def analyze_scene(self, image: Image.Image) -> Dict:
        """Comprehensive scene analysis (urban character, lighting, mood).

        Returns:
            {aspect: {'top': label, 'confidence': float, 'all_scores': dict}}
        """
        image_features = self.encode_image(image)

        scene_analysis = {}
        for scene_type in ['urban', 'lighting', 'mood']:
            results = self._label_scores(
                image_features,
                self.text_features_cache[f'scene_{scene_type}'],
                self.scene_vocabularies[scene_type]
            )
            top_label, top_score = max(results.items(), key=lambda x: x[1])
            scene_analysis[scene_type] = {
                'top': top_label,
                'confidence': top_score,
                'all_scores': results
            }

        return scene_analysis

    def encode_image(self, image: Image.Image) -> torch.Tensor:
        """Encode image to an L2-normalized feature vector."""
        with torch.no_grad():
            image_tensor = self.preprocess(image).unsqueeze(0)
            if torch.cuda.is_available():
                image_tensor = image_tensor.cuda()

            image_features = self.model.encode_image(image_tensor)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            return image_features

    def encode_text(self, text_list: List[str]) -> torch.Tensor:
        """Encode labels (wrapped as "a photo of X" prompts) to normalized features."""
        with torch.no_grad():
            prompts = [f"a photo of {text}" for text in text_list]
            text = self.tokenizer(prompts)
            if torch.cuda.is_available():
                text = text.cuda()

            text_features = self.model.encode_text(text)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            return text_features

    def classify_zero_shot(self, image: Image.Image, candidate_labels: List[str]) -> Dict[str, float]:
        """Zero-shot classification over arbitrary candidate labels."""
        image_features = self.encode_image(image)
        text_features = self.encode_text(candidate_labels)
        return self._label_scores(image_features, text_features, candidate_labels)

    def classify_hierarchical(self, image: Image.Image) -> Dict:
        """Two-stage coarse -> fine classification.

        Returns:
            Dict with 'coarse', 'fine', 'top_prediction', 'confidence'.
            'fine' is now always present — an empty dict when the coarse
            category has no fine-grained vocabulary — so callers get a
            consistent shape (previously the key was missing on that path).
        """
        image_features = self.encode_image(image)

        coarse_results = self._label_scores(
            image_features, self.text_features_cache['coarse'], self.coarse_labels
        )
        top_category = max(coarse_results, key=coarse_results.get)

        if top_category in self.domain_vocabularies:
            fine_labels = self.domain_vocabularies[top_category]
            fine_results = self._label_scores(
                image_features, self.text_features_cache[top_category], fine_labels
            )
            top_prediction = max(fine_results, key=fine_results.get)
            return {
                'coarse': top_category,
                'fine': fine_results,
                'top_prediction': top_prediction,
                'confidence': fine_results[top_prediction]
            }

        return {
            'coarse': top_category,
            'fine': {},
            'top_prediction': top_category,
            'confidence': coarse_results[top_category]
        }
215
+
216
+ print("✓ OpenCLIPSemanticManager defined")
output_processing_manager.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import re
3
+ from typing import Dict, List, Tuple, Optional
4
+ from prompt_library_manager import PromptLibraryManager
5
+
6
class OutputProcessingManager:
    """Output validation, formatting, and smart hashtag generation.

    Wraps a PromptLibraryManager so validated captions can be enriched with
    commercial-grade landmark / brand / scene hashtags, and enforces the
    per-platform caption constraints (length limits, no '#' in the caption
    body, 5-10 hashtags).
    """

    def __init__(self, prompt_library: Optional["PromptLibraryManager"] = None):
        """
        Args:
            prompt_library: optional shared PromptLibraryManager instance;
                a new one is created automatically when omitted.
        """
        # Empty by default; add lowercase words to enable content filtering.
        self.profanity_filter = set()

        # Hard caption-length limits (characters) per target platform.
        self.max_lengths = {
            'instagram': 2200,
            'tiktok': 100,
            'xiaohongshu': 500
        }

        # Reuse the caller's prompt library when given to avoid re-loading.
        self.prompt_library = prompt_library if prompt_library is not None else PromptLibraryManager()

        # Keyword map used for lightweight landmark inference.
        self.landmark_keywords = self._init_landmark_keywords()

        print("✓ OutputProcessingManager (with integrated PromptLibraryManager) initialized")

    def _init_landmark_keywords(self) -> Dict[str, List[str]]:
        """Build the landmark -> indicative-keywords map.

        Used to guess a probable landmark from detected object classes and
        scene labels.
        """
        return {
            'Big Ben': ['clock tower', 'tower', 'bridge', 'palace', 'gothic'],
            'Eiffel Tower': ['tower', 'iron', 'landmark', 'lattice'],
            'Statue of Liberty': ['statue', 'monument', 'harbor', 'torch'],
            'Golden Gate Bridge': ['bridge', 'suspension', 'orange', 'bay'],
            'Sydney Opera House': ['opera', 'building', 'harbor', 'shell'],
            'Taj Mahal': ['palace', 'dome', 'monument', 'marble'],
            'Colosseum': ['arena', 'amphitheater', 'ruins', 'ancient'],
            'Pyramids of Giza': ['pyramid', 'desert', 'ancient', 'monument'],
            'Burj Khalifa': ['skyscraper', 'tower', 'building', 'tall'],
            'Tokyo Tower': ['tower', 'lattice', 'red'],
            'Taipei 101': ['skyscraper', 'tower', 'building'],
            # Extend with more landmarks as needed.
        }

    def detect_landmark(self, detections: List[Dict], scene_info: Dict) -> Optional[str]:
        """Guess a probable landmark from detection results.

        Args:
            detections: YOLO detection dicts (reads 'class_name').
            scene_info: scene-analysis dict (reads scene_info['urban']['top']).

        Returns:
            The best-matching landmark name, or None when fewer than two
            keywords match (single-keyword hits are too ambiguous).
        """
        detected_objects = [d.get('class_name', '').lower() for d in detections]

        # Add the top urban scene label as an extra clue, when present.
        scene_keywords = []
        urban_scene = scene_info.get('urban', {}).get('top', '')
        if urban_scene:
            scene_keywords.append(urban_scene.lower())

        all_keywords = detected_objects + scene_keywords

        # Score each landmark by how many detected items match its keywords.
        scores = {}
        for landmark, keywords in self.landmark_keywords.items():
            match_count = sum(1 for obj in all_keywords
                              if any(kw in obj for kw in keywords))
            if match_count > 0:
                scores[landmark] = match_count

        # Require at least 2 matches before committing to a landmark.
        if scores:
            best_landmark = max(scores.items(), key=lambda x: x[1])
            if best_landmark[1] >= 2:
                return best_landmark[0]

        return None

    def generate_smart_hashtags(self, detections: List[Dict], scene_info: Dict,
                                brands: List, platform: str, language: str) -> List[str]:
        """Generate hashtags combining brand, landmark, and scene signals.

        Args:
            detections: detected object dicts.
            scene_info: scene-analysis result.
            brands: detected brands, as names or (name, confidence) tuples.
            platform: target platform name.
            language: 'zh', 'en', or 'zh-en'.

        Returns:
            Up to 10 hashtags, de-duplicated, ordered
            landmark > brand > scene > composition > platform.
        """
        hashtags = []

        # 1. Landmark tags (highest priority).
        detected_landmark = self.detect_landmark(detections, scene_info)
        if detected_landmark:
            landmark_tags = self.prompt_library.landmark_prompts.get_hashtags(
                detected_landmark, language
            )
            hashtags.extend(landmark_tags[:5])  # cap landmark tags at 5

        # 2. Brand tags (high priority).
        if brands:
            for brand in brands[:3]:  # at most 3 brands
                brand_name = brand[0] if isinstance(brand, tuple) else brand
                brand_tags = self.prompt_library.brand_prompts.get_hashtags(
                    brand_name, language
                )
                hashtags.extend(brand_tags[:3])  # at most 3 tags per brand

        # 3. Scene tags (medium priority).
        scene_category = self._detect_scene_category(scene_info, detections)
        if scene_category:
            scene_tags = self.prompt_library.scene_prompts.get_hashtags(
                scene_category, language
            )
            hashtags.extend(scene_tags[:4])

        # 4. Composition-specific tags.
        hashtags.extend(self._get_composition_hashtags(scene_info, language))

        # 5. Platform-specific tags.
        hashtags.extend(self._get_platform_hashtags(platform, language))

        # De-duplicate while preserving priority order; drop empty tags.
        seen = set()
        unique_hashtags = []
        for tag in hashtags:
            if tag and tag not in seen:
                seen.add(tag)
                unique_hashtags.append(tag)

        return unique_hashtags[:10]

    def _detect_scene_category(self, scene_info: Dict, detections: List[Dict]) -> Optional[str]:
        """Classify the scene as 'food', 'nature', 'urban', or 'indoor'.

        Object classes are checked first; the urban scene label is a fallback;
        'urban' is the final default.
        """
        object_classes = [d.get('class_name', '').lower() for d in detections]

        # Food scene.
        food_keywords = ['sandwich', 'pizza', 'cake', 'food', 'plate', 'bowl', 'cup', 'bottle']
        if any(kw in obj for kw in food_keywords for obj in object_classes):
            return 'food'

        # Nature scene.
        nature_keywords = ['tree', 'mountain', 'water', 'sky', 'beach', 'ocean']
        if any(kw in obj for kw in nature_keywords for obj in object_classes):
            return 'nature'

        # Urban scene, from the scene classifier's top label.
        urban_scene = scene_info.get('urban', {}).get('top', '')
        if urban_scene and ('canyon' in urban_scene or 'street' in urban_scene or 'building' in urban_scene):
            return 'urban'

        # Indoor scene.
        indoor_keywords = ['chair', 'table', 'couch', 'bed', 'desk']
        if any(kw in obj for kw in indoor_keywords for obj in object_classes):
            return 'indoor'

        return 'urban'  # default to urban

    def _get_composition_hashtags(self, scene_info: Dict, language: str) -> List[str]:
        """Generate tags for the detected composition type plus a generic
        photography tag, in the requested language (or both when bilingual)."""
        hashtags = []

        composition = scene_info.get('urban', {}).get('top', '')

        # Urban canyon composition.
        if 'canyon' in composition or 'skyscraper' in composition:
            if language == 'zh':
                hashtags.extend(['城市峽谷', '城市風景'])
            elif language == 'en':
                hashtags.extend(['UrbanCanyon', 'Cityscape'])
            else:  # bilingual
                hashtags.extend(['城市峽谷', 'UrbanCanyon'])

        # Generic photography tag.
        if language == 'zh':
            hashtags.append('攝影日常')
        elif language == 'en':
            hashtags.append('Photography')
        else:
            hashtags.extend(['攝影日常', 'Photography'])

        return hashtags

    def _get_platform_hashtags(self, platform: str, language: str) -> List[str]:
        """Generate platform-specific tags in the requested language."""
        hashtags = []

        if platform == 'instagram':
            if language == 'zh':
                hashtags.append('IG日常')
            elif language == 'en':
                hashtags.append('InstaDaily')
            else:
                hashtags.extend(['IG日常', 'InstaDaily'])

        elif platform == 'tiktok':
            if language == 'zh':
                hashtags.append('抖音')
            elif language == 'en':
                hashtags.append('TikTok')
            else:
                hashtags.extend(['抖音', 'TikTok'])

        elif platform == 'xiaohongshu':
            hashtags.extend(['小紅書', '分享日常'])

        return hashtags

    def validate_output(self, output: Dict, platform: str,
                        detections: List[Dict] = None, scene_info: Dict = None,
                        brands: List = None, language: str = 'en') -> Tuple[bool, str]:
        """Validate and normalise one generated caption dict (mutates it).

        Checks structure, truncates over-length captions, filters profanity,
        cleans hashtags, tops hashtags up to at least 5 when detection context
        is supplied, and strips '#' tokens out of the caption body.

        Args:
            output: caption dict ('caption', 'hashtags', 'tone', 'platform').
            platform: platform name (selects the length limit).
            detections: detection results (enables hashtag top-up).
            scene_info: scene analysis (enables hashtag top-up).
            brands: detected brands (enables hashtag top-up).
            language: hashtag language.

        Returns:
            (passed, message) tuple.
        """
        # 1. Structure check.
        required_fields = ['caption', 'hashtags', 'tone', 'platform']
        if not all(field in output for field in required_fields):
            return False, "Missing required fields"

        # 2. Length check - truncate rather than reject.
        max_length = self.max_lengths.get(platform, 2200)
        if len(output['caption']) > max_length:
            output['caption'] = output['caption'][:max_length-3] + '...'

        # 3. Content filter.
        if self._contains_profanity(output['caption']):
            return False, "Contains inappropriate content"

        # 4. Hashtag cleanup.
        output['hashtags'] = self._validate_hashtags(output['hashtags'])

        # 5. Hashtag count check with automatic top-up (commercial-grade).
        min_hashtags = 5  # minimum required hashtag count
        if len(output['hashtags']) < min_hashtags:
            # Only possible when detection context was supplied.
            if detections is not None and scene_info is not None:
                before = len(output['hashtags'])
                additional_tags = self.generate_smart_hashtags(
                    detections, scene_info, brands or [], platform, language
                )
                # Top up without duplicating, capped at 10.
                for tag in additional_tags:
                    if tag not in output['hashtags'] and len(output['hashtags']) < 10:
                        output['hashtags'].append(tag)

                # BUGFIX: the first placeholder previously printed the
                # post-top-up count instead of the pre-top-up count.
                print(f" [AUTO-補充] 標籤數量不足 ({before} < {min_hashtags}),已自動補充至 {len(output['hashtags'])} 個")

        # 6. The caption body must not contain hashtag tokens.
        if '#' in output['caption']:
            output['caption'] = re.sub(r'#\w+', '', output['caption']).strip()

        return True, "Validation passed"

    def _contains_profanity(self, text: str) -> bool:
        """Return True when *text* contains a filtered word (case-insensitive)."""
        text_lower = text.lower()
        return any(word in text_lower for word in self.profanity_filter)

    def _validate_hashtags(self, hashtags: List[str]) -> List[str]:
        """Validate and clean hashtags.

        Strips leading '#', removes characters outside word chars / CJK,
        drops empties and duplicates.

        Args:
            hashtags: raw hashtag list.

        Returns:
            Cleaned hashtag list (at most 10).
        """
        cleaned = []
        for tag in hashtags:
            # Strip the '#' prefix, if present.
            tag = tag.lstrip('#')

            # Keep word characters and CJK ideographs only.
            tag = re.sub(r'[^\w\u4e00-\u9fff]', '', tag)

            # Keep non-empty, first-seen tags only.
            if tag and tag not in cleaned:
                cleaned.append(tag)

        return cleaned[:10]  # at most 10

    def format_for_platform(self, caption: Dict, platform: str) -> str:
        """Render a caption dict as platform-ready text.

        Xiaohongshu keeps hashtags right after the caption; Instagram/TikTok
        put them on a separate line.

        Args:
            caption: caption dict ('caption', 'hashtags').
            platform: platform name.

        Returns:
            Formatted string ready for posting.
        """
        formatted = f"{caption['caption']}\n\n"

        if platform == 'xiaohongshu':
            formatted += ' '.join([f"#{tag}" for tag in caption['hashtags']])
        else:
            formatted += '\n' + ' '.join([f"#{tag}" for tag in caption['hashtags']])

        return formatted

print("✓ OutputProcessingManager (V3 with PromptLibraryManager integration) defined")
pixcribe_pipeline.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import time
3
+ import traceback
4
+ from PIL import Image
5
+ from typing import Dict
6
+
7
+ from image_processor_manager import ImageProcessorManager
8
+ from yolo_detection_manager import YOLODetectionManager
9
+ from saliency_detection_manager import SaliencyDetectionManager
10
+ from openclip_semantic_manager import OpenCLIPSemanticManager
11
+ from lighting_analysis_manager import LightingAnalysisManager
12
+ from ocr_engine_manager import OCREngineManager
13
+ from prompt_library_manager import PromptLibraryManager
14
+ from brand_recognition_manager import BrandRecognitionManager
15
+ from brand_visualization_manager import BrandVisualizationManager
16
+ from brand_verification_manager import BrandVerificationManager
17
+ from scene_compatibility_manager import SceneCompatibilityManager
18
+ from caption_generation_manager import CaptionGenerationManager
19
+ from detection_fusion_manager import DetectionFusionManager
20
+ from output_processing_manager import OutputProcessingManager
21
+
22
class PixcribePipeline:
    """Main Facade coordinating all components (V2 with multi-language support).

    Owns every detection/analysis manager and runs the 13-step end-to-end
    flow in process_image().
    """

    def __init__(self, yolo_variant='l', vlm_model_name='Qwen/Qwen2.5-VL-7B-Instruct'):
        """
        Args:
            yolo_variant: 'm', 'l' (default), or 'x'
            vlm_model_name: Vision-Language Model name (default: Qwen2.5-VL-7B-Instruct)
                Can be changed to 'Qwen/Qwen3-VL-8B-Instruct' for latest model
        """
        print("="*60)
        print("Initializing Pixcribe Pipeline V2...")
        print("="*60)

        start_time = time.time()

        # Perception and analysis managers.
        self.image_processor = ImageProcessorManager()
        self.yolo_detector = YOLODetectionManager(variant=yolo_variant)
        self.saliency_detector = SaliencyDetectionManager()
        self.clip_semantic = OpenCLIPSemanticManager()
        self.lighting_analyzer = LightingAnalysisManager()
        self.ocr_engine = OCREngineManager()

        # Centralized prompt management, shared by several managers below.
        self.prompt_library = PromptLibraryManager()

        self.brand_recognizer = BrandRecognitionManager(
            self.clip_semantic, self.ocr_engine, self.prompt_library
        )

        # Draws brand bounding boxes for the UI.
        self.brand_visualizer = BrandVisualizationManager()

        self.caption_generator = CaptionGenerationManager(model_name=vlm_model_name)

        # VLM-based brand verification reuses the caption generator's model.
        self.brand_verifier = BrandVerificationManager(self.caption_generator)

        # Filters out brands implausible for the detected scene.
        self.scene_compatibility = SceneCompatibilityManager(self.prompt_library)

        self.fusion_manager = DetectionFusionManager(self.clip_semantic)

        # Shares the prompt library for smart hashtag generation.
        self.output_processor = OutputProcessingManager(self.prompt_library)

        elapsed = time.time() - start_time
        print("="*60)
        print(f"✓ Pipeline initialized successfully (Time: {elapsed:.2f}s)")
        print("="*60)

    def _build_brand_entries(self, brand_triples):
        """Convert (name, confidence, bbox) triples into visualization dicts.

        Looks up each brand's category in the prompt library, falling back to
        'default' for unknown brands. Centralizes logic that was previously
        duplicated after every stage that rewrites the brand list (initial
        detection, compatibility check, VLM voting).
        """
        entries = []
        for brand_name, confidence, bbox in brand_triples:
            brand_info = self.prompt_library.get_brand_prompts(brand_name)
            category = brand_info.get('category', 'default') if brand_info else 'default'
            entries.append({
                'name': brand_name,
                'confidence': confidence,
                'bbox': bbox,
                'category': category
            })
        return entries

    def process_image(self, image, platform='instagram', yolo_variant='l', language='zh') -> Dict:
        """End-to-end image processing pipeline.

        Args:
            image: PIL Image or path
            platform: 'instagram', 'tiktok', or 'xiaohongshu'
            yolo_variant: 'm', 'l' (default), or 'x'
                NOTE(review): currently informational only — the detector
                variant is fixed at construction time.
            language: 'zh' (Traditional Chinese), 'en' (English), 'zh-en' (Bilingual)

        Returns:
            Processing results dictionary with brand visualizations.

        Raises:
            Exception: re-raised after logging so the caller can surface it.
        """
        print(f"\nProcessing image (Platform: {platform}, Language: {language})...")
        start_time = time.time()

        try:
            # Step 1: Preprocessing
            print("[1/13] Preprocessing image...")
            processed_img = self.image_processor.load_image(image)
            yolo_input = self.image_processor.preprocess_for_yolo(processed_img)

            # Step 2: Object detection
            print("[2/13] YOLO object detection...")
            yolo_results = self.yolo_detector.detect(yolo_input)
            print(f"  Detected {len(yolo_results)} objects")

            # Step 3: Saliency detection
            print("[3/13] Saliency detection...")
            salient_regions = self.saliency_detector.detect_salient_regions(processed_img)
            print(f"  Found {len(salient_regions)} salient regions")

            # Step 4: salient regions not covered by YOLO detections
            print("[4/13] Identifying unknown objects...")
            unknown_regions = self.saliency_detector.extract_unknown_regions(
                salient_regions, yolo_results
            )
            print(f"  Found {len(unknown_regions)} unknown regions")

            # Step 5: Brand recognition (with bounding boxes)
            print("[5/13] Brand recognition...")
            brands = []
            brand_detections = []  # For visualization

            # Method 1: check YOLO-detected brand-relevant objects.
            brand_relevant = self.yolo_detector.filter_brand_relevant_objects(yolo_results)
            if brand_relevant:
                print(f"  Checking {len(brand_relevant)} YOLO brand-relevant objects...")
                for det in brand_relevant[:5]:  # check top 5 brand-relevant objects
                    region = processed_img.crop(det['bbox'])
                    brand_result = self.brand_recognizer.recognize_brand(
                        region, processed_img, region_bbox=det['bbox']
                    )

                    if brand_result:
                        top_triples = brand_result[:2]  # top 2 brands per region
                        brands.extend((name, conf) for name, conf, _ in top_triples)
                        brand_detections.extend(self._build_brand_entries(top_triples))

            # Method 2: full-image brand scan — always runs, regardless of
            # whether YOLO found brand-relevant objects (commercial need).
            print("  Performing intelligent full-image brand scan...")
            full_image_brands = self.brand_recognizer.scan_full_image_for_brands(
                processed_img,
                exclude_bboxes=[bd['bbox'] for bd in brand_detections if bd.get('bbox')],
                saliency_regions=salient_regions  # guides scan-region selection
            )

            if full_image_brands:
                print(f"  Full-image scan found {len(full_image_brands)} additional brands")
                # Keep only brands not already detected (dedupe by name).
                new_triples = [(name, conf, bbox) for name, conf, bbox in full_image_brands
                               if not any(bd['name'] == name for bd in brand_detections)]
                brands.extend((name, conf) for name, conf, _ in new_triples)
                brand_detections.extend(self._build_brand_entries(new_triples))

            print(f"  Identified {len(brands)} brand instances (before verification)")

            # Step 6: CLIP scene understanding (needed by the compatibility check)
            print("[6/13] Scene understanding (CLIP)...")
            scene_analysis = self.clip_semantic.analyze_scene(processed_img)
            print(f"  Scene: {scene_analysis.get('urban', {}).get('top', 'unknown')}")

            # Step 7: scene compatibility filter
            if brands:
                print("[7/13] Checking scene compatibility...")
                brands_with_bbox = [(b[0], b[1], brand_detections[i]['bbox'])
                                    for i, b in enumerate(brands)]
                compatible_brands = self.scene_compatibility.batch_check_compatibility(
                    brands_with_bbox, scene_analysis
                )
                print(f"  {len(compatible_brands)} brands passed compatibility check")

                # Keep only brands that survived the compatibility check.
                if compatible_brands:
                    brands = [(b[0], b[1]) for b in compatible_brands]
                    brand_detections = self._build_brand_entries(compatible_brands)
                else:
                    brands = []
                    brand_detections = []

            # Step 8: VLM brand verification + three-way voting
            if brand_detections:
                print("[8/13] VLM brand verification...")
                vlm_verification = self.brand_verifier.verify_brands(
                    processed_img, [(bd['name'], bd['confidence'], bd['bbox'])
                                    for bd in brand_detections]
                )
                print(f"  VLM verified {len(vlm_verification.get('verified_brands', []))} brands")

                # Three-way voting: OpenCLIP + OCR + VLM.
                ocr_brands = {}
                for brand_name, conf in brands:
                    if brand_name not in ocr_brands:
                        ocr_brands[brand_name] = (0.5, conf)  # approximate text/ocr split

                final_brands = self.brand_verifier.three_way_voting(
                    [(bd['name'], bd['confidence'], bd['bbox']) for bd in brand_detections],
                    ocr_brands,
                    vlm_verification
                )
                print(f"  Final verified brands: {len(final_brands)}")

                # Keep only the voting survivors.
                if final_brands:
                    brands = [(b[0], b[1]) for b in final_brands]
                    brand_detections = self._build_brand_entries(final_brands)
                else:
                    brands = []
                    brand_detections = []

            # Draw brand boxes on a copy for the UI; keep the original intact.
            if brand_detections:
                visualized_image = self.brand_visualizer.draw_brand_detections(
                    processed_img.copy(), brand_detections
                )
            else:
                visualized_image = processed_img

            # Step 9: CV-based lighting analysis
            print("[9/13] Analyzing lighting conditions...")
            cv_lighting = self.lighting_analyzer.analyze_lighting(processed_img)
            print(f"  CV Lighting: {cv_lighting['lighting_type']} (confidence: {cv_lighting['confidence']:.2f})")
            print(f"  Details: brightness={cv_lighting['cv_features']['brightness']:.1f}, "
                  f"temp_ratio={cv_lighting['cv_features']['color_temp']:.2f}, "
                  f"contrast={cv_lighting['cv_features']['contrast']:.1f}")

            # Step 10: additional scene analysis details (log only)
            print("[10/13] Additional scene analysis...")
            print(f"  CLIP Lighting: {scene_analysis.get('lighting', {}).get('top', 'unknown')}")
            print(f"  Mood: {scene_analysis.get('mood', {}).get('top', 'unknown')}")

            # Step 11: fuse detections with lighting analysis
            print("[11/13] Fusing detection results...")
            fused_results = self.fusion_manager.fuse_detections(
                yolo_results, unknown_regions, scene_analysis, processed_img, cv_lighting
            )
            fused_results['brands'] = brands
            fused_results['scene_analysis'] = scene_analysis

            fused_lighting = fused_results['scene_analysis']['lighting']['top']
            print(f"  Fused Lighting: {fused_lighting}")

            # Step 12: caption generation with language support
            print("[12/13] Generating captions...")
            captions = self.caption_generator.generate_captions(
                fused_results, processed_img, platform, language
            )

            # Step 13: output processing with smart hashtags
            print("[13/13] Output processing...")
            validated_captions = []
            for caption in captions:
                # Only generate hashtags when the VLM produced too few —
                # never override VLM hashtags (they follow language rules).
                if not caption.get('hashtags') or len(caption.get('hashtags', [])) < 3:
                    print(f"  [DEBUG] Caption has {len(caption.get('hashtags', []))} hashtags, generating smart hashtags...")
                    caption['hashtags'] = self.output_processor.generate_smart_hashtags(
                        fused_results['detections'],
                        scene_analysis,
                        brands,
                        platform,
                        language
                    )
                else:
                    print(f"  [DEBUG] Caption has {len(caption['hashtags'])} VLM-generated hashtags")

                # Pass full context so validate_output can auto-top-up hashtags.
                is_valid, msg = self.output_processor.validate_output(
                    caption, platform,
                    detections=fused_results['detections'],
                    scene_info=scene_analysis,
                    brands=brands,
                    language=language
                )
                if is_valid:
                    validated_captions.append(caption)
                else:
                    print(f"  [DEBUG] Caption validation failed: {msg}")

            elapsed = time.time() - start_time
            print(f"\n✓ Processing complete (Total time: {elapsed:.2f}s)")
            print(f"  Generated {len(validated_captions)} caption variations")

            return {
                'captions': validated_captions,
                'detections': fused_results['detections'],
                'brands': brands,
                'brand_detections': brand_detections,  # For UI display
                'visualized_image': visualized_image,  # Image with brand boxes
                'scene': scene_analysis,
                'composition': fused_results.get('composition', {}),
                'lighting': cv_lighting,
                'processing_time': elapsed
            }

        except Exception as e:
            print(f"\n✗ Processing error: {str(e)}")
            traceback.print_exc()
            # Re-raise so the caller (app layer) can surface the error.
            raise

print("✓ PixcribePipeline (V2 with VLM Verification, Scene Compatibility, and Adaptive Weights) defined")
prompt_library_manager.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List, Optional
3
+ from landmark_prompts import LandmarkPrompts
4
+ from brand_prompts import BrandPrompts
5
+ from scene_prompts import ScenePrompts
6
+ from universal_object_prompts import UniversalObjectPrompts
7
+
8
class PromptLibraryManager:
    """Facade over all prompt sub-modules.

    Exposes a single interface for brand, landmark, scene, and universal
    object prompts, plus combined hashtag generation and location search.
    """

    def __init__(self):
        """Load every prompt sub-module and report library statistics."""
        print("Initializing Prompt Library Manager (Facade)...")

        # Load all sub-modules.
        self.brand_prompts = BrandPrompts()
        self.landmark_prompts = LandmarkPrompts()
        self.scene_prompts = ScenePrompts()
        self.object_prompts = UniversalObjectPrompts()

        # Summary statistics for the startup log.
        total_brands = self._count_brands()
        total_landmarks = len(self.landmark_prompts.landmarks)
        total_scenes = len(self.scene_prompts.scene_vocabularies)
        total_objects = len(self.object_prompts.object_vocabularies)

        print(f"✓ Prompt Library Manager initialized:")
        print(f"  - {total_brands} brands across {len(self.brand_prompts.brand_prompts)} categories")
        print(f"  - {total_landmarks} world landmarks")
        print(f"  - {total_scenes} scene categories")
        print(f"  - {total_objects} universal object categories")

    def _count_brands(self) -> int:
        """Total number of brands across every category."""
        return sum(len(category) for category in self.brand_prompts.brand_prompts.values())

    # ===== Brand methods =====

    def get_brand_prompts(self, brand_name: str) -> Optional[Dict]:
        """Return the full prompt record for *brand_name*."""
        return self.brand_prompts.get_prompts(brand_name)

    def get_brand_category(self, brand_name: str) -> str:
        """Return the category a brand belongs to."""
        return self.brand_prompts.get_brand_category(brand_name)

    def get_all_brands(self) -> Dict:
        """Return every brand as one flattened dict."""
        return self.brand_prompts.get_all_brands()

    def get_brands_by_category(self, category: str) -> Dict:
        """Return all brands within a single category."""
        return self.brand_prompts.get_brands_by_category(category)

    def search_brand_by_alias(self, alias: str) -> Optional[str]:
        """Resolve a brand alias to its canonical brand name."""
        return self.brand_prompts.search_brand_by_alias(alias)

    # ===== Landmark methods =====

    def get_landmark_prompts(self, landmark_name: str) -> Optional[Dict]:
        """Return the full prompt record for *landmark_name*."""
        return self.landmark_prompts.get_prompts(landmark_name)

    def get_all_landmarks(self) -> Dict:
        """Return every landmark record."""
        return self.landmark_prompts.get_all_landmarks()

    def search_landmark_by_location(self, city: str = None, country: str = None) -> List[str]:
        """Return names of landmarks matching a city and/or country."""
        return self.landmark_prompts.search_by_location(city, country)

    def get_landmark_visual_prompts(self, landmark_name: str, context: str = 'iconic_view') -> List[str]:
        """Return visual-description prompts for a landmark in a given context."""
        return self.landmark_prompts.get_visual_prompts(landmark_name, context)

    # ===== Scene methods =====

    def get_scene_prompts(self, scene_category: str, subcategory: str = None) -> List[str]:
        """Return scene prompts, optionally narrowed to a subcategory."""
        return self.scene_prompts.get_prompts(scene_category, subcategory)

    def get_all_scene_categories(self) -> List[str]:
        """Return every scene category name."""
        return self.scene_prompts.get_all_categories()

    def get_scene_subcategories(self, scene_category: str) -> List[str]:
        """Return the subcategories of a scene category."""
        return self.scene_prompts.get_subcategories(scene_category)

    # ===== Universal object methods =====

    def get_object_prompts(self, category: str, subcategory: str = None) -> List[str]:
        """Return universal-object prompts (e.g. 'animals' / 'dogs')."""
        return self.object_prompts.get_prompts(category, subcategory)

    def get_all_object_categories(self) -> List[str]:
        """Return every universal-object category name."""
        return self.object_prompts.get_all_categories()

    def get_object_subcategories(self, category: str) -> List[str]:
        """Return the subcategories of an object category."""
        return self.object_prompts.get_subcategories(category)

    def detect_object_category(self, detected_objects: List[str]) -> Optional[str]:
        """Infer the dominant object category from detected object names."""
        return self.object_prompts.detect_object_category(detected_objects)

    # ===== Smart hashtag generation =====

    def get_hashtags_for_content(self, detected_items: Dict, language: str = 'zh') -> List[str]:
        """Combine landmark, brand, and scene hashtags for detected content.

        Args:
            detected_items: detection summary dict, e.g.
                {
                    'landmarks': ['Big Ben', ...],
                    'brands': ['Apple', ...],
                    'scene_category': 'urban',
                    'scene_subcategory': 'city_canyon'
                }
            language: 'zh', 'en', or 'zh-en'.

        Returns:
            Up to 10 hashtags, de-duplicated, ordered landmark > brand > scene.
        """
        collected = []

        # 1. Landmark tags (highest priority).
        for landmark in detected_items.get('landmarks', []):
            collected += self.landmark_prompts.get_hashtags(landmark, language)

        # 2. Brand tags (high priority).
        for brand in detected_items.get('brands', []):
            collected += self.brand_prompts.get_hashtags(brand, language)

        # 3. Scene tags (medium priority).
        scene_category = detected_items.get('scene_category')
        if scene_category:
            collected += self.scene_prompts.get_hashtags(scene_category, language)

        # dict.fromkeys keeps first occurrence, preserving priority order.
        return list(dict.fromkeys(collected))[:10]

    # ===== Search functions =====

    def search_by_location(self, city: str = None, country: str = None) -> Dict:
        """Search all location-bound content for a city and/or country.

        Brands are not indexed by location yet, so the 'brands' list is
        always empty (kept for future extension).
        """
        return {
            'landmarks': self.landmark_prompts.search_by_location(city, country),
            'brands': []
        }

    def detect_landmark_from_image_context(self, detected_objects: List[str],
                                           scene_analysis: Dict) -> Optional[str]:
        """Guess a landmark from detected objects via simple keyword matching.

        Args:
            detected_objects: detected object names.
            scene_analysis: scene-analysis dict (currently unused; kept for
                interface stability).

        Returns:
            A landmark name when at least two of its keywords match, else None.
        """
        # Keyword map from landmark to indicative object terms.
        landmark_keywords = {
            'Big Ben': ['clock tower', 'tower', 'bridge', 'river'],
            'Eiffel Tower': ['tower', 'iron structure', 'landmark'],
            'Statue of Liberty': ['statue', 'monument', 'island', 'harbor'],
            'Sydney Opera House': ['building', 'harbor', 'architecture'],
            'Taj Mahal': ['building', 'monument', 'dome'],
            'Pyramids of Giza': ['pyramid', 'desert', 'monument'],
            # Extend with more landmarks as needed.
        }

        for landmark, keywords in landmark_keywords.items():
            hits = sum(any(kw in obj.lower() for kw in keywords)
                       for obj in detected_objects)
            if hits >= 2:  # demand two keyword hits to avoid false positives
                return landmark

        return None

print("✓ PromptLibraryManager (Facade) defined")
saliency_detection_manager.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ import cv2
6
+ from typing import List, Dict
7
+ import torchvision.transforms as transforms
8
+
9
class SaliencyDetectionManager:
    """Visual saliency detection via an Otsu-threshold contour heuristic.

    NOTE(review): despite the original "U2-Net" labels, the deep model loaded
    here is torchvision's DeepLabV3-ResNet50, and it (together with
    `self.transform`) is currently NOT used by detect_salient_regions(),
    which relies on a pure-OpenCV pipeline. Kept loaded for a future
    deep-saliency path — confirm before removing.
    """

    def __init__(self):
        # The deep backbone is optional: loading failures fall back to the
        # OpenCV-only path, so this never aborts initialization.
        # Fix: the old message claimed "U2-Net" which is not what is loaded.
        print("Loading saliency backbone (DeepLabV3-ResNet50)...")
        try:
            from torchvision.models.segmentation import deeplabv3_resnet50
            self.model = deeplabv3_resnet50(pretrained=True)
            self.model.eval()
            if torch.cuda.is_available():
                self.model = self.model.cuda()
        except Exception as e:
            print(f"Warning: Cannot load deep learning model, using fallback: {e}")
            self.model = None

        # Tunables for the contour-proposal stage.
        self.threshold = 0.5      # reserved; not used by the current heuristic
        self.min_area = 1600      # discard contours smaller than ~40x40 px
        self.min_saliency = 0.6   # reserved; not used by the current heuristic

        # Preprocessing for the (currently unused) deep backbone.
        self.transform = transforms.Compose([
            transforms.Resize((320, 320)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        print("✓ SaliencyDetectionManager initialized")

    def detect_salient_regions(self, image: "Image.Image") -> List[Dict]:
        """Propose up to 10 salient regions using Otsu threshold + contours.

        Args:
            image: input PIL image (any mode; normalized to RGB internally).

        Returns:
            List of dicts with 'bbox' [x1, y1, x2, y2] (floats), 'area'
            (contour area in px), 'saliency_score' (area fraction of the
            image, capped at 1.0) and 'image' (cropped PIL region),
            sorted by saliency_score descending.
        """
        # Fix: np.array(image) made cvtColor crash on grayscale/RGBA inputs;
        # normalize to 3-channel RGB before the color-space conversion.
        img_array = np.array(image.convert('RGB'))
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

        # Otsu picks the binarization threshold automatically (0 is ignored).
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        regions = []
        height, width = img_array.shape[:2]

        for contour in contours:
            area = cv2.contourArea(contour)
            if area < self.min_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)
            bbox = [float(x), float(y), float(x + w), float(y + h)]
            region_img = image.crop(bbox)

            regions.append({
                'bbox': bbox,
                'area': area,
                # Larger regions are treated as more salient by this heuristic.
                'saliency_score': min(area / (width * height), 1.0),
                'image': region_img
            })

        regions.sort(key=lambda r: r['saliency_score'], reverse=True)
        return regions[:10]

    def extract_unknown_regions(self, salient_regions: List[Dict], yolo_detections: List[Dict]) -> List[Dict]:
        """Return salient regions not covered by any YOLO detection.

        A region is considered "unknown" when its best IoU against every
        YOLO box is below 0.3.

        Args:
            salient_regions: dicts with a 'bbox' key (from detect_salient_regions).
            yolo_detections: dicts with a 'bbox' key.
        """
        unknown_regions = []

        for region in salient_regions:
            max_iou = 0.0
            for det in yolo_detections:
                iou = self._calculate_iou(region['bbox'], det['bbox'])
                max_iou = max(max_iou, iou)

            if max_iou < 0.3:
                unknown_regions.append(region)

        return unknown_regions

    def _calculate_iou(self, box1: List[float], box2: List[float]) -> float:
        """Return IoU of two [x1, y1, x2, y2] boxes (0.0 when disjoint/degenerate)."""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        inter_xmin = max(x1_min, x2_min)
        inter_ymin = max(y1_min, y2_min)
        inter_xmax = min(x1_max, x2_max)
        inter_ymax = min(y1_max, y2_max)

        if inter_xmax < inter_xmin or inter_ymax < inter_ymin:
            return 0.0

        inter_area = (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)
        box1_area = (x1_max - x1_min) * (y1_max - y1_min)
        box2_area = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = box1_area + box2_area - inter_area

        # Guard against zero-area union (degenerate boxes).
        return inter_area / union_area if union_area > 0 else 0.0

print("✓ SaliencyDetectionManager defined")
scene_compatibility_manager.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List
3
+ from prompt_library_manager import PromptLibraryManager
4
+
5
class SceneCompatibilityManager:
    """Weight brand detections by how plausible the surrounding scene is,
    suppressing false positives (e.g. a luxury-car logo in a food closeup)."""

    def __init__(self, prompt_library: PromptLibraryManager = None):
        """
        Args:
            prompt_library: PromptLibraryManager supplying brand metadata;
                a fresh instance is created when omitted.
        """
        self.prompt_library = prompt_library if prompt_library is not None else PromptLibraryManager()

        # Keyword lists used to bucket OpenCLIP scene labels into scene types.
        self.scene_keywords = {
            'food_closeup': ['food', 'meal', 'dish', 'plate', 'restaurant', 'dining', 'cuisine'],
            'nature_landscape': ['mountain', 'forest', 'beach', 'ocean', 'lake', 'sky', 'sunset', 'outdoor'],
            'industrial': ['factory', 'warehouse', 'industrial', 'machinery', 'construction'],
            'sports': ['gym', 'fitness', 'running', 'sports', 'athletic', 'exercise'],
            'fashion': ['fashion', 'outfit', 'style', 'wearing', 'model'],
            'luxury_retail': ['store', 'boutique', 'shop', 'retail', 'display'],
            'office': ['office', 'desk', 'computer', 'workspace', 'business'],
            'home': ['home', 'room', 'interior', 'living', 'bedroom'],
            'lifestyle': ['lifestyle', 'casual', 'everyday', 'daily'],
            'tech_review': ['unboxing', 'review', 'tech', 'device', 'gadget'],
            'formal_event': ['event', 'party', 'formal', 'ceremony', 'celebration'],
            'outdoor': ['outdoor', 'park', 'street', 'outside'],
            'travel': ['travel', 'trip', 'luggage', 'airport', 'vacation'],
            'street': ['street', 'road', 'urban', 'city'],
            'parking': ['parking', 'car park', 'garage'],
            'showroom': ['showroom', 'exhibition', 'display'],
            'closeup': ['closeup', 'detail', 'macro', 'close-up']
        }

        print("✓ Scene Compatibility Manager initialized")

    def classify_scene(self, scene_analysis: Dict) -> str:
        """Map OpenCLIP scene analysis onto one coarse scene type.

        Args:
            scene_analysis: results from OpenCLIPSemanticManager; the 'urban',
                'lighting', 'mood' and 'composition' entries are inspected
                for their 'top' label.

        Returns:
            The scene type with the most keyword hits, or 'general' when
            nothing matches.
        """
        tally: Dict[str, int] = {}

        for analysis_key in ('urban', 'lighting', 'mood', 'composition'):
            if analysis_key not in scene_analysis or 'top' not in scene_analysis[analysis_key]:
                continue
            label = scene_analysis[analysis_key]['top'].lower()

            for scene_type, keywords in self.scene_keywords.items():
                hits = sum(1 for keyword in keywords if keyword in label)
                if hits:
                    tally[scene_type] = tally.get(scene_type, 0) + hits

        if not tally:
            return 'general'
        return max(tally.items(), key=lambda item: item[1])[0]

    def check_compatibility(self, brand_name: str, scene_type: str) -> float:
        """Score how plausible a brand sighting is in the given scene.

        Args:
            brand_name: name of the brand.
            scene_type: scene type key (e.g. 'food_closeup', 'fashion').

        Returns:
            1.0 when the scene is typical for the brand, 0.3 when it is
            listed as incompatible, 0.7 otherwise (including unknown brands).
        """
        brand_info = self.prompt_library.get_brand_prompts(brand_name)
        if not brand_info:
            return 0.7

        if scene_type in brand_info.get('typical_scenes', []):
            return 1.0
        if scene_type in brand_info.get('incompatible_scenes', []):
            return 0.3
        return 0.7

    def batch_check_compatibility(self, detected_brands: List[tuple],
                                  scene_analysis: Dict) -> List[tuple]:
        """Re-weight several brand detections by scene compatibility.

        Args:
            detected_brands: (brand_name, confidence, bbox) tuples.
            scene_analysis: scene analysis results (see classify_scene).

        Returns:
            (brand_name, adjusted_confidence, bbox) tuples whose adjusted
            confidence exceeds 0.25, sorted by adjusted confidence descending.
        """
        scene_type = self.classify_scene(scene_analysis)

        kept = []
        for brand_name, confidence, bbox in detected_brands:
            weighted = confidence * self.check_compatibility(brand_name, scene_type)
            if weighted > 0.25:
                kept.append((brand_name, weighted, bbox))

        kept.sort(key=lambda entry: entry[1], reverse=True)
        return kept

print("✓ SceneCompatibilityManager defined")
scene_prompts.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List
3
+
4
+ class ScenePrompts:
5
+ """
6
+ 場景描述 Prompt 庫
7
+ 提供多元化場景類型的詳細視覺描述
8
+ 涵蓋:城市、自然、室內、食物、人物、產品等場景
9
+ """
10
+
11
    def __init__(self):
        """Build the in-memory scene vocabulary and hashtag libraries.

        Populates:
            scene_vocabularies: category -> subcategory -> list of CLIP-style
                prompt sentences. The 'lighting' and 'mood' categories map
                directly to a flat list (no subcategory level).
            scene_hashtags: category -> {'zh': [...], 'en': [...]} tag sets
                (values intentionally bilingual; do not translate).
        """

        self.scene_vocabularies = {
            # ===== Urban scenes =====
            'urban': {
                'city_canyon': [
                    'urban canyon with towering skyscrapers lining both sides of street creating vertical corridor',
                    'metropolitan corridor formed by tall buildings with strong vertical emphasis and symmetrical composition',
                    'downtown street flanked by modern high-rise architecture creating canyon effect',
                    'city street with tall buildings on both sides creating narrow vertical perspective'
                ],
                'street_level': [
                    'bustling city street with pedestrians and vehicles in urban environment',
                    'urban sidewalk scene with street furniture storefronts and mixed activity',
                    'downtown pedestrian area with commercial buildings and urban infrastructure',
                    'street view with urban architecture shops and people walking'
                ],
                'skyline': [
                    'city skyline with skyscrapers silhouetted against sky',
                    'urban panorama showing downtown high-rise buildings and city sprawl',
                    'metropolitan skyline view from elevated vantage point',
                    'cityscape with distinctive tall buildings defining horizon line'
                ],
                'plaza': [
                    'urban plaza with open public space and surrounding architecture',
                    'city square with pedestrians monuments and commercial buildings',
                    'downtown plaza featuring fountains sculptures and gathering spaces',
                    'public square with mixed use of recreational and commercial activities'
                ]
            },

            # ===== Nature scenes =====
            'nature': {
                'mountain': [
                    'majestic mountain range with snow-capped peaks against blue sky',
                    'alpine landscape with rocky summits and glacial valleys',
                    'mountain vista with layered ridges fading into distance creating depth',
                    'dramatic mountain scenery with rugged peaks and alpine vegetation',
                    'mountainous terrain with steep slopes and varied elevation'
                ],
                'beach': [
                    'serene beach with turquoise water and white sand shore',
                    'coastal scene with gentle waves lapping at sandy beach',
                    'tropical beach with clear water and palm tree shadows',
                    'beach landscape with ocean horizon and coastal features',
                    'seaside view with beach sand water and sky meeting at horizon'
                ],
                'forest': [
                    'lush forest with dense canopy and dappled sunlight filtering through trees',
                    'woodland scene with tall trees and undergrowth vegetation',
                    'forest interior with tree trunks and leafy canopy overhead',
                    'dense forest landscape with natural vegetation and organic forms',
                    'wooded area with trees creating natural shade and green environment'
                ],
                'lake': [
                    'tranquil lake with still water reflecting surrounding landscape',
                    'mountain lake with clear water and scenic backdrop',
                    'lakeside view with calm water and shoreline vegetation',
                    'peaceful lake scene with water sky and natural surroundings',
                    'alpine lake with pristine water and mountain reflections'
                ],
                'desert': [
                    'desert landscape with sand dunes and arid terrain',
                    'sandy desert with undulating dunes and clear sky',
                    'arid desert scene with sparse vegetation and sandy ground',
                    'desert vista with sand formations and minimal vegetation',
                    'dry desert landscape with sand rock and desert plants'
                ],
                'waterfall': [
                    'cascading waterfall with flowing water over rocks',
                    'waterfall scene with water spray and lush surrounding vegetation',
                    'natural waterfall with water rushing down cliff face',
                    'scenic waterfall with water pool and natural setting',
                    'tiered waterfall with multiple cascades and mist'
                ]
            },

            # ===== Indoor scenes =====
            'indoor': {
                'cafe': [
                    'cozy cafe interior with warm ambient lighting and wooden furniture',
                    'modern coffee shop with industrial decor and minimalist design',
                    'rustic cafe setting with vintage decorations and soft lighting',
                    'contemporary cafe space with comfortable seating and artistic elements',
                    'intimate coffee shop with warm atmosphere and inviting ambiance'
                ],
                'restaurant': [
                    'upscale restaurant interior with elegant table settings and refined decor',
                    'casual dining space with comfortable seating and welcoming atmosphere',
                    'fine dining restaurant with sophisticated lighting and premium furnishings',
                    'restaurant setting with tables chairs and ambient lighting',
                    'dining establishment with culinary presentation and service area'
                ],
                'office': [
                    'modern office space with desks computers and professional workspace',
                    'contemporary work environment with ergonomic furniture and technology',
                    'office interior with cubicles meeting areas and work stations',
                    'professional office setting with business equipment and organized layout',
                    'corporate workspace with clean lines and functional design'
                ],
                'home_living': [
                    'cozy living room with sofa comfortable seating and home decor',
                    'modern home interior with minimalist furniture and clean aesthetic',
                    'warm living space with personal touches and inviting atmosphere',
                    'residential interior with family room features and casual comfort',
                    'home living area with relaxation space and domestic furnishings'
                ],
                'bedroom': [
                    'peaceful bedroom with bed nightstands and soft lighting',
                    'modern bedroom interior with minimalist design and calm atmosphere',
                    'cozy sleeping space with comfortable bedding and personal decor',
                    'bedroom setting with rest area and private sanctuary feel',
                    'sleeping quarters with bed furniture and restful ambiance'
                ],
                'museum': [
                    'museum interior with exhibited artworks and gallery lighting',
                    'cultural institution space with display cases and visitor areas',
                    'art gallery with paintings sculptures and exhibition design',
                    'museum hall with artifacts and informational displays',
                    'exhibition space with curated collections and viewing areas'
                ]
            },

            # ===== Food scenes =====
            'food': {
                'plated_dish': [
                    'gourmet plated dish with artistic presentation and fine dining aesthetics',
                    'restaurant plate with carefully arranged food components and garnishes',
                    'culinary creation with vibrant colors and professional plating',
                    'plated meal with balanced composition and appetizing appearance',
                    'food presentation with attention to visual detail and portion control',
                    'elegant dinner plate with sophisticated garnish and culinary artistry',
                    'fusion cuisine dish with innovative presentation and colorful elements',
                    'fine dining entree with sauce art and premium ingredients',
                    'contemporary plated food with geometric arrangement and edible flowers',
                    "chef's special with meticulous plating and restaurant-quality finish"
                ],
                'street_food': [
                    'casual street food on wooden table or food truck setting',
                    'authentic street cuisine with rustic presentation and local character',
                    'food stall offering with simple plating and traditional preparation',
                    'street vendor food with casual serving style and cultural authenticity',
                    'local street eats with informal presentation and fresh ingredients',
                    'food truck meal with paper packaging and urban backdrop',
                    'market stall food with traditional cooking methods and local flavors',
                    'outdoor food stand offering with casual atmosphere and quick service',
                    'street-side cuisine with vibrant colors and authentic preparation',
                    'hawker food with cultural heritage and honest presentation'
                ],
                'dessert': [
                    'elaborate dessert with decorative elements and sweet presentation',
                    'pastry or cake with artistic decoration and enticing appearance',
                    'sweet course with layered construction and visual appeal',
                    'dessert plate with confectionery artistry and color contrast',
                    'bakery creation with detailed finishing and appetizing styling',
                    'chocolate dessert with glossy ganache and elegant garnish',
                    'fruit tart with colorful berries and glazed finish',
                    'layered cake slice with frosting art and textured decoration',
                    'ice cream sundae with toppings drizzle and attractive presentation',
                    'patisserie item with delicate decoration and refined sweetness'
                ],
                'ingredients': [
                    'fresh ingredients closeup shot with natural textures and vibrant colors',
                    'raw food components with organic forms and market-fresh appearance',
                    'culinary ingredients arranged with attention to color and composition',
                    'fresh produce with natural beauty and wholesome qualities',
                    'cooking ingredients with variety of textures and natural appeal',
                    'farmers market vegetables with rich colors and organic shapes',
                    'herb and spice arrangement with aromatic qualities and rustic charm',
                    'seafood display with ice and fresh-from-ocean appearance',
                    'butcher quality meat with marbling and premium cut presentation',
                    'artisan bread and grains with wholesome texture and natural crust'
                ],
                'beverage': [
                    'artisan beverage with careful presentation and appealing pour',
                    'drink in glassware with garnish and professional service style',
                    'coffee or tea with latte art and aesthetic serving',
                    'refreshing beverage with ice garnish and attractive glass',
                    'drink presentation with attention to color and visual interest',
                    'craft cocktail with creative garnish and sophisticated glassware',
                    'specialty coffee with foam art and ceramic cup presentation',
                    'fresh juice with fruit garnish and vibrant natural color',
                    'tea service with elegant teapot and traditional ceremony aesthetic',
                    'smoothie bowl with fruit toppings and colorful healthy presentation'
                ],
                'breakfast': [
                    'morning breakfast spread with eggs toast and fresh coffee',
                    'continental breakfast with pastries croissants and fruit arrangement',
                    'healthy breakfast bowl with granola yogurt and berries',
                    'pancake stack with maple syrup butter and powdered sugar',
                    'avocado toast with poached egg and microgreens on rustic plate',
                    'breakfast plate with bacon eggs and golden hash browns',
                    'brunch setting with mimosas fresh flowers and elegant tableware',
                    'oatmeal bowl with nuts fruits and honey drizzle',
                    'smoothie and acai bowl with tropical fruits and seeds',
                    'breakfast sandwich with melted cheese and morning sunlight'
                ],
                'baked_goods': [
                    'fresh baked bread with golden crust and flour dusting',
                    'artisan pastries with flaky layers and butter sheen',
                    'homemade cookies with chocolate chips and rustic appearance',
                    'sourdough loaf with scoring pattern and crusty exterior',
                    'cinnamon rolls with cream cheese frosting and swirls',
                    'French baguette with crispy crust and airy crumb',
                    'croissants with laminated layers and golden brown color',
                    'muffins with crumb topping and fresh from oven warmth',
                    'bagels with sesame seeds and chewy texture',
                    'focaccia bread with herbs olive oil and dimpled surface'
                ]
            },

            # ===== People scenes =====
            'people': {
                'portrait': [
                    'portrait photograph with shallow depth of field and subject focus',
                    'headshot with clean background and flattering lighting on face',
                    'personal portrait with emotional expression and eye contact',
                    'portrait composition with subject as primary visual element',
                    'close-up portrait with facial features and personality captured'
                ],
                'candid': [
                    'candid street photography moment with natural unposed action',
                    'spontaneous capture of people in authentic situations and activities',
                    'documentary-style photograph of real-life moments and interactions',
                    'natural human behavior captured without staged positioning',
                    'unscripted moment showing genuine emotion and movement'
                ],
                'group': [
                    'group photo with multiple people in organized composition',
                    'gathering of people with social interaction and shared activity',
                    'team or family portrait with coordinated positioning',
                    'group setting with people engaged in collective experience',
                    'multiple subjects arranged in harmonious group composition'
                ],
                'activity': [
                    'people engaged in specific activity or recreational pursuit',
                    'action photograph showing physical movement and dynamic energy',
                    'sports or fitness activity with athletic performance captured',
                    'people participating in hobby or leisure activity',
                    'human subjects in motion demonstrating skill or exercise'
                ]
            },

            # ===== Product scenes =====
            'product': {
                'studio_shot': [
                    'minimalist product photography on white background with clean lighting',
                    'commercial product shot with professional lighting and sharp detail',
                    'studio product photograph with controlled environment and even illumination',
                    'catalog-style product image with neutral background and clear presentation',
                    'product on white backdrop with shadow control and highlight management'
                ],
                'lifestyle': [
                    'lifestyle product shot in natural setting with contextual environment',
                    'product in use showing real-world application and human interaction',
                    'environmental product photography with lifestyle context and atmosphere',
                    'product placed in authentic setting with relatable situation',
                    'contextual product image showing everyday use and practical application'
                ],
                'flatlay': [
                    'overhead flatlay composition with products arranged on surface',
                    "bird's eye view of items arranged in artistic layout",
                    'top-down product styling with complementary objects and props',
                    'flatlay arrangement with balanced composition and visual harmony',
                    'aerial view of products styled with decorative elements'
                ]
            },

            # ===== Architecture scenes =====
            'architecture': {
                'modern': [
                    'contemporary architecture with glass steel and minimalist design',
                    'modern building with clean lines geometric forms and innovative structure',
                    'architectural design featuring current aesthetic and building technology',
                    'present-day construction with progressive design and materials',
                    'modern structure with sleek surfaces and contemporary styling'
                ],
                'historic': [
                    'historic architecture with traditional design and aged materials',
                    'heritage building with classical elements and period styling',
                    'old structure with architectural significance and historical character',
                    'traditional building with cultural importance and time-worn beauty',
                    'antique architecture showing craftsmanship of past eras'
                ],
                'interior': [
                    'architectural interior space with designed environment and spatial quality',
                    'building interior showing layout flow and functional design',
                    'indoor architectural space with lighting surfaces and volumes',
                    'interior architecture with structural elements and finish materials',
                    'designed space interior with architectural features and spatial composition'
                ],
                'detail': [
                    'architectural detail closeup showing construction method and materials',
                    'building element with decorative or functional architectural feature',
                    'structural detail revealing craftsmanship and design specifics',
                    'architectural component with unique design characteristic',
                    'close view of building feature showing texture pattern or ornamentation'
                ]
            },

            # ===== Lighting descriptions (flat list: no subcategories) =====
            'lighting': [
                'soft diffused light creating even illumination without harsh shadows',
                'natural daylight with bright ambient illumination and true colors',
                'overcast atmosphere with diffused skylight and muted shadows',
                'warm ambient light with golden tones and cozy feeling',
                'evening light with low angle sun and long shadows',
                'bright sunlight with strong contrast and crisp shadows',
                'studio lighting with controlled illumination and professional quality',
                'indoor natural light from windows creating gentle directional lighting',
                'warm artificial lighting with incandescent glow and amber tones',
                'cool artificial lighting with fluorescent or LED quality',
                'soft indoor lighting with diffused sources and minimal shadows',
                'dramatic lighting with strong contrast and defined shadows'
            ],

            # ===== Mood descriptions (flat list: no subcategories) =====
            'mood': [
                'calm and contemplative atmosphere with serene peaceful quality',
                'bustling and energetic environment with dynamic active feeling',
                'dramatic and imposing presence with powerful visual impact',
                'cozy and intimate setting with warm welcoming ambiance',
                'minimalist and clean aesthetic with simple uncluttered feel',
                'vibrant and colorful scene with rich saturated hues',
                'moody and atmospheric environment with evocative lighting',
                'elegant and sophisticated setting with refined tasteful quality',
                'rustic and natural atmosphere with organic earthy character',
                'modern and sleek environment with contemporary styling'
            ]
        }

        # Scene category -> hashtag mapping. The zh/en string values are
        # runtime data consumed by get_hashtags(); keep them as-is.
        self.scene_hashtags = {
            'urban': {
                'zh': ['城市', '都市', '城市風景', '街拍', '建築'],
                'en': ['Urban', 'Cityscape', 'StreetPhotography', 'Architecture', 'City']
            },
            'nature': {
                'zh': ['自然', '風景', '戶外', '大自然', '風景攝影'],
                'en': ['Nature', 'Landscape', 'Outdoor', 'Scenery', 'NaturePhotography']
            },
            'indoor': {
                'zh': ['室內', '室內設計', '空間', '居家'],
                'en': ['Indoor', 'InteriorDesign', 'Interior', 'Home']
            },
            'food': {
                'zh': ['美食', '食物', '料理', '美食攝影', '餐廳'],
                'en': ['Food', 'Foodie', 'FoodPhotography', 'Cuisine', 'Dining']
            },
            'people': {
                'zh': ['人像', '人物', '肖像', '街拍'],
                'en': ['Portrait', 'People', 'PortraitPhotography', 'Candid']
            },
            'product': {
                'zh': ['產品', '商品', '產品攝影', '商業攝影'],
                'en': ['Product', 'ProductPhotography', 'Commercial', 'Flatlay']
            },
            'architecture': {
                'zh': ['建築', '建築攝影', '建築設計', '空間'],
                'en': ['Architecture', 'ArchitecturalPhotography', 'Building', 'Design']
            }
        }

        print(f"✓ Scene Prompts initialized with {len(self.scene_vocabularies)} scene categories")
376
+
377
+ def get_prompts(self, scene_category: str, subcategory: str = None) -> List[str]:
378
+ """
379
+ 取得場景 prompts
380
+
381
+ Args:
382
+ scene_category: 場景類別 (如 'urban', 'nature')
383
+ subcategory: 子類別 (如 'city_canyon', 'mountain')
384
+
385
+ Returns:
386
+ Prompt 列表
387
+ """
388
+ category_prompts = self.scene_vocabularies.get(scene_category, {})
389
+
390
+ if subcategory:
391
+ return category_prompts.get(subcategory, [])
392
+ else:
393
+ # 返回該類別的所有 prompts
394
+ all_prompts = []
395
+ for prompts in category_prompts.values():
396
+ if isinstance(prompts, list):
397
+ all_prompts.extend(prompts)
398
+ return all_prompts
399
+
400
+ def get_all_categories(self) -> List[str]:
401
+ """取得所有場景類別"""
402
+ return list(self.scene_vocabularies.keys())
403
+
404
+ def get_subcategories(self, scene_category: str) -> List[str]:
405
+ """取得特定類別的所有子類別"""
406
+ category = self.scene_vocabularies.get(scene_category, {})
407
+ return list(category.keys()) if isinstance(category, dict) else []
408
+
409
+ def get_hashtags(self, scene_category: str, language: str = 'zh') -> List[str]:
410
+ """
411
+ 取得場景的 hashtags
412
+
413
+ Args:
414
+ scene_category: 場景類別
415
+ language: 語言 ('zh', 'en', 或 'both')
416
+
417
+ Returns:
418
+ Hashtag 列表
419
+ """
420
+ hashtags = self.scene_hashtags.get(scene_category, {})
421
+
422
+ if language == 'zh':
423
+ return hashtags.get('zh', [])
424
+ elif language == 'en':
425
+ return hashtags.get('en', [])
426
+ elif language == 'both' or language == 'zh-en':
427
+ zh_tags = hashtags.get('zh', [])
428
+ en_tags = hashtags.get('en', [])
429
+ return zh_tags + en_tags
430
+ else:
431
+ return hashtags.get('zh', [])
432
+
433
+ print("✓ ScenePrompts defined")
ui_manager.py ADDED
@@ -0,0 +1,681 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from typing import Dict, List
3
+
4
+ class UIManager:
5
+ """Manages all UI components and styling for Pixcribe"""
6
+
7
    def __init__(self):
        # Build the full stylesheet once at construction; consumers read it
        # from `custom_css` without regenerating it per request.
        self.custom_css = self._get_custom_css()
9
+
10
    def _get_custom_css(self) -> str:
        """Return the complete custom stylesheet for the app (light theme).

        Covers, in order: global reset/base, header, two-column layout,
        card containers, the upload drop area, section titles, form/radio
        controls, the generate button, caption result cards with their copy
        buttons, the footer, image display, the mobile breakpoint, and a
        loading shimmer animation.

        NOTE(review): presumably passed to Gradio as ``gr.Blocks(css=...)``
        via ``self.custom_css`` — confirm at the interface build site.
        """
        # The CSS below is returned verbatim; class names must stay in sync
        # with the HTML emitted elsewhere in this class (e.g. .caption-card,
        # .copy-button in format_captions_with_copy).
        return """
        /* ==================== Global Reset & Base ==================== */
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        .gradio-container {
            background: linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 100%) !important;
            font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif !important;
            padding: 0 !important;
            max-width: 100% !important;
            min-height: 100vh !important;
        }

        /* Main content wrapper - Generous padding to prevent edge clipping */
        .contain {
            max-width: 1600px !important;
            margin: 0 auto !important;
            padding: 64px 96px 96px 96px !important;
        }

        /* ==================== Header ==================== */
        .app-header {
            text-align: center;
            margin-bottom: 72px;
            animation: fadeInDown 0.8s ease-out;
            padding: 0 32px;
        }

        @keyframes fadeInDown {
            from {
                opacity: 0;
                transform: translateY(-30px);
            }
            to {
                opacity: 1;
                transform: translateY(0);
            }
        }

        .app-title {
            font-size: 72px;
            font-weight: 800;
            background: linear-gradient(135deg, #2C3E50 0%, #34495E 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
            margin-bottom: 24px;
            letter-spacing: -0.05em;
            line-height: 1.1;
        }

        .app-subtitle {
            font-size: 26px;
            font-weight: 400;
            color: #6C757D;
            margin-bottom: 0;
            letter-spacing: 0.01em;
        }

        /* ==================== Layout ==================== */
        .main-row {
            gap: 48px !important;
            margin-bottom: 48px !important;
        }

        /* Left column elegant container */
        .main-row > .column:first-child {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.8) 0%, rgba(252, 253, 254, 0.6) 100%) !important;
            border-radius: 28px !important;
            padding: 40px !important;
            border: 1px solid rgba(52, 152, 219, 0.08) !important;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.04) !important;
        }

        /* Right column elegant container */
        .main-row > .column:last-child {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.8) 0%, rgba(252, 253, 254, 0.6) 100%) !important;
            border-radius: 28px !important;
            padding: 40px !important;
            border: 1px solid rgba(52, 152, 219, 0.08) !important;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.04) !important;
        }

        /* ==================== Premium Cards - Light & Spacious ==================== */
        .upload-card {
            background: rgba(255, 255, 255, 0.95) !important;
            border-radius: 32px !important;
            box-shadow:
                0 4px 16px rgba(0, 0, 0, 0.06),
                0 2px 4px rgba(0, 0, 0, 0.03),
                0 1px 2px rgba(0, 0, 0, 0.02) !important;
            border: 1px solid rgba(0, 0, 0, 0.05) !important;
            padding: 48px !important;
            margin-bottom: 32px !important;
            transition: all 0.4s cubic-bezier(0.25, 0.46, 0.45, 0.94) !important;
            overflow: visible !important;
        }

        .results-card {
            background: transparent !important;
            border-radius: 0 !important;
            box-shadow: none !important;
            border: none !important;
            padding: 0 !important;
            margin-bottom: 32px !important;
            overflow: visible !important;
        }

        /* Caption Results Container - Elegant Design */
        .caption-results-container {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.85) 0%, rgba(252, 253, 254, 0.7) 100%) !important;
            border-radius: 28px !important;
            padding: 44px !important;
            border: 1px solid rgba(52, 152, 219, 0.1) !important;
            box-shadow:
                0 4px 20px rgba(0, 0, 0, 0.04),
                0 2px 8px rgba(52, 152, 219, 0.03) !important;
            margin-bottom: 40px !important;
            overflow: visible !important;
        }

        .upload-card:hover {
            box-shadow:
                0 8px 32px rgba(0, 0, 0, 0.10),
                0 4px 8px rgba(0, 0, 0, 0.06) !important;
            transform: translateY(-6px);
            border-color: rgba(52, 152, 219, 0.3) !important;
        }

        /* ==================== Upload Area ==================== */
        .upload-area {
            border: 3px dashed rgba(52, 152, 219, 0.35) !important;
            border-radius: 28px !important;
            background: linear-gradient(135deg, rgba(52, 152, 219, 0.03) 0%, rgba(52, 152, 219, 0.06) 100%) !important;
            padding: 96px 40px !important;
            text-align: center !important;
            transition: all 0.3s ease !important;
            min-height: 360px !important;
        }

        .upload-area:hover {
            border-color: #3498DB !important;
            background: linear-gradient(135deg, rgba(52, 152, 219, 0.06) 0%, rgba(52, 152, 219, 0.12) 100%) !important;
            transform: scale(1.02);
        }

        /* ==================== Section Titles - Consistent Spacing ==================== */
        .section-title {
            font-size: 28px !important;
            font-weight: 700 !important;
            color: #2C3E50 !important;
            margin-bottom: 20px !important;
            letter-spacing: -0.02em !important;
            padding-bottom: 0 !important;
            border-bottom: none !important;
            text-align: left !important;
            margin-top: 0 !important;
        }

        .section-title-left {
            font-size: 28px !important;
            font-weight: 700 !important;
            color: #2C3E50 !important;
            margin-bottom: 20px !important;
            margin-top: 0 !important;
            letter-spacing: -0.02em !important;
            text-align: left !important;
            border-bottom: none !important;
            padding-bottom: 0 !important;
        }

        /* ==================== Form Elements - Generous Padding ==================== */
        .settings-row {
            gap: 24px !important;
            margin-bottom: 28px !important;
        }

        .radio-group {
            background: rgba(248, 249, 250, 0.5) !important;
            border-radius: 20px !important;
            padding: 24px 28px !important;
            border: none !important;
            margin-bottom: 24px !important;
            border: 1px solid rgba(0, 0, 0, 0.04) !important;
        }

        .radio-group:last-child {
            margin-bottom: 0 !important;
        }

        /* Inline radio groups for side-by-side layout */
        .radio-group-inline {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.7) 0%, rgba(248, 249, 250, 0.5) 100%) !important;
            border-radius: 16px !important;
            padding: 20px !important;
            border: 1px solid rgba(52, 152, 219, 0.1) !important;
            margin-bottom: 0 !important;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.03) !important;
            transition: all 0.3s ease !important;
        }

        .radio-group-inline:hover {
            box-shadow: 0 4px 16px rgba(52, 152, 219, 0.08) !important;
            border-color: rgba(52, 152, 219, 0.2) !important;
        }

        .radio-group label {
            color: #6C757D !important;
            font-weight: 600 !important;
            font-size: 14px !important;
            margin-bottom: 16px !important;
            letter-spacing: 0.08em !important;
            text-transform: uppercase !important;
            display: block !important;
            text-align: left !important;
        }

        /* Radio group title (the actual input label) */
        .radio-group > label:first-child {
            color: #2C3E50 !important;
            font-weight: 700 !important;
            font-size: 19px !important;
            margin-bottom: 16px !important;
            letter-spacing: -0.02em !important;
            text-transform: none !important;
        }

        /* Inline radio group title - BIGGER and BOLD */
        .radio-group-inline > label:first-child {
            color: #2C3E50 !important;
            font-weight: 700 !important;
            font-size: 18px !important;
            margin-bottom: 14px !important;
            letter-spacing: -0.01em !important;
            text-transform: none !important;
            display: block !important;
        }

        .radio-group input[type="radio"] {
            accent-color: #3498DB !important;
            width: 22px !important;
            height: 22px !important;
            margin-right: 14px !important;
        }

        /* Radio option labels */
        .radio-group > div > label {
            color: #495057 !important;
            font-weight: 500 !important;
            font-size: 17px !important;
            letter-spacing: -0.01em !important;
            text-transform: none !important;
            padding: 14px 20px !important;
            border-radius: 14px !important;
            transition: all 0.2s ease !important;
            cursor: pointer !important;
            display: flex !important;
            align-items: center !important;
        }

        /* Inline radio option labels - BIGGER */
        .radio-group-inline > div > label {
            color: #495057 !important;
            font-weight: 500 !important;
            font-size: 16px !important;
            letter-spacing: -0.01em !important;
            text-transform: none !important;
            padding: 12px 16px !important;
            border-radius: 10px !important;
            transition: all 0.2s ease !important;
            cursor: pointer !important;
            display: flex !important;
            align-items: center !important;
            background: rgba(255, 255, 255, 0.6) !important;
            margin-bottom: 8px !important;
            border: 1px solid rgba(0, 0, 0, 0.04) !important;
        }

        .radio-group > div > label:hover {
            background: rgba(52, 152, 219, 0.08) !important;
        }

        .radio-group-inline > div > label:hover {
            background: rgba(52, 152, 219, 0.12) !important;
            transform: translateX(4px);
        }

        /* ==================== Button ==================== */
        .generate-button {
            background: linear-gradient(135deg, #3498DB 0%, #2980B9 100%) !important;
            color: white !important;
            border: none !important;
            border-radius: 20px !important;
            padding: 24px 64px !important;
            font-size: 19px !important;
            font-weight: 700 !important;
            cursor: pointer !important;
            box-shadow:
                0 6px 24px rgba(52, 152, 219, 0.35),
                0 3px 6px rgba(52, 152, 219, 0.25) !important;
            transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
            letter-spacing: -0.02em !important;
            width: 100% !important;
            margin-top: 24px !important;
        }

        .generate-button:hover {
            transform: translateY(-6px) scale(1.02) !important;
            box-shadow:
                0 16px 48px rgba(52, 152, 219, 0.45),
                0 6px 12px rgba(52, 152, 219, 0.35) !important;
        }

        .generate-button:active {
            transform: translateY(-3px) scale(1.01) !important;
        }

        /* ==================== Caption Cards - Light & Elegant ==================== */
        .caption-card {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(248, 249, 250, 0.95) 100%);
            backdrop-filter: blur(20px);
            border: 1px solid rgba(0, 0, 0, 0.06);
            border-radius: 28px;
            padding: 32px 36px;
            margin-bottom: 28px;
            transition: all 0.4s cubic-bezier(0.25, 0.46, 0.45, 0.94);
            box-shadow:
                0 4px 16px rgba(0, 0, 0, 0.05),
                0 2px 4px rgba(0, 0, 0, 0.03);
            position: relative;
        }

        .caption-card:hover {
            box-shadow:
                0 8px 32px rgba(0, 0, 0, 0.10),
                0 4px 8px rgba(0, 0, 0, 0.06);
            transform: translateY(-6px);
            border-color: rgba(52, 152, 219, 0.3);
        }

        .caption-header {
            font-size: 15px;
            font-weight: 700;
            color: #6C757D;
            text-transform: uppercase;
            letter-spacing: 0.14em;
            margin-bottom: 20px;
        }

        .caption-text {
            font-size: 21px;
            font-weight: 400;
            color: #2C3E50;
            line-height: 1.8;
            margin-bottom: 24px;
            letter-spacing: -0.01em;
        }

        .caption-hashtags {
            font-size: 18px;
            font-weight: 600;
            color: #3498DB;
            margin-bottom: 0;
            word-wrap: break-word;
            line-height: 1.75;
        }

        /* Copy Button */
        .copy-button {
            position: absolute;
            top: 28px;
            right: 28px;
            background: rgba(52, 152, 219, 0.10);
            border: 1px solid rgba(52, 152, 219, 0.25);
            border-radius: 14px;
            padding: 12px 20px;
            font-size: 15px;
            font-weight: 600;
            color: #3498DB;
            cursor: pointer;
            transition: all 0.2s ease;
            display: flex;
            align-items: center;
            gap: 8px;
        }

        .copy-button:hover {
            background: rgba(52, 152, 219, 0.18);
            border-color: #3498DB;
            transform: translateY(-2px);
            box-shadow: 0 4px 12px rgba(52, 152, 219, 0.25);
        }

        .copy-button:active {
            transform: translateY(0);
        }

        .copy-button.copied {
            background: rgba(39, 174, 96, 0.15);
            border-color: #27AE60;
            color: #27AE60;
        }

        /* ==================== Footer ==================== */
        .app-footer {
            text-align: center;
            margin-top: 96px;
            padding-top: 64px;
            border-top: 3px solid rgba(0, 0, 0, 0.08);
            animation: fadeInUp 0.8s ease-out 0.3s backwards;
        }

        @keyframes fadeInUp {
            from {
                opacity: 0;
                transform: translateY(30px);
            }
            to {
                opacity: 1;
                transform: translateY(0);
            }
        }

        .footer-text {
            font-size: 17px;
            color: #6C757D;
            line-height: 2.0;
            letter-spacing: -0.01em;
            font-weight: 500;
        }

        .footer-models {
            font-size: 15px;
            color: #ADB5BD;
            margin-top: 20px;
            font-weight: 600;
            letter-spacing: 0.03em;
        }

        /* ==================== Image Display ==================== */
        .image-container {
            border-radius: 28px !important;
            overflow: hidden !important;
            box-shadow:
                0 6px 24px rgba(0, 0, 0, 0.10),
                0 3px 6px rgba(0, 0, 0, 0.06) !important;
        }

        .image-container img {
            border-radius: 28px !important;
            box-shadow:
                0 6px 24px rgba(0, 0, 0, 0.12),
                0 3px 6px rgba(0, 0, 0, 0.08) !important;
        }

        /* ==================== Responsive Design ==================== */
        @media (max-width: 768px) {
            .contain {
                padding: 48px 32px 64px 32px !important;
            }

            .app-title {
                font-size: 52px;
            }

            .app-subtitle {
                font-size: 20px;
            }

            .upload-card, .options-card, .results-card {
                padding: 40px !important;
            }

            .upload-area {
                padding: 64px 32px !important;
                min-height: 280px !important;
            }

            .caption-card {
                padding: 28px;
            }

            .section-title {
                font-size: 30px !important;
            }

            .copy-button {
                top: 20px;
                right: 20px;
                padding: 10px 16px;
                font-size: 14px;
            }
        }

        /* ==================== Loading Animation ==================== */
        @keyframes shimmer {
            0% {
                background-position: -1000px 0;
            }
            100% {
                background-position: 1000px 0;
            }
        }

        .loading {
            animation: shimmer 2s infinite;
            background: linear-gradient(to right, #f8f9fa 4%, #e9ecef 25%, #f8f9fa 36%);
            background-size: 1000px 100%;
        }
        """
525
+
526
+ def create_header(self):
527
+ """Create application header"""
528
+ return gr.HTML("""
529
+ <div class="app-header">
530
+ <h1 class="app-title">✨ Pixcribe</h1>
531
+ <p class="app-subtitle">AI-Powered Social Media Caption Generator</p>
532
+ </div>
533
+ """)
534
+
535
+ def create_info_banner(self):
536
+ """Create informational banner about model loading and processing times"""
537
+ return gr.HTML("""
538
+ <div style="
539
+ background: linear-gradient(135deg, #E8F4F8 0%, #D4E9F2 100%);
540
+ border-left: 4px solid #3498DB;
541
+ border-radius: 16px;
542
+ padding: 24px 32px;
543
+ margin: 0 auto 48px auto;
544
+ max-width: 1200px;
545
+ box-shadow: 0 4px 16px rgba(52, 152, 219, 0.12);
546
+ ">
547
+ <div style="display: flex; align-items: start; gap: 20px;">
548
+ <div style="font-size: 32px; line-height: 1; margin-top: 4px;">⏱️</div>
549
+ <div style="flex: 1;">
550
+ <h3 style="
551
+ margin: 0 0 12px 0;
552
+ font-size: 20px;
553
+ font-weight: 700;
554
+ color: #2C3E50;
555
+ letter-spacing: -0.02em;
556
+ ">
557
+ Please Note: Processing Time
558
+ </h3>
559
+ <p style="
560
+ margin: 0 0 12px 0;
561
+ font-size: 15px;
562
+ line-height: 1.6;
563
+ color: #5D6D7E;
564
+ ">
565
+ <strong style="color: #2980B9;">Initial setup and model loading may take a while</strong> as multiple AI models
566
+ are initialized and cached. This includes YOLOv11 object detection, OpenCLIP semantic analysis,
567
+ Qwen2.5-VL caption generation, and other advanced models.
568
+ </p>
569
+ <p style="
570
+ margin: 0;
571
+ font-size: 15px;
572
+ line-height: 1.6;
573
+ color: #5D6D7E;
574
+ ">
575
+ ✨ <strong style="color: #27AE60;">Processing time varies depending on system resources.</strong>
576
+ Thank you for your patience while we generate high-quality captions!
577
+ </p>
578
+ </div>
579
+ </div>
580
+ </div>
581
+ """)
582
+
583
+ def create_footer(self):
584
+ """Create application footer"""
585
+ return gr.HTML("""
586
+ <div class="app-footer">
587
+ <p class="footer-text">
588
+ Powered by advanced AI models
589
+ </p>
590
+ <p class="footer-models">
591
+ YOLOv11 · OpenCLIP ViT-H/14 · Qwen2.5-VL-7B · EasyOCR · Places365 · U2-Net
592
+ </p>
593
+ <p class="footer-text" style="margin-top: 32px;">
594
+ © 2025 Pixcribe · Built for creators
595
+ </p>
596
+ </div>
597
+ """)
598
+
599
+ def format_captions_with_copy(self, captions: List[Dict]) -> str:
600
+ """Format captions as HTML with copy functionality"""
601
+ if not captions:
602
+ return "<p style='color: #6C757D; padding: 24px;'>No captions generated</p>"
603
+
604
+ captions_html = ""
605
+ for i, cap in enumerate(captions):
606
+ caption_text = cap.get('caption', '')
607
+ hashtags = cap.get('hashtags', [])
608
+ tone = cap.get('tone', 'unknown').title()
609
+
610
+ # Create unique ID for each caption
611
+ caption_id = f"caption_{i}"
612
+
613
+ # Full text to copy (caption + hashtags)
614
+ full_text = f"{caption_text}\n\n{' '.join([f'#{tag}' for tag in hashtags])}"
615
+
616
+ captions_html += f"""
617
+ <div class="caption-card" id="{caption_id}">
618
+ <button class="copy-button" onclick="copyCaption{i}()" id="copy-btn-{i}">
619
+ 📋 Copy
620
+ </button>
621
+ <div class="caption-header">Caption {i+1} · {tone}</div>
622
+ <div class="caption-text">{caption_text}</div>
623
+ <div class="caption-hashtags">
624
+ {' '.join([f'#{tag}' for tag in hashtags])}
625
+ </div>
626
+ <textarea id="caption-text-{i}" style="position: absolute; left: -9999px;">{full_text}</textarea>
627
+ </div>
628
+
629
+ <script>
630
+ function copyCaption{i}() {{
631
+ const text = document.getElementById('caption-text-{i}').value;
632
+ const btn = document.getElementById('copy-btn-{i}');
633
+
634
+ // Try modern clipboard API first
635
+ if (navigator.clipboard && navigator.clipboard.writeText) {{
636
+ navigator.clipboard.writeText(text).then(() => {{
637
+ btn.innerHTML = '✓ Copied!';
638
+ btn.classList.add('copied');
639
+ setTimeout(() => {{
640
+ btn.innerHTML = '📋 Copy';
641
+ btn.classList.remove('copied');
642
+ }}, 2000);
643
+ }}).catch(() => {{
644
+ // Fallback to old method
645
+ fallbackCopy{i}();
646
+ }});
647
+ }} else {{
648
+ // Fallback for older browsers
649
+ fallbackCopy{i}();
650
+ }}
651
+ }}
652
+
653
+ function fallbackCopy{i}() {{
654
+ const textarea = document.getElementById('caption-text-{i}');
655
+ const btn = document.getElementById('copy-btn-{i}');
656
+ textarea.style.position = 'static';
657
+ textarea.style.opacity = '0';
658
+ textarea.select();
659
+ try {{
660
+ document.execCommand('copy');
661
+ btn.innerHTML = '✓ Copied!';
662
+ btn.classList.add('copied');
663
+ setTimeout(() => {{
664
+ btn.innerHTML = '📋 Copy';
665
+ btn.classList.remove('copied');
666
+ }}, 2000);
667
+ }} catch (err) {{
668
+ btn.innerHTML = '✗ Failed';
669
+ setTimeout(() => {{
670
+ btn.innerHTML = '📋 Copy';
671
+ }}, 2000);
672
+ }}
673
+ textarea.style.position = 'absolute';
674
+ textarea.style.opacity = '1';
675
+ }}
676
+ </script>
677
+ """
678
+
679
+ return captions_html
680
+
681
+ print("✓ UIManager defined")
universal_object_prompts.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Dict, List
3
+
4
+ class UniversalObjectPrompts:
5
+ """
6
+ 通用物品描述 Prompt 庫
7
+ 涵蓋日常物品、動物、交通工具、電子產品等
8
+ 確保系統能夠描述各種類型的圖片
9
+ """
10
+
11
+ def __init__(self):
12
+ """初始化通用物品詞彙庫"""
13
+
14
+ self.object_vocabularies = {
15
+ # ===== 動物 Animals =====
16
+ 'animals': {
17
+ 'dogs': [
18
+ 'friendly dog with expressive eyes and playful demeanor',
19
+ 'canine companion with soft fur and loyal presence',
20
+ 'domestic dog breed with distinct markings and alert posture',
21
+ 'pet dog in outdoor setting with natural behavior',
22
+ 'puppy with cute features and energetic personality',
23
+ 'large breed dog with muscular build and protective stance',
24
+ 'small lap dog with fluffy coat and adorable expression',
25
+ 'working dog demonstrating intelligence and trained skills',
26
+ 'mixed breed dog with unique features and charming character',
27
+ 'dog portrait with focused gaze and photogenic qualities'
28
+ ],
29
+ 'cats': [
30
+ 'elegant cat with graceful posture and alert expression',
31
+ 'feline companion with soft fur and independent character',
32
+ 'domestic cat with distinctive markings and curious nature',
33
+ 'cat resting in comfortable position with relaxed demeanor',
34
+ 'kitten with playful energy and adorable tiny features',
35
+ 'long-haired cat with fluffy coat and majestic appearance',
36
+ 'short-haired cat with sleek coat and athletic build',
37
+ 'cat portrait with piercing eyes and photogenic pose',
38
+ 'tabby cat with striped pattern and charming personality',
39
+ 'cat in natural sunlight with warm ambient lighting'
40
+ ],
41
+ 'birds': [
42
+ 'colorful bird with vibrant plumage and natural beauty',
43
+ 'bird in flight with spread wings and dynamic motion',
44
+ 'perched bird with detailed feather texture and alert posture',
45
+ 'exotic bird species with distinctive beak and eye markings',
46
+ 'songbird with delicate features and graceful appearance',
47
+ 'bird of prey with powerful build and intense gaze',
48
+ 'waterfowl with sleek feathers and aquatic adaptation',
49
+ 'tropical bird with brilliant colors and exotic appeal',
50
+ 'bird feeding or foraging showing natural behavior',
51
+ 'bird silhouette against sky with artistic composition'
52
+ ],
53
+ 'wildlife': [
54
+ 'wildlife creature in natural habitat showing authentic behavior',
55
+ 'wild animal with powerful build and majestic presence',
56
+ 'forest wildlife with camouflage coloring and alert senses',
57
+ 'marine wildlife with aquatic adaptation and fluid movement',
58
+ 'safari animal with distinctive features and exotic appeal',
59
+ 'small mammal with cute features and curious expression',
60
+ 'endangered species with conservation importance and beauty',
61
+ 'nocturnal animal with adapted eyes and nighttime behavior',
62
+ 'wildlife portrait with environmental context and natural light',
63
+ 'animal in motion demonstrating speed agility or power'
64
+ ]
65
+ },
66
+
67
+ # ===== 交通工具 Vehicles =====
68
+ 'vehicles': {
69
+ 'cars': [
70
+ 'modern automobile with sleek design and aerodynamic lines',
71
+ 'luxury car with premium finish and sophisticated styling',
72
+ 'sports car with aggressive stance and performance aesthetics',
73
+ 'classic car with vintage charm and timeless design',
74
+ 'electric vehicle with futuristic design and eco-friendly appeal',
75
+ 'SUV with robust build and commanding presence',
76
+ 'sedan with elegant profile and comfortable proportions',
77
+ 'convertible with open top and free-spirited character',
78
+ 'vintage automobile with chrome details and nostalgic beauty',
79
+ 'race car with aerodynamic body and competition livery'
80
+ ],
81
+ 'motorcycles': [
82
+ 'motorcycle with powerful engine and dynamic design',
83
+ 'cruiser bike with low profile and relaxed riding position',
84
+ 'sport bike with aggressive fairings and racing aesthetics',
85
+ 'vintage motorcycle with classic styling and heritage appeal',
86
+ 'custom bike with unique modifications and personal touches',
87
+ 'touring motorcycle with comfort features and long-distance capability',
88
+ 'dirt bike with off-road tires and rugged construction',
89
+ 'scooter with practical design and urban mobility',
90
+ 'cafe racer with minimalist design and retro styling',
91
+ 'adventure motorcycle with all-terrain capability and robust build'
92
+ ],
93
+ 'bicycles': [
94
+ 'road bike with lightweight frame and racing geometry',
95
+ 'mountain bike with suspension and off-road tires',
96
+ 'vintage bicycle with classic design and nostalgic charm',
97
+ 'urban commuter bike with practical features and city-ready design',
98
+ 'electric bicycle with motor assist and modern technology',
99
+ 'BMX bike with compact frame and trick-ready build',
100
+ 'touring bicycle with panniers and long-distance setup',
101
+ 'folding bike with space-saving design and portability',
102
+ 'fixed gear bike with minimalist aesthetic and urban style',
103
+ 'cruiser bicycle with comfortable seat and relaxed riding position'
104
+ ],
105
+ 'public_transport': [
106
+ 'city bus with public transit livery and urban setting',
107
+ 'train at station with platform and passenger environment',
108
+ 'subway car with interior lighting and metro system',
109
+ 'tram on city streets with overhead wires and urban backdrop',
110
+ 'ferry boat with water transportation and maritime setting',
111
+ 'taxi cab with distinctive markings and urban context',
112
+ 'double-decker bus with iconic design and city character',
113
+ 'monorail with elevated track and futuristic appearance',
114
+ 'light rail vehicle with modern design and efficient transit',
115
+ 'cable car with hillside location and scenic views'
116
+ ]
117
+ },
118
+
119
+ # ===== 電子產品 Electronics =====
120
+ 'electronics': {
121
+ 'smartphones': [
122
+ 'modern smartphone with edge-to-edge display and sleek design',
123
+ 'mobile phone with premium materials and minimalist aesthetic',
124
+ 'smartphone showing screen interface with app icons and features',
125
+ 'phone with camera system and advanced photography capabilities',
126
+ 'mobile device with protective case and personal accessories',
127
+ 'smartphone in hand demonstrating use and scale',
128
+ 'phone with wireless charging and modern conveniences',
129
+ 'mobile phone with notification screen and communication features',
130
+ 'smartphone capturing photo showing photography in action',
131
+ 'device with headphones and mobile entertainment setup'
132
+ ],
133
+ 'laptops': [
134
+ 'laptop computer with open screen and modern workspace',
135
+ 'portable computer with sleek design and professional appearance',
136
+ 'laptop showing desktop interface and productivity software',
137
+ 'computer with external peripherals and complete workstation',
138
+ 'thin and light laptop with premium build and portability',
139
+ 'gaming laptop with powerful specs and aggressive styling',
140
+ 'laptop in cafe setting with remote work environment',
141
+ 'computer with split screen showing multitasking capability',
142
+ 'laptop with coding environment and developer workflow',
143
+ 'portable computer with creative software and design work'
144
+ ],
145
+ 'cameras': [
146
+ 'professional camera with interchangeable lens and manual controls',
147
+ 'DSLR camera with telephoto lens and photography equipment',
148
+ 'mirrorless camera with compact design and modern features',
149
+ 'vintage film camera with classic design and analog charm',
150
+ 'action camera with rugged housing and adventure ready build',
151
+ 'instant camera with retro aesthetic and print functionality',
152
+ 'camera on tripod with stable shooting setup',
153
+ 'photography gear with lenses filters and accessories',
154
+ "camera with viewfinder showing photographer's perspective",
155
+ 'compact camera with point-and-shoot simplicity'
156
+ ],
157
+ 'wearables': [
158
+ 'smartwatch with digital display and fitness tracking features',
159
+ 'fitness tracker with health monitoring and activity data',
160
+ 'wireless earbuds with charging case and modern design',
161
+ 'smart glasses with augmented reality and tech integration',
162
+ 'VR headset with immersive technology and gaming capability',
163
+ 'smart ring with minimalist design and health sensors',
164
+ 'activity band with water resistance and sport features',
165
+ 'wireless headphones with noise cancellation and premium audio',
166
+ 'smart jewelry with notification features and elegant styling',
167
+ 'wearable device with heart rate monitor and workout tracking'
168
+ ]
169
+ },
170
+
171
+ # ===== 家居用品 Home Items =====
172
+ 'home_items': {
173
+ 'furniture': [
174
+ 'modern sofa with clean lines and comfortable upholstery',
175
+ 'wooden dining table with natural grain and family seating',
176
+ 'contemporary chair with ergonomic design and stylish form',
177
+ 'bookshelf with organized volumes and decorative objects',
178
+ 'bed with plush bedding and inviting sleep environment',
179
+ 'desk with workspace organization and productive setup',
180
+ 'coffee table with minimalist design and functional surface',
181
+ 'cabinet with storage solutions and practical organization',
182
+ 'armchair with cozy cushioning and reading nook appeal',
183
+ 'sideboard with display area and dining room elegance'
184
+ ],
185
+ 'decor': [
186
+ 'wall art with framed artwork and gallery wall aesthetic',
187
+ 'decorative plant with lush foliage and natural greenery',
188
+ 'vase with fresh flowers and elegant arrangement',
189
+ 'candles with ambient lighting and cozy atmosphere',
190
+ 'throw pillows with colorful patterns and comfort layers',
191
+ 'mirror with decorative frame and space-enhancing reflection',
192
+ 'rug with textile pattern and floor covering warmth',
193
+ 'sculpture with artistic form and decorative presence',
194
+ 'decorative bowls with artisan craft and functional beauty',
195
+ 'wall clock with timepiece function and design statement'
196
+ ],
197
+ 'kitchenware': [
198
+ 'ceramic plates with elegant design and table setting ready',
199
+ 'glassware with crystal clarity and beverage service',
200
+ 'cookware with non-stick surface and culinary preparation',
201
+ 'cutting board with natural wood and food prep surface',
202
+ 'kitchen utensils with stainless steel and cooking tools',
203
+ 'coffee maker with brewing capability and morning ritual',
204
+ 'mixing bowls with nesting design and baking essentials',
205
+ 'serving platters with presentation surface and entertaining ready',
206
+ 'storage containers with organization and food preservation',
207
+ 'tea kettle with stovetop heating and beverage preparation'
208
+ ]
209
+ },
210
+
211
+ # ===== 服飾配件 Fashion Accessories =====
212
+ 'fashion_accessories': {
213
+ 'shoes': [
214
+ 'leather shoes with polished finish and formal elegance',
215
+ 'sneakers with athletic design and casual comfort',
216
+ 'high heels with sophisticated style and fashion statement',
217
+ 'boots with rugged construction and seasonal appropriateness',
218
+ 'sandals with open design and warm weather comfort',
219
+ 'loafers with slip-on convenience and smart casual style',
220
+ 'running shoes with performance features and sport technology',
221
+ 'dress shoes with refined appearance and occasion ready',
222
+ 'canvas shoes with casual aesthetic and everyday wearability',
223
+ 'designer footwear with luxury branding and premium materials'
224
+ ],
225
+ 'bags': [
226
+ 'leather handbag with structured form and quality craftsmanship',
227
+ 'backpack with practical compartments and daily carry capability',
228
+ 'tote bag with spacious interior and versatile use',
229
+ 'clutch with compact elegance and evening sophistication',
230
+ 'messenger bag with crossbody strap and urban function',
231
+ 'duffel bag with travel capacity and gym ready design',
232
+ 'satchel with classic styling and professional appearance',
233
+ 'wallet with organized card slots and essential storage',
234
+ 'shoulder bag with adjustable strap and comfortable carry',
235
+ 'luxury bag with designer branding and premium construction'
236
+ ],
237
+ 'jewelry': [
238
+ 'necklace with pendant design and elegant neckline accent',
239
+ 'earrings with gemstone sparkle and facial framing beauty',
240
+ 'ring with precious metal and symbolic significance',
241
+ 'bracelet with linked design and wrist adornment',
242
+ 'watch with timepiece function and wrist jewelry appeal',
243
+ 'brooch with decorative pin and vintage charm',
244
+ 'anklet with delicate chain and summer accessory style',
245
+ 'cufflinks with formal accent and menswear detail',
246
+ 'charm bracelet with personal tokens and memory collection',
247
+ 'statement jewelry with bold design and fashion impact'
248
+ ],
249
+ 'eyewear': [
250
+ 'sunglasses with UV protection and stylish frames',
251
+ 'eyeglasses with prescription lenses and daily wear design',
252
+ 'aviator sunglasses with classic pilot styling and metal frame',
253
+ 'cat-eye glasses with vintage inspired shape and feminine flair',
254
+ 'sport sunglasses with wraparound design and performance features',
255
+ 'reading glasses with magnification and close-work utility',
256
+ 'designer eyewear with luxury branding and premium materials',
257
+ 'mirrored sunglasses with reflective lenses and modern edge',
258
+ 'oversized sunglasses with dramatic proportions and fashion statement',
259
+ 'safety glasses with protective function and durable construction'
260
+ ]
261
+ },
262
+
263
+ # ===== 運動器材 Sports Equipment =====
264
+ 'sports_equipment': {
265
+ 'fitness': [
266
+ 'dumbbells with weight plates and strength training equipment',
267
+ 'yoga mat with non-slip surface and exercise foundation',
268
+ 'resistance bands with elastic tension and portable workout',
269
+ 'kettlebell with cast iron construction and functional training',
270
+ 'foam roller with massage texture and recovery tool',
271
+ 'exercise ball with inflatable design and core workout',
272
+ 'jump rope with cardio training and coordination exercise',
273
+ 'weight bench with adjustable positions and lifting support',
274
+ 'pull-up bar with doorframe mounting and bodyweight exercise',
275
+ 'treadmill with running surface and cardio machine'
276
+ ],
277
+ 'outdoor_sports': [
278
+ 'tennis racket with string tension and court sport equipment',
279
+ 'basketball with leather or composite cover and game ready',
280
+ 'soccer ball with classic panel design and field sport',
281
+ 'golf clubs with metal woods and iron set',
282
+ 'baseball glove with leather construction and catching mitt',
283
+ 'skateboard with deck grip tape and wheel assembly',
284
+ 'surfboard with wax coating and wave riding design',
285
+ 'snowboard with bindings and mountain sport equipment',
286
+ 'hiking boots with ankle support and trail ready tread',
287
+ 'camping tent with weatherproof fabric and outdoor shelter'
288
+ ]
289
+ },
290
+
291
+ # ===== 樂器 Musical Instruments =====
292
+ 'musical_instruments': {
293
+ 'string': [
294
+ 'acoustic guitar with wooden body and string instrument charm',
295
+ 'electric guitar with solid body and amplified rock sound',
296
+ 'violin with curved body and classical string beauty',
297
+ 'cello with rich tone and orchestral presence',
298
+ 'bass guitar with deep sound and rhythm section foundation',
299
+ 'ukulele with small size and tropical string instrument',
300
+ 'harp with multiple strings and angelic sound quality',
301
+ 'banjo with circular body and folk music character',
302
+ 'mandolin with paired strings and bright tone',
303
+ 'sitar with resonating strings and world music heritage'
304
+ ],
305
+ 'keyboard': [
306
+ 'piano with ivory keys and classical instrument elegance',
307
+ 'keyboard synthesizer with electronic sound and modern music',
308
+ 'organ with multiple manuals and church music tradition',
309
+ 'electric piano with vintage tone and stage performance',
310
+ 'digital piano with weighted keys and home practice',
311
+ 'accordion with bellows and folk dance music',
312
+ 'MIDI controller with production capability and studio tool',
313
+ 'harpsichord with baroque styling and historical instrument',
314
+ 'melodica with breath control and portable keyboard',
315
+ 'keytar with shoulder strap and performance showmanship'
316
+ ],
317
+ 'percussion': [
318
+ 'drum kit with multiple pieces and rhythmic foundation',
319
+ 'djembe with hand drumming and African rhythm',
320
+ 'conga drums with Latin percussion and tropical beat',
321
+ 'tambourine with jingles and shaker instrument',
322
+ 'bongos with paired drums and Latin music style',
323
+ 'xylophone with mallet playing and melodic percussion',
324
+ 'cymbals with crash sound and orchestral accent',
325
+ 'maracas with rattle sound and Latin rhythm shaker',
326
+ 'cajón with box drum and flamenco percussion',
327
+ 'timpani with kettle drum and orchestral thunder'
328
+ ]
329
+ },
330
+
331
+ # ===== 辦公用品 Office Supplies =====
332
+ 'office_supplies': {
333
+ 'stationery': [
334
+ 'pen with smooth writing and everyday writing tool',
335
+ 'notebook with lined pages and note-taking essential',
336
+ 'pencil with graphite lead and sketching tool',
337
+ 'markers with vibrant colors and highlighting capability',
338
+ 'sticky notes with adhesive backing and reminder function',
339
+ 'paper clips with metal construction and document organization',
340
+ 'stapler with binding function and paper fastening',
341
+ 'tape dispenser with adhesive roll and office essential',
342
+ 'ruler with measurement markings and straight edge',
343
+ 'scissors with sharp blades and cutting tool'
344
+ ],
345
+ 'desk_items': [
346
+ 'desk lamp with adjustable arm and task lighting',
347
+ 'organizer with compartments and clutter management',
348
+ 'mouse pad with smooth surface and wrist support',
349
+ 'desk calendar with date tracking and planning tool',
350
+ 'pen holder with upright storage and writing implement organization',
351
+ 'file folders with document sorting and category organization',
352
+ 'desk mat with large surface and workspace protection',
353
+ 'paper tray with stacking design and document storage',
354
+ 'business card holder with professional presentation',
355
+ 'cable organizer with cord management and tidy workspace'
356
+ ]
357
+ },
358
+
359
+ # ===== 玩具與遊戲 Toys and Games =====
360
+ 'toys_games': {
361
+ 'toys': [
362
+ 'stuffed animal with soft plush and cuddly companion',
363
+ 'action figure with articulated joints and character play',
364
+ 'doll with detailed features and imaginative play',
365
+ 'building blocks with interlocking pieces and creative construction',
366
+ 'toy car with rolling wheels and miniature vehicle play',
367
+ 'puzzle with interlocking pieces and problem-solving challenge',
368
+ 'board game with playing pieces and family entertainment',
369
+ 'card game with illustrated cards and strategy play',
370
+ 'remote control toy with wireless operation and interactive play',
371
+ 'educational toy with learning elements and developmental benefits'
372
+ ],
373
+ 'collectibles': [
374
+ 'figurine with detailed sculpting and display collectible',
375
+ 'vinyl toy with designer art and limited edition appeal',
376
+ 'model kit with assembly parts and hobbyist construction',
377
+ 'trading cards with collectible series and rarity value',
378
+ 'die-cast model with metal construction and scale replica',
379
+ 'statue with artistic detail and collector showcase',
380
+ 'pop culture figure with character likeness and fandom appeal',
381
+ 'vintage toy with nostalgic value and retro charm',
382
+ 'limited edition collectible with numbered series and exclusivity',
383
+ 'display case with protective housing and collection showcase'
384
+ ]
385
+ }
386
+ }
387
+
388
+ print(f"✓ Universal Object Prompts initialized with {len(self.object_vocabularies)} major categories")
389
+
390
+ def get_prompts(self, category: str, subcategory: str = None) -> List[str]:
391
+ """
392
+ 取得物品 prompts
393
+
394
+ Args:
395
+ category: 物品類別 (如 'animals', 'vehicles')
396
+ subcategory: 子類別 (如 'dogs', 'cats')
397
+
398
+ Returns:
399
+ Prompt 列表
400
+ """
401
+ category_prompts = self.object_vocabularies.get(category, {})
402
+
403
+ if subcategory:
404
+ return category_prompts.get(subcategory, [])
405
+ else:
406
+ # 返回該類別的所有 prompts
407
+ all_prompts = []
408
+ for prompts in category_prompts.values():
409
+ if isinstance(prompts, list):
410
+ all_prompts.extend(prompts)
411
+ return all_prompts
412
+
413
+ def get_all_categories(self) -> List[str]:
414
+ """取得所有物品類別"""
415
+ return list(self.object_vocabularies.keys())
416
+
417
+ def get_subcategories(self, category: str) -> List[str]:
418
+ """取得特定類別的所有子類別"""
419
+ category_data = self.object_vocabularies.get(category, {})
420
+ return list(category_data.keys()) if isinstance(category_data, dict) else []
421
+
422
+ def detect_object_category(self, detected_objects: List[str]) -> str:
423
+ """
424
+ 根據檢測到的物體推測主要類別
425
+
426
+ Args:
427
+ detected_objects: YOLO 檢測到的物體列表
428
+
429
+ Returns:
430
+ 推測的類別名稱
431
+ """
432
+ object_str = ' '.join(detected_objects).lower()
433
+
434
+ # 動物關鍵字
435
+ if any(kw in object_str for kw in ['dog', 'cat', 'bird', 'animal', 'pet']):
436
+ return 'animals'
437
+
438
+ # 交通工具關鍵字
439
+ if any(kw in object_str for kw in ['car', 'bike', 'motorcycle', 'bus', 'train', 'vehicle']):
440
+ return 'vehicles'
441
+
442
+ # 電子產品關鍵字
443
+ if any(kw in object_str for kw in ['phone', 'laptop', 'camera', 'computer', 'tablet']):
444
+ return 'electronics'
445
+
446
+ # 家居用品關鍵字
447
+ if any(kw in object_str for kw in ['chair', 'table', 'bed', 'couch', 'furniture']):
448
+ return 'home_items'
449
+
450
+ # 服飾配件關鍵字
451
+ if any(kw in object_str for kw in ['shoe', 'bag', 'handbag', 'backpack', 'watch']):
452
+ return 'fashion_accessories'
453
+
454
+ # 運動器材關鍵字
455
+ if any(kw in object_str for kw in ['ball', 'racket', 'equipment', 'fitness']):
456
+ return 'sports_equipment'
457
+
458
+ # 樂器關鍵字
459
+ if any(kw in object_str for kw in ['guitar', 'piano', 'drum', 'instrument']):
460
+ return 'musical_instruments'
461
+
462
+ return None # 無法辨識
463
+
464
+ print("✓ UniversalObjectPrompts defined")
yolo_detection_manager.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from ultralytics import YOLO
3
+ import numpy as np
4
+ from typing import List, Dict
5
+ from PIL import Image
6
+
7
class YOLODetectionManager:
    """Object detection via Ultralytics YOLOv11.

    Runs inference and annotates each detection with a flag marking
    classes considered brand/product relevant.
    """

    def __init__(self, variant='m'):
        """Load the YOLOv11 checkpoint for the given size variant.

        Args:
            variant: Model size suffix such as 'n', 's', 'm', 'l', 'x'.
        """
        print(f"Loading YOLOv11{variant} model...")
        self.model = YOLO(f'yolo11{variant}.pt')
        self.variant = variant
        self.conf_threshold = 0.25    # minimum box confidence kept
        self.iou_threshold = 0.45     # NMS IoU threshold
        self.max_detections = 100     # cap on boxes per image

        # Brand-relevant classes, stored as a frozenset so the per-box
        # membership test in detect() is O(1) instead of an O(n) list scan.
        # NOTE(review): 'watch', 'shoe', 'sneaker', 'boot' are not COCO-80
        # class names — with default weights they presumably never match;
        # confirm against the model's `names` mapping.
        self.brand_relevant_classes = frozenset({
            'handbag', 'bottle', 'cell phone', 'laptop',
            'backpack', 'tie', 'suitcase', 'cup', 'watch',
            'shoe', 'sneaker', 'boot',
        })

        print(f"✓ YOLOv11{variant} loaded")

    def detect(self, image: np.ndarray) -> List[Dict]:
        """Detect objects in an image.

        Args:
            image: Image array in a format accepted by ultralytics
                (assumes HxWxC uint8 — TODO confirm the caller's channel
                order matches the model's expectation).

        Returns:
            One dict per box with keys: 'class_id', 'class_name',
            'bbox' ([x1, y1, x2, y2] floats), 'confidence',
            'is_brand_relevant', and 'source' (always 'yolo').
        """
        results = self.model.predict(
            image,
            conf=self.conf_threshold,
            iou=self.iou_threshold,
            max_det=self.max_detections,
            verbose=False
        )

        detections = []
        for result in results:
            for box in result.boxes:
                class_id = int(box.cls[0])
                class_name = result.names[class_id]
                detections.append({
                    'class_id': class_id,
                    'class_name': class_name,
                    # Move tensor to CPU before converting to a plain list.
                    'bbox': box.xyxy[0].cpu().numpy().tolist(),
                    'confidence': float(box.conf[0]),
                    'is_brand_relevant': class_name.lower() in self.brand_relevant_classes,
                    'source': 'yolo',
                })

        return detections

    def filter_brand_relevant_objects(self, detections: List[Dict]) -> List[Dict]:
        """Return only detections flagged as brand relevant."""
        return [det for det in detections if det['is_brand_relevant']]
62
+
63
+ print("✓ YOLODetectionManager defined")