Upload 22 files
Browse filesCreate Pixcribe Project
- app.py +159 -0
- brand_detection_optimizer.py +187 -0
- brand_prompts.py +970 -0
- brand_recognition_manager.py +420 -0
- brand_verification_manager.py +349 -0
- brand_visualization_manager.py +107 -0
- caption_generation_manager.py +499 -0
- detection_fusion_manager.py +242 -0
- image_processor_manager.py +70 -0
- landmark_prompts.py +1030 -0
- lighting_analysis_manager.py +453 -0
- ocr_engine_manager.py +129 -0
- openclip_semantic_manager.py +216 -0
- output_processing_manager.py +350 -0
- pixcribe_pipeline.py +335 -0
- prompt_library_manager.py +272 -0
- saliency_detection_manager.py +101 -0
- scene_compatibility_manager.py +133 -0
- scene_prompts.py +433 -0
- ui_manager.py +681 -0
- universal_object_prompts.py +464 -0
- yolo_detection_manager.py +63 -0
app.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import torch
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import spaces
|
| 5 |
+
|
| 6 |
+
from pixcribe_pipeline import PixcribePipeline
|
| 7 |
+
from ui_manager import UIManager
|
| 8 |
+
|
| 9 |
+
# Initialize Pipeline and UI Manager
# NOTE: this runs at import time — model loading starts as soon as the module
# is imported, before the Gradio interface below is even constructed.
print("Initializing Pixcribe...")
print("⏳ Loading models (this may take 60-90 seconds on first run)...")
# 'l' is the balanced YOLO variant; the UI radio below lets users pick m/l/x per request.
pipeline = PixcribePipeline(yolo_variant='l')
ui_manager = UIManager()
print("✅ All models loaded successfully!")
|
| 15 |
+
|
| 16 |
+
@spaces.GPU(duration=180)
def process_wrapper(image, yolo_variant, caption_language, platform='instagram'):
    """Process image and return formatted results.

    This function uses GPU-accelerated models:
    - YOLOv11 (object detection)
    - OpenCLIP ViT-H/14 (semantic understanding)
    - EasyOCR (text extraction)
    - Places365 (scene analysis)
    - Qwen2.5-VL-7B (caption generation)

    Total processing time: ~2-3 seconds on L4 GPU

    Args:
        image: PIL image uploaded by the user, or None when nothing was uploaded.
        yolo_variant: YOLO model size key ('m', 'l' or 'x').
        caption_language: Caption language code ('zh' or 'en').
        platform: Target social platform forwarded to the pipeline.
            New keyword-only-by-convention parameter; defaults to the
            previously hard-coded 'instagram', so existing callers are unaffected.

    Returns:
        Tuple of (visualized image or None, HTML string for the caption panel).
    """
    if image is None:
        return None, "<div style='color: #E74C3C; padding: 24px; text-align: center;'>Please upload an image</div>"

    try:
        results = pipeline.process_image(image, platform, yolo_variant, caption_language)

        if results is None:
            return None, "<div style='color: #E74C3C; padding: 24px; text-align: center;'>Processing failed. Check terminal logs for details.</div>"

    except Exception as e:
        import traceback
        error_msg = traceback.format_exc()
        print("="*60)
        print("ERROR DETAILS:")
        print(error_msg)
        print("="*60)

        # Return detailed error to user
        error_html = f"""
        <div style='background: #FADBD8; border: 2px solid #E74C3C; border-radius: 20px; padding: 28px; margin: 16px 0;'>
            <h3 style='color: #C0392B; margin-top: 0; font-size: 22px;'>❌ Processing Error</h3>
            <p style='color: #E74C3C; font-weight: bold; font-size: 17px; margin-bottom: 16px;'>{str(e)}</p>
            <details style='margin-top: 12px;'>
                <summary style='cursor: pointer; color: #C0392B; font-weight: bold; font-size: 16px;'>View Full Error Trace</summary>
                <pre style='background: white; padding: 16px; border-radius: 12px; overflow-x: auto; font-size: 13px; color: #2C3E50; margin-top: 12px;'>{error_msg}</pre>
            </details>
        </div>
        """
        return None, error_html

    # Success path: the except branch above always returns, so `results` is
    # guaranteed to be a non-None dict here.
    # Get visualized image with brand boxes (fall back to the raw upload).
    visualized_image = results.get('visualized_image', image)

    # Format captions with copy functionality
    captions_html = ui_manager.format_captions_with_copy(results['captions'])

    return visualized_image, captions_html
|
| 69 |
+
|
| 70 |
+
# Create Gradio Interface.
# Component creation order inside the `with` block determines on-screen layout.
# NOTE(review): the component names `visualized_image` and `yolo_variant` are
# also used as local/param names inside process_wrapper — different scopes,
# harmless, but easy to confuse when reading.
with gr.Blocks(css=ui_manager.custom_css, title="Pixcribe - AI Social Media Captions") as app:

    # Header
    ui_manager.create_header()

    # Info Banner - Loading Time Notice
    ui_manager.create_info_banner()

    # Top Row - Upload Images & Detected Objects
    with gr.Row(elem_classes="main-row"):
        # Left - Upload Card
        with gr.Column(scale=1):
            with gr.Group(elem_classes="upload-card"):
                image_input = gr.Image(
                    type="pil",
                    label="Upload Image",
                    elem_classes="upload-area"
                )

        # Right - Detected Objects
        with gr.Column(scale=1):
            with gr.Group(elem_classes="results-card"):
                gr.Markdown("### Detected Objects", elem_classes="section-title")
                visualized_image = gr.Image(
                    label="",
                    elem_classes="image-container"
                )

    # Bottom - Settings Section (Full Width)
    with gr.Group(elem_classes="settings-container"):
        gr.Markdown("### Settings", elem_classes="section-title-left")

        with gr.Row(elem_classes="settings-row"):
            # Radio choices are (display label, value) pairs; the value string
            # is what process_wrapper receives.
            caption_language = gr.Radio(
                choices=[
                    ('繁體中文', 'zh'),
                    ('English', 'en')
                ],
                value='en',
                label="Caption Language",
                elem_classes="radio-group-inline"
            )

            yolo_variant = gr.Radio(
                choices=[
                    ('Fast (m)', 'm'),
                    ('Balanced (l)', 'l'),
                    ('Accurate (x)', 'x')
                ],
                value='l',
                label="Detection Mode",
                elem_classes="radio-group-inline"
            )

    # Generate Button (Centered)
    with gr.Row(elem_classes="button-row"):
        analyze_btn = gr.Button(
            "Generate Captions",
            variant="primary",
            elem_classes="generate-button"
        )

    # Processing Time Notice
    gr.HTML("""
        <div style="text-align: center; margin-top: 16px; color: #7F8C8D; font-size: 14px;">
            <span style="opacity: 0.8;">⚡ Please be patient - AI processing may take some time</span>
        </div>
    """)

    # Caption Results (Full Width)
    with gr.Group(elem_classes="caption-results-container"):
        gr.Markdown("### 📝 Generated Captions", elem_classes="section-title")
        caption_output = gr.HTML(
            label="",
            elem_id="caption-results"
        )

    # Footer
    ui_manager.create_footer()

    # Connect button to processing function.
    # Outputs map positionally: annotated image -> visualized_image component,
    # caption HTML -> caption_output component.
    analyze_btn.click(
        fn=process_wrapper,
        inputs=[image_input, yolo_variant, caption_language],
        outputs=[visualized_image, caption_output]
    )
|
| 157 |
+
|
| 158 |
+
if __name__ == "__main__":
    # share=True exposes a temporary public Gradio URL in addition to localhost.
    app.launch(share=True)
|
brand_detection_optimizer.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from typing import Dict, List, Tuple
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
class BrandDetectionOptimizer:
    """
    Smart brand-detection optimizer balancing performance and accuracy.

    Cuts down the number of brands that need expensive deep detection by
    running cheap pre-screening first: OCR text/alias matching plus a coarse
    zero-shot CLIP classification into generic brand categories.
    """

    def __init__(self, clip_manager, ocr_manager, prompt_library):
        # Injected collaborators; all brand metadata (aliases, categories)
        # comes from prompt_library.
        self.clip_manager = clip_manager
        self.ocr_manager = ocr_manager
        self.prompt_library = prompt_library

    def quick_brand_prescreening(self, image: "Image.Image") -> List[str]:
        """
        Fast brand pre-screening — detect only the most likely brand candidates.

        Greatly reduces the number of brands requiring deep detection.

        Args:
            image: PIL Image to screen.

        Returns:
            List of brand names that are likely present.
        """
        likely_brands = set()

        # Step 1: quick OCR scan (the fastest and most accurate signal).
        ocr_results = self.ocr_manager.extract_text(image, use_brand_preprocessing=True)

        # Hoisted out of the OCR loop: upper-cased alias lists per brand.
        # The original recomputed every brand's aliases for every OCR item
        # (O(items x brands) repeated .upper() calls); behavior is unchanged.
        brand_aliases = {
            brand_name: [alias.upper() for alias in brand_info.get('aliases', [])]
            for brand_name, brand_info in self.prompt_library.get_all_brands().items()
        }

        for ocr_item in ocr_results:
            text = ocr_item['text'].upper()

            # Skip very short strings to avoid spurious matches.
            if len(text) < 2:
                continue

            # Check every brand's aliases against this OCR hit.
            for brand_name, aliases in brand_aliases.items():
                for alias in aliases:
                    # Exact match.
                    if alias == text:
                        likely_brands.add(brand_name)
                        break
                    # Partial match: require alias length >= 3 and high overlap ratio.
                    elif len(alias) >= 3:
                        if alias in text and len(alias) / len(text) > 0.6:
                            likely_brands.add(brand_name)
                            break
                        elif text in alias and len(text) / len(alias) > 0.6:
                            likely_brands.add(brand_name)
                            break

        # Step 2: coarse visual classification into generic brand categories.
        category_prompts = {
            'luxury': 'luxury brand product with monogram pattern and leather details',
            'sportswear': 'sportswear brand product with athletic logo and swoosh design',
            'tech': 'technology brand product with minimalist design and metal finish',
            'automotive': 'luxury car brand with distinctive grille and emblem',
            'watches': 'luxury watch with distinctive dial and brand logo',
            'fashion': 'fashion brand product with signature pattern or logo'
        }

        category_scores = self.clip_manager.classify_zero_shot(
            image, list(category_prompts.values())
        )

        # Keep the two most likely categories.
        sorted_categories = sorted(
            category_scores.items(), key=lambda x: x[1], reverse=True
        )[:2]

        # Map prompt text back to its category key.
        category_mapping = {v: k for k, v in category_prompts.items()}

        for prompt_text, score in sorted_categories:
            if score > 0.30:  # raised threshold (0.15 -> 0.30) to reduce false positives
                category = category_mapping[prompt_text]
                # Add every brand registered under this category.
                category_brands = self.prompt_library.get_brands_by_category(category)
                likely_brands.update(category_brands.keys())

        # Step 3: fallback when there is no clue at all — seed with the three
        # most visually distinctive common brands. This is a reasonable default
        # in the absence of any signal, not a hard-coded detection result.
        if not likely_brands:
            default_brands = ['Louis Vuitton', 'Gucci', 'Nike']
            likely_brands.update(default_brands)

        # Return the brand list (no count cap; rely on downstream quality filtering).
        return list(likely_brands)

    def smart_region_selection(self, image: "Image.Image",
                               saliency_regions: List[Dict]) -> List[Tuple[int, int, int, int]]:
        """
        Smart region selection — scan only areas likely to contain a brand.

        Replaces the inefficient grid-scan approach.

        Args:
            image: PIL Image
            saliency_regions: Saliency detection results

        Returns:
            List of bboxes (x1, y1, x2, y2) to scan
        """
        regions_to_scan = []
        img_width, img_height = image.size

        # Strategy 1: use salient regions (most likely to contain a brand).
        if saliency_regions:
            for region in saliency_regions[:3]:  # Top 3 salient regions
                bbox = region.get('bbox')
                if bbox:
                    # Expand the box to include surrounding context.
                    x1, y1, x2, y2 = bbox
                    padding = 20
                    x1 = max(0, x1 - padding)
                    y1 = max(0, y1 - padding)
                    x2 = min(img_width, x2 + padding)
                    y2 = min(img_height, y2 + padding)

                    # Keep only regions that are large enough to be useful.
                    if (x2 - x1) > 100 and (y2 - y1) > 100:
                        regions_to_scan.append((x1, y1, x2, y2))

        # Strategy 2: center crop (brands are usually near the center).
        center_x = img_width // 2
        center_y = img_height // 2
        center_size = min(img_width, img_height) // 2

        center_bbox = (
            max(0, center_x - center_size // 2),
            max(0, center_y - center_size // 2),
            min(img_width, center_x + center_size // 2),
            min(img_height, center_y + center_size // 2)
        )
        regions_to_scan.append(center_bbox)

        # Strategy 3: if no region qualified, fall back to the full image.
        if not regions_to_scan:
            regions_to_scan.append((0, 0, img_width, img_height))

        return regions_to_scan

    def compute_brand_confidence_boost(self, brand_name: str,
                                       ocr_results: List[Dict],
                                       base_confidence: float) -> float:
        """
        Boost brand confidence based on OCR evidence.

        If OCR detected the brand name, the confidence is raised substantially.

        Args:
            brand_name: Brand name
            ocr_results: OCR detection results
            base_confidence: Base confidence from visual matching

        Returns:
            Boosted confidence score (capped at 0.95)
        """
        brand_info = self.prompt_library.get_brand_prompts(brand_name)
        if not brand_info:
            return base_confidence

        aliases = [alias.upper() for alias in brand_info.get('aliases', [])]

        max_boost = 0.0
        for ocr_item in ocr_results:
            text = ocr_item['text'].upper()
            ocr_conf = ocr_item['confidence']

            for alias in aliases:
                # Exact match.
                if alias == text:
                    max_boost = max(max_boost, 0.40 * ocr_conf)  # max boost 0.40
                # Partial match.
                elif alias in text or text in alias:
                    if len(alias) > 2:  # avoid spurious short-string matches
                        max_boost = max(max_boost, 0.25 * ocr_conf)

        # Apply the boost, capped at 0.95.
        boosted_confidence = min(base_confidence + max_boost, 0.95)
        return boosted_confidence

print("✓ BrandDetectionOptimizer (performance and accuracy optimizer) defined")
|
brand_prompts.py
ADDED
|
@@ -0,0 +1,970 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
|
| 4 |
+
class BrandPrompts:
|
| 5 |
+
"""
|
| 6 |
+
品牌視覺特徵與多模態識別資料庫
|
| 7 |
+
提供品牌的視覺線索、OpenCLIP prompts、Hashtags
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
    def __init__(self):
        """Initialize the brand database.

        Builds ``self.brand_prompts``: a two-level mapping of
        category name -> brand name -> brand record. Typical record keys:

        - "strong_cues" / "weak_cues": visual-evidence descriptions
        - "region_contexts": image regions where branding usually appears
        - "openclip_prompts": per-region prompt lists for OpenCLIP matching
        - "aliases": alternate spellings / OCR targets
        - "hashtags": suggested hashtags, keyed by language ("zh" / "en")
        - "visual_distinctive" / "text_prominent": recognition hints

        NOTE(review): the schema is not uniform — several 'fashion'
        entries (Tommy Hilfiger, Uniqlo, Gap, Lacoste, Calvin Klein,
        Levi's, The North Face) omit "aliases", "visual_distinctive"
        and "text_prominent"; consumers should read those keys with
        ``.get()`` and a default.
        """

        self.brand_prompts = {
            # ===== Luxury Brands =====
            'luxury': {
                "Louis Vuitton": {
                    "strong_cues": [
                        "LV monogram pattern with interlocking L and V letters on brown canvas",
                        "Brown canvas with golden hardware and leather trim showing Louis Vuitton signature",
                        "Damier checkerboard pattern in brown and tan showing LV design",
                        "Monogram flower motifs with LV initials repeated across surface"
                    ],
                    "weak_cues": [
                        "Luxury handbag with structured silhouette and top handles",
                        "Premium leather goods with golden metal accents",
                        "Designer bag with classic proportions and elegant hardware"
                    ],
                    "region_contexts": ["bag_panel", "luggage_surface", "wallet_front", "accessory_detail"],
                    "openclip_prompts": {
                        "bag_panel": [
                            "Louis Vuitton monogram canvas handbag with leather trim and brass hardware",
                            "LV brown monogram pattern on luxury bag with golden clasps",
                            "Designer handbag showing Louis Vuitton signature canvas and leather details"
                        ],
                        "luggage_surface": [
                            "Louis Vuitton monogram luggage with brown canvas and leather corners",
                            "LV travel bag showing iconic monogram pattern with metallic accents",
                            "Luxury suitcase with Louis Vuitton canvas and protective leather trim"
                        ],
                        "wallet_front": [
                            "Louis Vuitton monogram wallet with compact folding design",
                            "LV small leather good showing monogram canvas and card slots",
                            "Designer wallet with Louis Vuitton pattern and golden hardware"
                        ]
                    },
                    "aliases": ["LV", "Louis Vuitton Monogram", "VUITTON", "LOUIS VUITTON"],
                    "hashtags": {
                        "zh": ["LV", "路易威登", "奢侈品", "精品包", "時尚"],
                        "en": ["LouisVuitton", "LV", "LuxuryFashion", "DesignerBag", "Luxury"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "Gucci": {
                    "strong_cues": [
                        "Interlocking double G logo in gold or silver metal with heart shape design",
                        "GG logo in shiny gold brass hardware on black quilted leather",
                        "Green and red web stripe on beige or brown canvas background",
                        "GG monogram pattern repeated across fabric or leather surface",
                        "Chevron matelassé quilted leather with V-shaped stitching pattern",
                        "Heart-shaped double G logo with antique gold finish",
                        "Bamboo handle detail on handbag with curved shape and metal hardware"
                    ],
                    "weak_cues": [
                        "Luxury fashion item with bold logo placement and premium materials",
                        "Designer accessory with distinctive hardware and Italian branding",
                        "High-end quilted leather goods with geometric stitching pattern",
                        "Black leather handbag with gold chain strap and structured silhouette",
                        "Luxury bag with chevron quilting and metallic hardware accents"
                    ],
                    "region_contexts": ["bag_front", "bag_panel", "belt_buckle", "shoe_detail", "accessory_surface", "logo_area"],
                    "openclip_prompts": {
                        "bag_front": [
                            "Gucci Marmont handbag with heart-shaped GG logo in antique gold on quilted black leather",
                            "Designer bag showing Gucci chevron matelassé quilted pattern with gold GG hardware",
                            "Luxury handbag with double G heart logo and V-shaped quilting on black leather",
                            "Gucci bag with interlocking GG logo web stripe and canvas texture",
                            "Black quilted leather Gucci bag with shiny gold double G emblem and chain strap",
                            "Gucci Marmont camera bag with chevron quilted leather and gold hardware",
                            "Designer handbag featuring Gucci signature GG logo with geometric quilting pattern"
                        ],
                        "bag_panel": [
                            "Gucci matelassé quilted leather surface with chevron V-pattern stitching",
                            "Black quilted leather panel with Gucci heart-shaped GG logo in center",
                            "Luxury leather with geometric quilting showing Gucci craftsmanship and gold hardware",
                            "Chevron stitched leather surface with interlocking GG emblem in antique gold",
                            "Gucci quilted pattern with V-shaped chevron design and metallic logo placement"
                        ],
                        "belt_buckle": [
                            "Gucci belt with large interlocking GG buckle in polished gold metal",
                            "Designer belt showing double G logo buckle with black or brown leather strap",
                            "Luxury belt with Gucci signature GG buckle in brass finish and Italian leather",
                            "Gucci GG Marmont belt with textured double G buckle and leather band"
                        ],
                        "logo_area": [
                            "Close-up of Gucci interlocking GG logo in gold metal with heart shape",
                            "Gucci double G emblem in antique gold brass on black leather background",
                            "Heart-shaped GG logo with metallic gold finish showing Gucci branding",
                            "Shiny gold Gucci GG hardware on quilted matelassé leather surface"
                        ]
                    },
                    "aliases": ["GG", "GUCCI", "Gucci Marmont"],
                    "hashtags": {
                        "zh": ["Gucci", "古馳", "奢侈品", "精品", "義大利時尚", "Marmont"],
                        "en": ["Gucci", "LuxuryFashion", "DesignerBrand", "ItalianFashion", "GG", "GucciMarmont"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "Chanel": {
                    "strong_cues": [
                        "Interlocking double C logo in metal or quilted leather",
                        "Quilted diamond pattern leather with chain strap",
                        "Black and white color scheme with gold or silver chain",
                        "Camellia flower motif as decorative element"
                    ],
                    "weak_cues": [
                        "Elegant quilted leather handbag with chain details",
                        "Luxury fashion item with classic French design",
                        "Designer accessory with sophisticated minimalist styling"
                    ],
                    "region_contexts": ["bag_flap", "jewelry_detail", "perfume_bottle", "clothing_label"],
                    "openclip_prompts": {
                        "bag_flap": [
                            "Chanel quilted bag with interlocking CC logo and chain strap",
                            "Classic flap bag showing Chanel diamond quilting and gold hardware",
                            "Luxury handbag with Chanel CC closure and leather chain"
                        ]
                    },
                    "aliases": ["CC", "CHANEL"],
                    "hashtags": {
                        "zh": ["Chanel", "香奈兒", "奢侈品", "精品包", "法國時尚"],
                        "en": ["Chanel", "ChanelBag", "Luxury", "FrenchFashion", "ClassicBag"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "Hermès": {
                    "strong_cues": [
                        "Orange box or shopping bag with brown ribbon",
                        "Birkin or Kelly bag with distinctive silhouette and hardware",
                        "H logo belt buckle in polished metal",
                        "Saddle stitching on leather goods with equestrian heritage"
                    ],
                    "weak_cues": [
                        "Ultra-luxury leather handbag with exceptional craftsmanship",
                        "Designer accessory with understated elegance and premium materials",
                        "High-end fashion item with classic proportions and hardware"
                    ],
                    "region_contexts": ["bag_structure", "belt_buckle", "scarf_pattern", "packaging"],
                    "openclip_prompts": {
                        "bag_structure": [
                            "Hermès Birkin bag with structured leather and golden padlock",
                            "Luxury handbag showing Hermès Kelly bag silhouette with turnlock",
                            "Designer bag with Hermès craftsmanship and distinctive hardware"
                        ]
                    },
                    "aliases": ["HERMES", "HERMÈS", "BIRKIN", "KELLY"],
                    "hashtags": {
                        "zh": ["Hermès", "愛馬仕", "柏金包", "奢侈品", "頂級精品"],
                        "en": ["Hermes", "Birkin", "KellyBag", "Luxury", "UltraLuxury"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "Prada": {
                    "strong_cues": [
                        "Triangular metal logo plate with Prada Milano text",
                        "Saffiano leather with crosshatch texture pattern",
                        "Black nylon bag with triangular logo badge",
                        "Minimalist design with subtle branding placement"
                    ],
                    "weak_cues": [
                        "Italian luxury handbag with clean modern lines",
                        "Designer accessory with understated contemporary styling",
                        "High-end leather goods with minimalist aesthetic"
                    ],
                    "region_contexts": ["bag_front", "wallet_surface", "shoe_heel", "clothing_tag"],
                    "openclip_prompts": {
                        "bag_front": [
                            "Prada bag with triangular metal logo and saffiano leather",
                            "Designer handbag showing Prada Milano badge with textured leather",
                            "Luxury bag with Prada signature triangle and minimalist design"
                        ]
                    },
                    "aliases": ["PRADA", "MILANO"],
                    "hashtags": {
                        "zh": ["Prada", "普拉達", "奢侈品", "義大利精品", "時尚"],
                        "en": ["Prada", "ItalianLuxury", "DesignerBag", "LuxuryFashion", "Minimalist"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                }
            },

            # ===== Sportswear Brands =====
            'sportswear': {
                "Nike": {
                    "strong_cues": [
                        "Swoosh logo in black white or colored variations",
                        "Just Do It slogan text accompanying swoosh",
                        "Air Jordan jumpman silhouette logo",
                        "Nike Air branding on shoe midsole or tongue"
                    ],
                    "weak_cues": [
                        "Athletic footwear with sporty performance design",
                        "Sportswear with moisture-wicking technical fabric",
                        "Running shoe with cushioned midsole and branded details"
                    ],
                    "region_contexts": ["shoe_side", "apparel_chest", "equipment_surface", "logo_placement"],
                    "openclip_prompts": {
                        "shoe_side": [
                            "Nike sneaker with swoosh logo on side panel",
                            "Athletic shoe showing Nike branding and Air technology",
                            "Running shoe with Nike swoosh and performance design"
                        ],
                        "apparel_chest": [
                            "Nike athletic wear with swoosh logo on chest",
                            "Sports apparel showing Nike branding and technical fabric",
                            "Performance clothing with Nike swoosh and Just Do It text"
                        ]
                    },
                    "aliases": ["NIKE", "JUST DO IT", "swoosh"],
                    "hashtags": {
                        "zh": ["Nike", "耐吉", "運動", "球鞋", "運動品牌"],
                        "en": ["Nike", "JustDoIt", "Sneakers", "Athletic", "Sportswear"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": True
                },

                "Adidas": {
                    "strong_cues": [
                        "Three stripes design on side of shoes or apparel",
                        "Trefoil logo with three-leaf clover design",
                        "Performance logo with three bars forming mountain shape",
                        "Boost technology branding on shoe midsole"
                    ],
                    "weak_cues": [
                        "Athletic footwear with three-stripe design element",
                        "Sportswear with retro or performance styling",
                        "Running shoe with distinctive midsole technology"
                    ],
                    "region_contexts": ["shoe_side", "apparel_sleeve", "equipment_detail", "logo_area"],
                    "openclip_prompts": {
                        "shoe_side": [
                            "Adidas sneaker with three stripes on side panel",
                            "Athletic shoe showing Adidas branding and Boost sole",
                            "Sports footwear with Adidas three-stripe design"
                        ]
                    },
                    "aliases": ["ADIDAS", "ORIGINALS", "three stripes"],
                    "hashtags": {
                        "zh": ["Adidas", "愛迪達", "三條線", "運動", "球鞋"],
                        "en": ["Adidas", "ThreeStripes", "Sneakers", "Sportswear", "Athletic"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": True
                },

                "Puma": {
                    "strong_cues": [
                        "Leaping puma cat logo in silhouette form",
                        "Puma wordmark text in distinctive font",
                        "Formstrip design on side of shoes",
                        "Cat logo combined with Puma text branding"
                    ],
                    "weak_cues": [
                        "Athletic footwear with sleek performance design",
                        "Sportswear with modern styling and branding",
                        "Running shoe with lightweight construction"
                    ],
                    "region_contexts": ["shoe_side", "apparel_detail", "equipment_logo"],
                    "openclip_prompts": {
                        "shoe_side": [
                            "Puma sneaker with cat logo and formstrip design",
                            "Athletic shoe showing Puma branding on side",
                            "Sports footwear with Puma leaping cat emblem"
                        ]
                    },
                    "aliases": ["PUMA"],
                    "hashtags": {
                        "zh": ["Puma", "彪馬", "運動品牌", "球鞋"],
                        "en": ["Puma", "Sneakers", "Athletic", "Sportswear"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": True
                },

                "Under Armour": {
                    "strong_cues": [
                        "Interlocking UA logo design",
                        "HeatGear or ColdGear technology branding",
                        "Under Armour wordmark in athletic font",
                        "Performance fabric with visible texture pattern"
                    ],
                    "weak_cues": [
                        "Athletic apparel with technical performance features",
                        "Sportswear with moisture management technology",
                        "Training gear with modern athletic design"
                    ],
                    "region_contexts": ["apparel_chest", "shoe_detail", "equipment_surface"],
                    "openclip_prompts": {
                        "apparel_chest": [
                            "Under Armour shirt with UA logo on chest",
                            "Athletic wear showing Under Armour branding and HeatGear",
                            "Performance apparel with Under Armour logo and technical fabric"
                        ]
                    },
                    "aliases": ["UA", "UNDER ARMOUR"],
                    "hashtags": {
                        "zh": ["UnderArmour", "安德瑪", "運動服飾", "訓練裝備"],
                        "en": ["UnderArmour", "UA", "Athletic", "PerformanceGear", "Training"]
                    },
                    "visual_distinctive": False,
                    "text_prominent": True
                }
            },

            # ===== Tech Brands =====
            'tech': {
                "Apple": {
                    "strong_cues": [
                        "Bitten apple logo in silver white or black",
                        "Minimalist aluminum or glass device design",
                        "iPhone with distinctive notch or dynamic island",
                        "MacBook with glowing apple logo on lid"
                    ],
                    "weak_cues": [
                        "Sleek electronic device with premium materials",
                        "Smartphone with edge-to-edge display design",
                        "Laptop with thin profile and minimal branding"
                    ],
                    "region_contexts": ["device_back", "laptop_lid", "packaging", "product_front"],
                    "openclip_prompts": {
                        "device_back": [
                            "iPhone back with apple logo and camera array",
                            "Apple device showing bitten apple emblem and glass back",
                            "Smartphone with Apple branding and premium finish"
                        ],
                        "laptop_lid": [
                            "MacBook with glowing apple logo on aluminum lid",
                            "Apple laptop showing minimalist design and apple emblem",
                            "Premium notebook with Apple branding and sleek profile"
                        ]
                    },
                    "aliases": ["APPLE", "IPHONE", "IPAD", "MACBOOK", "apple logo"],
                    "hashtags": {
                        "zh": ["Apple", "蘋果", "iPhone", "科技", "蘋果產品"],
                        "en": ["Apple", "iPhone", "MacBook", "Tech", "iOS"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "Samsung": {
                    "strong_cues": [
                        "Samsung wordmark logo in blue or white",
                        "Galaxy branding on smartphone",
                        "Curved edge display on premium devices",
                        "S Pen stylus with Samsung device"
                    ],
                    "weak_cues": [
                        "Android smartphone with large display",
                        "Electronic device with modern design",
                        "Tech product with screen and branding"
                    ],
                    "region_contexts": ["device_front", "product_back", "packaging"],
                    "openclip_prompts": {
                        "device_front": [
                            "Samsung Galaxy phone with curved display and minimal bezels",
                            "Smartphone showing Samsung branding and screen",
                            "Android device with Samsung logo and modern design"
                        ]
                    },
                    "aliases": ["SAMSUNG", "Galaxy"],
                    "hashtags": {
                        "zh": ["Samsung", "三星", "Galaxy", "安卓", "科技"],
                        "en": ["Samsung", "Galaxy", "Android", "Tech", "Smartphone"]
                    },
                    "visual_distinctive": False,
                    "text_prominent": True
                },

                "Microsoft": {
                    "strong_cues": [
                        "Four-colored square window logo",
                        "Surface branding on devices",
                        "Windows logo on keyboard or device",
                        "Xbox green logo on gaming products"
                    ],
                    "weak_cues": [
                        "Premium laptop or tablet device",
                        "Gaming console or controller",
                        "Computer hardware with modern design"
                    ],
                    "region_contexts": ["device_surface", "keyboard_area", "product_branding"],
                    "openclip_prompts": {
                        "device_surface": [
                            "Microsoft Surface laptop with logo and premium build",
                            "Device showing Microsoft branding and sleek design",
                            "Surface product with distinctive kickstand and logo"
                        ]
                    },
                    "aliases": ["MICROSOFT", "Surface", "Windows"],
                    "hashtags": {
                        "zh": ["Microsoft", "微軟", "Surface", "科技", "Windows"],
                        "en": ["Microsoft", "Surface", "Windows", "Tech", "Xbox"]
                    },
                    "visual_distinctive": False,
                    "text_prominent": True
                }
            },

            # ===== Automotive Brands =====
            'automotive': {
                "Mercedes-Benz": {
                    "strong_cues": [
                        "Three-pointed star logo in circle",
                        "Mercedes-Benz wordmark on vehicle",
                        "Large star emblem on front grille",
                        "Hood ornament with standing star"
                    ],
                    "weak_cues": [
                        "Luxury vehicle with premium design",
                        "Car with elegant styling and badge",
                        "Automobile with refined details"
                    ],
                    "region_contexts": ["front_grille", "hood_ornament", "wheel_center", "badge"],
                    "openclip_prompts": {
                        "front_grille": [
                            "Mercedes-Benz front with three-pointed star on grille",
                            "Luxury car showing Mercedes logo and elegant grille design",
                            "Vehicle with Mercedes-Benz star emblem and premium styling"
                        ]
                    },
                    "aliases": ["Mercedes", "Benz", "MB", "MERCEDES-BENZ"],
                    "hashtags": {
                        "zh": ["Mercedes", "賓士", "豪華車", "汽車", "德國車"],
                        "en": ["Mercedes", "Benz", "LuxuryCar", "German", "Automotive"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "BMW": {
                    "strong_cues": [
                        "Blue and white roundel logo with BMW letters",
                        "Kidney grille design on front",
                        "Hofmeister kink in rear window design",
                        "BMW M badge for performance models"
                    ],
                    "weak_cues": [
                        "Luxury sports sedan with dynamic styling",
                        "Premium vehicle with distinctive design",
                        "Car with performance-oriented features"
                    ],
                    "region_contexts": ["front_badge", "wheel_center", "rear_emblem"],
                    "openclip_prompts": {
                        "front_badge": [
                            "BMW front with blue and white roundel and kidney grille",
                            "Luxury car showing BMW logo and distinctive grille design",
                            "Vehicle with BMW emblem and sporty styling"
                        ]
                    },
                    "aliases": ["BMW"],
                    "hashtags": {
                        "zh": ["BMW", "寶馬", "豪華車", "德國車", "性能車"],
                        "en": ["BMW", "LuxuryCar", "German", "Performance", "Ultimate Driving Machine"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                },

                "Tesla": {
                    "strong_cues": [
                        "T-shaped logo resembling cross-section of electric motor",
                        "Tesla wordmark on vehicle",
                        "Minimalist design with flush door handles",
                        "Large touchscreen display in interior"
                    ],
                    "weak_cues": [
                        "Electric vehicle with modern design",
                        "Car with clean aerodynamic styling",
                        "Automobile with minimal exterior branding"
                    ],
                    "region_contexts": ["front_badge", "rear_emblem", "wheel_center"],
                    "openclip_prompts": {
                        "front_badge": [
                            "Tesla front with T logo and minimalist design",
                            "Electric vehicle showing Tesla branding and clean styling",
                            "Car with Tesla emblem and aerodynamic profile"
                        ]
                    },
                    "aliases": ["TESLA"],
                    "hashtags": {
                        "zh": ["Tesla", "特斯拉", "電動車", "科技", "環保"],
                        "en": ["Tesla", "ElectricVehicle", "EV", "Tech", "Sustainable"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": False
                }
            },

            # ===== Watch Brands =====
            'watches': {
                "Rolex": {
                    "strong_cues": [
                        "Crown logo at 12 o'clock position",
                        "Rolex wordmark on dial with Oyster Perpetual text",
                        "Cyclops date magnifier on crystal",
                        "Jubilee or Oyster bracelet design"
                    ],
                    "weak_cues": [
                        "Luxury watch with metal bracelet",
                        "Timepiece with classic round case",
                        "Wristwatch with premium finish"
                    ],
                    "region_contexts": ["watch_dial", "bracelet_clasp", "case_side"],
                    "openclip_prompts": {
                        "watch_dial": [
                            "Rolex watch dial with crown logo and Oyster Perpetual text",
                            "Luxury timepiece showing Rolex branding and date window",
                            "Wristwatch with Rolex crown emblem and classic design"
                        ]
                    },
                    "aliases": ["ROLEX", "OYSTER PERPETUAL"],
                    "hashtags": {
                        "zh": ["Rolex", "勞力士", "手錶", "奢華", "瑞士錶"],
                        "en": ["Rolex", "LuxuryWatch", "Swiss", "Timepiece", "OysterPerpetual"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": True
                },

                "Omega": {
                    "strong_cues": [
                        "Omega symbol Ω on dial or case",
                        "Seamaster or Speedmaster model branding",
                        "Co-Axial escapement text on dial",
                        "Distinctive bracelet or strap design"
                    ],
                    "weak_cues": [
                        "Swiss luxury watch with sporty design",
                        "Timepiece with professional appearance",
                        "Wristwatch with precision craftsmanship"
                    ],
                    "region_contexts": ["watch_dial", "case_back", "bracelet"],
                    "openclip_prompts": {
                        "watch_dial": [
                            "Omega watch dial with Ω symbol and Seamaster branding",
                            "Luxury timepiece showing Omega logo and Co-Axial text",
                            "Wristwatch with Omega emblem and professional design"
                        ]
                    },
                    "aliases": ["OMEGA", "Ω"],
                    "hashtags": {
                        "zh": ["Omega", "歐米茄", "手錶", "瑞士錶", "奢華"],
                        "en": ["Omega", "Seamaster", "Speedmaster", "SwissWatch", "Luxury"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": True
                }
            },

            # ===== Fashion Brands =====
            'fashion': {
                "Zara": {
                    "strong_cues": [
                        "Zara wordmark in sans-serif font",
                        "Minimalist clothing tag design",
                        "Fast fashion styling with current trends",
                        "Zara logo on shopping bag or packaging"
                    ],
                    "weak_cues": [
                        "Contemporary fashion apparel",
                        "Trendy clothing with modern cut",
                        "Affordable fashion item"
                    ],
                    "region_contexts": ["clothing_tag", "shopping_bag", "label"],
                    "openclip_prompts": {
                        "clothing_tag": [
                            "Zara clothing tag with brand logo",
                            "Fashion item showing Zara label",
                            "Apparel with Zara branding"
                        ]
                    },
                    "aliases": ["ZARA"],
                    "hashtags": {
                        "zh": ["Zara", "時尚", "快時尚", "穿搭"],
                        "en": ["Zara", "Fashion", "FastFashion", "Style", "OOTD"]
                    },
                    "visual_distinctive": False,
                    "text_prominent": True
                },

                "H&M": {
                    "strong_cues": [
                        "H&M logo in red and white",
                        "Hennes & Mauritz full brand name",
                        "Conscious collection labeling",
                        "Distinctive red shopping bag"
                    ],
                    "weak_cues": [
                        "Affordable fashion clothing",
                        "Casual apparel with trendy design",
                        "Fast fashion item"
                    ],
                    "region_contexts": ["clothing_tag", "label", "shopping_bag"],
                    "openclip_prompts": {
                        "clothing_tag": [
                            "H&M clothing tag with red and white logo",
                            "Fashion item showing H&M branding",
                            "Apparel with Hennes & Mauritz label"
                        ]
                    },
                    "aliases": ["HM", "H&M", "HENNES", "MAURITZ"],
                    "hashtags": {
                        "zh": ["HM", "時尚", "快時尚", "平價時尚"],
                        "en": ["HM", "Fashion", "FastFashion", "Style", "AffordableFashion"]
                    },
                    "visual_distinctive": False,
                    "text_prominent": True
                },

                "Ralph Lauren": {
                    "strong_cues": [
                        "Polo player on horse logo",
                        "Polo Ralph Lauren text branding",
                        "Preppy American style clothing",
                        "Polo shirt with collar and logo"
                    ],
                    "weak_cues": [
                        "Classic American fashion item",
                        "Preppy styled clothing",
                        "Casual wear with logo detail"
                    ],
                    "region_contexts": ["shirt_chest", "clothing_tag", "logo_placement"],
                    "openclip_prompts": {
                        "shirt_chest": [
                            "Polo shirt with Ralph Lauren polo player logo",
                            "Casual wear showing Polo Ralph Lauren emblem",
                            "Apparel with Ralph Lauren polo player branding"
                        ]
                    },
                    "aliases": ["Polo", "RALPH LAUREN", "RL"],
                    "hashtags": {
                        "zh": ["RalphLauren", "Polo", "美式風格", "經典時尚"],
                        "en": ["RalphLauren", "Polo", "AmericanStyle", "Preppy", "Classic"]
                    },
                    "visual_distinctive": True,
                    "text_prominent": True
                },

                # NOTE(review): entries from here down omit "aliases",
                # "visual_distinctive" and "text_prominent".
                "Tommy Hilfiger": {
                    "strong_cues": [
                        "Red white and blue flag logo",
                        "Tommy Hilfiger wordmark text",
                        "Preppy American sportswear styling",
                        "Flag emblem on clothing"
                    ],
                    "weak_cues": [
                        "Casual American fashion",
                        "Sporty preppy clothing",
                        "Logo-embellished apparel"
                    ],
                    "region_contexts": ["clothing_chest", "tag", "logo_area"],
                    "openclip_prompts": {
                        "clothing_chest": [
                            "Tommy Hilfiger apparel with flag logo",
                            "Casual wear showing Tommy Hilfiger branding",
                            "Clothing with red white blue Tommy emblem"
                        ]
                    },
                    "hashtags": {
                        "zh": ["TommyHilfiger", "美式休閒", "時尚", "經典"],
                        "en": ["TommyHilfiger", "American", "Preppy", "Fashion", "Classic"]
                    }
                },

                "Uniqlo": {
                    "strong_cues": [
                        "Uniqlo wordmark in red and white",
                        "LifeWear philosophy branding",
                        "Minimalist Japanese design aesthetic",
                        "HeatTech or AIRism technology labels"
                    ],
                    "weak_cues": [
                        "Simple functional clothing",
                        "Basic casual apparel",
                        "Affordable everyday wear"
                    ],
                    "region_contexts": ["clothing_tag", "label", "shopping_bag"],
                    "openclip_prompts": {
                        "clothing_tag": [
                            "Uniqlo clothing tag with brand logo",
                            "Apparel showing Uniqlo LifeWear branding",
                            "Clothing with Uniqlo label and technology marker"
                        ]
                    },
                    "hashtags": {
                        "zh": ["Uniqlo", "優衣庫", "日系", "簡約", "基本款"],
                        "en": ["Uniqlo", "LifeWear", "Japanese", "Minimalist", "Basics"]
                    }
                },

                "Gap": {
                    "strong_cues": [
                        "Gap logo in blue square",
                        "Classic American casual styling",
                        "Denim and khaki product focus",
                        "Gap wordmark on tags"
                    ],
                    "weak_cues": [
                        "Casual American clothing",
                        "Basic everyday apparel",
                        "Classic wardrobe staples"
                    ],
                    "region_contexts": ["clothing_tag", "label", "logo_placement"],
                    "openclip_prompts": {
                        "clothing_tag": [
                            "Gap clothing tag with blue logo",
                            "Apparel showing Gap branding",
                            "Casual wear with Gap label"
                        ]
                    },
                    "hashtags": {
                        "zh": ["Gap", "美式休閒", "經典", "基本款"],
                        "en": ["Gap", "American", "Casual", "Classic", "Everyday"]
                    }
                },

                "Lacoste": {
                    "strong_cues": [
                        "Green crocodile logo",
                        "Polo shirt with crocodile emblem",
                        "French sportswear styling",
                        "Crocodile on left chest area"
                    ],
                    "weak_cues": [
                        "Tennis-inspired fashion",
                        "Sporty casual clothing",
                        "Preppy athletic wear"
                    ],
                    "region_contexts": ["shirt_chest", "clothing_detail", "logo_area"],
                    "openclip_prompts": {
                        "shirt_chest": [
                            "Lacoste polo shirt with green crocodile logo",
                            "Sportswear showing Lacoste emblem on chest",
                            "Tennis apparel with Lacoste crocodile branding"
                        ]
                    },
                    "hashtags": {
                        "zh": ["Lacoste", "鱷魚", "法國", "網球", "運動時尚"],
                        "en": ["Lacoste", "Crocodile", "French", "Tennis", "Sporty"]
                    }
                },

                "Calvin Klein": {
                    "strong_cues": [
                        "CK logo or Calvin Klein wordmark",
                        "Minimalist modern design aesthetic",
                        "Monochromatic color schemes",
                        "Underwear waistband with CK logo"
                    ],
                    "weak_cues": [
                        "Contemporary minimalist fashion",
                        "Modern casual clothing",
                        "Designer basics"
                    ],
                    "region_contexts": ["clothing_tag", "waistband", "logo_area"],
                    "openclip_prompts": {
                        "clothing_tag": [
                            "Calvin Klein clothing with CK logo",
                            "Apparel showing Calvin Klein minimalist branding",
                            "Fashion item with CK monogram"
                        ]
                    },
                    "hashtags": {
                        "zh": ["CalvinKlein", "CK", "簡約", "美式時尚", "現代"],
                        "en": ["CalvinKlein", "CK", "Minimalist", "Modern", "Designer"]
                    }
                },

                "Levi's": {
                    "strong_cues": [
                        "Red tab on back pocket of jeans",
                        "Two horse leather patch on waistband",
                        "501 or other style number branding",
                        "Arcuate stitching pattern on back pockets"
                    ],
                    "weak_cues": [
                        "Classic denim jeans",
                        "American workwear styling",
                        "Vintage-inspired casual wear"
                    ],
                    "region_contexts": ["jeans_pocket", "waistband_patch", "back_detail"],
                    "openclip_prompts": {
                        "jeans_pocket": [
                            "Levi's jeans with red tab on back pocket",
                            "Denim showing Levi's two horse patch and arcuate stitching",
                            "Jeans with Levi's 501 branding and classic details"
                        ]
                    },
                    "hashtags": {
                        "zh": ["Levis", "李維斯", "牛仔褲", "丹寧", "美式"],
                        "en": ["Levis", "Denim", "Jeans", "American", "501"]
                    }
                },

                "The North Face": {
                    "strong_cues": [
                        "Half dome logo design",
                        "The North Face wordmark",
                        "Outdoor technical gear styling",
                        "Logo patch on jacket or backpack"
                    ],
                    "weak_cues": [
                        "Outdoor athletic apparel",
                        "Technical outdoor gear",
                        "Adventure clothing"
                    ],
                    "region_contexts": ["jacket_chest", "backpack_front", "apparel_sleeve"],
                    "openclip_prompts": {
                        "jacket_chest": [
                            "The North Face jacket with half dome logo",
                            "Outdoor apparel showing North Face branding",
                            "Technical gear with The North Face emblem"
                        ]
                    },
                    "hashtags": {
                        "zh": ["TheNorthFace", "北臉", "戶外", "機能", "登山"],
                        "en": ["TheNorthFace", "Outdoor", "Adventure", "Technical", "Hiking"]
                    }
                }
            }
        }

        # Startup diagnostic: reports database size on construction.
        print(f"✓ Brand Prompts initialized with {self._count_brands()} brands across {len(self.brand_prompts)} categories")
|
| 844 |
+
|
| 845 |
+
def _count_brands(self) -> int:
|
| 846 |
+
"""計算總品牌數量"""
|
| 847 |
+
total = 0
|
| 848 |
+
for category in self.brand_prompts.values():
|
| 849 |
+
total += len(category)
|
| 850 |
+
return total
|
| 851 |
+
|
| 852 |
+
def get_prompts(self, brand_name: str) -> Optional[Dict]:
|
| 853 |
+
"""
|
| 854 |
+
取得特定品牌的完整 prompt 資料
|
| 855 |
+
|
| 856 |
+
Args:
|
| 857 |
+
brand_name: 品牌名稱
|
| 858 |
+
|
| 859 |
+
Returns:
|
| 860 |
+
品牌資料字典,若不存在則返回 None
|
| 861 |
+
"""
|
| 862 |
+
for category in self.brand_prompts.values():
|
| 863 |
+
if brand_name in category:
|
| 864 |
+
result = category[brand_name].copy()
|
| 865 |
+
result['category'] = self.get_brand_category(brand_name)
|
| 866 |
+
return result
|
| 867 |
+
return None
|
| 868 |
+
|
| 869 |
+
def get_brand_category(self, brand_name: str) -> str:
|
| 870 |
+
"""
|
| 871 |
+
取得品牌類別
|
| 872 |
+
|
| 873 |
+
Args:
|
| 874 |
+
brand_name: 品牌名稱
|
| 875 |
+
|
| 876 |
+
Returns:
|
| 877 |
+
品牌類別(luxury, sportswear, tech, etc.)
|
| 878 |
+
"""
|
| 879 |
+
for category_name, brands in self.brand_prompts.items():
|
| 880 |
+
if brand_name in brands:
|
| 881 |
+
return category_name
|
| 882 |
+
return 'unknown'
|
| 883 |
+
|
| 884 |
+
def get_all_brands(self) -> Dict:
|
| 885 |
+
"""
|
| 886 |
+
取得所有品牌的扁平化字典
|
| 887 |
+
|
| 888 |
+
Returns:
|
| 889 |
+
扁平化的品牌字典 {brand_name: brand_data}
|
| 890 |
+
"""
|
| 891 |
+
flat_brands = {}
|
| 892 |
+
for category_name, brands in self.brand_prompts.items():
|
| 893 |
+
for brand_name, brand_data in brands.items():
|
| 894 |
+
brand_data_copy = brand_data.copy()
|
| 895 |
+
brand_data_copy['category'] = category_name
|
| 896 |
+
flat_brands[brand_name] = brand_data_copy
|
| 897 |
+
return flat_brands
|
| 898 |
+
|
| 899 |
+
def get_brands_by_category(self, category: str) -> Dict:
|
| 900 |
+
"""
|
| 901 |
+
取得特定類別的所有品牌
|
| 902 |
+
|
| 903 |
+
Args:
|
| 904 |
+
category: 類別名稱
|
| 905 |
+
|
| 906 |
+
Returns:
|
| 907 |
+
該類別的品牌字典
|
| 908 |
+
"""
|
| 909 |
+
return self.brand_prompts.get(category, {})
|
| 910 |
+
|
| 911 |
+
def search_brand_by_alias(self, alias: str) -> Optional[str]:
|
| 912 |
+
"""
|
| 913 |
+
根據別名搜尋品牌名稱(模糊匹配)
|
| 914 |
+
|
| 915 |
+
Args:
|
| 916 |
+
alias: 品牌別名或簡稱
|
| 917 |
+
|
| 918 |
+
Returns:
|
| 919 |
+
品牌正式名稱,若找不到則返回 None
|
| 920 |
+
"""
|
| 921 |
+
alias_lower = alias.lower()
|
| 922 |
+
|
| 923 |
+
# 簡單的別名映射
|
| 924 |
+
alias_map = {
|
| 925 |
+
'lv': 'Louis Vuitton',
|
| 926 |
+
'ck': 'Calvin Klein',
|
| 927 |
+
'tnf': 'The North Face',
|
| 928 |
+
'ua': 'Under Armour',
|
| 929 |
+
'hm': 'H&M'
|
| 930 |
+
}
|
| 931 |
+
|
| 932 |
+
if alias_lower in alias_map:
|
| 933 |
+
return alias_map[alias_lower]
|
| 934 |
+
|
| 935 |
+
# 模糊匹配品牌名稱
|
| 936 |
+
for brand_name in self.get_all_brands().keys():
|
| 937 |
+
if alias_lower in brand_name.lower():
|
| 938 |
+
return brand_name
|
| 939 |
+
|
| 940 |
+
return None
|
| 941 |
+
|
| 942 |
+
def get_hashtags(self, brand_name: str, language: str = 'zh') -> List[str]:
|
| 943 |
+
"""
|
| 944 |
+
取得品牌的 hashtags
|
| 945 |
+
|
| 946 |
+
Args:
|
| 947 |
+
brand_name: 品牌名稱
|
| 948 |
+
language: 語言 ('zh', 'en', 或 'zh-en')
|
| 949 |
+
|
| 950 |
+
Returns:
|
| 951 |
+
Hashtag 列表
|
| 952 |
+
"""
|
| 953 |
+
brand_data = self.get_prompts(brand_name)
|
| 954 |
+
if not brand_data:
|
| 955 |
+
return []
|
| 956 |
+
|
| 957 |
+
hashtags = brand_data.get('hashtags', {})
|
| 958 |
+
|
| 959 |
+
if language == 'zh':
|
| 960 |
+
return hashtags.get('zh', [])
|
| 961 |
+
elif language == 'en':
|
| 962 |
+
return hashtags.get('en', [])
|
| 963 |
+
elif language == 'zh-en' or language == 'both':
|
| 964 |
+
zh_tags = hashtags.get('zh', [])
|
| 965 |
+
en_tags = hashtags.get('en', [])
|
| 966 |
+
return zh_tags + en_tags
|
| 967 |
+
else:
|
| 968 |
+
return hashtags.get('zh', [])
|
| 969 |
+
|
| 970 |
+
# Import-time marker: confirms the BrandPrompts class definition executed.
print("✓ BrandPrompts defined")
|
brand_recognition_manager.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
from PIL import Image
|
| 4 |
+
from typing import Dict, List, Tuple
|
| 5 |
+
from rapidfuzz import fuzz
|
| 6 |
+
from prompt_library_manager import PromptLibraryManager
|
| 7 |
+
from brand_detection_optimizer import BrandDetectionOptimizer
|
| 8 |
+
|
| 9 |
+
class BrandRecognitionManager:
    """Multi-modal brand recognition with detailed prompts (Visual + Text).

    Fuses three signals per candidate region:
      * OpenCLIP zero-shot similarity against context-aware brand prompts,
      * fuzzy OCR text matching against brand names and aliases,
      * adaptive per-brand weighting of the two.
    """

    def __init__(self, clip_manager, ocr_manager, prompt_library=None):
        """
        Args:
            clip_manager: exposes classify_zero_shot(image, labels) -> {label: score}.
            ocr_manager: exposes extract_text(image, use_brand_preprocessing=...).
            prompt_library: exposes get_all_brands(); effectively required —
                passing None raises AttributeError on the next line.
        """
        self.clip_manager = clip_manager
        self.ocr_manager = ocr_manager
        self.prompt_library = prompt_library
        # Flat {brand_name: brand_info} view across all categories.
        self.flat_brands = prompt_library.get_all_brands()

        # Initialize optimizer for smart brand detection (prescreening + region selection).
        self.optimizer = BrandDetectionOptimizer(clip_manager, ocr_manager, prompt_library)

        print(f"✓ Brand Recognition Manager loaded with {len(self.flat_brands)} brands (with optimizer)")

    def _avg_prompt_score(self, image: Image.Image, prompts: List[str]) -> float:
        """Average zero-shot similarity of `image` over `prompts` (0.0 for no prompts)."""
        if not prompts:
            return 0.0
        scores = self.clip_manager.classify_zero_shot(image, prompts)
        return sum(scores.values()) / len(scores) if scores else 0.0

    def recognize_brand(self, image_region: Image.Image, full_image: Image.Image,
                       region_bbox: List[int] = None) -> List[Tuple[str, float, List[int]]]:
        """Recognize brands using detailed context-aware prompts.

        Args:
            image_region: Cropped region containing potential brand
            full_image: Full image for OCR
            region_bbox: Bounding box [x1, y1, x2, y2] for visualization

        Returns:
            List of (brand_name, confidence, bbox) tuples, sorted by confidence.
        """
        # Step 1: Classify region context (bag panel, shoe side, ...).
        region_context = self._classify_region_context(image_region)
        print(f"  [DEBUG] Region context classified as: {region_context}")

        # Step 2: Score every brand with context-specific OpenCLIP prompts,
        # falling back to its generic strong cues when no context matches.
        brand_scores = {}
        for brand_name, brand_info in self.flat_brands.items():
            best_context = self._match_region_to_brand_context(region_context, brand_info['region_contexts'])
            if best_context and best_context in brand_info['openclip_prompts']:
                prompts = brand_info['openclip_prompts'][best_context]
            else:
                prompts = brand_info['strong_cues'][:5]  # Top 5 strong cues
            brand_scores[brand_name] = self._avg_prompt_score(image_region, prompts)

        # Step 2.5: Multi-scale visual matching for better robustness.
        brand_scores = self._multi_scale_visual_matching(image_region, brand_scores)

        # Step 3: OCR text matching with brand-optimized preprocessing.
        ocr_results = self.ocr_manager.extract_text(full_image, use_brand_preprocessing=True)
        text_matches = self._fuzzy_text_matching(ocr_results)

        print(f"  [DEBUG] OCR found {len(ocr_results)} text regions")
        if text_matches:
            print(f"  [DEBUG] OCR brand matches: {text_matches}")

        # Step 4: Adaptive weighted fusion (dynamic weights per brand).
        final_scores = {}
        for brand_name in self.flat_brands.keys():
            visual_score = brand_scores.get(brand_name, 0.0)
            text_score, ocr_conf = text_matches.get(brand_name, (0.0, 0.0))

            # Weights depend on whether the brand is visually distinctive or
            # text-prominent, and on how strong each signal is right now.
            visual_weight, text_weight, ocr_weight = self._calculate_adaptive_weights(
                brand_name, visual_score, text_score, ocr_conf
            )

            final_scores[brand_name] = (
                visual_weight * self._scale_visual(visual_score) +
                text_weight * text_score +
                ocr_weight * ocr_conf
            )

        sorted_scores = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:5]
        print(f"  [DEBUG] Top 5 brand scores:")
        for brand, score in sorted_scores:
            print(f"    {brand}: {score:.4f} (visual={brand_scores.get(brand, 0):.4f}, text={text_matches.get(brand, (0, 0))[0]:.4f})")

        # Return confident matches with bounding boxes.
        confident_brands = []
        for brand_name, score in final_scores.items():
            if score > 0.10:
                confident_brands.append((brand_name, score, region_bbox))
                print(f"  [DEBUG] ✓ Brand detected: {brand_name} (confidence: {score:.4f})")

        confident_brands.sort(key=lambda x: x[1], reverse=True)

        if not confident_brands:
            print(f"  [DEBUG] ✗ No brands passed threshold 0.10")

        return confident_brands

    def _classify_region_context(self, image_region: Image.Image) -> str:
        """Classify what type of region this is (bag_panel, shoe_side, etc.)."""
        context_labels = [
            'bag panel with pattern',
            'luggage surface with branding',
            'luxury trunk with monogram pattern',
            'vintage travel trunk with hardware',
            'shoe side view',
            'device back cover',
            'apparel chest area',
            'belt buckle',
            'storefront sign',
            'product tag or label',
            'wallet surface',
            'perfume bottle',
            'watch dial or face',
            'car front grille',
            'laptop lid'
        ]

        scores = self.clip_manager.classify_zero_shot(image_region, context_labels)

        # Map verbose CLIP labels to the simplified context keys used by the
        # brand prompt library's region_contexts entries.
        context_mapping = {
            'bag panel with pattern': 'bag_panel',
            'luggage surface with branding': 'luggage_surface',
            'luxury trunk with monogram pattern': 'trunk_body',
            'vintage travel trunk with hardware': 'trunk_body',
            'shoe side view': 'shoe_side',
            'device back cover': 'device_back',
            'apparel chest area': 'apparel_chest',
            'belt buckle': 'belt_buckle',
            'storefront sign': 'storefront',
            'product tag or label': 'product_tag',
            'wallet surface': 'wallet',
            'perfume bottle': 'perfume_bottle',
            'watch dial or face': 'watch_dial',
            'car front grille': 'car_front',
            'laptop lid': 'laptop_lid'
        }

        top_context = max(scores.items(), key=lambda x: x[1])[0]
        return context_mapping.get(top_context, 'unknown')

    def _match_region_to_brand_context(self, region_context: str, brand_contexts: List[str]) -> str:
        """Match detected region context to brand's available contexts.

        Returns the matching context key, or None when nothing matches
        (NOTE: annotation says str, but None is a deliberate sentinel).
        """
        if region_context in brand_contexts:
            return region_context
        # Fuzzy matching: share at least the first underscore-delimited token.
        for brand_context in brand_contexts:
            if region_context.split('_')[0] in brand_context:
                return brand_context
        return None

    def _fuzzy_text_matching(self, ocr_results: List[Dict]) -> Dict[str, Tuple[float, float]]:
        """Fuzzy text matching using brand aliases (optimized for logo text).

        Returns:
            {brand_name: (best_match_ratio, ocr_confidence)} for every brand
            whose name/alias matched some OCR text above the 0.70 ratio.
        """
        matches = {}

        for ocr_item in ocr_results:
            text = ocr_item['text']
            conf = ocr_item['confidence']

            for brand_name, brand_info in self.flat_brands.items():
                # Check the canonical name plus all aliases.
                all_names = [brand_name] + brand_info.get('aliases', [])

                for alias in all_names:
                    ratio = fuzz.ratio(text, alias) / 100.0
                    if ratio > 0.70:  # Lowered threshold for better recall
                        # Keep the best ratio seen for this brand.
                        if brand_name not in matches or ratio > matches[brand_name][0]:
                            matches[brand_name] = (ratio, conf)

        return matches

    def _scale_visual(self, score: float) -> float:
        """Scale a raw visual score through a sigmoid centered at 0.5."""
        return 1 / (1 + math.exp(-10 * (score - 0.5)))

    def _calculate_adaptive_weights(self, brand_name: str, visual_score: float,
                                    text_score: float, ocr_conf: float) -> tuple:
        """
        Calculate adaptive weights based on brand characteristics and signal strengths.

        Args:
            brand_name: Name of the brand
            visual_score: Visual similarity score
            text_score: Text matching score
            ocr_conf: OCR confidence

        Returns:
            Tuple of (visual_weight, text_weight, ocr_weight), normalized to sum to 1.
        """
        # NOTE(review): relies on prompt_library.get_brand_prompts(); confirm
        # this method exists on the injected library (BrandPrompts exposes
        # get_prompts) — a missing attribute here would raise at runtime.
        brand_info = self.prompt_library.get_brand_prompts(brand_name)

        if not brand_info:
            # Default balanced weights.
            return 0.50, 0.30, 0.20

        # Base weights from brand characteristics.
        if brand_info.get('visual_distinctive', False):
            # Visually distinctive brands (LV, Burberry).
            visual_weight = 0.65
            text_weight = 0.20
            ocr_weight = 0.15
        elif brand_info.get('text_prominent', False):
            # Text-prominent brands (Nike, Adidas).
            visual_weight = 0.30
            text_weight = 0.30
            ocr_weight = 0.40
        else:
            # Balanced for general brands.
            visual_weight = 0.50
            text_weight = 0.30
            ocr_weight = 0.20

        # Dynamic adjustment: boost whichever signal is currently very strong.
        if visual_score > 0.7:
            boost = 0.10
            visual_weight += boost
            text_weight -= boost * 0.5
            ocr_weight -= boost * 0.5

        if ocr_conf > 0.85:
            boost = 0.10
            ocr_weight += boost
            visual_weight -= boost * 0.6
            text_weight -= boost * 0.4

        if text_score > 0.80:
            boost = 0.08
            text_weight += boost
            visual_weight -= boost * 0.5
            ocr_weight -= boost * 0.5

        # Normalize so the weights always sum to 1.
        total = visual_weight + text_weight + ocr_weight
        return visual_weight / total, text_weight / total, ocr_weight / total

    def _multi_scale_visual_matching(self, image_region: Image.Image,
                                     initial_scores: Dict[str, float]) -> Dict[str, float]:
        """
        Apply multi-scale matching to improve robustness.

        Args:
            image_region: Image region to analyze
            initial_scores: Initial brand scores from single-scale matching

        Returns:
            Brand scores taking the best score across scales; brands for which
            every scale was skipped keep their initial score.
        """
        scales = [0.8, 1.0, 1.2]  # Three scales
        multi_scale_scores = {brand: [] for brand in initial_scores.keys()}

        for scale in scales:
            new_width = int(image_region.width * scale)
            new_height = int(image_region.height * scale)

            # Skip scales that would shrink the crop below a usable size.
            if new_width < 50 or new_height < 50:
                continue

            try:
                scaled_img = image_region.resize((new_width, new_height), Image.Resampling.LANCZOS)

                for brand_name, brand_info in self.flat_brands.items():
                    best_context = self._match_region_to_brand_context(
                        'bag_panel',  # Default context; TODO: pass real region context in
                        brand_info.get('region_contexts', [])
                    )

                    if best_context and best_context in brand_info.get('openclip_prompts', {}):
                        prompts = brand_info['openclip_prompts'][best_context]
                    else:
                        prompts = brand_info.get('strong_cues', [])[:3]

                    multi_scale_scores[brand_name].append(self._avg_prompt_score(scaled_img, prompts))

            except Exception:
                # Skip this scale entirely if resizing/classification fails.
                continue

        # Aggregate: best score seen at any scale, else the single-scale score.
        final_scores = {}
        for brand_name, scores in multi_scale_scores.items():
            final_scores[brand_name] = max(scores) if scores else initial_scores.get(brand_name, 0.0)

        return final_scores

    def scan_full_image_for_brands(self, full_image: Image.Image,
                                   exclude_bboxes: List[List[int]] = None,
                                   saliency_regions: List[Dict] = None) -> List[Tuple[str, float, List[int]]]:
        """
        Smart full-image brand scan (performance-optimized).

        Uses brand prescreening and intelligent region selection to cut
        detection time drastically.

        Args:
            full_image: PIL Image (full image)
            exclude_bboxes: List of bboxes to exclude (already detected)
            saliency_regions: Saliency detection results for smart region selection

        Returns:
            List of (brand_name, confidence, bbox) tuples (top 5 by confidence).
        """
        if exclude_bboxes is None:
            exclude_bboxes = []

        detected_brands = {}  # brand_name -> (confidence, bbox)

        # OPTIMIZATION 1: quick prescreening narrows the candidate brand set.
        likely_brands = self.optimizer.quick_brand_prescreening(full_image)
        print(f"   Quick prescreening found {len(likely_brands)} potential brands")

        # OPTIMIZATION 2: only scan meaningful regions.
        regions_to_scan = self.optimizer.smart_region_selection(full_image, saliency_regions or [])
        print(f"   Scanning {len(regions_to_scan)} intelligent regions")

        # FIX: OCR depends only on the full image, so run it exactly once here.
        # Previously it was re-run inside the region × brand loops, multiplying
        # the most expensive step by |regions| * |brands|.
        ocr_results = self.ocr_manager.extract_text(full_image, use_brand_preprocessing=True)

        for region_bbox in regions_to_scan:
            # Skip regions overlapping already-detected boxes.
            if self._bbox_overlap(list(region_bbox), exclude_bboxes):
                continue

            region = full_image.crop(region_bbox)

            # Only score the prescreened brands (instead of all 20+).
            for brand_name in likely_brands:
                brand_info = self.flat_brands.get(brand_name)
                if not brand_info:
                    continue

                # Only use strong_cues for this coarse pass.
                strong_cues = brand_info.get('strong_cues', [])[:5]  # Top 5
                if not strong_cues:
                    continue

                avg_score = self._avg_prompt_score(region, strong_cues)

                # OCR-based confidence boost on top of the visual score.
                boosted_score = self.optimizer.compute_brand_confidence_boost(
                    brand_name, ocr_results, avg_score
                )

                # Extremely lenient threshold (0.08) to maximize recall.
                if boosted_score > 0.08:
                    # Keep the best-scoring region per brand.
                    if brand_name not in detected_brands or boosted_score > detected_brands[brand_name][0]:
                        detected_brands[brand_name] = (boosted_score, list(region_bbox))

        final_brands = [
            (brand_name, confidence, bbox)
            for brand_name, (confidence, bbox) in detected_brands.items()
        ]

        # Sort by confidence, best first.
        final_brands.sort(key=lambda x: x[1], reverse=True)

        return final_brands[:5]  # Return the top 5

    def _bbox_overlap(self, bbox1: List[int], bbox_list: List[List[int]]) -> bool:
        """Return True if bbox1 overlaps any box in bbox_list by more than 30% of bbox1's area."""
        if not bbox_list:
            return False

        x1_1, y1_1, x2_1, y2_1 = bbox1

        bbox1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
        # FIX: a degenerate (zero-area) box cannot meaningfully overlap anything
        # and previously caused a ZeroDivisionError in the ratio below.
        if bbox1_area <= 0:
            return False

        for bbox2 in bbox_list:
            if bbox2 is None:
                continue

            x1_2, y1_2, x2_2, y2_2 = bbox2

            # Intersection rectangle.
            x_left = max(x1_1, x1_2)
            y_top = max(y1_1, y1_2)
            x_right = min(x2_1, x2_2)
            y_bottom = min(y2_1, y2_2)

            if x_right < x_left or y_bottom < y_top:
                continue  # no intersection

            intersection_area = (x_right - x_left) * (y_bottom - y_top)

            # Overlap of more than 30% of bbox1 counts as overlapping.
            if intersection_area / bbox1_area > 0.3:
                return True

        return False
|
| 419 |
+
|
| 420 |
+
# Import-time marker: confirms the BrandRecognitionManager class definition executed.
print("✓ BrandRecognitionManager (with full-image scan for commercial use) defined")
|
brand_verification_manager.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from typing import List, Dict, Tuple
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from caption_generation_manager import CaptionGenerationManager
|
| 8 |
+
|
| 9 |
+
class BrandVerificationManager:
|
| 10 |
+
"""VLM-based brand verification and three-way voting system"""
|
| 11 |
+
|
| 12 |
+
def __init__(self, caption_generator: CaptionGenerationManager = None):
|
| 13 |
+
"""
|
| 14 |
+
Args:
|
| 15 |
+
caption_generator: CaptionGenerationManager instance for VLM access
|
| 16 |
+
"""
|
| 17 |
+
if caption_generator is None:
|
| 18 |
+
caption_generator = CaptionGenerationManager()
|
| 19 |
+
|
| 20 |
+
self.caption_generator = caption_generator
|
| 21 |
+
|
| 22 |
+
# Confidence mapping for VLM responses
|
| 23 |
+
self.confidence_map = {
|
| 24 |
+
'high': 0.9,
|
| 25 |
+
'medium': 0.7,
|
| 26 |
+
'low': 0.5,
|
| 27 |
+
'very high': 0.95,
|
| 28 |
+
'very low': 0.3
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
print("✓ Brand Verification Manager initialized with VLM")
|
| 32 |
+
|
| 33 |
+
    def verify_brands(self, image: Image.Image, detected_brands: List[Tuple[str, float, list]]) -> Dict:
        """
        Use VLM to verify detected brands.

        Builds a structured-JSON verification prompt from the top 3 detections
        and asks the VLM to confirm, reject, or add brands based on visual
        evidence. On any VLM failure, falls back to accepting the original
        detections at 'medium' confidence.

        Args:
            image: PIL Image
            detected_brands: List of (brand_name, confidence, bbox) tuples

        Returns:
            Dictionary with keys 'verified_brands', 'false_positives',
            'additional_brands' (and 'confidence' only in the empty-input case).
        """
        # Nothing to verify: return an empty result shell.
        if not detected_brands:
            return {
                'verified_brands': [],
                'false_positives': [],
                'additional_brands': [],
                'confidence': 0.0
            }

        # Construct verification prompt from the top 3 detections.
        brand_list = ', '.join([f"{brand[0]} (confidence: {brand[1]:.2f})"
                                for brand in detected_brands[:3]])  # Top 3 brands

        # Doubled braces in the f-string below are literal JSON braces.
        verification_prompt = f"""Analyze this image carefully. Our computer vision system detected the following brands: {brand_list}.

Please verify each brand identification:

1. Are these brand identifications correct based on visible logos, patterns, text, or distinctive features?
2. If incorrect, what brands do you actually see (if any)?
3. Describe the visual evidence (logo shape, text, pattern, color scheme, hardware) that supports your conclusion.

Respond in JSON format:
{{
  "verified_brands": [
    {{"name": "Brand Name", "confidence": "high/medium/low", "evidence": "description of visual evidence"}}
  ],
  "false_positives": ["brand names that were incorrectly detected"],
  "additional_brands": ["brands we missed but you can see"]
}}

IMPORTANT: Only include brands you can clearly identify with visual evidence. If unsure, use "low" confidence."""

        # Generate VLM response and parse its JSON payload.
        try:
            response = self._generate_vlm_response(image, verification_prompt)
            parsed_result = self._parse_verification_response(response)
            return parsed_result

        except Exception as e:
            print(f"VLM verification error: {e}")
            # Fallback to original detections: keep every detected brand at
            # 'medium' confidence rather than dropping results on VLM failure.
            return {
                'verified_brands': [
                    {'name': brand[0], 'confidence': 'medium', 'evidence': 'VLM verification failed'}
                    for brand in detected_brands
                ],
                'false_positives': [],
                'additional_brands': []
            }
|
| 92 |
+
|
| 93 |
+
def three_way_voting(self, openclip_brands: List[Tuple], ocr_brands: Dict,
|
| 94 |
+
vlm_result: Dict) -> List[Tuple[str, float, list]]:
|
| 95 |
+
"""
|
| 96 |
+
Three-way voting: OpenCLIP vs OCR vs VLM
|
| 97 |
+
|
| 98 |
+
Args:
|
| 99 |
+
openclip_brands: List of (brand_name, confidence, bbox) from OpenCLIP
|
| 100 |
+
ocr_brands: Dict of {brand_name: (text_score, ocr_conf)} from OCR
|
| 101 |
+
vlm_result: Verification result from VLM
|
| 102 |
+
|
| 103 |
+
Returns:
|
| 104 |
+
List of (brand_name, final_confidence, bbox) tuples
|
| 105 |
+
"""
|
| 106 |
+
votes = {} # brand_name -> {votes: int, sources: list, bbox: list}
|
| 107 |
+
confidence_scores = {} # brand_name -> list of (source, confidence)
|
| 108 |
+
|
| 109 |
+
# Vote 1: OpenCLIP
|
| 110 |
+
for brand_name, confidence, bbox in openclip_brands:
|
| 111 |
+
if brand_name not in votes:
|
| 112 |
+
votes[brand_name] = {'votes': 0, 'sources': [], 'bbox': bbox}
|
| 113 |
+
confidence_scores[brand_name] = []
|
| 114 |
+
|
| 115 |
+
votes[brand_name]['votes'] += 1
|
| 116 |
+
votes[brand_name]['sources'].append('openclip')
|
| 117 |
+
confidence_scores[brand_name].append(('openclip', confidence * 0.8))
|
| 118 |
+
|
| 119 |
+
# Vote 2: OCR
|
| 120 |
+
for brand_name, (text_score, ocr_conf) in ocr_brands.items():
|
| 121 |
+
if brand_name not in votes:
|
| 122 |
+
# OCR found a brand not detected by OpenCLIP
|
| 123 |
+
votes[brand_name] = {'votes': 0, 'sources': [], 'bbox': None}
|
| 124 |
+
confidence_scores[brand_name] = []
|
| 125 |
+
|
| 126 |
+
votes[brand_name]['votes'] += 1
|
| 127 |
+
votes[brand_name]['sources'].append('ocr')
|
| 128 |
+
combined_ocr_score = (text_score + ocr_conf) / 2
|
| 129 |
+
confidence_scores[brand_name].append(('ocr', combined_ocr_score * 0.7))
|
| 130 |
+
|
| 131 |
+
# Vote 3: VLM (double weight - most reliable)
|
| 132 |
+
for brand_info in vlm_result.get('verified_brands', []):
|
| 133 |
+
brand_name = brand_info['name']
|
| 134 |
+
vlm_confidence_level = brand_info.get('confidence', 'medium')
|
| 135 |
+
vlm_confidence = self.confidence_map.get(vlm_confidence_level.lower(), 0.7)
|
| 136 |
+
|
| 137 |
+
if brand_name not in votes:
|
| 138 |
+
# VLM found a brand missed by both OpenCLIP and OCR
|
| 139 |
+
votes[brand_name] = {'votes': 0, 'sources': [], 'bbox': None}
|
| 140 |
+
confidence_scores[brand_name] = []
|
| 141 |
+
|
| 142 |
+
votes[brand_name]['votes'] += 2 # VLM gets double vote
|
| 143 |
+
votes[brand_name]['sources'].append('vlm')
|
| 144 |
+
confidence_scores[brand_name].append(('vlm', vlm_confidence))
|
| 145 |
+
|
| 146 |
+
# Remove false positives flagged by VLM
|
| 147 |
+
for false_positive in vlm_result.get('false_positives', []):
|
| 148 |
+
if false_positive in votes:
|
| 149 |
+
# Reduce votes significantly
|
| 150 |
+
votes[false_positive]['votes'] = max(0, votes[false_positive]['votes'] - 2)
|
| 151 |
+
|
| 152 |
+
# Calculate final scores
|
| 153 |
+
final_brands = []
|
| 154 |
+
for brand_name, vote_info in votes.items():
|
| 155 |
+
if vote_info['votes'] <= 0:
|
| 156 |
+
continue # Skip brands with no votes
|
| 157 |
+
|
| 158 |
+
# Calculate weighted average confidence
|
| 159 |
+
scores = confidence_scores.get(brand_name, [])
|
| 160 |
+
if not scores:
|
| 161 |
+
continue
|
| 162 |
+
|
| 163 |
+
# VLM has highest weight, OpenCLIP medium, OCR lowest
|
| 164 |
+
weighted_sum = 0.0
|
| 165 |
+
weight_total = 0.0
|
| 166 |
+
|
| 167 |
+
for source, score in scores:
|
| 168 |
+
if source == 'vlm':
|
| 169 |
+
weight = 1.0
|
| 170 |
+
elif source == 'openclip':
|
| 171 |
+
weight = 0.6
|
| 172 |
+
else: # ocr
|
| 173 |
+
weight = 0.4
|
| 174 |
+
|
| 175 |
+
weighted_sum += score * weight
|
| 176 |
+
weight_total += weight
|
| 177 |
+
|
| 178 |
+
avg_confidence = weighted_sum / weight_total if weight_total > 0 else 0.0
|
| 179 |
+
|
| 180 |
+
# Boost confidence if multiple sources agree
|
| 181 |
+
if vote_info['votes'] >= 2:
|
| 182 |
+
avg_confidence *= 1.15 # 15% boost for agreement
|
| 183 |
+
|
| 184 |
+
# Cap at 0.95
|
| 185 |
+
avg_confidence = min(avg_confidence, 0.95)
|
| 186 |
+
|
| 187 |
+
# Only include if confidence is reasonable
|
| 188 |
+
if avg_confidence > 0.30:
|
| 189 |
+
final_brands.append((brand_name, avg_confidence, vote_info['bbox']))
|
| 190 |
+
|
| 191 |
+
# Sort by confidence
|
| 192 |
+
final_brands.sort(key=lambda x: x[1], reverse=True)
|
| 193 |
+
|
| 194 |
+
return final_brands
|
| 195 |
+
|
| 196 |
+
def extract_visual_evidence(self, image: Image.Image, brand_name: str) -> Dict:
|
| 197 |
+
"""
|
| 198 |
+
Extract detailed visual evidence for identified brand
|
| 199 |
+
|
| 200 |
+
Args:
|
| 201 |
+
image: PIL Image
|
| 202 |
+
brand_name: Identified brand name
|
| 203 |
+
|
| 204 |
+
Returns:
|
| 205 |
+
Dictionary with evidence description
|
| 206 |
+
"""
|
| 207 |
+
evidence_prompt = f"""You identified {brand_name} in this image. Please describe the specific visual evidence:
|
| 208 |
+
|
| 209 |
+
1. Logo appearance: Describe the logo's shape, style, color, and exact location in the image
|
| 210 |
+
2. Text elements: What text did you see? (exact wording, font style, placement)
|
| 211 |
+
3. Distinctive patterns: Any signature patterns, textures, or design elements
|
| 212 |
+
4. Color scheme: Brand-specific colors used
|
| 213 |
+
5. Product features: Distinctive product design characteristics
|
| 214 |
+
|
| 215 |
+
Be specific and detailed. Focus on objective visual features."""
|
| 216 |
+
|
| 217 |
+
try:
|
| 218 |
+
evidence_description = self._generate_vlm_response(image, evidence_prompt)
|
| 219 |
+
|
| 220 |
+
return {
|
| 221 |
+
'brand': brand_name,
|
| 222 |
+
'evidence_description': evidence_description,
|
| 223 |
+
'timestamp': datetime.now().isoformat()
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
except Exception as e:
|
| 227 |
+
return {
|
| 228 |
+
'brand': brand_name,
|
| 229 |
+
'evidence_description': f"Evidence extraction failed: {str(e)}",
|
| 230 |
+
'timestamp': datetime.now().isoformat()
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
    def _generate_vlm_response(self, image: Image.Image, prompt: str) -> str:
        """
        Generate a VLM response for a given image and text prompt.

        Runs the full Qwen-VL chat pipeline: chat-template formatting,
        vision preprocessing, generation, and decoding of only the newly
        generated tokens. Uses a low temperature to keep answers factual.

        Args:
            image: PIL Image
            prompt: Text prompt

        Returns:
            VLM response string (generated tokens only, input prompt trimmed)
        """
        # Local import: qwen_vl_utils is only needed on this code path.
        from qwen_vl_utils import process_vision_info

        # Single-turn chat message mixing the image and the text prompt.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }]

        # NOTE(review): relies on self.caption_generator exposing a loaded
        # `processor` and `model` (see CaptionGenerationManager) — not set here.
        text = self.caption_generator.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Extract the vision tensors referenced by the chat messages.
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.caption_generator.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to(self.caption_generator.model.device)

        # Generate with low temperature for factual responses
        generation_config = {
            'temperature': 0.3,  # Low temperature for factual verification
            'top_p': 0.9,
            'max_new_tokens': 300,
            'repetition_penalty': 1.1
        }

        generated_ids = self.caption_generator.model.generate(
            **inputs,
            **generation_config
        )

        # Trim input tokens: keep only the tokens generated after the prompt.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Batch of one -> decode and return the single response string.
        output_text = self.caption_generator.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        return output_text
|
| 292 |
+
|
| 293 |
+
def _parse_verification_response(self, response: str) -> Dict:
|
| 294 |
+
"""
|
| 295 |
+
Parse VLM verification response
|
| 296 |
+
|
| 297 |
+
Args:
|
| 298 |
+
response: VLM response string
|
| 299 |
+
|
| 300 |
+
Returns:
|
| 301 |
+
Parsed dictionary
|
| 302 |
+
"""
|
| 303 |
+
try:
|
| 304 |
+
# Try to extract JSON from response
|
| 305 |
+
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
| 306 |
+
if json_match:
|
| 307 |
+
result = json.loads(json_match.group())
|
| 308 |
+
return result
|
| 309 |
+
except json.JSONDecodeError:
|
| 310 |
+
pass
|
| 311 |
+
|
| 312 |
+
# Fallback: rule-based parsing
|
| 313 |
+
return self._rule_based_parse(response)
|
| 314 |
+
|
| 315 |
+
def _rule_based_parse(self, response: str) -> Dict:
|
| 316 |
+
"""
|
| 317 |
+
Fallback rule-based parsing if JSON fails
|
| 318 |
+
|
| 319 |
+
Args:
|
| 320 |
+
response: VLM response string
|
| 321 |
+
|
| 322 |
+
Returns:
|
| 323 |
+
Parsed dictionary
|
| 324 |
+
"""
|
| 325 |
+
result = {
|
| 326 |
+
'verified_brands': [],
|
| 327 |
+
'false_positives': [],
|
| 328 |
+
'additional_brands': []
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
# Simple pattern matching
|
| 332 |
+
lines = response.lower().split('\n')
|
| 333 |
+
|
| 334 |
+
for line in lines:
|
| 335 |
+
# Look for brand names mentioned with positive sentiment
|
| 336 |
+
if any(word in line for word in ['correct', 'yes', 'visible', 'see', 'identified']):
|
| 337 |
+
# Extract potential brand names (capitalize words)
|
| 338 |
+
words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', response)
|
| 339 |
+
for word in words:
|
| 340 |
+
if len(word) > 2: # Avoid short words
|
| 341 |
+
result['verified_brands'].append({
|
| 342 |
+
'name': word,
|
| 343 |
+
'confidence': 'medium',
|
| 344 |
+
'evidence': 'Extracted from VLM response'
|
| 345 |
+
})
|
| 346 |
+
|
| 347 |
+
return result
|
| 348 |
+
|
| 349 |
+
print("✓ BrandVerificationManager (VLM verification and voting) defined")
|
brand_visualization_manager.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 5 |
+
from typing import List, Tuple, Dict
|
| 6 |
+
|
| 7 |
+
class BrandVisualizationManager:
    """Visualize detected brands with bounding boxes and labels (like YOLO)"""

    def __init__(self):
        """Set up the per-category color palette used for boxes and labels."""
        # One color per brand category; unknown categories use 'default'.
        self.colors = {
            'luxury': (218, 165, 32),      # Gold
            'sportswear': (0, 191, 255),   # Deep Sky Blue
            'tech': (169, 169, 169),       # Dark Gray
            'automotive': (220, 20, 60),   # Crimson
            'fashion': (186, 85, 211),     # Medium Orchid
            'watches': (184, 134, 11),     # Dark Goldenrod
            'default': (0, 255, 0)         # Green
        }

        print("✓ Brand Visualization Manager initialized")

    def draw_brand_detections(self, image: Image.Image, brand_detections: List[Dict],
                              show_confidence: bool = True) -> Image.Image:
        """Draw bounding boxes and labels for detected brands

        Args:
            image: PIL Image
            brand_detections: List of dicts with keys: 'name', 'confidence', 'bbox', 'category'
            show_confidence: Whether to show confidence scores

        Returns:
            Image with drawn bounding boxes
        """
        if not brand_detections:
            return image

        # Draw with OpenCV primitives, so convert to BGR once up front.
        frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        # Text rendering parameters are constant across detections.
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.7
        thickness = 2

        for det in brand_detections:
            box = det.get('bbox')
            if box is None:
                continue  # Nothing to draw without coordinates

            name = det.get('name', 'Unknown')
            score = det.get('confidence', 0.0)
            color = self.colors.get(det.get('category', 'default'), self.colors['default'])

            left, top, right, bottom = (int(v) for v in box)

            # Bounding box
            cv2.rectangle(frame, (left, top), (right, bottom), color, 3)

            # Label text, optionally with the confidence score appended.
            label = f"{name} {score:.2f}" if show_confidence else name

            (text_w, text_h), _baseline = cv2.getTextSize(label, font, font_scale, thickness)

            # Filled background strip above the box, then white label on top.
            cv2.rectangle(frame,
                          (left, top - text_h - 10),
                          (left + text_w + 10, top),
                          color, -1)
            cv2.putText(frame, label,
                        (left + 5, top - 5),
                        font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)

        # Back to PIL/RGB for the caller.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    def format_brand_list(self, brand_detections: List[Dict]) -> str:
        """Format brand detections as readable text

        Args:
            brand_detections: List of brand detection dicts

        Returns:
            Formatted string with brand names and confidences
        """
        if not brand_detections:
            return "No brands identified"

        # Category labels are intentionally omitted to keep the output compact.
        return ", ".join(
            f"{det.get('name', 'Unknown')} ({det.get('confidence', 0.0):.2f})"
            for det in brand_detections
        )

print("✓ BrandVisualizationManager defined")
|
caption_generation_manager.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
| 3 |
+
from qwen_vl_utils import process_vision_info
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
import json
|
| 7 |
+
from opencc import OpenCC
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
class CaptionGenerationManager:
|
| 11 |
+
"""Caption generation using Vision-Language Models (supports Qwen2.5-VL, Qwen3-VL, etc.)"""
|
| 12 |
+
|
| 13 |
+
    def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-7B-Instruct"):
        """
        Load the Vision-Language model, its processor, and caption settings.

        NOTE: downloads/loads multi-GB model weights (device_map="auto"),
        so construction is slow and requires significant GPU/CPU memory.

        Args:
            model_name: Vision-Language model name, e.g.:
                - "Qwen/Qwen2.5-VL-7B-Instruct" (default)
                - "Qwen/Qwen3-VL-8B-Instruct" (2025 latest)
        """
        print(f"Loading Vision-Language Model: {model_name}...")

        # Suppress processor warning
        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

        # Use Auto* classes for flexibility (supports Qwen2.5-VL, Qwen3-VL, etc.)
        self.processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_name,
            dtype=torch.bfloat16, # Changed from torch_dtype to dtype
            device_map="auto"
        )

        # Simplified Chinese to Traditional Chinese converter
        self.cc = OpenCC('s2t') # Simplified to Traditional

        # Default sampling settings used for caption generation.
        self.generation_config = {
            'temperature': 0.7,
            'top_p': 0.9,
            'max_new_tokens': 300, # Increased from 200 to prevent truncation
            'repetition_penalty': 1.1
        }

        # Platform-specific templates
        # Each entry drives prompt construction: tone, emoji/hashtag budget,
        # and caption length bounds (in characters) for that platform.
        self.platform_templates = {
            'instagram': {
                'style': 'storytelling, aesthetic',
                'emoji_count': '2-3',
                'hashtag_count': '8-10',
                'min_length': 120, # Increased for richer content
                'max_length': 220, # Allow more detailed descriptions
                'features': ['call-to-action', 'question', 'relatable']
            },
            'tiktok': {
                'style': 'brief, punchy',
                'emoji_count': '1-2',
                'hashtag_count': '5-8',
                'min_length': 60,
                'max_length': 120,
                'features': ['trending', 'POV', 'relatable']
            },
            'xiaohongshu': {
                'style': 'structured, informative, detailed',
                'emoji_count': '5-8',
                'hashtag_count': '8-12',
                'min_length': 180,
                'max_length': 500,
                'features': ['tips', 'bullets', 'sharing-tone']
            }
        }

        print(f"✓ {model_name.split('/')[-1]} loaded successfully (using Auto* classes for flexibility)")
|
| 72 |
+
|
| 73 |
+
    def construct_prompt(self, analysis_results: Dict, platform: str = 'instagram', language: str = 'zh') -> str:
        """Construct prompt with language support ensuring consistency

        Builds the full system + context + output-format prompt fed to the
        VLM for caption generation, with strict language, lighting, brand and
        hashtag constraints embedded as instructions.

        Args:
            analysis_results: Pipeline analysis dict (keys used here:
                'detections', 'brands', 'scene_analysis', 'composition')
            platform: One of self.platform_templates keys; unknown values
                fall back to 'instagram'
            language: 'zh' (Traditional Chinese), 'en' (English), 'zh-en' (Bilingual)

        Returns:
            The complete prompt string.
        """
        platform_config = self.platform_templates.get(platform, self.platform_templates['instagram'])

        # Language-specific instructions
        language_instructions = {
            'zh': '請使用繁體中文生成標題和標籤。語言要自然流暢,符合華語社群媒體的表達習慣。避免使用簡體字。當偵測到品牌時,必須在標題中提及品牌名稱。',
            'en': '''🚨 CRITICAL LANGUAGE REQUIREMENT 🚨
Generate captions and hashtags EXCLUSIVELY in English.
- NEVER use Chinese characters (Traditional or Simplified)
- NEVER mix languages
- Use natural, engaging language suitable for international social media
- When brands are detected, mention them naturally in English
- All text output must be 100% English only
This is MANDATORY and NON-NEGOTIABLE.''',
            'zh-en': '''生成雙語內容:標題使用繁體中文,同時提供英文翻譯。標籤混合使用中英文以擴大觸及範圍。當偵測到品牌時,必須在標題中提及品牌名稱。

🚨 重要:雙語一致性要求 🚨
- 中文和英文必須表達相同的核心意義
- 允許表達方式的差異(形容詞、語法不同)
- 但整體訊息、語氣、品牌提及必須一致
- 兩種語言都要朝同一方向詮釋內容'''
        }

        system_instruction = f"""You are a professional social media content strategist.

{language_instructions.get(language, language_instructions['zh'])}

Target platform: {platform}
Content style: Authentic, creative, and optimized for engagement.

CRITICAL RULE: Never include hashtags (symbols starting with #) in the caption text. Hashtags must only appear in the separate 'hashtags' array."""

        # Extract analysis context
        objects = analysis_results.get('detections', [])
        brands = analysis_results.get('brands', [])
        scene_info = analysis_results.get('scene_analysis', {})
        composition = analysis_results.get('composition', {})

        # FIXED: Get fused lighting from scene_info (it's been updated by DetectionFusionManager)
        lighting = scene_info.get('lighting', {}).get('top', 'natural light')
        lighting_confidence = scene_info.get('lighting', {}).get('confidence', 0.7)

        # Provide explicit Chinese translations to ensure consistency
        lighting_translations_zh = {
            'soft diffused light': '柔和漫射光',
            'overcast atmosphere': '陰天氛圍',
            'natural daylight': '自然日光',
            'warm ambient light': '溫暖環境光',
            'evening light': '傍晚光線',
            'bright sunlight': '明亮陽光',
            'golden hour': '金黃時刻',
            'blue hour': '藍調時刻'
        }

        # Get appropriate lighting description based on language
        if language == 'zh':
            lighting_zh = lighting_translations_zh.get(lighting, lighting)
            lighting_display = lighting_zh
        else:
            # For English and bilingual, use English only
            lighting_display = lighting
            lighting_zh = lighting

        objects_str = ', '.join([obj['class_name'] for obj in objects[:10]])

        # CRITICAL: Emphasize brands EXTREMELY prominently - repeat multiple times
        # NOTE: brands_list is only bound in this branch; every later use is
        # guarded by `if brands`, so no NameError when brands is empty.
        if brands:
            brands_list = [b[0] for b in brands[:5]]
            brands_str = ', '.join(brands_list)
            brand_emphasis = f"""

🚨 CRITICAL BRAND REQUIREMENT 🚨
The following brands were POSITIVELY IDENTIFIED in this image: {brands_str}

YOU ABSOLUTELY MUST:
1. Mention the brand name "{brands_list[0]}" explicitly in the FIRST sentence
2. Use the exact brand name - do not use generic terms like "bag" or "accessory" without the brand
3. Write naturally as if you're excited to share this {brands_list[0]} item
4. Example: "在傍晚光線下,這款{brands_list[0]}經典黑色菱格紋皮革包..." (CORRECT)
5. NOT acceptable: "在傍晚光線下,這款經典黑色菱格紋皮革包..." (WRONG - missing brand name!)

THIS IS MANDATORY - The caption will be rejected if it doesn't mention {brands_str}.
"""
        else:
            brands_str = 'None detected'
            brand_emphasis = ""

        # Enhanced scene description
        urban_scene = scene_info.get('urban', {}).get('top', 'unknown')
        mood = scene_info.get('mood', {}).get('top', 'neutral')
        comp_type = composition.get('composition_type', 'standard')

        context = f"""
Analyze this image and generate an engaging, DETAILED social media caption with rich visual descriptions.

**Visual Elements (Describe in Detail):**
- Detected objects: {objects_str}
- Scene composition: {comp_type}
- Urban environment: {urban_scene}
- **IMPORTANT**: Include specific details about:
  * Materials (leather, metal, fabric, canvas, etc.)
  * Colors (use descriptive terms: jet black, antique gold, midnight blue, etc.)
  * Textures (quilted, smooth, matte, glossy, metallic, etc.)
  * Design features (stitching patterns, hardware, logos, emblems, etc.)
  * Reflections and lighting effects on surfaces

**Atmosphere:**
- Lighting (analyzed with Places365 + CV): {lighting_display} (confidence: {lighting_confidence:.2f})
- Mood: {mood}

**Brand Detection:**
- Identified brands: {brands_str}{brand_emphasis}

**Caption Structure (Required - BE SPECIFIC AND DETAILED):**
1. Opening hook - Most striking visual element with SPECIFIC details (1-2 sentences)
   {f"- 🚨 MANDATORY: Start with the BRAND NAME '{brands_list[0]}' in the FIRST sentence!" if brands else ""}
   {f"- Example (CORRECT): '這款{brands_list[0]}經典黑色菱格紋皮革包...'" if brands else ""}
   {f"- Example (WRONG): '這款經典黑色菱格紋皮革包...' (missing {brands_list[0]}!)" if brands else ""}
   - Be SPECIFIC: Include material, color, design features WITH the brand name

2. Visual details - Describe materials, textures, colors, and design elements (2-3 sentences)
   - Be SPECIFIC: mention quilting patterns, metal finishes, chain details, logo placements
   - Describe how light interacts with materials (reflections on leather, gleam of metal)
   - MUST use the EXACT lighting description: "{lighting_display}"

3. Atmospheric context - How lighting and mood create the scene's character (1-2 sentences)
   - Connect lighting to the overall visual impact
   - Describe depth, shadows, contrasts

4. Emotional connection & Engagement - How this resonates with viewers + call-to-action (1 sentence)

**Content Requirements:**
- Minimum information: 3-4 specific visual details per caption
- Include material types, color descriptions, design features
- Describe how lighting affects the appearance
- Make it vivid and immersive

Platform style: {platform_config['style']}
"""

        # Language-specific examples with DETAILED visual descriptions AND BRAND NAMES
        if language == 'zh':
            brand_name_zh = brands_list[0] if brands else "Gucci" # Use detected brand or example
            example_correct = f"""正確範例 - 詳細描述 + 品牌提及 (繁體中文):
"在{lighting_zh}的映襯下,這款{brand_name_zh}經典黑色菱格紋皮革包展現奢華質感,V字形縫線在柔軟小牛皮上勾勒出精緻的幾何圖案,復古金色雙G標誌在深色背景中熠熠生輝。金屬鏈條肩帶反射著{lighting_zh},增添層次感與立體效果。皮革表面細膩的光澤與霧面質地形成迷人對比,每個細節都彰顯義大利工藝的極致追求。這樣的{brand_name_zh}單品不只是配件,更是品味與格調的完美詮釋。你的衣櫃裡有哪件經典單品?✨🖤"

注意:品牌名稱 "{brand_name_zh}" 出現在第一句!這是正確的做法。

CRITICAL:
- 必須包含材質描述(皮革、金屬等)
- 必須包含顏色細節(黑色、復古金色等)
- 必須包含設計特點(縫線、標誌、鏈條等)
- 必須使用"{lighting_zh}"來描述光線
"""
        elif language == 'en':
            brand_name_en = brands_list[0] if brands else "Gucci" # Use detected brand or example
            example_correct = f"""CORRECT EXAMPLE - Detailed Description + Brand Mention (ENGLISH ONLY - NO CHINESE):
"Under the {lighting}, this {brand_name_en} classic black quilted leather bag showcases luxurious craftsmanship. V-shaped stitching traces intricate geometric patterns across supple calfskin, while the antique gold double-G logo gleams against the dark backdrop. The metal chain strap catches and reflects the {lighting}, adding dimension and depth to the piece. The leather surface presents a captivating contrast between fine sheen and matte texture, with every detail exemplifying Italian artisanship at its finest. This {brand_name_en} piece isn't just an accessory – it's a perfect expression of taste and sophistication. What's your timeless wardrobe essential? ✨🖤"

NOTE: Brand name "{brand_name_en}" appears in the FIRST sentence! This is the correct approach.

🚨 ABSOLUTE REQUIREMENT FOR ENGLISH MODE 🚨
- Output must be 100% ENGLISH - zero Chinese characters allowed
- MUST include material descriptions (leather, metal, etc.)
- MUST include color details (black, antique gold, etc.)
- MUST include design features (stitching, logo, chain, etc.)
- MUST use "{lighting}" to describe the lighting
- NO Chinese characters anywhere in the output
"""
        else: # zh-en bilingual
            brand_name_en = brands_list[0] if brands else "Gucci"
            example_correct = f"""BILINGUAL EXAMPLE - 雙語範例:
Caption in Traditional Chinese, with English hashtags support.
(Details omitted for brevity)
"""

        # Language-specific hashtag instructions
        if language == 'zh':
            hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - 繁體中文】:
- ALL hashtags MUST be in Traditional Chinese (繁體中文)
- NEVER use English hashtags when language is 繁體中文
- Examples of CORRECT hashtags: ["時尚包包", "奢華風格", "皮革工藝", "精品配件"]
- Examples of WRONG hashtags: ["FashionBlogger", "LuxuryLifestyle"] - DO NOT USE THESE
"""
        elif language == 'en':
            hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - English】:
- ALL hashtags MUST be in English
- NEVER use Chinese characters in hashtags
- Examples of CORRECT hashtags: ["FashionBlogger", "LuxuryLifestyle", "LeatherCraft"]
"""
        else: # zh-en
            hashtag_instruction = """
【CRITICAL HASHTAG REQUIREMENT - Bilingual】:
- Hashtags should MIX Traditional Chinese and English
- First half in Chinese, second half in English
- Example: ["時尚包包", "奢華風格", "FashionBlogger", "LuxuryLifestyle"]
"""

        # JSON schema + hard requirements the model must follow in its output.
        output_format = f"""
Generate output in JSON format:
{{
  "caption": "string (minimum {platform_config['min_length']} chars, maximum {platform_config['max_length']} chars, engaging and descriptive)",
  "hashtags": ["tag1", "tag2", ...] ({platform_config['hashtag_count']} relevant hashtags),
  "tone": "casual|professional|playful",
  "platform": "{platform}"
}}

{hashtag_instruction}

STRICT REQUIREMENTS:
1. Caption length: {platform_config['min_length']}-{platform_config['max_length']} characters
2. 🚨 EMOJI REQUIREMENT 🚨 - MUST use EXACTLY {platform_config['emoji_count']} emojis naturally integrated into caption text
   - Professional style: 1-2 emojis (e.g., ✨💼🌟)
   - Creative style: 2-3 emojis (e.g., 🎨✨💫🌙)
   - Authentic style: 2-3 emojis (e.g., 💖👜✨🖤)
   - Place emojis naturally within or at end of sentences
3. Caption must be pure descriptive text only - absolutely NO hashtags allowed
4. 🚨 CALL-TO-ACTION REQUIREMENT 🚨 - MUST include an engaging question or CTA at the end
   - Professional: Brief professional question (e.g., "What's your go-to piece?")
   - Creative: Thought-provoking question (e.g., "How does this speak to you?")
   - Authentic: Personal question (e.g., "What's your favorite timeless accessory?")
5. Write 3-4 complete sentences following the structure above
6. Be specific and vivid - describe what you see in detail
7. 【CRITICAL】 MUST use the EXACT lighting description: "{lighting_display}"
   - DO NOT substitute with similar terms
   - DO NOT use "金黃時刻" if the lighting is "{lighting_zh if language == 'zh' else lighting}"
   - DO NOT invent your own lighting description
8. 🚨 HASHTAG REQUIREMENT 🚨 - Generate {platform_config['hashtag_count']} relevant hashtags
   - Hashtags go ONLY in the 'hashtags' array, NEVER in the caption text
   - Mix of broad and specific tags
   - Include brand name as hashtag if detected
9. {"🚨 CRITICAL BRAND REQUIREMENT 🚨 - The brand name '" + brands_list[0] + "' MUST appear in the FIRST sentence of your caption. This is MANDATORY and NON-NEGOTIABLE. Example: " + ("'這款" + brands_list[0] + "經典黑色...'" if language == 'zh' else "'This " + brands_list[0] + " classic black...'") if brands else "No brands detected to mention"}
10. {"🚨 LANGUAGE REQUIREMENT 🚨 - Output must be 100% ENGLISH ONLY. NO Chinese characters allowed anywhere." if language == 'en' else ""}

WRONG EXAMPLE (DO NOT DO THIS):
"Lost in the city's towering skyscrapers 🏙️✨ | #UrbanVibes #CityLife"

{example_correct}
"""

        full_prompt = f"{system_instruction}\n\n{context}\n\n{output_format}"
        return full_prompt
|
| 322 |
+
|
| 323 |
+
def generate_captions(self, analysis_results: Dict, image: Image.Image,
                      platform: str = 'instagram', language: str = 'zh') -> List[Dict]:
    """Generate 3 captions with distinct styles: Professional, Creative, Authentic.

    Runs one VLM generation per style (each with its own sampling temperature),
    parses the JSON response, sanitizes the caption text, and returns up to
    three variations. Falls back to a canned caption when every parse fails.
    """
    # Top-3 detected brands feed a mandatory-mention clause into every style.
    detected_brands = analysis_results.get('brands', [])
    brand_names = [entry[0] for entry in detected_brands[:3]] if detected_brands else []
    brand_clause = (
        f" CRITICAL: Mention {', '.join(brand_names)} brand(s) naturally in the caption."
        if brand_names else ""
    )

    # Three deliberately distinct voices; 'length_modifier' is advisory metadata.
    style_specs = [
        {
            'name': 'professional',
            'temp': 0.6,
            'instruction': f'Professional style: Concise, elegant, sophisticated. Focus on quality and craftsmanship. Use refined language.{brand_clause}',
            'length_modifier': 0.8,  # shorter, more concise
        },
        {
            'name': 'creative',
            'temp': 0.7,
            'instruction': f'Creative style: Artistic, expressive, imaginative. Use vivid metaphors and sensory descriptions. Balance detail with flair.{brand_clause}',
            'length_modifier': 1.0,  # medium length
        },
        {
            'name': 'authentic',
            'temp': 0.8,
            'instruction': f'Authentic style: Personal, detailed, storytelling. Share rich observations and genuine feelings. Most descriptive and engaging.{brand_clause}',
            'length_modifier': 1.2,  # longer, more detailed
        },
    ]

    variations = []
    for spec in style_specs:
        # Base prompt plus a per-style requirement appended at the end.
        base_prompt = self.construct_prompt(analysis_results, platform, language)
        style_prompt = f"""{base_prompt}

**STYLE REQUIREMENT FOR THIS CAPTION:**
{spec['instruction']}

Adjust tone to be clearly '{spec['name']}' - this should be noticeably different from other styles."""

        chat = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": style_prompt},
            ],
        }]

        rendered = self.processor.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(chat)
        model_inputs = self.processor(
            text=[rendered],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        if torch.cuda.is_available():
            model_inputs = model_inputs.to("cuda")

        # Per-style temperature applied on a copy so the shared config is untouched.
        gen_kwargs = self.generation_config.copy()
        gen_kwargs['temperature'] = spec['temp']

        with torch.no_grad():
            output_ids = self.model.generate(**model_inputs, **gen_kwargs)

        # Strip the prompt tokens so only newly generated text is decoded.
        trimmed = [
            seq[len(prompt):] for prompt, seq in zip(model_inputs.input_ids, output_ids)
        ]

        decoded = self.processor.batch_decode(
            trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        parsed = self._parse_json_output(decoded)
        if parsed:
            # Force the tone label to the requested style regardless of model output.
            parsed['tone'] = spec['name']

            # Scrub hashtags/debug text that leaked into the caption body.
            if 'caption' in parsed:
                parsed['caption'] = self._remove_hashtags_from_caption(parsed['caption'])

            # Simplified -> Traditional Chinese conversion for zh outputs.
            if language in ('zh', 'zh-en'):
                parsed = self._convert_to_traditional(parsed)

            variations.append(parsed)

    return variations if variations else [self._get_fallback_caption(platform, language)]
|
| 425 |
+
|
| 426 |
+
def _remove_hashtags_from_caption(self, caption: str) -> str:
    """Strip hashtags, pipe-delimited debug suffixes and stray artifacts from caption text."""
    import re

    # Everything after the first pipe is model debug info (e.g. "... | SoftDiffusedLight").
    pipe_idx = caption.find('|')
    if pipe_idx != -1:
        caption = caption[:pipe_idx].strip()

    # Remove Latin-style and Chinese hashtags.
    for pattern in (r'#\w+', r'#[\u4e00-\u9fff]+'):
        caption = re.sub(pattern, '', caption)

    # Drop a trailing suspicious ALL-CAPS token (e.g. "BLACKBELT") with no punctuation.
    tokens = caption.split()
    if tokens:
        candidate = tokens[-1].strip('✨💎👗🌟💫🖤')
        if candidate.isupper() and len(candidate) > 3 and not any(ch in candidate for ch in '.,!?'):
            caption = ' '.join(tokens[:-1])

    # Trim runs of four or more trailing pictographic emojis.
    caption = re.sub(r'[\U0001F300-\U0001F9FF]{4,}$', '', caption)

    # Collapse whitespace and trim the edges.
    caption = re.sub(r'\s+', ' ', caption).strip()

    # Final sweep: clear leftover decorative emoji tails (two or more).
    if re.search(r'[✨💎👗🌟💫🖤]{2,}\s*$', caption):
        caption = re.sub(r'[✨💎👗🌟💫🖤\s]+$', '', caption).strip()

    return caption
|
| 463 |
+
|
| 464 |
+
def _convert_to_traditional(self, caption: Dict) -> Dict:
    """Convert the 'caption' field from Simplified to Traditional Chinese (in place)."""
    if 'caption' in caption:
        # self.cc is the OpenCC converter configured at manager init.
        caption['caption'] = self.cc.convert(caption['caption'])
    return caption
|
| 469 |
+
|
| 470 |
+
def _parse_json_output(self, text: str) -> Optional[Dict]:
    """Extract and parse the outermost brace-delimited span of *text* as JSON.

    The model wraps its JSON answer in prose, so we slice from the first '{'
    to the last '}' and attempt to decode that span.

    Returns:
        The parsed dict, or None when no valid JSON object is present.
    """
    start = text.find('{')
    end = text.rfind('}') + 1
    if start != -1 and end > start:
        # Only the decode can legitimately fail here; a bare except would
        # hide real bugs, so catch the specific decoder error.
        try:
            return json.loads(text[start:end])
        except json.JSONDecodeError:
            return None
    return None
|
| 481 |
+
|
| 482 |
+
def _get_fallback_caption(self, platform: str, language: str) -> Dict:
    """Return a generic caption used when model generation produced nothing usable."""
    if language == 'en':
        caption_text = 'Every moment tells a story worth sharing. The world around us is filled with beauty waiting to be discovered. Take a pause and appreciate the details that make life extraordinary. What caught your eye today? ✨'
        tags = ['photography', 'daily', 'lifestyle', 'moment', 'capture']
    else:
        # Traditional Chinese default for 'zh' and any other language code.
        caption_text = '每個瞬間都值得被記錄與分享。生活中充滿了等待被發現的美好細節。停下腳步,用心感受周遭的一切。今天什麼畫面觸動了你的心?✨'
        tags = ['攝影', '日常', '生活', '瞬間', '分享']

    return {
        'caption': caption_text,
        'hashtags': tags,
        'tone': 'casual',
        'platform': platform,
    }
|
| 498 |
+
|
| 499 |
+
# Notebook-style load confirmation emitted when this module is imported.
print("✓ CaptionGenerationManager (with Auto* classes for flexible model support) defined")
|
detection_fusion_manager.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion"""

    def __init__(self, clip_manager):
        # OpenCLIP wrapper used to classify regions YOLO could not label.
        self.clip_manager = clip_manager

    def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict:
        """Intelligently fuse CV+Places365 lighting with CLIP scene understanding."""
        cv_type = cv_lighting.get('lighting_type', 'soft diffused light')
        cv_conf = cv_lighting.get('confidence', 0.7)
        features = cv_lighting.get('cv_features', {})

        clip_data = clip_scene.get('lighting', {})
        clip_type = clip_data.get('top', 'natural light')
        clip_conf = clip_data.get('confidence', 0.5)

        # Strategy: (1) trust a highly confident CV result outright;
        # (2) on semantic agreement keep CV's wording but boost confidence;
        # (3) otherwise weight by confidence, generalizing when uncertain.
        if cv_conf > 0.85:
            chosen, confidence, method = cv_type, cv_conf, 'cv_dominant'
        elif self._lighting_semantically_similar(cv_type, clip_type):
            chosen = cv_type  # prefer CV's more specific description
            confidence = min(cv_conf * 1.15, 0.95)
            method = 'consensus'
        else:
            w_cv = cv_conf / (cv_conf + clip_conf)
            w_clip = 1.0 - w_cv
            if w_cv > 0.6:
                chosen, confidence, method = cv_type, cv_conf * 0.9, 'cv_weighted'
            else:
                # No clear winner -> fall back to a safe generic label.
                chosen = self._generalize_lighting_description(cv_type, clip_type, features)
                confidence = (cv_conf * w_cv + clip_conf * w_clip) * 0.85
                method = 'generalized'

        return {
            'lighting_type': chosen,
            'confidence': min(confidence, 0.95),
            'cv_analysis': cv_type,
            'clip_prediction': clip_type,
            'fusion_method': method,
            'cv_confidence': cv_conf,
            'clip_confidence': clip_conf,
        }

    def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool:
        """Check whether two lighting descriptions are semantically similar."""
        # Word families that count as the same lighting concept.
        similarity_groups = (
            {'soft', 'diffused', 'overcast', 'cloudy'},
            {'bright', 'sunny', 'sunlight', 'clear'},
            {'warm', 'golden', 'amber', 'evening'},
            {'natural', 'daylight', 'outdoor'},
            {'cool', 'blue', 'twilight'},
        )

        cv_words = set(cv_type.lower().split())
        clip_words = set(clip_type.lower().split())

        # Same semantic family counts as agreement even without shared words.
        if any(cv_words & group and clip_words & group for group in similarity_groups):
            return True

        # Otherwise require at least one literal word in common.
        return bool(cv_words & clip_words)

    def _generalize_lighting_description(self, cv_type: str, clip_type: str,
                                         cv_features: Dict) -> str:
        """Pick a safe generic lighting label from raw CV features when CV and CLIP disagree."""
        brightness_norm = cv_features.get('brightness', 128) / 255.0
        contrast_norm = min(cv_features.get('contrast', 50) / 100.0, 1.0)
        color_temp = cv_features.get('color_temp', 1.0)

        # Feature-driven decision tree (soft thresholds, not scene labels).
        if contrast_norm < 0.5:
            # Flat lighting: word choice depends on color temperature.
            return 'soft diffused light' if color_temp < 1.0 else 'warm ambient light'
        if brightness_norm > 0.7:
            return 'natural daylight'
        if color_temp > 1.1:
            return 'warm ambient light'
        return 'soft diffused light'

    def analyze_composition(self, image, detections: List[Dict]) -> Dict:
        """Analyze image composition from bounding-box aspect ratios."""
        if not detections:
            return {'composition_type': 'empty', 'vertical_ratio': 0.0}

        # Objects taller than they are wide count as "vertical".
        tall = [
            det for det in detections
            if (det['bbox'][3] - det['bbox'][1]) > (det['bbox'][2] - det['bbox'][0])
        ]
        ratio = len(tall) / max(len(detections), 1)

        if ratio > 0.6:
            label = 'urban canyon'
        elif ratio > 0.4:
            label = 'vertical emphasis'
        else:
            label = 'standard street view'

        return {
            'composition_type': label,
            'vertical_ratio': ratio,
            'vertical_objects_count': len(tall),
            'total_objects': len(detections),
        }

    def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict],
                        scene_info: Dict, image=None, cv_lighting: Dict = None) -> Dict:
        """Fuse all detection results with intelligent lighting fusion."""
        combined = []

        # YOLO detections: annotate each with an attention score (mutates in place).
        for det in yolo_results:
            det['attention_score'] = self._calculate_attention_score(det)
            combined.append(det)

        # Saliency regions YOLO missed: label them with OpenCLIP.
        for region in unknown_regions:
            if 'image' not in region:
                continue
            result = self.clip_manager.classify_hierarchical(region['image'])
            combined.append({
                'class_name': result['top_prediction'],
                'bbox': region['bbox'],
                'confidence': result.get('confidence', 0.5),
                'attention_score': region.get('saliency_score', 0.5),
                'source': 'openclip',
            })

        ranked = sorted(combined, key=lambda d: d['attention_score'], reverse=True)

        # Keep the top 15; beyond that, only confident brand detections survive,
        # and the scan stops at the first non-brand entry.
        filtered = []
        for det in ranked:
            if len(filtered) < 15:
                filtered.append(det)
            elif det.get('brand') and det.get('brand_confidence', 0) > 0.45:
                filtered.append(det)
            else:
                break

        composition = self.analyze_composition(image, filtered) if image else {}

        # Intelligent lighting fusion (updates scene_info in place).
        if cv_lighting:
            fused = self.fuse_lighting_analysis(cv_lighting, scene_info)
            scene_info['lighting'] = {
                'top': fused['lighting_type'],
                'confidence': fused['confidence'],
                'fusion_details': fused,
            }

        return {
            'detections': filtered,
            'scene_info': scene_info,
            'composition': composition,
            'total_objects': len(combined),
        }

    def _calculate_attention_score(self, detection: Dict) -> float:
        """Calculate attention score based on position, size, and confidence."""
        x1, y1, x2, y2 = detection['bbox']

        # Heuristic: x2 > 100 implies pixel coordinates rather than normalized
        # ones — presumably; TODO confirm against upstream detector output.
        pixel_coords = x2 > 100

        if pixel_coords:
            position_score = 0.5  # centrality is not computed in pixel space
        else:
            cx = (x1 + x2) / 2
            cy = (y1 + y2) / 2
            position_score = 1.0 - (abs(cx - 0.5) + abs(cy - 0.5))

        area = abs((x2 - x1) * (y2 - y1))
        if pixel_coords:
            area = area / (1000 * 1000)  # rough normalization to a 1000x1000 frame
        size_score = min(area, 0.5)

        conf_score = detection.get('confidence', 0.5)

        # Weighted blend: confidence weighs most; position and size equally.
        return 0.3 * position_score + 0.3 * size_score + 0.4 * conf_score
|
| 241 |
+
|
| 242 |
+
# Notebook-style load confirmation emitted when this module is imported.
print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")
|
image_processor_manager.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from typing import Tuple, Optional, Union
|
| 6 |
+
import torchvision.transforms as transforms
|
| 7 |
+
|
| 8 |
+
class ImageProcessorManager:
    """Image validation, preprocessing and format standardization"""

    def __init__(self):
        # Accepted input formats and the smallest resolution downstream models tolerate.
        self.supported_formats = ['JPEG', 'PNG', 'WEBP', 'JPG']
        self.min_resolution = (224, 224)

        # CLIP preprocessing: 336x336 bicubic resize + CLIP channel statistics.
        self.clip_transform = transforms.Compose([
            transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.48145466, 0.4578275, 0.40821073],
                std=[0.26862954, 0.26130258, 0.27577711],
            ),
        ])

    def load_image(self, file_path: Union[str, Image.Image]) -> Image.Image:
        """Load and validate image.

        Accepts either a path or an already-open PIL image; always returns
        an RGB image. Raises ValueError on load failure or low resolution.
        """
        if isinstance(file_path, Image.Image):
            image = file_path
        else:
            try:
                image = Image.open(file_path)
            except Exception as e:
                raise ValueError(f"Failed to load image: {e}")

        # Normalize color mode (handles RGBA, grayscale, palette images).
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Reject images smaller than what the models can handle.
        width, height = image.size
        min_w, min_h = self.min_resolution
        if width < min_w or height < min_h:
            raise ValueError(f"Image resolution too low, minimum required: {self.min_resolution}")

        return image

    def preprocess_for_yolo(self, image: Image.Image) -> np.ndarray:
        """Preprocess image for YOLO (keep original format)."""
        return np.array(image)

    def preprocess_for_clip(self, image: Image.Image) -> torch.Tensor:
        """Preprocess image for CLIP (336x336, ImageNet normalization)."""
        return self.clip_transform(image)

    def preprocess_for_qwen(self, image: Image.Image) -> Image.Image:
        """Preprocess image for Qwen2.5-VL (dynamic resolution — pass-through)."""
        return image

    def resize_with_aspect_ratio(self, image: Image.Image, max_size: int = 1024) -> Image.Image:
        """Resize image while maintaining aspect ratio.

        Only downscales: images whose longest side already fits within
        *max_size* are returned unchanged.
        """
        width, height = image.size
        if max(width, height) > max_size:
            if width > height:
                target = (max_size, int(height * (max_size / width)))
            else:
                target = (int(width * (max_size / height)), max_size)
            image = image.resize(target, Image.Resampling.LANCZOS)
        return image
|
| 69 |
+
|
| 70 |
+
# Notebook-style load confirmation emitted when this module is imported.
print("✓ ImageProcessorManager defined")
|
landmark_prompts.py
ADDED
|
@@ -0,0 +1,1030 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
|
| 4 |
+
class LandmarkPrompts:
|
| 5 |
+
"""
|
| 6 |
+
世界地標視覺描述與 Hashtag 資料庫
|
| 7 |
+
提供 20 個世界知名地標的詳細資料
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
def __init__(self):
|
| 11 |
+
"""初始化地標資料庫"""
|
| 12 |
+
|
| 13 |
+
self.landmarks = {
|
| 14 |
+
# ===== 歐洲 Europe =====
|
| 15 |
+
"Big Ben": {
|
| 16 |
+
"name": "Big Ben",
|
| 17 |
+
"official_name": "Elizabeth Tower",
|
| 18 |
+
"location": {
|
| 19 |
+
"city": "London",
|
| 20 |
+
"country": "United Kingdom",
|
| 21 |
+
"region": "Westminster",
|
| 22 |
+
"continent": "Europe"
|
| 23 |
+
},
|
| 24 |
+
"visual_cues": {
|
| 25 |
+
"iconic_view": [
|
| 26 |
+
"Gothic Revival clock tower with four ornate clock faces rising beside Westminster Palace and Thames River",
|
| 27 |
+
"Tall Victorian tower with intricate stone detailing golden clock faces and pointed spire against London sky",
|
| 28 |
+
"Famous clock tower landmark showing detailed Gothic architecture with Palace of Westminster backdrop",
|
| 29 |
+
"Majestic bell tower with elaborate Victorian Gothic design overlooking Westminster Bridge"
|
| 30 |
+
],
|
| 31 |
+
"architectural_details": [
|
| 32 |
+
"Ornate clock faces with Roman numerals surrounded by decorative Gothic stonework and gilded details",
|
| 33 |
+
"Victorian Gothic Revival architecture featuring pointed arches flying buttresses and limestone facade",
|
| 34 |
+
"Detailed carved stonework showing Gothic tracery pinnacles and decorative moldings on tower exterior",
|
| 35 |
+
"Cast iron and gold leaf clock mechanisms visible within ornamental Gothic Revival tower framework"
|
| 36 |
+
],
|
| 37 |
+
"contextual_view": [
|
| 38 |
+
"Clock tower rising above Westminster Bridge with red double-decker buses and Thames River in foreground",
|
| 39 |
+
"Big Ben silhouetted against dramatic London sunset with Westminster Palace and river reflections",
|
| 40 |
+
"Famous landmark viewed from Parliament Square with traffic pedestrians and London Eye in distance",
|
| 41 |
+
"Tower seen through tree branches in nearby park with Westminster Abbey and government buildings visible"
|
| 42 |
+
],
|
| 43 |
+
"seasonal_lighting": [
|
| 44 |
+
"Tower illuminated at night with golden clock faces glowing against dark sky creating iconic London scene",
|
| 45 |
+
"Soft morning light highlighting limestone details as mist rises from Thames creating atmospheric mood",
|
| 46 |
+
"Dramatic storm clouds gathering behind tower with contrasting sunlight illuminating Gothic stonework",
|
| 47 |
+
"Winter scene with tower emerging from fog as streetlights reflect on wet Westminster Bridge pavement"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
"hashtags": {
|
| 51 |
+
"zh": ["大笨鐘", "倫敦地標", "西敏寺", "泰晤士河", "英國旅遊", "倫敦", "英國"],
|
| 52 |
+
"en": ["BigBen", "London", "Westminster", "Thames", "UKTravel", "LondonLandmarks", "ElizabethTower"]
|
| 53 |
+
},
|
| 54 |
+
"cultural_info": {
|
| 55 |
+
"built_year": 1859,
|
| 56 |
+
"architect": "Augustus Pugin",
|
| 57 |
+
"architectural_style": "Gothic Revival"
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
|
| 61 |
+
"Eiffel Tower": {
|
| 62 |
+
"name": "Eiffel Tower",
|
| 63 |
+
"official_name": "La Tour Eiffel",
|
| 64 |
+
"location": {
|
| 65 |
+
"city": "Paris",
|
| 66 |
+
"country": "France",
|
| 67 |
+
"region": "Champ de Mars",
|
| 68 |
+
"continent": "Europe"
|
| 69 |
+
},
|
| 70 |
+
"visual_cues": {
|
| 71 |
+
"iconic_view": [
|
| 72 |
+
"Iconic iron lattice tower rising 330 meters above Champ de Mars with distinctive tapering silhouette",
|
| 73 |
+
"Wrought iron structure with three observation levels showing intricate lattice framework against Paris sky",
|
| 74 |
+
"Famous Parisian landmark with characteristic brown paint and elegant art nouveau iron lattice design",
|
| 75 |
+
"Monumental tower structure displaying puddle iron construction with four massive arched base legs"
|
| 76 |
+
],
|
| 77 |
+
"architectural_details": [
|
| 78 |
+
"Intricate wrought iron lattice work showing 18000 metallic parts joined by 2.5 million rivets",
|
| 79 |
+
"Distinctive curved base arches with elevator shafts and lattice framework creating transparent appearance",
|
| 80 |
+
"Observation deck platforms with iron railings providing panoramic views across Paris rooftops",
|
| 81 |
+
"Antique elevators and iron staircases winding through lattice structure between three viewing levels"
|
| 82 |
+
],
|
| 83 |
+
"contextual_view": [
|
| 84 |
+
"Tower framed by Trocadéro fountains with reflecting pools and Parisian cityscape in background",
|
| 85 |
+
"Eiffel Tower viewed from Seine River with tourist boats and bridges in romantic Parisian setting",
|
| 86 |
+
"Landmark rising above Champ de Mars gardens with visitors and green lawns in foreground",
|
| 87 |
+
"Tower seen from Montparnasse showing Paris rooftops Sacré-Cœur and urban landscape panorama"
|
| 88 |
+
],
|
| 89 |
+
"seasonal_lighting": [
|
| 90 |
+
"Tower illuminated at night with golden lights creating magical sparkling effect every hour",
|
| 91 |
+
"Sunset silhouette with tower's iron structure outlined against orange and pink Paris sky",
|
| 92 |
+
"Cherry blossoms framing tower in spring with soft natural light on iron lattice",
|
| 93 |
+
"Winter scene with tower emerging from clouds as snow dusts Champ de Mars gardens"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
"hashtags": {
|
| 97 |
+
"zh": ["艾菲爾鐵塔", "巴黎鐵塔", "巴黎地標", "法國旅遊", "巴黎", "鐵塔"],
|
| 98 |
+
"en": ["EiffelTower", "Paris", "ParisLandmark", "TourEiffel", "France", "ParisTravel"]
|
| 99 |
+
},
|
| 100 |
+
"cultural_info": {
|
| 101 |
+
"built_year": 1889,
|
| 102 |
+
"architect": "Gustave Eiffel",
|
| 103 |
+
"architectural_style": "Structural Expressionism"
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
|
| 107 |
+
"Colosseum": {
|
| 108 |
+
"name": "Colosseum",
|
| 109 |
+
"official_name": "Flavian Amphitheatre",
|
| 110 |
+
"location": {
|
| 111 |
+
"city": "Rome",
|
| 112 |
+
"country": "Italy",
|
| 113 |
+
"region": "Lazio",
|
| 114 |
+
"continent": "Europe"
|
| 115 |
+
},
|
| 116 |
+
"visual_cues": {
|
| 117 |
+
"iconic_view": [
|
| 118 |
+
"Ancient Roman amphitheater with massive oval structure showing three tiers of arches in weathered stone",
|
| 119 |
+
"Iconic ruined arena with partially collapsed walls revealing internal chambers and underground passages",
|
| 120 |
+
"Monumental stone amphitheater displaying Roman engineering with distinctive arched facade and columns",
|
| 121 |
+
"Historic gladiatorial arena showing travertine limestone construction with Doric Ionic and Corinthian orders"
|
| 122 |
+
],
|
| 123 |
+
"architectural_details": [
|
| 124 |
+
"Three stories of arches supported by columns showing progression of classical orders from ground to top",
|
| 125 |
+
"Weathered travertine blocks and brick revealing ancient construction techniques and earthquake damage",
|
| 126 |
+
"Hypogeum underground chambers visible through arena floor showing complex staging machinery areas",
|
| 127 |
+
"Massive exterior wall with remaining arches brackets and column fragments from original four-story height"
|
| 128 |
+
],
|
| 129 |
+
"contextual_view": [
|
| 130 |
+
"Colosseum rising above Roman Forum with ancient temples columns and ruins in surrounding area",
|
| 131 |
+
"Amphitheater viewed from Palatine Hill showing relationship to Imperial Palace and Roman landscape",
|
| 132 |
+
"Monument surrounded by modern Rome with traffic tourists and urban development contrasting ancient stone",
|
| 133 |
+
"Arena illuminated at dusk with Constantine's Arch and Roman ruins visible in archaeological park"
|
| 134 |
+
],
|
| 135 |
+
"seasonal_lighting": [
|
| 136 |
+
"Golden hour light warming travertine stone with dramatic shadows emphasizing architectural depth",
|
| 137 |
+
"Night illumination creating dramatic effect on ancient arches with warm amber lighting",
|
| 138 |
+
"Overcast sky providing even light showing weathering patterns and stone texture details",
|
| 139 |
+
"Bright midday sun creating strong contrast between light and shadow in deep archways"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
"hashtags": {
|
| 143 |
+
"zh": ["羅馬競技場", "古羅馬", "羅馬", "義大利旅遊", "古蹟", "世界遺產"],
|
| 144 |
+
"en": ["Colosseum", "Rome", "AncientRome", "Italy", "Roman", "WorldHeritage"]
|
| 145 |
+
},
|
| 146 |
+
"cultural_info": {
|
| 147 |
+
"built_year": 80,
|
| 148 |
+
"architect": "Emperor Vespasian",
|
| 149 |
+
"architectural_style": "Ancient Roman"
|
| 150 |
+
}
|
| 151 |
+
},
|
| 152 |
+
|
| 153 |
+
"Sagrada Familia": {
|
| 154 |
+
"name": "Sagrada Familia",
|
| 155 |
+
"official_name": "Basílica de la Sagrada Família",
|
| 156 |
+
"location": {
|
| 157 |
+
"city": "Barcelona",
|
| 158 |
+
"country": "Spain",
|
| 159 |
+
"region": "Catalonia",
|
| 160 |
+
"continent": "Europe"
|
| 161 |
+
},
|
| 162 |
+
"visual_cues": {
|
| 163 |
+
"iconic_view": [
|
| 164 |
+
"Extraordinary basilica with soaring organic towers showing Gaudí's distinctive naturalistic Gothic design",
|
| 165 |
+
"Unfinished cathedral with multiple spires featuring intricate stone carving and colorful mosaic details",
|
| 166 |
+
"Fantastical church architecture combining Gothic and Art Nouveau with nature-inspired sculptural forms",
|
| 167 |
+
"Massive religious monument with elaborate facades showing biblical scenes in highly detailed stonework"
|
| 168 |
+
],
|
| 169 |
+
"architectural_details": [
|
| 170 |
+
"Organic columns branching like trees supporting vaulted ceiling with natural light filtering through",
|
| 171 |
+
"Nativity facade with detailed sculptural groups showing biblical narratives in stone",
|
| 172 |
+
"Colorful stained glass windows creating rainbow light effects throughout cathedral interior",
|
| 173 |
+
"Hyperboloid structures and ruled surfaces demonstrating Gaudí's mathematical geometric approach"
|
| 174 |
+
],
|
| 175 |
+
"contextual_view": [
|
| 176 |
+
"Basilica towers rising above Barcelona cityscape with Mediterranean architecture and urban landscape",
|
| 177 |
+
"Church viewed from Plaça de Gaudí with reflecting pool mirroring elaborate facades",
|
| 178 |
+
"Construction cranes visible around towers showing ongoing building work on Gaudí's vision",
|
| 179 |
+
"Interior forest of columns with visitors experiencing spectacular light and space"
|
| 180 |
+
],
|
| 181 |
+
"seasonal_lighting": [
|
| 182 |
+
"Sunset light streaming through stained glass creating vibrant color patterns on stone columns",
|
| 183 |
+
"Night illumination highlighting intricate facade details with dramatic architectural lighting",
|
| 184 |
+
"Morning light revealing texture and depth of carved stone with soft shadows",
|
| 185 |
+
"Bright Mediterranean sun emphasizing colorful mosaic work on tower exteriors"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
"hashtags": {
|
| 189 |
+
"zh": ["聖家堂", "巴塞隆納", "高第建築", "西班牙旅遊", "世界遺產", "教堂"],
|
| 190 |
+
"en": ["SagradaFamilia", "Barcelona", "Gaudi", "Spain", "Cathedral", "Architecture"]
|
| 191 |
+
},
|
| 192 |
+
"cultural_info": {
|
| 193 |
+
"built_year": 1882,
|
| 194 |
+
"architect": "Antoni Gaudí",
|
| 195 |
+
"architectural_style": "Catalan Modernism"
|
| 196 |
+
}
|
| 197 |
+
},
|
| 198 |
+
|
| 199 |
+
"Brandenburg Gate": {
|
| 200 |
+
"name": "Brandenburg Gate",
|
| 201 |
+
"official_name": "Brandenburger Tor",
|
| 202 |
+
"location": {
|
| 203 |
+
"city": "Berlin",
|
| 204 |
+
"country": "Germany",
|
| 205 |
+
"region": "Mitte",
|
| 206 |
+
"continent": "Europe"
|
| 207 |
+
},
|
| 208 |
+
"visual_cues": {
|
| 209 |
+
"iconic_view": [
|
| 210 |
+
"Neoclassical triumphal arch with twelve Doric columns supporting entablature and Quadriga sculpture",
|
| 211 |
+
"Monumental city gate with goddess of victory chariot crowning sandstone classical structure",
|
| 212 |
+
"Historic gateway showing Greek Revival architecture with columned portico and sculptural decoration",
|
| 213 |
+
"Famous Berlin landmark with symmetrical design and copper Quadriga statue against sky"
|
| 214 |
+
],
|
| 215 |
+
"architectural_details": [
|
| 216 |
+
"Twelve Doric columns arranged in six pairs creating five passageways through gate structure",
|
| 217 |
+
"Quadriga sculpture showing goddess Victoria in four-horse chariot with Prussian eagle and Iron Cross",
|
| 218 |
+
"Sandstone construction with classical Greek proportions and restrained decorative elements",
|
| 219 |
+
"Relief sculptures in metopes showing mythological scenes and Prussian military symbolism"
|
| 220 |
+
],
|
| 221 |
+
"contextual_view": [
|
| 222 |
+
"Gate standing at Pariser Platz with modern buildings and historic square surrounding monument",
|
| 223 |
+
"Brandenburg Gate viewed down Unter den Linden boulevard with linden trees and embassies",
|
| 224 |
+
"Monument at edge of Tiergarten park showing relationship to green space and city",
|
| 225 |
+
"Gate illuminated with Reichstag building and government district visible in background"
|
| 226 |
+
],
|
| 227 |
+
"seasonal_lighting": [
|
| 228 |
+
"Dramatic night lighting in various colors for events creating stunning visual effects",
|
| 229 |
+
"Soft morning light highlighting sandstone texture and classical architectural details",
|
| 230 |
+
"Sunset silhouette with Quadriga outlined against colorful Berlin sky",
|
| 231 |
+
"Winter scene with gate surrounded by Christmas market lights and seasonal decorations"
|
| 232 |
+
]
|
| 233 |
+
},
|
| 234 |
+
"hashtags": {
|
| 235 |
+
"zh": ["布蘭登堡門", "柏林", "德國旅遊", "歷史建築", "柏林地標"],
|
| 236 |
+
"en": ["BrandenburgGate", "Berlin", "Germany", "BerlinLandmark", "GermanHistory"]
|
| 237 |
+
},
|
| 238 |
+
"cultural_info": {
|
| 239 |
+
"built_year": 1791,
|
| 240 |
+
"architect": "Carl Gotthard Langhans",
|
| 241 |
+
"architectural_style": "Neoclassicism"
|
| 242 |
+
}
|
| 243 |
+
},
|
| 244 |
+
|
| 245 |
+
# ===== 亞洲 Asia =====
|
| 246 |
+
"Tokyo Tower": {
|
| 247 |
+
"name": "Tokyo Tower",
|
| 248 |
+
"official_name": "東京タワー",
|
| 249 |
+
"location": {
|
| 250 |
+
"city": "Tokyo",
|
| 251 |
+
"country": "Japan",
|
| 252 |
+
"region": "Minato",
|
| 253 |
+
"continent": "Asia"
|
| 254 |
+
},
|
| 255 |
+
"visual_cues": {
|
| 256 |
+
"iconic_view": [
|
| 257 |
+
"Red and white lattice steel tower inspired by Eiffel Tower rising 333 meters above Tokyo",
|
| 258 |
+
"Iconic communication tower with distinctive orange and white paint showing two observation decks",
|
| 259 |
+
"Tall broadcasting tower with lattice framework and observation platforms overlooking Tokyo cityscape",
|
| 260 |
+
"Famous Japanese landmark tower with red-orange color scheme and tapering lattice structure"
|
| 261 |
+
],
|
| 262 |
+
"architectural_details": [
|
| 263 |
+
"Steel lattice framework painted international orange and white for aviation safety",
|
| 264 |
+
"Two observation decks at 150m and 250m heights with panoramic windows and viewing platforms",
|
| 265 |
+
"Four massive support legs with elevators and emergency stairs running through lattice structure",
|
| 266 |
+
"Broadcasting antennas and equipment at tower top with decorative lighting systems"
|
| 267 |
+
],
|
| 268 |
+
"contextual_view": [
|
| 269 |
+
"Tower rising above Shiba Park with traditional temple buildings and modern Tokyo skyscrapers",
|
| 270 |
+
"Tokyo Tower viewed from Roppongi Hills with Mount Fuji visible in distant background",
|
| 271 |
+
"Landmark tower dominating skyline with Rainbow Bridge and Tokyo Bay in view",
|
| 272 |
+
"Tower surrounded by cherry blossoms in spring with pink petals and urban landscape"
|
| 273 |
+
],
|
| 274 |
+
"seasonal_lighting": [
|
| 275 |
+
"Tower illuminated at night in orange creating warm glow against Tokyo night sky",
|
| 276 |
+
"Special lighting displays in various colors for holidays and events creating festive atmosphere",
|
| 277 |
+
"Sunset view with tower silhouetted against orange and pink sky",
|
| 278 |
+
"Winter illumination with tower and surrounding trees decorated with seasonal lights"
|
| 279 |
+
]
|
| 280 |
+
},
|
| 281 |
+
"hashtags": {
|
| 282 |
+
"zh": ["東京鐵塔", "東京", "日本旅遊", "東京地標", "日本"],
|
| 283 |
+
"en": ["TokyoTower", "Tokyo", "Japan", "TokyoLandmark", "JapanTravel"]
|
| 284 |
+
},
|
| 285 |
+
"cultural_info": {
|
| 286 |
+
"built_year": 1958,
|
| 287 |
+
"architect": "Tachū Naitō",
|
| 288 |
+
"architectural_style": "Lattice Tower"
|
| 289 |
+
}
|
| 290 |
+
},
|
| 291 |
+
|
| 292 |
+
"Taipei 101": {
|
| 293 |
+
"name": "Taipei 101",
|
| 294 |
+
"official_name": "台北101",
|
| 295 |
+
"location": {
|
| 296 |
+
"city": "Taipei",
|
| 297 |
+
"country": "Taiwan",
|
| 298 |
+
"region": "Xinyi District",
|
| 299 |
+
"continent": "Asia"
|
| 300 |
+
},
|
| 301 |
+
"visual_cues": {
|
| 302 |
+
"iconic_view": [
|
| 303 |
+
"Massive skyscraper with bamboo-inspired segmented design rising 508 meters above Taipei",
|
| 304 |
+
"101-story tower with distinctive eight-segment structure and traditional Chinese architectural elements",
|
| 305 |
+
"Iconic green-glass building with pagoda-like tiers showing postmodern Asian design",
|
| 306 |
+
"Supertall skyscraper with gold-tinted windows and traditional motifs in modern interpretation"
|
| 307 |
+
],
|
| 308 |
+
"architectural_details": [
|
| 309 |
+
"Eight eight-story modules stacked vertically representing prosperity in Chinese numerology",
|
| 310 |
+
"Traditional ruyi ornaments at corners of each section adding cultural architectural elements",
|
| 311 |
+
"Massive tuned mass damper sphere visible to visitors providing earthquake protection",
|
| 312 |
+
"Double-deck elevators with pressurization system ascending at world-record speeds"
|
| 313 |
+
],
|
| 314 |
+
"contextual_view": [
|
| 315 |
+
"Tower dominating Taipei skyline with Elephant Mountain and lush green hills in background",
|
| 316 |
+
"Building viewed from Xiangshan with city sprawl and mountains creating dramatic setting",
|
| 317 |
+
"Taipei 101 rising above Xinyi shopping district with modern urban development below",
|
| 318 |
+
"Tower illuminated against night sky with busy streets and city lights surrounding base"
|
| 319 |
+
],
|
| 320 |
+
"seasonal_lighting": [
|
| 321 |
+
"New Year's Eve fireworks display launched from building creating spectacular light show",
|
| 322 |
+
"LED lighting system displaying colors for holidays and special occasions",
|
| 323 |
+
"Sunset illumination with building's glass reflecting golden and orange tones",
|
| 324 |
+
"Night view with tower lit in green and gold standing out against dark sky"
|
| 325 |
+
]
|
| 326 |
+
},
|
| 327 |
+
"hashtags": {
|
| 328 |
+
"zh": ["台北101", "台北", "台灣", "台北地標", "摩天大樓", "台灣旅遊"],
|
| 329 |
+
"en": ["Taipei101", "Taipei", "Taiwan", "TaipeiLandmark", "Skyscraper", "TaiwanTravel"]
|
| 330 |
+
},
|
| 331 |
+
"cultural_info": {
|
| 332 |
+
"built_year": 2004,
|
| 333 |
+
"architect": "C.Y. Lee & Partners",
|
| 334 |
+
"architectural_style": "Postmodern"
|
| 335 |
+
}
|
| 336 |
+
},
|
| 337 |
+
|
| 338 |
+
"Burj Khalifa": {
|
| 339 |
+
"name": "Burj Khalifa",
|
| 340 |
+
"official_name": "برج خليفة",
|
| 341 |
+
"location": {
|
| 342 |
+
"city": "Dubai",
|
| 343 |
+
"country": "United Arab Emirates",
|
| 344 |
+
"region": "Downtown Dubai",
|
| 345 |
+
"continent": "Asia"
|
| 346 |
+
},
|
| 347 |
+
"visual_cues": {
|
| 348 |
+
"iconic_view": [
|
| 349 |
+
"World's tallest building at 828 meters with Y-shaped floor plan and sleek tapering design",
|
| 350 |
+
"Supertall skyscraper with reflective glass facade and setback design inspired by desert flower",
|
| 351 |
+
"Iconic needle-like tower piercing clouds with distinctive spire and observation decks",
|
| 352 |
+
"Neo-futurist architecture with Islamic geometric patterns in modern glass and steel construction"
|
| 353 |
+
],
|
| 354 |
+
"architectural_details": [
|
| 355 |
+
"Buttressed core structural system with wings extending from central hexagonal hub",
|
| 356 |
+
"Reflective glazing with aluminum and textured stainless steel spandrel panels",
|
| 357 |
+
"Observation decks on 124th 125th and 148th floors offering panoramic views",
|
| 358 |
+
"Spire adding 200 meters to height with communication equipment and decorative elements"
|
| 359 |
+
],
|
| 360 |
+
"contextual_view": [
|
| 361 |
+
"Tower rising from Downtown Dubai with Dubai Mall fountain show and urban development below",
|
| 362 |
+
"Building dominating skyline with Persian Gulf and Palm Jumeirah visible in distance",
|
| 363 |
+
"Burj Khalifa viewed from desert showing contrast between modern architecture and natural landscape",
|
| 364 |
+
"Tower at center of Dubai's business district with surrounding high-rises and infrastructure"
|
| 365 |
+
],
|
| 366 |
+
"seasonal_lighting": [
|
| 367 |
+
"LED light show on facade creating dynamic patterns and colors for celebrations",
|
| 368 |
+
"Night illumination with tower glowing against dark sky as city lights spread below",
|
| 369 |
+
"Sunset view with building's glass reflecting orange and gold desert light",
|
| 370 |
+
"New Year's Eve spectacular with building covered in coordinated light and firework display"
|
| 371 |
+
]
|
| 372 |
+
},
|
| 373 |
+
"hashtags": {
|
| 374 |
+
"zh": ["哈里發塔", "杜拜", "阿聯酋", "世界最高", "摩天大樓", "杜拜旅遊"],
|
| 375 |
+
"en": ["BurjKhalifa", "Dubai", "UAE", "WorldsTallest", "Skyscraper", "DubaiTravel"]
|
| 376 |
+
},
|
| 377 |
+
"cultural_info": {
|
| 378 |
+
"built_year": 2010,
|
| 379 |
+
"architect": "Adrian Smith (SOM)",
|
| 380 |
+
"architectural_style": "Neo-futurism"
|
| 381 |
+
}
|
| 382 |
+
},
|
| 383 |
+
|
| 384 |
+
"Petronas Towers": {
|
| 385 |
+
"name": "Petronas Towers",
|
| 386 |
+
"official_name": "Menara Berkembar Petronas",
|
| 387 |
+
"location": {
|
| 388 |
+
"city": "Kuala Lumpur",
|
| 389 |
+
"country": "Malaysia",
|
| 390 |
+
"region": "KLCC",
|
| 391 |
+
"continent": "Asia"
|
| 392 |
+
},
|
| 393 |
+
"visual_cues": {
|
| 394 |
+
"iconic_view": [
|
| 395 |
+
"Twin skyscrapers with Islamic-inspired design connected by sky bridge at 452 meters height",
|
| 396 |
+
"Matching 88-story towers with distinctive postmodern style and geometric floor plans",
|
| 397 |
+
"Iconic twin towers with stainless steel and glass facades showing eight-pointed star motif",
|
| 398 |
+
"Symmetrical tower pair with sky bridge and spires creating recognizable Kuala Lumpur silhouette"
|
| 399 |
+
],
|
| 400 |
+
"architectural_details": [
|
| 401 |
+
"Floor plan based on Islamic geometric patterns with two interlocking squares creating eight-pointed star",
|
| 402 |
+
"Stainless steel and glass curtain wall with Islamic art-inspired design elements",
|
| 403 |
+
"Double-deck sky bridge on 41st and 42nd floors connecting towers at 170 meters height",
|
| 404 |
+
"Pinnacles adding 73 meters to height with Islamic architectural styling and lighting"
|
| 405 |
+
],
|
| 406 |
+
"contextual_view": [
|
| 407 |
+
"Towers dominating KLCC Park with fountain lake and green space in foreground",
|
| 408 |
+
"Twin buildings viewed from KL Tower showing relationship to city and surrounding jungle hills",
|
| 409 |
+
"Petronas Towers as centerpiece of business district with modern urban development",
|
| 410 |
+
"Towers reflecting in KLCC Park water features with tropical landscaping and city backdrop"
|
| 411 |
+
],
|
| 412 |
+
"seasonal_lighting": [
|
| 413 |
+
"Towers illuminated at night with synchronized lighting creating mirror image effect",
|
| 414 |
+
"Special lighting displays for Malaysian holidays in national colors",
|
| 415 |
+
"Blue hour with towers glowing against twilight sky as city lights emerge",
|
| 416 |
+
"Dramatic storm clouds behind towers with lightning and architectural lighting contrast"
|
| 417 |
+
]
|
| 418 |
+
},
|
| 419 |
+
"hashtags": {
|
| 420 |
+
"zh": ["雙子星大樓", "吉隆坡", "馬來西亞", "雙峰塔", "吉隆坡地標"],
|
| 421 |
+
"en": ["PetronasTowers", "KualaLumpur", "Malaysia", "TwinTowers", "KLCC"]
|
| 422 |
+
},
|
| 423 |
+
"cultural_info": {
|
| 424 |
+
"built_year": 1998,
|
| 425 |
+
"architect": "César Pelli",
|
| 426 |
+
"architectural_style": "Postmodern Islamic"
|
| 427 |
+
}
|
| 428 |
+
},
|
| 429 |
+
|
| 430 |
+
"Forbidden City": {
|
| 431 |
+
"name": "Forbidden City",
|
| 432 |
+
"official_name": "故宮",
|
| 433 |
+
"location": {
|
| 434 |
+
"city": "Beijing",
|
| 435 |
+
"country": "China",
|
| 436 |
+
"region": "Dongcheng District",
|
| 437 |
+
"continent": "Asia"
|
| 438 |
+
},
|
| 439 |
+
"visual_cues": {
|
| 440 |
+
"iconic_view": [
|
| 441 |
+
"Massive imperial palace complex with yellow-glazed roof tiles and red walls showing traditional Chinese architecture",
|
| 442 |
+
"Ancient palace with multiple courtyards ceremonial halls and gates in classical Chinese design",
|
| 443 |
+
"Historic royal residence with distinctive golden roofs and vermilion walls in orthogonal layout",
|
| 444 |
+
"Imperial complex with 980 buildings showing Ming and Qing dynasty architectural grandeur"
|
| 445 |
+
],
|
| 446 |
+
"architectural_details": [
|
| 447 |
+
"Yellow glazed roof tiles symbolizing imperial authority with elaborate ceramic figurine decorations",
|
| 448 |
+
"Vermilion walls and columns with golden door studs arranged in traditional Chinese numerical symbolism",
|
| 449 |
+
"Marble terraces and balustrades with dragon and phoenix carved relief decorations",
|
| 450 |
+
"Wooden architecture using traditional dougong bracket system without nails in construction"
|
| 451 |
+
],
|
| 452 |
+
"contextual_view": [
|
| 453 |
+
"Palace viewed through Meridian Gate with vast courtyard and Hall of Supreme Harmony beyond",
|
| 454 |
+
"Forbidden City from Jingshan Park showing complete palace layout and Beijing cityscape",
|
| 455 |
+
"Palace moat and walls with modern Beijing visible in background showing old and new contrast",
|
| 456 |
+
"Interior courtyard with tourists and traditional architecture under blue Beijing sky"
|
| 457 |
+
],
|
| 458 |
+
"seasonal_lighting": [
|
| 459 |
+
"Winter snow covering golden roofs creating dramatic color contrast with white and gold",
|
| 460 |
+
"Autumn light warming red walls with traditional Chinese architecture in clear air",
|
| 461 |
+
"Sunset illuminating yellow roof tiles with golden hour light creating magical atmosphere",
|
| 462 |
+
"Night opening events with palace buildings subtly illuminated showing architectural details"
|
| 463 |
+
]
|
| 464 |
+
},
|
| 465 |
+
"hashtags": {
|
| 466 |
+
"zh": ["故宮", "北京", "紫禁城", "中國", "古蹟", "世界遺產"],
|
| 467 |
+
"en": ["ForbiddenCity", "Beijing", "China", "ImperialPalace", "WorldHeritage", "Palace"]
|
| 468 |
+
},
|
| 469 |
+
"cultural_info": {
|
| 470 |
+
"built_year": 1420,
|
| 471 |
+
"architect": "Kuai Xiang",
|
| 472 |
+
"architectural_style": "Traditional Chinese"
|
| 473 |
+
}
|
| 474 |
+
},
|
| 475 |
+
|
| 476 |
+
# ===== 美洲 Americas =====
|
| 477 |
+
"Statue of Liberty": {
|
| 478 |
+
"name": "Statue of Liberty",
|
| 479 |
+
"official_name": "Liberty Enlightening the World",
|
| 480 |
+
"location": {
|
| 481 |
+
"city": "New York",
|
| 482 |
+
"country": "United States",
|
| 483 |
+
"region": "Liberty Island",
|
| 484 |
+
"continent": "North America"
|
| 485 |
+
},
|
| 486 |
+
"visual_cues": {
|
| 487 |
+
"iconic_view": [
|
| 488 |
+
"Colossal neoclassical sculpture with copper patina holding torch aloft on Liberty Island",
|
| 489 |
+
"Famous statue with crown and tablet showing robed female figure representing Libertas",
|
| 490 |
+
"Iconic green copper statue on pedestal with torch raised and broken chains at feet",
|
| 491 |
+
"Monument with seven-ray crown tablet and torch symbolizing freedom and democracy"
|
| 492 |
+
],
|
| 493 |
+
"architectural_details": [
|
| 494 |
+
"Copper skin with green patina over iron framework designed by Gustave Eiffel",
|
| 495 |
+
"Crown with seven rays representing seven continents and seas with 25 windows",
|
| 496 |
+
"Tablet inscribed with July 4 1776 in Roman numerals held in left hand",
|
| 497 |
+
"Broken shackles and chains at feet symbolizing freedom from oppression"
|
| 498 |
+
],
|
| 499 |
+
"contextual_view": [
|
| 500 |
+
"Statue viewed from Battery Park with New York Harbor and Manhattan skyline behind",
|
| 501 |
+
"Liberty Island with statue and star-shaped Fort Wood pedestal from aerial view",
|
| 502 |
+
"Statue with Staten Island Ferry passing in foreground and Ellis Island nearby",
|
| 503 |
+
"Sunset silhouette with statue outlined against orange sky and New York City lights"
|
| 504 |
+
],
|
| 505 |
+
"seasonal_lighting": [
|
| 506 |
+
"Statue illuminated at night with dramatic uplighting showing sculptural details",
|
| 507 |
+
"Golden hour light warming copper patina with soft shadows on draped clothing",
|
| 508 |
+
"Fourth of July fireworks surrounding statue with patriotic celebration",
|
| 509 |
+
"Misty morning with statue emerging from harbor fog creating mystical atmosphere"
|
| 510 |
+
]
|
| 511 |
+
},
|
| 512 |
+
"hashtags": {
|
| 513 |
+
"zh": ["自由女神", "紐約", "美國", "紐約地標", "自由女神像"],
|
| 514 |
+
"en": ["StatueOfLiberty", "NewYork", "NYC", "Liberty", "USA", "America"]
|
| 515 |
+
},
|
| 516 |
+
"cultural_info": {
|
| 517 |
+
"built_year": 1886,
|
| 518 |
+
"architect": "Frédéric Auguste Bartholdi",
|
| 519 |
+
"architectural_style": "Neoclassicism"
|
| 520 |
+
}
|
| 521 |
+
},
|
| 522 |
+
|
| 523 |
+
"Golden Gate Bridge": {
|
| 524 |
+
"name": "Golden Gate Bridge",
|
| 525 |
+
"official_name": "Golden Gate Bridge",
|
| 526 |
+
"location": {
|
| 527 |
+
"city": "San Francisco",
|
| 528 |
+
"country": "United States",
|
| 529 |
+
"region": "California",
|
| 530 |
+
"continent": "North America"
|
| 531 |
+
},
|
| 532 |
+
"visual_cues": {
|
| 533 |
+
"iconic_view": [
|
| 534 |
+
"Suspension bridge with distinctive International Orange color spanning Golden Gate strait",
|
| 535 |
+
"Art Deco bridge with two towers and cables connecting San Francisco to Marin County",
|
| 536 |
+
"Famous orange bridge with 1.7-mile span over blue Pacific waters and hills beyond",
|
| 537 |
+
"Iconic suspension structure with tall towers and sweeping cables against San Francisco Bay"
|
| 538 |
+
],
|
| 539 |
+
"architectural_details": [
|
| 540 |
+
"Art Deco towers rising 227 meters above water with distinctive vertical ribbing",
|
| 541 |
+
"Main suspension cables made of 27000 wires in distinctive orange color",
|
| 542 |
+
"Deck structure with six traffic lanes suspended from vertical cables",
|
| 543 |
+
"Art Deco design elements including tower portals and lighting fixtures in period style"
|
| 544 |
+
],
|
| 545 |
+
"contextual_view": [
|
| 546 |
+
"Bridge viewed from Marin Headlands with San Francisco skyline and bay in background",
|
| 547 |
+
"Golden Gate from Baker Beach with bridge spanning across water to northern hills",
|
| 548 |
+
"Bridge emerging from famous fog with towers visible above marine layer",
|
| 549 |
+
"Aerial view showing complete span connecting two peninsulas across Golden Gate strait"
|
| 550 |
+
],
|
| 551 |
+
"seasonal_lighting": [
|
| 552 |
+
"Sunset with bridge silhouetted against orange and purple sky over Pacific Ocean",
|
| 553 |
+
"Bridge partially obscured by fog creating mysterious atmospheric effect",
|
| 554 |
+
"Blue hour with bridge illuminated and city lights twinkling in background",
|
| 555 |
+
"Clear day with International Orange color vibrant against blue sky and water"
|
| 556 |
+
]
|
| 557 |
+
},
|
| 558 |
+
"hashtags": {
|
| 559 |
+
"zh": ["金門大橋", "舊金山", "美國", "三藩市", "加州"],
|
| 560 |
+
"en": ["GoldenGateBridge", "SanFrancisco", "SF", "California", "USA", "Bridge"]
|
| 561 |
+
},
|
| 562 |
+
"cultural_info": {
|
| 563 |
+
"built_year": 1937,
|
| 564 |
+
"architect": "Joseph Strauss",
|
| 565 |
+
"architectural_style": "Art Deco"
|
| 566 |
+
}
|
| 567 |
+
},
|
| 568 |
+
|
| 569 |
+
"Christ the Redeemer": {
|
| 570 |
+
"name": "Christ the Redeemer",
|
| 571 |
+
"official_name": "Cristo Redentor",
|
| 572 |
+
"location": {
|
| 573 |
+
"city": "Rio de Janeiro",
|
| 574 |
+
"country": "Brazil",
|
| 575 |
+
"region": "Corcovado Mountain",
|
| 576 |
+
"continent": "South America"
|
| 577 |
+
},
|
| 578 |
+
"visual_cues": {
|
| 579 |
+
"iconic_view": [
|
| 580 |
+
"Massive Art Deco statue of Jesus Christ with outstretched arms atop Corcovado mountain",
|
| 581 |
+
"Colossal soapstone and concrete sculpture overlooking Rio with arms spanning 28 meters",
|
| 582 |
+
"Iconic statue at 30 meters height standing on 8-meter pedestal above rainforest",
|
| 583 |
+
"Monument with distinctive silhouette of Christ figure blessing city from mountain peak"
|
| 584 |
+
],
|
| 585 |
+
"architectural_details": [
|
| 586 |
+
"Reinforced concrete and soapstone construction with Art Deco styling",
|
| 587 |
+
"Triangular mosaic tiles covering exterior in whitish soapstone material",
|
| 588 |
+
"Internal chapel at pedestal base with access stairs and elevator system",
|
| 589 |
+
"Outstretched arms forming cross shape with detailed hands and robed figure"
|
| 590 |
+
],
|
| 591 |
+
"contextual_view": [
|
| 592 |
+
"Statue viewed from Sugarloaf Mountain with Guanabara Bay and Rio sprawl below",
|
| 593 |
+
"Christ overlooking Copacabana and Ipanema beaches with Atlantic Ocean beyond",
|
| 594 |
+
"Monument surrounded by Tijuca Forest with lush tropical vegetation on mountain",
|
| 595 |
+
"Aerial view showing statue's position above city with both ocean and mountains visible"
|
| 596 |
+
],
|
| 597 |
+
"seasonal_lighting": [
|
| 598 |
+
"Statue illuminated at night with dramatic lighting visible across Rio",
|
| 599 |
+
"Sunset silhouette with statue outlined against orange sky above darkening city",
|
| 600 |
+
"Stormy weather with lightning behind statue creating dramatic atmosphere",
|
| 601 |
+
"Special event lighting in various colors for holidays and celebrations"
|
| 602 |
+
]
|
| 603 |
+
},
|
| 604 |
+
"hashtags": {
|
| 605 |
+
"zh": ["基督像", "里約熱內盧", "巴西", "救世基督像", "世界新七大奇蹟"],
|
| 606 |
+
"en": ["ChristTheRedeemer", "Rio", "Brazil", "RioDeJaneiro", "CristoRedentor"]
|
| 607 |
+
},
|
| 608 |
+
"cultural_info": {
|
| 609 |
+
"built_year": 1931,
|
| 610 |
+
"architect": "Paul Landowski",
|
| 611 |
+
"architectural_style": "Art Deco"
|
| 612 |
+
}
|
| 613 |
+
},
|
| 614 |
+
|
| 615 |
+
"CN Tower": {
|
| 616 |
+
"name": "CN Tower",
|
| 617 |
+
"official_name": "Canadian National Tower",
|
| 618 |
+
"location": {
|
| 619 |
+
"city": "Toronto",
|
| 620 |
+
"country": "Canada",
|
| 621 |
+
"region": "Ontario",
|
| 622 |
+
"continent": "North America"
|
| 623 |
+
},
|
| 624 |
+
"visual_cues": {
|
| 625 |
+
"iconic_view": [
|
| 626 |
+
"Concrete communication tower at 553 meters with distinctive pod and antenna spire",
|
| 627 |
+
"Iconic Toronto landmark with observation deck pod and long concrete shaft",
|
| 628 |
+
"Tall broadcasting tower with revolving restaurant and glass floor observation area",
|
| 629 |
+
"Slender concrete tower dominating Toronto skyline with characteristic Y-shaped floor plan"
|
| 630 |
+
],
|
| 631 |
+
"architectural_details": [
|
| 632 |
+
"Hexagonal concrete shaft with three support legs forming Y-shaped base",
|
| 633 |
+
"SkyPod observation level with indoor and outdoor viewing areas at 447 meters",
|
| 634 |
+
"Glass floor section allowing visitors to look straight down to ground",
|
| 635 |
+
"Revolving restaurant completing 360-degree rotation every 72 minutes"
|
| 636 |
+
],
|
| 637 |
+
"contextual_view": [
|
| 638 |
+
"Tower rising above Toronto skyline with Lake Ontario and city sprawl visible",
|
| 639 |
+
"CN Tower viewed from Toronto Islands with waterfront and downtown core",
|
| 640 |
+
"Tower dominating cityscape with Rogers Centre stadium and financial district nearby",
|
| 641 |
+
"Landmark visible from throughout Greater Toronto Area as defining skyline element"
|
| 642 |
+
],
|
| 643 |
+
"seasonal_lighting": [
|
| 644 |
+
"Tower illuminated at night in various colors for events and causes",
|
| 645 |
+
"Canada Day celebration with tower lit in red and white national colors",
|
| 646 |
+
"Sunset with tower silhouetted against colorful sky over Lake Ontario",
|
| 647 |
+
"Winter scene with tower emerging from snow and city lights below"
|
| 648 |
+
]
|
| 649 |
+
},
|
| 650 |
+
"hashtags": {
|
| 651 |
+
"zh": ["CN塔", "多倫多", "加拿大", "多倫多地標", "加拿大國家電視塔"],
|
| 652 |
+
"en": ["CNTower", "Toronto", "Canada", "TorontoLandmark", "YYZ"]
|
| 653 |
+
},
|
| 654 |
+
"cultural_info": {
|
| 655 |
+
"built_year": 1976,
|
| 656 |
+
"architect": "John Andrews",
|
| 657 |
+
"architectural_style": "Modern"
|
| 658 |
+
}
|
| 659 |
+
},
|
| 660 |
+
|
| 661 |
+
# ===== 大洋洲與其他 Oceania & Others =====
|
| 662 |
+
"Sydney Opera House": {
|
| 663 |
+
"name": "Sydney Opera House",
|
| 664 |
+
"official_name": "Sydney Opera House",
|
| 665 |
+
"location": {
|
| 666 |
+
"city": "Sydney",
|
| 667 |
+
"country": "Australia",
|
| 668 |
+
"region": "Bennelong Point",
|
| 669 |
+
"continent": "Oceania"
|
| 670 |
+
},
|
| 671 |
+
"visual_cues": {
|
| 672 |
+
"iconic_view": [
|
| 673 |
+
"Expressionist modern design with distinctive white shell-shaped roof sails on harbor peninsula",
|
| 674 |
+
"Multiple shell structures covered in white and cream tiles rising from water's edge",
|
| 675 |
+
"Iconic performance venue with overlapping concrete shells creating sail-like silhouette",
|
| 676 |
+
"Modernist architecture with innovative roof design of interlocking vaulted shells"
|
| 677 |
+
],
|
| 678 |
+
"architectural_details": [
|
| 679 |
+
"Precast concrete ribs covered with 1056006 white and cream Swedish tiles",
|
| 680 |
+
"Shell structures based on spherical geometry creating self-supporting roof sections",
|
| 681 |
+
"Glass curtain walls filling spaces between shells and podium below",
|
| 682 |
+
"Multiple performance halls including Concert Hall and Joan Sutherland Theatre within shells"
|
| 683 |
+
],
|
| 684 |
+
"contextual_view": [
|
| 685 |
+
"Opera House on Bennelong Point with Sydney Harbour Bridge in background",
|
| 686 |
+
"Building viewed from Circular Quay with harbor ferries and city skyline",
|
| 687 |
+
"Opera House at sunset with sails reflecting golden light over harbor waters",
|
| 688 |
+
"Aerial view showing building's position on peninsula with Royal Botanic Gardens adjacent"
|
| 689 |
+
],
|
| 690 |
+
"seasonal_lighting": [
|
| 691 |
+
"Vivid Sydney festival with colorful projections on shell surfaces",
|
| 692 |
+
"Sunset illuminating white tiles with warm light and harbor reflections",
|
| 693 |
+
"Night lighting highlighting architectural forms against dark harbor",
|
| 694 |
+
"New Year's Eve with fireworks from Harbour Bridge framing Opera House"
|
| 695 |
+
]
|
| 696 |
+
},
|
| 697 |
+
"hashtags": {
|
| 698 |
+
"zh": ["雪梨歌劇院", "雪梨", "澳洲", "澳大利亞", "世界遺產"],
|
| 699 |
+
"en": ["SydneyOperaHouse", "Sydney", "Australia", "OperaHouse", "WorldHeritage"]
|
| 700 |
+
},
|
| 701 |
+
"cultural_info": {
|
| 702 |
+
"built_year": 1973,
|
| 703 |
+
"architect": "Jørn Utzon",
|
| 704 |
+
"architectural_style": "Expressionist Modernism"
|
| 705 |
+
}
|
| 706 |
+
},
|
| 707 |
+
|
| 708 |
+
"Taj Mahal": {
|
| 709 |
+
"name": "Taj Mahal",
|
| 710 |
+
"official_name": "ताज महल",
|
| 711 |
+
"location": {
|
| 712 |
+
"city": "Agra",
|
| 713 |
+
"country": "India",
|
| 714 |
+
"region": "Uttar Pradesh",
|
| 715 |
+
"continent": "Asia"
|
| 716 |
+
},
|
| 717 |
+
"visual_cues": {
|
| 718 |
+
"iconic_view": [
|
| 719 |
+
"White marble mausoleum with central dome and four minarets in Mughal architecture style",
|
| 720 |
+
"Ivory-white marble structure with perfect symmetry reflected in long rectangular pool",
|
| 721 |
+
"Iconic domed monument with intricate inlay work and Islamic calligraphy decorations",
|
| 722 |
+
"Majestic tomb complex with main building flanked by symmetrical mosque and guest house"
|
| 723 |
+
],
|
| 724 |
+
"architectural_details": [
|
| 725 |
+
"Central dome rising 35 meters surrounded by four smaller chattri domes",
|
| 726 |
+
"Pietra dura inlay work with semi-precious stones creating floral patterns",
|
| 727 |
+
"Four minarets at corners standing 40 meters high with tilted design for earthquake safety",
|
| 728 |
+
"Calligraphic inscriptions from Quran decorating entrance archways in black marble"
|
| 729 |
+
],
|
| 730 |
+
"contextual_view": [
|
| 731 |
+
"Taj Mahal viewed through main gateway with frame creating first impression",
|
| 732 |
+
"Monument reflected in Yamuna River during calm conditions with gardens in foreground",
|
| 733 |
+
"Taj from Mehtab Bagh garden across river showing rear view and riverbank",
|
| 734 |
+
"Complex with charbagh Persian garden layout leading to mausoleum platform"
|
| 735 |
+
],
|
| 736 |
+
"seasonal_lighting": [
|
| 737 |
+
"Sunrise with monument glowing pink and orange in soft morning light",
|
| 738 |
+
"Full moon night viewing with white marble luminous under moonlight",
|
| 739 |
+
"Sunset creating warm golden tones on marble with long shadows",
|
| 740 |
+
"Misty morning with Taj emerging from fog over Yamuna River"
|
| 741 |
+
]
|
| 742 |
+
},
|
| 743 |
+
"hashtags": {
|
| 744 |
+
"zh": ["泰姬陵", "印度", "阿格拉", "世界遺產", "世界奇蹟"],
|
| 745 |
+
"en": ["TajMahal", "India", "Agra", "WorldHeritage", "Monument", "Mausoleum"]
|
| 746 |
+
},
|
| 747 |
+
"cultural_info": {
|
| 748 |
+
"built_year": 1653,
|
| 749 |
+
"architect": "Ustad Ahmad Lahauri",
|
| 750 |
+
"architectural_style": "Mughal"
|
| 751 |
+
}
|
| 752 |
+
},
|
| 753 |
+
|
| 754 |
+
"Pyramids of Giza": {
|
| 755 |
+
"name": "Pyramids of Giza",
|
| 756 |
+
"official_name": "أهرامات الجيزة",
|
| 757 |
+
"location": {
|
| 758 |
+
"city": "Giza",
|
| 759 |
+
"country": "Egypt",
|
| 760 |
+
"region": "Greater Cairo",
|
| 761 |
+
"continent": "Africa"
|
| 762 |
+
},
|
| 763 |
+
"visual_cues": {
|
| 764 |
+
"iconic_view": [
|
| 765 |
+
"Three ancient pyramids rising from desert plateau with Great Pyramid as largest structure",
|
| 766 |
+
"Massive limestone pyramids with Great Sphinx in foreground on Giza Plateau",
|
| 767 |
+
"Ancient Egyptian royal tombs with precise geometric forms against desert sky",
|
| 768 |
+
"Monumental pyramids showing weathered limestone blocks and missing outer casing"
|
| 769 |
+
],
|
| 770 |
+
"architectural_details": [
|
| 771 |
+
"Great Pyramid originally 146 meters with 2.3 million limestone blocks",
|
| 772 |
+
"Precise alignment to cardinal directions with astronomical significance",
|
| 773 |
+
"Internal chambers and passages including King's Chamber and Grand Gallery",
|
| 774 |
+
"Remaining casing stones at apex showing original smooth white limestone covering"
|
| 775 |
+
],
|
| 776 |
+
"contextual_view": [
|
| 777 |
+
"Pyramids with Great Sphinx in foreground and Cairo urban sprawl in background",
|
| 778 |
+
"Three pyramids aligned with smaller queens pyramids and ancient cemetery",
|
| 779 |
+
"Desert landscape with pyramids and camel riders providing scale",
|
| 780 |
+
"Aerial view showing pyramid complex relationship to Nile River and modern city"
|
| 781 |
+
],
|
| 782 |
+
"seasonal_lighting": [
|
| 783 |
+
"Sound and light show with colorful illumination on pyramid faces at night",
|
| 784 |
+
"Sunrise with pyramids silhouetted against orange desert sky",
|
| 785 |
+
"Harsh midday sun creating strong shadows and highlighting weathered stone",
|
| 786 |
+
"Golden hour light warming limestone with dramatic shadows emphasizing geometry"
|
| 787 |
+
]
|
| 788 |
+
},
|
| 789 |
+
"hashtags": {
|
| 790 |
+
"zh": ["金字塔", "埃及", "吉薩", "古埃及", "世界奇蹟", "人面獅身像"],
|
| 791 |
+
"en": ["Pyramids", "Egypt", "Giza", "GreatPyramid", "AncientEgypt", "Sphinx"]
|
| 792 |
+
},
|
| 793 |
+
"cultural_info": {
|
| 794 |
+
"built_year": -2560,
|
| 795 |
+
"architect": "Hemiunu",
|
| 796 |
+
"architectural_style": "Ancient Egyptian"
|
| 797 |
+
}
|
| 798 |
+
},
|
| 799 |
+
|
| 800 |
+
"Machu Picchu": {
|
| 801 |
+
"name": "Machu Picchu",
|
| 802 |
+
"official_name": "Machu Picchu",
|
| 803 |
+
"location": {
|
| 804 |
+
"city": "Cusco Region",
|
| 805 |
+
"country": "Peru",
|
| 806 |
+
"region": "Urubamba Province",
|
| 807 |
+
"continent": "South America"
|
| 808 |
+
},
|
| 809 |
+
"visual_cues": {
|
| 810 |
+
"iconic_view": [
|
| 811 |
+
"Ancient Incan citadel on mountain ridge with terraced structures and Huayna Picchu peak behind",
|
| 812 |
+
"Stone ruins at 2430 meters altitude with dramatic mountain setting and cloud forest",
|
| 813 |
+
"Archaeological site with precisely fitted stone walls temples and agricultural terraces",
|
| 814 |
+
"Lost city with iconic postcard view showing complete site with Wayna Picchu mountain"
|
| 815 |
+
],
|
| 816 |
+
"architectural_details": [
|
| 817 |
+
"Dry-stone construction with precisely cut granite blocks without mortar",
|
| 818 |
+
"Agricultural terraces with sophisticated drainage systems on steep slopes",
|
| 819 |
+
"Temple of the Sun with curved wall and astronomical alignment features",
|
| 820 |
+
"Intihuatana ritual stone showing Incan astronomical and agricultural knowledge"
|
| 821 |
+
],
|
| 822 |
+
"contextual_view": [
|
| 823 |
+
"Citadel viewed from Sun Gate after completing Inca Trail with morning light",
|
| 824 |
+
"Site from Huayna Picchu summit showing complete layout and surrounding mountains",
|
| 825 |
+
"Machu Picchu with Urubamba River valley and cloud forest below",
|
| 826 |
+
"Ruins with llamas grazing among ancient structures creating iconic Andean scene"
|
| 827 |
+
],
|
| 828 |
+
"seasonal_lighting": [
|
| 829 |
+
"Sunrise illuminating ruins with first light as mist clears from valleys",
|
| 830 |
+
"Dramatic clouds surrounding peaks with ruins emerging from mountain fog",
|
| 831 |
+
"Afternoon light creating shadows that emphasize stone wall construction details",
|
| 832 |
+
"Rainy season with lush green terraces and dramatic cloud formations"
|
| 833 |
+
]
|
| 834 |
+
},
|
| 835 |
+
"hashtags": {
|
| 836 |
+
"zh": ["馬丘比丘", "秘魯", "印加", "世界遺產", "失落之城"],
|
| 837 |
+
"en": ["MachuPicchu", "Peru", "Inca", "WorldHeritage", "LostCity", "Cusco"]
|
| 838 |
+
},
|
| 839 |
+
"cultural_info": {
|
| 840 |
+
"built_year": 1450,
|
| 841 |
+
"architect": "Pachacuti Inca Yupanqui",
|
| 842 |
+
"architectural_style": "Inca"
|
| 843 |
+
}
|
| 844 |
+
},
|
| 845 |
+
|
| 846 |
+
"Petra": {
|
| 847 |
+
"name": "Petra",
|
| 848 |
+
"official_name": "البتراء",
|
| 849 |
+
"location": {
|
| 850 |
+
"city": "Ma'an Governorate",
|
| 851 |
+
"country": "Jordan",
|
| 852 |
+
"region": "Wadi Musa",
|
| 853 |
+
"continent": "Asia"
|
| 854 |
+
},
|
| 855 |
+
"visual_cues": {
|
| 856 |
+
"iconic_view": [
|
| 857 |
+
"Rose-red sandstone Treasury building carved into cliff face with Hellenistic facade",
|
| 858 |
+
"Al-Khazneh temple with elaborate columns and sculptures in pink Nabataean rock",
|
| 859 |
+
"Ancient city carved from rock with dramatic facade revealed through narrow Siq canyon",
|
| 860 |
+
"Monumental rock-cut architecture with classical design in desert landscape"
|
| 861 |
+
],
|
| 862 |
+
"architectural_details": [
|
| 863 |
+
"Hellenistic facade with Corinthian columns and ornate sculptural decorations",
|
| 864 |
+
"Rock-cut construction showing Nabataean engineering carved directly from sandstone cliff",
|
| 865 |
+
"Rose-red to pink sandstone with natural color variations in rock layers",
|
| 866 |
+
"Urn monument crowning upper level with classical Greek architectural influences"
|
| 867 |
+
],
|
| 868 |
+
"contextual_view": [
|
| 869 |
+
"Treasury viewed through narrow opening of Siq canyon creating dramatic reveal",
|
| 870 |
+
"Petra archaeological park with multiple rock-cut structures and Roman amphitheater",
|
| 871 |
+
"Site in desert landscape with Bedouin presence and arid mountain scenery",
|
| 872 |
+
"Monastery building requiring climb up ancient steps with panoramic desert views"
|
| 873 |
+
],
|
| 874 |
+
"seasonal_lighting": [
|
| 875 |
+
"Morning light illuminating Treasury facade with warm glow on rose-red stone",
|
| 876 |
+
"Petra by Night with Treasury lit by candlelight creating magical atmosphere",
|
| 877 |
+
"Harsh midday sun emphasizing color variations and carved details in rock",
|
| 878 |
+
"Late afternoon shadows creating depth and emphasizing architectural relief"
|
| 879 |
+
]
|
| 880 |
+
},
|
| 881 |
+
"hashtags": {
|
| 882 |
+
"zh": ["佩特拉", "約旦", "玫瑰城", "世界遺產", "世界新七大奇蹟"],
|
| 883 |
+
"en": ["Petra", "Jordan", "Treasury", "AlKhazneh", "WorldHeritage", "RoseCity"]
|
| 884 |
+
},
|
| 885 |
+
"cultural_info": {
|
| 886 |
+
"built_year": -312,
|
| 887 |
+
"architect": "Nabataeans",
|
| 888 |
+
"architectural_style": "Nabataean"
|
| 889 |
+
}
|
| 890 |
+
},
|
| 891 |
+
|
| 892 |
+
"Stonehenge": {
|
| 893 |
+
"name": "Stonehenge",
|
| 894 |
+
"official_name": "Stonehenge",
|
| 895 |
+
"location": {
|
| 896 |
+
"city": "Wiltshire",
|
| 897 |
+
"country": "United Kingdom",
|
| 898 |
+
"region": "Salisbury Plain",
|
| 899 |
+
"continent": "Europe"
|
| 900 |
+
},
|
| 901 |
+
"visual_cues": {
|
| 902 |
+
"iconic_view": [
|
| 903 |
+
"Prehistoric monument with massive standing stones arranged in circular pattern on plain",
|
| 904 |
+
"Ancient stone circle with trilithons and sarsen stones in open landscape",
|
| 905 |
+
"Neolithic structure with distinctive stone archways and circular earthwork setting",
|
| 906 |
+
"Mysterious megalithic monument with bluestones and sarsen stones against sky"
|
| 907 |
+
],
|
| 908 |
+
"architectural_details": [
|
| 909 |
+
"Sarsen stone trilithons with horizontal lintels connected by mortise and tenon joints",
|
| 910 |
+
"Bluestone arrangement within larger sarsen circle showing different stone types",
|
| 911 |
+
"Heel Stone and Avenue aligned to summer solstice sunrise",
|
| 912 |
+
"Weathered surfaces showing 5000 years of exposure to English weather"
|
| 913 |
+
],
|
| 914 |
+
"contextual_view": [
|
| 915 |
+
"Stone circle in pastoral English landscape with sheep grazing on Salisbury Plain",
|
| 916 |
+
"Monument from distance showing relationship to surrounding earthworks and barrows",
|
| 917 |
+
"Stonehenge with visitors for scale showing massive size of individual stones",
|
| 918 |
+
"Site from aerial view revealing circular formation and astronomical alignments"
|
| 919 |
+
],
|
| 920 |
+
"seasonal_lighting": [
|
| 921 |
+
"Summer solstice sunrise with sun aligned through stones as crowds gather",
|
| 922 |
+
"Winter solstice sunset creating dramatic silhouettes of standing stones",
|
| 923 |
+
"Moody overcast conditions with stones against dramatic English sky",
|
| 924 |
+
"Misty morning with stones emerging from fog creating mystical atmosphere"
|
| 925 |
+
]
|
| 926 |
+
},
|
| 927 |
+
"hashtags": {
|
| 928 |
+
"zh": ["巨石陣", "英國", "史前遺跡", "世界遺產", "威爾特郡"],
|
| 929 |
+
"en": ["Stonehenge", "England", "UK", "Prehistoric", "WorldHeritage", "Wiltshire"]
|
| 930 |
+
},
|
| 931 |
+
"cultural_info": {
|
| 932 |
+
"built_year": -3000,
|
| 933 |
+
"architect": "Unknown (Neolithic peoples)",
|
| 934 |
+
"architectural_style": "Prehistoric"
|
| 935 |
+
}
|
| 936 |
+
}
|
| 937 |
+
}
|
| 938 |
+
|
| 939 |
+
print(f"✓ Landmark Prompts initialized with {len(self.landmarks)} world landmarks")
|
| 940 |
+
|
| 941 |
+
def get_prompts(self, landmark_name: str) -> Optional[Dict]:
    """Look up the full prompt record for a single landmark.

    Args:
        landmark_name: Key identifying the landmark.

    Returns:
        The landmark's data dictionary, or None when the name is unknown.
    """
    catalog = self.landmarks
    return catalog.get(landmark_name)
|
| 952 |
+
|
| 953 |
+
def get_all_landmarks(self) -> Dict:
    """Return the complete landmark mapping (the live internal dict, not a copy)."""
    catalog = self.landmarks
    return catalog
|
| 956 |
+
|
| 957 |
+
def search_by_location(self, city: str = None, country: str = None) -> List[str]:
    """Find landmark names matching a geographic filter.

    Args:
        city: City name to match (exact string match).
        country: Country name to match (exact string match).

    Returns:
        Names of landmarks matching the supplied filters. When neither
        filter is given (or both are falsy), the result is empty.
    """
    matches = []
    for name, record in self.landmarks.items():
        loc = record.get('location', {})

        # Decide whether this landmark satisfies the active filter(s).
        # Truthiness (not `is None`) is intentional: empty strings count
        # as "no filter", mirroring the original contract.
        if city and country:
            hit = loc.get('city') == city and loc.get('country') == country
        elif city:
            hit = loc.get('city') == city
        elif country:
            hit = loc.get('country') == country
        else:
            hit = False

        if hit:
            matches.append(name)

    return matches
|
| 983 |
+
|
| 984 |
+
def get_visual_prompts(self, landmark_name: str, context: str = 'iconic_view') -> List[str]:
    """Fetch the visual-description prompts for a landmark in one context.

    Args:
        landmark_name: Key identifying the landmark.
        context: One of 'iconic_view', 'architectural_details',
            'contextual_view', 'seasonal_lighting'.

    Returns:
        The list of descriptions for that context; empty when either the
        landmark or the context is missing.
    """
    record = self.landmarks.get(landmark_name)
    if not record:
        return []
    return record.get('visual_cues', {}).get(context, [])
|
| 1001 |
+
|
| 1002 |
+
def get_hashtags(self, landmark_name: str, language: str = 'zh') -> List[str]:
    """Return the social-media hashtags for a landmark.

    Args:
        landmark_name: Key identifying the landmark.
        language: 'zh', 'en', or 'zh-en'/'both' for the concatenation of
            both lists. Any other value falls back to 'zh'.

    Returns:
        Hashtag list (empty when the landmark is unknown).
    """
    record = self.landmarks.get(landmark_name)
    if not record:
        return []

    tags = record.get('hashtags', {})
    if language == 'en':
        return tags.get('en', [])
    if language in ('zh-en', 'both'):
        # Chinese tags first, then English — same order as before.
        return tags.get('zh', []) + tags.get('en', [])
    # 'zh' and any unrecognised language both fall back to the Chinese set.
    return tags.get('zh', [])
|
| 1029 |
+
|
| 1030 |
+
# Module-load confirmation emitted once the LandmarkPrompts class definition completes.
print("✓ LandmarkPrompts defined")
|
lighting_analysis_manager.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from typing import Dict, Tuple
|
| 7 |
+
import torchvision.models as models
|
| 8 |
+
import torchvision.transforms as transforms
|
| 9 |
+
|
| 10 |
+
class LightingAnalysisManager:
|
| 11 |
+
"""Advanced lighting analysis using Places365 scene recognition + CV features"""
|
| 12 |
+
|
| 13 |
+
def __init__(self):
|
| 14 |
+
print("Initializing Lighting Analysis Manager with Places365...")
|
| 15 |
+
|
| 16 |
+
# Places365 ResNet18
|
| 17 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 18 |
+
self._load_places365_model()
|
| 19 |
+
|
| 20 |
+
# CV feature weights (Places365 gets higher weight)
|
| 21 |
+
self.feature_weights = {
|
| 22 |
+
'places365': 0.50, # Primary weight to Places365
|
| 23 |
+
'brightness': 0.15,
|
| 24 |
+
'color_temp': 0.15,
|
| 25 |
+
'contrast': 0.08,
|
| 26 |
+
'gradient': 0.05, # Auxiliary features
|
| 27 |
+
'laplacian': 0.04,
|
| 28 |
+
'color_variation': 0.03
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
print("✓ Lighting Analysis Manager initialized with Places365 + advanced CV features")
|
| 32 |
+
|
| 33 |
+
def _load_places365_model(self):
    """Load the Places365-trained ResNet18 used for scene classification.

    On success sets:
      - ``self.places_model``: ResNet18 in eval mode on ``self.device``
        (Places365 weights, or ImageNet weights as a download fallback),
      - ``self.places_transform``: 224x224 ImageNet-normalised preprocessing,
      - ``self.lighting_scenes``: scene names loosely grouped by lighting type.

    If model construction fails entirely, ``self.places_model`` is set to
    None and the manager runs in CV-only mode.
    """
    try:
        # ResNet18 backbone with the classifier head resized to the 365 Places categories.
        model = models.resnet18(weights=None)
        model.fc = nn.Linear(model.fc.in_features, 365)

        # Load Places365 weights (if available, otherwise use ImageNet as fallback)
        try:
            checkpoint_url = 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar'
            checkpoint = torch.hub.load_state_dict_from_url(
                checkpoint_url,
                map_location=self.device,
                progress=False
            )
            # Checkpoint was saved from a DataParallel wrapper; strip the 'module.' prefix.
            state_dict = {k.replace('module.', ''): v for k, v in checkpoint['state_dict'].items()}
            model.load_state_dict(state_dict)
            print(" Loaded Places365 ResNet18 weights")
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
            # Download or state-dict mismatch: degrade to ImageNet weights rather than abort.
            print(" Using ImageNet pretrained ResNet18 (fallback)")
            model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        model = model.to(self.device)
        model.eval()
        self.places_model = model

        # Image preprocessing for Places365 (standard ImageNet normalisation).
        self.places_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

        # Scene categories related to lighting
        self.lighting_scenes = {
            'sunny': ['street', 'downtown', 'plaza', 'park', 'field'],
            'overcast': ['alley', 'covered_bridge', 'corridor'],
            'indoor': ['lobby', 'office', 'museum', 'restaurant'],
            'evening': ['street', 'downtown', 'plaza'],
            'natural': ['park', 'forest', 'mountain', 'coast']
        }

    except Exception as e:
        print(f" Warning: Places365 loading failed ({e}), using CV-only mode")
        self.places_model = None
|
| 82 |
+
|
| 83 |
+
def analyze_lighting(self, image: Image.Image) -> Dict:
    """Run the full lighting analysis pipeline on one image.

    Combines low-level CV statistics with Places365 scene context and
    classifies the lighting condition adaptively.

    Args:
        image: Input PIL image (RGB).

    Returns:
        Dict with keys 'lighting_type', 'confidence', 'cv_features',
        'scene_info'.
    """
    # Physical image statistics (brightness, colour temperature, gradients, ...).
    features = self._extract_cv_features(image)

    # Semantic scene context from Places365 (stubbed out when the model is absent).
    scene = self._analyze_scene_places365(image)

    # Fuse both signal sources into a single lighting label + confidence score.
    condition, score = self._determine_lighting_adaptive(features, scene)

    return {
        'lighting_type': condition,
        'confidence': score,
        'cv_features': features,
        'scene_info': scene
    }
|
| 103 |
+
|
| 104 |
+
def _extract_cv_features(self, image: Image.Image) -> Dict:
    """Compute low-level lighting statistics from the raw pixels.

    Primary features (brightness, colour temperature, contrast, shadow
    ratio) drive the classification directly; the gradient/Laplacian/colour
    variation trio serves as auxiliary evidence alongside Places365.

    Args:
        image: Input PIL image, assumed RGB.

    Returns:
        Dict of seven named float features.
    """
    rgb = np.array(image)
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # --- Primary features ---

    # Overall brightness: mean of the L channel in LAB space.
    lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    brightness = float(np.mean(lab[:, :, 0]))

    # Colour temperature proxy: red/blue channel ratio (epsilon avoids /0).
    blue_mean = np.mean(bgr[:, :, 0])
    red_mean = np.mean(bgr[:, :, 2])
    color_temp = float(red_mean / (blue_mean + 1e-6))

    # Contrast: standard deviation of the grayscale image.
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    contrast = float(np.std(grayscale))

    # Shadow coverage: fraction of pixels darker than the fixed threshold 80.
    _, dark_mask = cv2.threshold(grayscale, 80, 255, cv2.THRESH_BINARY_INV)
    shadow_ratio = float(np.sum(dark_mask > 0) / dark_mask.size)

    # --- Auxiliary features ---

    # First derivative (Sobel magnitude): strong gradients suggest
    # directional lighting, weak gradients suggest diffused light.
    grad_x = cv2.Sobel(grayscale, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(grayscale, cv2.CV_64F, 0, 1, ksize=3)
    gradient_strength = float(np.mean(np.sqrt(grad_x**2 + grad_y**2)))

    # Second derivative (Laplacian variance): high variance indicates
    # complex lighting with many transitions.
    laplacian_var = float(np.var(cv2.Laplacian(grayscale, cv2.CV_64F)))

    # Chromatic spread in LAB a*/b* channels: low variation suggests
    # overcast/diffused light, high suggests mixed sources.
    a_spread = float(np.std(lab[:, :, 1]))  # a* channel (green-red)
    b_spread = float(np.std(lab[:, :, 2]))  # b* channel (blue-yellow)
    color_variation = (a_spread + b_spread) / 2

    return {
        # Primary features
        'brightness': brightness,
        'color_temp': color_temp,
        'contrast': contrast,
        'shadow_ratio': shadow_ratio,
        # Advanced auxiliary features (to assist Places365)
        'gradient_strength': gradient_strength,
        'laplacian_variance': laplacian_var,
        'color_variation': color_variation
    }
|
| 157 |
+
|
| 158 |
+
def _analyze_scene_places365(self, image: Image.Image) -> Dict:
|
| 159 |
+
"""Analyze scene using Places365"""
|
| 160 |
+
if self.places_model is None:
|
| 161 |
+
return {'scene_category': 'unknown', 'confidence': 0.0}
|
| 162 |
+
|
| 163 |
+
try:
|
| 164 |
+
with torch.no_grad():
|
| 165 |
+
img_tensor = self.places_transform(image).unsqueeze(0).to(self.device)
|
| 166 |
+
logits = self.places_model(img_tensor)
|
| 167 |
+
probs = torch.nn.functional.softmax(logits, dim=1)
|
| 168 |
+
|
| 169 |
+
# Get top prediction
|
| 170 |
+
top_prob, top_idx = torch.max(probs, 1)
|
| 171 |
+
|
| 172 |
+
# Simple scene categories
|
| 173 |
+
# Using index ranges for common outdoor scenes
|
| 174 |
+
is_outdoor = top_idx.item() < 200 # Rough heuristic
|
| 175 |
+
|
| 176 |
+
return {
|
| 177 |
+
'scene_category': 'outdoor' if is_outdoor else 'indoor',
|
| 178 |
+
'confidence': float(top_prob.item()),
|
| 179 |
+
'scene_idx': int(top_idx.item())
|
| 180 |
+
}
|
| 181 |
+
except Exception as e:
|
| 182 |
+
print(f" Places365 inference failed: {e}")
|
| 183 |
+
return {'scene_category': 'unknown', 'confidence': 0.0}
|
| 184 |
+
|
| 185 |
+
def _detect_indoor_scene(self, cv_features: Dict, scene_info: Dict) -> bool:
|
| 186 |
+
"""
|
| 187 |
+
Detect if scene is indoor or outdoor using multiple signals
|
| 188 |
+
|
| 189 |
+
Args:
|
| 190 |
+
cv_features: Computer vision features
|
| 191 |
+
scene_info: Places365 scene information
|
| 192 |
+
|
| 193 |
+
Returns:
|
| 194 |
+
True if indoor, False if outdoor
|
| 195 |
+
"""
|
| 196 |
+
indoor_score = 0.0
|
| 197 |
+
|
| 198 |
+
# Signal 1: Places365 scene category (strongest signal)
|
| 199 |
+
if scene_info.get('scene_category') == 'indoor':
|
| 200 |
+
indoor_score += 0.5
|
| 201 |
+
elif scene_info.get('scene_category') == 'outdoor':
|
| 202 |
+
indoor_score -= 0.3
|
| 203 |
+
|
| 204 |
+
# Signal 2: Brightness patterns
|
| 205 |
+
# Indoor scenes typically have controlled brightness (not too bright, not too dark)
|
| 206 |
+
brightness = cv_features['brightness']
|
| 207 |
+
if 60 < brightness < 220: # 放寬範圍,包含更多室內場景
|
| 208 |
+
indoor_score += 0.15
|
| 209 |
+
elif brightness > 230: # Very bright suggests outdoor
|
| 210 |
+
indoor_score -= 0.2
|
| 211 |
+
|
| 212 |
+
# Signal 3: Low gradient suggests controlled/diffused indoor lighting
|
| 213 |
+
gradient = cv_features['gradient_strength']
|
| 214 |
+
if gradient < 20: # 放寬閾值,更多室內場景符合
|
| 215 |
+
indoor_score += 0.15
|
| 216 |
+
|
| 217 |
+
# Signal 4: Low laplacian variance suggests smooth indoor lighting
|
| 218 |
+
laplacian = cv_features['laplacian_variance']
|
| 219 |
+
if laplacian < 400: # 放寬閾值,包含更多室內場景
|
| 220 |
+
indoor_score += 0.10
|
| 221 |
+
|
| 222 |
+
# Signal 5: Shadow ratio - indoor scenes have less harsh shadows
|
| 223 |
+
shadow_ratio = cv_features['shadow_ratio']
|
| 224 |
+
if shadow_ratio < 0.25: # 放寬閾值,包含更多室內場景
|
| 225 |
+
indoor_score += 0.10
|
| 226 |
+
elif shadow_ratio > 0.5: # Strong shadows suggest outdoor sunlight
|
| 227 |
+
indoor_score -= 0.15
|
| 228 |
+
|
| 229 |
+
# Threshold: indoor if score > 0.15 (降低閾值,更容易判定為室內)
|
| 230 |
+
return indoor_score > 0.15
|
| 231 |
+
|
| 232 |
+
def _determine_indoor_lighting(self, cv_features: Dict) -> Tuple[str, float]:
|
| 233 |
+
"""
|
| 234 |
+
Determine lighting type for indoor scenes
|
| 235 |
+
|
| 236 |
+
Returns indoor-specific lighting types with confidence
|
| 237 |
+
"""
|
| 238 |
+
brightness = cv_features['brightness']
|
| 239 |
+
color_temp = cv_features['color_temp']
|
| 240 |
+
contrast = cv_features['contrast']
|
| 241 |
+
shadow_ratio = cv_features['shadow_ratio']
|
| 242 |
+
gradient = cv_features['gradient_strength']
|
| 243 |
+
laplacian = cv_features['laplacian_variance']
|
| 244 |
+
|
| 245 |
+
# Normalize features
|
| 246 |
+
brightness_norm = min(brightness / 255.0, 1.0)
|
| 247 |
+
contrast_norm = min(contrast / 100.0, 1.0)
|
| 248 |
+
gradient_norm = min(gradient / 50.0, 1.0)
|
| 249 |
+
laplacian_norm = min(laplacian / 1000.0, 1.0)
|
| 250 |
+
|
| 251 |
+
scores = {}
|
| 252 |
+
|
| 253 |
+
# Studio/Product Lighting (工作室/產品攝影燈光)
|
| 254 |
+
# Very controlled, bright, minimal shadows, low gradient
|
| 255 |
+
studio_score = (
|
| 256 |
+
0.35 * (1.0 if brightness_norm > 0.6 else 0.5) + # Bright
|
| 257 |
+
0.25 * (1.0 - shadow_ratio) + # Minimal shadows
|
| 258 |
+
0.20 * (1.0 - gradient_norm) + # Smooth, even
|
| 259 |
+
0.15 * (1.0 - laplacian_norm) + # Very smooth
|
| 260 |
+
0.05 * (1.0 - abs(color_temp - 1.0)) # Neutral temp
|
| 261 |
+
)
|
| 262 |
+
scores['studio lighting'] = studio_score
|
| 263 |
+
|
| 264 |
+
# Indoor Natural Light (室內自然光 - 窗光)
|
| 265 |
+
# Medium-bright, some contrast, neutral to warm temp
|
| 266 |
+
natural_indoor_score = (
|
| 267 |
+
0.30 * (1.0 if 0.5 < brightness_norm < 0.8 else 0.5) + # Medium-bright
|
| 268 |
+
0.25 * min(contrast_norm, 0.6) + # Some contrast
|
| 269 |
+
0.20 * (1.0 if color_temp > 0.95 else 0.5) + # Neutral to warm
|
| 270 |
+
0.15 * min(gradient_norm, 0.5) + # Some direction
|
| 271 |
+
0.10 * (1.0 if shadow_ratio < 0.3 else 0.5) # Some shadows
|
| 272 |
+
)
|
| 273 |
+
scores['indoor natural light'] = natural_indoor_score
|
| 274 |
+
|
| 275 |
+
# Warm Artificial Lighting (溫暖人工照明)
|
| 276 |
+
# Warm color temp, medium brightness, soft
|
| 277 |
+
warm_artificial_score = (
|
| 278 |
+
0.35 * (1.0 if color_temp > 1.1 else 0.3) + # Warm temp
|
| 279 |
+
0.25 * (1.0 - abs(brightness_norm - 0.5)) + # Medium brightness
|
| 280 |
+
0.20 * (1.0 - gradient_norm) + # Soft
|
| 281 |
+
0.15 * (1.0 - shadow_ratio) + # Minimal shadows
|
| 282 |
+
0.05 * (1.0 - laplacian_norm) # Smooth
|
| 283 |
+
)
|
| 284 |
+
scores['warm artificial lighting'] = warm_artificial_score
|
| 285 |
+
|
| 286 |
+
# Cool Artificial Lighting (冷色人工照明)
|
| 287 |
+
# Cool/neutral temp, medium-bright
|
| 288 |
+
cool_artificial_score = (
|
| 289 |
+
0.35 * (1.0 if color_temp < 1.05 else 0.4) + # Cool/neutral temp
|
| 290 |
+
0.25 * (1.0 if brightness_norm > 0.5 else 0.5) + # Medium-bright
|
| 291 |
+
0.20 * (1.0 - gradient_norm) + # Smooth
|
| 292 |
+
0.15 * (1.0 - shadow_ratio) + # Minimal shadows
|
| 293 |
+
0.05 * (1.0 - laplacian_norm) # Even
|
| 294 |
+
)
|
| 295 |
+
scores['cool artificial lighting'] = cool_artificial_score
|
| 296 |
+
|
| 297 |
+
# Soft Indoor Lighting (柔和室內光線)
|
| 298 |
+
# Low contrast, diffused, medium brightness
|
| 299 |
+
soft_indoor_score = (
|
| 300 |
+
0.30 * (1.0 - abs(brightness_norm - 0.5)) + # Medium brightness
|
| 301 |
+
0.30 * (1.0 - contrast_norm) + # Low contrast
|
| 302 |
+
0.20 * (1.0 - gradient_norm) + # Very soft
|
| 303 |
+
0.15 * (1.0 - shadow_ratio) + # Minimal shadows
|
| 304 |
+
0.05 * (1.0 - laplacian_norm) # Smooth
|
| 305 |
+
)
|
| 306 |
+
scores['soft indoor lighting'] = soft_indoor_score
|
| 307 |
+
|
| 308 |
+
# Dramatic Indoor Lighting (戲劇性室內光線)
|
| 309 |
+
# High contrast, directional, some shadows
|
| 310 |
+
dramatic_score = (
|
| 311 |
+
0.35 * contrast_norm + # High contrast
|
| 312 |
+
0.25 * gradient_norm + # Directional
|
| 313 |
+
0.20 * shadow_ratio + # Shadows present
|
| 314 |
+
0.15 * laplacian_norm + # Sharp transitions
|
| 315 |
+
0.05 * (1.0 if brightness_norm < 0.6 else 0.5) # Can be darker
|
| 316 |
+
)
|
| 317 |
+
scores['dramatic indoor lighting'] = dramatic_score
|
| 318 |
+
|
| 319 |
+
# Get best match
|
| 320 |
+
best_condition = max(scores.items(), key=lambda x: x[1])
|
| 321 |
+
|
| 322 |
+
# Calculate confidence
|
| 323 |
+
sorted_scores = sorted(scores.values(), reverse=True)
|
| 324 |
+
if len(sorted_scores) > 1:
|
| 325 |
+
score_gap = sorted_scores[0] - sorted_scores[1]
|
| 326 |
+
confidence = min(0.7 + score_gap * 0.3, 0.95)
|
| 327 |
+
else:
|
| 328 |
+
confidence = 0.7
|
| 329 |
+
|
| 330 |
+
return best_condition[0], confidence
|
| 331 |
+
|
| 332 |
+
    def _determine_lighting_adaptive(self, cv_features: Dict, scene_info: Dict) -> Tuple[str, float]:
        """Determine the lighting condition with adaptive, weighted scoring.

        First routes indoor scenes (per _detect_indoor_scene) to the
        indoor-specific classifier; otherwise scores six outdoor lighting
        archetypes as weighted blends of the normalized CV features.

        Args:
            cv_features: dict with 'brightness', 'color_temp', 'contrast',
                'shadow_ratio', 'gradient_strength', 'laplacian_variance',
                'color_variation'.
            scene_info: Places365 scene information.

        Returns:
            (lighting_label, confidence) with confidence in [0.7, 0.95].
        """

        # Extract all features
        brightness = cv_features['brightness']
        color_temp = cv_features['color_temp']
        contrast = cv_features['contrast']
        shadow = cv_features['shadow_ratio']
        gradient = cv_features['gradient_strength']
        laplacian = cv_features['laplacian_variance']
        color_var = cv_features['color_variation']

        # NEW: Detect indoor vs outdoor
        is_indoor = self._detect_indoor_scene(cv_features, scene_info)
        if is_indoor:
            # Indoor scenes take the indoor-specific lighting labels.
            return self._determine_indoor_lighting(cv_features)
        # Otherwise fall through to the original outdoor scoring below.

        # Normalize features to 0-1 scale
        brightness_norm = min(brightness / 255.0, 1.0)
        contrast_norm = min(contrast / 100.0, 1.0)
        gradient_norm = min(gradient / 50.0, 1.0)  # Typical range 0-50
        laplacian_norm = min(laplacian / 1000.0, 1.0)  # Typical range 0-1000
        color_var_norm = min(color_var / 50.0, 1.0)  # Typical range 0-50

        # Adaptive scoring (Places365 dominant, CV features assist)
        scores = {}

        # Soft diffused light
        # Characteristics: medium brightness, low contrast, neutral temp
        # Auxiliary: low gradient (no strong edges), low laplacian (smooth transitions)
        diffuse_score = (
            0.40 * (1.0 - abs(brightness_norm - 0.5)) +  # Medium brightness
            0.25 * (1.0 - contrast_norm) +  # Low contrast
            0.20 * (1.0 - abs(color_temp - 1.0)) +  # Neutral temp
            0.08 * (1.0 - gradient_norm) +  # Weak edges (diffused)
            0.05 * (1.0 - laplacian_norm) +  # Smooth transitions
            0.02 * (1.0 - color_var_norm)  # Uniform color
        )
        scores['soft diffused light'] = diffuse_score

        # Natural daylight
        # Characteristics: bright, moderate contrast
        # Auxiliary: moderate gradient, moderate color variation
        daylight_score = (
            0.40 * brightness_norm +  # Bright
            0.25 * min(contrast_norm, 0.7) +  # Moderate contrast
            0.20 * (1.0 - abs(color_temp - 1.0)) +  # Neutral temp
            0.08 * min(gradient_norm, 0.6) +  # Moderate edges
            0.05 * min(laplacian_norm, 0.6) +  # Some detail
            0.02 * min(color_var_norm, 0.5)  # Some color variation
        )
        scores['natural daylight'] = daylight_score

        # Overcast atmosphere
        # Characteristics: medium-low brightness, very low contrast, cool temp, minimal shadow
        # Auxiliary: very low gradient (flat), low laplacian, low color variation
        overcast_score = (
            0.35 * (1.0 - abs(brightness_norm - 0.45)) +  # Medium-low brightness
            0.25 * (1.0 - contrast_norm) +  # Very low contrast
            0.15 * (1.0 if color_temp < 1.05 else 0.5) +  # Cool temp
            0.10 * (1.0 - shadow) +  # Minimal shadows
            0.08 * (1.0 - gradient_norm) +  # Flat appearance
            0.05 * (1.0 - laplacian_norm) +  # Smooth lighting
            0.02 * (1.0 - color_var_norm)  # Uniform color
        )
        scores['overcast atmosphere'] = overcast_score

        # Warm ambient light
        # Characteristics: medium brightness, warm temp
        # Auxiliary: moderate gradient, warm color bias
        warm_score = (
            0.40 * (1.0 - abs(brightness_norm - 0.5)) +  # Medium brightness
            0.30 * (1.0 if color_temp > 1.1 else 0.5) +  # Warm temp
            0.15 * min(contrast_norm, 0.6) +  # Moderate contrast
            0.08 * min(gradient_norm, 0.5) +  # Soft edges
            0.05 * min(laplacian_norm, 0.5) +  # Soft transitions
            0.02 * color_var_norm  # Some color variation (warmth)
        )
        scores['warm ambient light'] = warm_score

        # Evening light
        # Characteristics: medium-low brightness, warm temp, medium contrast
        # Auxiliary: moderate gradient (directional), some color variation
        evening_score = (
            0.35 * (1.0 if brightness_norm < 0.6 else 0.5) +  # Lower brightness
            0.30 * (1.0 if color_temp > 1.05 else 0.5) +  # Slightly warm
            0.20 * contrast_norm +  # Some contrast
            0.08 * min(gradient_norm, 0.7) +  # Directional light
            0.05 * laplacian_norm +  # Detail present
            0.02 * color_var_norm  # Color variation
        )
        scores['evening light'] = evening_score

        # Bright sunlight
        # Characteristics: high brightness, high contrast, strong shadows
        # Auxiliary: high gradient (strong edges), high laplacian (sharp transitions)
        sunlight_score = (
            0.40 * (1.0 if brightness_norm > 0.7 else 0.3) +  # High brightness
            0.25 * contrast_norm +  # High contrast
            0.15 * shadow +  # Strong shadows
            0.10 * gradient_norm +  # Strong edges
            0.08 * laplacian_norm +  # Sharp detail
            0.02 * color_var_norm  # Color variation
        )
        scores['bright sunlight'] = sunlight_score

        # Get top scoring condition
        best_condition = max(scores.items(), key=lambda x: x[1])

        # Calculate confidence based on score separation
        sorted_scores = sorted(scores.values(), reverse=True)
        if len(sorted_scores) > 1:
            score_gap = sorted_scores[0] - sorted_scores[1]
            confidence = min(0.7 + score_gap * 0.3, 0.95)
        else:
            confidence = 0.7

        return best_condition[0], confidence
|
| 452 |
+
|
| 453 |
+
print("✓ LightingAnalysisManager (with Places365 + advanced CV features) defined")
|
ocr_engine_manager.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import easyocr
|
| 3 |
+
import numpy as np
|
| 4 |
+
import cv2
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
class OCREngineManager:
    """Text extraction using EasyOCR with brand-optimized preprocessing.

    Wraps a single ``easyocr.Reader`` (English + Traditional Chinese) and
    exposes standard OCR plus a more aggressive brand-logo mode.
    """

    def __init__(self):
        """Load the EasyOCR reader, preferring GPU and falling back to CPU."""
        print("Loading EasyOCR (English + Traditional Chinese)...")

        # GPU init can still fail even when CUDA reports available
        # (driver / memory issues), so fall back to CPU instead of crashing.
        try:
            if torch.cuda.is_available():
                print("  Attempting GPU initialization...")
                self.reader = easyocr.Reader(['en', 'ch_tra'], gpu=True)
                print("  ✓ EasyOCR loaded with GPU")
            else:
                print("  CUDA not available, using CPU...")
                self.reader = easyocr.Reader(['en', 'ch_tra'], gpu=False)
                print("  ✓ EasyOCR loaded with CPU")
        except Exception as e:
            print(f"  ⚠️ GPU initialization failed: {e}")
            print("  Falling back to CPU...")
            self.reader = easyocr.Reader(['en', 'ch_tra'], gpu=False)
            print("  ✓ EasyOCR loaded with CPU (fallback)")

        print("✓ EasyOCR loaded")

    def extract_text(self, image: "Image.Image", use_brand_preprocessing: bool = False) -> List[Dict]:
        """Extract text from an image.

        Args:
            image: PIL image to OCR.
            use_brand_preprocessing: when True, run the brand-optimized
                preprocessing pipeline and use more permissive reader
                settings to catch small / low-contrast logo text.

        Returns:
            List of dicts with 'bbox', 'text' (cleaned, uppercased),
            'confidence' (native float) and 'raw_text'.
        """
        if use_brand_preprocessing:
            # Brand mode: preprocess the region and relax detection thresholds.
            img_array = np.array(self.preprocess_for_brand_ocr(image))
            reader_kwargs = {
                'min_size': 10,          # lower to catch small brand text
                'text_threshold': 0.5,   # lower threshold for brand logos
                'link_threshold': 0.3,
                'contrast_ths': 0.1,     # handles metallic/reflective text
                'adjust_contrast': 0.8,  # enhance contrast for logos
            }
        else:
            img_array = np.array(image)
            reader_kwargs = {
                'min_size': 20,
                'text_threshold': 0.7,
                'link_threshold': 0.4,
            }

        results = self.reader.readtext(img_array, detail=1, paragraph=False, **reader_kwargs)

        # Cast confidence to a native float: EasyOCR returns numpy scalars,
        # which break downstream JSON serialization.
        return [
            {
                'bbox': bbox,
                'text': self.clean_and_normalize(text),
                'confidence': float(confidence),
                'raw_text': text,
            }
            for bbox, text, confidence in results
        ]

    def clean_and_normalize(self, text: str) -> str:
        """Strip punctuation (keeping CJK), collapse whitespace, uppercase."""
        # \u4e00-\u9fff keeps Traditional Chinese characters.
        text = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
        text = ' '.join(text.split())
        return text.upper()

    def preprocess_for_brand_ocr(self, image_region: "Image.Image") -> "Image.Image":
        """Preprocess a (typically cropped) region for brand-logo OCR.

        Pipeline: grayscale → CLAHE contrast boost → denoise → adaptive
        threshold → morphological close → sharpen. Tuned for metallic /
        reflective logo text.

        Args:
            image_region: PIL image (typically a cropped region).

        Returns:
            Preprocessed single-channel PIL image.
        """
        img_array = np.array(image_region)

        # Input may already be single-channel.
        if len(img_array.shape) == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array

        # CLAHE: clipLimit raised 2.0 → 3.0 to pull detail out of metallic logos.
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Mild denoise (h=8) to preserve logo edges.
        denoised = cv2.fastNlMeansDenoising(enhanced, None, h=8, templateWindowSize=7, searchWindowSize=21)

        # Adaptive thresholding handles uneven lighting; blockSize 11 → 15 for logos.
        binary = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 15, 2
        )

        # Close small gaps so broken glyph strokes reconnect (3x3 kernel).
        kernel = np.ones((3, 3), np.uint8)
        morph = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

        # Sharpen edges; center weight 11 (kernel sum = 3) also brightens —
        # NOTE(review): appears intentional per original tuning; verify on dark crops.
        kernel_sharp = np.array([[-1, -1, -1], [-1, 11, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(morph, -1, kernel_sharp)

        return Image.fromarray(sharpened)
|
| 128 |
+
|
| 129 |
+
print("✓ OCREngineManager (with brand OCR preprocessing) defined")
|
openclip_semantic_manager.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import open_clip
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
class OpenCLIPSemanticManager:
    """Zero-shot classification and visual feature extraction with enhanced scene understanding.

    Loads OpenCLIP ViT-H/14 (laion2b weights), pre-computes text embeddings
    for fixed vocabularies, and exposes scene analysis plus flat and
    hierarchical zero-shot classification.
    """

    # Softmax temperature applied to image/text cosine similarities.
    # Lower values sharpen the probability distribution (was hard-coded 0.01).
    TEMPERATURE = 0.01

    def __init__(self):
        print("Loading OpenCLIP ViT-H/14 model...")
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            'ViT-H-14',
            pretrained='laion2b_s32b_b79k'
        )
        self.tokenizer = open_clip.get_tokenizer('ViT-H-14')

        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.eval()

        # Enhanced scene vocabularies (tokenized verbatim — no prompt template).
        self.scene_vocabularies = {
            'urban': [
                'city canyon with tall buildings',
                'downtown street with skyscrapers',
                'urban corridor between buildings',
                'busy city intersection',
                'metropolitan avenue'
            ],
            'lighting': [
                'overcast cloudy day',
                'bright sunny day',
                'golden hour warm glow',
                'blue hour twilight',
                'harsh midday sun',
                'soft diffused light',
                'dramatic evening light',
                'moody overcast atmosphere'
            ],
            'mood': [
                'bustling and energetic',
                'calm and contemplative',
                'dramatic and imposing',
                'intimate and cozy',
                'vibrant and lively'
            ]
        }

        # Hierarchical vocabularies: coarse categories first, then a
        # finer-grained vocabulary per category (where available).
        self.coarse_labels = [
            'furniture', 'musical instrument', 'artwork',
            'appliance', 'decoration', 'tool', 'electronic device',
            'clothing', 'accessory', 'food', 'plant'
        ]

        self.domain_vocabularies = {
            'musical instrument': [
                'acoustic guitar', 'electric guitar', 'bass guitar',
                'classical guitar', 'ukulele', 'violin', 'cello',
                'piano', 'keyboard', 'drums', 'saxophone', 'trumpet'
            ],
            'furniture': [
                'chair', 'sofa', 'table', 'desk', 'shelf',
                'cabinet', 'bed', 'stool', 'bench', 'wardrobe'
            ],
            'electronic device': [
                'smartphone', 'laptop', 'tablet', 'camera',
                'headphones', 'speaker', 'monitor', 'keyboard', 'mouse'
            ],
            'clothing': [
                'shirt', 'pants', 'dress', 'jacket', 'coat',
                'sweater', 'skirt', 'jeans', 'hoodie'
            ],
            'accessory': [
                'watch', 'sunglasses', 'hat', 'scarf', 'belt',
                'bag', 'wallet', 'jewelry', 'tie'
            ]
        }

        self.text_features_cache = {}
        self._cache_text_features()

        print("✓ OpenCLIP loaded with enhanced scene understanding")

    def _cache_text_features(self):
        """Pre-compute and cache normalized text features for all vocabularies."""
        with torch.no_grad():
            # Cache coarse labels (with the "a photo of ..." prompt template).
            prompts = [f"a photo of {label}" for label in self.coarse_labels]
            text = self.tokenizer(prompts)
            if torch.cuda.is_available():
                text = text.cuda()
            self.text_features_cache['coarse'] = self.model.encode_text(text)
            self.text_features_cache['coarse'] /= self.text_features_cache['coarse'].norm(dim=-1, keepdim=True)

            # Cache per-domain fine-grained vocabularies.
            for domain, labels in self.domain_vocabularies.items():
                prompts = [f"a photo of {label}" for label in labels]
                text = self.tokenizer(prompts)
                if torch.cuda.is_available():
                    text = text.cuda()
                features = self.model.encode_text(text)
                features /= features.norm(dim=-1, keepdim=True)
                self.text_features_cache[domain] = features

            # Cache scene vocabularies (descriptive phrases, no template).
            for scene_type, labels in self.scene_vocabularies.items():
                text = self.tokenizer(labels)
                if torch.cuda.is_available():
                    text = text.cuda()
                features = self.model.encode_text(text)
                features /= features.norm(dim=-1, keepdim=True)
                self.text_features_cache[f'scene_{scene_type}'] = features

    def _score_labels(self, image_features: torch.Tensor,
                      text_features: torch.Tensor,
                      labels: List[str]) -> Dict[str, float]:
        """Temperature-scaled softmax over image/text similarity.

        Args:
            image_features: normalized (1, D) image embedding.
            text_features: normalized (N, D) text embeddings, aligned with labels.
            labels: N label strings.

        Returns:
            {label: probability} mapping (native floats).
        """
        similarity = (image_features @ text_features.T) / self.TEMPERATURE
        probs = similarity.softmax(dim=-1)
        return {label: float(probs[0, i].cpu()) for i, label in enumerate(labels)}

    def analyze_scene(self, image: "Image.Image") -> Dict:
        """Comprehensive scene analysis over urban / lighting / mood aspects.

        Returns:
            {aspect: {'top': label, 'confidence': float, 'all_scores': {...}}}
        """
        image_features = self.encode_image(image)

        scene_analysis = {}
        for scene_type in ('urban', 'lighting', 'mood'):
            results = self._score_labels(
                image_features,
                self.text_features_cache[f'scene_{scene_type}'],
                self.scene_vocabularies[scene_type],
            )
            top_label, top_conf = max(results.items(), key=lambda kv: kv[1])
            scene_analysis[scene_type] = {
                'top': top_label,
                'confidence': top_conf,
                'all_scores': results,
            }

        return scene_analysis

    def encode_image(self, image: "Image.Image") -> torch.Tensor:
        """Encode an image into a normalized feature vector."""
        with torch.no_grad():
            image_tensor = self.preprocess(image).unsqueeze(0)
            if torch.cuda.is_available():
                image_tensor = image_tensor.cuda()

            image_features = self.model.encode_image(image_tensor)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            return image_features

    def encode_text(self, text_list: List[str]) -> torch.Tensor:
        """Encode labels (via the 'a photo of ...' template) into normalized features."""
        with torch.no_grad():
            prompts = [f"a photo of {text}" for text in text_list]
            text = self.tokenizer(prompts)
            if torch.cuda.is_available():
                text = text.cuda()

            text_features = self.model.encode_text(text)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            return text_features

    def classify_zero_shot(self, image: "Image.Image", candidate_labels: List[str]) -> Dict[str, float]:
        """Zero-shot classification over an arbitrary label list."""
        image_features = self.encode_image(image)
        text_features = self.encode_text(candidate_labels)
        return self._score_labels(image_features, text_features, candidate_labels)

    def classify_hierarchical(self, image: "Image.Image") -> Dict:
        """Two-stage classification: coarse category, then fine label when available.

        Returns:
            Dict with 'coarse', 'top_prediction', 'confidence' and, when a
            fine-grained vocabulary exists for the coarse winner, 'fine'.
        """
        image_features = self.encode_image(image)

        coarse_results = self._score_labels(
            image_features, self.text_features_cache['coarse'], self.coarse_labels
        )
        top_category = max(coarse_results, key=coarse_results.get)

        if top_category in self.domain_vocabularies:
            fine_results = self._score_labels(
                image_features,
                self.text_features_cache[top_category],
                self.domain_vocabularies[top_category],
            )
            top_prediction = max(fine_results, key=fine_results.get)

            return {
                'coarse': top_category,
                'fine': fine_results,
                'top_prediction': top_prediction,
                'confidence': fine_results[top_prediction]
            }

        # No fine vocabulary for this category: report the coarse result.
        return {
            'coarse': top_category,
            'top_prediction': top_category,
            'confidence': coarse_results[top_category]
        }
|
| 215 |
+
|
| 216 |
+
print("✓ OpenCLIPSemanticManager defined")
|
output_processing_manager.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
from typing import Dict, List, Tuple, Optional
|
| 4 |
+
from prompt_library_manager import PromptLibraryManager
|
| 5 |
+
|
| 6 |
+
class OutputProcessingManager:
    """
    Output validation, formatting, and smart hashtag generation.

    Integrates PromptLibraryManager to provide commercial-grade hashtag
    generation: landmark, brand, scene, composition, and platform tags.
    """

    def __init__(self, prompt_library: Optional['PromptLibraryManager'] = None):
        """
        Args:
            prompt_library: PromptLibraryManager instance (optional; one is
                created automatically when omitted).
        """
        # Blocklist consulted by _contains_profanity(); empty by default.
        self.profanity_filter = set()

        # Per-platform caption length limits, in characters.
        self.max_lengths = {
            'instagram': 2200,
            'tiktok': 100,
            'xiaohongshu': 500
        }

        # Use the injected PromptLibraryManager or build a fresh one.
        self.prompt_library = prompt_library if prompt_library is not None else PromptLibraryManager()

        # Landmark-detection keywords (simple keyword-based landmark inference).
        self.landmark_keywords = self._init_landmark_keywords()

        print("✓ OutputProcessingManager (with integrated PromptLibraryManager) initialized")

    def _init_landmark_keywords(self) -> Dict[str, List[str]]:
        """
        Initialize the landmark-detection keyword map.

        Used to infer a probable landmark from detected objects and scene cues.
        """
        return {
            'Big Ben': ['clock tower', 'tower', 'bridge', 'palace', 'gothic'],
            'Eiffel Tower': ['tower', 'iron', 'landmark', 'lattice'],
            'Statue of Liberty': ['statue', 'monument', 'harbor', 'torch'],
            'Golden Gate Bridge': ['bridge', 'suspension', 'orange', 'bay'],
            'Sydney Opera House': ['opera', 'building', 'harbor', 'shell'],
            'Taj Mahal': ['palace', 'dome', 'monument', 'marble'],
            'Colosseum': ['arena', 'amphitheater', 'ruins', 'ancient'],
            'Pyramids of Giza': ['pyramid', 'desert', 'ancient', 'monument'],
            'Burj Khalifa': ['skyscraper', 'tower', 'building', 'tall'],
            'Tokyo Tower': ['tower', 'lattice', 'red'],
            'Taipei 101': ['skyscraper', 'tower', 'building'],
            # Extendable with more landmarks
        }

    def detect_landmark(self, detections: List[Dict], scene_info: Dict) -> Optional[str]:
        """
        Infer a probable landmark from detection results.

        Args:
            detections: YOLO detection results
            scene_info: scene analysis results

        Returns:
            The inferred landmark name, or None when no confident match exists.
        """
        detected_objects = [d.get('class_name', '').lower() for d in detections]

        # Pull additional cues from the scene information
        scene_keywords = []
        urban_scene = scene_info.get('urban', {}).get('top', '')
        if urban_scene:
            scene_keywords.append(urban_scene.lower())

        all_keywords = detected_objects + scene_keywords

        # Score each landmark by the number of cues matching any of its keywords
        scores = {}
        for landmark, keywords in self.landmark_keywords.items():
            match_count = sum(1 for obj in all_keywords
                              if any(kw in obj for kw in keywords))
            if match_count > 0:
                scores[landmark] = match_count

        # Return the best-scoring landmark (at least 2 matches required)
        if scores:
            best_landmark = max(scores.items(), key=lambda x: x[1])
            if best_landmark[1] >= 2:
                return best_landmark[0]

        return None

    def generate_smart_hashtags(self, detections: List[Dict], scene_info: Dict,
                                brands: List, platform: str, language: str) -> List[str]:
        """
        Smart hashtag generation combining brand, landmark, and scene tags.

        Args:
            detections: detected object list
            scene_info: scene analysis results
            brands: detected brand list
            platform: platform name
            language: 'zh', 'en', or 'zh-en'

        Returns:
            Generated hashtag list (at most 10, deduplicated, priority-ordered).
        """
        hashtags = []

        # 1. Landmark detection (highest priority)
        detected_landmark = self.detect_landmark(detections, scene_info)
        if detected_landmark:
            landmark_tags = self.prompt_library.landmark_prompts.get_hashtags(
                detected_landmark, language
            )
            hashtags.extend(landmark_tags[:5])  # at most 5 landmark tags

        # 2. Brand tags (high priority)
        if brands:
            for brand in brands[:3]:  # at most 3 brands
                brand_name = brand[0] if isinstance(brand, tuple) else brand
                brand_tags = self.prompt_library.brand_prompts.get_hashtags(
                    brand_name, language
                )
                hashtags.extend(brand_tags[:3])  # at most 3 tags per brand

        # 3. Scene tags (medium priority)
        scene_category = self._detect_scene_category(scene_info, detections)
        if scene_category:
            scene_tags = self.prompt_library.scene_prompts.get_hashtags(
                scene_category, language
            )
            hashtags.extend(scene_tags[:4])

        # 4. Composition-specific tags
        composition_tags = self._get_composition_hashtags(scene_info, language)
        hashtags.extend(composition_tags)

        # 5. Platform-specific tags
        platform_tags = self._get_platform_hashtags(platform, language)
        hashtags.extend(platform_tags)

        # Deduplicate while preserving priority order
        # (landmark > brand > scene > composition > platform)
        seen = set()
        unique_hashtags = []
        for tag in hashtags:
            if tag not in seen and tag:  # drop empty tags
                seen.add(tag)
                unique_hashtags.append(tag)

        # Return the first 10
        return unique_hashtags[:10]

    def _detect_scene_category(self, scene_info: Dict, detections: List[Dict]) -> Optional[str]:
        """
        Detect the scene category from detected objects and scene cues.

        Returns:
            Scene category name ('urban', 'nature', 'indoor', 'food', ...).
        """
        # Use detected object classes to infer the scene
        object_classes = [d.get('class_name', '').lower() for d in detections]

        # Food scene
        food_keywords = ['sandwich', 'pizza', 'cake', 'food', 'plate', 'bowl', 'cup', 'bottle']
        if any(kw in obj for kw in food_keywords for obj in object_classes):
            return 'food'

        # Nature scene
        nature_keywords = ['tree', 'mountain', 'water', 'sky', 'beach', 'ocean']
        if any(kw in obj for kw in nature_keywords for obj in object_classes):
            return 'nature'

        # Urban scene
        urban_scene = scene_info.get('urban', {}).get('top', '')
        if urban_scene and ('canyon' in urban_scene or 'street' in urban_scene or 'building' in urban_scene):
            return 'urban'

        # Indoor scene
        indoor_keywords = ['chair', 'table', 'couch', 'bed', 'desk']
        if any(kw in obj for kw in indoor_keywords for obj in object_classes):
            return 'indoor'

        return 'urban'  # default to urban

    def _get_composition_hashtags(self, scene_info: Dict, language: str) -> List[str]:
        """
        Generate hashtags based on the composition type of the scene.
        """
        hashtags = []

        composition = scene_info.get('urban', {}).get('top', '')

        # Urban canyon composition
        if 'canyon' in composition or 'skyscraper' in composition:
            if language == 'zh':
                hashtags.extend(['城市峽谷', '城市風景'])
            elif language == 'en':
                hashtags.extend(['UrbanCanyon', 'Cityscape'])
            else:  # bilingual
                hashtags.extend(['城市峽谷', 'UrbanCanyon'])

        # Generic photography tag
        if language == 'zh':
            hashtags.append('攝影日常')
        elif language == 'en':
            hashtags.append('Photography')
        else:
            hashtags.extend(['攝影日常', 'Photography'])

        return hashtags

    def _get_platform_hashtags(self, platform: str, language: str) -> List[str]:
        """
        Generate platform-specific hashtags.
        """
        hashtags = []

        if platform == 'instagram':
            if language == 'zh':
                hashtags.append('IG日常')
            elif language == 'en':
                hashtags.append('InstaDaily')
            else:
                hashtags.extend(['IG日常', 'InstaDaily'])

        elif platform == 'tiktok':
            if language == 'zh':
                hashtags.append('抖音')
            elif language == 'en':
                hashtags.append('TikTok')
            else:
                hashtags.extend(['抖音', 'TikTok'])

        elif platform == 'xiaohongshu':
            hashtags.extend(['小紅書', '分享日常'])

        return hashtags

    def validate_output(self, output: Dict, platform: str,
                        detections: List[Dict] = None, scene_info: Dict = None,
                        brands: List = None, language: str = 'en') -> Tuple[bool, str]:
        """
        Validate output structure and content (with hashtag auto-supplementation).

        Mutates `output` in place (truncation, hashtag cleanup/supplementation,
        caption hashtag-marker stripping).

        Args:
            output: generated caption dictionary
            platform: platform name
            detections: detection results (enables hashtag supplementation)
            scene_info: scene information (enables hashtag supplementation)
            brands: brand list (for hashtag supplementation)
            language: language code

        Returns:
            (passed, validation message)
        """
        # 1. Structural validation
        required_fields = ['caption', 'hashtags', 'tone', 'platform']
        if not all(field in output for field in required_fields):
            return False, "Missing required fields"

        # 2. Length validation (truncate with ellipsis instead of rejecting)
        max_length = self.max_lengths.get(platform, 2200)
        if len(output['caption']) > max_length:
            output['caption'] = output['caption'][:max_length-3] + '...'

        # 3. Content filtering
        if self._contains_profanity(output['caption']):
            return False, "Contains inappropriate content"

        # 4. Hashtag validation
        output['hashtags'] = self._validate_hashtags(output['hashtags'])

        # 5. Hashtag count check with auto-supplementation (commercial-grade feature)
        min_hashtags = 5  # minimum required number of hashtags
        if len(output['hashtags']) < min_hashtags:
            # Auto-supplement only when detection context is available
            if detections is not None and scene_info is not None:
                # Fix: capture the pre-supplement count so the log message
                # reports the count that actually failed the check, and only
                # log when supplementation really ran.
                original_count = len(output['hashtags'])
                additional_tags = self.generate_smart_hashtags(
                    detections, scene_info, brands or [], platform, language
                )
                # Append new tags (skip duplicates, cap at 10 total)
                for tag in additional_tags:
                    if tag not in output['hashtags'] and len(output['hashtags']) < 10:
                        output['hashtags'].append(tag)

                print(f" [AUTO-補充] 標籤數量不足 ({original_count} < {min_hashtags}),已自動補充至 {len(output['hashtags'])} 個")

        # 6. Ensure no hashtag markers remain inside the caption body
        if '#' in output['caption']:
            # Strip inline hashtags from the caption
            output['caption'] = re.sub(r'#\w+', '', output['caption']).strip()

        return True, "Validation passed"

    def _contains_profanity(self, text: str) -> bool:
        """Return True if the text contains any blocked word (case-insensitive)."""
        text_lower = text.lower()
        return any(word in text_lower for word in self.profanity_filter)

    def _validate_hashtags(self, hashtags: List[str]) -> List[str]:
        """
        Validate and clean hashtags.

        Args:
            hashtags: raw hashtag list

        Returns:
            Cleaned hashtag list (deduplicated, non-empty, at most 10).
        """
        cleaned = []
        for tag in hashtags:
            # Strip leading '#' markers
            tag = tag.lstrip('#')

            # Keep only word characters (Latin letters, digits) and CJK ideographs
            tag = re.sub(r'[^\w\u4e00-\u9fff]', '', tag)

            # Keep non-empty, unique tags only
            if tag and tag not in cleaned:
                cleaned.append(tag)

        return cleaned[:10]  # at most 10

    def format_for_platform(self, caption: Dict, platform: str) -> str:
        """
        Format the caption output for a given platform.

        Args:
            caption: caption dictionary
            platform: platform name

        Returns:
            Formatted string (caption followed by '#'-prefixed hashtags).
        """
        formatted = f"{caption['caption']}\n\n"

        if platform == 'xiaohongshu':
            # Xiaohongshu: hashtags follow the caption directly
            formatted += ' '.join([f"#{tag}" for tag in caption['hashtags']])
        else:
            # Instagram/TikTok: hashtags start on a new line
            formatted += '\n' + ' '.join([f"#{tag}" for tag in caption['hashtags']])

        return formatted
|
| 349 |
+
|
| 350 |
+
# Import-time confirmation that this module's class definition loaded.
print("✓ OutputProcessingManager (V3 with PromptLibraryManager integration) defined")
|
pixcribe_pipeline.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import time
|
| 3 |
+
import traceback
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from typing import Dict
|
| 6 |
+
|
| 7 |
+
from image_processor_manager import ImageProcessorManager
|
| 8 |
+
from yolo_detection_manager import YOLODetectionManager
|
| 9 |
+
from saliency_detection_manager import SaliencyDetectionManager
|
| 10 |
+
from openclip_semantic_manager import OpenCLIPSemanticManager
|
| 11 |
+
from lighting_analysis_manager import LightingAnalysisManager
|
| 12 |
+
from ocr_engine_manager import OCREngineManager
|
| 13 |
+
from prompt_library_manager import PromptLibraryManager
|
| 14 |
+
from brand_recognition_manager import BrandRecognitionManager
|
| 15 |
+
from brand_visualization_manager import BrandVisualizationManager
|
| 16 |
+
from brand_verification_manager import BrandVerificationManager
|
| 17 |
+
from scene_compatibility_manager import SceneCompatibilityManager
|
| 18 |
+
from caption_generation_manager import CaptionGenerationManager
|
| 19 |
+
from detection_fusion_manager import DetectionFusionManager
|
| 20 |
+
from output_processing_manager import OutputProcessingManager
|
| 21 |
+
|
| 22 |
+
class PixcribePipeline:
    """Main Facade coordinating all components (V2 with multi-language support)"""

    def __init__(self, yolo_variant='l', vlm_model_name='Qwen/Qwen2.5-VL-7B-Instruct'):
        """
        Build every manager the pipeline needs, in dependency order.

        Args:
            yolo_variant: 'm', 'l' (default), or 'x'
            vlm_model_name: Vision-Language Model name (default: Qwen2.5-VL-7B-Instruct)
                Can be changed to 'Qwen/Qwen3-VL-8B-Instruct' for latest model
        """
        print("="*60)
        print("Initializing Pixcribe Pipeline V2...")
        print("="*60)

        start_time = time.time()

        # Initialize all managers
        self.image_processor = ImageProcessorManager()
        self.yolo_detector = YOLODetectionManager(variant=yolo_variant)
        self.saliency_detector = SaliencyDetectionManager()
        self.clip_semantic = OpenCLIPSemanticManager()
        self.lighting_analyzer = LightingAnalysisManager()
        self.ocr_engine = OCREngineManager()

        # NEW: Initialize PromptLibrary (centralized prompt management)
        self.prompt_library = PromptLibraryManager()

        # Initialize BrandRecognitionManager with PromptLibrary
        self.brand_recognizer = BrandRecognitionManager(
            self.clip_semantic, self.ocr_engine, self.prompt_library
        )

        # NEW: Brand visualization manager
        self.brand_visualizer = BrandVisualizationManager()

        self.caption_generator = CaptionGenerationManager(model_name=vlm_model_name)

        # NEW: Brand verification with VLM (reuses the caption generator's model)
        self.brand_verifier = BrandVerificationManager(self.caption_generator)

        # NEW: Scene compatibility checker
        self.scene_compatibility = SceneCompatibilityManager(self.prompt_library)

        self.fusion_manager = DetectionFusionManager(self.clip_semantic)

        # Initialize OutputProcessingManager with PromptLibrary for smart hashtag generation
        self.output_processor = OutputProcessingManager(self.prompt_library)

        elapsed = time.time() - start_time
        print("="*60)
        print(f"✓ Pipeline initialized successfully (Time: {elapsed:.2f}s)")
        print("="*60)

    def process_image(self, image, platform='instagram', yolo_variant='l', language='zh') -> Dict:
        """End-to-end image processing pipeline

        Orchestrates preprocessing, YOLO + saliency detection, brand
        recognition/verification, scene analysis, fusion, caption generation,
        and output validation.

        NOTE(review): progress labels mix "/9" and "/11" step totals — the
        numbering is cosmetic only; execution order is what matters.

        Args:
            image: PIL Image or path
            platform: 'instagram', 'tiktok', or 'xiaohongshu'
            yolo_variant: 'm', 'l' (default), or 'x'
            language: 'zh' (Traditional Chinese), 'en' (English), 'zh-en' (Bilingual)

        Returns:
            Processing results dictionary with brand visualizations

        Raises:
            Re-raises any exception from the underlying managers after
            logging a traceback, so callers can display the error.
        """
        print(f"\nProcessing image (Platform: {platform}, Language: {language})...")
        start_time = time.time()

        try:
            # Step 1: Preprocessing
            print("[1/9] Preprocessing image...")
            processed_img = self.image_processor.load_image(image)
            yolo_input = self.image_processor.preprocess_for_yolo(processed_img)

            # Step 2: Parallel detection
            print("[2/9] YOLO object detection...")
            yolo_results = self.yolo_detector.detect(yolo_input)
            print(f" Detected {len(yolo_results)} objects")

            print("[3/9] Saliency detection...")
            salient_regions = self.saliency_detector.detect_salient_regions(processed_img)
            print(f" Found {len(salient_regions)} salient regions")

            # Step 3: Identify unknown objects (salient regions not covered by YOLO)
            print("[4/9] Identifying unknown objects...")
            unknown_regions = self.saliency_detector.extract_unknown_regions(
                salient_regions, yolo_results
            )
            print(f" Found {len(unknown_regions)} unknown regions")

            # Step 4: Brand recognition (with bounding boxes)
            print("[5/9] Brand recognition...")
            brands = []
            brand_detections = [] # For visualization

            # Method 1: Check YOLO-detected brand-relevant objects
            brand_relevant = self.yolo_detector.filter_brand_relevant_objects(yolo_results)
            if brand_relevant:
                print(f" Checking {len(brand_relevant)} YOLO brand-relevant objects...")
                for det in brand_relevant[:5]: # Check top 5 brand-relevant objects
                    region = processed_img.crop(det['bbox'])
                    brand_result = self.brand_recognizer.recognize_brand(
                        region, processed_img, region_bbox=det['bbox']
                    )

                    if brand_result:
                        for brand_name, confidence, bbox in brand_result[:2]: # Top 2 brands per region
                            brands.append((brand_name, confidence))

                            # Prepare for visualization
                            brand_info = self.prompt_library.get_brand_prompts(brand_name)
                            category = brand_info.get('category', 'default') if brand_info else 'default'

                            brand_detections.append({
                                'name': brand_name,
                                'confidence': confidence,
                                'bbox': bbox,
                                'category': category
                            })

            # Method 2: Full-image brand scan (essential commercial-grade feature)
            # Run the full-image scan regardless of whether YOLO found brand-relevant objects
            print(" Performing intelligent full-image brand scan...")
            full_image_brands = self.brand_recognizer.scan_full_image_for_brands(
                processed_img,
                exclude_bboxes=[bd['bbox'] for bd in brand_detections if bd.get('bbox')],
                saliency_regions=salient_regions # pass saliency regions for smart scan-area selection
            )

            # Merge full-image scan results
            if full_image_brands:
                print(f" Full-image scan found {len(full_image_brands)} additional brands")
                for brand_name, confidence, bbox in full_image_brands:
                    # Avoid duplicate detections of the same brand
                    if not any(bd['name'] == brand_name for bd in brand_detections):
                        brands.append((brand_name, confidence))

                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'

                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })

            print(f" Identified {len(brands)} brand instances (before verification)")

            # Step 4.5: CLIP scene understanding (moved earlier for compatibility check)
            print("[5.5/11] Scene understanding (CLIP)...")
            scene_analysis = self.clip_semantic.analyze_scene(processed_img)
            print(f" Scene: {scene_analysis.get('urban', {}).get('top', 'unknown')}")

            # Step 4.6: Scene compatibility check (drops brands implausible for this scene)
            if brands:
                print("[5.6/11] Checking scene compatibility...")
                brands_with_bbox = [(b[0], b[1], brand_detections[i]['bbox'])
                                    for i, b in enumerate(brands)]
                compatible_brands = self.scene_compatibility.batch_check_compatibility(
                    brands_with_bbox, scene_analysis
                )
                print(f" {len(compatible_brands)} brands passed compatibility check")

                # Update brands and brand_detections
                if compatible_brands:
                    brands = [(b[0], b[1]) for b in compatible_brands]
                    brand_detections = []
                    for brand_name, confidence, bbox in compatible_brands:
                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'
                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })
                else:
                    brands = []
                    brand_detections = []

            # Step 4.7: VLM brand verification
            if brand_detections:
                print("[5.7/11] VLM brand verification...")
                vlm_verification = self.brand_verifier.verify_brands(
                    processed_img, [(bd['name'], bd['confidence'], bd['bbox'])
                                    for bd in brand_detections]
                )
                print(f" VLM verified {len(vlm_verification.get('verified_brands', []))} brands")

                # Three-way voting: OpenCLIP + OCR + VLM
                # Collect OCR matches for voting
                ocr_brands = {}
                for brand_name, conf in brands:
                    if brand_name not in ocr_brands:
                        ocr_brands[brand_name] = (0.5, conf) # Approximate text/ocr split

                final_brands = self.brand_verifier.three_way_voting(
                    [(bd['name'], bd['confidence'], bd['bbox']) for bd in brand_detections],
                    ocr_brands,
                    vlm_verification
                )
                print(f" Final verified brands: {len(final_brands)}")

                # Update brands and brand_detections with verified results
                if final_brands:
                    brands = [(b[0], b[1]) for b in final_brands]
                    brand_detections = []
                    for brand_name, confidence, bbox in final_brands:
                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'
                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })
                else:
                    brands = []
                    brand_detections = []

            # NEW: Visualize brand detections on image (copy keeps the original clean)
            if brand_detections:
                visualized_image = self.brand_visualizer.draw_brand_detections(
                    processed_img.copy(), brand_detections
                )
            else:
                visualized_image = processed_img

            # Step 6: CV-based lighting analysis
            print("[7/11] Analyzing lighting conditions...")
            cv_lighting = self.lighting_analyzer.analyze_lighting(processed_img)
            print(f" CV Lighting: {cv_lighting['lighting_type']} (confidence: {cv_lighting['confidence']:.2f})")
            print(f" Details: brightness={cv_lighting['cv_features']['brightness']:.1f}, "
                  f"temp_ratio={cv_lighting['cv_features']['color_temp']:.2f}, "
                  f"contrast={cv_lighting['cv_features']['contrast']:.1f}")

            # Step 7: Additional scene analysis details
            print("[8/11] Additional scene analysis...")
            print(f" CLIP Lighting: {scene_analysis.get('lighting', {}).get('top', 'unknown')}")
            print(f" Mood: {scene_analysis.get('mood', {}).get('top', 'unknown')}")

            # Step 8: Fusion with lighting analysis
            print("[9/11] Fusing detection results...")
            fused_results = self.fusion_manager.fuse_detections(
                yolo_results, unknown_regions, scene_analysis, processed_img, cv_lighting
            )
            fused_results['brands'] = brands
            fused_results['scene_analysis'] = scene_analysis

            # Print fused lighting result
            fused_lighting = fused_results['scene_analysis']['lighting']['top']
            print(f" Fused Lighting: {fused_lighting}")

            # Step 9: Caption generation with language support
            print("[10/11] Generating captions...")
            captions = self.caption_generator.generate_captions(
                fused_results, processed_img, platform, language
            )

            # Step 10: Output processing with smart hashtags
            print("[11/11] Output processing...")
            validated_captions = []
            for caption in captions:
                # Only generate hashtags if VLM didn't generate any
                # DO NOT override VLM hashtags - they follow language requirements
                if not caption.get('hashtags') or len(caption.get('hashtags', [])) < 3:
                    print(f" [DEBUG] Caption has {len(caption.get('hashtags', []))} hashtags, generating smart hashtags...")
                    caption['hashtags'] = self.output_processor.generate_smart_hashtags(
                        fused_results['detections'],
                        scene_analysis,
                        brands,
                        platform,
                        language
                    )
                else:
                    print(f" [DEBUG] Caption has {len(caption['hashtags'])} VLM-generated hashtags")

                # Pass full context to validate_output so it can auto-supplement hashtags
                is_valid, msg = self.output_processor.validate_output(
                    caption, platform,
                    detections=fused_results['detections'],
                    scene_info=scene_analysis,
                    brands=brands,
                    language=language
                )
                if is_valid:
                    validated_captions.append(caption)
                else:
                    print(f" [DEBUG] Caption validation failed: {msg}")

            elapsed = time.time() - start_time
            print(f"\n✓ Processing complete (Total time: {elapsed:.2f}s)")
            print(f" Generated {len(validated_captions)} caption variations")

            return {
                'captions': validated_captions,
                'detections': fused_results['detections'],
                'brands': brands,
                'brand_detections': brand_detections, # NEW: For UI display
                'visualized_image': visualized_image, # NEW: Image with brand boxes
                'scene': scene_analysis,
                'composition': fused_results.get('composition', {}),
                'lighting': cv_lighting,
                'processing_time': elapsed
            }

        except Exception as e:
            print(f"\n✗ Processing error: {str(e)}")
            traceback.print_exc()
            # Re-raise exception so it can be caught and displayed
            raise
|
| 334 |
+
|
| 335 |
+
# Import-time confirmation that this module's class definition loaded.
print("✓ PixcribePipeline (V2 with VLM Verification, Scene Compatibility, and Adaptive Weights) defined")
|
prompt_library_manager.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Dict, List, Optional
|
| 3 |
+
from landmark_prompts import LandmarkPrompts
|
| 4 |
+
from brand_prompts import BrandPrompts
|
| 5 |
+
from scene_prompts import ScenePrompts
|
| 6 |
+
from universal_object_prompts import UniversalObjectPrompts
|
| 7 |
+
|
| 8 |
+
class PromptLibraryManager:
|
| 9 |
+
"""
|
| 10 |
+
Facade 模式:統一管理所有 Prompt 子模組
|
| 11 |
+
提供單一介面存取品牌、地標、場景、通用物品等 prompts
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
    def __init__(self):
        """Initialize every prompt sub-module and log loading statistics."""

        print("Initializing Prompt Library Manager (Facade)...")

        # Load all sub-modules
        self.brand_prompts = BrandPrompts()
        self.landmark_prompts = LandmarkPrompts()
        self.scene_prompts = ScenePrompts()
        self.object_prompts = UniversalObjectPrompts()

        # Summary statistics for the startup log
        total_brands = self._count_brands()
        total_landmarks = len(self.landmark_prompts.landmarks)
        total_scenes = len(self.scene_prompts.scene_vocabularies)
        total_objects = len(self.object_prompts.object_vocabularies)

        print(f"✓ Prompt Library Manager initialized:")
        print(f" - {total_brands} brands across {len(self.brand_prompts.brand_prompts)} categories")
        print(f" - {total_landmarks} world landmarks")
        print(f" - {total_scenes} scene categories")
        print(f" - {total_objects} universal object categories")
|
| 36 |
+
|
| 37 |
+
def _count_brands(self) -> int:
|
| 38 |
+
"""計算總品牌數量"""
|
| 39 |
+
total = 0
|
| 40 |
+
for category in self.brand_prompts.brand_prompts.values():
|
| 41 |
+
total += len(category)
|
| 42 |
+
return total
|
| 43 |
+
|
| 44 |
+
# ===== 品牌相關方法 Brand Methods =====
|
| 45 |
+
|
| 46 |
+
def get_brand_prompts(self, brand_name: str) -> Optional[Dict]:
|
| 47 |
+
"""
|
| 48 |
+
取得特定品牌的完整 prompt 資料
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
brand_name: 品牌名稱
|
| 52 |
+
|
| 53 |
+
Returns:
|
| 54 |
+
品牌資料字典
|
| 55 |
+
"""
|
| 56 |
+
return self.brand_prompts.get_prompts(brand_name)
|
| 57 |
+
|
| 58 |
+
def get_brand_category(self, brand_name: str) -> str:
|
| 59 |
+
"""取得品牌類別"""
|
| 60 |
+
return self.brand_prompts.get_brand_category(brand_name)
|
| 61 |
+
|
| 62 |
+
def get_all_brands(self) -> Dict:
|
| 63 |
+
"""取得所有品牌的扁平化字典"""
|
| 64 |
+
return self.brand_prompts.get_all_brands()
|
| 65 |
+
|
| 66 |
+
def get_brands_by_category(self, category: str) -> Dict:
|
| 67 |
+
"""取得特定類別的所有品牌"""
|
| 68 |
+
return self.brand_prompts.get_brands_by_category(category)
|
| 69 |
+
|
| 70 |
+
def search_brand_by_alias(self, alias: str) -> Optional[str]:
|
| 71 |
+
"""根據別名搜尋品牌名稱"""
|
| 72 |
+
return self.brand_prompts.search_brand_by_alias(alias)
|
| 73 |
+
|
| 74 |
+
# ===== 地標相關方法 Landmark Methods =====
|
| 75 |
+
|
| 76 |
+
def get_landmark_prompts(self, landmark_name: str) -> Optional[Dict]:
|
| 77 |
+
"""
|
| 78 |
+
取得特定地標的完整 prompt 資料
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
landmark_name: 地標名稱
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
地標資料字典
|
| 85 |
+
"""
|
| 86 |
+
return self.landmark_prompts.get_prompts(landmark_name)
|
| 87 |
+
|
| 88 |
+
def get_all_landmarks(self) -> Dict:
|
| 89 |
+
"""取得所有地標資料"""
|
| 90 |
+
return self.landmark_prompts.get_all_landmarks()
|
| 91 |
+
|
| 92 |
+
def search_landmark_by_location(self, city: str = None, country: str = None) -> List[str]:
|
| 93 |
+
"""
|
| 94 |
+
根據地理位置搜尋地標
|
| 95 |
+
|
| 96 |
+
Args:
|
| 97 |
+
city: 城市名稱
|
| 98 |
+
country: 國家名稱
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
符合條件的地標名稱列表
|
| 102 |
+
"""
|
| 103 |
+
return self.landmark_prompts.search_by_location(city, country)
|
| 104 |
+
|
| 105 |
+
def get_landmark_visual_prompts(self, landmark_name: str, context: str = 'iconic_view') -> List[str]:
|
| 106 |
+
"""
|
| 107 |
+
取得地標的視覺描述 prompts
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
landmark_name: 地標名稱
|
| 111 |
+
context: 情境類型
|
| 112 |
+
|
| 113 |
+
Returns:
|
| 114 |
+
視覺描述列表
|
| 115 |
+
"""
|
| 116 |
+
return self.landmark_prompts.get_visual_prompts(landmark_name, context)
|
| 117 |
+
|
| 118 |
+
# Scene Methods
|
| 119 |
+
def get_scene_prompts(self, scene_category: str, subcategory: str = None) -> List[str]:
|
| 120 |
+
"""
|
| 121 |
+
取得場景 prompts
|
| 122 |
+
|
| 123 |
+
Args:
|
| 124 |
+
scene_category: 場景類別
|
| 125 |
+
subcategory: 子類別(可選)
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
Prompt 列表
|
| 129 |
+
"""
|
| 130 |
+
return self.scene_prompts.get_prompts(scene_category, subcategory)
|
| 131 |
+
|
| 132 |
+
def get_all_scene_categories(self) -> List[str]:
|
| 133 |
+
"""取得所有場景類別"""
|
| 134 |
+
return self.scene_prompts.get_all_categories()
|
| 135 |
+
|
| 136 |
+
def get_scene_subcategories(self, scene_category: str) -> List[str]:
|
| 137 |
+
"""取得場景的子類別"""
|
| 138 |
+
return self.scene_prompts.get_subcategories(scene_category)
|
| 139 |
+
|
| 140 |
+
# Universal Object Methods
|
| 141 |
+
def get_object_prompts(self, category: str, subcategory: str = None) -> List[str]:
|
| 142 |
+
"""
|
| 143 |
+
取得通用物品 prompts
|
| 144 |
+
|
| 145 |
+
Args:
|
| 146 |
+
category: 物品類別 (如 'animals', 'vehicles')
|
| 147 |
+
subcategory: 子類別 (如 'dogs', 'cats')
|
| 148 |
+
|
| 149 |
+
Returns:
|
| 150 |
+
Prompt 列表
|
| 151 |
+
"""
|
| 152 |
+
return self.object_prompts.get_prompts(category, subcategory)
|
| 153 |
+
|
| 154 |
+
def get_all_object_categories(self) -> List[str]:
|
| 155 |
+
"""取得所有通用物品類別"""
|
| 156 |
+
return self.object_prompts.get_all_categories()
|
| 157 |
+
|
| 158 |
+
def get_object_subcategories(self, category: str) -> List[str]:
|
| 159 |
+
"""取得物品的子類別"""
|
| 160 |
+
return self.object_prompts.get_subcategories(category)
|
| 161 |
+
|
| 162 |
+
def detect_object_category(self, detected_objects: List[str]) -> Optional[str]:
|
| 163 |
+
"""根據檢測到的物體推測主要類別"""
|
| 164 |
+
return self.object_prompts.detect_object_category(detected_objects)
|
| 165 |
+
|
| 166 |
+
# Smart Hashtag Generation
|
| 167 |
+
def get_hashtags_for_content(self, detected_items: Dict, language: str = 'zh') -> List[str]:
|
| 168 |
+
"""
|
| 169 |
+
智能標籤生成:整合品牌、地標、場景的標籤
|
| 170 |
+
|
| 171 |
+
Args:
|
| 172 |
+
detected_items: 檢測到的內容字典
|
| 173 |
+
{
|
| 174 |
+
'landmarks': ['Big Ben', ...],
|
| 175 |
+
'brands': ['Apple', ...],
|
| 176 |
+
'scene_category': 'urban',
|
| 177 |
+
'scene_subcategory': 'city_canyon'
|
| 178 |
+
}
|
| 179 |
+
language: 語言 ('zh', 'en', 或 'zh-en')
|
| 180 |
+
|
| 181 |
+
Returns:
|
| 182 |
+
Hashtag 列表(去重並排序)
|
| 183 |
+
"""
|
| 184 |
+
hashtags = []
|
| 185 |
+
|
| 186 |
+
# 1. 地標標籤(最高優先級)
|
| 187 |
+
landmarks = detected_items.get('landmarks', [])
|
| 188 |
+
for landmark in landmarks:
|
| 189 |
+
landmark_tags = self.landmark_prompts.get_hashtags(landmark, language)
|
| 190 |
+
hashtags.extend(landmark_tags)
|
| 191 |
+
|
| 192 |
+
# 2. 品牌標籤(高優先級)
|
| 193 |
+
brands = detected_items.get('brands', [])
|
| 194 |
+
for brand in brands:
|
| 195 |
+
brand_tags = self.brand_prompts.get_hashtags(brand, language)
|
| 196 |
+
hashtags.extend(brand_tags)
|
| 197 |
+
|
| 198 |
+
# 3. 場景標籤(中優先級)
|
| 199 |
+
scene_category = detected_items.get('scene_category')
|
| 200 |
+
if scene_category:
|
| 201 |
+
scene_tags = self.scene_prompts.get_hashtags(scene_category, language)
|
| 202 |
+
hashtags.extend(scene_tags)
|
| 203 |
+
|
| 204 |
+
# 去重並保持順序(地標 > 品牌 > 場景)
|
| 205 |
+
seen = set()
|
| 206 |
+
unique_hashtags = []
|
| 207 |
+
for tag in hashtags:
|
| 208 |
+
if tag not in seen:
|
| 209 |
+
seen.add(tag)
|
| 210 |
+
unique_hashtags.append(tag)
|
| 211 |
+
|
| 212 |
+
# 返回前 10 個
|
| 213 |
+
return unique_hashtags[:10]
|
| 214 |
+
|
| 215 |
+
# Search Functions
|
| 216 |
+
def search_by_location(self, city: str = None, country: str = None) -> Dict:
|
| 217 |
+
"""
|
| 218 |
+
根據地點搜尋所有相關內容(地標、品牌)
|
| 219 |
+
|
| 220 |
+
Args:
|
| 221 |
+
city: 城市名稱
|
| 222 |
+
country: 國家名稱
|
| 223 |
+
|
| 224 |
+
Returns:
|
| 225 |
+
搜尋結果字典
|
| 226 |
+
"""
|
| 227 |
+
results = {
|
| 228 |
+
'landmarks': [],
|
| 229 |
+
'brands': []
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
# 搜尋地標
|
| 233 |
+
landmarks = self.landmark_prompts.search_by_location(city, country)
|
| 234 |
+
results['landmarks'] = landmarks
|
| 235 |
+
|
| 236 |
+
# 品牌通常不按地理位置分類,但可以擴展此功能
|
| 237 |
+
|
| 238 |
+
return results
|
| 239 |
+
|
| 240 |
+
def detect_landmark_from_image_context(self, detected_objects: List[str],
|
| 241 |
+
scene_analysis: Dict) -> Optional[str]:
|
| 242 |
+
"""
|
| 243 |
+
根據檢測到的物體和場景分析推測可能的地標
|
| 244 |
+
|
| 245 |
+
Args:
|
| 246 |
+
detected_objects: 檢測到的物體列表
|
| 247 |
+
scene_analysis: 場景分析結果
|
| 248 |
+
|
| 249 |
+
Returns:
|
| 250 |
+
推測的地標名稱,若無法推測則返回 None
|
| 251 |
+
"""
|
| 252 |
+
# 關鍵字映射到地標
|
| 253 |
+
landmark_keywords = {
|
| 254 |
+
'Big Ben': ['clock tower', 'tower', 'bridge', 'river'],
|
| 255 |
+
'Eiffel Tower': ['tower', 'iron structure', 'landmark'],
|
| 256 |
+
'Statue of Liberty': ['statue', 'monument', 'island', 'harbor'],
|
| 257 |
+
'Sydney Opera House': ['building', 'harbor', 'architecture'],
|
| 258 |
+
'Taj Mahal': ['building', 'monument', 'dome'],
|
| 259 |
+
'Pyramids of Giza': ['pyramid', 'desert', 'monument'],
|
| 260 |
+
# 可以擴展更多
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
# 簡單的關鍵字匹配
|
| 264 |
+
for landmark, keywords in landmark_keywords.items():
|
| 265 |
+
match_count = sum(1 for obj in detected_objects
|
| 266 |
+
if any(kw in obj.lower() for kw in keywords))
|
| 267 |
+
if match_count >= 2: # 至少匹配 2 個關鍵字
|
| 268 |
+
return landmark
|
| 269 |
+
|
| 270 |
+
return None
|
| 271 |
+
|
| 272 |
+
print("✓ PromptLibraryManager (Facade) defined")
|
saliency_detection_manager.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import cv2
|
| 6 |
+
from typing import List, Dict
|
| 7 |
+
import torchvision.transforms as transforms
|
| 8 |
+
|
| 9 |
+
class SaliencyDetectionManager:
    """Visual saliency detection.

    NOTE(review): despite the log message, no U2-Net weights are loaded —
    a pretrained DeepLabV3-ResNet50 is loaded as a stand-in, and
    `detect_salient_regions` currently relies on Otsu thresholding only.
    `self.model`, `self.threshold`, `self.min_saliency`, and
    `self.transform` are kept for a future deep-model path but are unused
    by the detection code below.
    """

    def __init__(self):
        print("Loading U2-Net model...")
        try:
            # Stand-in segmentation backbone; moved to GPU when available.
            from torchvision.models.segmentation import deeplabv3_resnet50
            self.model = deeplabv3_resnet50(pretrained=True)
            self.model.eval()
            if torch.cuda.is_available():
                self.model = self.model.cuda()
        except Exception as e:
            # Best-effort: fall back to the pure-OpenCV path below.
            print(f"Warning: Cannot load deep learning model, using fallback: {e}")
            self.model = None

        # Region filtering parameters
        self.threshold = 0.5      # binarization threshold (unused: Otsu picks its own)
        self.min_area = 1600      # minimum contour area in pixels
        self.min_saliency = 0.6   # minimum saliency score (currently unused)

        # Preprocessing for the deep model (unused by the Otsu path)
        self.transform = transforms.Compose([
            transforms.Resize((320, 320)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        print("✓ SaliencyDetectionManager initialized")

    def detect_salient_regions(self, image: Image.Image) -> List[Dict]:
        """Detect salient regions via Otsu thresholding + contour extraction.

        Args:
            image: RGB PIL image.

        Returns:
            Up to 10 region dicts ({'bbox', 'area', 'saliency_score', 'image'}),
            sorted by descending saliency score (area fraction of the image).
        """
        img_array = np.array(image)
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)

        # Otsu thresholding chooses the binarization level automatically.
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        regions = []
        height, width = img_array.shape[:2]

        for contour in contours:
            area = cv2.contourArea(contour)
            if area < self.min_area:
                continue  # skip tiny blobs (noise)

            x, y, w, h = cv2.boundingRect(contour)
            bbox = [float(x), float(y), float(x + w), float(y + h)]
            region_img = image.crop(bbox)

            regions.append({
                'bbox': bbox,
                'area': area,
                # Area fraction of the whole image, capped at 1.0
                'saliency_score': min(area / (width * height), 1.0),
                'image': region_img
            })

        regions.sort(key=lambda r: r['saliency_score'], reverse=True)
        return regions[:10]

    def extract_unknown_regions(self, salient_regions: List[Dict], yolo_detections: List[Dict]) -> List[Dict]:
        """Keep only salient regions that do not overlap any YOLO detection.

        A region is considered "unknown" when its best IoU against every
        YOLO bounding box stays below 0.3.
        """
        unknown_regions = []
        for region in salient_regions:
            max_iou = max(
                (self._calculate_iou(region['bbox'], det['bbox']) for det in yolo_detections),
                default=0.0,
            )
            if max_iou < 0.3:
                unknown_regions.append(region)

        return unknown_regions

    def _calculate_iou(self, box1: List[float], box2: List[float]) -> float:
        """Calculate IoU (Intersection over Union) of two [xmin, ymin, xmax, ymax] boxes."""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        inter_xmin = max(x1_min, x2_min)
        inter_ymin = max(y1_min, y2_min)
        inter_xmax = min(x1_max, x2_max)
        inter_ymax = min(y1_max, y2_max)

        # No overlap at all
        if inter_xmax < inter_xmin or inter_ymax < inter_ymin:
            return 0.0

        inter_area = (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)
        box1_area = (x1_max - x1_min) * (y1_max - y1_min)
        box2_area = (x2_max - x2_min) * (y2_max - y2_min)
        union_area = box1_area + box2_area - inter_area

        return inter_area / union_area if union_area > 0 else 0.0
|
| 100 |
+
|
| 101 |
+
print("✓ SaliencyDetectionManager defined")
|
scene_compatibility_manager.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
from prompt_library_manager import PromptLibraryManager
|
| 4 |
+
|
| 5 |
+
class SceneCompatibilityManager:
    """Check brand-scene compatibility to reduce false positives"""

    def __init__(self, prompt_library: PromptLibraryManager = None):
        """
        Args:
            prompt_library: PromptLibraryManager instance for brand metadata
        """
        # Fall back to a fresh library when the caller supplies none.
        self.prompt_library = prompt_library if prompt_library is not None else PromptLibraryManager()

        # Keyword vocabulary used to bucket scene-analysis labels into scene types.
        self.scene_keywords = {
            'food_closeup': ['food', 'meal', 'dish', 'plate', 'restaurant', 'dining', 'cuisine'],
            'nature_landscape': ['mountain', 'forest', 'beach', 'ocean', 'lake', 'sky', 'sunset', 'outdoor'],
            'industrial': ['factory', 'warehouse', 'industrial', 'machinery', 'construction'],
            'sports': ['gym', 'fitness', 'running', 'sports', 'athletic', 'exercise'],
            'fashion': ['fashion', 'outfit', 'style', 'wearing', 'model'],
            'luxury_retail': ['store', 'boutique', 'shop', 'retail', 'display'],
            'office': ['office', 'desk', 'computer', 'workspace', 'business'],
            'home': ['home', 'room', 'interior', 'living', 'bedroom'],
            'lifestyle': ['lifestyle', 'casual', 'everyday', 'daily'],
            'tech_review': ['unboxing', 'review', 'tech', 'device', 'gadget'],
            'formal_event': ['event', 'party', 'formal', 'ceremony', 'celebration'],
            'outdoor': ['outdoor', 'park', 'street', 'outside'],
            'travel': ['travel', 'trip', 'luggage', 'airport', 'vacation'],
            'street': ['street', 'road', 'urban', 'city'],
            'parking': ['parking', 'car park', 'garage'],
            'showroom': ['showroom', 'exhibition', 'display'],
            'closeup': ['closeup', 'detail', 'macro', 'close-up']
        }

        print("✓ Scene Compatibility Manager initialized")

    def classify_scene(self, scene_analysis: Dict) -> str:
        """
        Classify scene type from OpenCLIP scene analysis.

        Args:
            scene_analysis: Scene analysis results from OpenCLIPSemanticManager

        Returns:
            Scene type string (e.g., 'food_closeup', 'fashion', 'tech_review'),
            or 'general' when no keyword matches.
        """
        tallies = {}

        # Scan the top label of each analysis dimension for vocabulary hits.
        for dimension in ['urban', 'lighting', 'mood', 'composition']:
            if dimension not in scene_analysis or 'top' not in scene_analysis[dimension]:
                continue
            label = scene_analysis[dimension]['top'].lower()

            for scene_type, keywords in self.scene_keywords.items():
                hits = sum(1 for keyword in keywords if keyword in label)
                if hits:
                    tallies[scene_type] = tallies.get(scene_type, 0) + hits

        if not tallies:
            return 'general'
        # Scene type with the highest keyword-hit count wins.
        return max(tallies.items(), key=lambda entry: entry[1])[0]

    def check_compatibility(self, brand_name: str, scene_type: str) -> float:
        """
        Check if a brand plausibly appears in the given scene.

        Args:
            brand_name: Name of the brand
            scene_type: Scene type (e.g., 'food_closeup', 'fashion')

        Returns:
            Compatibility score:
            - 1.0 when the scene is typical for the brand
            - 0.3 when the scene is listed as incompatible
            - 0.7 otherwise (neutral / unknown brand)
        """
        metadata = self.prompt_library.get_brand_prompts(brand_name)
        if not metadata:
            # Unknown brand: stay neutral.
            return 0.7

        if scene_type in metadata.get('typical_scenes', []):
            return 1.0  # Fully compatible

        if scene_type in metadata.get('incompatible_scenes', []):
            return 0.3  # Strongly penalize

        return 0.7  # No strong evidence either way

    def batch_check_compatibility(self, detected_brands: List[tuple],
                                  scene_analysis: Dict) -> List[tuple]:
        """
        Re-score a batch of brand detections against the classified scene.

        Args:
            detected_brands: List of (brand_name, confidence, bbox) tuples
            scene_analysis: Scene analysis results

        Returns:
            List of (brand_name, adjusted_confidence, bbox) tuples, filtered
            to adjusted_confidence > 0.25 and sorted descending.
        """
        scene_type = self.classify_scene(scene_analysis)

        rescored = []
        for name, raw_confidence, bbox in detected_brands:
            weight = self.check_compatibility(name, scene_type)
            adjusted = raw_confidence * weight
            # Drop detections whose scene-adjusted confidence is too low.
            if adjusted > 0.25:
                rescored.append((name, adjusted, bbox))

        rescored.sort(key=lambda entry: entry[1], reverse=True)
        return rescored
|
| 132 |
+
|
| 133 |
+
print("✓ SceneCompatibilityManager defined")
|
scene_prompts.py
ADDED
|
@@ -0,0 +1,433 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
class ScenePrompts:
    """
    Scene-description prompt library.

    Provides detailed English visual-description prompts for a diverse set of
    scene types: urban, nature, indoor, food, people, product and architecture
    scenes, plus cross-cutting 'lighting' and 'mood' vocabularies, and
    bilingual (zh/en) hashtag suggestions per scene category.
    """

    def __init__(self):
        """Initialize the scene vocabulary and hashtag lookup tables."""

        # Most categories map subcategory -> list[str] of prompts.
        # NOTE: 'lighting' and 'mood' are flat lists (no subcategories) —
        # accessors must handle both shapes.
        self.scene_vocabularies = {
            # ===== Urban scenes =====
            'urban': {
                'city_canyon': [
                    'urban canyon with towering skyscrapers lining both sides of street creating vertical corridor',
                    'metropolitan corridor formed by tall buildings with strong vertical emphasis and symmetrical composition',
                    'downtown street flanked by modern high-rise architecture creating canyon effect',
                    'city street with tall buildings on both sides creating narrow vertical perspective'
                ],
                'street_level': [
                    'bustling city street with pedestrians and vehicles in urban environment',
                    'urban sidewalk scene with street furniture storefronts and mixed activity',
                    'downtown pedestrian area with commercial buildings and urban infrastructure',
                    'street view with urban architecture shops and people walking'
                ],
                'skyline': [
                    'city skyline with skyscrapers silhouetted against sky',
                    'urban panorama showing downtown high-rise buildings and city sprawl',
                    'metropolitan skyline view from elevated vantage point',
                    'cityscape with distinctive tall buildings defining horizon line'
                ],
                'plaza': [
                    'urban plaza with open public space and surrounding architecture',
                    'city square with pedestrians monuments and commercial buildings',
                    'downtown plaza featuring fountains sculptures and gathering spaces',
                    'public square with mixed use of recreational and commercial activities'
                ]
            },

            # ===== Nature scenes =====
            'nature': {
                'mountain': [
                    'majestic mountain range with snow-capped peaks against blue sky',
                    'alpine landscape with rocky summits and glacial valleys',
                    'mountain vista with layered ridges fading into distance creating depth',
                    'dramatic mountain scenery with rugged peaks and alpine vegetation',
                    'mountainous terrain with steep slopes and varied elevation'
                ],
                'beach': [
                    'serene beach with turquoise water and white sand shore',
                    'coastal scene with gentle waves lapping at sandy beach',
                    'tropical beach with clear water and palm tree shadows',
                    'beach landscape with ocean horizon and coastal features',
                    'seaside view with beach sand water and sky meeting at horizon'
                ],
                'forest': [
                    'lush forest with dense canopy and dappled sunlight filtering through trees',
                    'woodland scene with tall trees and undergrowth vegetation',
                    'forest interior with tree trunks and leafy canopy overhead',
                    'dense forest landscape with natural vegetation and organic forms',
                    'wooded area with trees creating natural shade and green environment'
                ],
                'lake': [
                    'tranquil lake with still water reflecting surrounding landscape',
                    'mountain lake with clear water and scenic backdrop',
                    'lakeside view with calm water and shoreline vegetation',
                    'peaceful lake scene with water sky and natural surroundings',
                    'alpine lake with pristine water and mountain reflections'
                ],
                'desert': [
                    'desert landscape with sand dunes and arid terrain',
                    'sandy desert with undulating dunes and clear sky',
                    'arid desert scene with sparse vegetation and sandy ground',
                    'desert vista with sand formations and minimal vegetation',
                    'dry desert landscape with sand rock and desert plants'
                ],
                'waterfall': [
                    'cascading waterfall with flowing water over rocks',
                    'waterfall scene with water spray and lush surrounding vegetation',
                    'natural waterfall with water rushing down cliff face',
                    'scenic waterfall with water pool and natural setting',
                    'tiered waterfall with multiple cascades and mist'
                ]
            },

            # ===== Indoor scenes =====
            'indoor': {
                'cafe': [
                    'cozy cafe interior with warm ambient lighting and wooden furniture',
                    'modern coffee shop with industrial decor and minimalist design',
                    'rustic cafe setting with vintage decorations and soft lighting',
                    'contemporary cafe space with comfortable seating and artistic elements',
                    'intimate coffee shop with warm atmosphere and inviting ambiance'
                ],
                'restaurant': [
                    'upscale restaurant interior with elegant table settings and refined decor',
                    'casual dining space with comfortable seating and welcoming atmosphere',
                    'fine dining restaurant with sophisticated lighting and premium furnishings',
                    'restaurant setting with tables chairs and ambient lighting',
                    'dining establishment with culinary presentation and service area'
                ],
                'office': [
                    'modern office space with desks computers and professional workspace',
                    'contemporary work environment with ergonomic furniture and technology',
                    'office interior with cubicles meeting areas and work stations',
                    'professional office setting with business equipment and organized layout',
                    'corporate workspace with clean lines and functional design'
                ],
                'home_living': [
                    'cozy living room with sofa comfortable seating and home decor',
                    'modern home interior with minimalist furniture and clean aesthetic',
                    'warm living space with personal touches and inviting atmosphere',
                    'residential interior with family room features and casual comfort',
                    'home living area with relaxation space and domestic furnishings'
                ],
                'bedroom': [
                    'peaceful bedroom with bed nightstands and soft lighting',
                    'modern bedroom interior with minimalist design and calm atmosphere',
                    'cozy sleeping space with comfortable bedding and personal decor',
                    'bedroom setting with rest area and private sanctuary feel',
                    'sleeping quarters with bed furniture and restful ambiance'
                ],
                'museum': [
                    'museum interior with exhibited artworks and gallery lighting',
                    'cultural institution space with display cases and visitor areas',
                    'art gallery with paintings sculptures and exhibition design',
                    'museum hall with artifacts and informational displays',
                    'exhibition space with curated collections and viewing areas'
                ]
            },

            # ===== Food scenes =====
            'food': {
                'plated_dish': [
                    'gourmet plated dish with artistic presentation and fine dining aesthetics',
                    'restaurant plate with carefully arranged food components and garnishes',
                    'culinary creation with vibrant colors and professional plating',
                    'plated meal with balanced composition and appetizing appearance',
                    'food presentation with attention to visual detail and portion control',
                    'elegant dinner plate with sophisticated garnish and culinary artistry',
                    'fusion cuisine dish with innovative presentation and colorful elements',
                    'fine dining entree with sauce art and premium ingredients',
                    'contemporary plated food with geometric arrangement and edible flowers',
                    "chef's special with meticulous plating and restaurant-quality finish"
                ],
                'street_food': [
                    'casual street food on wooden table or food truck setting',
                    'authentic street cuisine with rustic presentation and local character',
                    'food stall offering with simple plating and traditional preparation',
                    'street vendor food with casual serving style and cultural authenticity',
                    'local street eats with informal presentation and fresh ingredients',
                    'food truck meal with paper packaging and urban backdrop',
                    'market stall food with traditional cooking methods and local flavors',
                    'outdoor food stand offering with casual atmosphere and quick service',
                    'street-side cuisine with vibrant colors and authentic preparation',
                    'hawker food with cultural heritage and honest presentation'
                ],
                'dessert': [
                    'elaborate dessert with decorative elements and sweet presentation',
                    'pastry or cake with artistic decoration and enticing appearance',
                    'sweet course with layered construction and visual appeal',
                    'dessert plate with confectionery artistry and color contrast',
                    'bakery creation with detailed finishing and appetizing styling',
                    'chocolate dessert with glossy ganache and elegant garnish',
                    'fruit tart with colorful berries and glazed finish',
                    'layered cake slice with frosting art and textured decoration',
                    'ice cream sundae with toppings drizzle and attractive presentation',
                    'patisserie item with delicate decoration and refined sweetness'
                ],
                'ingredients': [
                    'fresh ingredients closeup shot with natural textures and vibrant colors',
                    'raw food components with organic forms and market-fresh appearance',
                    'culinary ingredients arranged with attention to color and composition',
                    'fresh produce with natural beauty and wholesome qualities',
                    'cooking ingredients with variety of textures and natural appeal',
                    'farmers market vegetables with rich colors and organic shapes',
                    'herb and spice arrangement with aromatic qualities and rustic charm',
                    'seafood display with ice and fresh-from-ocean appearance',
                    'butcher quality meat with marbling and premium cut presentation',
                    'artisan bread and grains with wholesome texture and natural crust'
                ],
                'beverage': [
                    'artisan beverage with careful presentation and appealing pour',
                    'drink in glassware with garnish and professional service style',
                    'coffee or tea with latte art and aesthetic serving',
                    'refreshing beverage with ice garnish and attractive glass',
                    'drink presentation with attention to color and visual interest',
                    'craft cocktail with creative garnish and sophisticated glassware',
                    'specialty coffee with foam art and ceramic cup presentation',
                    'fresh juice with fruit garnish and vibrant natural color',
                    'tea service with elegant teapot and traditional ceremony aesthetic',
                    'smoothie bowl with fruit toppings and colorful healthy presentation'
                ],
                'breakfast': [
                    'morning breakfast spread with eggs toast and fresh coffee',
                    'continental breakfast with pastries croissants and fruit arrangement',
                    'healthy breakfast bowl with granola yogurt and berries',
                    'pancake stack with maple syrup butter and powdered sugar',
                    'avocado toast with poached egg and microgreens on rustic plate',
                    'breakfast plate with bacon eggs and golden hash browns',
                    'brunch setting with mimosas fresh flowers and elegant tableware',
                    'oatmeal bowl with nuts fruits and honey drizzle',
                    'smoothie and acai bowl with tropical fruits and seeds',
                    'breakfast sandwich with melted cheese and morning sunlight'
                ],
                'baked_goods': [
                    'fresh baked bread with golden crust and flour dusting',
                    'artisan pastries with flaky layers and butter sheen',
                    'homemade cookies with chocolate chips and rustic appearance',
                    'sourdough loaf with scoring pattern and crusty exterior',
                    'cinnamon rolls with cream cheese frosting and swirls',
                    'French baguette with crispy crust and airy crumb',
                    'croissants with laminated layers and golden brown color',
                    'muffins with crumb topping and fresh from oven warmth',
                    'bagels with sesame seeds and chewy texture',
                    'focaccia bread with herbs olive oil and dimpled surface'
                ]
            },

            # ===== People scenes =====
            'people': {
                'portrait': [
                    'portrait photograph with shallow depth of field and subject focus',
                    'headshot with clean background and flattering lighting on face',
                    'personal portrait with emotional expression and eye contact',
                    'portrait composition with subject as primary visual element',
                    'close-up portrait with facial features and personality captured'
                ],
                'candid': [
                    'candid street photography moment with natural unposed action',
                    'spontaneous capture of people in authentic situations and activities',
                    'documentary-style photograph of real-life moments and interactions',
                    'natural human behavior captured without staged positioning',
                    'unscripted moment showing genuine emotion and movement'
                ],
                'group': [
                    'group photo with multiple people in organized composition',
                    'gathering of people with social interaction and shared activity',
                    'team or family portrait with coordinated positioning',
                    'group setting with people engaged in collective experience',
                    'multiple subjects arranged in harmonious group composition'
                ],
                'activity': [
                    'people engaged in specific activity or recreational pursuit',
                    'action photograph showing physical movement and dynamic energy',
                    'sports or fitness activity with athletic performance captured',
                    'people participating in hobby or leisure activity',
                    'human subjects in motion demonstrating skill or exercise'
                ]
            },

            # ===== Product scenes =====
            'product': {
                'studio_shot': [
                    'minimalist product photography on white background with clean lighting',
                    'commercial product shot with professional lighting and sharp detail',
                    'studio product photograph with controlled environment and even illumination',
                    'catalog-style product image with neutral background and clear presentation',
                    'product on white backdrop with shadow control and highlight management'
                ],
                'lifestyle': [
                    'lifestyle product shot in natural setting with contextual environment',
                    'product in use showing real-world application and human interaction',
                    'environmental product photography with lifestyle context and atmosphere',
                    'product placed in authentic setting with relatable situation',
                    'contextual product image showing everyday use and practical application'
                ],
                'flatlay': [
                    'overhead flatlay composition with products arranged on surface',
                    "bird's eye view of items arranged in artistic layout",
                    'top-down product styling with complementary objects and props',
                    'flatlay arrangement with balanced composition and visual harmony',
                    'aerial view of products styled with decorative elements'
                ]
            },

            # ===== Architecture scenes =====
            'architecture': {
                'modern': [
                    'contemporary architecture with glass steel and minimalist design',
                    'modern building with clean lines geometric forms and innovative structure',
                    'architectural design featuring current aesthetic and building technology',
                    'present-day construction with progressive design and materials',
                    'modern structure with sleek surfaces and contemporary styling'
                ],
                'historic': [
                    'historic architecture with traditional design and aged materials',
                    'heritage building with classical elements and period styling',
                    'old structure with architectural significance and historical character',
                    'traditional building with cultural importance and time-worn beauty',
                    'antique architecture showing craftsmanship of past eras'
                ],
                'interior': [
                    'architectural interior space with designed environment and spatial quality',
                    'building interior showing layout flow and functional design',
                    'indoor architectural space with lighting surfaces and volumes',
                    'interior architecture with structural elements and finish materials',
                    'designed space interior with architectural features and spatial composition'
                ],
                'detail': [
                    'architectural detail closeup showing construction method and materials',
                    'building element with decorative or functional architectural feature',
                    'structural detail revealing craftsmanship and design specifics',
                    'architectural component with unique design characteristic',
                    'close view of building feature showing texture pattern or ornamentation'
                ]
            },

            # ===== Lighting descriptors (flat list, no subcategories) =====
            'lighting': [
                'soft diffused light creating even illumination without harsh shadows',
                'natural daylight with bright ambient illumination and true colors',
                'overcast atmosphere with diffused skylight and muted shadows',
                'warm ambient light with golden tones and cozy feeling',
                'evening light with low angle sun and long shadows',
                'bright sunlight with strong contrast and crisp shadows',
                'studio lighting with controlled illumination and professional quality',
                'indoor natural light from windows creating gentle directional lighting',
                'warm artificial lighting with incandescent glow and amber tones',
                'cool artificial lighting with fluorescent or LED quality',
                'soft indoor lighting with diffused sources and minimal shadows',
                'dramatic lighting with strong contrast and defined shadows'
            ],

            # ===== Mood descriptors (flat list, no subcategories) =====
            'mood': [
                'calm and contemplative atmosphere with serene peaceful quality',
                'bustling and energetic environment with dynamic active feeling',
                'dramatic and imposing presence with powerful visual impact',
                'cozy and intimate setting with warm welcoming ambiance',
                'minimalist and clean aesthetic with simple uncluttered feel',
                'vibrant and colorful scene with rich saturated hues',
                'moody and atmospheric environment with evocative lighting',
                'elegant and sophisticated setting with refined tasteful quality',
                'rustic and natural atmosphere with organic earthy character',
                'modern and sleek environment with contemporary styling'
            ]
        }

        # Scene category -> {'zh': [...], 'en': [...]} hashtag suggestions.
        # Only the six concrete scene categories have hashtags; 'lighting'
        # and 'mood' intentionally have none.
        self.scene_hashtags = {
            'urban': {
                'zh': ['城市', '都市', '城市風景', '街拍', '建築'],
                'en': ['Urban', 'Cityscape', 'StreetPhotography', 'Architecture', 'City']
            },
            'nature': {
                'zh': ['自然', '風景', '戶外', '大自然', '風景攝影'],
                'en': ['Nature', 'Landscape', 'Outdoor', 'Scenery', 'NaturePhotography']
            },
            'indoor': {
                'zh': ['室內', '室內設計', '空間', '居家'],
                'en': ['Indoor', 'InteriorDesign', 'Interior', 'Home']
            },
            'food': {
                'zh': ['美食', '食物', '料理', '美食攝影', '餐廳'],
                'en': ['Food', 'Foodie', 'FoodPhotography', 'Cuisine', 'Dining']
            },
            'people': {
                'zh': ['人像', '人物', '肖像', '街拍'],
                'en': ['Portrait', 'People', 'PortraitPhotography', 'Candid']
            },
            'product': {
                'zh': ['產品', '商品', '產品攝影', '商業攝影'],
                'en': ['Product', 'ProductPhotography', 'Commercial', 'Flatlay']
            },
            'architecture': {
                'zh': ['建築', '建築攝影', '建築設計', '空間'],
                'en': ['Architecture', 'ArchitecturalPhotography', 'Building', 'Design']
            }
        }

        print(f"✓ Scene Prompts initialized with {len(self.scene_vocabularies)} scene categories")

    def get_prompts(self, scene_category: str, subcategory: str = None) -> List[str]:
        """
        Return prompts for a scene category.

        Args:
            scene_category: Category key (e.g. 'urban', 'nature', 'lighting').
            subcategory: Optional subcategory key (e.g. 'city_canyon',
                'mountain'). Ignored for flat-list categories.

        Returns:
            List of prompt strings; empty list for unknown keys.
        """
        category_prompts = self.scene_vocabularies.get(scene_category, {})

        # BUGFIX: 'lighting' and 'mood' are flat lists; calling .get()/.values()
        # on them raised AttributeError. Return a copy of the list directly.
        if isinstance(category_prompts, list):
            return list(category_prompts)

        if subcategory:
            return category_prompts.get(subcategory, [])

        # No subcategory requested: flatten every subcategory's prompt list.
        all_prompts = []
        for prompts in category_prompts.values():
            if isinstance(prompts, list):
                all_prompts.extend(prompts)
        return all_prompts

    def get_all_categories(self) -> List[str]:
        """Return every scene category key (including 'lighting' and 'mood')."""
        return list(self.scene_vocabularies.keys())

    def get_subcategories(self, scene_category: str) -> List[str]:
        """Return the subcategory keys of a category; empty for flat-list or unknown categories."""
        category = self.scene_vocabularies.get(scene_category, {})
        return list(category.keys()) if isinstance(category, dict) else []

    def get_hashtags(self, scene_category: str, language: str = 'zh') -> List[str]:
        """
        Return hashtag suggestions for a scene category.

        Args:
            scene_category: Scene category key.
            language: 'zh', 'en', or 'both'/'zh-en' for Chinese followed by
                English. Any other value falls back to 'zh'.

        Returns:
            List of hashtag strings (without '#' prefix); empty for unknown
            categories.
        """
        hashtags = self.scene_hashtags.get(scene_category, {})

        if language == 'en':
            return hashtags.get('en', [])
        if language in ('both', 'zh-en'):
            return hashtags.get('zh', []) + hashtags.get('en', [])
        # 'zh' and any unrecognized language code default to Chinese tags.
        return hashtags.get('zh', [])
|
| 432 |
+
|
| 433 |
+
print("✓ ScenePrompts defined")
|
ui_manager.py
ADDED
|
@@ -0,0 +1,681 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
class UIManager:
|
| 5 |
+
"""Manages all UI components and styling for Pixcribe"""
|
| 6 |
+
|
| 7 |
+
    def __init__(self):
        # Build the CSS string once at construction and cache it;
        # _get_custom_css returns a constant string literal, so there is
        # no need to regenerate it per render.
        self.custom_css = self._get_custom_css()
|
| 9 |
+
|
| 10 |
+
    def _get_custom_css(self) -> str:
        """Return complete CSS styling - Elegant light design.

        The stylesheet below is returned verbatim and injected into the
        Gradio app, so its text is runtime content — do not edit it for
        cosmetic reasons. Sections, in order: global reset, header,
        two-column layout, cards, upload area, section titles, form
        controls, generate button, caption cards, copy button, footer,
        image display, responsive breakpoints, and a shimmer loading
        animation.
        """
        return """
        /* ==================== Global Reset & Base ==================== */
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        .gradio-container {
            background: linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 100%) !important;
            font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif !important;
            padding: 0 !important;
            max-width: 100% !important;
            min-height: 100vh !important;
        }

        /* Main content wrapper - Generous padding to prevent edge clipping */
        .contain {
            max-width: 1600px !important;
            margin: 0 auto !important;
            padding: 64px 96px 96px 96px !important;
        }

        /* ==================== Header ==================== */
        .app-header {
            text-align: center;
            margin-bottom: 72px;
            animation: fadeInDown 0.8s ease-out;
            padding: 0 32px;
        }

        @keyframes fadeInDown {
            from {
                opacity: 0;
                transform: translateY(-30px);
            }
            to {
                opacity: 1;
                transform: translateY(0);
            }
        }

        .app-title {
            font-size: 72px;
            font-weight: 800;
            background: linear-gradient(135deg, #2C3E50 0%, #34495E 100%);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
            margin-bottom: 24px;
            letter-spacing: -0.05em;
            line-height: 1.1;
        }

        .app-subtitle {
            font-size: 26px;
            font-weight: 400;
            color: #6C757D;
            margin-bottom: 0;
            letter-spacing: 0.01em;
        }

        /* ==================== Layout ==================== */
        .main-row {
            gap: 48px !important;
            margin-bottom: 48px !important;
        }

        /* Left column elegant container */
        .main-row > .column:first-child {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.8) 0%, rgba(252, 253, 254, 0.6) 100%) !important;
            border-radius: 28px !important;
            padding: 40px !important;
            border: 1px solid rgba(52, 152, 219, 0.08) !important;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.04) !important;
        }

        /* Right column elegant container */
        .main-row > .column:last-child {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.8) 0%, rgba(252, 253, 254, 0.6) 100%) !important;
            border-radius: 28px !important;
            padding: 40px !important;
            border: 1px solid rgba(52, 152, 219, 0.08) !important;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.04) !important;
        }

        /* ==================== Premium Cards - Light & Spacious ==================== */
        .upload-card {
            background: rgba(255, 255, 255, 0.95) !important;
            border-radius: 32px !important;
            box-shadow:
                0 4px 16px rgba(0, 0, 0, 0.06),
                0 2px 4px rgba(0, 0, 0, 0.03),
                0 1px 2px rgba(0, 0, 0, 0.02) !important;
            border: 1px solid rgba(0, 0, 0, 0.05) !important;
            padding: 48px !important;
            margin-bottom: 32px !important;
            transition: all 0.4s cubic-bezier(0.25, 0.46, 0.45, 0.94) !important;
            overflow: visible !important;
        }

        .results-card {
            background: transparent !important;
            border-radius: 0 !important;
            box-shadow: none !important;
            border: none !important;
            padding: 0 !important;
            margin-bottom: 32px !important;
            overflow: visible !important;
        }

        /* Caption Results Container - Elegant Design */
        .caption-results-container {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.85) 0%, rgba(252, 253, 254, 0.7) 100%) !important;
            border-radius: 28px !important;
            padding: 44px !important;
            border: 1px solid rgba(52, 152, 219, 0.1) !important;
            box-shadow:
                0 4px 20px rgba(0, 0, 0, 0.04),
                0 2px 8px rgba(52, 152, 219, 0.03) !important;
            margin-bottom: 40px !important;
            overflow: visible !important;
        }

        .upload-card:hover {
            box-shadow:
                0 8px 32px rgba(0, 0, 0, 0.10),
                0 4px 8px rgba(0, 0, 0, 0.06) !important;
            transform: translateY(-6px);
            border-color: rgba(52, 152, 219, 0.3) !important;
        }

        /* ==================== Upload Area ==================== */
        .upload-area {
            border: 3px dashed rgba(52, 152, 219, 0.35) !important;
            border-radius: 28px !important;
            background: linear-gradient(135deg, rgba(52, 152, 219, 0.03) 0%, rgba(52, 152, 219, 0.06) 100%) !important;
            padding: 96px 40px !important;
            text-align: center !important;
            transition: all 0.3s ease !important;
            min-height: 360px !important;
        }

        .upload-area:hover {
            border-color: #3498DB !important;
            background: linear-gradient(135deg, rgba(52, 152, 219, 0.06) 0%, rgba(52, 152, 219, 0.12) 100%) !important;
            transform: scale(1.02);
        }

        /* ==================== Section Titles - Consistent Spacing ==================== */
        .section-title {
            font-size: 28px !important;
            font-weight: 700 !important;
            color: #2C3E50 !important;
            margin-bottom: 20px !important;
            letter-spacing: -0.02em !important;
            padding-bottom: 0 !important;
            border-bottom: none !important;
            text-align: left !important;
            margin-top: 0 !important;
        }

        .section-title-left {
            font-size: 28px !important;
            font-weight: 700 !important;
            color: #2C3E50 !important;
            margin-bottom: 20px !important;
            margin-top: 0 !important;
            letter-spacing: -0.02em !important;
            text-align: left !important;
            border-bottom: none !important;
            padding-bottom: 0 !important;
        }

        /* ==================== Form Elements - Generous Padding ==================== */
        .settings-row {
            gap: 24px !important;
            margin-bottom: 28px !important;
        }

        .radio-group {
            background: rgba(248, 249, 250, 0.5) !important;
            border-radius: 20px !important;
            padding: 24px 28px !important;
            border: none !important;
            margin-bottom: 24px !important;
            border: 1px solid rgba(0, 0, 0, 0.04) !important;
        }

        .radio-group:last-child {
            margin-bottom: 0 !important;
        }

        /* Inline radio groups for side-by-side layout */
        .radio-group-inline {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.7) 0%, rgba(248, 249, 250, 0.5) 100%) !important;
            border-radius: 16px !important;
            padding: 20px !important;
            border: 1px solid rgba(52, 152, 219, 0.1) !important;
            margin-bottom: 0 !important;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.03) !important;
            transition: all 0.3s ease !important;
        }

        .radio-group-inline:hover {
            box-shadow: 0 4px 16px rgba(52, 152, 219, 0.08) !important;
            border-color: rgba(52, 152, 219, 0.2) !important;
        }

        .radio-group label {
            color: #6C757D !important;
            font-weight: 600 !important;
            font-size: 14px !important;
            margin-bottom: 16px !important;
            letter-spacing: 0.08em !important;
            text-transform: uppercase !important;
            display: block !important;
            text-align: left !important;
        }

        /* Radio group title (the actual input label) */
        .radio-group > label:first-child {
            color: #2C3E50 !important;
            font-weight: 700 !important;
            font-size: 19px !important;
            margin-bottom: 16px !important;
            letter-spacing: -0.02em !important;
            text-transform: none !important;
        }

        /* Inline radio group title - BIGGER and BOLD */
        .radio-group-inline > label:first-child {
            color: #2C3E50 !important;
            font-weight: 700 !important;
            font-size: 18px !important;
            margin-bottom: 14px !important;
            letter-spacing: -0.01em !important;
            text-transform: none !important;
            display: block !important;
        }

        .radio-group input[type="radio"] {
            accent-color: #3498DB !important;
            width: 22px !important;
            height: 22px !important;
            margin-right: 14px !important;
        }

        /* Radio option labels */
        .radio-group > div > label {
            color: #495057 !important;
            font-weight: 500 !important;
            font-size: 17px !important;
            letter-spacing: -0.01em !important;
            text-transform: none !important;
            padding: 14px 20px !important;
            border-radius: 14px !important;
            transition: all 0.2s ease !important;
            cursor: pointer !important;
            display: flex !important;
            align-items: center !important;
        }

        /* Inline radio option labels - BIGGER */
        .radio-group-inline > div > label {
            color: #495057 !important;
            font-weight: 500 !important;
            font-size: 16px !important;
            letter-spacing: -0.01em !important;
            text-transform: none !important;
            padding: 12px 16px !important;
            border-radius: 10px !important;
            transition: all 0.2s ease !important;
            cursor: pointer !important;
            display: flex !important;
            align-items: center !important;
            background: rgba(255, 255, 255, 0.6) !important;
            margin-bottom: 8px !important;
            border: 1px solid rgba(0, 0, 0, 0.04) !important;
        }

        .radio-group > div > label:hover {
            background: rgba(52, 152, 219, 0.08) !important;
        }

        .radio-group-inline > div > label:hover {
            background: rgba(52, 152, 219, 0.12) !important;
            transform: translateX(4px);
        }

        /* ==================== Button ==================== */
        .generate-button {
            background: linear-gradient(135deg, #3498DB 0%, #2980B9 100%) !important;
            color: white !important;
            border: none !important;
            border-radius: 20px !important;
            padding: 24px 64px !important;
            font-size: 19px !important;
            font-weight: 700 !important;
            cursor: pointer !important;
            box-shadow:
                0 6px 24px rgba(52, 152, 219, 0.35),
                0 3px 6px rgba(52, 152, 219, 0.25) !important;
            transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
            letter-spacing: -0.02em !important;
            width: 100% !important;
            margin-top: 24px !important;
        }

        .generate-button:hover {
            transform: translateY(-6px) scale(1.02) !important;
            box-shadow:
                0 16px 48px rgba(52, 152, 219, 0.45),
                0 6px 12px rgba(52, 152, 219, 0.35) !important;
        }

        .generate-button:active {
            transform: translateY(-3px) scale(1.01) !important;
        }

        /* ==================== Caption Cards - Light & Elegant ==================== */
        .caption-card {
            background: linear-gradient(135deg, rgba(255, 255, 255, 0.98) 0%, rgba(248, 249, 250, 0.95) 100%);
            backdrop-filter: blur(20px);
            border: 1px solid rgba(0, 0, 0, 0.06);
            border-radius: 28px;
            padding: 32px 36px;
            margin-bottom: 28px;
            transition: all 0.4s cubic-bezier(0.25, 0.46, 0.45, 0.94);
            box-shadow:
                0 4px 16px rgba(0, 0, 0, 0.05),
                0 2px 4px rgba(0, 0, 0, 0.03);
            position: relative;
        }

        .caption-card:hover {
            box-shadow:
                0 8px 32px rgba(0, 0, 0, 0.10),
                0 4px 8px rgba(0, 0, 0, 0.06);
            transform: translateY(-6px);
            border-color: rgba(52, 152, 219, 0.3);
        }

        .caption-header {
            font-size: 15px;
            font-weight: 700;
            color: #6C757D;
            text-transform: uppercase;
            letter-spacing: 0.14em;
            margin-bottom: 20px;
        }

        .caption-text {
            font-size: 21px;
            font-weight: 400;
            color: #2C3E50;
            line-height: 1.8;
            margin-bottom: 24px;
            letter-spacing: -0.01em;
        }

        .caption-hashtags {
            font-size: 18px;
            font-weight: 600;
            color: #3498DB;
            margin-bottom: 0;
            word-wrap: break-word;
            line-height: 1.75;
        }

        /* Copy Button */
        .copy-button {
            position: absolute;
            top: 28px;
            right: 28px;
            background: rgba(52, 152, 219, 0.10);
            border: 1px solid rgba(52, 152, 219, 0.25);
            border-radius: 14px;
            padding: 12px 20px;
            font-size: 15px;
            font-weight: 600;
            color: #3498DB;
            cursor: pointer;
            transition: all 0.2s ease;
            display: flex;
            align-items: center;
            gap: 8px;
        }

        .copy-button:hover {
            background: rgba(52, 152, 219, 0.18);
            border-color: #3498DB;
            transform: translateY(-2px);
            box-shadow: 0 4px 12px rgba(52, 152, 219, 0.25);
        }

        .copy-button:active {
            transform: translateY(0);
        }

        .copy-button.copied {
            background: rgba(39, 174, 96, 0.15);
            border-color: #27AE60;
            color: #27AE60;
        }

        /* ==================== Footer ==================== */
        .app-footer {
            text-align: center;
            margin-top: 96px;
            padding-top: 64px;
            border-top: 3px solid rgba(0, 0, 0, 0.08);
            animation: fadeInUp 0.8s ease-out 0.3s backwards;
        }

        @keyframes fadeInUp {
            from {
                opacity: 0;
                transform: translateY(30px);
            }
            to {
                opacity: 1;
                transform: translateY(0);
            }
        }

        .footer-text {
            font-size: 17px;
            color: #6C757D;
            line-height: 2.0;
            letter-spacing: -0.01em;
            font-weight: 500;
        }

        .footer-models {
            font-size: 15px;
            color: #ADB5BD;
            margin-top: 20px;
            font-weight: 600;
            letter-spacing: 0.03em;
        }

        /* ==================== Image Display ==================== */
        .image-container {
            border-radius: 28px !important;
            overflow: hidden !important;
            box-shadow:
                0 6px 24px rgba(0, 0, 0, 0.10),
                0 3px 6px rgba(0, 0, 0, 0.06) !important;
        }

        .image-container img {
            border-radius: 28px !important;
            box-shadow:
                0 6px 24px rgba(0, 0, 0, 0.12),
                0 3px 6px rgba(0, 0, 0, 0.08) !important;
        }

        /* ==================== Responsive Design ==================== */
        @media (max-width: 768px) {
            .contain {
                padding: 48px 32px 64px 32px !important;
            }

            .app-title {
                font-size: 52px;
            }

            .app-subtitle {
                font-size: 20px;
            }

            .upload-card, .options-card, .results-card {
                padding: 40px !important;
            }

            .upload-area {
                padding: 64px 32px !important;
                min-height: 280px !important;
            }

            .caption-card {
                padding: 28px;
            }

            .section-title {
                font-size: 30px !important;
            }

            .copy-button {
                top: 20px;
                right: 20px;
                padding: 10px 16px;
                font-size: 14px;
            }
        }

        /* ==================== Loading Animation ==================== */
        @keyframes shimmer {
            0% {
                background-position: -1000px 0;
            }
            100% {
                background-position: 1000px 0;
            }
        }

        .loading {
            animation: shimmer 2s infinite;
            background: linear-gradient(to right, #f8f9fa 4%, #e9ecef 25%, #f8f9fa 36%);
            background-size: 1000px 100%;
        }
        """
|
| 525 |
+
|
| 526 |
+
def create_header(self):
|
| 527 |
+
"""Create application header"""
|
| 528 |
+
return gr.HTML("""
|
| 529 |
+
<div class="app-header">
|
| 530 |
+
<h1 class="app-title">✨ Pixcribe</h1>
|
| 531 |
+
<p class="app-subtitle">AI-Powered Social Media Caption Generator</p>
|
| 532 |
+
</div>
|
| 533 |
+
""")
|
| 534 |
+
|
| 535 |
+
    def create_info_banner(self):
        """Create informational banner about model loading and processing times.

        Returns a gr.HTML component warning the user that first-run model
        initialization (YOLOv11, OpenCLIP, Qwen2.5-VL, etc.) can be slow.
        The HTML below is rendered verbatim — it is runtime content.
        """
        return gr.HTML("""
        <div style="
            background: linear-gradient(135deg, #E8F4F8 0%, #D4E9F2 100%);
            border-left: 4px solid #3498DB;
            border-radius: 16px;
            padding: 24px 32px;
            margin: 0 auto 48px auto;
            max-width: 1200px;
            box-shadow: 0 4px 16px rgba(52, 152, 219, 0.12);
        ">
            <div style="display: flex; align-items: start; gap: 20px;">
                <div style="font-size: 32px; line-height: 1; margin-top: 4px;">⏱️</div>
                <div style="flex: 1;">
                    <h3 style="
                        margin: 0 0 12px 0;
                        font-size: 20px;
                        font-weight: 700;
                        color: #2C3E50;
                        letter-spacing: -0.02em;
                    ">
                        Please Note: Processing Time
                    </h3>
                    <p style="
                        margin: 0 0 12px 0;
                        font-size: 15px;
                        line-height: 1.6;
                        color: #5D6D7E;
                    ">
                        <strong style="color: #2980B9;">Initial setup and model loading may take a while</strong> as multiple AI models
                        are initialized and cached. This includes YOLOv11 object detection, OpenCLIP semantic analysis,
                        Qwen2.5-VL caption generation, and other advanced models.
                    </p>
                    <p style="
                        margin: 0;
                        font-size: 15px;
                        line-height: 1.6;
                        color: #5D6D7E;
                    ">
                        ✨ <strong style="color: #27AE60;">Processing time varies depending on system resources.</strong>
                        Thank you for your patience while we generate high-quality captions!
                    </p>
                </div>
            </div>
        </div>
        """)
|
| 582 |
+
|
| 583 |
+
def create_footer(self):
|
| 584 |
+
"""Create application footer"""
|
| 585 |
+
return gr.HTML("""
|
| 586 |
+
<div class="app-footer">
|
| 587 |
+
<p class="footer-text">
|
| 588 |
+
Powered by advanced AI models
|
| 589 |
+
</p>
|
| 590 |
+
<p class="footer-models">
|
| 591 |
+
YOLOv11 · OpenCLIP ViT-H/14 · Qwen2.5-VL-7B · EasyOCR · Places365 · U2-Net
|
| 592 |
+
</p>
|
| 593 |
+
<p class="footer-text" style="margin-top: 32px;">
|
| 594 |
+
© 2025 Pixcribe · Built for creators
|
| 595 |
+
</p>
|
| 596 |
+
</div>
|
| 597 |
+
""")
|
| 598 |
+
|
| 599 |
+
def format_captions_with_copy(self, captions: List[Dict]) -> str:
|
| 600 |
+
"""Format captions as HTML with copy functionality"""
|
| 601 |
+
if not captions:
|
| 602 |
+
return "<p style='color: #6C757D; padding: 24px;'>No captions generated</p>"
|
| 603 |
+
|
| 604 |
+
captions_html = ""
|
| 605 |
+
for i, cap in enumerate(captions):
|
| 606 |
+
caption_text = cap.get('caption', '')
|
| 607 |
+
hashtags = cap.get('hashtags', [])
|
| 608 |
+
tone = cap.get('tone', 'unknown').title()
|
| 609 |
+
|
| 610 |
+
# Create unique ID for each caption
|
| 611 |
+
caption_id = f"caption_{i}"
|
| 612 |
+
|
| 613 |
+
# Full text to copy (caption + hashtags)
|
| 614 |
+
full_text = f"{caption_text}\n\n{' '.join([f'#{tag}' for tag in hashtags])}"
|
| 615 |
+
|
| 616 |
+
captions_html += f"""
|
| 617 |
+
<div class="caption-card" id="{caption_id}">
|
| 618 |
+
<button class="copy-button" onclick="copyCaption{i}()" id="copy-btn-{i}">
|
| 619 |
+
📋 Copy
|
| 620 |
+
</button>
|
| 621 |
+
<div class="caption-header">Caption {i+1} · {tone}</div>
|
| 622 |
+
<div class="caption-text">{caption_text}</div>
|
| 623 |
+
<div class="caption-hashtags">
|
| 624 |
+
{' '.join([f'#{tag}' for tag in hashtags])}
|
| 625 |
+
</div>
|
| 626 |
+
<textarea id="caption-text-{i}" style="position: absolute; left: -9999px;">{full_text}</textarea>
|
| 627 |
+
</div>
|
| 628 |
+
|
| 629 |
+
<script>
|
| 630 |
+
function copyCaption{i}() {{
|
| 631 |
+
const text = document.getElementById('caption-text-{i}').value;
|
| 632 |
+
const btn = document.getElementById('copy-btn-{i}');
|
| 633 |
+
|
| 634 |
+
// Try modern clipboard API first
|
| 635 |
+
if (navigator.clipboard && navigator.clipboard.writeText) {{
|
| 636 |
+
navigator.clipboard.writeText(text).then(() => {{
|
| 637 |
+
btn.innerHTML = '✓ Copied!';
|
| 638 |
+
btn.classList.add('copied');
|
| 639 |
+
setTimeout(() => {{
|
| 640 |
+
btn.innerHTML = '📋 Copy';
|
| 641 |
+
btn.classList.remove('copied');
|
| 642 |
+
}}, 2000);
|
| 643 |
+
}}).catch(() => {{
|
| 644 |
+
// Fallback to old method
|
| 645 |
+
fallbackCopy{i}();
|
| 646 |
+
}});
|
| 647 |
+
}} else {{
|
| 648 |
+
// Fallback for older browsers
|
| 649 |
+
fallbackCopy{i}();
|
| 650 |
+
}}
|
| 651 |
+
}}
|
| 652 |
+
|
| 653 |
+
function fallbackCopy{i}() {{
|
| 654 |
+
const textarea = document.getElementById('caption-text-{i}');
|
| 655 |
+
const btn = document.getElementById('copy-btn-{i}');
|
| 656 |
+
textarea.style.position = 'static';
|
| 657 |
+
textarea.style.opacity = '0';
|
| 658 |
+
textarea.select();
|
| 659 |
+
try {{
|
| 660 |
+
document.execCommand('copy');
|
| 661 |
+
btn.innerHTML = '✓ Copied!';
|
| 662 |
+
btn.classList.add('copied');
|
| 663 |
+
setTimeout(() => {{
|
| 664 |
+
btn.innerHTML = '📋 Copy';
|
| 665 |
+
btn.classList.remove('copied');
|
| 666 |
+
}}, 2000);
|
| 667 |
+
}} catch (err) {{
|
| 668 |
+
btn.innerHTML = '✗ Failed';
|
| 669 |
+
setTimeout(() => {{
|
| 670 |
+
btn.innerHTML = '📋 Copy';
|
| 671 |
+
}}, 2000);
|
| 672 |
+
}}
|
| 673 |
+
textarea.style.position = 'absolute';
|
| 674 |
+
textarea.style.opacity = '1';
|
| 675 |
+
}}
|
| 676 |
+
</script>
|
| 677 |
+
"""
|
| 678 |
+
|
| 679 |
+
return captions_html
|
| 680 |
+
|
| 681 |
+
# Import-time status message (string is runtime output — left unchanged).
print("✓ UIManager defined")
|
universal_object_prompts.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
class UniversalObjectPrompts:
    """
    Universal object description prompt library.

    Covers everyday items, animals, vehicles, electronics and more, so the
    captioning pipeline can describe a wide variety of image subjects even
    when no brand/landmark/scene-specific vocabulary applies.
    """

    def __init__(self):
        """Build the category -> subcategory -> prompt-list vocabulary."""

        # Two-level mapping: top-level category -> subcategory -> list of
        # ready-to-use English description prompts (10 per subcategory).
        self.object_vocabularies = {
            # ===== Animals =====
            'animals': {
                'dogs': [
                    'friendly dog with expressive eyes and playful demeanor',
                    'canine companion with soft fur and loyal presence',
                    'domestic dog breed with distinct markings and alert posture',
                    'pet dog in outdoor setting with natural behavior',
                    'puppy with cute features and energetic personality',
                    'large breed dog with muscular build and protective stance',
                    'small lap dog with fluffy coat and adorable expression',
                    'working dog demonstrating intelligence and trained skills',
                    'mixed breed dog with unique features and charming character',
                    'dog portrait with focused gaze and photogenic qualities'
                ],
                'cats': [
                    'elegant cat with graceful posture and alert expression',
                    'feline companion with soft fur and independent character',
                    'domestic cat with distinctive markings and curious nature',
                    'cat resting in comfortable position with relaxed demeanor',
                    'kitten with playful energy and adorable tiny features',
                    'long-haired cat with fluffy coat and majestic appearance',
                    'short-haired cat with sleek coat and athletic build',
                    'cat portrait with piercing eyes and photogenic pose',
                    'tabby cat with striped pattern and charming personality',
                    'cat in natural sunlight with warm ambient lighting'
                ],
                'birds': [
                    'colorful bird with vibrant plumage and natural beauty',
                    'bird in flight with spread wings and dynamic motion',
                    'perched bird with detailed feather texture and alert posture',
                    'exotic bird species with distinctive beak and eye markings',
                    'songbird with delicate features and graceful appearance',
                    'bird of prey with powerful build and intense gaze',
                    'waterfowl with sleek feathers and aquatic adaptation',
                    'tropical bird with brilliant colors and exotic appeal',
                    'bird feeding or foraging showing natural behavior',
                    'bird silhouette against sky with artistic composition'
                ],
                'wildlife': [
                    'wildlife creature in natural habitat showing authentic behavior',
                    'wild animal with powerful build and majestic presence',
                    'forest wildlife with camouflage coloring and alert senses',
                    'marine wildlife with aquatic adaptation and fluid movement',
                    'safari animal with distinctive features and exotic appeal',
                    'small mammal with cute features and curious expression',
                    'endangered species with conservation importance and beauty',
                    'nocturnal animal with adapted eyes and nighttime behavior',
                    'wildlife portrait with environmental context and natural light',
                    'animal in motion demonstrating speed agility or power'
                ]
            },

            # ===== Vehicles =====
            'vehicles': {
                'cars': [
                    'modern automobile with sleek design and aerodynamic lines',
                    'luxury car with premium finish and sophisticated styling',
                    'sports car with aggressive stance and performance aesthetics',
                    'classic car with vintage charm and timeless design',
                    'electric vehicle with futuristic design and eco-friendly appeal',
                    'SUV with robust build and commanding presence',
                    'sedan with elegant profile and comfortable proportions',
                    'convertible with open top and free-spirited character',
                    'vintage automobile with chrome details and nostalgic beauty',
                    'race car with aerodynamic body and competition livery'
                ],
                'motorcycles': [
                    'motorcycle with powerful engine and dynamic design',
                    'cruiser bike with low profile and relaxed riding position',
                    'sport bike with aggressive fairings and racing aesthetics',
                    'vintage motorcycle with classic styling and heritage appeal',
                    'custom bike with unique modifications and personal touches',
                    'touring motorcycle with comfort features and long-distance capability',
                    'dirt bike with off-road tires and rugged construction',
                    'scooter with practical design and urban mobility',
                    'cafe racer with minimalist design and retro styling',
                    'adventure motorcycle with all-terrain capability and robust build'
                ],
                'bicycles': [
                    'road bike with lightweight frame and racing geometry',
                    'mountain bike with suspension and off-road tires',
                    'vintage bicycle with classic design and nostalgic charm',
                    'urban commuter bike with practical features and city-ready design',
                    'electric bicycle with motor assist and modern technology',
                    'BMX bike with compact frame and trick-ready build',
                    'touring bicycle with panniers and long-distance setup',
                    'folding bike with space-saving design and portability',
                    'fixed gear bike with minimalist aesthetic and urban style',
                    'cruiser bicycle with comfortable seat and relaxed riding position'
                ],
                'public_transport': [
                    'city bus with public transit livery and urban setting',
                    'train at station with platform and passenger environment',
                    'subway car with interior lighting and metro system',
                    'tram on city streets with overhead wires and urban backdrop',
                    'ferry boat with water transportation and maritime setting',
                    'taxi cab with distinctive markings and urban context',
                    'double-decker bus with iconic design and city character',
                    'monorail with elevated track and futuristic appearance',
                    'light rail vehicle with modern design and efficient transit',
                    'cable car with hillside location and scenic views'
                ]
            },

            # ===== Electronics =====
            'electronics': {
                'smartphones': [
                    'modern smartphone with edge-to-edge display and sleek design',
                    'mobile phone with premium materials and minimalist aesthetic',
                    'smartphone showing screen interface with app icons and features',
                    'phone with camera system and advanced photography capabilities',
                    'mobile device with protective case and personal accessories',
                    'smartphone in hand demonstrating use and scale',
                    'phone with wireless charging and modern conveniences',
                    'mobile phone with notification screen and communication features',
                    'smartphone capturing photo showing photography in action',
                    'device with headphones and mobile entertainment setup'
                ],
                'laptops': [
                    'laptop computer with open screen and modern workspace',
                    'portable computer with sleek design and professional appearance',
                    'laptop showing desktop interface and productivity software',
                    'computer with external peripherals and complete workstation',
                    'thin and light laptop with premium build and portability',
                    'gaming laptop with powerful specs and aggressive styling',
                    'laptop in cafe setting with remote work environment',
                    'computer with split screen showing multitasking capability',
                    'laptop with coding environment and developer workflow',
                    'portable computer with creative software and design work'
                ],
                'cameras': [
                    'professional camera with interchangeable lens and manual controls',
                    'DSLR camera with telephoto lens and photography equipment',
                    'mirrorless camera with compact design and modern features',
                    'vintage film camera with classic design and analog charm',
                    'action camera with rugged housing and adventure ready build',
                    'instant camera with retro aesthetic and print functionality',
                    'camera on tripod with stable shooting setup',
                    'photography gear with lenses filters and accessories',
                    "camera with viewfinder showing photographer's perspective",
                    'compact camera with point-and-shoot simplicity'
                ],
                'wearables': [
                    'smartwatch with digital display and fitness tracking features',
                    'fitness tracker with health monitoring and activity data',
                    'wireless earbuds with charging case and modern design',
                    'smart glasses with augmented reality and tech integration',
                    'VR headset with immersive technology and gaming capability',
                    'smart ring with minimalist design and health sensors',
                    'activity band with water resistance and sport features',
                    'wireless headphones with noise cancellation and premium audio',
                    'smart jewelry with notification features and elegant styling',
                    'wearable device with heart rate monitor and workout tracking'
                ]
            },

            # ===== Home Items =====
            'home_items': {
                'furniture': [
                    'modern sofa with clean lines and comfortable upholstery',
                    'wooden dining table with natural grain and family seating',
                    'contemporary chair with ergonomic design and stylish form',
                    'bookshelf with organized volumes and decorative objects',
                    'bed with plush bedding and inviting sleep environment',
                    'desk with workspace organization and productive setup',
                    'coffee table with minimalist design and functional surface',
                    'cabinet with storage solutions and practical organization',
                    'armchair with cozy cushioning and reading nook appeal',
                    'sideboard with display area and dining room elegance'
                ],
                'decor': [
                    'wall art with framed artwork and gallery wall aesthetic',
                    'decorative plant with lush foliage and natural greenery',
                    'vase with fresh flowers and elegant arrangement',
                    'candles with ambient lighting and cozy atmosphere',
                    'throw pillows with colorful patterns and comfort layers',
                    'mirror with decorative frame and space-enhancing reflection',
                    'rug with textile pattern and floor covering warmth',
                    'sculpture with artistic form and decorative presence',
                    'decorative bowls with artisan craft and functional beauty',
                    'wall clock with timepiece function and design statement'
                ],
                'kitchenware': [
                    'ceramic plates with elegant design and table setting ready',
                    'glassware with crystal clarity and beverage service',
                    'cookware with non-stick surface and culinary preparation',
                    'cutting board with natural wood and food prep surface',
                    'kitchen utensils with stainless steel and cooking tools',
                    'coffee maker with brewing capability and morning ritual',
                    'mixing bowls with nesting design and baking essentials',
                    'serving platters with presentation surface and entertaining ready',
                    'storage containers with organization and food preservation',
                    'tea kettle with stovetop heating and beverage preparation'
                ]
            },

            # ===== Fashion Accessories =====
            'fashion_accessories': {
                'shoes': [
                    'leather shoes with polished finish and formal elegance',
                    'sneakers with athletic design and casual comfort',
                    'high heels with sophisticated style and fashion statement',
                    'boots with rugged construction and seasonal appropriateness',
                    'sandals with open design and warm weather comfort',
                    'loafers with slip-on convenience and smart casual style',
                    'running shoes with performance features and sport technology',
                    'dress shoes with refined appearance and occasion ready',
                    'canvas shoes with casual aesthetic and everyday wearability',
                    'designer footwear with luxury branding and premium materials'
                ],
                'bags': [
                    'leather handbag with structured form and quality craftsmanship',
                    'backpack with practical compartments and daily carry capability',
                    'tote bag with spacious interior and versatile use',
                    'clutch with compact elegance and evening sophistication',
                    'messenger bag with crossbody strap and urban function',
                    'duffel bag with travel capacity and gym ready design',
                    'satchel with classic styling and professional appearance',
                    'wallet with organized card slots and essential storage',
                    'shoulder bag with adjustable strap and comfortable carry',
                    'luxury bag with designer branding and premium construction'
                ],
                'jewelry': [
                    'necklace with pendant design and elegant neckline accent',
                    'earrings with gemstone sparkle and facial framing beauty',
                    'ring with precious metal and symbolic significance',
                    'bracelet with linked design and wrist adornment',
                    'watch with timepiece function and wrist jewelry appeal',
                    'brooch with decorative pin and vintage charm',
                    'anklet with delicate chain and summer accessory style',
                    'cufflinks with formal accent and menswear detail',
                    'charm bracelet with personal tokens and memory collection',
                    'statement jewelry with bold design and fashion impact'
                ],
                'eyewear': [
                    'sunglasses with UV protection and stylish frames',
                    'eyeglasses with prescription lenses and daily wear design',
                    'aviator sunglasses with classic pilot styling and metal frame',
                    'cat-eye glasses with vintage inspired shape and feminine flair',
                    'sport sunglasses with wraparound design and performance features',
                    'reading glasses with magnification and close-work utility',
                    'designer eyewear with luxury branding and premium materials',
                    'mirrored sunglasses with reflective lenses and modern edge',
                    'oversized sunglasses with dramatic proportions and fashion statement',
                    'safety glasses with protective function and durable construction'
                ]
            },

            # ===== Sports Equipment =====
            'sports_equipment': {
                'fitness': [
                    'dumbbells with weight plates and strength training equipment',
                    'yoga mat with non-slip surface and exercise foundation',
                    'resistance bands with elastic tension and portable workout',
                    'kettlebell with cast iron construction and functional training',
                    'foam roller with massage texture and recovery tool',
                    'exercise ball with inflatable design and core workout',
                    'jump rope with cardio training and coordination exercise',
                    'weight bench with adjustable positions and lifting support',
                    'pull-up bar with doorframe mounting and bodyweight exercise',
                    'treadmill with running surface and cardio machine'
                ],
                'outdoor_sports': [
                    'tennis racket with string tension and court sport equipment',
                    'basketball with leather or composite cover and game ready',
                    'soccer ball with classic panel design and field sport',
                    'golf clubs with metal woods and iron set',
                    'baseball glove with leather construction and catching mitt',
                    'skateboard with deck grip tape and wheel assembly',
                    'surfboard with wax coating and wave riding design',
                    'snowboard with bindings and mountain sport equipment',
                    'hiking boots with ankle support and trail ready tread',
                    'camping tent with weatherproof fabric and outdoor shelter'
                ]
            },

            # ===== Musical Instruments =====
            'musical_instruments': {
                'string': [
                    'acoustic guitar with wooden body and string instrument charm',
                    'electric guitar with solid body and amplified rock sound',
                    'violin with curved body and classical string beauty',
                    'cello with rich tone and orchestral presence',
                    'bass guitar with deep sound and rhythm section foundation',
                    'ukulele with small size and tropical string instrument',
                    'harp with multiple strings and angelic sound quality',
                    'banjo with circular body and folk music character',
                    'mandolin with paired strings and bright tone',
                    'sitar with resonating strings and world music heritage'
                ],
                'keyboard': [
                    'piano with ivory keys and classical instrument elegance',
                    'keyboard synthesizer with electronic sound and modern music',
                    'organ with multiple manuals and church music tradition',
                    'electric piano with vintage tone and stage performance',
                    'digital piano with weighted keys and home practice',
                    'accordion with bellows and folk dance music',
                    'MIDI controller with production capability and studio tool',
                    'harpsichord with baroque styling and historical instrument',
                    'melodica with breath control and portable keyboard',
                    'keytar with shoulder strap and performance showmanship'
                ],
                'percussion': [
                    'drum kit with multiple pieces and rhythmic foundation',
                    'djembe with hand drumming and African rhythm',
                    'conga drums with Latin percussion and tropical beat',
                    'tambourine with jingles and shaker instrument',
                    'bongos with paired drums and Latin music style',
                    'xylophone with mallet playing and melodic percussion',
                    'cymbals with crash sound and orchestral accent',
                    'maracas with rattle sound and Latin rhythm shaker',
                    'cajón with box drum and flamenco percussion',
                    'timpani with kettle drum and orchestral thunder'
                ]
            },

            # ===== Office Supplies =====
            'office_supplies': {
                'stationery': [
                    'pen with smooth writing and everyday writing tool',
                    'notebook with lined pages and note-taking essential',
                    'pencil with graphite lead and sketching tool',
                    'markers with vibrant colors and highlighting capability',
                    'sticky notes with adhesive backing and reminder function',
                    'paper clips with metal construction and document organization',
                    'stapler with binding function and paper fastening',
                    'tape dispenser with adhesive roll and office essential',
                    'ruler with measurement markings and straight edge',
                    'scissors with sharp blades and cutting tool'
                ],
                'desk_items': [
                    'desk lamp with adjustable arm and task lighting',
                    'organizer with compartments and clutter management',
                    'mouse pad with smooth surface and wrist support',
                    'desk calendar with date tracking and planning tool',
                    'pen holder with upright storage and writing implement organization',
                    'file folders with document sorting and category organization',
                    'desk mat with large surface and workspace protection',
                    'paper tray with stacking design and document storage',
                    'business card holder with professional presentation',
                    'cable organizer with cord management and tidy workspace'
                ]
            },

            # ===== Toys and Games =====
            'toys_games': {
                'toys': [
                    'stuffed animal with soft plush and cuddly companion',
                    'action figure with articulated joints and character play',
                    'doll with detailed features and imaginative play',
                    'building blocks with interlocking pieces and creative construction',
                    'toy car with rolling wheels and miniature vehicle play',
                    'puzzle with interlocking pieces and problem-solving challenge',
                    'board game with playing pieces and family entertainment',
                    'card game with illustrated cards and strategy play',
                    'remote control toy with wireless operation and interactive play',
                    'educational toy with learning elements and developmental benefits'
                ],
                'collectibles': [
                    'figurine with detailed sculpting and display collectible',
                    'vinyl toy with designer art and limited edition appeal',
                    'model kit with assembly parts and hobbyist construction',
                    'trading cards with collectible series and rarity value',
                    'die-cast model with metal construction and scale replica',
                    'statue with artistic detail and collector showcase',
                    'pop culture figure with character likeness and fandom appeal',
                    'vintage toy with nostalgic value and retro charm',
                    'limited edition collectible with numbered series and exclusivity',
                    'display case with protective housing and collection showcase'
                ]
            }
        }

        print(f"✓ Universal Object Prompts initialized with {len(self.object_vocabularies)} major categories")

    def get_prompts(self, category: str, subcategory: Optional[str] = None) -> List[str]:
        """
        Fetch object prompts.

        Args:
            category: object category (e.g. 'animals', 'vehicles')
            subcategory: subcategory (e.g. 'dogs', 'cats'); when omitted,
                prompts from every subcategory of the category are returned.

        Returns:
            List of prompt strings; empty list when the category or
            subcategory is unknown.
        """
        category_prompts = self.object_vocabularies.get(category, {})

        if subcategory:
            return category_prompts.get(subcategory, [])
        else:
            # Flatten all subcategory prompt lists of this category.
            all_prompts = []
            for prompts in category_prompts.values():
                if isinstance(prompts, list):
                    all_prompts.extend(prompts)
            return all_prompts

    def get_all_categories(self) -> List[str]:
        """Return every top-level object category name."""
        return list(self.object_vocabularies.keys())

    def get_subcategories(self, category: str) -> List[str]:
        """Return the subcategory names of a category (empty list if unknown)."""
        category_data = self.object_vocabularies.get(category, {})
        return list(category_data.keys()) if isinstance(category_data, dict) else []

    def detect_object_category(self, detected_objects: List[str]) -> Optional[str]:
        """
        Guess the dominant prompt category from detected object labels.

        Args:
            detected_objects: object class names, e.g. from YOLO detection.

        Returns:
            The first matching category name, or None when no keyword matches.
            Checks run in fixed priority order (animals first), so an image
            containing both a dog and a car maps to 'animals'.
        """
        # Substring matching over the joined, lower-cased label string.
        object_str = ' '.join(detected_objects).lower()

        # Animal keywords
        if any(kw in object_str for kw in ['dog', 'cat', 'bird', 'animal', 'pet']):
            return 'animals'

        # Vehicle keywords
        if any(kw in object_str for kw in ['car', 'bike', 'motorcycle', 'bus', 'train', 'vehicle']):
            return 'vehicles'

        # Electronics keywords
        if any(kw in object_str for kw in ['phone', 'laptop', 'camera', 'computer', 'tablet']):
            return 'electronics'

        # Home-item keywords
        if any(kw in object_str for kw in ['chair', 'table', 'bed', 'couch', 'furniture']):
            return 'home_items'

        # Fashion-accessory keywords
        if any(kw in object_str for kw in ['shoe', 'bag', 'handbag', 'backpack', 'watch']):
            return 'fashion_accessories'

        # Sports-equipment keywords
        if any(kw in object_str for kw in ['ball', 'racket', 'equipment', 'fitness']):
            return 'sports_equipment'

        # Musical-instrument keywords
        if any(kw in object_str for kw in ['guitar', 'piano', 'drum', 'instrument']):
            return 'musical_instruments'

        return None  # no category recognized
|
| 463 |
+
|
| 464 |
+
# Import-time status marker (notebook-export style): confirms the class above loaded.
print("✓ UniversalObjectPrompts defined")
|
yolo_detection_manager.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from ultralytics import YOLO
|
| 3 |
+
import numpy as np
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
class YOLODetectionManager:
    """Object detection using YOLOv11 (ultralytics).

    Wraps a single YOLO model instance and converts raw ultralytics results
    into plain dicts so downstream fusion/verification managers do not depend
    on ultralytics types.
    """

    def __init__(self, variant: str = 'm'):
        """Load the YOLOv11 weights for the given size variant.

        Args:
            variant: model size suffix (e.g. 'n', 's', 'm', 'l', 'x');
                ultralytics resolves/downloads the file 'yolo11{variant}.pt'.
        """
        print(f"Loading YOLOv11{variant} model...")
        self.model = YOLO(f'yolo11{variant}.pt')
        self.variant = variant
        # Inference thresholds passed to predict() on every call.
        self.conf_threshold = 0.25
        self.iou_threshold = 0.45
        self.max_detections = 100

        # Brand-relevant classes (kept as a public list for compatibility).
        # NOTE(review): 'watch', 'shoe', 'sneaker' and 'boot' are not COCO-80
        # class names, so a stock COCO-trained checkpoint will never emit
        # them — confirm whether a custom-trained model is expected here.
        self.brand_relevant_classes = [
            'handbag', 'bottle', 'cell phone', 'laptop',
            'backpack', 'tie', 'suitcase', 'cup', 'watch',
            'shoe', 'sneaker', 'boot'
        ]
        # Frozen lookup set built once so the per-box membership test in
        # detect() is O(1) instead of scanning the list for every detection.
        self._brand_class_set = frozenset(self.brand_relevant_classes)

        print(f"✓ YOLOv11{variant} loaded")

    def detect(self, image: np.ndarray) -> List[Dict]:
        """Detect objects in an image.

        Args:
            image: input accepted by ultralytics predict (numpy array,
                PIL image, or path).

        Returns:
            One dict per detection with keys: 'class_id', 'class_name',
            'bbox' ([x1, y1, x2, y2] floats in pixel coords), 'confidence',
            'is_brand_relevant', and 'source' (always 'yolo').
        """
        results = self.model.predict(
            image,
            conf=self.conf_threshold,
            iou=self.iou_threshold,
            max_det=self.max_detections,
            verbose=False
        )

        detections = []

        for result in results:
            boxes = result.boxes
            if boxes is None:
                # Defensive: some result objects may carry no boxes at all.
                continue
            for box in boxes:
                class_id = int(box.cls[0])
                class_name = result.names[class_id]
                bbox = box.xyxy[0].cpu().numpy().tolist()
                confidence = float(box.conf[0])

                detections.append({
                    'class_id': class_id,
                    'class_name': class_name,
                    'bbox': bbox,
                    'confidence': confidence,
                    'is_brand_relevant': class_name.lower() in self._brand_class_set,
                    'source': 'yolo'
                })

        return detections

    def filter_brand_relevant_objects(self, detections: List[Dict]) -> List[Dict]:
        """Return only the detections flagged brand-relevant by detect()."""
        return [det for det in detections if det['is_brand_relevant']]
|
| 62 |
+
|
| 63 |
+
# Import-time status marker (notebook-export style): confirms the class above loaded.
print("✓ YOLODetectionManager defined")
|