docs: add API verification comments to gemini_client.py
video_intelligence/gemini_client.py (CHANGED)
@@ -2,6 +2,13 @@
 Video Intelligence Platform – Gemini Integration
 Handles video captioning, text embeddings, query decomposition, and RAG generation.
 Uses the new google-genai SDK (NOT the deprecated google-generativeai).
+
+Verified against google-genai >= 1.0:
+  - Client:   genai.Client(api_key=...)
+  - Generate: client.models.generate_content(model=..., contents=[...], config=...)
+  - Embed:    client.models.embed_content(model=..., contents=..., config=...)
+  - Types:    types.Part.from_bytes, types.Part.from_text, types.GenerateContentConfig,
+              types.EmbedContentConfig
 """
 import time
 import json
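The bullet list above names the verified surface; for reference, a minimal self-contained sketch of those exact call shapes. The model names, API-key placeholder, file path, and prompts here are illustrative, not taken from this commit:

```python
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # Client: genai.Client(api_key=...)

# Generate: contents is a list of Parts (here one image part + one text part).
response = client.models.generate_content(
    model="gemini-2.0-flash",  # illustrative model name
    contents=[
        types.Part.from_bytes(data=Path("frame.jpg").read_bytes(),
                              mime_type="image/jpeg"),
        types.Part.from_text(text="Describe this frame."),
    ],
    config=types.GenerateContentConfig(temperature=0.2),
)
print(response.text or "")

# Embed: contents can be a single string or a list of strings.
emb = client.models.embed_content(
    model="text-embedding-004",  # illustrative model name
    contents="red car turning left",
    config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY"),
)
print(len(emb.embeddings[0].values))  # embedding dimensionality
```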
@@ -46,7 +53,7 @@ class GeminiClient:
         return response.text or ""
 
     def caption_frames_batch(self, frames_bytes: List[bytes],
-
+                             batch_desc: str = "") -> List[str]:
         """Caption multiple frames. Each call is independent."""
         captions = []
         for i, fb in enumerate(frames_bytes):
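Because each frame gets its own call, one bad frame degrades to an empty caption instead of failing the whole batch. A sketch of that loop, assuming JPEG frames and a generic prompt (neither is visible in the hunk):

```python
from typing import List
from google.genai import types

def caption_frames_batch(client, vision_model: str,
                         frames_bytes: List[bytes],
                         batch_desc: str = "") -> List[str]:
    """One independent generate_content call per frame; failures yield ""."""
    captions = []
    for i, fb in enumerate(frames_bytes):
        try:
            response = client.models.generate_content(
                model=vision_model,
                contents=[
                    types.Part.from_bytes(data=fb, mime_type="image/jpeg"),  # assumed mime type
                    types.Part.from_text(text=f"Frame {i}: describe this frame. {batch_desc}".strip()),
                ],
            )
            captions.append(response.text or "")
        except Exception:
            captions.append("")  # matches the empty-string fallback shown in the hunk below
    return captions
```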
@@ -58,8 +65,8 @@ class GeminiClient:
             captions.append("")
         return captions
 
-    def caption_video_segment(self, video_bytes: bytes,
-
+    def caption_video_segment(self, video_bytes: bytes,
+                              prompt: str = "Describe all objects and actions in this video clip.") -> str:
        """Caption a video segment using Gemini's native video understanding."""
         response = self.client.models.generate_content(
             model=self.vision_model,
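For the segment path, the whole clip travels as a single video Part and Gemini samples frames internally. A sketch outside the class, assuming MP4 bytes (the mime type is not visible in this diff):

```python
from google.genai import types

def caption_video_segment(client, vision_model: str, video_bytes: bytes,
                          prompt: str = "Describe all objects and actions in this video clip.") -> str:
    # The raw clip is one Part; no manual frame extraction is needed.
    response = client.models.generate_content(
        model=vision_model,
        contents=[
            types.Part.from_bytes(data=video_bytes, mime_type="video/mp4"),  # assumed mime type
            types.Part.from_text(text=prompt),
        ],
    )
    return response.text or ""
```

Inline bytes work for short segments; clips beyond the inline request limit (roughly 20 MB) would go through the SDK's Files API upload path instead.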
@@ -108,7 +115,7 @@ class GeminiClient:
     def decompose_query(self, query: str) -> Dict:
         """
         Decompose a natural language query into sub-queries + boolean operator.
-
+
         Examples:
             "red car and yellow car" → {"sub_queries": ["red car", "yellow car"], "operator": "AND"}
             "people in white OR blue clothes" → {"sub_queries": ["people in white clothes", "people in blue clothes"], "operator": "OR"}
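The next hunk header shows the decomposition prompt ends with "Respond ONLY with valid JSON:", which implies a parse step on the model's reply. A sketch of that parse with a conservative fallback (the fallback shape is an assumption, not shown in the diff):

```python
import json

def parse_decomposition(raw: str, query: str) -> dict:
    """Parse the model's JSON reply; fall back to treating the query as one sub-query."""
    cleaned = raw.strip()
    # Strip markdown fences the model sometimes wraps around JSON.
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").removeprefix("json").strip()
    try:
        result = json.loads(cleaned)
        if "sub_queries" in result and "operator" in result:
            return result
    except json.JSONDecodeError:
        pass
    return {"sub_queries": [query], "operator": "AND"}  # assumed fallback

# e.g. parse_decomposition('{"sub_queries": ["red car", "yellow car"], "operator": "AND"}',
#                          "red car and yellow car")
```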
@@ -151,10 +158,10 @@ Respond ONLY with valid JSON:
     # ── RAG Answer Generation ───────────────────────────────────────────────
 
     def generate_rag_answer(self, query: str,
-
+                            retrieved_contexts: List[Dict]) -> str:
         """
         Generate a grounded answer using retrieved video segments as context.
-
+
         Args:
             query: User's original question
             retrieved_contexts: List of dicts with keys:
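The docstring's key list is truncated in this view, so the exact context schema is not recoverable. As a sketch of the usual pattern, each retrieved segment is flattened into a numbered context line before generation; the video_id/start/end/caption keys below are assumptions:

```python
from typing import Dict, List

def build_rag_prompt(query: str, retrieved_contexts: List[Dict]) -> str:
    """Flatten retrieved segments into a numbered, citable context block."""
    lines = []
    for i, ctx in enumerate(retrieved_contexts, 1):
        # Keys are illustrative; the real schema is cut off in the diff.
        lines.append(f"[{i}] video={ctx.get('video_id', '?')} "
                     f"t={ctx.get('start', 0.0):.1f}-{ctx.get('end', 0.0):.1f}s: "
                     f"{ctx.get('caption', '')}")
    return ("Answer the question using ONLY the video segments below. "
            "Cite segments by their [n] index.\n\n"
            "Segments:\n" + "\n".join(lines) +
            f"\n\nQuestion: {query}\nAnswer:")
```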
@@ -204,17 +211,17 @@ Instructions:
     # ── Akinator Question Generation ────────────────────────────────────────
 
     def generate_refinement_question(self, query: str,
-
+                                     candidate_attributes: Dict[str, List[str]]) -> Dict:
         """
         Generate the next best question to narrow down results (Akinator-style).
-
+
         Args:
             query: Original user query
             candidate_attributes: Dict mapping attribute_name → list of unique values
                 e.g. {"location": ["indoor", "outdoor"], "time_of_day": ["day", "night"]}
-
+
         Returns:
-            {"attribute": "location", "question": "Is the scene indoor or outdoor?",
+            {"attribute": "location", "question": "Is the scene indoor or outdoor?",
             "options": ["indoor", "outdoor"]}
         """
         attrs_str = json.dumps(candidate_attributes, indent=2)
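attrs_str feeds the candidate attributes into the prompt as pretty-printed JSON so the model can pick the most discriminative one. A sketch of the full call, where the prompt wording and fence-stripping are assumptions:

```python
import json
from typing import Dict, List

def generate_refinement_question(client, text_model: str, query: str,
                                 candidate_attributes: Dict[str, List[str]]) -> Dict:
    attrs_str = json.dumps(candidate_attributes, indent=2)
    prompt = (
        f'User query: "{query}"\n'
        f"Candidate attributes and their observed values:\n{attrs_str}\n\n"
        "Choose the ONE attribute that best splits the remaining candidates,\n"
        "then ask a single question about it.\n"
        'Respond ONLY with valid JSON: '
        '{"attribute": ..., "question": ..., "options": [...]}'
    )
    response = client.models.generate_content(model=text_model, contents=prompt)
    raw = (response.text or "{}").strip()
    if raw.startswith("```"):
        raw = raw.strip("`").removeprefix("json").strip()
    return json.loads(raw)
```

A deterministic alternative would be an information-gain heuristic that picks the attribute whose values split the candidates most evenly, reserving the LLM call for phrasing the question.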