docs: add API verification comments to gemini_client.py
video_intelligence/gemini_client.py (CHANGED)
@@ -2,6 +2,13 @@
 Video Intelligence Platform – Gemini Integration
 Handles video captioning, text embeddings, query decomposition, and RAG generation.
 Uses the new google-genai SDK (NOT the deprecated google-generativeai).
+
+Verified against google-genai >= 1.0:
+  - Client:   genai.Client(api_key=...)
+  - Generate: client.models.generate_content(model=..., contents=[...], config=...)
+  - Embed:    client.models.embed_content(model=..., contents=..., config=...)
+  - Types:    types.Part.from_bytes, types.Part.from_text, types.GenerateContentConfig,
+              types.EmbedContentConfig
 """
 import time
 import json
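The bullet list above names the verified surface; for reference, a minimal self-contained sketch of those exact call shapes. The model names, API-key placeholder, file path, and prompts here are illustrative, not taken from this commit:

```python
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # Client: genai.Client(api_key=...)

# Generate: contents is a list of Parts (here one image part + one text part).
response = client.models.generate_content(
    model="gemini-2.0-flash",  # illustrative model name
    contents=[
        types.Part.from_bytes(data=Path("frame.jpg").read_bytes(),
                              mime_type="image/jpeg"),
        types.Part.from_text(text="Describe this frame."),
    ],
    config=types.GenerateContentConfig(temperature=0.2),
)
print(response.text or "")

# Embed: contents can be a single string or a list of strings.
emb = client.models.embed_content(
    model="text-embedding-004",  # illustrative model name
    contents="red car turning left",
    config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY"),
)
print(len(emb.embeddings[0].values))  # embedding dimensionality
```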
@@ -46,7 +53,7 @@ class GeminiClient:
         return response.text or ""
 
     def caption_frames_batch(self, frames_bytes: List[bytes],
-
+                             batch_desc: str = "") -> List[str]:
         """Caption multiple frames. Each call is independent."""
         captions = []
         for i, fb in enumerate(frames_bytes):
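Because each frame gets its own call, one bad frame degrades to an empty caption instead of failing the whole batch. A sketch of that loop, assuming JPEG frames and a generic prompt (neither is visible in the hunk):

```python
from typing import List
from google.genai import types

def caption_frames_batch(client, vision_model: str,
                         frames_bytes: List[bytes],
                         batch_desc: str = "") -> List[str]:
    """One independent generate_content call per frame; failures yield ""."""
    captions = []
    for i, fb in enumerate(frames_bytes):
        try:
            response = client.models.generate_content(
                model=vision_model,
                contents=[
                    types.Part.from_bytes(data=fb, mime_type="image/jpeg"),  # assumed mime type
                    types.Part.from_text(text=f"Frame {i}: describe this frame. {batch_desc}".strip()),
                ],
            )
            captions.append(response.text or "")
        except Exception:
            captions.append("")  # matches the empty-string fallback shown in the hunk below
    return captions
```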
@@ -58,8 +65,8 @@ class GeminiClient:
             captions.append("")
         return captions
 
-    def caption_video_segment(self, video_bytes: bytes,
-
+    def caption_video_segment(self, video_bytes: bytes,
+                              prompt: str = "Describe all objects and actions in this video clip.") -> str:
        """Caption a video segment using Gemini's native video understanding."""
         response = self.client.models.generate_content(
             model=self.vision_model,
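For the segment path, the whole clip travels as a single video Part and Gemini samples frames internally. A sketch outside the class, assuming MP4 bytes (the mime type is not visible in this diff):

```python
from google.genai import types

def caption_video_segment(client, vision_model: str, video_bytes: bytes,
                          prompt: str = "Describe all objects and actions in this video clip.") -> str:
    # The raw clip is one Part; no manual frame extraction is needed.
    response = client.models.generate_content(
        model=vision_model,
        contents=[
            types.Part.from_bytes(data=video_bytes, mime_type="video/mp4"),  # assumed mime type
            types.Part.from_text(text=prompt),
        ],
    )
    return response.text or ""
```

Inline bytes work for short segments; clips beyond the inline request limit (roughly 20 MB) would go through the SDK's Files API upload path instead.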
@@ -108,7 +115,7 @@ class GeminiClient:
     def decompose_query(self, query: str) -> Dict:
         """
         Decompose a natural language query into sub-queries + boolean operator.
-
+
         Examples:
             "red car and yellow car" → {"sub_queries": ["red car", "yellow car"], "operator": "AND"}
             "people in white OR blue clothes" → {"sub_queries": ["people in white clothes", "people in blue clothes"], "operator": "OR"}
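The next hunk header shows the decomposition prompt ends with "Respond ONLY with valid JSON:", which implies a parse step on the model's reply. A sketch of that parse with a conservative fallback (the fallback shape is an assumption, not shown in the diff):

```python
import json

def parse_decomposition(raw: str, query: str) -> dict:
    """Parse the model's JSON reply; fall back to treating the query as one sub-query."""
    cleaned = raw.strip()
    # Strip markdown fences the model sometimes wraps around JSON.
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").removeprefix("json").strip()
    try:
        result = json.loads(cleaned)
        if "sub_queries" in result and "operator" in result:
            return result
    except json.JSONDecodeError:
        pass
    return {"sub_queries": [query], "operator": "AND"}  # assumed fallback

# e.g. parse_decomposition('{"sub_queries": ["red car", "yellow car"], "operator": "AND"}',
#                          "red car and yellow car")
```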
@@ -151,10 +158,10 @@ Respond ONLY with valid JSON:
     # ── RAG Answer Generation ───────────────────────────────────────────────
 
     def generate_rag_answer(self, query: str,
-
+                            retrieved_contexts: List[Dict]) -> str:
         """
         Generate a grounded answer using retrieved video segments as context.
-
+
         Args:
             query: User's original question
             retrieved_contexts: List of dicts with keys:
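The docstring's key list is truncated in this view, so the exact context schema is not recoverable. As a sketch of the usual pattern, each retrieved segment is flattened into a numbered context line before generation; the video_id/start/end/caption keys below are assumptions:

```python
from typing import Dict, List

def build_rag_prompt(query: str, retrieved_contexts: List[Dict]) -> str:
    """Flatten retrieved segments into a numbered, citable context block."""
    lines = []
    for i, ctx in enumerate(retrieved_contexts, 1):
        # Keys are illustrative; the real schema is cut off in the diff.
        lines.append(f"[{i}] video={ctx.get('video_id', '?')} "
                     f"t={ctx.get('start', 0.0):.1f}-{ctx.get('end', 0.0):.1f}s: "
                     f"{ctx.get('caption', '')}")
    return ("Answer the question using ONLY the video segments below. "
            "Cite segments by their [n] index.\n\n"
            "Segments:\n" + "\n".join(lines) +
            f"\n\nQuestion: {query}\nAnswer:")
```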
@@ -204,17 +211,17 @@ Instructions:
     # ── Akinator Question Generation ────────────────────────────────────────
 
     def generate_refinement_question(self, query: str,
-
+                                     candidate_attributes: Dict[str, List[str]]) -> Dict:
         """
         Generate the next best question to narrow down results (Akinator-style).
-
+
         Args:
             query: Original user query
             candidate_attributes: Dict mapping attribute_name → list of unique values
                 e.g. {"location": ["indoor", "outdoor"], "time_of_day": ["day", "night"]}
-
+
         Returns:
-            {"attribute": "location", "question": "Is the scene indoor or outdoor?",
+            {"attribute": "location", "question": "Is the scene indoor or outdoor?",
             "options": ["indoor", "outdoor"]}
         """
         attrs_str = json.dumps(candidate_attributes, indent=2)
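attrs_str feeds the candidate attributes into the prompt as pretty-printed JSON so the model can pick the most discriminative one. A sketch of the full call, where the prompt wording and fence-stripping are assumptions:

```python
import json
from typing import Dict, List

def generate_refinement_question(client, text_model: str, query: str,
                                 candidate_attributes: Dict[str, List[str]]) -> Dict:
    attrs_str = json.dumps(candidate_attributes, indent=2)
    prompt = (
        f'User query: "{query}"\n'
        f"Candidate attributes and their observed values:\n{attrs_str}\n\n"
        "Choose the ONE attribute that best splits the remaining candidates,\n"
        "then ask a single question about it.\n"
        'Respond ONLY with valid JSON: '
        '{"attribute": ..., "question": ..., "options": [...]}'
    )
    response = client.models.generate_content(model=text_model, contents=prompt)
    raw = (response.text or "{}").strip()
    if raw.startswith("```"):
        raw = raw.strip("`").removeprefix("json").strip()
    return json.loads(raw)
```

A deterministic alternative would be an information-gain heuristic that picks the attribute whose values split the candidates most evenly, reserving the LLM call for phrasing the question.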