Spaces:

yalrashed
/

ScriptLLM

Sleeping

App Files Files Community

yalrashed committed on Dec 5, 2024

Commit

042441a

verified ·

1 Parent(s): 680c044

Update src/analysis/coverage_generator.py

Browse files

Files changed (1) hide show

src/analysis/coverage_generator.py +214 -0

src/analysis/coverage_generator.py CHANGED Viewed

	@@ -0,0 +1,214 @@

+import logging
+import os
+from pathlib import Path
+from typing import Optional
+
+import google.generativeai as genai
+from tqdm import tqdm
+# Set up logging
+# NOTE: basicConfig runs at import time and configures the root logger (when it
+# has no handlers yet) at DEBUG level, so importing this module affects logging
+# for the whole process, not just this file.
+logging.basicConfig(level=logging.DEBUG,
+                   format='%(asctime)s - %(levelname)s - %(message)s')
+# Module-level logger used by every CoverageGenerator method below.
+logger = logging.getLogger(__name__)
+class CoverageGenerator:
+    def __init__(self):
+        # Initialize Gemini
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            raise ValueError("GOOGLE_API_KEY not found")
+        genai.configure(api_key=api_key)
+        self.model = genai.GenerativeModel('gemini-pro')
+        # Add token tracking
+        self.token_usage = {
+            'prompt_tokens': 0,
+            'completion_tokens': 0,
+            'total_tokens': 0
+        }
+        # Set chunk size (in estimated tokens)
+        self.chunk_size = 8000  # Conservative size to avoid issues
+    def count_tokens(self, text: str) -> int:
+        """Estimate token count using simple word-based estimation"""
+        words = text.split()
+        return int(len(words) * 1.3)
+    def chunk_screenplay(self, text: str) -> list:
+        """Split screenplay into chunks with overlap for context"""
+        logger.info("Chunking screenplay...")
+        # Split into scenes (looking for standard screenplay headers)
+        scenes = text.split("\n\n")
+        chunks = []
+        current_chunk = []
+        current_size = 0
+        overlap_scenes = 2  # Number of scenes to overlap
+        for i, scene in enumerate(scenes):
+            scene_size = self.count_tokens(scene)
+            if current_size + scene_size > self.chunk_size and current_chunk:
+                # Get overlap scenes from the end of current chunk
+                overlap = current_chunk[-overlap_scenes:] if len(current_chunk) > overlap_scenes else current_chunk
+                # Join current chunk and add to chunks
+                chunks.append("\n\n".join(current_chunk))
+                # Start new chunk with overlap for context
+                current_chunk = overlap + [scene]
+                current_size = sum(self.count_tokens(s) for s in current_chunk)
+            else:
+                current_chunk.append(scene)
+                current_size += scene_size
+        # Add the last chunk if it exists
+        if current_chunk:
+            chunks.append("\n\n".join(current_chunk))
+        logger.info(f"Split screenplay into {len(chunks)} chunks with context overlap")
+        return chunks
+    def read_screenplay(self, filepath: Path) -> str:
+        """Read the cleaned screenplay file"""
+        try:
+            logger.info(f"Reading screenplay from: {filepath}")
+            with open(filepath, 'r', encoding='utf-8') as file:
+                text = file.read()
+                tokens = self.count_tokens(text)
+                logger.info(f"Successfully read screenplay. Length: {tokens} tokens (estimated)")
+                return text
+        except Exception as e:
+            logger.error(f"Error reading screenplay: {e}")
+            logger.error(f"Tried to read from: {filepath}")
+            return None
+    def generate_synopsis(self, chunk: str, chunk_num: int = 1, total_chunks: int = 1) -> str:
+        """Generate synopsis for a single chunk"""
+        prompt = f"""As an experienced script analyst, analyze this section ({chunk_num}/{total_chunks}) of the screenplay.
+        Important: This section may overlap with others to maintain context. Focus on:
+        - Key plot developments and their implications for the larger story
+        - Character appearances and development
+        - How this section connects to the ongoing narrative
+        - Major themes or motifs that emerge
+        Provide a summary that captures both the specific events and their significance to the larger narrative.
+        Screenplay section:
+        {chunk}"""
+        try:
+            prompt_tokens = self.count_tokens(prompt)
+            logger.debug(f"Chunk {chunk_num} prompt length: {prompt_tokens} tokens")
+            with tqdm(total=1, desc=f"Processing chunk {chunk_num}/{total_chunks}") as pbar:
+                response = self.model.generate_content(prompt)
+                completion_tokens = self.count_tokens(response.text)
+                pbar.update(1)
+            self.token_usage['prompt_tokens'] += prompt_tokens
+            self.token_usage['completion_tokens'] += completion_tokens
+            self.token_usage['total_tokens'] += (prompt_tokens + completion_tokens)
+            return response.text
+        except Exception as e:
+            logger.error(f"Error processing chunk {chunk_num}: {str(e)}")
+            logger.error("Full error details:", exc_info=True)
+            return None
+    def generate_final_synopsis(self, chunk_synopses: list) -> str:
+        """Combine chunk synopses into a final, coherent synopsis with strong narrative focus"""
+        combined_text = "\n\n".join([f"Section {i+1}:\n{synopsis}"
+                                   for i, synopsis in enumerate(chunk_synopses)])
+        prompt = f"""As an experienced script analyst, synthesize these section summaries into a comprehensive,
+        narratively cohesive synopsis of the entire screenplay.
+        You should have distinct sections on:
+        1. The complete narrative arc from beginning to end
+        2. Character development across the full story
+        3. Major themes and how they evolve
+        4. Key turning points and their impact
+        5. The core conflict and its resolution
+        Ensure the synopsis flows naturally and captures the full story without revealing the seams between sections.
+        Section summaries:
+        {combined_text}"""
+        try:
+            logger.info("Generating final synopsis")
+            with tqdm(total=1, desc="Creating final synopsis") as pbar:
+                response = self.model.generate_content(prompt)
+                pbar.update(1)
+            return response.text
+        except Exception as e:
+            logger.error(f"Error generating final synopsis: {str(e)}")
+            return None
+    def generate_coverage(self, screenplay_path: Path) -> bool:
+        """Main method to generate full coverage document"""
+        logger.info("Starting coverage generation")
+        self.token_usage = {
+            'prompt_tokens': 0,
+            'completion_tokens': 0,
+            'total_tokens': 0
+        }
+        with tqdm(total=4, desc="Generating coverage") as pbar:
+            # Read screenplay
+            screenplay_text = self.read_screenplay(screenplay_path)
+            if not screenplay_text:
+                return False
+            pbar.update(1)
+            # Split into chunks
+            chunks = self.chunk_screenplay(screenplay_text)
+            pbar.update(1)
+            # Process each chunk
+            chunk_synopses = []
+            for i, chunk in enumerate(chunks, 1):
+                synopsis = self.generate_synopsis(chunk, i, len(chunks))
+                if synopsis:
+                    chunk_synopses.append(synopsis)
+                else:
+                    logger.error(f"Failed to process chunk {i}")
+                    return False
+            pbar.update(1)
+            # Generate final synopsis
+            final_synopsis = self.generate_final_synopsis(chunk_synopses)
+            if not final_synopsis:
+                return False
+            # Save coverage
+            output_dir = screenplay_path.parent
+            output_path = output_dir / "coverage.txt"
+            try:
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    f.write("SCREENPLAY COVERAGE\n\n")
+                    f.write("### SYNOPSIS ###\n\n")
+                    f.write(final_synopsis)
+                    # Add token usage summary
+                    f.write("\n\n### TOKEN USAGE SUMMARY ###\n")
+                    f.write(f"Prompt Tokens: {self.token_usage['prompt_tokens']}\n")
+                    f.write(f"Completion Tokens: {self.token_usage['completion_tokens']}\n")
+                    f.write(f"Total Tokens: {self.token_usage['total_tokens']}\n")
+                logger.info("\nFinal Token Usage Summary:")
+                logger.info(f"Prompt Tokens: {self.token_usage['prompt_tokens']}")
+                logger.info(f"Completion Tokens: {self.token_usage['completion_tokens']}")
+                logger.info(f"Total Tokens: {self.token_usage['total_tokens']}")
+                pbar.update(1)
+                return True
+            except Exception as e:
+                logger.error(f"Error saving coverage: {str(e)}")
+                logger.error("Full error details:", exc_info=True)
+                return False