yalrashed committed on
Commit
042441a
·
verified ·
1 Parent(s): 680c044

Update src/analysis/coverage_generator.py

Browse files
Files changed (1) hide show
  1. src/analysis/coverage_generator.py +214 -0
src/analysis/coverage_generator.py CHANGED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
from pathlib import Path
from typing import Optional

import google.generativeai as genai
from tqdm import tqdm
6
+
7
# Configure logging once at import time: DEBUG level, with each record
# rendered as "<timestamp> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.DEBUG)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
11
+
12
class CoverageGenerator:
    """Generate screenplay coverage (a synthesized synopsis) with Gemini.

    Workflow (see ``generate_coverage``):

    1. read the screenplay text from disk,
    2. split it into overlapping chunks that fit an estimated token budget,
    3. ask the model for a synopsis of every chunk,
    4. ask the model to merge the chunk synopses into one final synopsis,
    5. write the result, plus a token usage summary, to ``coverage.txt``
       in the screenplay's directory.

    All token figures are rough word-based estimates produced by
    ``count_tokens``; they are not numbers reported by the API.
    """

    DEFAULT_MODEL = 'gemini-pro'
    DEFAULT_CHUNK_SIZE = 8000  # Conservative size to avoid issues
    OVERLAP_SCENES = 2  # Max trailing scenes repeated at the start of the next chunk

    # Prompt for a single chunk; filled in with str.format().
    _SECTION_PROMPT = (
        "As an experienced script analyst, analyze this section "
        "({chunk_num}/{total_chunks}) of the screenplay.\n"
        "\n"
        "Important: This section may overlap with others to maintain context. Focus on:\n"
        "- Key plot developments and their implications for the larger story\n"
        "- Character appearances and development\n"
        "- How this section connects to the ongoing narrative\n"
        "- Major themes or motifs that emerge\n"
        "\n"
        "Provide a summary that captures both the specific events and their "
        "significance to the larger narrative.\n"
        "\n"
        "Screenplay section:\n"
        "{chunk}"
    )

    # Prompt that merges the per-chunk synopses; filled in with str.format().
    _FINAL_PROMPT = (
        "As an experienced script analyst, synthesize these section summaries "
        "into a comprehensive,\n"
        "narratively cohesive synopsis of the entire screenplay.\n"
        "\n"
        "You should have distinct sections on:\n"
        "1. The complete narrative arc from beginning to end\n"
        "2. Character development across the full story\n"
        "3. Major themes and how they evolve\n"
        "4. Key turning points and their impact\n"
        "5. The core conflict and its resolution\n"
        "\n"
        "Ensure the synopsis flows naturally and captures the full story "
        "without revealing the seams between sections.\n"
        "\n"
        "Section summaries:\n"
        "{combined_text}"
    )

    def __init__(self, model_name: str = DEFAULT_MODEL,
                 chunk_size: int = DEFAULT_CHUNK_SIZE):
        """Configure the Gemini client and reset the usage counters.

        Args:
            model_name: Name passed to ``genai.GenerativeModel``.
            chunk_size: Maximum estimated tokens per screenplay chunk.

        Raises:
            ValueError: If ``GOOGLE_API_KEY`` is not set in the environment,
                or if ``chunk_size`` is not positive.
        """
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY not found")
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of tokens")

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

        # Running totals of estimated tokens sent and received.
        self.token_usage = self._new_usage()

        # Chunk budget, in estimated tokens.
        self.chunk_size = chunk_size

    @staticmethod
    def _new_usage() -> dict:
        """Return a zeroed token usage record."""
        return {
            'prompt_tokens': 0,
            'completion_tokens': 0,
            'total_tokens': 0,
        }

    def count_tokens(self, text: str) -> int:
        """Estimate the token count as 1.3 tokens per whitespace-separated word."""
        words = text.split()
        return int(len(words) * 1.3)

    def _trailing_overlap(self, previous: list, incoming_size: int) -> list:
        """Pick the scenes from the end of ``previous`` to repeat for context.

        At most ``OVERLAP_SCENES`` scenes are taken, and only as many as fit
        in the budget left once the incoming scene is accounted for. This
        keeps the new chunk within ``chunk_size`` and stops a short chunk
        from being copied wholesale into its successor.
        """
        if self.OVERLAP_SCENES <= 0:
            return []

        budget = self.chunk_size - incoming_size
        overlap = []
        used = 0
        # Walk backwards so the scenes closest to the boundary win.
        for scene in reversed(previous[-self.OVERLAP_SCENES:]):
            size = self.count_tokens(scene)
            if used + size > budget:
                break
            overlap.append(scene)
            used += size
        overlap.reverse()
        return overlap

    def chunk_screenplay(self, text: str) -> list:
        """Split the screenplay into chunks with overlap for context.

        The text is split on blank lines; each resulting block is treated as
        a "scene". Scenes are packed greedily until adding the next one would
        exceed ``self.chunk_size`` estimated tokens. Every new chunk starts
        with up to ``OVERLAP_SCENES`` scenes repeated from the end of the
        previous chunk, as far as the token budget allows.

        A single scene larger than ``self.chunk_size`` is not split; it is
        emitted as its own oversized chunk.

        Args:
            text: Full screenplay text.

        Returns:
            List of chunk strings; empty when ``text`` has no non-blank content.
        """
        logger.info("Chunking screenplay...")

        # Whitespace-only blocks carry no content and would otherwise yield
        # a blank chunk for empty input.
        scenes = [block for block in text.split("\n\n") if block.strip()]

        chunks = []
        current_chunk = []
        current_size = 0

        for scene in scenes:
            scene_size = self.count_tokens(scene)

            if current_chunk and current_size + scene_size > self.chunk_size:
                chunks.append("\n\n".join(current_chunk))

                # Start the next chunk with bounded context from this one.
                current_chunk = self._trailing_overlap(current_chunk, scene_size)
                current_size = sum(self.count_tokens(s) for s in current_chunk)

            current_chunk.append(scene)
            current_size += scene_size

        # Add the last chunk if it exists
        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        logger.info(f"Split screenplay into {len(chunks)} chunks with context overlap")
        return chunks

    def read_screenplay(self, filepath: Path) -> Optional[str]:
        """Read the cleaned screenplay file as UTF-8 text.

        Args:
            filepath: Location of the screenplay text file.

        Returns:
            The file contents, or ``None`` if the file could not be opened or
            decoded (the failure is logged).
        """
        logger.info(f"Reading screenplay from: {filepath}")
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError covers
            # UnicodeDecodeError for content that is not valid UTF-8.
            logger.error(f"Error reading screenplay: {e}")
            logger.error(f"Tried to read from: {filepath}")
            return None

        tokens = self.count_tokens(text)
        logger.info(f"Successfully read screenplay. Length: {tokens} tokens (estimated)")
        return text

    def _run_prompt(self, prompt: str, label: str, progress_desc: str) -> str:
        """Send ``prompt`` to the model and record the estimated token usage.

        Usage is only recorded when the call succeeds; any exception from the
        model call or from reading ``response.text`` propagates to the caller.
        """
        prompt_tokens = self.count_tokens(prompt)
        logger.debug(f"{label} prompt length: {prompt_tokens} tokens")

        with tqdm(total=1, desc=progress_desc) as pbar:
            response = self.model.generate_content(prompt)
            text = response.text
            pbar.update(1)

        completion_tokens = self.count_tokens(text)
        self.token_usage['prompt_tokens'] += prompt_tokens
        self.token_usage['completion_tokens'] += completion_tokens
        self.token_usage['total_tokens'] += prompt_tokens + completion_tokens
        return text

    def generate_synopsis(self, chunk: str, chunk_num: int = 1,
                          total_chunks: int = 1) -> Optional[str]:
        """Generate a synopsis for a single chunk.

        Args:
            chunk: Screenplay text for this section.
            chunk_num: 1-based position of the section, used in the prompt
                and in progress/log output.
            total_chunks: Total number of sections.

        Returns:
            The model's text, or ``None`` if the request failed (the error is
            logged with its traceback).
        """
        prompt = self._SECTION_PROMPT.format(
            chunk_num=chunk_num, total_chunks=total_chunks, chunk=chunk)

        try:
            return self._run_prompt(
                prompt,
                label=f"Chunk {chunk_num}",
                progress_desc=f"Processing chunk {chunk_num}/{total_chunks}",
            )
        except Exception as e:
            # Deliberately broad: this is the boundary to a remote service and
            # callers rely on None, not an exception, to signal failure.
            logger.error(f"Error processing chunk {chunk_num}: {str(e)}")
            logger.error("Full error details:", exc_info=True)
            return None

    def generate_final_synopsis(self, chunk_synopses: list) -> Optional[str]:
        """Combine chunk synopses into one final, coherent synopsis.

        Args:
            chunk_synopses: Per-section synopsis strings, in story order.

        Returns:
            The merged synopsis, or ``None`` if there is nothing to merge or
            the request failed (the error is logged).
        """
        if not chunk_synopses:
            logger.error("Error generating final synopsis: no section summaries provided")
            return None

        combined_text = "\n\n".join(
            f"Section {i}:\n{synopsis}"
            for i, synopsis in enumerate(chunk_synopses, 1)
        )
        prompt = self._FINAL_PROMPT.format(combined_text=combined_text)

        try:
            logger.info("Generating final synopsis")
            return self._run_prompt(
                prompt,
                label="Final synopsis",
                progress_desc="Creating final synopsis",
            )
        except Exception as e:
            # Same service boundary as generate_synopsis: report None on failure.
            logger.error(f"Error generating final synopsis: {str(e)}")
            logger.error("Full error details:", exc_info=True)
            return None

    def generate_coverage(self, screenplay_path: Path) -> bool:
        """Main method to generate the full coverage document.

        Writes ``coverage.txt`` into the directory that contains the
        screenplay. The token usage counters are reset at the start of
        every call.

        Args:
            screenplay_path: Path to the screenplay text file.

        Returns:
            ``True`` when the coverage file was written, ``False`` if any
            stage failed (the reason is logged).
        """
        logger.info("Starting coverage generation")

        screenplay_path = Path(screenplay_path)
        self.token_usage = self._new_usage()

        # Four stages: read, chunk, summarize sections, final synopsis + save.
        with tqdm(total=4, desc="Generating coverage") as pbar:
            # Read screenplay
            screenplay_text = self.read_screenplay(screenplay_path)
            if not screenplay_text:
                return False
            pbar.update(1)

            # Split into chunks
            chunks = self.chunk_screenplay(screenplay_text)
            if not chunks:
                logger.error("Screenplay contains no text to analyze")
                return False
            pbar.update(1)

            # Process each chunk; one failure aborts the whole run.
            chunk_synopses = []
            for i, chunk in enumerate(chunks, 1):
                synopsis = self.generate_synopsis(chunk, i, len(chunks))
                if not synopsis:
                    logger.error(f"Failed to process chunk {i}")
                    return False
                chunk_synopses.append(synopsis)
            # Advance once for the stage, so the bar cannot overrun its total.
            pbar.update(1)

            # Generate final synopsis
            final_synopsis = self.generate_final_synopsis(chunk_synopses)
            if not final_synopsis:
                return False

            # Save coverage
            output_path = screenplay_path.parent / "coverage.txt"

            try:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write("SCREENPLAY COVERAGE\n\n")
                    f.write("### SYNOPSIS ###\n\n")
                    f.write(final_synopsis)

                    # Add token usage summary
                    f.write("\n\n### TOKEN USAGE SUMMARY ###\n")
                    f.write(f"Prompt Tokens: {self.token_usage['prompt_tokens']}\n")
                    f.write(f"Completion Tokens: {self.token_usage['completion_tokens']}\n")
                    f.write(f"Total Tokens: {self.token_usage['total_tokens']}\n")
            except OSError as e:
                logger.error(f"Error saving coverage: {str(e)}")
                logger.error("Full error details:", exc_info=True)
                return False

            logger.info("\nFinal Token Usage Summary:")
            logger.info(f"Prompt Tokens: {self.token_usage['prompt_tokens']}")
            logger.info(f"Completion Tokens: {self.token_usage['completion_tokens']}")
            logger.info(f"Total Tokens: {self.token_usage['total_tokens']}")

            pbar.update(1)
            return True