yalrashed committed on
Commit
680c044
·
verified ·
1 Parent(s): 2ce6886

Update src/processing/gemini_processor.py

Browse files
Files changed (1) hide show
  1. src/processing/gemini_processor.py +127 -0
src/processing/gemini_processor.py CHANGED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ from typing import List
5
+ import google.generativeai as genai
6
+ from PyPDF2 import PdfReader
7
+ from tqdm import tqdm
8
+
9
+
10
class GeminiProcessor:
    """Clean up screenplay text extracted from a PDF, scene by scene, via Gemini.

    The pipeline is: extract text from the PDF, normalise it with regular
    expressions, split it into scenes on ``INT.`` / ``EXT.`` / ``INT/EXT.``
    headings, ask the Gemini model to fix spacing in each scene, and write
    the joined result to a text file.
    """

    def __init__(self):
        """Read the API key from the environment and set up the Gemini model.

        Raises:
            ValueError: If the ``GOOGLE_API_KEY`` environment variable is
                unset or empty.
        """
        self.api_key = os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            raise ValueError("GOOGLE_API_KEY not found")

        # Configure Gemini
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel('gemini-pro')

    def preprocess_text(self, text: str) -> str:
        """Normalise raw screenplay text with a series of regex clean-ups.

        Args:
            text: Raw text, e.g. as extracted from a PDF.

        Returns:
            The cleaned text. Blank lines, consecutive duplicate lines and
            lines consisting only of a bare ``INT.`` / ``EXT.`` /
            ``INT/EXT.`` are removed; the result has no trailing newline.
        """
        # Remove HTML and script tags
        text = re.sub(r'<[^>]+>', '', text)

        # Drop heading prefixes stranded alone on a line. Substitute a single
        # newline (not the empty string) so the neighbouring lines are not
        # glued together into one word.
        text = re.sub(r'\n(INT\.|EXT\.|INT\/EXT\.)\s*\n', '\n', text)

        # Remove line numbers and (CONT'D).
        # NOTE: the first rule strips any "<digits>." at the end of a line,
        # so it also hits a sentence that happens to end with a number.
        text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
        text = re.sub(r'\(CONT\'D\)\d*', '', text)

        # Fix spacing before punctuation. Only spaces/tabs are removed so a
        # line that starts with punctuation (e.g. "...and then") is not
        # merged into the previous line.
        text = re.sub(r'[ \t]+([.,!?])', r'\1', text)

        # Clean up multiple spaces and line breaks
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Remove blank lines, consecutive duplicates and bare headings
        bare_headings = ('INT.', 'EXT.', 'INT/EXT.')
        cleaned_lines = []
        prev_line = None
        for line in text.split('\n'):
            stripped = line.strip()
            if not stripped or line == prev_line:
                continue
            if stripped in bare_headings:
                continue
            cleaned_lines.append(line)
            prev_line = line

        return '\n'.join(cleaned_lines)

    def split_into_scenes(self, text: str) -> List[str]:
        """Split screenplay text into scenes, each starting at its heading.

        A scene starts at ``INT.``, ``EXT.`` or ``INT/EXT.`` followed by at
        least one more character on the same line, and runs over the
        following non-empty lines until a line that begins with one of those
        prefixes. Text before the first heading is not returned, and a blank
        line ends the current scene.

        Args:
            text: Screenplay text, normally the output of ``preprocess_text``.

        Returns:
            The stripped scene strings in document order; empty if no
            heading is found.
        """
        heading = r'(?:INT\.|EXT\.|INT\/EXT\.)'
        # Each line may end with a newline OR the end of the text; requiring
        # a newline would silently drop the final line of the last scene,
        # because preprocess_text() returns text without a trailing newline.
        line_end = r'(?:\n|\Z)'
        scene_pattern = (
            r'(' + heading + r'[^\n]+' + line_end
            + r'(?:(?!' + heading + r')[^\n]+' + line_end + r')*)'
        )

        scenes = re.findall(scene_pattern, text, re.MULTILINE)

        # Clean and validate scenes
        stripped_scenes = (scene.strip() for scene in scenes)
        return [scene for scene in stripped_scenes if scene]

    def clean_scene(self, scene: str) -> str:
        """Ask Gemini to fix spacing in one scene, falling back to the input.

        The model output is accepted only if its whitespace-separated word
        count differs from the input's by at most 3; otherwise, or if the
        response text is empty or any exception occurs, the original scene
        is returned unchanged.

        Args:
            scene: Text of a single scene.

        Returns:
            The stripped model output, or ``scene`` itself as a fallback.
        """
        prompt = (
            "Fix ONLY spacing and indentation in this screenplay scene.\n"
            "DO NOT modify any words or content. DO NOT add or remove lines.\n"
            "Keep original capitalization and formatting:\n"
            "\n"
            f"{scene}"
        )

        try:
            response = self.model.generate_content(prompt)
            if response.text:
                cleaned = response.text
                # Basic validation: reject output whose word count drifted
                if abs(len(scene.split()) - len(cleaned.split())) <= 3:
                    return cleaned.strip()
            return scene

        except Exception as e:
            # Best effort: a failed request must not abort the whole run.
            print(f"Error cleaning scene: {str(e)}")
            return scene

    def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
        """Run the full pipeline on one PDF and write the cleaned text.

        Args:
            pdf_path: Path of the screenplay PDF to read.
            output_path: Path of the UTF-8 text file to write; missing parent
                directories are created.

        Returns:
            ``True`` on success, ``False`` if any step raised (the error is
            printed rather than propagated).
        """
        try:
            # Read PDF
            with open(pdf_path, 'rb') as file:
                pdf = PdfReader(file)
                # Guard against a page yielding no text so the join cannot
                # fail on a non-string value.
                text = '\n'.join(
                    page.extract_text() or '' for page in pdf.pages
                )

            # Initial preprocessing
            text = self.preprocess_text(text)

            # Split into scenes
            scenes = self.split_into_scenes(text)
            print(f"Found {len(scenes)} scenes")

            # Process each scene
            cleaned_scenes = []
            for scene in tqdm(scenes, desc="Processing scenes"):
                cleaned = self.clean_scene(scene)
                if cleaned:
                    # Re-normalise, since the model may reintroduce spacing
                    cleaned = self.preprocess_text(cleaned)
                    cleaned_scenes.append(cleaned)

            # Save result
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write('\n\n'.join(cleaned_scenes))

            return True

        except Exception as e:
            print(f"Error processing screenplay: {str(e)}")
            return False