Spaces:

yalrashed
/

ScriptLLM

Sleeping

App Files Files

xet

Community

yalrashed committed on Dec 5, 2024

Commit

680c044

verified ·

1 Parent(s): 2ce6886

Update src/processing/gemini_processor.py

Browse files

Files changed (1) hide show

src/processing/gemini_processor.py +127 -0

src/processing/gemini_processor.py CHANGED Viewed

	@@ -0,0 +1,127 @@

+import os
+import re
+from pathlib import Path
+from typing import List
+import google.generativeai as genai
+from PyPDF2 import PdfReader
+from tqdm import tqdm
class GeminiProcessor:
    """Clean up screenplay text extracted from a PDF, scene by scene, via Gemini.

    The pipeline is: extract text with PyPDF2, normalise it with
    :meth:`preprocess_text`, cut it into scenes with :meth:`split_into_scenes`,
    send every scene through the Gemini model with :meth:`clean_scene`, and
    write the joined result to disk (:meth:`process_screenplay`).
    """

    # Prefixes that open a scene heading (slug line).
    _SCENE_HEADING_PREFIXES = ('INT.', 'EXT.', 'INT/EXT.')

    # Largest word-count difference tolerated between a scene and the model's
    # rewrite before the rewrite is rejected as having altered the content.
    _MAX_WORD_COUNT_DRIFT = 3

    def __init__(self, model_name: str = 'gemini-pro'):
        """Configure the Gemini client from the ``GOOGLE_API_KEY`` env variable.

        Args:
            model_name: Name of the generative model to use. Defaults to the
                value that was previously hard-coded.

        Raises:
            ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
        """
        self.api_key = os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
            raise ValueError("GOOGLE_API_KEY not found")
        # Configure Gemini
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(model_name)

    def preprocess_text(self, text: str) -> str:
        """Normalise raw screenplay text.

        Strips markup tags, standalone heading prefixes, page/line-number
        lines and ``(CONT'D)`` markers, tidies spacing, and drops blank lines
        and consecutive duplicate lines.

        Args:
            text: Raw screenplay text.

        Returns:
            The cleaned text, lines joined by ``\\n`` with no trailing newline.
        """
        # Remove HTML and script tags
        text = re.sub(r'<[^>]+>', '', text)
        # Drop heading prefixes that sit alone on a line. The lookahead keeps
        # the following newline so the neighbouring lines are not fused.
        text = re.sub(r'\n(?:INT\.|EXT\.|INT/EXT\.)[ \t]*(?=\n)', '', text)
        # Remove page/line numbers. Only lines that consist solely of a
        # number are touched, so "at 5." at the end of dialogue is preserved.
        text = re.sub(r'^[ \t]*\d+\.[ \t]*$', '', text, flags=re.MULTILINE)
        text = re.sub(r"\(CONT'D\)\d*", '', text)
        # Fix spacing before punctuation. Horizontal whitespace only: ``\s``
        # would also eat a newline and glue a line starting with "..." onto
        # the previous one.
        text = re.sub(r'[ \t]+([.,!?])', r'\1', text)
        # Clean up multiple spaces and line breaks
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Drop blank lines, consecutive duplicates and any heading prefix that
        # is still alone on its line (e.g. at the very start or end of text).
        cleaned_lines = []
        prev_line = None
        for line in text.split('\n'):
            stripped = line.strip()
            if not stripped or line == prev_line:
                continue
            if stripped in self._SCENE_HEADING_PREFIXES:
                continue
            cleaned_lines.append(line)
            prev_line = line
        return '\n'.join(cleaned_lines)

    def split_into_scenes(self, text: str) -> List[str]:
        """Split screenplay text into scenes, one per scene heading.

        A scene starts at a line whose first non-blank characters are
        ``INT.``, ``EXT.`` or ``INT/EXT.`` and runs up to (not including) the
        next such line. Text before the first heading is discarded.

        Args:
            text: Screenplay text, typically the output of
                :meth:`preprocess_text`.

        Returns:
            The non-empty scenes, each stripped of surrounding whitespace.
        """
        scenes = []
        current = None  # lines of the scene being collected, if any
        for line in text.split('\n'):
            # Only a prefix at the start of a line opens a scene; "PRINT." or
            # "POINT." in the middle of a sentence must not.
            if line.lstrip().startswith(self._SCENE_HEADING_PREFIXES):
                if current is not None:
                    scenes.append(current)
                current = [line]
            elif current is not None:
                # Blank lines and the final unterminated line belong to the
                # scene as well.
                current.append(line)
        if current is not None:
            scenes.append(current)

        joined = ('\n'.join(lines).strip() for lines in scenes)
        return [scene for scene in joined if scene]

    def clean_scene(self, scene: str) -> str:
        """Ask Gemini to fix spacing/indentation of one scene.

        The model's answer is accepted only if its word count is within
        ``_MAX_WORD_COUNT_DRIFT`` words of the input; otherwise, or on any
        API error, the original scene is returned unchanged.

        Args:
            scene: Text of a single scene.

        Returns:
            The cleaned scene, or ``scene`` itself as a fallback.
        """
        if not scene.strip():
            # Nothing to clean; avoid a pointless API round trip.
            return scene

        prompt = f"""Fix ONLY spacing and indentation in this screenplay scene.
    DO NOT modify any words or content. DO NOT add or remove lines.
    Keep original capitalization and formatting:
    {scene}"""
        try:
            response = self.model.generate_content(prompt)
            if response.text:
                cleaned = response.text
                # Basic validation: the model must not have added or dropped
                # more than a handful of words.
                drift = abs(len(scene.split()) - len(cleaned.split()))
                if drift <= self._MAX_WORD_COUNT_DRIFT:
                    return cleaned.strip()
            return scene
        except Exception as e:
            # Best effort: a failed or blocked request must not abort the
            # whole screenplay, so fall back to the untouched scene.
            print(f"Error cleaning scene: {str(e)}")
            return scene

    def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
        """Run the full pipeline on a PDF and write the cleaned screenplay.

        Args:
            pdf_path: Path of the screenplay PDF to read.
            output_path: Path of the UTF-8 text file to write; missing parent
                directories are created.

        Returns:
            ``True`` on success, ``False`` if any step raised (the error is
            printed).
        """
        try:
            # Read PDF
            with open(pdf_path, 'rb') as file:
                pdf = PdfReader(file)
                # A page without extractable text must not break the join.
                text = '\n'.join(
                    (page.extract_text() or '') for page in pdf.pages
                )
            # Initial preprocessing
            text = self.preprocess_text(text)
            # Split into scenes
            scenes = self.split_into_scenes(text)
            print(f"Found {len(scenes)} scenes")
            # Process each scene
            cleaned_scenes = []
            for scene in tqdm(scenes, desc="Processing scenes"):
                cleaned = self.clean_scene(scene)
                if cleaned:
                    cleaned = self.preprocess_text(cleaned)
                    cleaned_scenes.append(cleaned)
            # Save result
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write('\n\n'.join(cleaned_scenes))
            return True
        except Exception as e:
            print(f"Error processing screenplay: {str(e)}")
            return False