Spaces:

yalrashed
/

pdf-to-podcast-test

Paused

App Files Files Community

yalrashed committed on Nov 14, 2024

Commit

405a174

verified ·

1 Parent(s): 882484d

Update src/screenplay_analysis.py

Browse files

Files changed (1) hide show

src/screenplay_analysis.py +29 -105

src/screenplay_analysis.py CHANGED Viewed

@@ -2,62 +2,46 @@ import os
 import requests
 import PyPDF2
 from tqdm import tqdm
-# Your original working API code
 API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-1B-Instruct"
 headers = {"Authorization": f"Bearer {os.environ['HUGGING_FACE_API_KEY']}"}
 def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
     return response.json()
-# PDF Processing Functions
-def validate_pdf(file_path: str) -> bool:
-    """Check if the PDF file exists and is valid"""
-    if not os.path.exists(file_path):
-        print(f"Error: File not found at path: {file_path}")
-        return False
-    if not file_path.lower().endswith('.pdf'):
-        print("Error: File is not a PDF")
-        return False
-    return True
-def extract_text_from_pdf(file_path: str, max_pages: int = 20) -> str:
-    """Extract text from first n pages of PDF"""
-    if not validate_pdf(file_path):
-        return None
     try:
-        with open(file_path, 'rb') as file:
-            # Create PDF reader object
-            pdf_reader = PyPDF2.PdfReader(file)
-            # Get total number of pages
-            num_pages = len(pdf_reader.pages)
-            print(f"PDF has {num_pages} total pages. Will process first {max_pages} pages...")
-            extracted_text = []
-            # Iterate through pages up to max_pages
-            for page_num in range(min(num_pages, max_pages)):
-                page = pdf_reader.pages[page_num]
-                text = page.extract_text()
-                extracted_text.append(text)
-                print(f"Processed page {page_num + 1}")
-            final_text = '\n'.join(extracted_text)
-            print(f"\nExtraction complete! Total characters: {len(final_text)}")
-            return final_text
     except Exception as e:
         print(f"An error occurred: {str(e)}")
         return None
-# NEW FUNCTION GOES HERE - ADD THIS CHUNK FUNCTION RIGHT HERE, AFTER THE PDF FUNCTIONS BUT BEFORE THE FILE CHECKING CODE
 def create_screenplay_chunks(text: str, chunk_size: int = 1000) -> list:
-    """
-    Split screenplay into chunks at natural break points (scene headings where possible)
-    """
     # Split text into lines
     lines = text.split('\n')
     chunks = []
@@ -73,8 +57,7 @@ def create_screenplay_chunks(text: str, chunk_size: int = 1000) -> list:
             current_chunk = []
             current_length = 0
-        # Scene headings (usually in caps and start with INT. or EXT.)
-        # should start new chunks when possible
         if line.strip().upper() == line.strip() and ('INT.' in line or 'EXT.' in line):
             if current_chunk:  # If we have a current chunk, save it
                 chunks.append('\n'.join(current_chunk))
@@ -90,28 +73,6 @@ def create_screenplay_chunks(text: str, chunk_size: int = 1000) -> list:
     return chunks
-# File checking code
-print("Files in current directory:")
-for file in os.listdir():
-    print(file)
-# Try to extract from PDF
-pdf_path = "./F20.pdf"  # assuming F20.pdf is the exact filename
-print("\nTrying to extract from:", pdf_path)
-result = extract_text_from_pdf(pdf_path)
-# If successful, show first 500 characters
-if result:
-    print("\nFirst 500 characters of extracted text:")
-    print("-" * 50)
-    print(result[:500])
-    # NEW TEST CODE GOES HERE - ADD THIS RIGHT AFTER SHOWING THE FIRST 500 CHARACTERS
-    chunks = create_screenplay_chunks(result)
-    print(f"\nCreated {len(chunks)} chunks from the screenplay")
-    print("\nPreview of first chunk:")
-    print("-" * 50)
-    print(chunks[0])
 def process_screenplay_chunk(chunk: str, chunk_num: int) -> str:
     """Process a single chunk of screenplay text"""
     prompt = f"""<s>[INST]Clean this screenplay text, maintaining exact content and format:
@@ -136,7 +97,7 @@ def process_screenplay_chunk(chunk: str, chunk_num: int) -> str:
                 if len(cleaned_text) < 10 or len(cleaned_text) > len(chunk) * 3:
                     print(f"Warning: Chunk {chunk_num + 1} output seems invalid, using original")
                     return chunk
-            print(f"Processed chunk {chunk_num + 1}/34")
             return cleaned_text
         else:
             print(f"Error processing chunk {chunk_num + 1}: Unexpected output format")
@@ -144,41 +105,4 @@ def process_screenplay_chunk(chunk: str, chunk_num: int) -> str:
     except Exception as e:
         print(f"Error processing chunk {chunk_num + 1}: {str(e)}")
-        return chunk
-# Test the processing with just the first chunk
-if result:
-    chunks = create_screenplay_chunks(result)
-    print("\nTesting processing on first chunk:")
-    print("-" * 50)
-    processed_chunk = process_screenplay_chunk(chunks[0], 0)
-    print("\nProcessed result:")
-    print("-" * 50)
-    print(processed_chunk)
-# Process all chunks and combine results
-if result:
-    all_chunks = create_screenplay_chunks(result)
-    processed_chunks = []
-    print("\nProcessing all chunks...")
-    for i, chunk in enumerate(all_chunks):
-        processed_text = process_screenplay_chunk(chunk, i)
-        processed_chunks.append(processed_text)
-    # Combine all processed chunks
-    final_text = '\n\n'.join(processed_chunks)
-    # Save to file
-    output_file = 'cleaned_screenplay.txt'
-    with open(output_file, 'w', encoding='utf-8') as f:
-        f.write(final_text)
-    print(f"\nProcessing complete! Saved to {output_file}")
-    print(f"Total chunks processed: {len(processed_chunks)}")
-    # Show preview of final result
-    print("\nPreview of final cleaned text:")
-    print("-" * 50)
-    print(final_text[:500])

 import requests
 import PyPDF2
 from tqdm import tqdm
+import io
# API configuration
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-1B-Instruct"
# Bearer-token header; os.environ[...] raises KeyError at import time when
# HUGGING_FACE_API_KEY is not set.
headers = {"Authorization": f"Bearer {os.environ['HUGGING_FACE_API_KEY']}"}


def query(payload, timeout=60):
    """Send request to Hugging Face API and return the decoded JSON body.

    Args:
        payload: JSON-serialisable object posted as the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the caller.

    Returns:
        Whatever ``response.json()`` decodes from the response body; the HTTP
        status code is not checked here.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: if the response body is not valid JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()
def extract_text_from_pdf(pdf_content: bytes, max_pages: int = 20) -> "str | None":
    """Extract text from first n pages of PDF content.

    Args:
        pdf_content: Raw bytes of a PDF document.
        max_pages: Upper bound on the number of leading pages to read.

    Returns:
        The text of the processed pages joined with newlines, or ``None`` if
        any error occurs while opening or reading the PDF (the error is
        printed, not raised).
    """
    try:
        # Wrap the bytes in a file-like object so the reader can seek in it
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        num_pages = len(pdf_reader.pages)
        print(f"PDF has {num_pages} total pages. Will process first {max_pages} pages...")
        extracted_text = []
        # Never index past the end of a document shorter than max_pages
        for page_num in range(min(num_pages, max_pages)):
            page = pdf_reader.pages[page_num]
            # Defensive: should a page yield None instead of a string, treat
            # it as empty so the join below cannot fail and discard the text
            # of every other page.
            text = page.extract_text() or ""
            extracted_text.append(text)
            print(f"Processed page {page_num + 1}")
        final_text = '\n'.join(extracted_text)
        print(f"\nExtraction complete! Total characters: {len(final_text)}")
        return final_text
    except Exception as e:
        # Deliberate best-effort boundary: report any failure and signal it
        # to the caller with None rather than crashing.
        print(f"An error occurred: {str(e)}")
        return None
 def create_screenplay_chunks(text: str, chunk_size: int = 1000) -> list:
+    """Split screenplay into chunks at natural break points"""
     # Split text into lines
     lines = text.split('\n')
     chunks = []
             current_chunk = []
             current_length = 0
+        # Scene headings should start new chunks when possible
         if line.strip().upper() == line.strip() and ('INT.' in line or 'EXT.' in line):
             if current_chunk:  # If we have a current chunk, save it
                 chunks.append('\n'.join(current_chunk))
     return chunks
 def process_screenplay_chunk(chunk: str, chunk_num: int) -> str:
     """Process a single chunk of screenplay text"""
     prompt = f"""<s>[INST]Clean this screenplay text, maintaining exact content and format:
                 if len(cleaned_text) < 10 or len(cleaned_text) > len(chunk) * 3:
                     print(f"Warning: Chunk {chunk_num + 1} output seems invalid, using original")
                     return chunk
+            print(f"Processed chunk {chunk_num + 1}")
             return cleaned_text
         else:
             print(f"Error processing chunk {chunk_num + 1}: Unexpected output format")
     except Exception as e:
         print(f"Error processing chunk {chunk_num + 1}: {str(e)}")
+        return chunk