import json
import os

class TranscriptLoader:
    """Load video metadata and per-video transcript text from a scrape folder.

    The metadata is read from ``transcripts.json`` inside
    ``<main_folder>/youtube_scrape/transcripts_commercial_archivist``; each
    transcript is read from a ``commercialarchivist_<index>.txt`` file in the
    same folder.
    """

    def __init__(self, main_folder):
        """Resolve the transcript paths under *main_folder* and load the metadata.

        Args:
            main_folder: Folder that contains the ``youtube_scrape`` directory.

        Raises:
            FileNotFoundError: If ``transcripts.json`` does not exist
                (propagated from ``_load_data``).
        """
        # Paths are built relative to main_folder.
        self.transcript_folder = os.path.join(
            main_folder, 'youtube_scrape', 'transcripts_commercial_archivist'
        )
        self.json_file = os.path.join(self.transcript_folder, 'transcripts.json')
        self.data = self._load_data()

    def _load_data(self):
        """Parse ``transcripts.json`` into a list of decoded JSON values.

        The file is treated as a stream of JSON values written back to back
        (``{...}{...}``), optionally separated by whitespace or commas, rather
        than as one JSON document.

        Returns:
            The decoded values in file order, or ``[]`` if any part of the
            file fails to decode (the error is printed).
        """
        # JSON text is UTF-8 by specification, so do not rely on the
        # platform default encoding.
        with open(self.json_file, 'r', encoding='utf-8') as f:
            json_content = f.read()

        # Decode value by value instead of rewriting '}{' to '},{' in the raw
        # text: textual replacement corrupts string values that contain '}{'
        # and misses objects separated by whitespace or newlines.
        decoder = json.JSONDecoder()
        separators = ' \t\r\n,'
        length = len(json_content)
        position = 0
        data = []
        try:
            while True:
                # raw_decode does not skip leading whitespace itself.
                while position < length and json_content[position] in separators:
                    position += 1
                if position >= length:
                    break
                value, position = decoder.raw_decode(json_content, position)
                data.append(value)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return []

        return data

    def get_transcript(self, index):
        """Return the brand and transcript text for the entry at *index*.

        The brand is the first whitespace-separated word of the entry's
        ``'video_title'``. *index* is used both as the position in
        ``self.data`` and verbatim in the transcript file name.

        Args:
            index: Position of the entry in ``self.data``.

        Returns:
            ``(brand, transcript)`` on success, or ``(None, None)`` when the
            index is out of range, the title contains no words, or the
            transcript file is missing (a message is printed in each case).
        """
        try:
            entry = self.data[index]
        except IndexError:
            print("Index out of range.")
            return None, None

        # A blank title used to raise IndexError from split()[0] and was then
        # misreported as an out-of-range index; report it for what it is.
        title_words = entry['video_title'].split()
        if not title_words:
            print(f"Video title is empty for index {index}.")
            return None, None
        brand = title_words[0]

        # Load the transcript from the corresponding .txt file.
        transcript_file = os.path.join(
            self.transcript_folder, f"commercialarchivist_{index}.txt"
        )
        try:
            with open(transcript_file, 'r') as f:
                transcript = f.read()
        except FileNotFoundError:
            print(f"Transcript file not found for index {index}.")
            return None, None

        return brand, transcript

# Usage
if __name__ == "__main__":
    # Relative path to the main folder where this script is located.
    main_folder = '.'
    loader = TranscriptLoader(main_folder)

    # Example: load the transcript and brand of the first video.
    brand, transcript = loader.get_transcript(0)
    print(f"Brand: {brand}")
    print(f"Transcript:\n{transcript}")