YAML Metadata
Warning:
empty or missing yaml metadata in repo card
(https://huggingface.co/docs/hub/model-cards#model-card-metadata)
import json
import os
class TranscriptLoader:
    """Load video metadata and per-video transcripts from a scrape folder.

    Metadata is read from
    ``<main_folder>/youtube_scrape/transcripts_commercial_archivist/transcripts.json``
    and each transcript from ``commercialarchivist_<index>.txt`` in that same
    folder.
    """

    def __init__(self, main_folder):
        """Resolve the data paths under *main_folder* and load the metadata.

        Args:
            main_folder: Folder that contains the ``youtube_scrape`` directory.

        Raises:
            OSError: If ``transcripts.json`` cannot be opened (for example
                ``FileNotFoundError`` when it does not exist).
        """
        # Define relative paths
        self.transcript_folder = os.path.join(
            main_folder, 'youtube_scrape', 'transcripts_commercial_archivist'
        )
        self.json_file = os.path.join(self.transcript_folder, 'transcripts.json')
        self.data = self._load_data()

    def _load_data(self):
        """Parse ``self.json_file`` into a list of metadata entries.

        The file is treated as a sequence of JSON values written one after
        another, optionally separated by whitespace and/or commas. Values are
        decoded one at a time instead of patching the text with
        ``replace('}{', '},{')``, so objects separated by newlines are
        accepted and string values that happen to contain ``}{`` are left
        intact.

        Returns:
            A list with one item per decoded entry. An empty list is returned
            (after printing the decode error) if any part of the file is not
            valid JSON.
        """
        with open(self.json_file, 'r', encoding='utf-8') as f:
            json_content = f.read()

        decoder = json.JSONDecoder()
        separators = ' \t\r\n,'
        entries = []
        pos = 0
        end = len(json_content)
        while True:
            # raw_decode does not skip leading whitespace, so step over the
            # whitespace/commas that may sit between consecutive values.
            while pos < end and json_content[pos] in separators:
                pos += 1
            if pos >= end:
                break
            try:
                value, pos = decoder.raw_decode(json_content, pos)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                return []
            if isinstance(value, list):
                # A file that already holds a JSON array contributes its
                # items directly rather than one nested list.
                entries.extend(value)
            else:
                entries.append(value)
        return entries

    def get_transcript(self, index):
        """Return the brand and transcript text for the video at *index*.

        The brand is the first whitespace-separated word of the entry's
        ``'video_title'``.

        Args:
            index: Position of the entry in ``self.data``; also used to build
                the transcript file name ``commercialarchivist_<index>.txt``.

        Returns:
            ``(brand, transcript)`` on success. ``(None, None)`` after
            printing a message when *index* is out of range, the title
            contains no words, or the transcript file does not exist.

        Raises:
            KeyError: If the entry has no ``'video_title'`` key.
        """
        try:
            entry = self.data[index]
        except IndexError:
            print("Index out of range.")
            return None, None

        # Extract the brand name (first word in the title). An empty title is
        # reported as such instead of surfacing as a misleading
        # "Index out of range." message.
        title_words = entry['video_title'].split()
        if not title_words:
            print(f"Video title is empty for index {index}.")
            return None, None
        brand = title_words[0]

        # Load the transcript from the corresponding .txt file
        transcript_file = os.path.join(
            self.transcript_folder, f"commercialarchivist_{index}.txt"
        )
        try:
            with open(transcript_file, 'r', encoding='utf-8') as f:
                transcript = f.read()
        except FileNotFoundError:
            print(f"Transcript file not found for index {index}.")
            return None, None
        return brand, transcript
# Usage
main_folder = '.'  # relative path to the main folder where this script is located
loader = TranscriptLoader(main_folder)

# Example: Load the transcript and brand of the first video
brand, transcript = loader.get_transcript(0)
print(f"Brand: {brand}")
print(f"Transcript:\n{transcript}")