"""Fetch a YouTube transcript and restore punctuation via the Hugging Face Inference API."""

import json
import logging
import os

import requests
from youtube_transcript_api import YouTubeTranscriptApi

logger = logging.getLogger(__name__)

API_URL = "https://api-inference.huggingface.co/models/oliverguhr/fullstop-punctuation-multilang-large"

# Entity groups that are never inserted into the text; every other group
# returned by the model (e.g. "." and "?") is inserted verbatim.
_SKIPPED_LABELS = ("0", ",", "")

headers = {
    "Authorization": f"Bearer {os.environ['HF_Token']}"
}  # NOTE: put this somewhere else


def retrieve_transcript(vid_id):
    """Return the transcript for *vid_id*, or ``None`` if it cannot be fetched.

    The result is whatever ``YouTubeTranscriptApi.get_transcript`` returns; the
    rest of this module indexes it as a sequence of dicts with a ``"text"`` key.
    Any exception raised by the lookup is logged (with traceback) and turned
    into ``None`` so callers can treat retrieval as best-effort.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(vid_id)
    except Exception:
        # Deliberately broad: this is the best-effort boundary of the module.
        # Log instead of swallowing silently so failures stay diagnosable.
        logger.warning(
            "Could not retrieve transcript for video %r", vid_id, exc_info=True
        )
        return None


def split_transcript(transcript, chunk_size=40):
    """Join the ``"text"`` of every *chunk_size* transcript entries into one string.

    Args:
        transcript: Sequence of dicts that each carry a ``"text"`` key.
            ``None`` or an empty sequence yields an empty list.
        chunk_size: Number of transcript entries merged into each chunk.

    Returns:
        A list of strings, one per chunk, with entries separated by a space.
    """
    if not transcript:
        return []
    return [
        " ".join(entry["text"] for entry in transcript[start : start + chunk_size])
        for start in range(0, len(transcript), chunk_size)
    ]


def query_punctuation(splits, timeout=120):
    """POST *splits* to the punctuation model and return the decoded JSON body.

    Args:
        splits: List of text chunks sent as the ``"inputs"`` payload.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            caller forever.

    Returns:
        The parsed JSON response, returned as-is (no status-code check is
        performed here, so an error payload is passed through to the caller).
    """
    payload = {"inputs": splits}
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()


def parse_output(output, comb):
    """Insert the predicted punctuation marks into the text chunks.

    Args:
        output: Response of ``query_punctuation``; expected to be a list with
            one list of entity dicts (``"entity_group"``, ``"end"``) per chunk.
        comb: The text chunks that were sent to the model.

    Returns:
        All chunks joined by a single space, with every predicted mark (except
        the skipped groups) inserted at its ``"end"`` offset. If *output* is
        not a list (e.g. an error payload), the chunks are returned
        unpunctuated. Chunks without a matching prediction are kept unchanged.
    """
    if not isinstance(output, list):
        logger.warning("Unexpected punctuation response, leaving text as-is: %r", output)
        return " ".join(comb)

    total = []
    for index, chunk in enumerate(comb):
        entities = output[index] if index < len(output) else None
        if not isinstance(entities, list):
            entities = []

        punctuated = chunk
        # Offsets refer to the original chunk, so every insertion shifts the
        # positions of all later entities by the length of the inserted mark.
        shift = 0
        for entity in entities:
            try:
                label = entity["entity_group"]
                if label in _SKIPPED_LABELS:
                    continue
                position = entity["end"] + shift
                punctuated = punctuated[:position] + label + punctuated[position:]
            except (KeyError, TypeError):
                # Malformed entity (missing key or wrong type): skip it.
                continue
            shift += len(label)
        total.append(punctuated)
    return " ".join(total)


def punctuate(video_id):
    """Return ``(punctuated_text, transcript)`` for the given YouTube video id.

    When the transcript cannot be retrieved or is empty, no API request is made
    and ``("", transcript)`` is returned, where *transcript* is ``None`` or the
    empty transcript.
    """
    # Get the transcript from the YoutubeTranscriptApi
    transcript = retrieve_transcript(video_id)
    if not transcript:
        return "", transcript

    splits = split_transcript(transcript)
    resp = query_punctuation(splits)  # Get the response from the Inference API
    punctuated_transcript = parse_output(resp, splits)
    return punctuated_transcript, transcript