from youtube_transcript_api import YouTubeTranscriptApi
import re
from PIL import Image

#transcript_list = YouTubeTranscriptApi.list_transcripts('ReHGSGwV4-A')
#transcript = transcript_list.find_transcript(['en','de'])

# step 1: download the json transcript for a youtube video
def get_json_transcript(link, rpunkt_switch):
    if "v=" in link:
        video_id = link.split("v=")[1].split("&")[0]
    else:
        return "Error: invalid link, it does not contain the pattern 'v='."
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    # get the auto-generated english transcript;
    # if it is not available, fall back to german and translate it to english
    raw_transcript = 'empty'
    type_transcript = []
    if rpunkt_switch:
        try:
            transcript = transcript_list.find_generated_transcript(['en'])
            raw_transcript = transcript.fetch()
            type_transcript = ['en', 'generated']
        except Exception:
            transcript = transcript_list.find_transcript(['de'])
            raw_transcript = transcript.translate('en').fetch()
            type_transcript = ['en', 'translated']
    else:
        transcript = transcript_list.find_transcript(['en', 'de'])
        raw_transcript = transcript.fetch()
        # keep the language code of whichever manual transcript was found
        type_transcript = [transcript.language_code, 'manual']
    return raw_transcript, type_transcript

# step 2: extract the timestamps from the json transcript
def get_timestamps(transcript_raw):
    return [str(frame['start']) for frame in transcript_raw]

# step 3: extract the text from the json transcript, one caption line per frame
def get_caption(transcript_raw):
    transcript_text = '\n'.join([frame['text'].replace('\n', ' ') for frame in transcript_raw])
    return transcript_text

# write the re-punctuated caption lines back into the raw transcript frames
def replacePunctuatedText(raw_transcript, caption):
    list_caption = caption.split('\n')
    pnct_raw_transcript = raw_transcript
    for idx, line in enumerate(pnct_raw_transcript):
        line['text'] = list_caption[idx]
    return pnct_raw_transcript

def getSentences(raw_transcript):
    # walk over each frame and collect its text,
    # prefixing every frame with its index wrapped in hash signs, e.g. '#3#'
    frm_cap = ''
    for idx, line in enumerate(raw_transcript, start=1):
        frm_cap = frm_cap + ' #' + str(idx) + '# ' + line['text'].replace('\n', ' ')
    # split the text into sentences; the split removes the full-stop signs
    dict_sentences = {}
    sentences = frm_cap.strip().split('. ')
    # sentences without a frame marker of their own are dropped;
    # sentences shorter than 20 characters are dropped, too,
    # so that lexrank does not pick the short sentences
    for idx, item in enumerate(sentences):
        m = re.search(r"#[^#]*#", item)
        if m is not None:
            match = m.group(0)
            frm = match.replace('#', '')
            # remove all frame markers inside the sentence, keep only the
            # leading frame index, and restore the full-stop sign at the end
            clean_match = re.sub(r'\s*#[^#]*#\s*', ' ', item) + '.'
            if len(clean_match) > 20:
                dict_sentences[frm] = clean_match.strip()
    return dict_sentences

def convertToHTML(dsl):
    workdir = 'file/workdir/'
    cnt = 1
    html_rows = ''
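
# Example usage: a minimal sketch, not part of the pipeline above. It reuses
# the video id 'ReHGSGwV4-A' from the commented-out lines at the top and
# assumes a youtube_transcript_api version whose fetch() returns a list of
# dicts with 'start' and 'text' keys, as the functions above expect.
if __name__ == '__main__':
    link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A'
    raw_transcript, type_transcript = get_json_transcript(link, rpunkt_switch=True)
    print('transcript type:', type_transcript)
    # map frame index -> cleaned sentence and show the first few entries
    sentences = getSentences(raw_transcript)
    for frame, sentence in list(sentences.items())[:5]:
        print(frame, sentence)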