import transcript as ts
import ytvideo as vd
import frames as fr
import repunct as rp
import lexrank as lr

# import sys
# del sys.modules['ytvideo']
# del sys.modules['transcript']
# del sys.modules['frames']
# del sys.modules['lexrank']

#########################################################################
# LEXRANK SUMMARY
#########################################################################

def getSummaryImage(link, lexrank_switch, rpunkt_switch):
    """Summarize the video behind `link` and return one image per summary sentence."""

    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ', result)

    if len(link) == 0:
        return 'Error: no link provided'

    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # module rpunct: restore punctuation from the raw captions, if necessary
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    concat_list_summary = 'empty'

    if lexrank_switch:
        # summarize a small part (5%) of the text
        nr_sentences = round(len(dict_sentences) * 0.05)
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank "sentence" can consist of multiple actual sentences
        # separated by full stops; the corresponding timestamp cannot be
        # found then. All items from the LexRank summary must therefore be
        # concatenated and split up again at the full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
        print('zip: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available', None
    else:
        concat_list_summary = [*dict_sentences.values()]

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    images = ts.getImages(dict_timestamp_summary)

    return images


def getSummary(link, lexrank_switch, rpunkt_switch):
    """Summarize the video behind `link` and return the summary as JSON."""

    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ', result)

    if len(link) == 0:
        return 'Error: no link provided'

    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id

    print('getting transcript using video_id: ', video_id, rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # module rpunct: restore punctuation from the raw captions, if necessary
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption

    if rpunkt_switch:
    #if type_transcript[0] == 'en':
        # the variable type_transcript[1] contains the text 'generated' or 'translated'
        print('Recovering punctuation from English text...', type_transcript[1])
        # remove punctuation leftovers
        #clean_raw_caption = re.sub('[,?.!]', '', raw_caption)
        caption = rp.predict(raw_caption)
        if 'Error' in caption:
            return caption
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    concat_list_summary = 'empty'

    if lexrank_switch:
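        # Worked example of the join-and-split below (illustrative values,
        # not from the source): if LexRank returns the two items
        # ['First point. Second point', 'Third point'], then '. '.join(...)
        # gives 'First point. Second point. Third point' and .split('. ')
        # gives ['First point', 'Second point', 'Third point'], so every
        # piece again maps to a single timestamped transcript sentence.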
        # summarize a small part (5%) of the text
        nr_sentences = round(len(dict_sentences) * 0.05)
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank "sentence" can consist of multiple actual sentences
        # separated by full stops; the corresponding timestamp cannot be
        # found then. All items from the LexRank summary must therefore be
        # concatenated and split up again at the full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
        print('zip: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available', None
    else:
        concat_list_summary = [*dict_sentences.values()]

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + result_get_video)

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    json_file = ts.convertToJSON(dict_timestamp_summary)
    #images = ts.getImages(dict_timestamp_summary)
    #return json_file, images

    return json_file


#filename = '/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt'
#with open(filename, 'w') as the_file:
#    the_file.write(raw_caption)

#link = "https://www.youtube.com/watch?v=8uQDDUfGNPA"  # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s"  # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A'  # wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI'  # kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk'  # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook'  # Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE'  # fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be'  # fcc

#lexrank = True
#result = getSummary(link, lexrank, False)
#print(result)
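
# ----------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original test harness):
# run the module directly to summarize one video. The link is taken from
# the sample links above; the two switches are illustrative defaults.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A'  # wholesale ted
    lexrank = True   # let LexRank pick ~5% of the transcript sentences
    rpunkt = False   # skip rpunct punctuation recovery for this run
    result = getSummary(link, lexrank, rpunkt)
    print(result)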