# tubifier/summarizer.py
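"""Summarize YouTube videos with LexRank and extract key frames.

A sketch of the module's role, inferred from its imports: transcript
fetching and sentence/timestamp handling live in `transcript`, video
download in `ytvideo`, frame extraction in `frames`, punctuation
restoration in `repunct`, and sentence ranking in `lexrank`.
"""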
import transcript as ts
import ytvideo as vd
import frames as fr
import repunct as rp
import lexrank as lr
#########################################################################
# LEXRANK SUMMARY
#########################################################################
def getSummaryImage(link, lexrank_switch, rpunkt_switch):
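    """Return key frames for the YouTube video at `link`.

    Fetches the transcript, selects the most central sentences with
    LexRank (or keeps all sentences when `lexrank_switch` is False) and
    returns the frames extracted at the matching timestamps; on failure
    an error string is returned instead.
    """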
    # cleanup of the working directory (currently disabled)
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ', result)
    if len(link) == 0:
        return 'Error: no link provided'
    print('getting transcript using link:', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type:', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)
    # note: unlike getSummary, this function does not run rpunct;
    # the raw transcript and caption are used as-is
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption
    dict_sentences = ts.getSentences(pnct_raw_transcript)
    if lexrank_switch:
        # summarize only a small fraction (~5%) of the sentences
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('sentences in summary: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available'
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank sentence can consist of multiple actual sentences
        # separated by full stops, for which no matching timestamp could
        # be found; concatenate all summary items and re-split them at
        # the full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
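        # illustrative example of the re-split (hypothetical strings):
        #   ['Intro. Welcome', 'Thanks for watching']
        #   -> 'Intro. Welcome. Thanks for watching'
        #   -> ['Intro', 'Welcome', 'Thanks for watching']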
    else:
        concat_list_summary = [*dict_sentences.values()]
    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary
    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))
    images = ts.getImages(dict_timestamp_summary)
    return images

def getSummary(link, lexrank_switch, rpunkt_switch):
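    """Return a timestamped JSON summary of the YouTube video at `link`.

    Resolves the video id, fetches the transcript, optionally restores
    punctuation with rpunct, selects the most central sentences with
    LexRank (or keeps all sentences when `lexrank_switch` is False) and
    returns the summary as JSON; on failure an error string is returned
    instead.
    """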
    # cleanup of the working directory (currently disabled)
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ', result)
    if len(link) == 0:
        return 'Error: no link provided'
    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id
    print('getting transcript using video_id:', video_id, 'rpunkt:', rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type:', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)
    # module rpunct:
    # restore punctuation in the raw captions if necessary
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption
    if rpunkt_switch:
        # type_transcript[1] contains either 'generated' or 'translated'
        print('Recovering punctuation from English text...', type_transcript[1])
        caption = rp.predict(raw_caption)
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)
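        # illustrative example of the restoration (hypothetical strings):
        #   raw:      'hey guys welcome back so today'
        #   restored: 'Hey guys, welcome back. So today'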
    dict_sentences = ts.getSentences(pnct_raw_transcript)
    if lexrank_switch:
        # summarize only a small fraction (~5%) of the sentences
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('sentences in summary: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available'
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank sentence can consist of multiple actual sentences
        # separated by full stops, for which no matching timestamp could
        # be found; concatenate all summary items and re-split them at
        # the full stops (see the example in getSummaryImage above).
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        concat_list_summary = [*dict_sentences.values()]
    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary
    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + result_get_video)
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))
    json_file = ts.convertToJSON(dict_timestamp_summary)
    #images = ts.getImages(dict_timestamp_summary)
    #return json_file, images
    return json_file

#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc
#lexrank = True
#rpunkt = False
#result = getSummary(link, lexrank, rpunkt)
#print(result)
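
# Minimal usage sketch, assuming the module is run directly as a script
# (in the app it is imported instead). The link is one of the samples
# above; the switch values are illustrative.
if __name__ == '__main__':
    demo_link = 'https://www.youtube.com/watch?v=8uQDDUfGNPA'
    print(getSummary(demo_link, lexrank_switch=True, rpunkt_switch=False))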