File size: 6,169 Bytes
837fdb6 fa50d64 837fdb6 8fc8a5c 11a46c7 e81d911 ee90b6b 837fdb6 5b1aa5f 837fdb6 9543b09 837fdb6 23c56a3 837fdb6 23c56a3 837fdb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import transcript as ts
import ytvideo as vd
import frames as fr
import repunct as rp
import lexrank as lr
# import sys
# del sys.modules['ytvideo']
# del sys.modules['transcript']
# del sys.modules['frames']
# del sys.modules['lexrank']
#########################################################################
# LEXRANK SUMMARY
#########################################################################
def getSummaryImage(link, lexrank_switch, rpunkt_switch):
    """Produce summary frames for a YouTube video.

    Fetches the transcript for *link*, optionally condenses it with
    LexRank, maps each summary sentence to a timestamp, downloads the
    video and extracts one frame per timestamp.

    Args:
        link: Full YouTube video URL.
        lexrank_switch: If truthy, summarize ~5% of the sentences with
            LexRank; otherwise use every transcript sentence.
        rpunkt_switch: Forwarded to ``ts.get_json_transcript`` to control
            punctuation restoration of the fetched transcript.

    Returns:
        The images produced by ``ts.getImages``, or an error string
        (starting with ``'Error'``) when a step fails.
    """
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)
    if not link:
        return 'Error: no link provided'
    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)
    # NOTE(review): unlike getSummary, this path never runs the rpunct
    # punctuation recovery — the raw transcript is used as-is.
    pnct_raw_transcript = raw_transcript
    dict_sentences = ts.getSentences(pnct_raw_transcript)
    if lexrank_switch:
        # summarize a small part (~5%) of the text
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Bail out BEFORE the expensive LexRank call when there is nothing
        # to summarize. Return a plain string (not a tuple) so callers using
        # the file-wide "'Error' in result" check still detect the failure.
        if nr_sentences == 0:
            return 'Error: No sentences available'
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank "sentence" can consist of multiple actual sentences
        # separated by full stops; then the corresponding timestamp cannot
        # be found. Concatenate all summary items and re-split on full stops.
        concat_list_summary = '. '.join(str(item) for item in list_summary).split('. ')
    else:
        # No summarization requested: keep every sentence.
        concat_list_summary = list(dict_sentences.values())
    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary
    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))
    images = ts.getImages(dict_timestamp_summary)
    return images
def getSummary(link, lexrank_switch, rpunkt_switch):
    """Summarize a YouTube video into a JSON document of timestamped sentences.

    Resolves the video id from *link*, fetches its transcript, optionally
    restores punctuation (rpunct) and condenses the text with LexRank,
    then maps the summary sentences to timestamps, downloads the video,
    extracts the matching frames and serializes the result to JSON.

    Args:
        link: Full YouTube video URL.
        lexrank_switch: If truthy, summarize ~5% of the sentences with
            LexRank; otherwise use every transcript sentence.
        rpunkt_switch: If truthy, recover punctuation in the raw caption
            via ``rp.predict`` before sentence splitting.

    Returns:
        The JSON produced by ``ts.convertToJSON``, or an error string
        (starting with ``'Error'``) when a step fails.
    """
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)
    if not link:
        return 'Error: no link provided'
    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id
    print('getting transcript using video_id: ', video_id, rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)
    # Defaults when punctuation recovery is skipped: use the transcript as-is.
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption
    if rpunkt_switch:
        #if type_transcript[0] == 'en':
        # type_transcript[1] contains the text 'generated' or 'translated'
        print('Recovering punctuation from english text...', type_transcript[1])
        # remove punctuation leftovers
        #clean_raw_caption = re.sub('[,?.!]','',raw_caption)
        caption = rp.predict(raw_caption)
        if 'Error' in caption:
            return caption
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)
    dict_sentences = ts.getSentences(pnct_raw_transcript)
    if lexrank_switch:
        # summarize a small part (~5%) of the text
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Bail out BEFORE the expensive LexRank call when there is nothing
        # to summarize. Return a plain string (not a tuple) so callers using
        # the file-wide "'Error' in result" check still detect the failure.
        if nr_sentences == 0:
            return 'Error: No sentences available'
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank "sentence" can consist of multiple actual sentences
        # separated by full stops; then the corresponding timestamp cannot
        # be found. Concatenate all summary items and re-split on full stops.
        concat_list_summary = '. '.join(str(item) for item in list_summary).split('. ')
    else:
        # No summarization requested: keep every sentence.
        concat_list_summary = list(dict_sentences.values())
    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary
    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + result_get_video)
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))
    json_file = ts.convertToJSON(dict_timestamp_summary)
    #images = ts.getImages(dict_timestamp_summary)
    #return json_file, images
    return json_file
#filename='/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt'
#with open(filename, 'w') as the_file:
# the_file.write(raw_caption)
#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc
#lexrank = True
#result = getSummary(link, lexrank)
#print(result)
|