import transcript as ts
import ytvideo as vd
import frames as fr
import repunct as rp
import lexrank as lr
#########################################################################
# LEXRANK SUMMARY
#########################################################################
def getSummaryImage(link, lexrank_switch, rpunkt_switch):
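    """Summarize a YouTube video and return key frame images.

    Sketch of the contract, inferred from the code below: `link` is a full
    YouTube URL, `lexrank_switch` selects lexrank summarization (otherwise
    all sentences are kept), and `rpunkt_switch` is passed through to the
    transcript fetcher. Returns a list of images on success, or an error
    string starting with 'Error'.
    """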
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)

    if len(link) == 0:
        return 'Error: no link provided'

    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # module rpunct
    # restore punctuation from the raw caption if necessary
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    concat_list_summary = 'empty'
    if lexrank_switch:
        # summarize a small part of the text (about 5% of the sentences)
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('nr_sentences: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available'
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A lexrank sentence can consist of multiple actual sentences
        # separated by full stops; the corresponding timestamp can then not
        # be found. All items of the lexrank summary are therefore
        # concatenated and split up again at full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
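        # Illustrative example (hypothetical values): if lexrank returns the
        # single item "We ship on Friday. Testing starts Monday", the
        # join/split above yields ['We ship on Friday', 'Testing starts
        # Monday'], so each part can be matched to its own timestamped
        # transcript sentence.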
    else:
        concat_list_summary = [*dict_sentences.values()]
    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    images = ts.getImages(dict_timestamp_summary)
    return images

def getSummary(link, lexrank_switch, rpunkt_switch):
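    """Summarize a YouTube video and return the summary as JSON.

    Sketch of the contract, inferred from the code below: `link` is a full
    YouTube URL, `lexrank_switch` selects lexrank summarization (otherwise
    all sentences are kept), and `rpunkt_switch` enables punctuation
    recovery via the repunct module. Returns the result of
    ts.convertToJSON on success, or an error string starting with 'Error'.
    """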
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)

    if len(link) == 0:
        return 'Error: no link provided'

    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id

    print('getting transcript using video_id: ', video_id, rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)
    #timestamps = ts.get_timestamps(raw_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # module rpunct
    # restore punctuation from the raw caption if necessary
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption
    if rpunkt_switch:
        #if type_transcript[0] == 'en':
        # type_transcript[1] contains the text 'generated' or 'translated'
        print('Recovering punctuation from English text...', type_transcript[1])
        # remove punctuation leftovers
        #clean_raw_caption = re.sub('[,?.!]','',raw_caption)
        caption = rp.predict(raw_caption)
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)
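        # Illustrative example (hypothetical values): rp.predict turns an
        # unpunctuated caption such as 'hello everyone today we talk about'
        # into 'Hello everyone. Today we talk about', and restore_cr /
        # replacePunctuatedText then carry that punctuated text back into
        # the timestamped transcript entries.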
    dict_sentences = ts.getSentences(pnct_raw_transcript)

    concat_list_summary = 'empty'
    if lexrank_switch:
        # summarize a small part of the text (about 5% of the sentences)
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('nr_sentences: ' + str(nr_sentences))
        if nr_sentences == 0:
            return 'Error: No sentences available'
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # As in getSummaryImage: a lexrank sentence can consist of multiple
        # actual sentences separated by full stops, so the summary items are
        # concatenated and split up again at full stops to recover timestamps.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        concat_list_summary = [*dict_sentences.values()]
    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + result_get_video)
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    json_file = ts.convertToJSON(dict_timestamp_summary)
    #images = ts.getImages(dict_timestamp_summary)
    #return json_file, images
    return json_file

#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc
# Minimal manual smoke test (assumes network access and a video with an
# English transcript; uncomment one of the sample links above first):
#if __name__ == '__main__':
#    lexrank = True
#    rpunkt = False
#    result = getSummary(link, lexrank, rpunkt)
#    print(result)