tubifier / summarizer.py
wldmr's picture
update
5b1aa5f
raw
history blame contribute delete
No virus
6.17 kB
import transcript as ts
import ytvideo as vd
import frames as fr
import repunct as rp
import lexrank as lr
# import sys
# del sys.modules['ytvideo']
# del sys.modules['transcript']
# del sys.modules['frames']
# del sys.modules['lexrank']
#########################################################################
# LEXRANK SUMMARY
#########################################################################
def getSummaryImage(link, lexrank_switch, rpunkt_switch):
    """Summarize a YouTube video given its full *link* and return key frames.

    Parameters
    ----------
    link : str
        Full YouTube URL of the video to summarize.
    lexrank_switch : bool
        If true, run LexRank and keep roughly 5% of the sentences;
        otherwise every sentence is kept.
    rpunkt_switch : bool
        Forwarded to ``ts.get_json_transcript``; note that unlike
        ``getSummary`` this function does NOT run the rpunct punctuation
        recovery pass itself.

    Returns
    -------
    Images for the summary timestamps on success; an error string (or the
    tuple ``('Error: No sentences available', None)``) on failure.
    """
    if not link:
        return 'Error: no link provided'

    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)

    # Punctuation restoration (module rpunct) is not applied in this
    # variant; the raw transcript is used as-is.
    pnct_raw_transcript = raw_transcript

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    if lexrank_switch:
        # Summarize to roughly 5% of the available sentences.
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Guard BEFORE invoking LexRank: with zero sentences there is
        # nothing to summarize (the original checked only afterwards).
        if nr_sentences == 0:
            return 'Error: No sentences available', None
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank "sentence" can consist of multiple actual sentences
        # separated by full stops; then the corresponding timestamp cannot
        # be found. Concatenate all summary items and re-split on full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        # No summarization requested: keep every sentence.
        concat_list_summary = [*dict_sentences.values()]

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    images = ts.getImages(dict_timestamp_summary)
    return images
def getSummary(link, lexrank_switch, rpunkt_switch):
    """Summarize a YouTube video given its *link* and return a JSON summary.

    Parameters
    ----------
    link : str
        YouTube URL; the video id is extracted via ``ts.get_id_from_link``.
    lexrank_switch : bool
        If true, run LexRank and keep roughly 5% of the sentences;
        otherwise every sentence is kept.
    rpunkt_switch : bool
        If true, recover punctuation from the raw caption with the rpunct
        model before sentence splitting.

    Returns
    -------
    A JSON document (from ``ts.convertToJSON``) mapping timestamps to
    summary sentences on success; an error string (or the tuple
    ``('Error: No sentences available', None)``) on failure.
    """
    if not link:
        return 'Error: no link provided'

    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id

    print('getting transcript using video_id: ', video_id, rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)
    raw_caption = ts.get_caption(raw_transcript)

    # Restore punctuation from the raw caption if requested (module rpunct).
    pnct_raw_transcript = raw_transcript
    pnct_caption = raw_caption
    if rpunkt_switch:
        # type_transcript[1] contains the text 'generated' or 'translated'.
        print('Recovering punctuation from english text...', type_transcript[1])
        caption = rp.predict(raw_caption)
        if 'Error' in caption:
            return caption
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    if lexrank_switch:
        # Summarize to roughly 5% of the available sentences.
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Guard BEFORE invoking LexRank: with zero sentences there is
        # nothing to summarize (the original checked only afterwards).
        if nr_sentences == 0:
            return 'Error: No sentences available', None
        trunc_pnct_caption = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(trunc_pnct_caption, nr_sentences)
        # A LexRank "sentence" can consist of multiple actual sentences
        # separated by full stops; then the corresponding timestamp cannot
        # be found. Concatenate all summary items and re-split on full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        # No summarization requested: keep every sentence.
        concat_list_summary = [*dict_sentences.values()]

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + result_get_video)

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    json_file = ts.convertToJSON(dict_timestamp_summary)
    return json_file
#filename='/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt'
#with open(filename, 'w') as the_file:
# the_file.write(raw_caption)
#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc
#lexrank = True
#result = getSummary(link, lexrank)
#print(result)