Spaces:

wldmr
/

tubifier

Sleeping

File size: 6,112 Bytes

import transcript as ts
import ytvideo as vd
import frames as fr
#import repunct as rp
import lexrank as lr

# import sys
# del sys.modules['ytvideo']
# del sys.modules['transcript']
# del sys.modules['frames']
# del sys.modules['lexrank']


#########################################################################
# LEXRANK SUMMARY
#########################################################################

def getSummaryImage(link, lexrank_switch, rpunkt_switch):
    """Return the summary images for the video behind *link*.

    Pipeline: fetch the transcript, split it into sentences, optionally
    reduce them to a LexRank summary, map the selected sentences to
    timestamps, download the video, extract the frames at those timestamps
    and hand the timestamp/sentence mapping to ``ts.getImages``.

    Args:
        link: Video link. It is passed unchanged to
            ``ts.get_json_transcript`` and ``vd.get_video``.
        lexrank_switch: When truthy, only a LexRank summary of roughly 5% of
            the sentences is used; otherwise every sentence is used.
        rpunkt_switch: Forwarded to ``ts.get_json_transcript``. This function
            does not restore punctuation itself.

    Returns:
        The result of ``ts.getImages`` on success. On failure, a single value
        containing ``'Error'``: either one of the error strings below or
        whatever ``ts.getTimestampAtFrameFromSummary`` reported.
    """
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)

    # `not link` also rejects None, which len() would crash on.
    if not link:
        return 'Error: no link provided'

    print('getting transcript using link: ', link)
    # NOTE(review): getSummary resolves the link to a video id before calling
    # get_json_transcript; here the link is passed as-is. Confirm that the
    # helper accepts a full link.
    raw_transcript, type_transcript = ts.get_json_transcript(link, rpunkt_switch)
    print('transcript type: ', type_transcript)

    dict_sentences = ts.getSentences(raw_transcript)

    if lexrank_switch:
        # Summarize a small part of the text: about 5% of the sentences.
        # round() yields 0 for short transcripts, which is reported as an
        # error below.
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Check before summarizing so LexRank is never asked for 0 sentences.
        if nr_sentences == 0:
            # Single value, like every other return of this function.
            return 'Error: No sentences available'

        full_text = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(full_text, nr_sentences)
        # It can happen that for LexRank a "sentence" consists of multiple
        # actual sentences separated by full stops. Then the corresponding
        # timestamp cannot be found, so all items of the LexRank summary are
        # concatenated and split up again at full stops.
        concat_list_summary = '. '.join(str(item) for item in list_summary).split('. ')
    else:
        concat_list_summary = list(dict_sentences.values())

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        raw_transcript, dict_sentences, concat_list_summary)
    # The helper signals failure with a value containing 'Error'; pass it on.
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    result_get_video = vd.get_video(link)
    print('video: ' + str(result_get_video))

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    return ts.getImages(dict_timestamp_summary)


def getSummary(link, lexrank_switch, rpunkt_switch):
    """Return the JSON summary for the video behind *link*.

    Pipeline: resolve the link to a video id, fetch the transcript,
    optionally restore punctuation, split the text into sentences, optionally
    reduce them to a LexRank summary, map the selected sentences to
    timestamps, download the video, extract the frames at those timestamps
    and convert the timestamp/sentence mapping with ``ts.convertToJSON``.

    Args:
        link: Video link; the video id is extracted with
            ``ts.get_id_from_link``.
        lexrank_switch: When truthy, only a LexRank summary of roughly 5% of
            the sentences is used; otherwise every sentence is used.
        rpunkt_switch: When truthy, punctuation is restored through the
            ``repunct`` module before sentence splitting. The flag is also
            forwarded to ``ts.get_json_transcript``.

    Returns:
        The result of ``ts.convertToJSON`` on success. On failure, a single
        value containing ``'Error'``: one of the error strings below, the
        value returned by ``ts.get_id_from_link``, or whatever
        ``ts.getTimestampAtFrameFromSummary`` reported.
    """
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)

    # `not link` also rejects None, which len() would crash on.
    if not link:
        return 'Error: no link provided'

    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id

    if rpunkt_switch:
        # The module-level `import repunct as rp` is commented out, so using
        # `rp` here used to raise NameError. Import it on demand and fail
        # with a regular error value before any transcript/video work.
        try:
            import repunct as rp
        except ImportError:
            return 'Error: punctuation restoration requested but module repunct is not available'

    print('getting transcript using video_id: ', video_id, rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)

    # Without punctuation restoration the transcript is used as fetched.
    pnct_raw_transcript = raw_transcript

    if rpunkt_switch:
        # Restore punctuation in the raw captions.
        # type_transcript[1] contains the text 'generated' or 'translated'.
        print('Recovering punctuation from english text...', type_transcript[1])
        raw_caption = ts.get_caption(raw_transcript)
        caption = rp.predict(raw_caption)
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    if lexrank_switch:
        # Summarize a small part of the text: about 5% of the sentences.
        # round() yields 0 for short transcripts, which is reported as an
        # error below.
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Check before summarizing so LexRank is never asked for 0 sentences.
        if nr_sentences == 0:
            # Single value, like every other return of this function.
            return 'Error: No sentences available'

        full_text = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(full_text, nr_sentences)
        # It can happen that for LexRank a "sentence" consists of multiple
        # actual sentences separated by full stops. Then the corresponding
        # timestamp cannot be found, so all items of the LexRank summary are
        # concatenated and split up again at full stops.
        concat_list_summary = '. '.join(str(item) for item in list_summary).split('. ')
    else:
        concat_list_summary = list(dict_sentences.values())

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        pnct_raw_transcript, dict_sentences, concat_list_summary)
    # The helper signals failure with a value containing 'Error'; pass it on.
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + str(result_get_video))

    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    return ts.convertToJSON(dict_timestamp_summary)

#filename='/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt'
#with open(filename, 'w') as the_file:
#    the_file.write(raw_caption)

#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc

#lexrank = True
#rpunkt = False
#result = getSummary(link, lexrank, rpunkt)
#print(result)