File size: 6,112 Bytes
837fdb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fc8a5c
11a46c7
 
 
e81d911
ee90b6b
837fdb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9543b09
 
837fdb6
 
 
 
 
23c56a3
 
837fdb6
23c56a3
 
837fdb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import transcript as ts
import ytvideo as vd
import frames as fr
#import repunct as rp
import lexrank as lr

# import sys
# del sys.modules['ytvideo']
# del sys.modules['transcript']
# del sys.modules['frames']
# del sys.modules['lexrank']


#########################################################################
# LEXRANK SUMMARY
#########################################################################

def getSummaryImage(link, lexrank_switch, rpunkt_switch):
    """Summarize a video transcript and return the images for the summary.

    The transcript is fetched, split into sentences and (optionally) reduced
    with LexRank. For every remaining sentence a timestamp is looked up, the
    video is downloaded, frames are extracted at those timestamps and the
    result of ``ts.getImages`` is returned.

    Args:
        link: Link to the video. Empty or ``None`` is rejected.
        lexrank_switch: When truthy, keep only a LexRank summary of roughly
            5% of the sentences; otherwise keep every sentence.
        rpunkt_switch: Forwarded unchanged to ``ts.get_json_transcript``.

    Returns:
        Whatever ``ts.getImages`` returns on success. On failure a string
        starting with ``'Error'`` (or the error object produced by
        ``ts.getTimestampAtFrameFromSummary``) is returned instead; every
        error path returns a single value so callers can test it with
        ``'Error' in result``.
    """
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)

    # `not link` also covers None, which len() would reject with a TypeError.
    if not link:
        return 'Error: no link provided'

    # Resolve the link to a video id first, the same way getSummary does,
    # so both entry points hand the same kind of value to the transcript
    # helper.
    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id

    print('getting transcript using link: ', link)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)

    # No punctuation restoration happens in this function; the transcript is
    # used exactly as delivered.
    dict_sentences = ts.getSentences(raw_transcript)

    if lexrank_switch:
        # summarize a small part of the text (about 5% of the sentences)
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Bail out before asking LexRank for a zero-sentence summary.
        if nr_sentences == 0:
            return 'Error: No sentences available'
        full_text = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(full_text, nr_sentences)
        # It can happen that, for LexRank, one sentence consists of multiple
        # actual sentences separated by full stops. Then the corresponding
        # timestamp cannot be found, so all items of the LexRank summary are
        # concatenated and split up again at the full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        concat_list_summary = list(dict_sentences.values())

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    result_get_video = vd.get_video(link)
    print('video: ' + result_get_video)

    # One frame is extracted per key of the timestamp/summary mapping.
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    return ts.getImages(dict_timestamp_summary)


def getSummary(link, lexrank_switch, rpunkt_switch):
    """Summarize a video transcript and return it via ``ts.convertToJSON``.

    The video id is extracted from ``link``, the transcript is fetched and
    (optionally) re-punctuated, split into sentences and (optionally) reduced
    with LexRank. For every remaining sentence a timestamp is looked up, the
    video is downloaded, frames are extracted at those timestamps and the
    timestamp/summary mapping is converted with ``ts.convertToJSON``.

    Args:
        link: Link to the video. Empty or ``None`` is rejected.
        lexrank_switch: When truthy, keep only a LexRank summary of roughly
            5% of the sentences; otherwise keep every sentence.
        rpunkt_switch: When truthy, restore punctuation in the raw caption
            with the ``repunct`` module before splitting into sentences.
            The flag is also forwarded to ``ts.get_json_transcript``.

    Returns:
        The result of ``ts.convertToJSON`` on success. On failure a string
        starting with ``'Error'`` (or the error object produced by
        ``ts.getTimestampAtFrameFromSummary``) is returned instead; every
        error path returns a single value so callers can test it with
        ``'Error' in result``.
    """
    # cleanup the working directory
    #result = fr.removeFilesInWorkdir()
    #print('removeFilesInWorkdir result: ',result)

    # `not link` also covers None, which len() would reject with a TypeError.
    if not link:
        return 'Error: no link provided'

    video_id = ts.get_id_from_link(link)
    if 'Error' in video_id:
        return video_id

    print('getting transcript using video_id: ', video_id, rpunkt_switch)
    raw_transcript, type_transcript = ts.get_json_transcript(video_id, rpunkt_switch)
    print('transcript type: ', type_transcript)

    # module rpunct
    # restore punctuation from the raw captions if requested
    pnct_raw_transcript = raw_transcript
    if rpunkt_switch:
        # The module-level import of repunct is disabled, so the module is
        # loaded here, only when punctuation restoration is actually wanted.
        try:
            import repunct as rp
        except ImportError:
            return 'Error: punctuation restoration requested but module repunct is not available'

        # the variable type_transcript[1] contains the text 'generated' or 'translated'
        print('Recovering punctuation from english text...', type_transcript[1])
        raw_caption = ts.get_caption(raw_transcript)
        caption = rp.predict(raw_caption)
        pnct_caption = ts.restore_cr(raw_caption, caption)
        pnct_raw_transcript = ts.replacePunctuatedText(raw_transcript, pnct_caption)

    dict_sentences = ts.getSentences(pnct_raw_transcript)

    if lexrank_switch:
        # summarize a small part of the text (about 5% of the sentences)
        nr_sentences = round(len(dict_sentences) * 0.05)
        print('zip: ' + str(nr_sentences))
        # Bail out before asking LexRank for a zero-sentence summary.
        if nr_sentences == 0:
            return 'Error: No sentences available'
        full_text = ' '.join(dict_sentences.values())
        list_summary = lr.getSummary(full_text, nr_sentences)
        # It can happen that, for LexRank, one sentence consists of multiple
        # actual sentences separated by full stops. Then the corresponding
        # timestamp cannot be found, so all items of the LexRank summary are
        # concatenated and split up again at the full stops.
        concat_list_summary = '. '.join([str(item) for item in list_summary]).split('. ')
    else:
        concat_list_summary = list(dict_sentences.values())

    dict_timestamp_summary = ts.getTimestampAtFrameFromSummary(
        pnct_raw_transcript, dict_sentences, concat_list_summary)
    if 'Error' in dict_timestamp_summary:
        return dict_timestamp_summary

    # The video is fetched through a canonical watch URL built from the id.
    prefix = 'http://youtube.com/watch?v='
    result_get_video = vd.get_video(prefix + video_id)
    print('video: ' + result_get_video)

    # One frame is extracted per key of the timestamp/summary mapping.
    proc_list = fr.extractImagesFromVideo(dict_timestamp_summary.keys())
    print('frames: ' + str(proc_list))

    json_file = ts.convertToJSON(dict_timestamp_summary)
    #images = ts.getImages(dict_timestamp_summary)

    #return json_file, images
    return json_file

#filename='/Users/hujo/Downloads/Channel_Summaries/wholesaleted.srt.pnct.txt'
#with open(filename, 'w') as the_file:
#    the_file.write(raw_caption)

#link ="https://www.youtube.com/watch?v=8uQDDUfGNPA" # blog
#link = "https://www.youtube.com/watch?v=ofZEo0Rzo5s" # h-educate
#link = 'https://www.youtube.com/watch?v=ReHGSGwV4-A' #wholesale ted
#link = 'https://www.youtube.com/watch?v=n8JHnLgodRI' #kevindavid
#link = 'https://www.youtube.com/watch?v=6MI0f6YjJIk' # Nicholas
#link = 'https://www.youtube.com/watch?v=bj9snrsSook' #Geldschnurrbart
#link = 'https://www.youtube.com/watch?v=lCnHfTHkhbE' #fcc tutorial
#link = 'https://www.youtube.com/watch?v=0kJz0q0pvgQ&feature=youtu.be' # fcc

#lexrank = True
#rpunkt = False
#result = getSummary(link, lexrank, rpunkt)
#print(result)