File size: 7,202 Bytes
837fdb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from youtube_transcript_api import YouTubeTranscriptApi
import re
from PIL import Image


#transcript_list = YouTubeTranscriptApi.list_transcripts('ReHGSGwV4-A')
#transcript = transcript_list.find_transcript(['en','de'])

# step 1: download the json transcript for youtube video
def get_json_transcript(link,rpunkt_switch):
    """Download the transcript of a YouTube video.

    Args:
        link: Video URL; the video id is read from its ``v=`` query parameter.
        rpunkt_switch: When truthy, fetch the auto-generated English
            transcript and fall back to the German transcript translated to
            English. When falsy, fetch whichever of the English/German
            transcripts ``find_transcript(['en','de'])`` returns.

    Returns:
        A tuple ``(raw_transcript, type_transcript)`` where ``type_transcript``
        is a two-item list labelling the language and origin of the
        transcript, or an error string when ``link`` has no ``v=`` parameter.
    """
    if "v=" not in link:
        return "Error: Invalid Link, it does not have the pattern 'v=' in it."

    # everything between 'v=' and the next '&' is the video id
    video_id = link.split("v=")[1].split("&")[0]

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    if rpunkt_switch:
        # get the auto-generated english text
        # if it is not available translate the german one to en
        try:
            transcript = transcript_list.find_generated_transcript(['en'])
            raw_transcript = transcript.fetch()
            type_transcript = ['en','generated']
        except Exception:
            # Exception instead of a bare except, so that KeyboardInterrupt
            # and SystemExit are not turned into a silent fallback.
            transcript = transcript_list.find_transcript(['de'])
            raw_transcript = transcript.translate('en').fetch()
            type_transcript = ['en','translated']
    else:
        transcript = transcript_list.find_transcript(['en','de'])
        raw_transcript = transcript.fetch()
        # NOTE(review): 'den' presumably abbreviates "de or en"; the label is
        # kept unchanged because callers may compare against it.
        type_transcript = ['den','manual']

    return raw_transcript, type_transcript

# step 2: extract timestamps from json transcript
def get_timestamps(transcript_raw):
    """Return the start time of every transcript entry as a list of strings.

    Args:
        transcript_raw: Iterable of entries that each provide a ``'start'``
            item.

    Returns:
        One ``str(entry['start'])`` per entry, in input order; an empty list
        for an empty transcript.
    """
    # Built directly instead of joining and re-splitting on '\n', which
    # produced a phantom [''] entry for an empty transcript.
    return [str(entry['start']) for entry in transcript_raw]

# step 3: extract text from transcript
def get_caption(transcript_raw):
    """Return the transcript text with one line per transcript entry.

    Line breaks inside an entry's ``'text'`` are replaced by spaces, so that
    the number of lines in the result equals the number of entries.
    """
    single_line_texts = []
    for entry in transcript_raw:
        single_line_texts.append(entry['text'].replace('\n', ' '))
    return '\n'.join(single_line_texts)

def replacePunctuatedText(raw_transcript, caption):
    """Overwrite the text of each transcript entry with a line of ``caption``.

    ``caption`` is split on newlines and line *i* is written into the
    ``'text'`` item of entry *i*. The entries are modified in place and the
    same ``raw_transcript`` object is returned. An IndexError is raised when
    ``caption`` has fewer lines than there are entries.
    """
    punctuated_lines = caption.split('\n')
    position = 0
    for entry in raw_transcript:
        entry['text'] = punctuated_lines[position]
        position += 1
    return raw_transcript

def getSentences(raw_transcript):
    """Split the transcript into sentences keyed by frame number.

    Every transcript entry is prefixed with a ``#<n>#`` marker, where ``n`` is
    the 1-based position of the entry. The marked text is split on ``'. '``
    and each sentence is stored under the number of the first marker found
    in it, with all markers removed from the sentence text.

    Args:
        raw_transcript: Iterable of entries that each provide a ``'text'``
            item.

    Returns:
        dict mapping the frame number (as a string) to the sentence text.
        Sentences of 20 characters or fewer are dropped, so that lexrank
        does not pick the short sentences. An empty transcript yields an
        empty dict.
    """
    marker_pattern = re.compile(r"#[^#]*#")
    padded_marker_pattern = re.compile(r"\s*#[^#]*#\s*")

    # walk over each frame and prefix its text with the frame number,
    # wrapped in hash tag signs
    frm_cap = ''.join(
        ' #' + str(idx) + '# ' + line['text'].replace('\n', ' ')
        for (idx, line) in enumerate(raw_transcript, start=1)
    )

    dict_sentences = {}
    sentences = frm_cap.strip().split('. ')
    last_idx = len(sentences) - 1
    frm = None
    for idx, item in enumerate(sentences):
        m = marker_pattern.search(item)
        if m is not None:
            frm = m.group(0).replace('#', '')
        if frm is None:
            # no marker seen yet (e.g. empty transcript): nothing to key on
            continue
        # NOTE(review): a sentence without its own marker is stored under the
        # frame of the preceding sentence and replaces that entry; this
        # keeps the pre-existing behaviour.
        clean_match = padded_marker_pattern.sub(' ', item)
        # restore the full stop removed by the split; the last piece was not
        # cut by the split, so do not double a full stop it already has
        if idx != last_idx or not clean_match.endswith('.'):
            clean_match = clean_match + '.'
        if len(clean_match) > 20:
            dict_sentences[frm] = clean_match.strip()

    return dict_sentences


    # split all sentences into an array
    # remove all timestamps in the middle of the sentences
    # leave only the timestamps at the beginning of each sentence 
    # restore the full-stop sign at the end of each sentence, that was removed in the split step
    #chops = ''
    #for item in sl.strip().split('. '):
    #    chops = chops + re.sub('\s*#[^#]*#\s*',' ',item) + '. '
    #chops

    # remove all remaining hash tags
    #dsl={}
    #for item in chops.split('. #'):
    #    elem= item.split('# ')
    #    idx = elem[0].replace('#','')
    #    sentence = elem[1]+'.'
    #    dsl[idx] = sentence

    #return dsl

def convertToHTML(dsl):
    """Render the summary as an HTML table and write it to disk.

    Each item of ``dsl`` becomes one table row holding the running image
    number, the item's key (shown in the "Timestamp [sec]" column), the image
    ``frame_<nnnn>.jpg`` and the item's value as the caption. Images are
    numbered by row position, starting at 1.

    Args:
        dsl: Mapping whose keys are shown as timestamps and whose values are
            shown as captions.

    Returns:
        The HTML markup; the same markup is written to
        ``./workdir/output.html``.
    """
    # local imports, so the block does not depend on file-level imports
    import html
    import os

    workdir = 'file/workdir/'
    html_parts = [
        '<table border=1>',
        '<tr><td>Image Nr.</td><td>Timestamp [sec]</td><td>Image</td><td>Caption</td></tr>',
    ]
    for cnt, (key, val) in enumerate(dsl.items(), start=1):
        image_url = workdir + 'frame_' + f"{cnt:04d}" + '.jpg'
        # key and caption are escaped because they originate from external
        # transcript text and must not be interpreted as markup
        html_parts.append(
            '<tr><td>' + str(cnt) + '</td>'
            + '<td>' + html.escape(str(key), quote=False) + '</td>'
            + '<td><a href="' + image_url + '"><img src="' + image_url + '" width=500></a></td>'
            + '<td>' + html.escape(str(val), quote=False) + '</td></tr>\n'
        )
    html_parts.append('</table>')
    html_rows = ''.join(html_parts)

    filename = './workdir/output.html'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # explicit encoding: captions may contain non-ASCII characters
    with open(filename, 'w', encoding='utf-8') as the_file:
        the_file.write(html_rows)

    return html_rows

def getImages(dsl):
    """Open one frame image per item of ``dsl``.

    The images are read from ``workdir/frame_<nnnn>.jpg``, where ``nnnn`` is
    the 1-based, zero-padded position of the item; the keys and values of
    ``dsl`` themselves are not used.

    Returns:
        List of the opened images, in item order.
    """
    workdir = 'workdir/'
    opened = []
    for position, _item in enumerate(dsl.items(), start=1):
        frame_file = 'frame_' + f"{int(position):04d}" + '.jpg'
        opened.append(Image.open(workdir + frame_file))
    return opened


# 1.
# dict_sentences contains all sentences, keyed by their frame number.
# list_summary contains the sentences selected for the summary.
# The task is to find the corresponding frame number for every summarized sentence.
# 2.
# dict_frame_timestamp maps the frame numbers to their timestamps.
# 3.
# It is used to construct the sum_timestamps list, which holds the timestamp of each summarized sentence.
def getTimestampAtFrameFromSummary(raw_transcript, dict_sentences,list_summary):
    """Map each summarized sentence to the timestamp of its frame.

    Args:
        raw_transcript: Iterable of entries that each provide a ``'start'``
            item; entry *n* (1-based) belongs to frame number ``str(n)``.
        dict_sentences: dict mapping frame numbers (strings) to sentences.
        list_summary: Summarized sentences; ``str(sentence)`` must be
            contained in one of the values of ``dict_sentences``.

    Returns:
        dict mapping the timestamp (``str(entry['start'])``) to the summary
        sentence, in the order of ``list_summary``. If the summary sentences
        cannot be matched to as many distinct frames as there are sentences,
        the error message is printed and returned as a string instead.
    """
    # Look up the frame of every summary sentence individually. Pairing the
    # matched frames with list_summary by position is only correct when the
    # summary happens to be in frame order.
    summary_frames = []
    for sentence in list_summary:
        text = str(sentence)
        frame = next(
            (key for key, value in dict_sentences.items() if text in value),
            None,
        )
        if frame is not None:
            summary_frames.append(frame)

    # sanity check, if every summarized sentence was found in its own frame
    identified = len(set(summary_frames))
    if len(list_summary) != identified:
        err_msg = 'Error: Number of summarized sentences '+str(len(list_summary)) +' is not equal to the identified sentences '+str(identified)+'.'
        print(err_msg)
        return err_msg

    dict_frame_timestamp = {
        str(idx): str(line['start'])
        for (idx, line) in enumerate(raw_transcript, start=1)
    }

    sum_timestamps = [dict_frame_timestamp.get(frame) for frame in summary_frames]

    dict_timestamp_summary = {}
    for timestamp, value in zip(sum_timestamps, list_summary):
        dict_timestamp_summary[timestamp] = str(value)

    return dict_timestamp_summary


def restore_cr(input_text, output_text):
    """Restore the line breaks of ``input_text`` in its punctuated version.

    Both texts are split into words on single spaces. After every word of
    ``output_text`` whose counterpart in ``input_text`` ends a line, a line
    break is inserted.

    Goal: restore the break points, i.e. the same number of lines as the srt
    file. This is necessary because each line in the srt file corresponds to
    a frame from the video.

    Args:
        input_text: Original text with one line per frame.
        output_text: Punctuated text with the same words, without line breaks.

    Returns:
        The punctuated text with the line breaks restored. The words are
        joined with single spaces, so every restored line break is followed
        by a space. If the two texts do not have the same number of words, a
        tuple ``(message, len_input_words, len_output_words)`` is returned
        instead.
    """
    # Split into lines first, instead of marking line ends with a '#'
    # sentinel: a literal '#' in either text would otherwise be mistaken
    # for a line break.
    srt_lines = re.split(r'\s*\n\s*', input_text.strip())
    pcnt_file_array = output_text.split(' ')

    # positions (in the flat word list) of the last word of each line
    srt_word_count = 0
    line_end_positions = set()
    for line in srt_lines:
        srt_word_count += len(line.split(' '))
        line_end_positions.add(srt_word_count - 1)
    # the final line is not followed by a line break
    line_end_positions.discard(srt_word_count - 1)

    if srt_word_count != len(pcnt_file_array):
        return "AssertError: The length of the transcript and the punctuated file should be the same: ",srt_word_count,len(pcnt_file_array)

    pcnt_words_with_breaks = [
        word + '\n' if idx in line_end_positions else word
        for idx, word in enumerate(pcnt_file_array)
    ]

    # assemble the array back to a string
    return ' '.join(pcnt_words_with_breaks)