File size: 2,303 Bytes
bda77e6
 
4bcf60e
bda77e6
 
 
 
 
 
 
 
 
49e4213
bda77e6
dd748bd
bda77e6
 
 
 
 
90323cf
bda77e6
 
90323cf
bda77e6
 
 
 
 
 
 
 
 
 
90323cf
bda77e6
 
 
 
 
 
 
90323cf
bda77e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92597d1
6dac9eb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60



#importing the necessary modules 
import os 
import urllib.request 
import re 
import time 
import gradio as gr

# Callback behind the Gradio interface, plus its private helpers.

_YOUTUBE_BASE = 'https://www.youtube.com'
_TRANSCRIPT_DIR = 'Transcripts'
_REQUEST_TIMEOUT_SECONDS = 30
# Pause after every successful download so requests are not fired back to back.
_DOWNLOAD_DELAY_SECONDS = 3

# Captures the video id of every "/watch?v=<id>" link in a page.
_VIDEO_ID_RE = re.compile(r'/watch\?v=([A-Za-z0-9_.-]+)')
# The query-string character class includes "=", "&", ";" and "%"; without
# them a match would stop at the first "=" and lose every parameter value.
_EDITOR_LINK_RE = re.compile(r'(/timedtext_editor\?[A-Za-z0-9_.%=&;-]*)')
_DOWNLOAD_LINK_RE = re.compile(r'(/api/timedtext\?[A-Za-z0-9_.%=&;-]*)')


def _fetch_text(url):
    """Return the body of *url* decoded as UTF-8, closing the response afterwards."""
    with urllib.request.urlopen(url, timeout=_REQUEST_TIMEOUT_SECONDS) as response:
        return response.read().decode("utf-8", errors="replace")


def _first_link(pattern, html):
    """Return the first match of *pattern* in *html* (HTML "&amp;" undone), or None."""
    match = pattern.search(html)
    if match is None:
        return None
    return match.group(1).replace("&amp;", "&")


def _download_transcript(video_id):
    """Save the transcript of *video_id* as Transcripts/<video_id>.xml.

    Follows the same chain of pages as before: video page -> transcript
    editor link -> timedtext download link.

    Returns:
        True when a transcript file was written, False when either link is
        missing from the pages.

    Raises:
        OSError: on network or file-system failures (includes URLError).
    """
    video_html = _fetch_text(_YOUTUBE_BASE + '/watch?v=' + video_id)
    editor_path = _first_link(_EDITOR_LINK_RE, video_html)
    if editor_path is None:
        return False

    editor_html = _fetch_text(_YOUTUBE_BASE + editor_path)
    download_path = _first_link(_DOWNLOAD_LINK_RE, editor_html)
    if download_path is None:
        return False

    with urllib.request.urlopen(_YOUTUBE_BASE + download_path,
                                timeout=_REQUEST_TIMEOUT_SECONDS) as response:
        payload = response.read()
    with open(os.path.join(_TRANSCRIPT_DIR, video_id + ".xml"), "wb") as handle:
        handle.write(payload)
    return True


def transcript_extract(channel_url=""):
    """Download the transcripts of every video linked from a channel page.

    This is the function wired into the Gradio interface: the text box value
    arrives as *channel_url*, and the returned text is shown in the output
    text box.

    Args:
        channel_url: Address of the page to scan for "/watch?v=" links.
            Must be an http:// or https:// URL, because the value comes
            straight from user input.

    Returns:
        A newline-separated report with one line per video, or a single
        explanatory line when the input is unusable or the page cannot be
        read. Each line is also printed to stdout.
    """
    channel_url = (channel_url or "").strip()
    if not channel_url:
        return "Please enter a YouTube channel URL."
    # Reject file://, ftp:// and friends: urlopen would happily open them.
    if not channel_url.lower().startswith(("http://", "https://")):
        return "Invalid channel URL (expected http:// or https://): " + channel_url

    try:
        channel_html = _fetch_text(channel_url)
    except (OSError, ValueError) as err:
        return "Could not open channel page " + channel_url + ": " + str(err)

    # A channel page links the same video several times; keep the first of each.
    video_ids = list(dict.fromkeys(_VIDEO_ID_RE.findall(channel_html)))
    if not video_ids:
        return "No video links found on " + channel_url

    os.makedirs(_TRANSCRIPT_DIR, exist_ok=True)

    report = []
    for video_id in video_ids:
        downloaded = False
        try:
            downloaded = _download_transcript(video_id)
        except (OSError, ValueError) as err:
            # One unreachable video must not abort the remaining downloads.
            message = "Failed to fetch transcript for video " + video_id + ": " + str(err)
        else:
            if downloaded:
                message = "Downloading transcript for video " + video_id + "..."
            else:
                message = "Transcript not available for video " + video_id
        print(message)
        report.append(message)
        if downloaded:
            time.sleep(_DOWNLOAD_DELAY_SECONDS)

    return "\n".join(report)

# Build the text-in / text-out interface around transcript_extract and start
# the server. `share` is an option of launch(), not of the Interface
# constructor, so it is passed there to request a public share link.
gr.Interface(fn=transcript_extract, inputs="textbox", outputs="textbox").launch(share=True)