Yannael_LB committed on
Commit
ea1af87
·
1 Parent(s): eade312
Files changed (2) hide show
  1. app.py +131 -5
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,9 +1,135 @@
1
  import gradio as gr
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="textbox", outputs="textbox")
7
 
8
- if __name__ == "__main__":
9
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import os
3
+ import json
4
 
5
+ from youtube_transcript_api import YouTubeTranscriptApi
 
6
 
7
+ from openai import OpenAI
8
 
9
+ import numpy as np
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+
13
def gradio_video_id_to_transcript(video_id):
    """Fetch the English transcript for a video id and build a short preview.

    Args:
        video_id: Identifier passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        A dict keyed by Gradio components: ``output_transcript`` receives a
        JSON preview of the first ten entries (``start`` and ``text`` only)
        followed by ``'...'``; ``gv_transcript`` receives the full transcript.
    """
    full_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # The preview keeps only 'start' and 'text' for the first ten entries.
    preview_entries = []
    for segment in full_transcript[:10]:
        preview_entries.append({'start': segment['start'], 'text': segment['text']})

    preview_text = json.dumps(preview_entries, indent=2) + '...'

    return {
        output_transcript: preview_text,
        gv_transcript: full_transcript,
    }
21
+
22
def gradio_transcript_to_paragraphs(gv_transcript_value):
    """Convert the stored transcript into paragraphs and build a short preview.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs`` receives a
        JSON preview of the first four paragraphs followed by ``'...'``;
        ``gv_paragraphs`` receives the full result.

    Raises:
        gr.Error: If no transcript has been stored yet.
    """
    # The state holds None until the transcript step has run; tell the user
    # what to do instead of failing somewhere inside the helper.
    if gv_transcript_value is None:
        raise gr.Error("No transcript available. Run 'Get transcript' first.")

    # NOTE(review): transcript_to_paragraphs, openai_client and openai_model
    # are not defined in the visible part of this file - confirm they are
    # provided elsewhere.
    # The helper also returns token counts and a price, which are unused here.
    paragraphs, _nb_input_tokens, _nb_output_tokens, _price = transcript_to_paragraphs(
        gv_transcript_value, openai_client, openai_model, chunk_size=5000
    )

    paragraphs_formatted_str = json.dumps(paragraphs[0:4], indent=2) + '...'

    return {
        output_paragraphs: paragraphs_formatted_str,
        gv_paragraphs: paragraphs,
    }
31
+
32
def gradio_paragraphs_to_toc(gv_paragraphs_value):
    """Build a table of contents from the stored paragraphs, plus a preview.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.

    Returns:
        A dict keyed by Gradio components: ``output_toc`` receives a JSON
        preview of the first four table-of-contents entries followed by
        ``'...'``; ``gv_toc`` receives the full result.

    Raises:
        gr.Error: If no paragraphs have been stored yet.
    """
    # The state holds None until the paragraphs step has run; tell the user
    # what to do instead of failing somewhere inside the helper.
    if gv_paragraphs_value is None:
        raise gr.Error("No paragraphs available. Run 'Get paragraphs' first.")

    # NOTE(review): paragraphs_to_toc, openai_client and openai_model are not
    # defined in the visible part of this file - confirm they are provided
    # elsewhere.
    # The helper also returns token counts and a price, which are unused here.
    json_toc, _nb_input_tokens, _nb_output_tokens, _price = paragraphs_to_toc(
        gv_paragraphs_value, openai_client, openai_model, chunk_size=100
    )

    json_toc_formatted_str = json.dumps(json_toc[0:4], indent=2) + '...'

    return {
        output_toc: json_toc_formatted_str,
        gv_toc: json_toc,
    }
43
+
44
+
45
def gradio_get_paragraphs_timestamps(gv_transcript_value, gv_paragraphs_value):
    """Add timestamps to the stored paragraphs and build a short preview.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state.
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs_timestamps``
        receives a JSON preview of the first four paragraphs followed by
        ``'...'``; ``gv_paragraphs`` is overwritten with the helper's result.

    Raises:
        gr.Error: If the transcript or the paragraphs have not been stored yet.
    """
    # Both states hold None until their steps have run; report which one is
    # missing instead of failing somewhere inside the helper.
    if gv_transcript_value is None:
        raise gr.Error("No transcript available. Run 'Get transcript' first.")
    if gv_paragraphs_value is None:
        raise gr.Error("No paragraphs available. Run 'Get paragraphs' first.")

    # NOTE(review): add_timestamps_to_paragraphs is not defined in the visible
    # part of this file - confirm it is provided elsewhere.
    paragraphs = add_timestamps_to_paragraphs(
        gv_transcript_value, gv_paragraphs_value, num_words=50
    )

    paragraphs_formatted_str = json.dumps(paragraphs[0:4], indent=2) + '...'

    return {
        output_paragraphs_timestamps: paragraphs_formatted_str,
        gv_paragraphs: paragraphs,
    }
53
+
54
+
55
def gradio_get_chapters(gv_paragraphs_value, gv_toc_value):
    """Combine stored paragraphs and table of contents into chapters.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.
        gv_toc_value: Current value of the ``gv_toc`` state.

    Returns:
        A dict keyed by Gradio components: ``output_chapters`` receives a JSON
        preview of the first four chapters followed by ``'...'``;
        ``gv_chapters`` receives the full result.

    Raises:
        gr.Error: If the paragraphs or the table of contents have not been
            stored yet.
    """
    # Both states hold None until their steps have run; report which one is
    # missing instead of failing somewhere inside the helper.
    if gv_paragraphs_value is None:
        raise gr.Error("No paragraphs available. Run 'Get paragraphs' first.")
    if gv_toc_value is None:
        raise gr.Error("No table of contents available. Run 'Get table of contents' first.")

    # NOTE(review): get_chapters is not defined in the visible part of this
    # file - confirm it is provided elsewhere.
    chapters = get_chapters(gv_paragraphs_value, gv_toc_value)

    chapters_formatted_str = json.dumps(chapters[0:4], indent=2) + '...'

    return {
        output_chapters: chapters_formatted_str,
        gv_chapters: chapters,
    }
63
+
64
+
65
def gradio_get_markdown(gv_chapters_value):
    """Render the stored chapters as Markdown.

    Args:
        gv_chapters_value: Current value of the ``gv_chapters`` state.

    Returns:
        Whatever ``chapters_to_markdown`` returns for the stored chapters; the
        UI routes it to a ``gr.Markdown`` component.

    Raises:
        gr.Error: If no chapters have been stored yet.
    """
    # The state holds None until the chapters step has run; tell the user
    # what to do instead of failing somewhere inside the helper.
    if gv_chapters_value is None:
        raise gr.Error("No chapters available. Run 'Get chapters' first.")

    # NOTE(review): chapters_to_markdown is not defined in the visible part of
    # this file - confirm it is provided elsewhere.
    return chapters_to_markdown(gv_chapters_value)
70
+
71
# UI definition: one section per pipeline step. Each section has a button, a
# preview component, and (where needed) a gr.State holding the full result
# that later steps read as input.
with gr.Blocks() as app:

    # --- Step 1: fetch the transcript --------------------------------------
    gr.Markdown("## Get transcript")

    gv_transcript = gr.State()
    video_id_input = gr.Textbox(label="Video ID", value="ErnWZxJovaM")
    get_transcript_button = gr.Button("Get transcript")
    output_transcript = gr.Textbox(label="Transcript (JSON format - start, text)")

    get_transcript_button.click(
        fn=gradio_video_id_to_transcript,
        inputs=[video_id_input],
        outputs=[output_transcript, gv_transcript],
    )

    # --- Step 2: transcript -> paragraphs ----------------------------------
    gr.Markdown("## Transcript to paragraphs")

    gv_paragraphs = gr.State()
    get_paragraphs_button = gr.Button("Get paragraphs")
    output_paragraphs = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text)"
    )

    get_paragraphs_button.click(
        fn=gradio_transcript_to_paragraphs,
        inputs=[gv_transcript],
        outputs=[output_paragraphs, gv_paragraphs],
    )

    # --- Step 3: paragraphs -> table of contents ---------------------------
    gr.Markdown("## Get table of content")

    gv_toc = gr.State()
    get_toc_button = gr.Button("Get table of contents")
    output_toc = gr.Textbox(
        label="Table of content (JSON format - paragraph_number, title)"
    )

    get_toc_button.click(
        fn=gradio_paragraphs_to_toc,
        inputs=[gv_paragraphs],
        outputs=[output_toc, gv_toc],
    )

    # --- Step 4: add timestamps to paragraphs ------------------------------
    gr.Markdown("## Infer paragraph timestamps with TF-IDF")

    get_timestamps_button = gr.Button("Infer paragraph timestamps")
    output_paragraphs_timestamps = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text, start)"
    )

    # This step has no state of its own: its result is written back into
    # gv_paragraphs, replacing the value stored by step 2.
    get_timestamps_button.click(
        fn=gradio_get_paragraphs_timestamps,
        inputs=[gv_transcript, gv_paragraphs],
        outputs=[output_paragraphs_timestamps, gv_paragraphs],
    )

    # --- Step 5: paragraphs + table of contents -> chapters ----------------
    gr.Markdown("## Get chapters")

    gv_chapters = gr.State()
    get_chapters_button = gr.Button("Get chapters")
    output_chapters = gr.Textbox(label="Chapters (JSON format)")

    get_chapters_button.click(
        fn=gradio_get_chapters,
        inputs=[gv_paragraphs, gv_toc],
        outputs=[output_chapters, gv_chapters],
    )

    # --- Step 6: chapters -> Markdown --------------------------------------
    gr.Markdown("## Markdown formatting")

    get_markdown_button = gr.Button("Markdown formatting")
    output_markdown = gr.Markdown(label="Chapters (Markdown format)")

    get_markdown_button.click(
        fn=gradio_get_markdown,
        inputs=[gv_chapters],
        outputs=[output_markdown],
    )

# NOTE(review): the original indentation is lost in this view; launch is
# assumed to sit at module level, after the Blocks context closes - confirm.
app.launch(debug=True)
requirements.txt CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ youtube-transcript-api
2
+ openai
3
+ gradio