Spaces:
Running
Running
Yannael_LB
committed on
Commit
·
ea1af87
1
Parent(s):
eade312
Update
Browse files- app.py +131 -5
- requirements.txt +3 -0
app.py
CHANGED
@@ -1,9 +1,135 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
return "Hello " + name + "!"
|
5 |
|
6 |
-
|
7 |
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import os
|
3 |
+
import json
|
4 |
|
5 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
6 |
|
7 |
+
from openai import OpenAI
|
8 |
|
9 |
+
import numpy as np
|
10 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
11 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
12 |
+
|
13 |
+
def gradio_video_id_to_transcript(video_id):
    """Fetch a video's English transcript and prepare it for the UI.

    Calls ``YouTubeTranscriptApi.get_transcript`` with ``languages=["en"]``.

    Returns a dict keyed by Gradio components:
      * ``output_transcript`` -> JSON preview of the first ten entries
        (only their ``start`` and ``text`` keys), followed by ``'...'``.
      * ``gv_transcript`` -> the complete transcript, unmodified.
    """
    full_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # The preview keeps just the first entries and drops every key other
    # than 'start' and 'text'.
    preview_entries = []
    for item in full_transcript[:10]:
        preview_entries.append({'start': item['start'], 'text': item['text']})

    preview_json = json.dumps(preview_entries, indent=2) + '...'

    return {
        output_transcript: preview_json,
        gv_transcript: full_transcript,
    }
|
21 |
+
|
22 |
+
def gradio_transcript_to_paragraphs(gv_transcript_value):
    """Convert the stored transcript into paragraphs for the UI.

    Delegates to ``transcript_to_paragraphs``, passing the module-level
    ``openai_client`` and ``openai_model`` with ``chunk_size=5000``. That
    helper returns four values; only the paragraphs are used here, the two
    token counts and the price are discarded.

    Returns a dict keyed by Gradio components:
      * ``output_paragraphs`` -> JSON preview of the first four paragraphs,
        followed by ``'...'``.
      * ``gv_paragraphs`` -> the full paragraphs value.
    """
    paragraph_list, _input_tokens, _output_tokens, _cost = transcript_to_paragraphs(
        gv_transcript_value,
        openai_client,
        openai_model,
        chunk_size=5000,
    )

    preview_json = json.dumps(paragraph_list[:4], indent=2) + '...'

    return {
        output_paragraphs: preview_json,
        gv_paragraphs: paragraph_list,
    }
|
31 |
+
|
32 |
+
def gradio_paragraphs_to_toc(gv_paragraphs_value):
    """Build a table of contents from the stored paragraphs for the UI.

    Delegates to ``paragraphs_to_toc``, passing the module-level
    ``openai_client`` and ``openai_model`` with ``chunk_size=100``. That
    helper returns four values; only the table of contents is used here, the
    two token counts and the price are discarded.

    Returns a dict keyed by Gradio components:
      * ``output_toc`` -> JSON preview of the first four entries, followed
        by ``'...'``.
      * ``gv_toc`` -> the full table of contents.
    """
    toc_entries, _input_tokens, _output_tokens, _cost = paragraphs_to_toc(
        gv_paragraphs_value,
        openai_client,
        openai_model,
        chunk_size=100,
    )

    preview_json = json.dumps(toc_entries[:4], indent=2) + '...'

    return {
        output_toc: preview_json,
        gv_toc: toc_entries,
    }
|
43 |
+
|
44 |
+
|
45 |
+
def gradio_get_paragraphs_timestamps(gv_transcript_value, gv_paragraphs_value):
    """Attach timestamps to the stored paragraphs for the UI.

    Delegates to ``add_timestamps_to_paragraphs`` with ``num_words=50``.

    Returns a dict keyed by Gradio components:
      * ``output_paragraphs_timestamps`` -> JSON preview of the first four
        paragraphs, followed by ``'...'``.
      * ``gv_paragraphs`` -> the helper's result, which replaces the
        previously stored paragraphs state.
    """
    timestamped = add_timestamps_to_paragraphs(
        gv_transcript_value,
        gv_paragraphs_value,
        num_words=50,
    )

    preview_json = json.dumps(timestamped[:4], indent=2) + '...'

    return {
        output_paragraphs_timestamps: preview_json,
        gv_paragraphs: timestamped,
    }
|
53 |
+
|
54 |
+
|
55 |
+
def gradio_get_chapters(gv_paragraphs_value, gv_toc_value):
    """Combine the stored paragraphs and table of contents into chapters.

    Delegates to ``get_chapters``.

    Returns a dict keyed by Gradio components:
      * ``output_chapters`` -> JSON preview of the first four chapters,
        followed by ``'...'``.
      * ``gv_chapters`` -> the full chapters value.
    """
    chapter_list = get_chapters(gv_paragraphs_value, gv_toc_value)

    preview_json = json.dumps(chapter_list[:4], indent=2) + '...'

    return {
        output_chapters: preview_json,
        gv_chapters: chapter_list,
    }
|
63 |
+
|
64 |
+
|
65 |
+
def gradio_get_markdown(gv_chapters_value):
    """Return the stored chapters rendered by ``chapters_to_markdown``."""
    return chapters_to_markdown(gv_chapters_value)
|
70 |
+
|
71 |
+
# UI layout: one section per pipeline step. Each section has a button whose
# click handler reads the gr.State values written by earlier steps and writes
# both a preview component and a gr.State value of its own.
with gr.Blocks() as app:

    # --- Step 1: video ID -> transcript ---------------------------------
    gr.Markdown("## Get transcript")

    gv_transcript = gr.State()
    video_id_input = gr.Textbox(label="Video ID", value="ErnWZxJovaM")
    get_transcript_button = gr.Button("Get transcript")
    output_transcript = gr.Textbox(label="Transcript (JSON format - start, text)")

    get_transcript_button.click(
        gradio_video_id_to_transcript,
        inputs=[video_id_input],
        outputs=[output_transcript, gv_transcript],
    )

    # --- Step 2: transcript -> paragraphs -------------------------------
    gr.Markdown("## Transcript to paragraphs")

    gv_paragraphs = gr.State()
    get_paragraphs_button = gr.Button("Get paragraphs")
    output_paragraphs = gr.Textbox(label="Paragraphs (JSON format - paragraph_number, paragraph_text)")

    get_paragraphs_button.click(
        gradio_transcript_to_paragraphs,
        inputs=[gv_transcript],
        outputs=[output_paragraphs, gv_paragraphs],
    )

    # --- Step 3: paragraphs -> table of contents ------------------------
    gr.Markdown("## Get table of content")

    gv_toc = gr.State()
    get_toc_button = gr.Button("Get table of contents")
    output_toc = gr.Textbox(label="Table of content (JSON format - paragraph_number, title)")

    get_toc_button.click(
        gradio_paragraphs_to_toc,
        inputs=[gv_paragraphs],
        outputs=[output_toc, gv_toc],
    )

    # --- Step 4: add timestamps to the paragraphs -----------------------
    # This step writes back into gv_paragraphs rather than a new state.
    gr.Markdown("## Infer paragraph timestamps with TF-IDF")

    get_timestamps_button = gr.Button("Infer paragraph timestamps")
    output_paragraphs_timestamps = gr.Textbox(label="Paragraphs (JSON format - paragraph_number, paragraph_text, start)")

    get_timestamps_button.click(
        gradio_get_paragraphs_timestamps,
        inputs=[gv_transcript, gv_paragraphs],
        outputs=[output_paragraphs_timestamps, gv_paragraphs],
    )

    # --- Step 5: paragraphs + table of contents -> chapters -------------
    gr.Markdown("## Get chapters")

    gv_chapters = gr.State()
    get_chapters_button = gr.Button("Get chapters")
    output_chapters = gr.Textbox(label="Chapters (JSON format)")

    get_chapters_button.click(
        gradio_get_chapters,
        inputs=[gv_paragraphs, gv_toc],
        outputs=[output_chapters, gv_chapters],
    )

    # --- Step 6: chapters -> Markdown -----------------------------------
    gr.Markdown("## Markdown formatting")

    get_markdown_button = gr.Button("Markdown formatting")
    output_markdown = gr.Markdown(label="Chapters (Markdown format)")

    get_markdown_button.click(
        gradio_get_markdown,
        inputs=[gv_chapters],
        outputs=[output_markdown],
    )


app.launch(debug=True)
|
requirements.txt
CHANGED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
youtube-transcript-api
|
2 |
+
openai
|
3 |
+
gradio
|