Selim321 committed on
Commit
8396eb2
1 Parent(s): 2182ae8

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +100 -0
  2. requirements.txt +70 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from gtts import gTTS
4
+ from urllib.parse import urlparse, parse_qs
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
+ import unicodedata
7
+ from deepmultilingualpunctuation import PunctuationModel
8
+ from transformers import pipeline
9
+
10
+
11
def _extract_video_id(url):
    """Return the video ID embedded in a YouTube URL.

    Supports the ``?v=<id>`` query form as well as the path-based forms
    ``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``
    and ``/v/<id>``.

    Raises:
        ValueError: If ``url`` is empty or no video ID can be found in it.
    """
    if not isinstance(url, str) or not url.strip():
        raise ValueError("A YouTube video URL is required")

    candidate = url.strip()
    # Without a scheme urlparse() puts the host into the path, which would
    # break the path-based lookups below.
    if "://" not in candidate:
        candidate = "https://" + candidate
    parsed_url = urlparse(candidate)

    # Classic form: https://www.youtube.com/watch?v=<id>
    query_ids = parse_qs(parsed_url.query).get("v")
    if query_ids and query_ids[0]:
        return query_ids[0]

    host = (parsed_url.hostname or "").lower()
    segments = [part for part in parsed_url.path.split("/") if part]

    # Short links: https://youtu.be/<id>
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]

    # Path-based forms: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    raise ValueError(f"Could not find a video ID in URL: {url!r}")


def _split_into_chunks(text, chunk_size):
    """Split ``text`` into pieces of at most ``chunk_size`` characters.

    Splits happen on whitespace so words are not cut in half; only a single
    word longer than ``chunk_size`` is broken. Returns an empty list for
    empty or whitespace-only text.
    """
    import textwrap

    return textwrap.wrap(
        text,
        width=chunk_size,
        break_long_words=True,
        break_on_hyphens=False,
    )


def summarize_video(url):
    """Summarize the transcript of a YouTube video.

    The transcript is downloaded, Unicode-normalized (NFKD), re-punctuated,
    split into chunks, and each chunk is summarized with the ``t5-base``
    summarization pipeline.

    Args:
        url: URL of the YouTube video.

    Returns:
        The chunk summaries joined by single spaces, or an empty string
        when the transcript contains no text.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
    """
    video_id = _extract_video_id(url)

    # Get the transcript
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    # Join all transcript entries into one paragraph
    video_transcript = " ".join(entry["text"] for entry in transcript)
    print("Text transcript created")

    print(video_transcript)

    # Text normalization
    normalized_transcript = unicodedata.normalize('NFKD', video_transcript)
    print("Text normalized")

    # Nothing to punctuate or summarize; skip loading the models entirely.
    if not normalized_transcript.strip():
        return ""

    # Add punctuation (on the normalized text, not the raw transcript)
    model = PunctuationModel()
    result = model.restore_punctuation(normalized_transcript)
    print("Punctuation restored")

    # SUMMARIZATION

    # instantiate the summarization pipeline
    summarization_pipeline = pipeline(
        "summarization",
        model="t5-base",  # you can choose a different model, depending on your requirements
        tokenizer="t5-base",  # you can choose a different tokenizer, depending on your requirements
    )

    # split the input text into smaller chunks
    # NOTE(review): 5000 characters may exceed the model's nominal input
    # length -- confirm whether a smaller chunk size or truncation is wanted.
    chunk_size = 5000
    chunks = _split_into_chunks(result, chunk_size)

    # summarize each chunk separately
    summaries = [
        summarization_pipeline(
            chunk, max_length=200, min_length=30, do_sample=False
        )[0]['summary_text']
        for chunk in chunks
    ]

    # combine the summaries of all chunks into a single summary
    return " ".join(summaries)
66
+
67
# Streamlit front end: asks for a YouTube URL, shows the text summary and
# offers the summary as a downloadable MP3.
from io import BytesIO  # local to this script section; used for the in-memory MP3

# Define the Streamlit app
st.title("YouTube Summarizer")

# Define the input form
form = st.form(key="input_form")

# Get the video URL from the user
video_url = form.text_input("Enter a YouTube video URL")

# Submit button
submit_button = form.form_submit_button("Summarize Video")

# Handle form submissions
if submit_button:
    if not video_url.strip():
        # Avoid calling the summarizer with an empty URL.
        st.warning("Please enter a YouTube video URL.")
    else:
        summary = None
        try:
            # Call the summarize_video function to get the summary
            summary = summarize_video(video_url)
        except (KeyError, ValueError) as exc:
            # A URL without a usable video ID surfaces as one of these.
            st.error(f"Could not read a video ID from that URL: {exc}")

        if summary is not None:
            # Display the summary to the user
            st.subheader("Summary")
            st.write(summary)

            # gTTS cannot synthesize empty text, so only offer audio when
            # there is something to read out.
            if summary.strip():
                # Convert text summary into audio. The MP3 is kept in memory
                # instead of a fixed file on disk so that concurrent sessions
                # cannot overwrite each other's audio.
                print("converting text to audio")
                audio_buffer = BytesIO()
                gTTS(summary).write_to_fp(audio_buffer)

                # Download audio transcript
                st.download_button(
                    'Download mp3',
                    audio_buffer.getvalue(),
                    file_name='hello.mp3',
                    mime='audio/mpeg',
                )
96
+
97
+
98
+
99
+
100
+
requirements.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==4.2.2
2
+ anyio==3.6.2
3
+ attrs==23.1.0
4
+ blinker==1.6.2
5
+ cachetools==5.3.0
6
+ certifi==2022.12.7
7
+ charset-normalizer==3.1.0
8
+ click==8.1.3
9
+ decorator==5.1.1
10
+ deepmultilingualpunctuation==1.0.1
11
+ entrypoints==0.4
12
+ fastapi==0.95.1
13
+ filelock==3.12.0
14
+ fsspec==2023.4.0
15
+ gitdb==4.0.10
16
+ GitPython==3.1.31
17
+ gTTS==2.3.2
18
+ h11==0.14.0
19
+ huggingface-hub==0.14.1
20
+ idna==3.4
21
+ importlib-metadata==6.6.0
22
+ Jinja2==3.1.2
23
+ jsonschema==4.17.3
24
+ markdown-it-py==2.2.0
25
+ MarkupSafe==2.1.2
26
+ mdurl==0.1.2
27
+ mpmath==1.3.0
28
+ networkx==3.1
29
+ numpy==1.24.3
30
+ packaging==23.1
31
+ pandas==2.0.1
32
+ Pillow==9.5.0
33
+ protobuf==3.20.1
34
+ pyarrow==12.0.0
35
+ pydantic==1.10.7
36
+ pydeck==0.8.1b0
37
+ Pygments==2.15.1
38
+ Pympler==1.0.1
39
+ pyrsistent==0.19.3
40
+ python-dateutil==2.8.2
41
+ pytz==2023.3
42
+ pytz-deprecation-shim==0.1.0.post0
43
+ PyYAML==6.0
44
+ regex==2023.5.5
45
+ requests==2.30.0
46
+ rich==13.3.5
47
+ sentencepiece==0.1.99
48
+ six==1.16.0
49
+ smmap==5.0.0
50
+ sniffio==1.3.0
51
+ starlette==0.26.1
52
+ streamlit==1.22.0
53
+ sympy==1.11.1
54
+ tenacity==8.2.2
55
+ tokenizers==0.13.3
56
+ toml==0.10.2
57
+ toolz==0.12.0
58
+ torch==2.0.0
59
+ tornado==6.3.1
60
+ tqdm==4.65.0
61
+ transformers==4.28.1
62
+ typing_extensions==4.5.0
63
+ tzdata==2023.3
64
+ tzlocal==4.3
65
+ urllib3==2.0.2
66
+ uvicorn==0.22.0
67
+ validators==0.20.0
68
+ watchdog==3.0.0
69
+ youtube-transcript-api==0.6.0
70
+ zipp==3.15.0