cowrycode committed on
Commit
aea337a
·
verified ·
1 Parent(s): 81917a3

Create youtube_tool.py

Browse files
Files changed (1) hide show
  1. youtube_tool.py +68 -0
youtube_tool.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urllib.parse import parse_qs, urlparse
2
+ from llama_index.core.tools import FunctionTool
3
+ from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
4
+ from llama_index.readers.youtube_transcript.utils import is_youtube_video
5
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
6
+
7
# Module-level clients, created once at import time and shared by the
# functions below.
# NOTE(review): `loader` is not referenced by the code visible here; it is kept
# in case other modules import it.
loader: YoutubeTranscriptReader = YoutubeTranscriptReader()
yt_ap: YouTubeTranscriptApi = YouTubeTranscriptApi()
9
+
10
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID contained in *url*.

    Recognised inputs:
      - https://www.youtube.com/watch?v=VIDEO_ID (extra query params allowed)
      - the same on m.youtube.com, music.youtube.com and youtube-nocookie.com
      - https://www.youtube.com/embed/VIDEO_ID, /shorts/VIDEO_ID,
        /live/VIDEO_ID and /v/VIDEO_ID
      - https://youtu.be/VIDEO_ID
      - a bare VIDEO_ID

    Args:
        url: A YouTube URL or a bare video ID. Surrounding whitespace is
            ignored.

    Returns:
        The extracted video ID. Input that matches none of the shapes above
        falls back to the URL path with leading/trailing slashes removed,
        which may be an empty string.
    """
    youtube_hosts = {
        "youtube.com",
        "www.youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "youtube-nocookie.com",
        "www.youtube-nocookie.com",
    }
    # Path layouts of the form /<prefix>/<VIDEO_ID>.
    id_path_prefixes = {"embed", "shorts", "live", "v"}

    parsed = urlparse(url.strip())

    if parsed.hostname in youtube_hosts:
        # parse_qs drops blank values, so "?v=" does not yield an empty ID.
        video_ids = parse_qs(parsed.query).get("v")
        if video_ids:
            return video_ids[0]

        segments = [part for part in parsed.path.split("/") if part]
        if len(segments) >= 2 and segments[0] in id_path_prefixes:
            return segments[1]

    # Fallback for youtu.be links and raw IDs: the path itself is the ID.
    return parsed.path.strip("/")
24
+
25
def fetch_youtube_transcript(video_url: str) -> str:
    """Fetch the English transcript of a YouTube video as plain text.

    Args:
        video_url: URL of the YouTube video (a bare video ID also works).

    Returns:
        The text of every transcript snippet joined with single spaces. If
        the transcript cannot be retrieved, a message starting with
        "Error fetching video details:" is returned instead of raising.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        # Nothing to look up; skip the network call and report it directly.
        return "Error fetching video details: no video ID found in the given URL"

    try:
        # `fetch` is called on the shared module-level `yt_ap` instance.
        # More language codes can be added to the list; yt_ap.list(video_id)
        # reports which languages a video offers.
        transcript = yt_ap.fetch(video_id=video_id, languages=["en"])
        return " ".join(snippet.text for snippet in transcript)
    except Exception as exc:
        # Deliberately broad: this function is wrapped as a tool below, so
        # failures are reported as text rather than propagated.
        return f"Error fetching video details: {exc}"
45
+
46
def fetch_youtube_transcript_snippets(video_url: str) -> str:
    """Fetch the English transcript of a YouTube video with per-snippet timing.

    Every snippet is rendered as
    "Text: <text> Duration: <duration> StartTime: <start> <End>"
    and the rendered snippets are joined with single spaces. No end time is
    emitted; it can be derived as start + duration.

    Args:
        video_url: URL of the YouTube video (a bare video ID also works).

    Returns:
        The rendered snippets as one string. If the transcript cannot be
        retrieved, a message starting with "Error fetching video details:"
        is returned instead of raising.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        # Nothing to look up; skip the network call and report it directly.
        return "Error fetching video details: no video ID found in the given URL"

    try:
        # `fetch` is called on the shared module-level `yt_ap` instance.
        # More language codes can be added to the list; yt_ap.list(video_id)
        # reports which languages a video offers.
        transcript = yt_ap.fetch(video_id=video_id, languages=["en"])
        return " ".join(
            f"Text: {snippet.text} Duration: {snippet.duration} StartTime: {snippet.start} <End>"
            for snippet in transcript
        )
    except Exception as exc:
        # Deliberately broad: this function is wrapped as a tool below, so
        # failures are reported as text rather than propagated.
        return f"Error fetching video details: {exc}"
66
+
67
# Expose the two transcript fetchers as LlamaIndex tools.
# NOTE(review): no explicit name/description is passed, so the tool metadata
# presumably comes from each function's name and docstring - confirm.
youtube_transcript_tool = FunctionTool.from_defaults(fn=fetch_youtube_transcript)
youtube_transcript_snippet_tool = FunctionTool.from_defaults(
    fn=fetch_youtube_transcript_snippets
)