Codex committed on
Commit
e6f021c
·
1 Parent(s): 1d7c2c1

Add Space-only YouTube fallback strategies

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -0
  2. README.md +25 -0
  3. app.py +362 -18
  4. requirements.txt +1 -0
Dockerfile CHANGED
@@ -5,6 +5,7 @@ WORKDIR /app
5
  RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
 
8
  git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
 
5
  RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
8
+ ffmpeg \
9
  git \
10
  && rm -rf /var/lib/apt/lists/*
11
 
README.md CHANGED
@@ -32,3 +32,28 @@ YouTube transcript loading may work locally but fail on Hugging Face Spaces beca
32
  - `YOUTUBE_HTTPS_PROXY`
33
 
34
  You can also use the standard `HTTP_PROXY` and `HTTPS_PROXY` environment variables if that matches your setup.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  - `YOUTUBE_HTTPS_PROXY`
33
 
34
  You can also use the standard `HTTP_PROXY` and `HTTPS_PROXY` environment variables if that matches your setup.
35
+
36
+ ## Space-Only YouTube Fallbacks
37
+
38
+ The Hugging Face Space version now supports multiple YouTube retrieval strategies:
39
+
40
+ - Direct transcript fetch
41
+ - External transcript API
42
+ - Audio transcription via `yt-dlp` + Groq Whisper
43
+ - Manual transcript paste/upload
44
+
45
+ ### Optional secrets for external transcript API
46
+
47
+ - `YOUTUBE_TRANSCRIPT_API_URL`
48
+ - `YOUTUBE_TRANSCRIPT_API_KEY`
49
+ - `YOUTUBE_TRANSCRIPT_API_METHOD` (`GET` or `POST`, default `GET`)
50
+ - `YOUTUBE_TRANSCRIPT_API_KEY_HEADER` (default `Authorization`)
51
+ - `YOUTUBE_TRANSCRIPT_API_TIMEOUT` (default `45`)
52
+
53
+ `YOUTUBE_TRANSCRIPT_API_URL` may contain placeholders such as `{video_id}`, `{url}`, and `{language_code}`.
54
+
55
+ ### Optional secrets for Groq audio transcription fallback
56
+
57
+ - `GROQ_AUDIO_TRANSCRIPTION_MODEL`
58
+
59
+ Default model: `whisper-large-v3-turbo`
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
 
2
  from io import BytesIO
3
- from urllib.parse import urlparse
4
  from xml.etree import ElementTree as ET
5
  from zipfile import ZipFile
6
 
@@ -25,7 +26,7 @@ from youtube_transcript_api import YouTubeTranscriptApi
25
 
26
  load_dotenv()
27
 
28
- APP_VERSION = "2026-04-23-hf-youtube-fix-2"
29
  SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
30
  LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
31
  LANGUAGE_CODE_MAP = {
@@ -46,6 +47,12 @@ YOUTUBE_PROXY_ENV_VARS = (
46
  "HTTP_PROXY",
47
  "HTTPS_PROXY",
48
  )
 
 
 
 
 
 
49
 
50
  st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="📝")
51
  st.title("📝 Summarize Text From PDF, YouTube, Website")
@@ -79,6 +86,8 @@ if "youtube_transcript_source_url" not in st.session_state:
79
  st.session_state.youtube_transcript_source_url = ""
80
  if "youtube_transcript_language_label" not in st.session_state:
81
  st.session_state.youtube_transcript_language_label = "Original"
 
 
82
 
83
  summary_language = "Original"
84
  transcript_language = "Original"
@@ -136,6 +145,9 @@ with st.sidebar:
136
 
137
  generic_url = ""
138
  uploaded_files = []
 
 
 
139
 
140
  if input_source_mode in {"URL", "Both"}:
141
  st.markdown('<div class="source-section-label">Summarize URL</div>', unsafe_allow_html=True)
@@ -161,6 +173,43 @@ if input_source_mode in {"Upload documents", "Both"}:
161
  "Uploaded files: " + ", ".join(uploaded_file.name for uploaded_file in uploaded_files)
162
  )
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=groq_api_key)
165
 
166
  REQUEST_HEADERS = {
@@ -170,12 +219,6 @@ REQUEST_HEADERS = {
170
  "Referer": "https://www.google.com/",
171
  }
172
 
173
-
174
- def _is_youtube_url(url: str) -> bool:
175
- host = urlparse(url).netloc.lower()
176
- return "youtube.com" in host or "youtu.be" in host
177
-
178
-
179
  def _summary_language_instruction(selected_language: str) -> str:
180
  if selected_language == "Original":
181
  return "Write the summary in the original language of the source content. If the source is mixed-language, use the dominant language."
@@ -447,6 +490,14 @@ def _make_transcript_filename(url: str) -> str:
447
  return f"youtube_transcript_{video_id}.txt"
448
 
449
 
 
 
 
 
 
 
 
 
450
  def _store_youtube_transcript(url: str, docs: list[Document]) -> None:
451
  st.session_state.youtube_transcript_text = "\n\n".join(
452
  doc.page_content for doc in docs if doc.page_content.strip()
@@ -457,6 +508,285 @@ def _store_youtube_transcript(url: str, docs: list[Document]) -> None:
457
  "transcript_language_label",
458
  docs[0].metadata.get("language", "Original"),
459
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
 
461
 
462
  def _has_meaningful_content(docs: list[Document], min_chars: int = 300) -> bool:
@@ -690,13 +1020,21 @@ if input_source_mode in {"URL", "Both"} and _is_youtube_url(generic_url):
690
  else:
691
  try:
692
  with st.spinner("Loading transcript..."):
693
- docs = _load_youtube_documents(generic_url, transcript_language)
 
 
 
 
 
 
694
  if not docs:
695
  st.error("No transcript could be extracted from the provided YouTube video.")
696
  else:
697
  _store_youtube_transcript(generic_url, docs)
698
  st.success(
699
- f"Transcript ready for export in {st.session_state.youtube_transcript_language_label}."
 
 
700
  )
701
  except Exception as transcript_err:
702
  st.error(f"Failed to load YouTube transcript: {transcript_err}")
@@ -705,7 +1043,11 @@ if input_source_mode in {"URL", "Both"} and _is_youtube_url(generic_url):
705
  st.session_state.youtube_transcript_text
706
  and st.session_state.youtube_transcript_source_url == generic_url
707
  ):
708
- st.caption(f"Prepared transcript: `{st.session_state.youtube_transcript_language_label}`")
 
 
 
 
709
  st.download_button(
710
  "Export transcript",
711
  data=st.session_state.youtube_transcript_text,
@@ -733,15 +1075,19 @@ if st.button("Summarize content"):
733
  if input_source_mode in {"URL", "Both"} and generic_url.strip():
734
  if _is_youtube_url(generic_url):
735
  try:
736
- url_docs = _load_youtube_documents(generic_url, transcript_language)
 
 
 
 
 
 
737
  _store_youtube_transcript(generic_url, url_docs)
738
  except Exception as load_err:
739
  st.error(f"Failed to load YouTube transcript: {load_err}")
740
  st.stop()
741
  else:
742
- st.session_state.youtube_transcript_text = ""
743
- st.session_state.youtube_transcript_name = "youtube_transcript.txt"
744
- st.session_state.youtube_transcript_source_url = ""
745
  try:
746
  url_docs = _load_web_documents(generic_url)
747
  except Exception as load_err:
@@ -750,9 +1096,7 @@ if st.button("Summarize content"):
750
 
751
  docs.extend(url_docs)
752
  else:
753
- st.session_state.youtube_transcript_text = ""
754
- st.session_state.youtube_transcript_name = "youtube_transcript.txt"
755
- st.session_state.youtube_transcript_source_url = ""
756
 
757
  if input_source_mode in {"Upload documents", "Both"} and uploaded_files:
758
  try:
 
1
  import os
2
+ import tempfile
3
  from io import BytesIO
4
+ from urllib.parse import quote_plus, urlparse
5
  from xml.etree import ElementTree as ET
6
  from zipfile import ZipFile
7
 
 
26
 
27
  load_dotenv()
28
 
29
+ APP_VERSION = "2026-04-23-hf-youtube-fallbacks-1"
30
  SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
31
  LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
32
  LANGUAGE_CODE_MAP = {
 
47
  "HTTP_PROXY",
48
  "HTTPS_PROXY",
49
  )
50
+ YOUTUBE_AUDIO_EXTENSIONS = (".m4a", ".mp3", ".mp4", ".mpeg", ".mpga", ".ogg", ".wav", ".webm")
51
+
52
+
53
+ def _is_youtube_url(url: str) -> bool:
54
+ host = urlparse(url).netloc.lower()
55
+ return "youtube.com" in host or "youtu.be" in host
56
 
57
  st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="📝")
58
  st.title("📝 Summarize Text From PDF, YouTube, Website")
 
86
  st.session_state.youtube_transcript_source_url = ""
87
  if "youtube_transcript_language_label" not in st.session_state:
88
  st.session_state.youtube_transcript_language_label = "Original"
89
+ if "youtube_transcript_source_mode" not in st.session_state:
90
+ st.session_state.youtube_transcript_source_mode = ""
91
 
92
  summary_language = "Original"
93
  transcript_language = "Original"
 
145
 
146
  generic_url = ""
147
  uploaded_files = []
148
+ youtube_source_mode = "Auto"
149
+ manual_transcript_text = ""
150
+ manual_transcript_file = None
151
 
152
  if input_source_mode in {"URL", "Both"}:
153
  st.markdown('<div class="source-section-label">Summarize URL</div>', unsafe_allow_html=True)
 
173
  "Uploaded files: " + ", ".join(uploaded_file.name for uploaded_file in uploaded_files)
174
  )
175
 
176
+ if input_source_mode in {"URL", "Both"} and generic_url.strip() and _is_youtube_url(generic_url):
177
+ st.markdown('<div class="source-section-label">YouTube Fallback Options</div>', unsafe_allow_html=True)
178
+ youtube_source_mode = st.radio(
179
+ "YouTube transcript source",
180
+ options=[
181
+ "Auto",
182
+ "Direct transcript",
183
+ "External transcript API",
184
+ "Audio transcription (yt-dlp + Groq)",
185
+ "Manual transcript",
186
+ ],
187
+ index=0,
188
+ help=(
189
+ "`Auto` tries direct transcript first, then external API, then yt-dlp + Groq audio transcription. "
190
+ "`Manual transcript` lets you paste or upload transcript text."
191
+ ),
192
+ )
193
+ if youtube_source_mode == "Manual transcript":
194
+ manual_transcript_text = st.text_area(
195
+ "Paste transcript",
196
+ height=220,
197
+ placeholder="Paste the YouTube transcript here if direct fetching is blocked.",
198
+ )
199
+ manual_transcript_file = st.file_uploader(
200
+ "Upload transcript file",
201
+ type=["txt", "md", "csv", "srt", "vtt"],
202
+ help="Upload a transcript file to summarize when direct YouTube access is blocked.",
203
+ )
204
+ else:
205
+ configured_modes = []
206
+ if any(os.getenv(var_name) for var_name in YOUTUBE_PROXY_ENV_VARS):
207
+ configured_modes.append("direct transcript via proxy")
208
+ if os.getenv("YOUTUBE_TRANSCRIPT_API_URL"):
209
+ configured_modes.append("external transcript API")
210
+ configured_modes.append("audio transcription via yt-dlp + Groq")
211
+ st.caption("Available fallbacks: " + ", ".join(configured_modes) + ".")
212
+
213
  llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=groq_api_key)
214
 
215
  REQUEST_HEADERS = {
 
219
  "Referer": "https://www.google.com/",
220
  }
221
 
 
 
 
 
 
 
222
  def _summary_language_instruction(selected_language: str) -> str:
223
  if selected_language == "Original":
224
  return "Write the summary in the original language of the source content. If the source is mixed-language, use the dominant language."
 
490
  return f"youtube_transcript_{video_id}.txt"
491
 
492
 
493
def _reset_youtube_transcript_state() -> None:
    """Put every YouTube-transcript entry of the Streamlit session back to its blank default."""
    blank_state = {
        "youtube_transcript_text": "",
        "youtube_transcript_name": "youtube_transcript.txt",
        "youtube_transcript_source_url": "",
        "youtube_transcript_language_label": "Original",
        "youtube_transcript_source_mode": "",
    }
    # Attribute assignment on ``st.session_state``, applied in declaration order.
    for state_key, default_value in blank_state.items():
        setattr(st.session_state, state_key, default_value)
499
+
500
+
501
  def _store_youtube_transcript(url: str, docs: list[Document]) -> None:
502
  st.session_state.youtube_transcript_text = "\n\n".join(
503
  doc.page_content for doc in docs if doc.page_content.strip()
 
508
  "transcript_language_label",
509
  docs[0].metadata.get("language", "Original"),
510
  )
511
+ st.session_state.youtube_transcript_source_mode = docs[0].metadata.get(
512
+ "transcript_source_mode",
513
+ "Direct transcript",
514
+ )
515
+
516
+
517
+ def _normalize_transcript_text(raw_text: str) -> str:
518
+ lines = [line.strip() for line in raw_text.splitlines()]
519
+ return "\n".join(line for line in lines if line)
520
+
521
+
522
+ def _read_uploaded_text_file(uploaded_file) -> str:
523
+ return uploaded_file.getvalue().decode("utf-8", errors="ignore").strip()
524
+
525
+
526
def _build_transcript_documents(
    url: str,
    transcript_text: str,
    language_label: str,
    source_mode: str,
) -> list[Document]:
    """Wrap transcript text in a one-element list of ``Document``.

    Args:
        url: Video URL; stored as ``source`` and used to derive ``video_id``.
        transcript_text: Raw transcript; normalized before being stored.
        language_label: Value stored under ``transcript_language_label``.
        source_mode: Value stored under ``transcript_source_mode``.

    Raises:
        ValueError: If nothing is left of the transcript after normalization.
    """
    cleaned_text = _normalize_transcript_text(transcript_text)
    if not cleaned_text:
        raise ValueError("Transcript text is empty.")

    doc_metadata = {
        "source": url,
        "video_id": YoutubeLoader.extract_video_id(url),
        "transcript_language_label": language_label,
        "transcript_source_mode": source_mode,
    }
    transcript_doc = Document(page_content=cleaned_text, metadata=doc_metadata)
    return [transcript_doc]
547
+
548
+
549
def _load_manual_transcript_documents(
    url: str,
    selected_language: str,
    transcript_text: str,
    transcript_file,
) -> list[Document]:
    """Build transcript documents from pasted text and/or an uploaded file.

    Pasted text comes first, followed by the uploaded file's content. When
    *selected_language* is not ``"Original"`` the documents are passed through
    ``_translate_documents_with_llm`` and relabelled accordingly.

    Raises:
        ValueError: If neither source supplies any non-blank text.
    """
    pasted_text = transcript_text.strip()
    uploaded_text = (
        _read_uploaded_text_file(transcript_file) if transcript_file is not None else ""
    )

    merged_text = "\n\n".join(
        piece for piece in (pasted_text, uploaded_text) if piece.strip()
    )
    if not merged_text.strip():
        raise ValueError("Please paste a transcript or upload a transcript file.")

    documents = _build_transcript_documents(
        url,
        merged_text,
        "Original",
        "Manual transcript",
    )
    if selected_language == "Original":
        return documents

    translated = _translate_documents_with_llm(documents, selected_language)
    for translated_doc in translated:
        translated_doc.metadata["transcript_language_label"] = f"{selected_language} (LLM translated)"
    return translated
576
+
577
+
578
+ def _extract_transcript_text_from_payload(payload) -> str:
579
+ if isinstance(payload, str):
580
+ return payload.strip()
581
+
582
+ if isinstance(payload, list):
583
+ text_parts = []
584
+ for item in payload:
585
+ extracted = _extract_transcript_text_from_payload(item)
586
+ if extracted:
587
+ text_parts.append(extracted)
588
+ return "\n".join(part for part in text_parts if part)
589
+
590
+ if isinstance(payload, dict):
591
+ for key in ("text", "transcript", "content", "full_text", "body"):
592
+ value = payload.get(key)
593
+ if isinstance(value, str) and value.strip():
594
+ return value.strip()
595
+
596
+ for key in ("data", "result", "results", "transcription", "response"):
597
+ if key in payload:
598
+ extracted = _extract_transcript_text_from_payload(payload[key])
599
+ if extracted:
600
+ return extracted
601
+
602
+ for key in ("segments", "items", "captions", "chunks", "utterances"):
603
+ value = payload.get(key)
604
+ if isinstance(value, list):
605
+ extracted = _extract_transcript_text_from_payload(value)
606
+ if extracted:
607
+ return extracted
608
+
609
+ return ""
610
+
611
+
612
def _load_youtube_documents_via_external_api(url: str, selected_language: str) -> list[Document]:
    """Fetch a transcript from the externally configured transcript API.

    Configuration is read from environment variables:
    ``YOUTUBE_TRANSCRIPT_API_URL`` (required; may contain ``{video_id}``,
    ``{url}`` and ``{language_code}`` placeholders),
    ``YOUTUBE_TRANSCRIPT_API_METHOD`` (``POST`` or anything else for GET),
    ``YOUTUBE_TRANSCRIPT_API_TIMEOUT`` (seconds, default 45),
    ``YOUTUBE_TRANSCRIPT_API_KEY`` and ``YOUTUBE_TRANSCRIPT_API_KEY_HEADER``
    (default ``Authorization``, sent as a ``Bearer`` token).

    Raises:
        ValueError: If the API URL is missing, its template is malformed, or
            the response holds no usable transcript text.
        requests.HTTPError: If the API answers with an error status.
    """
    default_timeout_seconds = 45

    api_url = os.getenv("YOUTUBE_TRANSCRIPT_API_URL", "").strip()
    if not api_url:
        raise ValueError(
            "External transcript API is not configured. Set `YOUTUBE_TRANSCRIPT_API_URL` in Space secrets."
        )

    video_id = YoutubeLoader.extract_video_id(url)
    language_code = LANGUAGE_CODE_MAP.get(selected_language, "")
    try:
        formatted_url = api_url.format(
            video_id=video_id,
            url=quote_plus(url),
            language_code=language_code,
        )
    except (KeyError, IndexError, ValueError) as exc:
        # Unknown placeholders or stray braces in the template would otherwise
        # surface as an opaque formatting error.
        raise ValueError(
            "`YOUTUBE_TRANSCRIPT_API_URL` contains an unsupported placeholder or unbalanced braces."
        ) from exc

    method = os.getenv("YOUTUBE_TRANSCRIPT_API_METHOD", "GET").strip().upper()

    # An empty or non-numeric timeout must not abort the request; fall back to
    # the documented default instead.
    raw_timeout = os.getenv("YOUTUBE_TRANSCRIPT_API_TIMEOUT", "").strip()
    try:
        timeout_seconds = int(raw_timeout) if raw_timeout else default_timeout_seconds
    except ValueError:
        timeout_seconds = default_timeout_seconds
    if timeout_seconds <= 0:
        timeout_seconds = default_timeout_seconds

    api_key = os.getenv("YOUTUBE_TRANSCRIPT_API_KEY", "").strip()
    # A variable set to an empty string would yield an invalid (blank) header name.
    api_key_header = os.getenv("YOUTUBE_TRANSCRIPT_API_KEY_HEADER", "").strip() or "Authorization"

    headers = {"Accept": "application/json"}
    if api_key:
        if api_key_header.lower() == "authorization":
            headers[api_key_header] = f"Bearer {api_key}"
        else:
            headers[api_key_header] = api_key

    payload = {
        "video_id": video_id,
        "url": url,
        "language": language_code or None,
    }

    if method == "POST":
        response = requests.post(formatted_url, json=payload, headers=headers, timeout=timeout_seconds)
    else:
        response = requests.get(formatted_url, params=payload, headers=headers, timeout=timeout_seconds)
    response.raise_for_status()

    # Non-JSON responses are treated as plain transcript text.
    try:
        parsed_payload = response.json()
    except ValueError:
        parsed_payload = response.text

    transcript_text = _extract_transcript_text_from_payload(parsed_payload)
    if not transcript_text:
        raise ValueError("External transcript API response did not contain usable transcript text.")

    # The label is simply the requested language ("Original" included).
    return _build_transcript_documents(
        url,
        transcript_text,
        selected_language,
        "External transcript API",
    )
670
+
671
+
672
def _download_youtube_audio(url: str, video_id: str) -> str:
    """Download the audio track of a YouTube video with yt-dlp.

    The download happens in a throw-away directory; the largest file with a
    supported audio extension is then copied to a uniquely named temporary
    file that outlives this function.

    Args:
        url: Video URL handed to yt-dlp.
        video_id: Used as the file-name stem of the download and as the prefix
            of the persisted file.

    Returns:
        Path of the persisted audio file. The caller is responsible for
        deleting it.

    Raises:
        RuntimeError: If yt-dlp is not installed or produced no supported
            audio file.
    """
    import shutil

    try:
        import yt_dlp
    except ImportError as exc:
        raise RuntimeError("`yt-dlp` is not installed in this Space build.") from exc

    with tempfile.TemporaryDirectory() as temp_dir:
        output_template = os.path.join(temp_dir, f"{video_id}.%(ext)s")
        ydl_opts = {
            "format": "bestaudio[ext=m4a]/bestaudio[ext=webm]/bestaudio/best",
            "outtmpl": output_template,
            "quiet": True,
            "no_warnings": True,
            "noprogress": True,
            "skip_download": False,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)

        audio_files = [
            os.path.join(temp_dir, file_name)
            for file_name in os.listdir(temp_dir)
            if os.path.splitext(file_name)[1].lower() in YOUTUBE_AUDIO_EXTENSIONS
        ]
        if not audio_files:
            raise RuntimeError("yt-dlp did not produce a supported audio file for transcription.")

        source_path = max(audio_files, key=os.path.getsize)

        # A unique target name keeps concurrent sessions that request the same
        # video from overwriting or deleting each other's file; the extension
        # is preserved for consumers that rely on the file name.
        file_descriptor, persisted_path = tempfile.mkstemp(
            prefix=f"{video_id}_",
            suffix=os.path.splitext(source_path)[1],
        )
        try:
            # Stream the copy instead of loading the whole file into memory.
            with os.fdopen(file_descriptor, "wb") as target_file, open(source_path, "rb") as source_file:
                shutil.copyfileobj(source_file, target_file)
        except Exception:
            # Do not leave a partial file behind when the copy fails.
            if os.path.exists(persisted_path):
                os.remove(persisted_path)
            raise
        return persisted_path
704
+
705
+
706
def _transcribe_audio_with_groq(audio_path: str, selected_language: str) -> str:
    """Transcribe an audio file through Groq's OpenAI-compatible transcription endpoint.

    Args:
        audio_path: Path of the audio file to upload.
        selected_language: UI language label. Unless it is ``"Original"`` (or
            has no entry in ``LANGUAGE_CODE_MAP``), its code is sent as the
            request's ``language`` field.
            NOTE(review): the API documents ``language`` as the language of
            the *input audio*, not a translation target - confirm callers
            pass the spoken language here.

    Returns:
        The stripped transcript text.

    Raises:
        ValueError: If no Groq API key is available or the response text is empty.
        requests.HTTPError: If the API answers with an error status.
    """
    # Tolerate an unset (None) key and stray whitespace around it.
    api_key = (groq_api_key or "").strip()
    if not api_key:
        raise ValueError("`GROQ_API_KEY` is required for audio transcription fallback.")

    model_name = os.getenv("GROQ_AUDIO_TRANSCRIPTION_MODEL", "whisper-large-v3-turbo")
    payload = {
        "model": model_name,
        "response_format": "json",
        "temperature": "0",
    }
    if selected_language != "Original":
        # ``.get`` avoids a KeyError for labels missing from the map; in that
        # case no language hint is sent.
        language_code = LANGUAGE_CODE_MAP.get(selected_language)
        if language_code:
            payload["language"] = language_code

    with open(audio_path, "rb") as audio_file:
        response = requests.post(
            "https://api.groq.com/openai/v1/audio/transcriptions",
            headers={"Authorization": f"Bearer {api_key}"},
            data=payload,
            files={"file": (os.path.basename(audio_path), audio_file)},
            timeout=300,
        )
    response.raise_for_status()

    # ``text`` may be absent or null in the JSON body.
    transcript_text = (response.json().get("text") or "").strip()
    if not transcript_text:
        raise ValueError("Groq audio transcription returned empty text.")
    return transcript_text
732
+
733
+
734
def _load_youtube_documents_via_audio_transcription(url: str, selected_language: str) -> list[Document]:
    """Produce transcript documents by downloading the video's audio and transcribing it.

    The audio is always transcribed in its spoken language. When
    *selected_language* is not ``"Original"`` the transcript is then
    translated with ``_translate_documents_with_llm``, the same way the
    manual-transcript path does it.

    The downloaded audio file is removed whether or not transcription succeeds.
    """
    source_mode = "Audio transcription (yt-dlp + Groq)"

    video_id = YoutubeLoader.extract_video_id(url)
    audio_path = _download_youtube_audio(url, video_id)
    try:
        # The transcription API's language field describes the input audio,
        # so the requested output language must not be passed as a hint;
        # "Original" lets the model detect the spoken language.
        transcript_text = _transcribe_audio_with_groq(audio_path, "Original")
    finally:
        try:
            os.remove(audio_path)
        except FileNotFoundError:
            pass  # already gone; nothing to clean up

    docs = _build_transcript_documents(
        url,
        transcript_text,
        "Original",
        source_mode,
    )
    if selected_language == "Original":
        return docs

    docs = _translate_documents_with_llm(docs, selected_language)
    for doc in docs:
        doc.metadata["transcript_language_label"] = f"{selected_language} (LLM translated)"
        # Re-assert the source mode in case translation rebuilt the metadata.
        doc.metadata["transcript_source_mode"] = source_mode
    return docs
749
+
750
+
751
def _load_youtube_documents_with_fallbacks(
    url: str,
    selected_language: str,
    source_mode: str,
    transcript_text: str,
    transcript_file,
) -> list[Document]:
    """Load YouTube transcript documents using the strategy chosen in the UI.

    ``"Manual transcript"`` goes straight to the pasted/uploaded text.
    ``"Auto"`` tries direct transcript, external API and audio transcription
    in that order; any other mode runs only the matching strategy. A strategy
    that raises *or returns no documents* counts as failed, so the next one
    is tried. In ``"Auto"`` mode, manual text/file input (if any) is used as
    the final fallback.

    Returns:
        A non-empty list of documents from the first strategy that succeeds.

    Raises:
        ValueError: If *source_mode* matches no strategy (or manual input is empty).
        RuntimeError: If every attempted strategy failed; the message lists
            each failure.
    """
    if source_mode == "Manual transcript":
        return _load_manual_transcript_documents(url, selected_language, transcript_text, transcript_file)

    strategies = []
    if source_mode in {"Auto", "Direct transcript"}:
        strategies.append(("Direct transcript", lambda: _load_youtube_documents(url, selected_language)))
    if source_mode in {"Auto", "External transcript API"}:
        strategies.append(
            ("External transcript API", lambda: _load_youtube_documents_via_external_api(url, selected_language))
        )
    if source_mode in {"Auto", "Audio transcription (yt-dlp + Groq)"}:
        strategies.append(
            (
                "Audio transcription (yt-dlp + Groq)",
                lambda: _load_youtube_documents_via_audio_transcription(url, selected_language),
            )
        )

    failures = []
    for strategy_name, loader in strategies:
        try:
            docs = loader()
        except Exception as exc:  # any failure just moves on to the next strategy
            failures.append(f"{strategy_name}: {exc}")
            continue
        if docs:
            return docs
        # An empty result must not end the chain: callers index ``docs[0]``.
        failures.append(f"{strategy_name}: returned no transcript content")

    if source_mode == "Auto" and (transcript_text.strip() or transcript_file is not None):
        return _load_manual_transcript_documents(url, selected_language, transcript_text, transcript_file)

    if not failures:
        raise ValueError("No YouTube transcript strategy is available for the selected mode.")

    raise RuntimeError("All YouTube transcript strategies failed.\n" + "\n".join(failures))
790
 
791
 
792
  def _has_meaningful_content(docs: list[Document], min_chars: int = 300) -> bool:
 
1020
  else:
1021
  try:
1022
  with st.spinner("Loading transcript..."):
1023
+ docs = _load_youtube_documents_with_fallbacks(
1024
+ generic_url,
1025
+ transcript_language,
1026
+ youtube_source_mode,
1027
+ manual_transcript_text,
1028
+ manual_transcript_file,
1029
+ )
1030
  if not docs:
1031
  st.error("No transcript could be extracted from the provided YouTube video.")
1032
  else:
1033
  _store_youtube_transcript(generic_url, docs)
1034
  st.success(
1035
+ "Transcript ready for export in "
1036
+ f"{st.session_state.youtube_transcript_language_label} "
1037
+ f"via {st.session_state.youtube_transcript_source_mode}."
1038
  )
1039
  except Exception as transcript_err:
1040
  st.error(f"Failed to load YouTube transcript: {transcript_err}")
 
1043
  st.session_state.youtube_transcript_text
1044
  and st.session_state.youtube_transcript_source_url == generic_url
1045
  ):
1046
+ st.caption(
1047
+ "Prepared transcript: "
1048
+ f"`{st.session_state.youtube_transcript_language_label}` via "
1049
+ f"`{st.session_state.youtube_transcript_source_mode}`"
1050
+ )
1051
  st.download_button(
1052
  "Export transcript",
1053
  data=st.session_state.youtube_transcript_text,
 
1075
  if input_source_mode in {"URL", "Both"} and generic_url.strip():
1076
  if _is_youtube_url(generic_url):
1077
  try:
1078
+ url_docs = _load_youtube_documents_with_fallbacks(
1079
+ generic_url,
1080
+ transcript_language,
1081
+ youtube_source_mode,
1082
+ manual_transcript_text,
1083
+ manual_transcript_file,
1084
+ )
1085
  _store_youtube_transcript(generic_url, url_docs)
1086
  except Exception as load_err:
1087
  st.error(f"Failed to load YouTube transcript: {load_err}")
1088
  st.stop()
1089
  else:
1090
+ _reset_youtube_transcript_state()
 
 
1091
  try:
1092
  url_docs = _load_web_documents(generic_url)
1093
  except Exception as load_err:
 
1096
 
1097
  docs.extend(url_docs)
1098
  else:
1099
+ _reset_youtube_transcript_state()
 
 
1100
 
1101
  if input_source_mode in {"Upload documents", "Both"} and uploaded_files:
1102
  try:
requirements.txt CHANGED
@@ -14,3 +14,4 @@ langchain-text-splitters>=1.1.2
14
  youtube-transcript-api>=1.2.4
15
  unstructured>=0.22.22
16
  pytube>=15.0.0
 
 
14
  youtube-transcript-api>=1.2.4
15
  unstructured>=0.22.22
16
  pytube>=15.0.0
17
+ yt-dlp>=2025.1.15