# RAG_APP/src/scraping/youtube_transcript.py
# sxid003's picture
# Upload 83 files
# 3107242 verified
def fetch_youtube_transcripts(max_videos=None):
    """Collect Arabic subtitles from the channel's streams and save them to RAW_CSV.

    The function lists the videos of ``CHANNEL_URL`` with yt-dlp, skips the
    ones whose title contains one of ``SKIP_WORDS``, keeps those uploaded
    between ``START_DATE`` and the moment of the call, downloads their Arabic
    subtitles (manual or automatic) as WebVTT, strips the markup and writes
    one row per video to the CSV file designated by ``RAW_CSV``.

    Args:
        max_videos: Maximum number of videos *with a transcript* to store.
            ``None`` (the default) means no limit.

    Returns:
        None. The result is the CSV file written to ``RAW_CSV``; progress is
        reported on stdout.
    """
    import glob
    import os
    import re
    import tempfile
    from datetime import datetime

    import pandas as pd
    import yt_dlp

    from src.configs.config import RAW_CSV

    # Configuration
    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
    START_DATE = datetime(2025, 7, 21)
    END_DATE = datetime.today()
    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
    LANGUAGE = 'ar'
    # Explicit column order so that an empty run still produces a CSV header.
    CSV_COLUMNS = ["id", "titre", "date", "langue", "sous-titre", "lien"]

    # Create output directory if needed
    output_dir = os.path.dirname(RAW_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def get_video_list(channel_url):
        """Return the flat list of video entries yt-dlp reports for the URL."""
        print(f"-> Extraction des vidéos depuis : {channel_url}")
        ydl_opts = {
            "quiet": True,
            "extract_flat": True,
            "force_generic_extractor": True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(channel_url, download=False)
        # Tolerate a missing result, a missing "entries" key and empty
        # placeholders inside the list instead of crashing later on.
        entries = [entry for entry in ((info or {}).get("entries") or []) if entry]
        print(f"-> {len(entries)} vidéos extraites du flux.")
        return entries

    def clean_caption(text):
        """Strip WebVTT headers, inline tags and cue timings; collapse whitespace."""
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'Kind:.*\n?', '', text)
        text = re.sub(r'Language:.*\n?', '', text)
        text = re.sub(r'WEBVTT.*\n?', '', text)
        text = re.sub(r'\d+:\d+:\d+\.\d+ --> .*', '', text)
        text = re.sub(r'\d+:\d+\.\d+ --> .*', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_transcript_text(video_id):
        """Download the subtitles of ``video_id`` and return them as plain text.

        Returns None when the download fails, when no ``.vtt`` file was
        produced, or when the file cannot be read.
        """
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        # A private temporary directory keeps runs isolated from each other
        # (no stale subtitle file can be picked up) and is removed on exit.
        with tempfile.TemporaryDirectory() as tmp_dir:
            ydl_opts = {
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [LANGUAGE],
                'skip_download': True,
                'outtmpl': os.path.join(tmp_dir, f"{video_id}.%(ext)s"),
                'quiet': True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl.download([video_url])
                except Exception as e:
                    print(f"[!] yt-dlp download error: {e}")
                    return None

            # Look for any Arabic VTT subtitle file (auto or manual)
            pattern = os.path.join(
                glob.escape(tmp_dir), f"{glob.escape(video_id)}.*.vtt"
            )
            vtt_files = sorted(glob.glob(pattern))
            if not vtt_files:
                print(f"[x] No Arabic subtitle found for: {video_id}")
                return None

            try:
                with open(vtt_files[0], 'r', encoding='utf-8') as f:
                    raw_text = f.read()
            except (OSError, UnicodeError) as e:
                print(f"[!] Failed to read/clean subtitle: {e}")
                return None
        return clean_caption(raw_text)

    def should_skip_title(title):
        """Return True (and log it) when the title contains an excluded keyword."""
        if any(word in title for word in SKIP_WORDS):
            print(f"-> Ignorée (langue exclue) : {title}")
            return True
        return False

    print("\n==== LANCEMENT DU SCRIPT ====\n")
    videos = get_video_list(CHANNEL_URL)
    print(
        "\n-> Début du traitement des vidéos dans la plage "
        f"{START_DATE.date()} ➜ {END_DATE.date()}"
    )

    data = []
    # video_count is the ID given to the next stored transcript (1-based).
    video_count = 1
    for idx, vid in enumerate(videos, start=1):
        if max_videos is not None and video_count > max_videos:
            print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
            break

        video_id = vid.get("id")
        title = vid.get("title") or ""
        print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

        if not video_id:
            # Without an id no valid watch URL can be built.
            print(" [!] Identifiant vidéo manquant — entrée ignorée.")
            continue
        url = f"https://www.youtube.com/watch?v={video_id}"

        if should_skip_title(title):
            continue

        try:
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                full_info = ydl.extract_info(url, download=False)
            upload_date_str = (full_info or {}).get("upload_date")
            if not upload_date_str:
                print(f" [!] Date non trouvée pour : {title}")
                continue
            upload_date = datetime.strptime(upload_date_str, "%Y%m%d")

            if upload_date > END_DATE:
                # A video dated after "now" (yt-dlp documents upload_date as a
                # UTC date, END_DATE is local time) must not stop the run:
                # older, in-range videos may still follow.
                print(f" [->] Vidéo postérieure à la période ➜ {upload_date.date()} — ignorée.")
                continue
            if upload_date < START_DATE:
                # Stopping here relies on the listing being newest-first —
                # NOTE(review): confirm against the channel tab ordering.
                print(f" [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
                break

            transcript = get_transcript_text(video_id)
            if transcript:
                data.append({
                    "id": video_count,
                    "titre": title,
                    "date": upload_date.strftime("%Y-%m-%d"),
                    "langue": LANGUAGE,
                    "sous-titre": transcript,
                    "lien": url,
                })
                print(f" [v] Transcription ajoutée (ID: {video_count})")
                video_count += 1
            else:
                print(" [x] Transcription non disponible ou vide.")
        except Exception as e:
            # Best-effort scraping: one failing video must not abort the run.
            print(f" [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
    df = pd.DataFrame(data, columns=CSV_COLUMNS)
    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
    print("\n==== FIN DU SCRIPT ====\n")