Spaces:

HOLOKIATEAM
/

RAG_APP

Sleeping

App Files Files Community

RAG_APP / src /scraping /youtube_transcript.py

sxid003

Upload 83 files

3107242 verified 6 months ago

raw

history blame contribute delete

5.4 kB

	def fetch_youtube_transcripts(max_videos=None):
	    """Collect Arabic subtitles of the channel's streams into ``RAW_CSV``.

	    Lists the entries of ``CHANNEL_URL`` with yt-dlp and walks them in the
	    order returned. For each entry the full metadata is fetched to read the
	    upload date, the Arabic subtitle track (manual or automatic) is
	    downloaded, the WebVTT markup is stripped, and one row is stored. All
	    rows are written to ``RAW_CSV`` once the walk ends.

	    Args:
	        max_videos: Maximum number of transcripts to store, or ``None`` for
	            no limit. Videos that are skipped or have no subtitles do not
	            count towards the limit.

	    Returns:
	        None. The result is the CSV file written to ``RAW_CSV``.
	    """
	    import os
	    import re
	    import tempfile
	    from datetime import datetime

	    import pandas as pd
	    import yt_dlp

	    from src.configs.config import RAW_CSV

	    # Configuration
	    CHANNEL_URL = "https://www.youtube.com/@ParlementMa/streams"
	    START_DATE = datetime(2025, 7, 21)
	    END_DATE = datetime.today()
	    # Videos whose title contains one of these words are skipped; the skip
	    # message reports them as an excluded language.
	    SKIP_WORDS = ['تشلحيت', 'تمزيغت', 'تريفيت']
	    LANGUAGE = 'ar'
	    # Fixed column order so the CSV always has a header, even with no rows.
	    CSV_COLUMNS = ["id", "titre", "date", "langue", "sous-titre", "lien"]

	    # Create output directory if needed
	    output_dir = os.path.dirname(RAW_CSV)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    def get_video_list(channel_url):
	        """Return the list of entries yt-dlp reports for ``channel_url``."""
	        print(f"-> Extraction des vidéos depuis : {channel_url}")
	        ydl_opts = {
	            "quiet": True,
	            "extract_flat": True,
	            "force_generic_extractor": True,
	        }
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            info = ydl.extract_info(channel_url, download=False)
	        # Tolerate a missing result / missing "entries" and drop empty
	        # entries so the main loop can call .get() on every item.
	        raw_entries = (info or {}).get("entries") or []
	        entries = [entry for entry in raw_entries if entry]
	        print(f"-> {len(entries)} vidéos extraites du flux.")
	        return entries

	    def clean_caption(text):
	        """Strip WebVTT tags, headers and cue timings; collapse whitespace."""
	        text = re.sub(r'<[^>]+>', '', text)
	        text = re.sub(r'Kind:.*\n?', '', text)
	        text = re.sub(r'Language:.*\n?', '', text)
	        text = re.sub(r'WEBVTT.*\n?', '', text)
	        text = re.sub(r'\d+:\d+:\d+\.\d+ --> .*', '', text)
	        text = re.sub(r'\d+:\d+\.\d+ --> .*', '', text)
	        text = re.sub(r'\s+', ' ', text)
	        return text.strip()

	    def get_transcript_text(video_id):
	        """Return the cleaned subtitle text of ``video_id``, or ``None``.

	        ``None`` is returned when the download fails, when no ``.vtt`` file
	        is produced, or when the file cannot be read.
	        """
	        video_url = f"https://www.youtube.com/watch?v={video_id}"

	        # Download into a throwaway directory: nothing is left behind, and a
	        # stale file from an earlier run can never be picked up by mistake.
	        with tempfile.TemporaryDirectory() as tmp_dir:
	            ydl_opts = {
	                'writesubtitles': True,
	                'writeautomaticsub': True,
	                'subtitleslangs': [LANGUAGE],
	                'skip_download': True,
	                'outtmpl': os.path.join(tmp_dir, "%(id)s.%(ext)s"),
	                'quiet': True,
	            }

	            try:
	                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	                    ydl.download([video_url])
	            except Exception as e:
	                print(f"[!] yt-dlp download error: {e}")
	                return None

	            # Any VTT file in the directory is the requested track
	            # (auto or manual); sort for a deterministic pick.
	            vtt_files = sorted(
	                name for name in os.listdir(tmp_dir) if name.endswith(".vtt")
	            )
	            if not vtt_files:
	                print(f"[x] No Arabic subtitle found for: {video_id}")
	                return None

	            try:
	                vtt_path = os.path.join(tmp_dir, vtt_files[0])
	                with open(vtt_path, 'r', encoding='utf-8') as f:
	                    raw_text = f.read()
	            except (OSError, UnicodeError) as e:
	                print(f"[!] Failed to read/clean subtitle: {e}")
	                return None

	        return clean_caption(raw_text)

	    def should_skip_title(title):
	        """Return True (and log it) when the title contains a skip word."""
	        if any(word in title for word in SKIP_WORDS):
	            print(f"-> Ignorée (langue exclue) : {title}")
	            return True
	        return False

	    print("\n==== LANCEMENT DU SCRIPT ====\n")
	    videos = get_video_list(CHANNEL_URL)
	    print(f"\n-> Début du traitement des vidéos dans la plage {START_DATE.date()} ➜ {END_DATE.date()}")

	    data = []
	    video_count = 1  # 1-based id assigned to the next stored transcript

	    for idx, vid in enumerate(videos, start=1):
	        if max_videos is not None and video_count > max_videos:
	            print(f"\n✅ Limite de {max_videos} vidéos atteinte.")
	            break
	        video_id = vid.get("id")
	        title = vid.get("title") or ""

	        print(f"\n[{idx}/{len(videos)}] Traitement de la vidéo : {title}")

	        if not video_id:
	            # Without an id no watch URL can be built for this entry.
	            print(" [!] Identifiant manquant — vidéo ignorée.")
	            continue
	        url = f"https://www.youtube.com/watch?v={video_id}"

	        if should_skip_title(title):
	            continue

	        try:
	            # The flat listing is not used for the date; fetch full metadata.
	            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
	                full_info = ydl.extract_info(url, download=False)

	            upload_date_str = full_info.get("upload_date")
	            if not upload_date_str:
	                print(f" [!] Date non trouvée pour : {title}")
	                continue

	            upload_date = datetime.strptime(upload_date_str, "%Y%m%d")

	            if upload_date > END_DATE:
	                # Too recent: skip this one only. Stopping here would abort
	                # the run on the first video whenever its date is ahead of
	                # the local clock.
	                # NOTE(review): upload_date is presumably a UTC calendar
	                # date while END_DATE is local time — confirm.
	                print(f" [->] Vidéo postérieure à la période ➜ {upload_date.date()} — ignorée.")
	                continue

	            if upload_date < START_DATE:
	                # Assumes the listing is newest-first, so every remaining
	                # video is older as well — TODO confirm.
	                print(f" [<-] Vidéo hors période ➜ {upload_date.date()} — arrêt du traitement.")
	                break

	            transcript = get_transcript_text(video_id)
	            if transcript:
	                data.append({
	                    "id": video_count,
	                    "titre": title,
	                    "date": upload_date.strftime("%Y-%m-%d"),
	                    "langue": LANGUAGE,
	                    "sous-titre": transcript,
	                    "lien": url,
	                })
	                print(f" [v] Transcription ajoutée (ID: {video_count})")
	                video_count += 1
	            else:
	                print(" [x] Transcription non disponible ou vide.")

	        except Exception as e:
	            # Best effort: one failing video must not stop the whole run.
	            print(f" [!] Erreur pendant le traitement de la vidéo {video_id}: {e}")

	    print(f"\n-> Génération du fichier CSV: {RAW_CSV}")
	    df = pd.DataFrame(data, columns=CSV_COLUMNS)
	    # utf-8-sig writes a BOM at the start of the file.
	    df.to_csv(RAW_CSV, index=False, encoding='utf-8-sig')
	    print(f"\n Enregistré : {len(df)} vidéos avec transcription dans '{RAW_CSV}'")
	    print("\n==== FIN DU SCRIPT ====\n")