Spaces:

VanguardAI
/

Youtube_Chunking

Sleeping

App Files Files Community

Youtube_Chunking / app.py

llama

Update app.py

15483bb verified 3 months ago

raw

history blame contribute delete

No virus

7.17 kB

	import os
	HF_TOKEN = os.environ["HF_TOKEN"]
	import re
	import string
	import torch
	import spaces
	import gradio as gr
	import json
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	from transformers import BitsAndBytesConfig
	from pytube import YouTube
	from youtube_transcript_api import YouTubeTranscriptApi
	from llama_index.llms.groq import Groq
	from llama_index.core.llms import ChatMessage
	# --- Whisper-small speech recogniser ------------------------------------
	# Checkpoint to load and the half-precision dtype shared by the model and
	# the pipeline below.
	model_id = "openai/whisper-small"
	torch_dtype = torch.float16

	# bitsandbytes settings: 4-bit NF4 weights with double quantization and
	# float16 as the compute dtype.
	quantization_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_compute_dtype=torch.float16,
	)

	# Load the quantized seq2seq model first, then its processor (which
	# supplies both the tokenizer and the feature extractor).
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    model_id,
	    quantization_config=quantization_config,
	    torch_dtype=torch_dtype,
	    low_cpu_mem_usage=True,
	    use_safetensors=True,
	)
	processor = AutoProcessor.from_pretrained(model_id)

	# Module-level ASR pipeline, built once at import time and reused by
	# chunk_it. Audio is processed in 30-second windows, 16 windows per batch.
	# return_timestamps=True is only the default; chunk_it passes
	# return_timestamps="word" on each call.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=model,
	    tokenizer=processor.tokenizer,
	    feature_extractor=processor.feature_extractor,
	    torch_dtype=torch_dtype,
	    chunk_length_s=30,
	    batch_size=16,
	    max_new_tokens=128,
	    return_timestamps=True,
	)

	# System prompt for the extraction LLM. It asks for verbatim excerpts and its
	# example shows them wrapped in double quotes, which is what chunk_it's regex
	# later picks out of the reply.
	_EXTRACTION_SYSTEM_PROMPT = '''You are provided with the audio-to-text transcript of a YouTube video. Your task is to IDENTIFY AND EXTRACT, WITHOUT ANY MODIFICATION, the most valuable and informative phrases from the transcript. This includes retaining ORIGINAL punctuation, capitalization, and wording. The goal is to create a list of verbatim excerpts that distill the essence of the video's content. There should NOT be any sort of modification done to the text, simply copy and paste it here!!
	Example:- If you found this line/phrase to be important "we want to basically make generative ai available and accessible to the people in the country and that's that's the intent and when we said that we want to do this there was a resonance in the investment community". Then, the output should include "we want to basically make generative ai available and accessible to the people in the country and that's that's the intent and when we said that we want to do this there was a resonance in the investment community" exactly without any alteration.'''


	def _preprocess_extracted_text(extracted_phrases):
	    """Return the phrases lowercased with ASCII punctuation removed."""
	    translator = str.maketrans('', '', string.punctuation)
	    return [phrase.lower().translate(translator) for phrase in extracted_phrases]


	def _preprocess_word_timing_data(word_timing_data):
	    """Return word entries lowercased with punctuation and spaces removed.

	    Each returned entry is a new ``{'text', 'timestamp'}`` dict; the timestamp
	    is carried over unchanged.
	    """
	    translator = str.maketrans('', '', string.punctuation + ' ')
	    return [
	        {
	            'text': entry['text'].lower().translate(translator),
	            'timestamp': entry['timestamp'],
	        }
	        for entry in word_timing_data
	    ]


	def _end_or_start(timestamp):
	    """Return the end of a ``(start, end)`` timestamp, or the start if end is None."""
	    # NOTE(review): the ASR pipeline is assumed to be able to emit None as an
	    # end timestamp (e.g. for the last word) - confirm against the
	    # transformers version in use.
	    start, end = timestamp
	    return start if end is None else end


	def _align_phrase(phrase_words, word_texts, word_timing_data):
	    """Return the transcript entries forming an LCS with ``phrase_words``.

	    ``word_texts[k]`` must be the comparison text of ``word_timing_data[k]``.
	    The result is in transcript order and is empty when no word matches.
	    """
	    rows, cols = len(phrase_words), len(word_texts)
	    # dp[i][j] = LCS length of phrase_words[:i] and word_texts[:j].
	    dp = [[0] * (cols + 1) for _ in range(rows + 1)]
	    for i in range(1, rows + 1):
	        target = phrase_words[i - 1]
	        above, current = dp[i - 1], dp[i]
	        for j in range(1, cols + 1):
	            if target == word_texts[j - 1]:
	                current[j] = above[j - 1] + 1
	            else:
	                current[j] = max(above[j], current[j - 1])

	    # Walk back from the bottom-right corner to recover the matched entries.
	    matched = []
	    i, j = rows, cols
	    while i > 0 and j > 0:
	        if phrase_words[i - 1] == word_texts[j - 1]:
	            matched.append(word_timing_data[j - 1])
	            i -= 1
	            j -= 1
	        elif dp[i - 1][j] > dp[i][j - 1]:
	            i -= 1
	        else:
	            j -= 1
	    matched.reverse()
	    return matched


	def _create_semantic_chunks(word_timing_data, extracted_text, max_chunk_length=30.0):
	    """Turn extracted phrases into timestamped chunks.

	    Args:
	        word_timing_data: list of ``{'text', 'timestamp': (start, end)}`` word
	            entries in transcript order.
	        extracted_text: iterable of phrases to locate in the transcript.
	        max_chunk_length: a chunk is closed as soon as the span from its start
	            to the current word's end exceeds this many timestamp units.

	    Returns:
	        A list of dicts with keys ``chunk_id`` (1-based, running across all
	        phrases), ``chunk_length``, ``text``, ``start_time`` and ``end_time``.
	        Phrases with no aligned word contribute nothing.
	    """
	    output_chunks = []
	    chunk_id = 1
	    # Loop-invariant: lowercase the transcript words once, not per DP cell.
	    word_texts = [entry['text'].lower() for entry in word_timing_data]

	    for phrase in extracted_text:
	        phrase_words = phrase.lower().split()
	        matched = _align_phrase(phrase_words, word_texts, word_timing_data)
	        if not matched:
	            # Nothing in the transcript lines up with this phrase (empty
	            # phrase, or the LLM reworded it); there is no timing to report.
	            continue

	        start_time = matched[0]['timestamp'][0]
	        chunk_end_time = _end_or_start(matched[0]['timestamp'])
	        chunk_words = []
	        for data in matched:
	            word_end = _end_or_start(data['timestamp'])
	            chunk_words.append(data['text'])
	            if word_end - start_time > max_chunk_length:
	                output_chunks.append({
	                    "chunk_id": chunk_id,
	                    "chunk_length": round(word_end - start_time, 2),
	                    "text": ' '.join(chunk_words),
	                    "start_time": start_time,
	                    "end_time": word_end,
	                })
	                chunk_id += 1
	                # The next chunk starts where this one ended.
	                start_time = word_end
	                chunk_words = []
	            chunk_end_time = max(chunk_end_time, word_end)

	        # Flush whatever is left after the last length-triggered split.
	        if chunk_words:
	            output_chunks.append({
	                "chunk_id": chunk_id,
	                "chunk_length": round(chunk_end_time - start_time, 2),
	                "text": ' '.join(chunk_words),
	                "start_time": start_time,
	                "end_time": chunk_end_time,
	            })
	            chunk_id += 1
	    return output_chunks


	@spaces.GPU(duration=300)
	def chunk_it(yt_link):
	    """Transcribe a YouTube video and return timestamped key-phrase chunks.

	    Steps: download the first audio-only stream, transcribe it with the
	    module-level ``pipe`` using word-level timestamps, ask a Groq-hosted
	    ``llama3-8b-8192`` model to quote the most informative phrases, then align
	    every quoted phrase back onto the word timestamps.

	    Args:
	        yt_link: URL of the YouTube video.

	    Returns:
	        A list of dicts with keys ``chunk_id``, ``chunk_length``, ``text``,
	        ``start_time`` and ``end_time``.

	    Raises:
	        RuntimeError: if the ``GROQ_API_KEY`` environment variable is not set.
	        gr.Error: if the video exposes no audio-only stream.
	    """
	    import tempfile

	    # The Groq credential comes from the environment (a Space secret), never
	    # from source. Checked first so a misconfiguration fails before any
	    # download or GPU work.
	    api_key = os.environ.get("GROQ_API_KEY")
	    if not api_key:
	        raise RuntimeError(
	            "GROQ_API_KEY environment variable is not set; "
	            "configure it as a secret for this app."
	        )

	    # The YouTube caption transcript is deliberately not fetched: nothing
	    # downstream reads it, and requesting it only added a network call that
	    # fails for videos without captions.

	    # Youtube_Audio_Extraction + Whisper-Small_inference.
	    # The audio lands in a temporary directory so it is removed once the
	    # transcription has read it.
	    with tempfile.TemporaryDirectory() as workdir:
	        stream = YouTube(yt_link).streams.filter(only_audio=True).first()
	        if stream is None:
	            raise gr.Error("No audio-only stream is available for this video.")
	        path = stream.download(output_path=workdir)
	        result = pipe(path, return_timestamps="word")

	    # LLaMa3-8B Extraction
	    llm = Groq(model="llama3-8b-8192", api_key=api_key)
	    messages = [
	        ChatMessage(role="system", content=_EXTRACTION_SYSTEM_PROMPT),
	        ChatMessage(role="user", content=result["text"].lower()),
	    ]
	    resp = llm.chat(messages)
	    # Read the reply through the ChatResponse message rather than the
	    # provider-specific raw payload.
	    content = resp.message.content or ""

	    # Chunking: every double-quoted span in the reply is a candidate phrase.
	    extracted_phrases = re.findall(r'"(.*?)"', content)
	    processed_phrases = _preprocess_extracted_text(extracted_phrases)
	    processed_data = _preprocess_word_timing_data(result['chunks'])

	    # Final_Stage
	    return _create_semantic_chunks(
	        processed_data, processed_phrases, max_chunk_length=30.0
	    )

	# Gradio front end: a single text input (the YouTube link) wired to chunk_it,
	# with the returned value shown in a single text output.
	iface = gr.Interface(
	    chunk_it,
	    "text",
	    "text",
	    title="Youtube Chunker",
	)

	# Start serving the interface when the script runs.
	iface.launch(inline=False)