import os
import subprocess

import gradio as gr
import spacy
import torch
import whisper
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer

# Load the heavyweight models once at import time so every request reuses them.
whisper_model = whisper.load_model("base")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)
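# device=-1 pins the summarizer to the CPU. A GPU-aware variant (an
# assumption, not part of the original setup) would be:
#   summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
#                         device=0 if torch.cuda.is_available() else -1)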

# T5 model fine-tuned for highlight-based question generation.
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# spaCy English model for named-entity recognition; fetched on first run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# QA pipeline used to answer the generated questions against the summary.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


def extract_audio(video_path, audio_output_path):
    """Extract a 16-bit PCM WAV track from a video file with ffmpeg."""
    # -y overwrites a stale output file; without it, ffmpeg blocks on an
    # interactive overwrite prompt when the script runs a second time.
    command = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le',
               '-ar', '44100', '-ac', '2', audio_output_path]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return audio_output_path
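# A stricter variant (an assumption, not in the original) would raise on
# ffmpeg failure instead of relying on a later os.path.exists check:
#   subprocess.run(command, capture_output=True, check=True)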


def process_video(video_file):
    """Transcribe a video with Whisper and summarize the transcript with BART."""
    try:
        audio_path = extract_audio(video_file, "extracted_audio.wav")

        if not os.path.exists(audio_path):
            return "Audio extraction failed.", "No summary generated."

        # Transcribe with the globally loaded Whisper model rather than
        # reloading it on every request.
        result = whisper_model.transcribe(audio_path)
        transcript_text = result['text']

        # Summarize in fixed-size character chunks and join the partial summaries.
        chunks = [transcript_text[i:i + 1024] for i in range(0, len(transcript_text), 1024)]
        summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                     for chunk in chunks]
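        # The 1024-character window is a rough proxy for BART's 1024-token
        # input limit. A token-aware split (an assumption, not in the
        # original) could use the summarizer's own tokenizer instead:
        #   ids = summarizer.tokenizer.encode(transcript_text)
        #   chunks = [summarizer.tokenizer.decode(ids[i:i + 900], skip_special_tokens=True)
        #             for i in range(0, len(ids), 900)]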
        final_summary = ' '.join(summaries)

        return transcript_text, final_summary

    except Exception as e:
        return f"Error: {str(e)}", f"Error: {str(e)}"


def select_top_entities(text, max_entities=3):
    """Return up to `max_entities` distinct named entities worth asking about."""
    doc = nlp(text)
    # Keep entity spans of a sensible length for highlighting.
    candidates = [ent.text for ent in doc.ents
                  if 2 <= len(ent.text) <= 30 and len(ent.text.split()) <= 5]
    seen = set()
    top_entities = []
    for entity in candidates:
        if entity not in seen:
            seen.add(entity)
            top_entities.append(entity)
            if len(top_entities) >= max_entities:
                break
    return top_entities
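# Example (illustrative; output depends on the NER model):
#   select_top_entities("Barack Obama visited Paris in May 2015.")
#   might return ["Barack Obama", "Paris", "May 2015"].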


def generate_questions(context):
    """Generate one question per selected entity with the T5 QG model."""
    entities = select_top_entities(context, max_entities=3)
    questions = []

    for ent in entities:
        # Wrap the answer span in <hl> tokens, the format the qg-hl model expects.
        highlighted = context.replace(ent, f"<hl> {ent} <hl>", 1)
        input_text = f"generate question: {highlighted}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(
            input_ids=input_ids,
            max_length=64,
            num_beams=4,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        questions.append(question)

    return "\n".join(f"Q{i+1}: {q}" for i, q in enumerate(questions))
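# Example (illustrative): highlighting "Paris" in "The Eiffel Tower is in
# Paris." might yield "Q1: Where is the Eiffel Tower?", depending on decoding.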


def generate_answers(context, questions):
    """
    context: str - typically the summary
    questions: list[str] or str - a list or a newline-separated string
    returns: str - formatted question/answer pairs
    """
    if isinstance(questions, str):
        questions = questions.strip().split('\n')

    answers = []
    for q in questions:
        if q.strip():
            result = qa_pipeline(question=q.strip(), context=context)
            answers.append(f"Q: {q.strip()}\nA: {result['answer']}")

    return "\n\n".join(answers)
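# The question-answering pipeline returns a dict with 'answer', 'score',
# 'start', and 'end'; only the extracted answer span is kept here.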


def process_video_(video_path):
    """Full pipeline: transcript, summary, questions, and answers."""
    transcript, summary = process_video(video_path)
    questions = generate_questions(summary)
    answers = generate_answers(summary, questions)
    return transcript, summary, questions, answers


iface = gr.Interface(
    fn=process_video_,
    inputs=gr.Video(label="Upload a video"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Generated Questions"),
        gr.Textbox(label="Generated Answers")
    ],
    title="Vision to Insight",
    description="Upload a video to extract a transcript, generate a summary, and get 2-3 meaningful questions based on the summary."
)

# share=True also serves the app through a temporary public gradio.live link.
iface.launch(share=True)