import os
import subprocess

import gradio as gr
import spacy
import torch
import whisper
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer

# Load the heavyweight models once at import time so every request reuses them.
whisper_model = whisper.load_model("base")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)
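# device=-1 pins the summarizer to the CPU. A GPU-aware variant (an
# assumption, not part of the original setup) would be:
#   summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
#                         device=0 if torch.cuda.is_available() else -1)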

# T5 model fine-tuned for highlight-based question generation.
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# spaCy English model for named-entity recognition; fetched on first run.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# QA pipeline used to answer the generated questions against the summary.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


def extract_audio(video_path, audio_output_path):
    """Extract a 16-bit PCM WAV track from a video file with ffmpeg."""
    # -y overwrites a stale output file; without it, ffmpeg blocks on an
    # interactive overwrite prompt when the script runs a second time.
    command = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le',
               '-ar', '44100', '-ac', '2', audio_output_path]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return audio_output_path
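# A stricter variant (an assumption, not in the original) would raise on
# ffmpeg failure instead of relying on a later os.path.exists check:
#   subprocess.run(command, capture_output=True, check=True)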


def process_video(video_file):
    """Transcribe a video with Whisper and summarize the transcript with BART."""
    try:
        audio_path = extract_audio(video_file, "extracted_audio.wav")

        if not os.path.exists(audio_path):
            return "Audio extraction failed.", "No summary generated."

        # Transcribe with the globally loaded Whisper model rather than
        # reloading it on every request.
        result = whisper_model.transcribe(audio_path)
        transcript_text = result['text']

        # Summarize in fixed-size character chunks and join the partial summaries.
        chunks = [transcript_text[i:i + 1024] for i in range(0, len(transcript_text), 1024)]
        summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                     for chunk in chunks]
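        # The 1024-character window is a rough proxy for BART's 1024-token
        # input limit. A token-aware split (an assumption, not in the
        # original) could use the summarizer's own tokenizer instead:
        #   ids = summarizer.tokenizer.encode(transcript_text)
        #   chunks = [summarizer.tokenizer.decode(ids[i:i + 900], skip_special_tokens=True)
        #             for i in range(0, len(ids), 900)]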
        final_summary = ' '.join(summaries)

        return transcript_text, final_summary

    except Exception as e:
        return f"Error: {str(e)}", f"Error: {str(e)}"


def select_top_entities(text, max_entities=3):
    """Return up to `max_entities` distinct named entities worth asking about."""
    doc = nlp(text)
    # Keep entity spans of a sensible length for highlighting.
    candidates = [ent.text for ent in doc.ents
                  if 2 <= len(ent.text) <= 30 and len(ent.text.split()) <= 5]
    seen = set()
    top_entities = []
    for entity in candidates:
        if entity not in seen:
            seen.add(entity)
            top_entities.append(entity)
            if len(top_entities) >= max_entities:
                break
    return top_entities
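# Example (illustrative; output depends on the NER model):
#   select_top_entities("Barack Obama visited Paris in May 2015.")
#   might return ["Barack Obama", "Paris", "May 2015"].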


def generate_questions(context):
    """Generate one question per selected entity with the T5 QG model."""
    entities = select_top_entities(context, max_entities=3)
    questions = []

    for ent in entities:
        # Wrap the answer span in <hl> tokens, the format the qg-hl model expects.
        highlighted = context.replace(ent, f"<hl> {ent} <hl>", 1)
        input_text = f"generate question: {highlighted}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(
            input_ids=input_ids,
            max_length=64,
            num_beams=4,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        questions.append(question)

    return "\n".join(f"Q{i+1}: {q}" for i, q in enumerate(questions))
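# Example (illustrative): highlighting "Paris" in "The Eiffel Tower is in
# Paris." might yield "Q1: Where is the Eiffel Tower?", depending on decoding.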


def generate_answers(context, questions):
    """
    context: str - typically the summary
    questions: list[str] or str - a list or a newline-separated string
    returns: str - formatted question/answer pairs
    """
    if isinstance(questions, str):
        questions = questions.strip().split('\n')

    answers = []
    for q in questions:
        if q.strip():
            result = qa_pipeline(question=q.strip(), context=context)
            answers.append(f"Q: {q.strip()}\nA: {result['answer']}")

    return "\n\n".join(answers)
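# The question-answering pipeline returns a dict with 'answer', 'score',
# 'start', and 'end'; only the extracted answer span is kept here.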


def process_video_(video_path):
    """Full pipeline: transcript, summary, questions, and answers."""
    transcript, summary = process_video(video_path)
    questions = generate_questions(summary)
    answers = generate_answers(summary, questions)
    return transcript, summary, questions, answers


iface = gr.Interface(
    fn=process_video_,
    inputs=gr.Video(label="Upload a video"),
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.Textbox(label="Summary"),
        gr.Textbox(label="Generated Questions"),
        gr.Textbox(label="Generated Answers")
    ],
    title="Vision to Insight",
    description="Upload a video to extract a transcript, generate a summary, and get 2-3 meaningful questions based on the summary."
)

# share=True also serves the app through a temporary public gradio.live link.
iface.launch(share=True)