# -*- coding: utf-8 -*-
"""Gradio NLP Group Project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1hDGMwj7G7avlxrqmXe6SIN9LjLRRsuqE
"""
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer, AutoModelForSequenceClassification
import torch
class TextProcessor:
    """Wraps the three NLP steps applied to a transcript: summarization, translation, and classification."""

    def __init__(self, text):
        self.text = text

    def summarize_text(self, text):
        # Summarize with a Pegasus model fine-tuned on lecture transcripts.
        tokenizer = AutoTokenizer.from_pretrained("cranonieu2021/pegasus-on-lectures")
        model = AutoModelForSeq2SeqLM.from_pretrained("cranonieu2021/pegasus-on-lectures")
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True)
        summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    def translate_text(self, text):
        # Translate the English summary to Spanish with a fine-tuned MarianMT model.
        model_name = "sfarjebespalaia/enestranslatorforsummaries"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # tokenizer.prepare_seq2seq_batch is deprecated in recent transformers releases;
        # calling the tokenizer directly produces the same encoded batch.
        tokenized_text = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**tokenized_text)
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        return translated_text

    def classify_text(self, text):
        # Classify the summary into one of five subject areas with a fine-tuned RoBERTa model.
        model_name = "gserafico/roberta-base-finetuned-classifier-roberta1"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
        labels = {
            0: 'Social Sciences',
            1: 'Arts',
            2: 'Natural Sciences',
            3: 'Business and Law',
            4: 'Engineering and Technology'
        }
        return labels[predicted_class_idx]
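
# Minimal usage sketch for TextProcessor, assuming an English transcript string is already
# in hand. Each call pulls the named model from the Hugging Face Hub, so the first run
# downloads weights unless they are already cached locally:
#
#     processor = TextProcessor(transcript_text)
#     summary = processor.summarize_text(transcript_text)
#     spanish = processor.translate_text(summary)
#     subject = processor.classify_text(summary)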
def get_transcript(video_id):
    """Fetch the English transcript for a YouTube video, if one is available."""
    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
    available_languages = []
    for transcript in transcripts:
        language_details = {
            'Language': transcript.language,
            'Language Code': transcript.language_code,
            'Is Generated': transcript.is_generated,
            'Is Translatable': transcript.is_translatable
        }
        available_languages.append(language_details)
    available_languages = [entry['Language Code'] for entry in available_languages if entry['Language Code'] == 'en']
    if 'en' in available_languages:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        transcript_text = ' '.join([segment['text'] for segment in transcript_list])
        return transcript_text, 'en'
    # Return a (message, language) pair so callers can always unpack two values.
    return 'Transcript in unsupported language.', None
def process_text(video_id):
    transcript, language = get_transcript(video_id)
    if language is None:
        # No English transcript was found; surface the message from get_transcript.
        return {"Error": transcript, "Language Detected": "None"}
    processor = TextProcessor(transcript)
    results = {"Language Detected": language}  # Include language in the output for debugging
    if language == 'en':
        summarized_text = processor.summarize_text(transcript)
        translated_text = processor.translate_text(summarized_text)
        classification_result = processor.classify_text(summarized_text)
        results.update({
            'Summarized Text': summarized_text,
            'Translated Text': translated_text,
            'Classification Result': classification_result
        })
    else:
        results.update({'Error': 'Unsupported language'})
    return results
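
# Example call, where "VIDEO_ID" is a placeholder for a real YouTube video ID with an
# English transcript; the returned dict carries the keys 'Language Detected',
# 'Summarized Text', 'Translated Text', and 'Classification Result':
#
#     results = process_text("VIDEO_ID")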
iface = gr.Interface(
fn=process_text,
inputs=[gr.Textbox(label="YouTube Video ID")],
outputs=[gr.JSON(label="Results")],
title="Text Processing App with YouTube Transcript",
    description="This app fetches, summarizes, translates, and classifies YouTube video transcripts based on their language. Currently, only English-to-Spanish translation is supported."
)
def main():
    iface.launch()


if __name__ == '__main__':
    main()
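
# To run locally (a sketch, assuming gradio and the other dependencies are installed):
#     python gradio_nlp_group_project.py
# then open the local URL that Gradio prints. In Colab, `iface.launch(share=True)` can be
# used instead to expose a temporary public link.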