# -*- coding: utf-8 -*-
"""Gradio NLP Group Project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1hDGMwj7G7avlxrqmXe6SIN9LjLRRsuqE
"""
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer, AutoModelForSequenceClassification
import torch
class TextProcessor:
    """Wraps the three NLP steps applied to a transcript: summarization, translation, and classification."""

    def __init__(self, text):
        self.text = text

    def summarize_text(self, text):
        # Summarize with a Pegasus model fine-tuned on lecture transcripts.
        tokenizer = AutoTokenizer.from_pretrained("cranonieu2021/pegasus-on-lectures")
        model = AutoModelForSeq2SeqLM.from_pretrained("cranonieu2021/pegasus-on-lectures")
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True)
        summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

    def translate_text(self, text):
        # Translate the English summary to Spanish with a fine-tuned MarianMT model.
        model_name = "sfarjebespalaia/enestranslatorforsummaries"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # tokenizer.prepare_seq2seq_batch is deprecated in recent transformers releases;
        # calling the tokenizer directly produces the same encoded batch.
        tokenized_text = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**tokenized_text)
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        return translated_text

    def classify_text(self, text):
        # Classify the summary into one of five subject areas with a fine-tuned RoBERTa model.
        model_name = "gserafico/roberta-base-finetuned-classifier-roberta1"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
        labels = {
            0: 'Social Sciences',
            1: 'Arts',
            2: 'Natural Sciences',
            3: 'Business and Law',
            4: 'Engineering and Technology'
        }
        return labels[predicted_class_idx]
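
# Minimal usage sketch for TextProcessor, assuming an English transcript string is already
# in hand. Each call pulls the named model from the Hugging Face Hub, so the first run
# downloads weights unless they are already cached locally:
#
#     processor = TextProcessor(transcript_text)
#     summary = processor.summarize_text(transcript_text)
#     spanish = processor.translate_text(summary)
#     subject = processor.classify_text(summary)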
def get_transcript(video_id):
    """Fetch the English transcript for a YouTube video, if one is available."""
    transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
    available_languages = []
    for transcript in transcripts:
        language_details = {
            'Language': transcript.language,
            'Language Code': transcript.language_code,
            'Is Generated': transcript.is_generated,
            'Is Translatable': transcript.is_translatable
        }
        available_languages.append(language_details)
    available_languages = [entry['Language Code'] for entry in available_languages if entry['Language Code'] == 'en']
    if 'en' in available_languages:
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        transcript_text = ' '.join([segment['text'] for segment in transcript_list])
        return transcript_text, 'en'
    # Return a (message, language) pair so callers can always unpack two values.
    return 'Transcript in unsupported language.', None
def process_text(video_id):
    transcript, language = get_transcript(video_id)
    if language is None:
        # No English transcript was found; surface the message from get_transcript.
        return {"Error": transcript, "Language Detected": "None"}
    processor = TextProcessor(transcript)
    results = {"Language Detected": language}  # Include language in the output for debugging
    if language == 'en':
        summarized_text = processor.summarize_text(transcript)
        translated_text = processor.translate_text(summarized_text)
        classification_result = processor.classify_text(summarized_text)
        results.update({
            'Summarized Text': summarized_text,
            'Translated Text': translated_text,
            'Classification Result': classification_result
        })
    else:
        results.update({'Error': 'Unsupported language'})
    return results
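
# Example call, where "VIDEO_ID" is a placeholder for a real YouTube video ID with an
# English transcript; the returned dict carries the keys 'Language Detected',
# 'Summarized Text', 'Translated Text', and 'Classification Result':
#
#     results = process_text("VIDEO_ID")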
iface = gr.Interface(
fn=process_text,
inputs=[gr.Textbox(label="YouTube Video ID")],
outputs=[gr.JSON(label="Results")],
title="Text Processing App with YouTube Transcript",
    description="This app fetches, summarizes, translates, and classifies YouTube video transcripts based on their language. Currently, only English-to-Spanish translation is supported."
)
def main():
    iface.launch()


if __name__ == '__main__':
    main()
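
# To run locally (a sketch, assuming gradio and the other dependencies are installed):
#     python gradio_nlp_group_project.py
# then open the local URL that Gradio prints. In Colab, `iface.launch(share=True)` can be
# used instead to expose a temporary public link.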