"""Gradio app that turns RSS headlines into a spoken newsletter.

Pipeline: collect headline titles from the selected RSS feeds, detect each
title's language, translate titles that are not already in the requested
output language, then synthesize one audio file that imitates the voice of
an uploaded speech sample.
"""
import re

import feedparser
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
from TTS.api import TTS

# Language code (as emitted by the language detector / expected by the TTS
# model) -> language name (as used in the UI and in the T5 prompt).
language_map = {
    'en': 'English',
    'fr': 'French'
}

# Default RSS feeds offered as checkboxes in the UI: display name -> feed URL.
rss_feed_map = {
    "NY Times": 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml',
    "Fox News": 'https://moxie.foxnews.com/google-publisher/latest.xml',
    "Yahoo! News": 'https://www.yahoo.com/news/rss',
    "France 24": 'https://www.france24.com/fr/rss',
    "France Info": 'https://www.francetvinfo.fr/titres.rss'
}


def get_rss_feeds(default_choices, custom_choices):
    """Merge the selected default feeds with user-supplied feed URLs.

    Args:
        default_choices: Keys of ``rss_feed_map`` ticked in the UI (may be
            empty or ``None``).
        custom_choices: Text with one feed URL per line (may be empty or
            ``None``).

    Returns:
        A list of unique feed URLs, custom feeds first, in a stable order.

    Raises:
        KeyError: If a default choice is not a key of ``rss_feed_map``.
    """
    # Strip each line and drop blanks so a trailing newline or an empty line
    # in the textbox does not become an invalid "" URL later on.
    custom_rss_feeds = [
        line.strip()
        for line in (custom_choices or "").splitlines()
        if line.strip()
    ]
    default_rss_feeds = [rss_feed_map[key] for key in (default_choices or [])]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # headlines are read in the same order on every run.
    return list(dict.fromkeys(custom_rss_feeds + default_rss_feeds))


# RSS feeds
# Compiled once at import time instead of on every is_url() call.
_URL_PATTERN = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def is_url(string):
    """Return True if ``string`` looks like an http(s)/ftp(s) URL."""
    return _URL_PATTERN.match(string) is not None


def fetch_news(rss_feed):
    """Return the entry titles of one RSS feed.

    Args:
        rss_feed: URL of the feed.

    Returns:
        A list with the title of every feed entry that has one.

    Raises:
        ValueError: If ``rss_feed`` is not a syntactically valid URL.
    """
    if not is_url(rss_feed):
        raise ValueError(f"{rss_feed} is not a valid RSS feed.")
    feed = feedparser.parse(rss_feed)
    # Entries without a title are skipped instead of raising AttributeError.
    titles = (getattr(entry, "title", None) for entry in feed.entries)
    return [title for title in titles if title]


def fetch_news_multiple_urls(rss_feeds):
    """Return the titles of all given feeds as one flat list, in feed order."""
    return [news for rss_feed in rss_feeds for news in fetch_news(rss_feed)]


# Language_id
model_ckpt = "papluca/xlm-roberta-base-language-detection"
pipe = pipeline("text-classification", model=model_ckpt)


def _detect_supported_language(text):
    """Return the ``language_map`` code the detector scores highest for ``text``."""
    # top_k=None requests the score of every label, so the choice can be
    # restricted to supported languages. Taking only the single best label
    # fails as soon as a headline is classified as any other language.
    predictions = pipe(text, top_k=None, truncation=True)
    supported = [p for p in predictions if p['label'] in language_map]
    if not supported:
        raise ValueError(
            f"The language detector returned no supported language for: {text}"
        )
    return max(supported, key=lambda p: p['score'])['label']


def language_id(strings: list[str]):
    """Pair every string with the name of its detected language.

    Detection is restricted to the languages in ``language_map``.

    Returns:
        A list of ``(string, language_name)`` tuples, e.g.
        ``('Bonjour', 'French')``.
    """
    return [
        (string, language_map[_detect_supported_language(string)])
        for string in strings
    ]


# Translation
## Initialize T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def translate(source_text_with_id, target_language):
    """Translate one text into ``target_language`` with T5.

    Args:
        source_text_with_id: ``(text, language_name)`` pair, for example
            ``('text', 'French')``.
        target_language: Language name, one of ``language_map``'s values.

    Returns:
        The translated text, or the original text unchanged when it is
        already in ``target_language``.

    Raises:
        ValueError: If the source or target language is not supported.
    """
    source_text, source_language = source_text_with_id[0], source_text_with_id[1]
    supported_languages = language_map.values()
    # Explicit raises rather than assert: asserts are stripped under -O.
    if source_language not in supported_languages:
        raise ValueError(f"{source_language} language is not supported.")
    if target_language not in supported_languages:
        raise ValueError(f"{target_language} language is not supported.")
    # Nothing to translate; running the model on a same-language prompt
    # would only degrade the text.
    if source_language == target_language:
        return source_text
    # NOTE(review): t5-small's translation prompts appear to be documented
    # for English as the source language - confirm French -> English quality.
    prompt = f"translate {source_language} to {target_language}: " + source_text
    # Tokenize input text
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Generate translation
    translated_ids = model.generate(
        input_ids=input_ids, max_length=100, num_beams=4, early_stopping=True
    )
    # Decode translated text
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)


def translate_multiple(source_texts_with_id, target_language):
    """Apply ``translate`` to every ``(text, language_name)`` pair."""
    return [
        translate(source_text_with_id, target_language)
        for source_text_with_id in source_texts_with_id
    ]


# Speech generation
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")


def read_news(text, input, output, language):
    """Synthesize ``text`` to an audio file using the voice of a sample.

    Args:
        text: Text to read aloud.
        input: Path of the speaker sample passed to the TTS model as
            ``speaker_wav``.
        output: Path of the audio file to write.
        language: Language code, one of ``language_map``'s keys.

    Returns:
        ``output``, the path of the written file.

    Raises:
        ValueError: If ``language`` is not supported.
    """
    if language not in language_map:
        raise ValueError(f"{language} language is not supported.")
    print("speech generation starting")
    tts.tts_to_file(text=text, file_path=output, speaker_wav=input, language=language)
    print("speech generation done")
    return output


# Gradio interface
def process(radio_value, textbox_value, audio_value, checkbox_value):
    """Run the whole pipeline for one click on the "Fetch News" button.

    Args:
        radio_value: Output language name chosen in the UI.
        textbox_value: Custom feed URLs, one per line.
        audio_value: File path of the uploaded voice sample.
        checkbox_value: Names of the ticked default feeds.

    Returns:
        Path of the generated audio file.

    Raises:
        gr.Error: If a required input is missing, a feed URL is invalid, or
            no headline could be fetched.
    """
    inputs = {
        "language": radio_value,
        "rss_feed_urls": textbox_value,
        "audio": audio_value,
        "selected_feeds": checkbox_value
    }
    print("Inputs to Gradio Blocks:")
    print(inputs)

    # Validate up front so the user gets a readable message rather than a
    # failure deep inside translation or speech synthesis.
    if radio_value not in language_map.values():
        raise gr.Error("Please choose the language of the output.")
    if not audio_value:
        raise gr.Error("Please upload a sample audio of someone speaking.")

    rss_feeds = get_rss_feeds(checkbox_value, textbox_value)
    print("rss_feeds=", rss_feeds)
    if not rss_feeds:
        raise gr.Error("Please select or add at least one RSS feed.")

    try:
        news = fetch_news_multiple_urls(rss_feeds)
    except ValueError as err:
        # Surface invalid custom URLs in the UI.
        raise gr.Error(str(err)) from err
    print("news=", news[:2])
    if not news:
        raise gr.Error("No news could be fetched from the selected RSS feeds.")

    news_with_language_id = language_id(news)
    print("news_with_language_id=", news_with_language_id[:2])
    translated_news = translate_multiple(news_with_language_id, radio_value)
    print("translated_news=", translated_news[:2])

    # Map the language name back to the code expected by the TTS model.
    language = next((key for key, val in language_map.items() if val == radio_value), None)
    print("language=", language)
    all_news = ' '.join(translated_news)
    print("all_news=", all_news[:80])
    # NOTE(review): a fixed file name is shared by concurrent requests.
    output_path = "output.wav"
    return read_news(all_news, audio_value, output_path, language)


with gr.Blocks() as demo:
    gr.Markdown("Customize your newsletter and then click **Fetch News** to download the audio output.")
    with gr.Row():
        radio = gr.Radio(
            label='Choose the language of the output',
            info="If the output language doesn't match the language of an RSS feed, an AI model will take care of translation",
            # Derived from language_map so the UI and validation stay in sync.
            choices=list(language_map.values())
        )
    with gr.Row():
        textbox = gr.Textbox(
            placeholder='https://www.francetvinfo.fr/titres.rss',
            label='Add custom RSS feeds to your newsletter',
            info='The provided urls needed to be written each in a separate line'
        )
    with gr.Row():
        audio = gr.Audio(
            label="Upload a sample audio of someone speaking. The voice of the output will match the voice of the input.",
            type='filepath'
        )
    with gr.Row():
        checkboxgroup = gr.CheckboxGroup(
            # Derived from rss_feed_map so every checkbox has a feed URL.
            list(rss_feed_map),
            label="RSS feeds",
            info="Default RSS feeds"
        )
    with gr.Row():
        btn = gr.Button(value='Fetch News')
    with gr.Row():
        out = gr.DownloadButton("📂 Click to download file")
    btn.click(
        fn=process,
        inputs=[radio, textbox, audio, checkboxgroup],
        outputs=out
    )

if __name__ == "__main__":
    demo.launch(debug=True)