import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import json
# ---- Load language map ----
with open("languages.json", "r", encoding="utf-8") as f:
    LANG_MAP = json.load(f)

LANG_NAMES = list(LANG_MAP.keys())
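# Note: languages.json is assumed to map human-readable names to the language codes
# that M2M100Tokenizer.get_lang_id() understands, e.g. an illustrative subset could be
# {"English": "en", "Hindi": "hi", "French": "fr", "Arabic": "ar"}; the actual file
# ships with the Space and may list many more languages.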
# ---- Translation setup (M2M100 supports ~100 languages) ----
TRANS_MODEL_NAME = "facebook/m2m100_418M"
trans_tokenizer = AutoTokenizer.from_pretrained(TRANS_MODEL_NAME)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(TRANS_MODEL_NAME)
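# Optional (not part of the original code): on a GPU Space, moving the model to CUDA
# speeds up generation. A minimal sketch, assuming torch is available as a transformers
# dependency; the encoded inputs in translate_text() would then also need .to(device):
#
#   import torch
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   trans_model.to(device)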
def translate_text(text, src_name, tgt_name, max_new_tokens=200):
    if not text.strip():
        return ""
    src = LANG_MAP[src_name]
    tgt = LANG_MAP[tgt_name]

    # Set source language for tokenizer
    trans_tokenizer.src_lang = src

    # Tokenize (inputs longer than 1024 tokens are truncated)
    encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)

    # Set target language via forced BOS token
    forced_bos_token_id = trans_tokenizer.get_lang_id(tgt)

    generated_tokens = trans_model.generate(
        **encoded,
        forced_bos_token_id=forced_bos_token_id,
        max_new_tokens=int(max_new_tokens),  # cast: the slider value may arrive as a float
        num_beams=4
    )
    return trans_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
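# Quick sanity check (hypothetical usage; assumes "English" and "French" are keys in
# languages.json):
#   print(translate_text("Hello, how are you?", "English", "French"))
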
# ---- Summarization setup (English-focused, small & fast) ----
SUM_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=SUM_MODEL_NAME)
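# As with the translator, a GPU could be used here by passing device=0 to pipeline()
# (optional; the Space also works on CPU, just more slowly for long inputs).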
def summarize_text(long_text, min_len=60, max_len=160):
    if not long_text.strip():
        return ""
    # DistilBART expects English and up to 1024 tokens; truncation handled by pipeline tokenizer
    summary = summarizer(long_text, min_length=int(min_len), max_length=int(max_len), truncation=True)
    return summary[0]["summary_text"]
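# Note: anything beyond DistilBART's ~1024-token window is silently truncated above.
# If full coverage of very long documents mattered, one possible extension (not
# implemented here) would be to split the text into chunks, summarize each chunk,
# and then summarize the concatenated partial summaries.
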
# ---- Gradio UI ----
with gr.Blocks(title="Multipurpose NLP (Translate + Summarize)") as demo:
    gr.Markdown(
        """
        # 🌍 Multipurpose NLP: Translation & Summarization
        - **Translate** between many languages using **M2M100 (418M)**
        - **Summarize** long English text using **DistilBART-CNN**
        - Built with **Gradio** on Hugging Face Spaces
        """
    )

    with gr.Tab("🗣️ Translate"):
        with gr.Row():
            src_lang = gr.Dropdown(choices=LANG_NAMES, value="English", label="Source language")
            tgt_lang = gr.Dropdown(choices=LANG_NAMES, value="Hindi", label="Target language")

        input_text = gr.Textbox(lines=6, label="Input text")
        max_toks = gr.Slider(16, 400, value=200, step=1, label="Max new tokens")
        translate_btn = gr.Button("Translate")
        output_text = gr.Textbox(lines=6, label="Translation", interactive=False)

        translate_btn.click(
            fn=translate_text,
            inputs=[input_text, src_lang, tgt_lang, max_toks],
            outputs=[output_text]
        )

        gr.Examples(
            examples=[
                ["Hello, how are you?", "English", "French", 60],
                ["India is developing rapidly in science and technology.", "English", "Hindi", 80],
                ["مرحبا كيف حالك؟", "Arabic", "English", 60],
            ],
            inputs=[input_text, src_lang, tgt_lang, max_toks],
        )

    with gr.Tab("📝 Summarize"):
        long_input = gr.Textbox(lines=12, label="Long input text (English)")
        with gr.Row():
            min_len = gr.Slider(20, 200, value=60, step=1, label="Min summary length")
            max_len = gr.Slider(60, 400, value=160, step=1, label="Max summary length")

        sum_btn = gr.Button("Summarize")
        sum_output = gr.Textbox(lines=8, label="Summary", interactive=False)

        sum_btn.click(
            fn=summarize_text,
            inputs=[long_input, min_len, max_len],
            outputs=[sum_output]
        )

        gr.Examples(
            examples=[
                ["""Artificial intelligence (AI) has seen rapid growth over the past decade.
Advances in deep learning, transformers, and large-scale datasets have enabled breakthroughs across natural language processing, computer vision, and speech.
However, concerns about bias, interpretability, and energy consumption persist.
As AI systems are deployed in high-stakes settings, researchers and policymakers emphasize responsible AI principles such as fairness, transparency, and accountability."""],
            ],
            inputs=[long_input],
        )

if __name__ == "__main__":
    demo.launch()