Upload 4 files
- README.md +34 -10
- app.py +107 -0
- languages.json +51 -0
- requirements.txt +8 -0
README.md
CHANGED
# Multipurpose NLP Web App (Gradio on Hugging Face Spaces)

An all‑in‑one **Translation** + **Summarization** app:

- **Translation**: [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) (~100 languages)
- **Summarization**: [sshleifer/distilbart-cnn-12-6](https://huggingface.co/sshleifer/distilbart-cnn-12-6) (English)

## Deploy in 3 minutes

1. **Create a Space**: https://huggingface.co/new-space
   - **Space SDK**: `Gradio`
   - **Hardware**: CPU Basic is fine (larger models just load more slowly).
2. **Upload these three files**: `app.py`, `requirements.txt`, `languages.json` (via the web UI, or from a terminal as sketched below).
3. **Commit** → the Space builds and starts. No secrets needed.
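If you prefer pushing from a terminal instead of the web UI, here is a minimal sketch using `huggingface_hub` (the repo id `your-username/your-space` is a placeholder; assumes you have already run `huggingface-cli login`):

```python
from huggingface_hub import HfApi

api = HfApi()
# Create the Space if it does not exist yet (repo id is a placeholder).
api.create_repo("your-username/your-space", repo_type="space",
                space_sdk="gradio", exist_ok=True)
# Upload the app files; each commit triggers a rebuild of the Space.
for name in ["app.py", "requirements.txt", "languages.json"]:
    api.upload_file(path_or_fileobj=name, path_in_repo=name,
                    repo_id="your-username/your-space", repo_type="space")
```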
## Features

- Two tabs: **Translate** and **Summarize**
- Dropdowns for source/target languages (~45 common languages pre‑mapped)
- Adjustable generation lengths
- Example inputs for quick testing

## Notes

- M2M100 requires setting `src_lang` on the tokenizer and a `forced_bos_token_id` for the target language; `app.py` handles both (see the condensed sketch below).
- Summarization is English‑focused for speed. For multilingual summaries, swap the model in `app.py` for an mT5‑based XLSum model (e.g., `csebuetnlp/mT5_multilingual_XLSum`); the `sentencepiece` dependency it needs is already in `requirements.txt`.
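For reference, the core M2M100 call pattern, condensed from `translate_text` in `app.py`:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")

tok.src_lang = "en"  # source language code, as mapped in languages.json
encoded = tok("Hello, how are you?", return_tensors="pt")
# The forced BOS token tells M2M100 which language to generate.
out = model.generate(**encoded, forced_bos_token_id=tok.get_lang_id("fr"))
print(tok.batch_decode(out, skip_special_tokens=True)[0])
```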
## Optional: Multilingual Summarization

Replace the summarizer lines in `app.py` with:

```python
SUM_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
summarizer = pipeline("summarization", model=SUM_MODEL_NAME, tokenizer=SUM_MODEL_NAME)
```

---

Built for assignments and demos. Enjoy!
app.py
ADDED
import json

import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# ---- Load language map (display name -> M2M100 language code) ----
with open("languages.json", "r", encoding="utf-8") as f:
    LANG_MAP = json.load(f)

LANG_NAMES = list(LANG_MAP.keys())

# ---- Translation setup (M2M100 supports ~100 languages) ----
TRANS_MODEL_NAME = "facebook/m2m100_418M"
trans_tokenizer = AutoTokenizer.from_pretrained(TRANS_MODEL_NAME)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(TRANS_MODEL_NAME)

def translate_text(text, src_name, tgt_name, max_new_tokens=200):
    if not text.strip():
        return ""
    src = LANG_MAP[src_name]
    tgt = LANG_MAP[tgt_name]
    # Set the source language on the tokenizer before encoding
    trans_tokenizer.src_lang = src
    encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
    # Select the target language via the forced BOS token
    forced_bos_token_id = trans_tokenizer.get_lang_id(tgt)
    generated_tokens = trans_model.generate(
        **encoded,
        forced_bos_token_id=forced_bos_token_id,
        max_new_tokens=int(max_new_tokens),  # Gradio sliders may pass floats
        num_beams=4,
    )
    return trans_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# ---- Summarization setup (English-focused, small & fast) ----
SUM_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=SUM_MODEL_NAME)

def summarize_text(long_text, min_len=60, max_len=160):
    if not long_text.strip():
        return ""
    # DistilBART expects English; inputs beyond 1024 tokens are truncated by the pipeline tokenizer
    summary = summarizer(long_text, min_length=int(min_len), max_length=int(max_len), truncation=True)
    return summary[0]["summary_text"]

# ---- Gradio UI ----
with gr.Blocks(title="Multipurpose NLP (Translate + Summarize)") as demo:
    gr.Markdown(
        """
# 🌐 Multipurpose NLP: Translation & Summarization
- **Translate** between many languages using **M2M100 (418M)**
- **Summarize** long English text using **DistilBART-CNN**
- Built with **Gradio** on Hugging Face Spaces
"""
    )

    with gr.Tab("🗣️ Translate"):
        with gr.Row():
            src_lang = gr.Dropdown(choices=LANG_NAMES, value="English", label="Source language")
            tgt_lang = gr.Dropdown(choices=LANG_NAMES, value="Hindi", label="Target language")
        input_text = gr.Textbox(lines=6, label="Input text")
        max_toks = gr.Slider(16, 400, value=200, step=1, label="Max new tokens")
        translate_btn = gr.Button("Translate")
        output_text = gr.Textbox(lines=6, label="Translation", interactive=False)

        translate_btn.click(
            fn=translate_text,
            inputs=[input_text, src_lang, tgt_lang, max_toks],
            outputs=[output_text],
        )

        gr.Examples(
            examples=[
                ["Hello, how are you?", "English", "French", 60],
                ["India is developing rapidly in science and technology.", "English", "Hindi", 80],
                ["مرحبا كيف حالك؟", "Arabic", "English", 60],
            ],
            inputs=[input_text, src_lang, tgt_lang, max_toks],
        )

    with gr.Tab("📝 Summarize"):
        long_input = gr.Textbox(lines=12, label="Long input text (English)")
        with gr.Row():
            min_len = gr.Slider(20, 200, value=60, step=1, label="Min summary length")
            max_len = gr.Slider(60, 400, value=160, step=1, label="Max summary length")
        sum_btn = gr.Button("Summarize")
        sum_output = gr.Textbox(lines=8, label="Summary", interactive=False)

        sum_btn.click(
            fn=summarize_text,
            inputs=[long_input, min_len, max_len],
            outputs=[sum_output],
        )

        gr.Examples(
            examples=[
                ["""Artificial intelligence (AI) has seen rapid growth over the past decade.
Advances in deep learning, transformers, and large-scale datasets have enabled breakthroughs across natural language processing, computer vision, and speech.
However, concerns about bias, interpretability, and energy consumption persist.
As AI systems are deployed in high-stakes settings, researchers and policymakers emphasize responsible AI principles such as fairness, transparency, and accountability."""],
            ],
            inputs=[long_input],
        )

if __name__ == "__main__":
    demo.launch()
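A quick way to smoke-test both functions without launching the UI (a sketch; importing `app` loads both models, so the first run downloads them):

```python
# Run from the directory containing app.py and languages.json.
from app import translate_text, summarize_text

print(translate_text("Hello, how are you?", "English", "French"))
print(summarize_text("Artificial intelligence has grown rapidly. " * 20))
```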
languages.json
ADDED
{
  "Arabic": "ar",
  "Bengali": "bn",
  "Bulgarian": "bg",
  "Chinese (Simplified)": "zh",
  "Chinese (Traditional)": "zh",
  "Czech": "cs",
  "Danish": "da",
  "Dutch": "nl",
  "English": "en",
  "Estonian": "et",
  "Finnish": "fi",
  "French": "fr",
  "German": "de",
  "Greek": "el",
  "Hebrew": "he",
  "Hindi": "hi",
  "Hungarian": "hu",
  "Indonesian": "id",
  "Italian": "it",
  "Japanese": "ja",
  "Kannada": "kn",
  "Korean": "ko",
  "Malay": "ms",
  "Marathi": "mr",
  "Norwegian": "no",
  "Persian (Farsi)": "fa",
  "Polish": "pl",
  "Portuguese": "pt",
  "Punjabi": "pa",
  "Romanian": "ro",
  "Russian": "ru",
  "Sinhala": "si",
  "Slovak": "sk",
  "Spanish": "es",
  "Swedish": "sv",
  "Tamil": "ta",
  "Telugu": "te",
  "Thai": "th",
  "Turkish": "tr",
  "Ukrainian": "uk",
  "Urdu": "ur",
  "Vietnamese": "vi",
  "Swahili": "sw",
  "Yoruba": "yo",
  "Zulu": "zu",
  "Malayalam": "ml",
  "Gujarati": "gu",
  "Nepali": "ne",
  "Filipino (Tagalog)": "tl"
}
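Since M2M100 covers about 100 languages but not every ISO code, a small check like this (a sketch built on `get_lang_id`, the same call `app.py` uses) can confirm each mapped code before it is wired into the UI:

```python
import json
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
with open("languages.json", encoding="utf-8") as f:
    lang_map = json.load(f)

for name, code in lang_map.items():
    try:
        tok.get_lang_id(code)  # raises KeyError for codes M2M100 lacks
    except KeyError:
        print(f"Not supported by M2M100: {name} ({code})")
```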
requirements.txt
ADDED
gradio>=4.44.0
transformers>=4.44.0
torch>=2.2.0
sentencepiece>=0.1.99
sacremoses>=0.0.53
accelerate>=0.33.0
safetensors>=0.4.2