Pratham06 committed · Commit 02f42df · verified · 1 Parent(s): a53851e

Upload 4 files
Files changed (4)
  1. README.md +34 -10
  2. app.py +107 -0
  3. languages.json +51 -0
  4. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,36 @@
- ---
- title: LLMProject
- emoji: 📈
- colorFrom: green
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Multipurpose NLP Web App (Gradio on Hugging Face Spaces)
+
+ An all-in-one **Translation** + **Summarization** app:
+
+ - **Translation**: [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) (~100 languages)
+ - **Summarization**: [sshleifer/distilbart-cnn-12-6](https://huggingface.co/sshleifer/distilbart-cnn-12-6) (English)
+
+ ## Deploy in 3 minutes
+
+ 1. **Create a Space**: https://huggingface.co/new-space
+    - **Space SDK**: `Gradio`
+    - **Hardware**: CPU Basic is fine (larger models just load more slowly).
+ 2. **Upload these three files**: `app.py`, `requirements.txt`, `languages.json` (via the web UI, or scripted; see the sketch below).
+ 3. **Commit** → the Space builds and starts automatically. No secrets needed.
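+
+ If you prefer to script step 2, here is a minimal sketch using the `huggingface_hub` client; `your-username/your-space` is a placeholder for your own Space id, and it assumes you are logged in via `huggingface-cli login`:
+
+ ```python
+ from huggingface_hub import HfApi
+
+ api = HfApi()
+ for name in ["app.py", "requirements.txt", "languages.json"]:
+     api.upload_file(
+         path_or_fileobj=name,                # local file
+         path_in_repo=name,                   # same name inside the Space repo
+         repo_id="your-username/your-space",  # placeholder: replace with your Space id
+         repo_type="space",
+     )
+ ```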
+
+ ## Features
+
+ - Two tabs: **Translate** and **Summarize**
+ - Dropdowns for source/target languages (49 languages pre-mapped in `languages.json`)
+ - Adjustable generation lengths
+ - Example inputs for quick testing
+
+ ## Notes
+
+ - M2M100 requires setting `src_lang` on the tokenizer and passing a `forced_bos_token_id` for the target language; `app.py` handles both for you (see the sketch below).
+ - Summarization is English-focused for speed. If you need multilingual summaries, swap the model in `app.py` for an mT5-based XLSum model (e.g., `csebuetnlp/mT5_multilingual_XLSum`); the required `sentencepiece` dependency is already in `requirements.txt`.
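+
+ For reference, the core M2M100 pattern, a condensed sketch of what `translate_text` in `app.py` does, minus the UI plumbing:
+
+ ```python
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_418M")
+
+ tok.src_lang = "en"  # source language code (same codes as languages.json)
+ batch = tok("Hello, how are you?", return_tensors="pt")
+ out = model.generate(**batch, forced_bos_token_id=tok.get_lang_id("hi"))
+ print(tok.batch_decode(out, skip_special_tokens=True)[0])  # Hindi translation
+ ```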
+
+ ## Optional: Multilingual Summarization
+
+ Replace the summarizer lines in `app.py` with:
+
+ ```python
+ SUM_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
+ summarizer = pipeline("summarization", model=SUM_MODEL_NAME, tokenizer=SUM_MODEL_NAME)
+ ```
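+
+ A quick wiring check after the swap; the Hindi sentence is illustrative, and any language in the XLSum training set should work:
+
+ ```python
+ text = "भारत विज्ञान और प्रौद्योगिकी में तेज़ी से प्रगति कर रहा है।"
+ print(summarizer(text, min_length=10, max_length=40, truncation=True)[0]["summary_text"])
+ ```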
+
  ---

+ Built for assignments and demos. Enjoy!
app.py ADDED
@@ -0,0 +1,107 @@
+
+ import gradio as gr
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+ import json
+
+ # ---- Load language map ----
+ with open("languages.json", "r", encoding="utf-8") as f:
+     LANG_MAP = json.load(f)
+
+ LANG_NAMES = list(LANG_MAP.keys())
+
+ # ---- Translation setup (M2M100 supports ~100 languages) ----
+ TRANS_MODEL_NAME = "facebook/m2m100_418M"
+ trans_tokenizer = AutoTokenizer.from_pretrained(TRANS_MODEL_NAME)
+ trans_model = AutoModelForSeq2SeqLM.from_pretrained(TRANS_MODEL_NAME)
+
+ def translate_text(text, src_name, tgt_name, max_new_tokens=200):
+     if not text.strip():
+         return ""
+     src = LANG_MAP[src_name]
+     tgt = LANG_MAP[tgt_name]
+     # Set source language for the tokenizer
+     trans_tokenizer.src_lang = src
+     # Tokenize (M2M100 accepts up to 1024 tokens)
+     encoded = trans_tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+     # Set target language via forced BOS token
+     forced_bos_token_id = trans_tokenizer.get_lang_id(tgt)
+     generated_tokens = trans_model.generate(
+         **encoded,
+         forced_bos_token_id=forced_bos_token_id,
+         max_new_tokens=int(max_new_tokens),  # Gradio sliders pass floats
+         num_beams=4,
+     )
+     return trans_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+ # ---- Summarization setup (English-focused, small & fast) ----
+ SUM_MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
+ summarizer = pipeline("summarization", model=SUM_MODEL_NAME)
+
+ def summarize_text(long_text, min_len=60, max_len=160):
+     if not long_text.strip():
+         return ""
+     # DistilBART expects English and up to 1024 tokens; truncation handled by the pipeline tokenizer
+     summary = summarizer(long_text, min_length=int(min_len), max_length=int(max_len), truncation=True)
+     return summary[0]["summary_text"]
+
+ # ---- Gradio UI ----
+ with gr.Blocks(title="Multipurpose NLP (Translate + Summarize)") as demo:
+     gr.Markdown(
+         """
+         # 🌐 Multipurpose NLP: Translation & Summarization
+         - **Translate** between many languages using **M2M100 (418M)**
+         - **Summarize** long English text using **DistilBART-CNN**
+         - Built with **Gradio** on Hugging Face Spaces
+         """
+     )
+
+     with gr.Tab("🗣️ Translate"):
+         with gr.Row():
+             src_lang = gr.Dropdown(choices=LANG_NAMES, value="English", label="Source language")
+             tgt_lang = gr.Dropdown(choices=LANG_NAMES, value="Hindi", label="Target language")
+         input_text = gr.Textbox(lines=6, label="Input text")
+         max_toks = gr.Slider(16, 400, value=200, step=1, label="Max new tokens")
+         translate_btn = gr.Button("Translate")
+         output_text = gr.Textbox(lines=6, label="Translation", interactive=False)
+
+         translate_btn.click(
+             fn=translate_text,
+             inputs=[input_text, src_lang, tgt_lang, max_toks],
+             outputs=[output_text],
+         )
+
+         gr.Examples(
+             examples=[
+                 ["Hello, how are you?", "English", "French", 60],
+                 ["India is developing rapidly in science and technology.", "English", "Hindi", 80],
+                 ["مرحبا كيف حالك؟", "Arabic", "English", 60],
+             ],
+             inputs=[input_text, src_lang, tgt_lang, max_toks],
+         )
+
+     with gr.Tab("📝 Summarize"):
+         long_input = gr.Textbox(lines=12, label="Long input text (English)")
+         with gr.Row():
+             min_len = gr.Slider(20, 200, value=60, step=1, label="Min summary length")
+             max_len = gr.Slider(60, 400, value=160, step=1, label="Max summary length")
+         sum_btn = gr.Button("Summarize")
+         sum_output = gr.Textbox(lines=8, label="Summary", interactive=False)
+
+         sum_btn.click(
+             fn=summarize_text,
+             inputs=[long_input, min_len, max_len],
+             outputs=[sum_output],
+         )
+
+         gr.Examples(
+             examples=[
+                 ["""Artificial intelligence (AI) has seen rapid growth over the past decade.
+ Advances in deep learning, transformers, and large-scale datasets have enabled breakthroughs across natural language processing, computer vision, and speech.
+ However, concerns about bias, interpretability, and energy consumption persist.
+ As AI systems are deployed in high-stakes settings, researchers and policymakers emphasize responsible AI principles such as fairness, transparency, and accountability."""],
+             ],
+             inputs=[long_input],
+         )
+
+ if __name__ == "__main__":
+     demo.launch()
languages.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "Arabic": "ar",
+   "Bengali": "bn",
+   "Bulgarian": "bg",
+   "Chinese (Simplified)": "zh",
+   "Chinese (Traditional)": "zh",
+   "Czech": "cs",
+   "Danish": "da",
+   "Dutch": "nl",
+   "English": "en",
+   "Estonian": "et",
+   "Finnish": "fi",
+   "French": "fr",
+   "German": "de",
+   "Greek": "el",
+   "Hebrew": "he",
+   "Hindi": "hi",
+   "Hungarian": "hu",
+   "Indonesian": "id",
+   "Italian": "it",
+   "Japanese": "ja",
+   "Kannada": "kn",
+   "Korean": "ko",
+   "Malay": "ms",
+   "Marathi": "mr",
+   "Norwegian": "no",
+   "Persian (Farsi)": "fa",
+   "Polish": "pl",
+   "Portuguese": "pt",
+   "Punjabi": "pa",
+   "Romanian": "ro",
+   "Russian": "ru",
+   "Sinhala": "si",
+   "Slovak": "sk",
+   "Spanish": "es",
+   "Swedish": "sv",
+   "Tamil": "ta",
+   "Telugu": "te",
+   "Thai": "th",
+   "Turkish": "tr",
+   "Ukrainian": "uk",
+   "Urdu": "ur",
+   "Vietnamese": "vi",
+   "Swahili": "sw",
+   "Yoruba": "yo",
+   "Zulu": "zu",
+   "Malayalam": "ml",
+   "Gujarati": "gu",
+   "Nepali": "ne",
+   "Filipino (Tagalog)": "tl"
+ }
requirements.txt ADDED
@@ -0,0 +1,8 @@
+
+ gradio>=4.44.0
+ transformers>=4.44.0
+ torch>=2.2.0
+ sentencepiece>=0.1.99
+ sacremoses>=0.0.53
+ accelerate>=0.33.0
+ safetensors>=0.4.2