# Run with: streamlit run <path to this file>
import re

import streamlit as st
from langdetect import detect
from transformers import MBart50Tokenizer, AutoModelForSeq2SeqLM, pipeline


@st.cache_resource  # Cache the models across Streamlit reruns so they load only once
def load_models():
    tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    return tokenizer, summarizer


tokenizer, summarizer = load_models()

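# Map langdetect's two-letter codes to the language codes mBART-50 expects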
LANGUAGE_CODES = {
    "en": "en_XX",  # English
    "fr": "fr_XX",  # French
    "de": "de_DE",  # German
    "ru": "ru_RU",  # Russian
    "hi": "hi_IN",  # Hindi
    "mr": "mr_IN",  # Marathi
    "ja": "ja_XX",  # Japanese
}

def detect_language(text):
    lang_code = detect(text)
    return lang_code




def summarize_text(text, lang_code):
    mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX")  # Default to English if unsupported
    # Set the source language on the tokenizer so it emits the proper language
    # token, rather than prepending the code as plain text.
    tokenizer.src_lang = mbart_lang_code
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    summary_ids = summarizer.model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Force generation to start in the same language as the input
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(mbart_lang_code),
        max_length=100,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Remove any stray tag-like tokens that survive decoding
    summary = re.sub(r"<[^>]+>", "", summary).strip()
    return summary


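# --- Streamlit UI ---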
st.title("Multilingual Summarization App")


user_input = st.text_area("Enter text in any language:", "")

if st.button("Process Text"):
    if user_input.strip():
        
        lang_code = detect_language(user_input)
        st.write(f"**Detected Language Code:** {lang_code}")
        
        if lang_code not in LANGUAGE_CODES:
            st.warning(f"The detected language ({lang_code}) is not supported by the model.")
        else:
            try:
                # Summarize the text in its detected language
                summary = summarize_text(user_input, lang_code)
                st.write(f"### Summarized Text ({lang_code}):")
                st.write(summary)

            except Exception as e:
                st.error(f"An error occurred during processing: {e}")
    else:
        st.warning("Please enter some text to process.")