import streamlit as st
import fitz  # PyMuPDF
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
from multiprocessing import Pool, cpu_count

# Load the summarization and translation models once and cache them across
# Streamlit reruns, instead of reloading them on every widget interaction
@st.cache_resource
def load_models():
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
    return summarizer, model, tokenizer

summarizer, model, tokenizer = load_models()

# Define max chunk length
max_chunk_length = 1024
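# Note: 1024 here counts characters, not tokens. mBART-50's encoder accepts up
# to 1024 *tokens*, so a 1024-character budget is a conservative heuristic;
# tokenizer(..., truncation=True) below guards against any remaining overflow.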

# Split text into chunks of whole sentences, each at most max_chunk_length characters
def chunk_text(text, max_chunk_length):
    chunks = []
    current_chunk = ""
    for sentence in text.split("."):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
            if current_chunk:
                current_chunk += " "
            current_chunk += sentence
        else:
            if current_chunk:  # avoid appending an empty first chunk
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
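# Illustrative example (not part of the app flow): chunking splits at sentence
# boundaries rather than mid-word, e.g.
#   chunk_text("One two. Three four. Five.", 12) -> ["One two", "Three four", "Five"]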

# Function to summarize and translate a chunk
def summarize_and_translate_chunk(chunk, lang):
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]['summary_text']

    # Translate summary
    translated_chunk = translate_summary(summary_text, lang)
    return translated_chunk

# Function to translate the summary
def translate_summary(summary, lang):
    # Chunk text if it exceeds maximum length
    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]

    # Translate each chunk
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

    return " ".join(translated_chunks)

# Read the uploaded PDF and summarize/translate it chunk by chunk
def summarize_and_translate_pdf(uploaded_file, lang):
    # Streamlit's uploader returns an in-memory file, not a filesystem path
    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
    chunks = []

    for page in doc:
        text = page.get_text()
        chunks.extend(text[j:j + max_chunk_length] for j in range(0, len(text), max_chunk_length))

    # Drop whitespace-only chunks so the summarizer never receives empty input
    chunks = [chunk for chunk in chunks if chunk.strip()]

    # Parallelize the per-chunk work across CPU cores
    with Pool(cpu_count()) as pool:
        translated_chunks = pool.starmap(summarize_and_translate_chunk, [(chunk, lang) for chunk in chunks])

    return translated_chunks
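# Caveat (assumption about the runtime): the Pool above relies on the "fork"
# start method (the Linux default), where workers inherit the already-loaded
# models. Under "spawn" (the Windows/macOS default) each worker re-imports this
# script and reloads the models, so a plain sequential loop may be safer there.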

# Streamlit UI
st.title("PDF Summarization and Translation")

# File upload
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    # Display uploaded file
    st.write("Uploaded PDF file:", uploaded_file.name)

    # Language selection
    languages = {
        "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX", "Spanish": "es_XX",
        "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX", "Gujarati": "gu_IN", "Hindi": "hi_IN",
        "Italian": "it_IT", "Japanese": "ja_XX", "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT",
        "Latvian": "lv_LV", "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
        "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN", "Chinese": "zh_CN",
        "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN", "Persian": "fa_IR", "Hebrew": "he_IL",
        "Croatian": "hr_HR", "Indonesian": "id_ID", "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK",
        "Malayalam": "ml_IN", "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
        "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN", "Telugu": "te_IN",
        "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA", "Urdu": "ur_PK", "Xhosa": "xh_ZA",
        "Galician": "gl_ES", "Slovene": "sl_SI"
    }

    lang = st.selectbox("Select language for translation", list(languages.keys()))

    # Summarize and translate on demand
    if st.button("Summarize and Translate"):
        with st.spinner("Summarizing and translating..."):
            translated_chunks = summarize_and_translate_pdf(uploaded_file, languages[lang])

        # Display translated text
        st.header("Translated Summary")
        for chunk in translated_chunks:
            st.write(chunk)
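
# To try this locally (assuming this file is saved as app.py):
#   pip install streamlit pymupdf transformers torch sentencepiece
#   streamlit run app.py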