import os
import tempfile
from multiprocessing import Pool, cpu_count

import streamlit as st
import fitz  # PyMuPDF
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration

# Load summarization pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# Load translation model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
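
# Note (sketch): Streamlit reruns this script on every interaction, so these
# module-level loads repeat per rerun. Wrapping them in a function decorated
# with st.cache_resource would keep a single cached copy per process, e.g.:
#
#   @st.cache_resource
#   def load_models():
#       summarizer = pipeline("summarization", model="Falconsai/text_summarization")
#       model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
#       tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
#       return summarizer, model, tokenizer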

# Maximum chunk length in characters (not tokens); the tokenizer call in
# translate_summary() additionally truncates over-long inputs to the model limit
max_chunk_length = 1024

# Function to split text into sentence-aligned chunks of at most max_chunk_length characters
def chunk_text(text, max_chunk_length):
    chunks = []
    current_chunk = ""
    for sentence in text.split("."):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
            if current_chunk:
                current_chunk += " "
            current_chunk += sentence
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
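
# Example (illustrative): chunk_text drops the periods and joins sentences
# with spaces until the limit would be exceeded:
#   chunk_text("One. Two. Three.", 8) -> ["One Two", "Three"]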

# Function to summarize and translate a chunk
def summarize_and_translate_chunk(chunk, lang):
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]['summary_text']

    # Translate summary
    translated_chunk = translate_summary(summary_text, lang)
    return translated_chunk

# Function to translate the summary
def translate_summary(summary, lang):
    # Chunk text if it exceeds maximum length
    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]

    # Translate each chunk
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
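        # Force the first generated token to be the target-language code so
        # mBART-50 decodes into `lang` instead of echoing the source text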
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

    return " ".join(translated_chunks)
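
# Example (illustrative): translate_summary("Quarterly revenue grew.", "fr_XX")
# returns one French string; summaries longer than max_chunk_length are
# translated piecewise and re-joined with spaces.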


# Function to read the PDF page by page, then summarize and translate each chunk
def summarize_and_translate_pdf(uploaded_file, lang):
    # Save uploaded PDF to a temporary file so fitz can open it by path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name

    try:
        doc = fitz.open(temp_file_path)
    except Exception:
        st.error("Could not open the PDF. Please make sure the file is a valid PDF.")
        os.unlink(temp_file_path)
        return []

    # Split each page's text into fixed-size character chunks
    chunks = []
    for i in range(len(doc)):
        page = doc.load_page(i)
        text = page.get_text()
        chunks.extend([text[j:j + max_chunk_length] for j in range(0, len(text), max_chunk_length)])
    doc.close()

    # Use multiprocessing to summarize and translate chunks in parallel
    with Pool(cpu_count()) as pool:
        translated_chunks = pool.starmap(summarize_and_translate_chunk, [(chunk, lang) for chunk in chunks])

    # Delete the temporary file
    os.unlink(temp_file_path)

    return translated_chunks
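
# Caveat: each Pool worker needs its own access to `summarizer`, `model`, and
# `tokenizer`. Under the "fork" start method (Linux) workers share the weights
# loaded above; under "spawn" (Windows, macOS) each worker re-imports this
# module and reloads both models, which is slow and memory-hungry.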


# Streamlit UI
st.title("PDF Summarization and Translation")

# File upload
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    # Display uploaded file
    st.write("Uploaded PDF file:", uploaded_file.name)

    # Language selection
    languages = {
        "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX", "Spanish": "es_XX",
        "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX", "Gujarati": "gu_IN", "Hindi": "hi_IN",
        "Italian": "it_IT", "Japanese": "ja_XX", "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT",
        "Latvian": "lv_LV", "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
        "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN", "Chinese": "zh_CN",
        "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN", "Persian": "fa_IR", "Hebrew": "he_IL",
        "Croatian": "hr_HR", "Indonesian": "id_ID", "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK",
        "Malayalam": "ml_IN", "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
        "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN", "Telugu": "te_IN",
        "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA", "Urdu": "ur_PK", "Xhosa": "xh_ZA",
        "Galician": "gl_ES", "Slovene": "sl_SI"
    }
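    # The values are mBART-50 target-language codes, as expected by
    # tokenizer.lang_code_to_id in translate_summary()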

    lang = st.selectbox("Select language for translation", list(languages.keys()))

    # Translate PDF
    if st.button("Summarize and Translate"):
        translated_chunks = summarize_and_translate_pdf(uploaded_file, languages[lang])
        
        # Display translated text
        st.header("Translated Summary")
        for chunk in translated_chunks:
            st.write(chunk)