import streamlit as st
import fitz
from transformers import pipeline, MBart50TokenizerFast, MBartForConditionalGeneration
from multiprocessing import Pool, cpu_count
import tempfile
import os
# Load summarization pipeline
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
# Load translation model and tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
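# The "one-to-many" mMT checkpoint translates from English only; the target
# language is chosen per call via forced_bos_token_id in translate_summary().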
# Define max chunk length
max_chunk_length = 1024
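# Note: chunk length is measured in characters, not tokens; inputs that still
# exceed the model's limit are cut off by the tokenizer (truncation=True).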
# Function to chunk text
def chunk_text(text, max_chunk_length):
    chunks = []
    current_chunk = ""
    for sentence in text.split("."):
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
            if current_chunk != "":
                current_chunk += " "
            current_chunk += sentence.strip()
        else:
            chunks.append(current_chunk)
            current_chunk = sentence.strip()
    if current_chunk != "":
        chunks.append(current_chunk)
    return chunks
# Function to summarize and translate a chunk
def summarize_and_translate_chunk(chunk, lang):
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = summary[0]['summary_text']
    # Translate summary
    translated_chunk = translate_summary(summary_text, lang)
    return translated_chunk
# Function to translate the summary
def translate_summary(summary, lang):
    # Chunk text if it exceeds maximum length
    if len(summary) > max_chunk_length:
        chunks = chunk_text(summary, max_chunk_length)
    else:
        chunks = [summary]
    # Translate each chunk
    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        translated_chunks.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    return " ".join(translated_chunks)
# Function to read PDF and summarize and translate chunk by chunk
def summarize_and_translate_pdf(uploaded_file, lang):
    # Save uploaded PDF to a temporary file
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name
    try:
        doc = fitz.open(temp_file_path)
    except FileNotFoundError:
        st.error("File not found. Please make sure the file path is correct.")
        return []
    # Split each page's text into fixed-size character chunks
    total_pages = len(doc)
    chunks = []
    for i in range(total_pages):
        page = doc.load_page(i)
        text = page.get_text()
        chunks.extend([text[j:j + max_chunk_length] for j in range(0, len(text), max_chunk_length)])
    doc.close()
    # Use multiprocessing to parallelize the process
    with Pool(cpu_count()) as pool:
        translated_chunks = pool.starmap(summarize_and_translate_chunk, [(chunk, lang) for chunk in chunks])
    # Delete temporary file
    os.unlink(temp_file_path)
    return translated_chunks
# Streamlit UI
st.title("PDF Summarization and Translation")
# File upload
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    # Display uploaded file
    st.write("Uploaded PDF file:", uploaded_file.name)
    # Language selection
    languages = {
        "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX", "Spanish": "es_XX",
        "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX", "Gujarati": "gu_IN", "Hindi": "hi_IN",
        "Italian": "it_IT", "Japanese": "ja_XX", "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT",
        "Latvian": "lv_LV", "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
        "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN", "Chinese": "zh_CN",
        "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN", "Persian": "fa_IR", "Hebrew": "he_IL",
        "Croatian": "hr_HR", "Indonesian": "id_ID", "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK",
        "Malayalam": "ml_IN", "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
        "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN", "Telugu": "te_IN",
        "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA", "Urdu": "ur_PK", "Xhosa": "xh_ZA",
        "Galician": "gl_ES", "Slovene": "sl_SI"
    }
    lang = st.selectbox("Select language for translation", list(languages.keys()))
    # Translate PDF
    if st.button("Summarize and Translate"):
        translated_chunks = summarize_and_translate_pdf(uploaded_file, languages[lang])
        # Display translated text
        st.header("Translated Summary")
        for chunk in translated_chunks:
            st.write(chunk)