import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

st.set_page_config(page_title="Khmer Text Summarization", page_icon="📝", layout="wide")

MODEL_ID = "songhieng/khmer-mt5-summarization-duplicated"
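
# Launch with the standard Streamlit CLI (app.py is an assumed filename; use
# whatever name this script is saved under):
#   streamlit run app.py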


@st.cache_resource
def load_model():
    """Load the tokenizer and model once; st.cache_resource reuses them across reruns."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
    return tokenizer, model


tokenizer, model = load_model()
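
# Note: the model and the tokenized inputs stay on CPU here; moving both to a GPU
# (when one is available) would speed up generation but is not required.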

st.title("📝 Khmer Text Summarization")
st.markdown("Paste Khmer text and get a concise summary powered by a fine-tuned mT5 model.")

st.sidebar.header("Settings")
max_length = st.sidebar.slider("Max summary length", 50, 300, 150, step=10)
min_length = st.sidebar.slider("Min summary length", 10, 100, 30, step=5)
num_beams = st.sidebar.slider("Number of beams", 1, 10, 4)
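
# The slider values feed straight into model.generate() below: max/min length bound
# the summary length in tokens, and num_beams sets the beam-search width.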

text = st.text_area("✍️ Paste Khmer text below:", height=300, placeholder="Enter your Khmer text here…")

if st.button("🚀 Summarize"):
    if not text.strip():
        st.warning("⚠️ Please enter some text.")
    else:
        with st.spinner("Summarizing..."):
            inputs = tokenizer(text, return_tensors="pt", truncation=True)
            summary_ids = model.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=2.0,
                early_stopping=True,
            )
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        st.subheader("📄 Summary")
        st.success(summary)