# LemmaLi / app.py
# Sarpyy's picture
# Update app.py
# 4718ec2 verified
import os
import re
import tempfile

import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# =========================================================================
# 1. Constants and Model Loading
# =========================================================================
HF_MODEL_ID = "LiProject/BERT-Turkish-Lemmatization-V3"

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# The tokenizer and model are loaded once at import time. Any failure is
# reported on stdout and the process exits with status 1.
try:
    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(HF_MODEL_ID)
    mdl = mdl.to(DEVICE)
    mdl = mdl.eval()
    print(f"Model yükleme başarılı: {HF_MODEL_ID} ({DEVICE} üzerinde)")
except Exception as e:
    print(f"Model veya Tokenizer yüklenirken kritik hata oluştu: {e}")
    raise SystemExit(1)
# =========================================================================
# 2. Background Processing
# =========================================================================

# Punctuation trimmed from both ends of a token before it is lemmatized.
_EDGE_PUNCTUATION = ".,!?();:\"'’"

# A number (digits with an optional single "." or "," part) followed by an
# optional apostrophe/period and a run of Turkish/ASCII letters, for example
# "2'şerli" or "15.30'da". Group 1 captures only the numeric part.
# Compiled once here because the function below runs once per word.
_NUMERIC_TOKEN_RE = re.compile(
    r"^(\d+(?:[.,]\d+)?)(?:['’.]?[a-zA-ZğüşıöçĞÜŞİÖÇ]*)$"
)


def get_lemma_for_word(word: str) -> str:
    """Return the lemma of a single whitespace-delimited token.

    The token is first stripped of leading/trailing punctuation. Then:

    * if nothing is left (the token was pure punctuation or empty), the
      original token is returned unchanged;
    * if the token is a number with an optional letter suffix, only the
      numeric part is returned and the model is not called;
    * otherwise the cleaned token is sent to the seq2seq model and the decoded
      output is returned, falling back to the cleaned token when the model
      produces an empty string.

    Args:
        word: One token, typically produced by ``str.split()``.

    Returns:
        The lemma (or pass-through value) as a string.
    """
    clean_word = word.strip(_EDGE_PUNCTUATION)
    if not clean_word:
        return word

    numeric = _NUMERIC_TOKEN_RE.match(clean_word)
    if numeric:
        return numeric.group(1)

    inputs = tok(
        clean_word, return_tensors="pt", truncation=True, max_length=128
    ).to(DEVICE)
    outputs = mdl.generate(**inputs, max_length=128)
    lemma = tok.decode(outputs[0], skip_special_tokens=True).strip()
    return lemma if lemma else clean_word
@torch.inference_mode()
def lemmatize_rows(multiline_text: str):
    """Lemmatize every word of every non-blank line of *multiline_text*.

    Each non-blank line (after stripping) is treated as one sentence and split
    on whitespace. The result has one row per word with the columns
    ``Full_Sentence``, ``Word`` and ``Lemma``. When the input contains no
    non-blank line, an empty DataFrame with those three columns is returned.
    """
    stripped_lines = (line.strip() for line in multiline_text.splitlines())
    sentences = [line for line in stripped_lines if line]
    if not sentences:
        return pd.DataFrame(columns=["Full_Sentence", "Word", "Lemma"])

    records = [
        {
            "Full_Sentence": sentence,
            "Word": token,
            "Lemma": get_lemma_for_word(token),
        }
        for sentence in sentences
        for token in sentence.split()
    ]
    return pd.DataFrame(records)
def add_sentence_separators(df: pd.DataFrame, char: str = "-", repeat: int = 10) -> pd.DataFrame:
    """Insert a divider row wherever ``Full_Sentence`` changes between rows.

    The divider row holds ``char * repeat`` in the ``Full_Sentence``, ``Word``
    and ``Lemma`` columns. An empty frame is returned as-is (same object);
    otherwise a new DataFrame is built from the rows plus dividers.
    """
    if df.empty:
        return df

    divider = char * repeat
    records = []
    last_sentence = None
    for _, row in df.iterrows():
        current_sentence = row["Full_Sentence"]
        # No divider before the very first row; afterwards one is added each
        # time the sentence value differs from the previous row's.
        if last_sentence is not None and current_sentence != last_sentence:
            records.append(
                {"Full_Sentence": divider, "Word": divider, "Lemma": divider}
            )
        records.append(row.to_dict())
        last_sentence = current_sentence
    return pd.DataFrame(records)
def run_and_save(text):
    """Lemmatize *text* and export the result as a CSV file.

    Args:
        text: Raw multi-line input; each non-blank line is one sentence.

    Returns:
        A ``(df_view, out_path)`` tuple. ``df_view`` is the per-word table with
        divider rows between sentences (for on-screen preview). ``out_path`` is
        the path of a UTF-8-with-BOM CSV holding the same table without the
        divider rows.
    """
    df = lemmatize_rows(text)
    df_view = add_sentence_separators(df, char="-", repeat=10)

    # Write into a fresh directory per call: a single fixed path in the
    # working directory would let concurrent requests overwrite each other's
    # file before it is served. The file name itself stays "lemma_output.csv".
    # NOTE: the directory is not removed here; cleanup is left to the OS/host.
    out_dir = tempfile.mkdtemp(prefix="lemma_output_")
    out_path = os.path.join(out_dir, "lemma_output.csv")
    df.to_csv(out_path, index=False, encoding="utf-8-sig")
    return df_view, out_path
# Sample inputs offered in the UI; each entry is one sentence.
examples = [
    "Yolcular, zorlu yollarda yolculuk yaparken yoldan çıkmamaya özen gösterirler.",
    "Öğrenciler 2'şerli gruplar halinde 15.30'da içeri alındılar.",
    "Benimki seninkinden daha güzelmiş, dedi usulca.",
]
# =========================================================================
# 3. Gradio Interface
# =========================================================================
# Soft theme with a blue primary hue and slate secondary/neutral hues.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="slate")
# Stylesheet for the page. It:
#   * caps the container at 1100px and centers it;
#   * gives the #input_text textarea a minimum height and larger type;
#   * makes #results_table scrollable (max 420px) with a fixed layout and
#     wrapped cell text;
#   * styles the .main-title, .sub-text and .info-box HTML fragments;
#   * hides the page footer.
# The string is CSS, not Python: do not add Python comments inside it.
custom_css = """
.gradio-container {
max-width: 1100px !important;
margin: 0 auto !important;
padding-top: 20px !important;
}
#input_text textarea {
min-height: 190px !important;
font-size: 15px !important;
line-height: 1.5 !important;
}
#results_table {
max-height: 420px !important;
overflow: auto !important;
}
#results_table table {
table-layout: fixed !important;
width: 100% !important;
}
#results_table th, #results_table td {
white-space: normal !important;
word-break: break-word !important;
}
.main-title {
text-align: center;
margin-bottom: 4px;
}
.sub-text {
text-align: center;
opacity: 0.9;
margin-bottom: 18px;
}
.info-box {
border: 1px solid #cbd5e1;
border-radius: 14px;
padding: 14px 16px;
margin-top: 12px;
margin-bottom: 16px;
background: rgba(148,163,184,0.08);
}
footer {
visibility: hidden !important;
}
"""
# Static page header (title + one-line description).
_HEADER_HTML = """
<div class="main-title">
<h1>Türkçe Lemmatization Aracı</h1>
</div>
<div class="sub-text">
Türkçe cümleleri kelime kelime işleyerek köklerini çıkarır ve CSV olarak indirmenizi sağlar.
</div>
"""

# Info box naming the model in use and summarising how text is processed.
_INFO_HTML = f"""
<div class="info-box">
<b>Model:</b> {HF_MODEL_ID}<br>
<b>Çalışma mantığı:</b> Metin satır satır, her satır da kelime kelime işlenir.<br>
<b>Not:</b> Arayüzde nadiren Türkçe karakter görüntüleme farkları olabilir; model mantığında Türkçe desteği korunur.
</div>
"""


def _reset_ui():
    """Return cleared values for the input box, the result table and the file."""
    return "", None, None


# NOTE(review): the layout nesting below was reconstructed from a source whose
# indentation had been flattened; confirm which components belong in which
# column and that the result widgets sit below the row.
with gr.Blocks(title="Türkçe Lemmatizer") as demo:
    gr.HTML(_HEADER_HTML)
    gr.HTML(_INFO_HTML)

    with gr.Row():
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="Metin Girişi",
                placeholder="Buraya bir veya birden fazla Türkçe cümle yazın...",
                lines=8,
                elem_id="input_text",
            )
            gr.Examples(
                examples=[[e] for e in examples],
                inputs=inp,
                label="Örnek girdiler",
            )
        with gr.Column(scale=1):
            btn = gr.Button("Kökleri Bul", variant="primary")
            clr = gr.Button("Temizle", variant="secondary")

    out_tbl = gr.Dataframe(
        headers=["Full_Sentence", "Word", "Lemma"],
        label="Sonuç Önizleme",
        interactive=False,
        wrap=True,
        elem_id="results_table",
    )
    out_file = gr.File(label="CSV Çıktısı")

    # The button click and pressing Enter in the textbox both run the same
    # pipeline; they are registered as two separate listeners, in this order.
    for register in (btn.click, inp.submit):
        register(
            fn=run_and_save,
            inputs=inp,
            outputs=[out_tbl, out_file],
        )

    clr.click(
        fn=_reset_ui,
        inputs=None,
        outputs=[inp, out_tbl, out_file],
    )
# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        theme=theme,
        css=custom_css,
    )