| | import streamlit as st |
| | import tensorflow as tf |
| | import numpy as np |
| | import pandas as pd |
| | from transformers import AutoTokenizer |
| | from src.cross_encoder_model import CrossEncoderTF |
| | from src.mixed_cross_encoder_model import MixedDataCrossEncoderTF |
| |
|
# --- Tokenizer / model artefacts -------------------------------------------
# Hugging Face identifier handed to AutoTokenizer.from_pretrained.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"
# Saved Keras files for the text-only and the mixed-input cross-encoders.
SAVED_CROSS_ENCODER_MODEL_PATH = "src/v2_cross_encoder.keras"
SAVED_MIXED_CROSS_ENCODER_MODEL_PATH = "src/v2_mixed_data_cross_encoder.keras"
# Length every tokenized string pair is padded/truncated to.
MAX_TOKEN_LEN = 32

# --- Dataset layout ----------------------------------------------------------
DATA_FILE_PATH = "src/model_0_data.csv"
TEXT_COLS = ["STRA", "STRB"]
LABEL_COL = "DISTANCE"
# Columns that are NOT numerical model features.
EXCLUDE_COLS = [*TEXT_COLS, LABEL_COL, "FILLER"]
# Initial feature count; the script later overwrites this with the number of
# feature columns actually present in the loaded CSV.
NUMERICAL_FEATURE_DIM = 5132

# Directory passed as ``cache_dir`` when loading the tokenizer.
CACHE_DIR = "./.cache"
| |
|
@st.cache_data
def load_data():
    """Read the evaluation CSV at ``DATA_FILE_PATH`` into a DataFrame.

    The file is parsed with ``,`` as the decimal separator. The result is
    cached through ``st.cache_data``. When the file does not exist, an error
    is shown in the UI and the script run is stopped via ``st.stop()``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    read_options = {"decimal": ',', "low_memory": False}
    try:
        frame = pd.read_csv(DATA_FILE_PATH, **read_options)
    except FileNotFoundError:
        missing_file_message = (
            f"Veri dosyası bulunamadı: {DATA_FILE_PATH}. Lütfen dosyanın uygulamanın çalıştığı dizinde olduğundan emin olun."
        )
        st.error(missing_file_message)
        st.stop()
    return frame
| |
|
@st.cache_resource
def load_models_and_tokenizer():
    """Load the tokenizer and the two saved Keras cross-encoder models.

    The tokenizer is fetched for ``MODEL_NAME`` (using ``CACHE_DIR`` as its
    cache directory); each model is deserialized with the custom class it was
    saved with. The whole result is cached through ``st.cache_resource``.

    Returns:
        A ``(tokenizer, cross_encoder_model, mixed_cross_encoder_model)``
        tuple.
    """
    text_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

    text_only_objects = {'CrossEncoderTF': CrossEncoderTF}
    text_only_model = tf.keras.models.load_model(
        SAVED_CROSS_ENCODER_MODEL_PATH,
        custom_objects=text_only_objects,
    )

    # NOTE(review): 'numerical_feature_dim' maps to an int rather than a
    # class/function; kept as in the original -- confirm the model's
    # deserialization actually relies on it.
    mixed_objects = {
        'MixedDataCrossEncoderTF': MixedDataCrossEncoderTF,
        'numerical_feature_dim': NUMERICAL_FEATURE_DIM,
    }
    mixed_model = tf.keras.models.load_model(
        SAVED_MIXED_CROSS_ENCODER_MODEL_PATH,
        custom_objects=mixed_objects,
    )

    return text_tokenizer, text_only_model, mixed_model
| |
|
# Startup: load the dataset, derive the numerical feature columns from it,
# then load the tokenizer and both models. NUMERICAL_FEATURE_DIM is refreshed
# BEFORE the models are loaded because the loader reads it.
try:
    df_data = load_data()
    # Every column that is not text, label or filler counts as a feature.
    numerical_feature_cols = df_data.columns.drop(EXCLUDE_COLS).tolist()
    NUMERICAL_FEATURE_DIM = len(numerical_feature_cols)
    (
        tokenizer,
        cross_encoder_model,
        mixed_cross_encoder_model,
    ) = load_models_and_tokenizer()
except Exception as load_error:
    # Any loading failure is surfaced in the UI and halts this script run.
    st.error(f"Yüklenirken bir hata oluştu: {load_error}")
    st.stop()
| |
|
def predict(model, tokenizer, str_a, str_b, numerical_features=None, *, max_length=None):
    """Score one string pair with a cross-encoder model.

    The pair is tokenized together (padded/truncated to a fixed length) and
    fed to ``model.predict`` as a dict with ``input_ids`` and
    ``attention_mask``; when numerical features are supplied they are added
    under ``numerical_features`` as a single float32 row.

    Args:
        model: Object exposing ``predict(inputs)`` whose result is indexable
            as ``result[0][0]``.
        tokenizer: Callable accepting ``(str_a, str_b, max_length=...,
            padding=..., truncation=..., return_tensors=...)`` and returning
            a mapping with ``input_ids`` and ``attention_mask``.
        str_a: First string of the pair.
        str_b: Second string of the pair.
        numerical_features: Optional 1-D sequence of numeric values (ndarray,
            list or pandas Series). ``None`` omits the input entirely.
        max_length: Token length to pad/truncate to. Defaults to the
            module-level ``MAX_TOKEN_LEN`` when not given.

    Returns:
        The first element of the first prediction row, as a Python ``float``.
    """
    if max_length is None:
        max_length = MAX_TOKEN_LEN

    tokenized = tokenizer(
        str_a, str_b,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )

    model_input = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
    }

    if numerical_features is not None:
        # np.asarray accepts any array-like (not only ndarrays, which is all
        # ``.reshape`` supported) and converts to float32 in one step.
        feature_row = np.asarray(numerical_features, dtype='float32').reshape(1, -1)
        model_input['numerical_features'] = feature_row

    prediction = model.predict(model_input)
    return float(prediction[0][0])
| |
|
# Page chrome: configuration, title, explanatory note and architecture image.
# NOTE(review): some Streamlit versions require set_page_config to be the
# first Streamlit command of a run, and the cached loaders above may already
# have emitted UI elements -- confirm against the deployed Streamlit version.
st.set_page_config(page_title="Varlık Benzerlik Testi", layout="centered")
st.title("İki Model Karşılaştırmalı Varlık Benzerlik Test Arayüzü")

# Adjacent string literals are concatenated verbatim, so each fragment must
# carry its own separating whitespace (the space after "karşılaştırır." was
# previously missing, gluing the parenthetical onto the sentence).
st.info(
    "Bu uygulama, metinsel verileri kullanarak iki varlığın "
    "benzerlik olasılığını tahmin eder ve iki farklı modelin sonuçlarını karşılaştırır. "
    "(henüz bi-encoder mimarisi eklenmemiştir, sadece cross-encoder modeli kullanılıyor)"
    "\n\n**Cross-encoder mimarisi:** yalnızca metin1, metin2 ve distance özellikleri ile eğitilmiştir."
    "\n\n**Mixed-cross-encoder mimarisi:** metin1, metin2, distance ve numerik özellikler ile eğitilmiştir."
)

st.image("src/encoder_algorithm.png", caption="Encoder Algoritma Akışı", use_container_width=True)
| |
|
# Input section: two dependent select boxes fed from the loaded dataset.
st.header("Girdi String'leri")

# First box offers every distinct STRA value.
stra_options = df_data['STRA'].unique()
str_a_input = st.selectbox("String A (STRA)", stra_options)

# Second box is restricted to the STRB values that occur together with the
# chosen STRA, so only pairs present in the data can be selected.
filtered_strb_options = df_data.loc[df_data['STRA'] == str_a_input, 'STRB'].unique()
str_b_input = st.selectbox("String B (STRB)", filtered_strb_options)
| |
|
# Similarity strictly above this cut-off is reported as "BENZER" (similar).
SIMILARITY_THRESHOLD = 0.5


def _decision_label(similarity):
    """Return the verdict text shown for a similarity score."""
    return "BENZER" if similarity > SIMILARITY_THRESHOLD else "BENZER DEĞİL"


def _decision_color(similarity):
    """Return the markdown color name used for a similarity verdict."""
    return 'blue' if similarity > SIMILARITY_THRESHOLD else 'red'


def _format_actual(value):
    """Format a ground-truth value to four decimals, or "N/A" when it is NaN."""
    return "N/A" if np.isnan(value) else f"{value:.4f}"


# Button handler: score the selected pair with both models and show the
# predictions next to the labelled distance from the dataset.
if st.button("Benzerliği Hesapla", type="primary"):
    if not str_a_input or not str_b_input:
        st.error("Lütfen her iki string alanını da seçin.")
    else:
        with st.spinner("Tahminler yapılıyor..."):
            # Look the pair up once; the same row supplies both the numerical
            # features and the ground-truth label.
            pair_mask = (df_data['STRA'] == str_a_input) & (df_data['STRB'] == str_b_input)
            selected_row = df_data[pair_mask]
            if selected_row.empty:
                st.error("Seçilen string'lere ait veri bulunamadı. Lütfen farklı seçimler yapın.")
                st.stop()

            numerical_features_for_prediction = selected_row[numerical_feature_cols].iloc[0].values

            # The models output a distance; similarity is its complement.
            cross_encoder_distance_score = predict(
                cross_encoder_model, tokenizer, str_a_input, str_b_input
            )
            cross_encoder_similarity_score = 1 - cross_encoder_distance_score

            mixed_cross_encoder_distance_score = predict(
                mixed_cross_encoder_model, tokenizer, str_a_input, str_b_input,
                numerical_features_for_prediction,
            )
            mixed_cross_encoder_similarity_score = 1 - mixed_cross_encoder_distance_score

            # The label itself may still be NaN in the data, hence the
            # "N/A" formatting below.
            actual_distance = selected_row[LABEL_COL].iloc[0]
            actual_similarity = 1 - actual_distance
            actual_similarity_text = _format_actual(actual_similarity)
            actual_distance_text = _format_actual(actual_distance)

            # (display name, predicted distance, predicted similarity)
            model_results = [
                (
                    "Cross-Encoder Model",
                    cross_encoder_distance_score,
                    cross_encoder_similarity_score,
                ),
                (
                    "Mixed Cross-Encoder Model",
                    mixed_cross_encoder_distance_score,
                    mixed_cross_encoder_similarity_score,
                ),
            ]

            st.subheader("Karşılaştırmalı Sonuçlar")

            results_data = {
                "Özellik": [
                    "Tahmin Edilen Benzerlik",
                    "Gerçek Benzerlik",
                    "Tahmin Edilen Mesafe",
                    "Gerçek Mesafe",
                    "Karar",
                ],
            }
            for model_name, distance_score, similarity_score in model_results:
                results_data[model_name] = [
                    f"{similarity_score:.4f}",
                    actual_similarity_text,
                    f"{distance_score:.4f}",
                    actual_distance_text,
                    _decision_label(similarity_score),
                ]
            results_df = pd.DataFrame(results_data).set_index("Özellik")
            st.dataframe(results_df)

            st.markdown("---")
            for model_name, _distance_score, similarity_score in model_results:
                st.markdown(
                    f"**{model_name} Kararı:** `{str_a_input}` ve `{str_b_input}` kelimeleri "
                    f"**:{_decision_color(similarity_score)}[{_decision_label(similarity_score)}]**."
                )
| |
|