|
import logging

import joblib
import numpy as np
import pandas as pd
import requests
import streamlit as st
import xgboost as xgb
from bs4 import BeautifulSoup
from gensim.models import FastText
|
|
|
|
|
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so loading the models unconditionally would re-read both files
# from disk on each rerun. ``st.cache_resource`` keeps a single loaded instance
# alive across reruns. On Streamlit versions that lack it we fall back to an
# identity decorator, i.e. plain uncached loading.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_fasttext_model(path):
    """Load the gensim FastText model stored at *path*."""
    return FastText.load(path)


@_cache_resource
def _load_xgboost_model(path):
    """Load the serialized prediction model stored at *path*.

    NOTE: ``joblib.load`` unpickles the file, which can execute arbitrary
    code. Only point this at model files you trust.
    """
    return joblib.load(path)


# The try/except stays outside the cached loaders so that a missing file is
# reported on every run instead of being hidden inside a cached call.
try:
    fasttext_model = _load_fasttext_model('fasttext_model.bin')
except FileNotFoundError:
    st.error("The FastText model file was not found. Please ensure 'fasttext_model.bin' and its associated files are in the correct directory.")
    st.stop()

try:
    model = _load_xgboost_model('model.pkl')
except FileNotFoundError:
    st.error("The XGBoost model file was not found. Please ensure 'model.pkl' is in the correct directory.")
    st.stop()
|
|
|
def tokenize(text):
    """Split *text* on whitespace and return the list of tokens.

    Any value that is not a ``str`` (``None``, NaN, numbers, ...) produces an
    empty list instead of raising.
    """
    if not isinstance(text, str):
        return []
    return text.split()
|
|
|
def embed_text(text_series, fasttext_model):
    """Embed each text as the mean of its token vectors.

    Args:
        text_series: Iterable of texts (for example a pandas Series). Each
            entry is split into tokens by ``tokenize``.
        fasttext_model: Object exposing ``wv`` (must support ``token in wv``
            and ``wv[token]``) and ``vector_size``.

    Returns:
        A ``numpy.ndarray`` with one row per input text. A text without any
        token known to ``wv`` is represented by a zero vector of length
        ``vector_size``. Empty input yields an array of shape
        ``(0, vector_size)``.
    """
    word_vectors = fasttext_model.wv
    dim = fasttext_model.vector_size

    embeddings = []
    for text in text_series:
        vectors = [
            word_vectors[token]
            for token in tokenize(text)
            if token in word_vectors
        ]
        embeddings.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))

    if not embeddings:
        # np.array([]) would be 1-D with shape (0,); keep the column count so
        # callers can still stack the result next to other embeddings.
        return np.empty((0, dim))
    return np.array(embeddings)
|
|
|
def preprocess_input(query, title, description, url, fasttext_model):
    """Build the model input for one (query, title, description, url) record.

    Every field is converted to ``str``; values that ``pandas.notna`` reports
    as missing become the empty string. Each field is then embedded with
    ``embed_text`` and the four embeddings are concatenated column-wise in the
    order query, title, description, url.

    Returns:
        An ``xgboost.DMatrix`` wrapping the concatenated features.
    """
    fields = [
        str(value) if pd.notna(value) else ''
        for value in (query, title, description, url)
    ]
    per_field = [
        embed_text(pd.Series([field]), fasttext_model)
        for field in fields
    ]
    return xgb.DMatrix(np.hstack(per_field))
|
|
|
def extract_title_description(url, timeout=10):
    """Fetch *url* and return its ``(title, description)`` as plain strings.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one a stalled server could block the app indefinitely.

    Returns:
        A ``(title, description)`` tuple. The title is the text of the
        ``<title>`` tag, or ``'No title found'`` when the tag is absent or
        empty. The description is the ``content`` attribute of
        ``<meta name="description">``, or ``'No description found'`` when the
        tag or attribute is absent or empty. If fetching or parsing fails for
        any reason, ``('Error extracting title',
        'Error extracting description')`` is returned instead of raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # ``Tag.string`` is None when <title> is empty or has nested markup,
        # so collect the text instead and fall back to the placeholder.
        title_tag = soup.title
        title_text = title_tag.get_text() if title_tag is not None else ''

        # ``.get`` avoids a KeyError for a <meta name="description"> tag that
        # has no ``content`` attribute.
        description_tag = soup.find('meta', attrs={'name': 'description'})
        content = description_tag.get('content') if description_tag is not None else None

        # str() detaches the results from the parsed document tree.
        title = str(title_text) if title_text else 'No title found'
        description = str(content) if content else 'No description found'
        return title, description
    except Exception:
        # Deliberate best-effort boundary: the UI expects placeholder strings
        # rather than an exception, but the failure is logged, not discarded.
        logging.getLogger(__name__).warning(
            "Could not extract title/description from %r", url, exc_info=True
        )
        return 'Error extracting title', 'Error extracting description'
|
|
|
def predict(query, title, description, url, fasttext_model):
    """Score one record with the module-level ``model``.

    The inputs are turned into features by ``preprocess_input``; the first
    value returned by ``model.predict`` is taken as the score.

    Returns:
        ``(binary_prediction, probability)`` where ``binary_prediction`` is
        ``1`` when the score is at least 0.5 and ``0`` otherwise, and
        ``probability`` is the score itself.
    """
    features = preprocess_input(query, title, description, url, fasttext_model)
    scores = model.predict(features, validate_features=False)
    probability = scores[0]
    label = 1 if probability >= 0.5 else 0
    return label, probability
|
|
|
|
|
st.title('CTR Prediction Inference')

# One tab per workflow: a single record, a CSV batch, and an A/B comparison.
tab1, tab2, tab3 = st.tabs(["Single Entry", "Batch Entry", "A/B Test"])

with tab1:
    st.header('Single Entry Inference')

    query = st.text_input('Query')
    url = st.text_input('URL')

    if st.button('Predict'):
        # Validate first so that no request is sent for an empty URL and no
        # "Error extracting ..." placeholders are shown for missing input.
        if query and url:
            title, description = extract_title_description(url)
            st.write(f'Extracted Title: {title}')
            st.write(f'Extracted Description: {description}')

            binary_result, confidence = predict(query, title, description, url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')

            # st.progress only accepts integers in [0, 100]; clamp in case the
            # model output is not strictly inside [0, 1].
            confidence_percentage = min(max(int(confidence * 100), 0), 100)
            st.progress(confidence_percentage)
        else:
            st.write('Please enter both a query and a URL.')
|
|
|
with tab2:
    st.header('Batch Entry Inference')

    uploaded_file = st.file_uploader("Upload CSV", type="csv")

    if uploaded_file is not None:
        required_columns = ['Query', 'Title', 'Description', 'URL']

        # An empty, malformed or wrongly encoded upload should produce a
        # readable message, not an unhandled traceback.
        df = None
        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            st.error(f'Could not read the uploaded CSV: {exc}')

        if df is not None:
            if set(required_columns).issubset(df.columns):
                # Iterate the four columns in lockstep. Unlike iterrows(),
                # this does not coerce the values of a row to a common dtype.
                results = [
                    predict(row_query, row_title, row_description, row_url, fasttext_model)
                    for row_query, row_title, row_description, row_url in zip(
                        df['Query'], df['Title'], df['Description'], df['URL']
                    )
                ]

                df['+/-'] = [label for label, _ in results]
                df['Conf.'] = [f"{conf:.2%}" for _, conf in results]

                # Show the prediction columns first, then the uploaded ones.
                result_columns = ['+/-', 'Conf.']
                cols = result_columns + [col for col in df.columns if col not in result_columns]
                df = df[cols]

                st.write(df)
                st.download_button(
                    "Download Predictions",
                    df.to_csv(index=False),
                    "predictions.csv",
                    mime="text/csv",
                )
            else:
                st.write('CSV must contain Query, Title, Description, and URL columns.')
|
|
|
with tab3:
    st.header('A/B Test Inference')

    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')

    # ``step`` records across reruns whether variant A has been scraped:
    # 0 = nothing scraped yet, 1 = variant A is stored in the session state.
    if 'step' not in st.session_state:
        st.session_state.step = 0

    if st.button('Scrape A/B'):
        if url:
            title_A, description_A = extract_title_description(url)
            st.session_state['title_A'] = title_A
            st.session_state['description_A'] = description_A
            st.session_state.step = 1
        else:
            # Do not send a request (and store error placeholders) for an
            # empty URL.
            st.write('Please enter a URL.')

    if st.session_state.step == 1:
        # Variant B starts out as a copy of the scraped variant A so the user
        # only has to edit the parts that should differ.
        title_B = st.text_input('Title B', value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B', value=st.session_state.get('description_A', ''))

        if st.button('Predict A/B'):
            if query and url:
                binary_result_A, confidence_A = predict(query, st.session_state['title_A'], st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(query, title_B, description_B, url, fasttext_model)

                st.write(f'Results for A: Predicted +/-: {binary_result_A}, Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, Conf.: {confidence_B:.2%}')

                # The verdict compares only the binary predictions (0 or 1).
                if binary_result_B < binary_result_A:
                    st.write("B is worse than A")
                elif binary_result_B > binary_result_A:
                    st.write("B is better than A")
                else:
                    st.write("B is the same as A")
            else:
                # Same feedback as the single-entry tab instead of silently
                # ignoring the click.
                st.write('Please enter both a query and a URL.')
|
|