# CTR / app.py
# Author: dejanseo — "Update app.py" (commit 77517d8, verified)
import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import requests
from bs4 import BeautifulSoup
from gensim.models import FastText
import joblib
def _load_or_stop(loader, path, missing_msg):
    """Load a model artifact with *loader(path)*.

    If the file is missing, show *missing_msg* in the Streamlit UI and halt
    the script (st.stop raises, so this never returns in that case).
    """
    try:
        return loader(path)
    except FileNotFoundError:
        st.error(missing_msg)
        st.stop()


# Load the trained FastText model (used to embed query/title/description/URL).
fasttext_model = _load_or_stop(
    FastText.load,
    'fasttext_model.bin',
    "The FastText model file was not found. Please ensure 'fasttext_model.bin' and its associated files are in the correct directory.",
)

# Load the trained XGBoost model for the combined features.
model = _load_or_stop(
    joblib.load,
    'model.pkl',
    "The XGBoost model file was not found. Please ensure 'model.pkl' is in the correct directory.",
)
def tokenize(text):
    """Whitespace-split *text* into tokens.

    Non-string inputs (None, NaN, numbers) produce an empty token list so
    downstream embedding code never has to special-case them.
    """
    return text.split() if isinstance(text, str) else []
def embed_text(text_series, fasttext_model):
    """Return a (len(series), vector_size) array of mean FastText embeddings.

    Each entry is the average of the FastText vectors of its whitespace
    tokens; entries with no in-vocabulary tokens (or non-string values)
    fall back to a zero vector.
    """
    dim = fasttext_model.vector_size
    rows = []
    for raw in text_series:
        tokens = raw.split() if isinstance(raw, str) else []
        vecs = [fasttext_model.wv[tok] for tok in tokens if tok in fasttext_model.wv]
        rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim))
    return np.array(rows)
def preprocess_input(query, title, description, url, fasttext_model):
    """Embed the four text fields and pack them into a single-row DMatrix.

    NaN/None fields are treated as empty strings. The four per-field
    embeddings are concatenated horizontally in the fixed order
    (query, title, description, url) the model was trained on.
    """
    fields = (query, title, description, url)
    cleaned = ['' if not pd.notna(f) else str(f) for f in fields]
    parts = [embed_text(pd.Series([text]), fasttext_model) for text in cleaned]
    return xgb.DMatrix(np.hstack(parts))
def extract_title_description(url):
    """Fetch *url* and return its (title, meta description) as strings.

    Failures are soft by design: any network or parse error returns a pair
    of placeholder error strings instead of raising, so a bad URL never
    crashes the Streamlit app.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        # requests.get has NO default timeout; without one a dead server
        # hangs the UI forever.
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string if soup.title else 'No title found'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        # .get() tolerates a <meta name="description"> with no content
        # attribute, mapping it to the same fallback as a missing tag.
        description = description_tag.get('content', 'No description found') if description_tag else 'No description found'
        return title, description
    except Exception:
        return 'Error extracting title', 'Error extracting description'
def predict(query, title, description, url, fasttext_model):
    """Score one (query, title, description, url) tuple with the global model.

    Returns (binary_prediction, probability): the model's raw probability
    and its thresholding at 0.5.
    """
    features = preprocess_input(query, title, description, url, fasttext_model)
    probability = model.predict(features, validate_features=False)[0]
    return int(probability >= 0.5), probability
# Streamlit interface
st.title('CTR Prediction Inference')

tab1, tab2, tab3 = st.tabs(["Single Entry", "Batch Entry", "A/B Test"])

with tab1:
    # Single URL: scrape its title/description, then score against the query.
    st.header('Single Entry Inference')
    query = st.text_input('Query')
    url = st.text_input('URL')

    if st.button('Predict'):
        # Validate inputs BEFORE any network work — previously the URL was
        # scraped (and extraction results printed) even with empty fields.
        if query and url:
            title, description = extract_title_description(url)
            st.write(f'Extracted Title: {title}')
            st.write(f'Extracted Description: {description}')
            binary_result, confidence = predict(query, title, description, url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')
            # st.progress expects an int in [0, 100].
            st.progress(int(confidence * 100))
        else:
            st.write('Please enter both a query and a URL.')
with tab2:
    # Batch mode: score every row of an uploaded CSV and offer a download.
    st.header('Batch Entry Inference')
    uploaded_file = st.file_uploader("Upload CSV", type="csv")

    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        required_columns = ['Query', 'Title', 'Description', 'URL']
        if set(required_columns).issubset(df.columns):
            # One (prediction, confidence) pair per CSV row.
            results = [
                predict(q, t, d, u, fasttext_model)
                for q, t, d, u in zip(df['Query'], df['Title'], df['Description'], df['URL'])
            ]
            df['+/-'] = [pred for pred, _ in results]
            df['Conf.'] = [f"{conf:.2%}" for _, conf in results]
            # Surface the prediction columns first.
            leading = ['+/-', 'Conf.']
            df = df[leading + [col for col in df.columns if col not in leading]]
            st.write(df)
            st.download_button("Download Predictions", df.to_csv(index=False), "predictions.csv")
        else:
            st.write('CSV must contain Query, Title, Description, and URL columns.')
with tab3:
    # A/B test: score the page's live snippet (A) against a user-edited
    # variant (B) for the same query and URL.
    st.header('A/B Test Inference')
    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')

    # 'step' persists across Streamlit reruns:
    #   0 = nothing scraped yet; 1 = variant A captured, B editor visible.
    if 'step' not in st.session_state:
        st.session_state.step = 0

    if st.button('Scrape A/B'):
        # Variant A is whatever the page currently serves; stash it in
        # session state so it survives the rerun triggered by the button.
        title_A, description_A = extract_title_description(url)
        st.session_state['title_A'] = title_A
        st.session_state['description_A'] = description_A
        st.session_state.step = 1

    if st.session_state.step == 1:
        # Variant B is pre-filled with A so the user only edits the deltas.
        title_B = st.text_input('Title B', value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B', value=st.session_state.get('description_A', ''))

        if st.button('Predict A/B'):
            # NOTE(review): silently does nothing when query/url are empty —
            # no else branch with a prompt, unlike the Single Entry tab.
            if query and url:
                binary_result_A, confidence_A = predict(query, st.session_state['title_A'], st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(query, title_B, description_B, url, fasttext_model)
                st.write(f'Results for A: Predicted +/-: {binary_result_A}, Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, Conf.: {confidence_B:.2%}')
                # The verdict compares only the binary classes; confidences
                # are displayed but do not affect the better/worse call.
                if binary_result_A == 1 and binary_result_B == 0:
                    st.write("B is worse than A")
                elif binary_result_A == 0 and binary_result_B == 1:
                    st.write("B is better than A")
                else:
                    st.write("B is the same as A")