visualization_modules / src /data_processing.py
ever-flow's picture
Upload 12 files
bd24fca verified
Raw
History Blame Contribute Delete
7.71 kB
import pandas as pd
import numpy as np
import streamlit as st
from pathlib import Path
from config import PERIODS
from data_utils import convert_to_usd, create_multiple_classification_data
# 2. Data Load & Preโ€‘processing
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
@st.cache_data(show_spinner=True)
def load_raw_data() -> pd.DataFrame:
"""Load Excel source & basic cleaning"""
file_path = Path(__file__).with_name("heatmap_data_with_SE_v2.xlsx")
df = pd.read_excel(file_path, sheet_name='Sheet1')
# Keep rows where at least one EMSEC column is nonโ€‘null
emsec_cols = [f'EMSEC{i}' for i in range(1, 6)]
df = df[df[emsec_cols].notna().any(axis=1)].copy()
# Fallback Company column
if 'Company' not in df.columns:
df['Company'] = df['ticker']
# Map market โ†’ country
market_country_map = {
'KOSPI': 'ํ•œ๊ตญ', 'KOSDAQ': 'ํ•œ๊ตญ', 'KOSDAQ GLOBAL': 'ํ•œ๊ตญ',
'NASDAQ': '๋ฏธ๊ตญ', 'NYSE': '๋ฏธ๊ตญ',
'Prime (Domestic Stocks)': '์ผ๋ณธ', 'Standard (Domestic Stocks)': '์ผ๋ณธ',
'Prime (Foreign Stocks)': '์ผ๋ณธ'
}
df['Country'] = df['market'].map(market_country_map).fillna('Unclassified')
# Normalize market label
df['Market'] = df['market'].replace({'KOSDAQ GLOBAL': 'KOSDAQ'})
return df
def calculate_financial_metrics_with_currency_conversion(df: pd.DataFrame) -> pd.DataFrame:
"""Replicates the standalone v3.6 financialโ€‘metric pipeline."""
df = df.copy()
def safe_divide(num, den):
return num / den.replace(0, np.nan)
# โ€” absolute USD columns ---------------------------------------------------
abs_cols = ['Market Cap (2024-12-31)', 'Enterprise Value (FQ0)']
periods = PERIODS
for p in periods:
abs_cols.extend([
f'Revenue ({p})', f'EBIT ({p})', f'Net Income ({p})',
f'Total Assets ({p})', f'Equity ({p})', f'Total Liabilities ({p})',
f'Net Debt ({p})', f'Depreciation ({p})', f'Dividends ({p})',
f'Net Income After Minority ({p})'
])
for col in abs_cols:
if col in df.columns:
df[f'{col}_USD'] = df.apply(lambda r: convert_to_usd(r[col], r['Country']), axis=1)
# โ€” derived metrics --------------------------------------------------------
for p in periods:
mc = 'Market Cap (2024-12-31)_USD'
ebit_usd = f'EBIT ({p})_USD'
dep_usd = f'Depreciation ({p})_USD'
nd_usd = f'Net Debt ({p})_USD'
ev_usd = 'Enterprise Value (FQ0)_USD'
if ebit_usd in df.columns and dep_usd in df.columns:
df[f'EBITDA ({p})_USD'] = df[ebit_usd] + df[dep_usd]
if mc in df.columns and nd_usd in df.columns:
df[ev_usd] = df.get(ev_usd, np.nan)
mask = df[ev_usd].isna()
df.loc[mask, ev_usd] = df.loc[mask, mc] + df.loc[mask, nd_usd].fillna(0)
ni_usd = f'Net Income ({p})_USD'
eq_usd = f'Equity ({p})_USD'
ebitda_usd = f'EBITDA ({p})_USD'
rev_usd = f'Revenue ({p})_USD'
if mc in df.columns and ni_usd in df.columns:
df[f'PER ({p})'] = safe_divide(df[mc], df[ni_usd])
if mc in df.columns and eq_usd in df.columns:
df[f'PBR ({p})'] = safe_divide(df[mc], df[eq_usd])
if ev_usd in df.columns and ebitda_usd in df.columns:
df[f'EV_EBITDA ({p})'] = safe_divide(df[ev_usd], df[ebitda_usd])
if mc in df.columns and rev_usd in df.columns:
df[f'์‹œ๊ฐ€์ด์•ก/๋งค์ถœ์•ก ({p})'] = safe_divide(df[mc], df[rev_usd])
if mc in df.columns and ebit_usd in df.columns:
df[f'์‹œ๊ฐ€์ด์•ก/์˜์—…์ด์ต ({p})'] = safe_divide(df[mc], df[ebit_usd])
# โ€” ratios (local currency OK) ----------------------------------------
ni = f'Net Income ({p})'
ni_after = f'Net Income After Minority ({p})'
ni = ni_after if ni_after in df.columns else ni
assets = f'Total Assets ({p})'
liab = f'Total Liabilities ({p})'
if ni in df.columns and eq_usd.replace('_USD', '') in df.columns:
df[f'ROE ({p})'] = safe_divide(df[ni], df[eq_usd.replace('_USD', '')])
if ebit_usd.replace('_USD', '') in df.columns and rev_usd.replace('_USD', '') in df.columns:
df[f'์˜์—…์ด์ต๋ฅ  ({p})'] = safe_divide(df[ebit_usd.replace('_USD', '')], df[rev_usd.replace('_USD', '')])
if ebit_usd in df.columns and rev_usd in df.columns and dep_usd.replace('_USD', '') in df.columns:
df[f'EBITDA/Sales ({p})'] = safe_divide(df[ebit_usd.replace('_USD', '')] + df[dep_usd.replace('_USD', '')], df[rev_usd.replace('_USD', '')])
if ni in df.columns and assets.replace('_USD', '') in df.columns:
df[f'์ด์ž์‚ฐ์ด์ต๋ฅ  ({p})'] = safe_divide(df[ni], df[assets.replace('_USD', '')])
if rev_usd.replace('_USD', '') in df.columns and assets.replace('_USD', '') in df.columns:
df[f'์ž์‚ฐํšŒ์ „์œจ ({p})'] = safe_divide(df[rev_usd.replace('_USD', '')], df[assets.replace('_USD', '')])
if eq_usd.replace('_USD', '') in df.columns and assets.replace('_USD', '') in df.columns:
df[f'์ž๊ธฐ์ž๋ณธ๋น„์œจ ({p})'] = safe_divide(df[eq_usd.replace('_USD', '')], df[assets.replace('_USD', '')])
if liab.replace('_USD', '') in df.columns and eq_usd.replace('_USD', '') in df.columns:
df[f'๋ถ€์ฑ„๋น„์œจ ({p})'] = safe_divide(df[liab.replace('_USD', '')], df[eq_usd.replace('_USD', '')])
return df
def prepare_streamlit_data(df: pd.DataFrame) -> pd.DataFrame:
"""Convert wide โ†’ long for multiโ€‘year access in Streamlit."""
rows = []
for yr in PERIODS:
tmp = df.copy()
tmp['Year'] = yr
mapping = {
f'PER ({yr})': 'PER',
f'PBR ({yr})': 'PBR',
f'EV_EBITDA ({yr})': 'EV_EBITDA',
f'ROE ({yr})': 'ROE',
f'์˜์—…์ด์ต๋ฅ  ({yr})': '์˜์—…์ด์ต๋ฅ ',
f'EBITDA/Sales ({yr})': 'EBITDA/Sales',
f'์ด์ž์‚ฐ์ด์ต๋ฅ  ({yr})': '์ด์ž์‚ฐ์ด์ต๋ฅ ',
f'์ž์‚ฐํšŒ์ „์œจ ({yr})': '์ž์‚ฐํšŒ์ „์œจ',
f'์ž๊ธฐ์ž๋ณธ๋น„์œจ ({yr})': '์ž๊ธฐ์ž๋ณธ๋น„์œจ',
f'๋ถ€์ฑ„๋น„์œจ ({yr})': '๋ถ€์ฑ„๋น„์œจ',
f'์‹œ๊ฐ€์ด์•ก/๋งค์ถœ์•ก ({yr})': '์‹œ๊ฐ€์ด์•ก/๋งค์ถœ์•ก',
f'์‹œ๊ฐ€์ด์•ก/์˜์—…์ด์ต ({yr})': '์‹œ๊ฐ€์ด์•ก/์˜์—…์ด์ต',
f'Net Income ({yr})_USD': 'Net_Income',
f'EBITDA ({yr})_USD': 'EBITDA',
f'Revenue ({yr})_USD': 'Sales',
f'Total Assets ({yr})_USD': 'Assets',
f'Equity ({yr})_USD': 'Book'
}
for old, new in mapping.items():
if old in tmp.columns:
tmp[new] = tmp[old]
rows.append(tmp)
return pd.concat(rows, ignore_index=True)
@st.cache_data(show_spinner=True)
def load_processed_data() -> pd.DataFrame:
raw = load_raw_data()
metrics = calculate_financial_metrics_with_currency_conversion(raw)
# Identify companies with any missing financials ------------------------
non_financial_keywords = ['EMSEC', 'EMTEC', 'ticker', 'market', 'Country', 'Market', 'name', 'Company']
fin_cols = [c for c in metrics.columns if not any(k in c for k in non_financial_keywords)]
company_missing = metrics.groupby('ticker')[fin_cols].apply(lambda x: x.isnull().values.any())
metrics['has_missing_financials'] = metrics['ticker'].map(company_missing)
expanded = create_multiple_classification_data(metrics)
return prepare_streamlit_data(expanded)