FinSentLLM / FPB_Structured_Financial_Semantics.py

Upload 5 files

23a8f6d verified 4 months ago

5.16 kB


	import re
	from typing import Dict, List

	def _compile(patterns: List[str], flags=re.IGNORECASE):
	    """Compile every regex source in *patterns* using *flags*.

	    Args:
	        patterns: Regular-expression source strings.
	        flags: ``re`` flags applied to each pattern (case-insensitive by default).

	    Returns:
	        A list of compiled pattern objects, in the same order as *patterns*.
	    """
	    return [re.compile(p, flags=flags) for p in patterns]

	def _any_match(text: str, regs) -> bool:
	    """Return True if at least one compiled pattern in *regs* is found in *text*."""
	    return any(r.search(text) for r in regs)

	# Operators per FinSentLLM Table 1
	#
	# NOTE: alternatives inside a group are separated by a bare ``|``. Writing
	# ``\|`` would match a literal pipe character and silently disable the
	# pattern for ordinary text.
	_COMPARATIVE = _compile([
	    r"\bcompared\s+to\b",
	    r"\bcompared\s+with\b",
	    r"\bversus\b",
	    r"\bvs\.?\b",
	    # "from <number>[unit] to <number>[unit]". The whitespace before the
	    # optional unit is ``\s*`` so that both "5%" and "5 percent" are accepted
	    # and a missing unit does not demand an extra space.
	    r"\bfrom\s+[-+]?\d+(?:\.\d+)?\s*(?:%|percent|percentage|[A-Za-z]+)?\s+to\s+[-+]?\d+(?:\.\d+)?\s*(?:%|percent|percentage|[A-Za-z]+)?\b",
	    # Generic "from <token> to <token>" with single, space-free tokens.
	    r"\bfrom\s+[A-Za-z0-9\.,%-]+\s+to\s+[A-Za-z0-9\.,%-]+\b",
	])

	_LOSS_IMPROVE = _compile([
	    r"\bloss(?:es)?\s+(?:narrowed|shr[aou]nk|decreased|fell|reduced)\b",
	    r"\bturn(?:ed)?\s+to\s+(?:profit|black)\b",
	])
	_LOSS_WORSEN = _compile([
	    r"\bloss(?:es)?\s+(?:widened|grew|increased|rose|deepened)\b",
	    r"\bturn(?:ed)?\s+to\s+(?:loss|red)\b",
	])

	# Each table below pairs "<subject> ... <direction>" with the reversed order
	# "<direction> ... <subject>" so either word order in a sentence is caught.
	_PROFIT_UP = _compile([
	    r"\b(profit|profits|net\s+income|earnings|ebit|ebitda|eps|roe|roi|return(?:s)?(?:\s+on\s+equity)?)\b.*\b(rose|grew|increased|up|higher|improved|jumped|surged|soared)\b",
	    r"\b(rose|grew|increased|up|higher|improved|jumped|surged|soared)\b.*\b(profit|profits|net\s+income|earnings|ebit|ebitda|eps|roe|roi|return(?:s)?(?:\s+on\s+equity)?)\b",
	])

	_COST_DOWN = _compile([
	    r"\b(cost|costs|expenses|opex|operating\s+expense(?:s)?)\b.*\b(fell|declined|decreased|lower|reduced|down)\b",
	    r"\b(fell|declined|decreased|lower|reduced|down)\b.*\b(cost|costs|expenses|opex|operating\s+expense(?:s)?)\b",
	])

	_CONTRACT_FIN = _compile([
	    r"\b(agreement|deal|contract|order|purchase\s+order|framework\s+agreement)\b",
	    r"\b(bond|notes?|debenture|convertible|placement|issuance|issue|offering|ipo|follow-?on)\b",
	    r"\b(loan|credit\s+facility|credit\s+line|revolver|revolving\s+credit|financing)\b",
	])

	_UNCERTAIN = _compile([
	    r"\b(uncertain|uncertainty|cannot\s+be\s+determined|not\s+clear|unknown|unpredictable)\b",
	    r"\b(impairment|write-?down|one-?off|exceptional\s+(?:item|charge)|non-?recurring)\b",
	    r"\b(outlook\s+(?:uncertain|cloudy|cautious))\b",
	])

	_STABLE_GUIDE = _compile([
	    r"\b(expects?|expected|expects\s+to|guidance|forecast|outlook)\b.*\b(remain(?:s|ed|ing)?\s+(?:stable|unchanged)|in[-\s]?line)\b",
	    r"\b(reiterated|maintained)\s+(?:its\s+)?(guidance|forecast|outlook)\b",
	])

	_OPERATIONAL = _compile([
	    r"\b(restructuring|reorganization|spin-?off|divest(?:iture)?|asset\s+sale)\b",
	    r"\b(ban|suspension|halted|blocked|prohibited)\b",
	    r"\b(recall|probe|investigation|lawsuit|litigation|settlement)\b",
	    r"\b(layoffs?|headcount\s+reduction|cut\s+jobs|hiring\s+freeze)\b",
	])

	def extract_semantic_flags(text: str) -> Dict[str, int]:
	    """Compute the binary ``sem_*`` indicator flags for one sentence.

	    Each flag is 1 when any regex in the corresponding pattern table is found
	    in the sentence and 0 otherwise.

	    Args:
	        text: The sentence to analyse.

	    Returns:
	        A dict with nine integer (0/1) entries, in this order:
	        ``sem_compared``, ``sem_loss_improve``, ``sem_loss_worsen``,
	        ``sem_profit_up``, ``sem_cost_down``, ``sem_contract_fin``,
	        ``sem_uncertainty``, ``sem_stable_guidance``, ``sem_operational``.
	    """
	    # The tables are compiled case-insensitively already; lower-casing is kept
	    # so the text handed to the matchers is normalised exactly as before.
	    t = text.strip().lower()

	    return {
	        "sem_compared": int(_any_match(t, _COMPARATIVE)),
	        "sem_loss_improve": int(_any_match(t, _LOSS_IMPROVE)),
	        "sem_loss_worsen": int(_any_match(t, _LOSS_WORSEN)),
	        "sem_profit_up": int(_any_match(t, _PROFIT_UP)),
	        "sem_cost_down": int(_any_match(t, _COST_DOWN)),
	        "sem_contract_fin": int(_any_match(t, _CONTRACT_FIN)),
	        "sem_uncertainty": int(_any_match(t, _UNCERTAIN)),
	        "sem_stable_guidance": int(_any_match(t, _STABLE_GUIDE)),
	        "sem_operational": int(_any_match(t, _OPERATIONAL)),
	    }

	# ============================================================
	# Run directly from terminal
	# ============================================================
	# Label suffixes recognised at the end of an FPB text line ("...@positive").
	_FPB_LABELS = ("positive", "negative", "neutral")

	def parse_fpb_line(line) -> tuple:
	    """Split one ``sentence@label`` line into ``(sentence, label)``.

	    Only a label at the very end of the line (after the last ``@``) counts, so
	    an ``@positive``/``@negative``/``@neutral`` token that merely appears
	    inside the sentence cannot truncate or mislabel it.

	    Args:
	        line: One raw line of the input file, with or without a newline.

	    Returns:
	        ``(sentence, label)`` with surrounding whitespace removed. ``label`` is
	        ``""`` when the line does not end in one of the known labels, in which
	        case the whole stripped line is returned as the sentence.
	    """
	    stripped = line.strip()
	    sentence, sep, label = stripped.rpartition("@")
	    if sep and label in _FPB_LABELS:
	        return sentence.strip(), label
	    return stripped, ""

	def main():
	    """Command-line entry point: read sentences, add ``sem_*`` columns, write CSV.

	    A ``.txt`` input is parsed line by line as ``sentence@label``; any other
	    input is read as CSV and must contain the ``--text_col`` column.
	    """
	    # Imported here so that importing this module for its extractor does not
	    # require pandas.
	    import argparse
	    from pathlib import Path

	    import pandas as pd

	    parser = argparse.ArgumentParser(description="Extract Structured Financial Semantics from FPB text file.")
	    parser.add_argument("--input", required=True, help="Path to Sentences_*.txt or a CSV with text column.")
	    parser.add_argument("--out", required=True, help="Output CSV path.")
	    parser.add_argument("--text_col", default="sentence", help="Column name if input is CSV.")
	    args = parser.parse_args()

	    path = Path(args.input)
	    if path.suffix.lower() == ".txt":
	        rows = []
	        # NOTE(review): errors="ignore" silently drops bytes that are not valid
	        # UTF-8; confirm the input encoding if characters go missing.
	        with open(path, "r", encoding="utf-8", errors="ignore") as f:
	            for i, line in enumerate(f):
	                text, label = parse_fpb_line(line)
	                if text:
	                    rows.append({"id": i, args.text_col: text, "label": label})
	        # Explicit columns keep the text column present even when the file
	        # yields no rows; dict.fromkeys drops a duplicate name if --text_col
	        # collides with "id" or "label".
	        columns = list(dict.fromkeys(["id", args.text_col, "label"]))
	        df = pd.DataFrame(rows, columns=columns)
	    else:
	        df = pd.read_csv(path)
	        if args.text_col not in df.columns:
	            parser.error(f"column {args.text_col!r} not found in {path}")

	    # Apply semantic extraction
	    texts = df[args.text_col].astype(str)
	    df_feats = pd.DataFrame(
	        [extract_semantic_flags(s) for s in texts],
	        index=df.index,
	        # Take the flag names from the extractor itself so the sem_* headers
	        # are written even when there are no rows.
	        columns=list(extract_semantic_flags("")),
	    )
	    df_out = pd.concat([df, df_feats], axis=1)
	    df_out.to_csv(args.out, index=False)
	    print(f"Saved structured semantics to: {args.out}")
	    print("Columns:", [c for c in df_out.columns if c.startswith('sem_')])

	if __name__ == "__main__":
	    main()