Dev Goyal committed on
Commit ·
c6d67ac
1
Parent(s): 6cb1c7b
Initial deployment of FinAgent
Browse files- .env.example +25 -0
- .gitignore +22 -0
- CHANGELOG.md +59 -0
- Dockerfile +40 -0
- Dockerfile.api +13 -0
- Dockerfile.ui +13 -0
- HF_README.md +76 -0
- README.md +71 -7
- backend/__init__.py +0 -0
- backend/api.py +76 -0
- core/__init__.py +0 -0
- core/config.py +20 -0
- core/earnings_tools.py +550 -0
- core/graph_builder.py +372 -0
- core/rag_tools.py +60 -0
- core/runner.py +125 -0
- core/sec_tools.py +172 -0
- core/sentiment_tools.py +54 -0
- docker-compose.yml +27 -0
- frontend/streamlit_app.py +158 -0
- requirements.txt +32 -0
- scripts/ingest.py +79 -0
- scripts/ingest_earnings_calls.py +106 -0
- scripts/main.py +40 -0
- supervisord.conf +24 -0
.env.example
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenAI-compatible LLM (default: local Ollama)
|
| 2 |
+
OPENAI_BASE_URL=http://localhost:11434/v1
|
| 3 |
+
OPENAI_API_KEY=ollama
|
| 4 |
+
OPENAI_MODEL=llama3.1
|
| 5 |
+
OPENAI_TEMPERATURE=0
|
| 6 |
+
|
| 7 |
+
# LangSmith tracing (use with python-dotenv / your shell)
|
| 8 |
+
LANGSMITH_TRACING=true
|
| 9 |
+
LANGSMITH_ENDPOINT=https://api.smith.langchain.com
|
| 10 |
+
LANGSMITH_API_KEY=<YOUR_API_KEY>
|
| 11 |
+
LANGSMITH_PROJECT=<YOUR_PROJECT_NAME>
|
| 12 |
+
|
| 13 |
+
# Alternative names LangChain also understands:
|
| 14 |
+
# LANGCHAIN_TRACING_V2=true
|
| 15 |
+
# LANGCHAIN_API_KEY=<YOUR_API_KEY>
|
| 16 |
+
# LANGCHAIN_PROJECT=<YOUR_PROJECT_NAME>
|
| 17 |
+
|
| 18 |
+
# Optional: verbose LangChain stdout (noisy; off by default)
|
| 19 |
+
# LANGCHAIN_DEBUG=true
|
| 20 |
+
|
| 21 |
+
# Earnings-call pipeline (Alpha Vantage free tier; falls back to SEC 8-K)
|
| 22 |
+
# Get a free key at https://www.alphavantage.co/support/#api-key
|
| 23 |
+
ALPHA_VANTAGE_API_KEY=demo
|
| 24 |
+
|
| 25 |
+
# HTTP API: uvicorn api:app --host 0.0.0.0 --port 8000
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*$py.class
|
| 4 |
+
agent_env/
|
| 5 |
+
venv/
|
| 6 |
+
env/
|
| 7 |
+
|
| 8 |
+
# Environment Variables
|
| 9 |
+
.env
|
| 10 |
+
|
| 11 |
+
# Vector Databases & Local AI Models
|
| 12 |
+
chroma_db/
|
| 13 |
+
*.bin
|
| 14 |
+
*.pt
|
| 15 |
+
*.safetensors
|
| 16 |
+
|
| 17 |
+
# OS Generated Files
|
| 18 |
+
.DS_Store
|
| 19 |
+
Thumbs.db
|
| 20 |
+
|
| 21 |
+
# other
|
| 22 |
+
.vscode/
|
CHANGELOG.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changelog
|
| 2 |
+
|
| 3 |
+
All notable changes to this project will be documented in this file.
|
| 4 |
+
|
| 5 |
+
## [Phase 6] - Earnings Call Analysis Integration
|
| 6 |
+
|
| 7 |
+
### Architecture
|
| 8 |
+
* **Two-Pipeline Design:** Added a new `Earnings_Agent` backed by an offline ingest pipeline and a runtime inference pipeline, following the same separation pattern as the existing 10-K RAG system.
|
| 9 |
+
* **Ingest Layer (`core/earnings_tools.py`):** Fetches transcripts from Alpha Vantage (premium) or SEC 8-K filings (free fallback), normalizes them into `Prepared Remarks` and `Q&A Session` segments, extracts keyword frequency counts, and embeds everything into a dedicated ChromaDB collection.
|
| 10 |
+
* **Inference Layer:** Three new `@tool` functions — `search_earnings_call` (RAG search), `get_earnings_sentiment_divergence` (section comparison), and `get_earnings_keyword_trends` (cross-quarter keyword tracking).
|
| 11 |
+
* **Graph Extension:** Added `Earnings_Agent` to the LangGraph `members` list, planner capability map, supervisor dispatch edges, and summarizer prompt.
|
| 12 |
+
|
| 13 |
+
### Added
|
| 14 |
+
* `core/earnings_tools.py` — Combined ingest + inference module for earnings-call data.
|
| 15 |
+
* `scripts/ingest_earnings_calls.py` — CLI tool for batch ingestion (`--tickers AAPL --quarters Q1-2025 Q2-2025`).
|
| 16 |
+
* Earnings-call example button in the Streamlit sidebar for quick testing.
|
| 17 |
+
* `ALPHA_VANTAGE_API_KEY` config setting in `.env.example` and `core/config.py`.
|
| 18 |
+
* Earnings Call Insights section in the Investment Memo (Summarizer) when earnings data is present.
|
| 19 |
+
|
| 20 |
+
### Changed
|
| 21 |
+
* **Planner Prompt:** Extended with `Earnings_Agent` capability mapping for earnings-call queries.
|
| 22 |
+
* **Summarizer Prompt:** New `## Earnings Call Insights` memo section for divergence and keyword findings.
|
| 23 |
+
* **Streamlit Sidebar:** Now detects and displays `_earnings` ChromaDB collections alongside `_10k` collections.
|
| 24 |
+
|
| 25 |
+
## [Phase 4] - Multi-Agent LangGraph Integration & SEC Pipeline Hardening
|
| 26 |
+
|
| 27 |
+
### Architecture Overhaul
|
| 28 |
+
* **Transitioned to LangGraph:** Replaced the legacy `while` loop with a deterministic StateGraph "Planner-Executor" architecture.
|
| 29 |
+
* **Dual-Node Governance:** Separated routing logic into a stateless `Planner` (generates JSON task arrays) and a stateful `Supervisor` (manages task queues), eliminating LLM cognitive overload and infinite routing loops.
|
| 30 |
+
* **Separation of Concerns:** Split worker capabilities into three strict React Agents: `Quant_Agent`, `Fundamental_Agent`, and `Sentiment_Agent`.
|
| 31 |
+
|
| 32 |
+
### Added
|
| 33 |
+
* **The "Honesty Guardrail":** Implemented programmatic checks in `make_worker_node` to verify `ToolMessage` execution. If an agent attempts to answer without triggering a tool, the output is blocked to prevent hallucination.
|
| 34 |
+
* **Strict Capability Matrix:** Updated the Planner prompt to explicitly map query types (e.g., "Risks", "Supply Chain") to specific RAG tool workflows.
|
| 35 |
+
* **Broad Query Protocol:** Added fallback logic for the Planner to execute standard Quant/Sentiment tasks when users ask for "general info" on a ticker.
|
| 36 |
+
* **Pydantic Enum Enforcement:** Added strict `args_schema` to SEC tools using `Literal` types to prevent the LLM from hallucinating invalid XBRL tags.
|
| 37 |
+
|
| 38 |
+
### Fixed
|
| 39 |
+
* **The SEC "2010 Bug":** The SEC API returns unordered historical data. Added Pandas-based datetime sorting, filing deduplication, and a 2-year lookback filter to ensure only modern data is served.
|
| 40 |
+
* **The SEC "Missing Revenue" Bug:** Implemented recursive fallback logic to try `RevenueFromContractWithCustomerIncludingAssessedTax` if the standard `Revenues` GAAP tag returns 404 (fixing data retrieval for MSFT, AAPL, etc.).
|
| 41 |
+
* **ChromaDB Deprecation:** Updated imports to `langchain_chroma` and improved SEC HTML `<DOCUMENT>` regex parsing for cleaner 10-K embeddings.
|
| 42 |
+
|
| 43 |
+
## [Phase 5] - API Streaming, Web UI & Containerization
|
| 44 |
+
|
| 45 |
+
### UI & Architecture
|
| 46 |
+
* **FastAPI Backend:** Exposed the LangGraph state machine via an asynchronous `GET /chat/stream` utilizing Server-Sent Events (SSE).
|
| 47 |
+
* **Streamlit Frontend:** Built a responsive agentic UI (`streamlit_app.py`) that visually streams the intermediate ReAct reasoning blocks into dynamically expanding dropdown menus.
|
| 48 |
+
* **State Persistence:** Rewrote the UI memory loop to preserve "Agent Thoughts" sequentially so historical messages contain dropdown logs linking exactly to how the agents generated their specific conclusions.
|
| 49 |
+
* **Docker Migration:** Shipped `Dockerfile.api` and `Dockerfile.ui` bridged via a custom network inside `docker-compose.yml`, successfully moving the application payload off MacOS and onto standardized, immutable infrastructure.
|
| 50 |
+
* **LLM Engine Swap:** Migrated the brain architecture permanently to Groq's `llama-3.1-8b-instant` and `llama3-70b-versatile` endpoints for incredibly fast serverless inference.
|
| 51 |
+
|
| 52 |
+
### Performance & Token Thrashing Fixes
|
| 53 |
+
* **LRU Caching:** Prevented the `SentenceTransformers` model from re-instantiating on disk during every RAG pipeline call, silencing repetitive console logs and saving heavy JVM/Python CPU overhead.
|
| 54 |
+
* **Groq 'Token Ghosting' Fix:** Squashed a massive token quota error ("Requested 17k tokens") by explicitly declaring `max_tokens=800`. This prevented Groq's load balancer from assuming maximum context window limits and throttling free-tier TPM budgets.
|
| 55 |
+
* **ReAct Infinite Loop Resolution:** The Fundamental agent historically trapped itself looping `search_10k_filings` tools when ordered to strictly "only output data". Bridged the behavior by injecting a hard "Stop once data is fetched" logic trap inside the system prompt.
|
| 56 |
+
* **Docker Hot-Reloading:** Injected `- ./:/app` mapping masks and explicitly overrode the Uvicorn execution loop with `--reload` to support real-time Python development without tearing down the containers.
|
| 57 |
+
|
| 58 |
+
### Next Steps 🚀
|
| 59 |
+
* Final push of the `finagent` backend/frontend images to **Google Cloud Run**.
|
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 2 |
+
# FinAgent — Hugging Face Spaces Dockerfile (Docker SDK)
|
| 3 |
+
# Runs FastAPI backend + Streamlit frontend in a single container via supervisord
|
| 4 |
+
# Pre-seeds ChromaDB with demo tickers at build time
|
| 5 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 6 |
+
FROM python:3.11-slim
|
| 7 |
+
|
| 8 |
+
WORKDIR /app
|
| 9 |
+
|
| 10 |
+
# ── System deps ──────────────────────────────────────────────────────────────
|
| 11 |
+
RUN apt-get update && \
|
| 12 |
+
apt-get install -y --no-install-recommends \
|
| 13 |
+
build-essential gcc g++ curl supervisor && \
|
| 14 |
+
rm -rf /var/lib/apt/lists/*
|
| 15 |
+
|
| 16 |
+
# ── Python deps ──────────────────────────────────────────────────────────────
|
| 17 |
+
COPY requirements.txt .
|
| 18 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# ── App source ───────────────────────────────────────────────────────────────
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
# ── Pre-seed ChromaDB at build time ─────────────────────────────────────────
|
| 24 |
+
# Ingest SEC 10-K filings for demo tickers
|
| 25 |
+
RUN python scripts/ingest.py --tickers AAPL MSFT TSLA GOOGL NVDA
|
| 26 |
+
|
| 27 |
+
# Ingest SEC 8-K / earnings call data for demo tickers
|
| 28 |
+
RUN python scripts/ingest_earnings_calls.py --tickers AAPL MSFT --quarters Q4-2024 Q1-2025
|
| 29 |
+
|
| 30 |
+
# ── Supervisord config (runs both services) ─────────────────────────────────
|
| 31 |
+
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
| 32 |
+
|
| 33 |
+
# ── HF Spaces expects port 7860 ─────────────────────────────────────────────
|
| 34 |
+
EXPOSE 7860
|
| 35 |
+
|
| 36 |
+
# Streamlit health-check endpoint for HF Spaces
|
| 37 |
+
HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
|
| 38 |
+
CMD curl -f http://localhost:7860/_stcore/health || exit 1
|
| 39 |
+
|
| 40 |
+
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
|
Dockerfile.api
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
|
| 4 |
+
# Install build dependencies required for compiling certain Python packages (like lxml/pandas)
|
| 5 |
+
RUN apt-get update && apt-get install -y build-essential gcc g++ && rm -rf /var/lib/apt/lists/*
|
| 6 |
+
|
| 7 |
+
COPY requirements.txt .
|
| 8 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 9 |
+
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
EXPOSE 8000
|
| 13 |
+
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "8000"]
|
Dockerfile.ui
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
|
| 4 |
+
# Streamlit dependencies and requirements
|
| 5 |
+
RUN apt-get update && apt-get install -y build-essential curl && rm -rf /var/lib/apt/lists/*
|
| 6 |
+
|
| 7 |
+
COPY requirements.txt .
|
| 8 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 9 |
+
|
| 10 |
+
COPY . .
|
| 11 |
+
|
| 12 |
+
EXPOSE 8501
|
| 13 |
+
CMD ["streamlit", "run", "frontend/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
HF_README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: FinAgent - Autonomous Financial AI
|
| 3 |
+
emoji: 📈
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: true
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 📈 FinAgent: Autonomous Financial AI
|
| 13 |
+
|
| 14 |
+
An asynchronous, multi-agent LLM pipeline that automates quantitative financial research, fundamental document synthesis, earnings-call analysis, and real-time news sentiment scoring — built entirely with open-source models.
|
| 15 |
+
|
| 16 |
+
## 🏗️ Architecture
|
| 17 |
+
|
| 18 |
+
This system uses a **deterministic state-machine** architecture powered by [LangGraph](https://python.langchain.com/docs/langgraph):
|
| 19 |
+
|
| 20 |
+
1. **Planner Agent** — Parses the user query and generates a strict JSON task queue.
|
| 21 |
+
2. **Supervisor** — A Python-controlled router that dispatches tasks to specialist agents.
|
| 22 |
+
3. **Specialist Agents:**
|
| 23 |
+
- 🔢 **Quant Agent** — Live pricing, volume, and volatility metrics via `yfinance`.
|
| 24 |
+
- 📊 **Fundamental Agent** — SEC XBRL accounting data + RAG on 10-K filings.
|
| 25 |
+
- 📰 **Sentiment Agent** — Real-time news headline analysis and scoring.
|
| 26 |
+
- 🎙️ **Earnings Agent** — Sentiment divergence (Prepared Remarks vs Q&A) and keyword trend tracking from earnings-call transcripts.
|
| 27 |
+
4. **Summarizer** — Compiles all agent outputs into a unified Investment Memo.
|
| 28 |
+
|
| 29 |
+
## 🚀 Try It
|
| 30 |
+
|
| 31 |
+
Type a query in the chat box — here are some examples:
|
| 32 |
+
|
| 33 |
+
| Query | What It Does |
|
| 34 |
+
|-------|-------------|
|
| 35 |
+
| *"How is Apple's stock doing?"* | Quant analysis (price, volume, RSI) |
|
| 36 |
+
| *"What are the manufacturing risks in Tesla's latest 10-K?"* | RAG retrieval on SEC filings |
|
| 37 |
+
| *"What is the market sentiment on Microsoft?"* | Real-time news sentiment scoring |
|
| 38 |
+
| *"Analyze the latest earnings call for AAPL — compare management tone in prepared remarks vs Q&A"* | Earnings-call divergence analysis |
|
| 39 |
+
| *"Compare the current stock performance of Microsoft and Google"* | Multi-ticker parallel analysis |
|
| 40 |
+
|
| 41 |
+
## 📚 Pre-Loaded Data
|
| 42 |
+
|
| 43 |
+
This demo comes with pre-ingested data for immediate use:
|
| 44 |
+
|
| 45 |
+
- **SEC 10-K Filings:** AAPL, MSFT, TSLA, GOOGL, NVDA
|
| 46 |
+
- **Earnings Call Transcripts:** AAPL, MSFT (Q4-2024, Q1-2025)
|
| 47 |
+
|
| 48 |
+
> Quantitative data (prices, volume) and sentiment (news) are fetched **live** — no pre-loading needed.
|
| 49 |
+
|
| 50 |
+
## 🛠️ Tech Stack
|
| 51 |
+
|
| 52 |
+
| Component | Technology |
|
| 53 |
+
|-----------|-----------|
|
| 54 |
+
| Orchestration | LangGraph / LangChain |
|
| 55 |
+
| LLM Inference | Groq API (Llama-3.1-8B-Instruct) |
|
| 56 |
+
| Frontend | Streamlit |
|
| 57 |
+
| Backend API | FastAPI + Uvicorn |
|
| 58 |
+
| Vector DB | ChromaDB |
|
| 59 |
+
| Embeddings | HuggingFace `all-MiniLM-L6-v2` |
|
| 60 |
+
| Market Data | yfinance, SEC EDGAR API |
|
| 61 |
+
|
| 62 |
+
## ⚡ Performance Optimizations
|
| 63 |
+
|
| 64 |
+
This system was deliberately engineered for low-latency response times:
|
| 65 |
+
|
| 66 |
+
- **Parallel Agent Dispatch** — The Supervisor routes independent tasks to multiple specialist agents simultaneously (e.g., Quant + Sentiment + Fundamental in one batch) rather than sequentially, cutting multi-agent latency by up to 3×.
|
| 67 |
+
- **Server-Sent Event (SSE) Streaming** — Results stream live to the UI as each agent completes, so users see intermediate progress immediately instead of waiting for the full pipeline.
|
| 68 |
+
- **Groq Cloud Inference** — LLM calls use the Groq API (~200 tok/s on Llama-3.1-8B), eliminating local GPU bottlenecks and delivering sub-second per-agent response times.
|
| 69 |
+
- **Singleton Embedding Cache** — The HuggingFace embedding model is loaded once via `@lru_cache` and shared across all RAG queries (10-K, earnings, etc.), avoiding repeated 500MB+ model re-initialization.
|
| 70 |
+
- **Token Budget Tuning** — `max_tokens` is capped at 800 per LLM call to prevent Groq from reserving excessive context window, reducing queue wait times by ~40%.
|
| 71 |
+
- **Pre-Seeded Vector DB** — ChromaDB collections are embedded at Docker build time, so the app starts with zero cold-start ingestion delay.
|
| 72 |
+
- **Per-Step Latency Tracking** — Every agent step reports wall-clock latency in the UI, making performance bottlenecks immediately visible.
|
| 73 |
+
|
| 74 |
+
## 📂 Source Code
|
| 75 |
+
|
| 76 |
+
[GitHub Repository](https://github.com/devg24/financial-analysis-agent)
|
README.md
CHANGED
|
@@ -1,12 +1,76 @@
|
|
| 1 |
---
|
| 2 |
-
title: FinAgent
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
|
|
|
| 8 |
license: mit
|
| 9 |
-
short_description: 'Latency optimized, financial analysis agent using OS models '
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: FinAgent - Autonomous Financial AI
|
| 3 |
+
emoji: 📈
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: true
|
| 9 |
license: mit
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# 📈 FinAgent: Autonomous Financial AI
|
| 13 |
+
|
| 14 |
+
An asynchronous, multi-agent LLM pipeline that automates quantitative financial research, fundamental document synthesis, earnings-call analysis, and real-time news sentiment scoring — built entirely with open-source models.
|
| 15 |
+
|
| 16 |
+
## 🏗️ Architecture
|
| 17 |
+
|
| 18 |
+
This system uses a **deterministic state-machine** architecture powered by [LangGraph](https://python.langchain.com/docs/langgraph):
|
| 19 |
+
|
| 20 |
+
1. **Planner Agent** — Parses the user query and generates a strict JSON task queue.
|
| 21 |
+
2. **Supervisor** — A Python-controlled router that dispatches tasks to specialist agents.
|
| 22 |
+
3. **Specialist Agents:**
|
| 23 |
+
- 🔢 **Quant Agent** — Live pricing, volume, and volatility metrics via `yfinance`.
|
| 24 |
+
- 📊 **Fundamental Agent** — SEC XBRL accounting data + RAG on 10-K filings.
|
| 25 |
+
- 📰 **Sentiment Agent** — Real-time news headline analysis and scoring.
|
| 26 |
+
- 🎙️ **Earnings Agent** — Sentiment divergence (Prepared Remarks vs Q&A) and keyword trend tracking from earnings-call transcripts.
|
| 27 |
+
4. **Summarizer** — Compiles all agent outputs into a unified Investment Memo.
|
| 28 |
+
|
| 29 |
+
## 🚀 Try It
|
| 30 |
+
|
| 31 |
+
Type a query in the chat box — here are some examples:
|
| 32 |
+
|
| 33 |
+
| Query | What It Does |
|
| 34 |
+
|-------|-------------|
|
| 35 |
+
| *"How is Apple's stock doing?"* | Quant analysis (price, volume, RSI) |
|
| 36 |
+
| *"What are the manufacturing risks in Tesla's latest 10-K?"* | RAG retrieval on SEC filings |
|
| 37 |
+
| *"What is the market sentiment on Microsoft?"* | Real-time news sentiment scoring |
|
| 38 |
+
| *"Analyze the latest earnings call for AAPL — compare management tone in prepared remarks vs Q&A"* | Earnings-call divergence analysis |
|
| 39 |
+
| *"Compare the current stock performance of Microsoft and Google"* | Multi-ticker parallel analysis |
|
| 40 |
+
|
| 41 |
+
## 📚 Pre-Loaded Data
|
| 42 |
+
|
| 43 |
+
This demo comes with pre-ingested data for immediate use:
|
| 44 |
+
|
| 45 |
+
- **SEC 10-K Filings:** AAPL, MSFT, TSLA, GOOGL, NVDA
|
| 46 |
+
- **Earnings Call Transcripts:** AAPL, MSFT (Q4-2024, Q1-2025)
|
| 47 |
+
|
| 48 |
+
> Quantitative data (prices, volume) and sentiment (news) are fetched **live** — no pre-loading needed.
|
| 49 |
+
|
| 50 |
+
## 🛠️ Tech Stack
|
| 51 |
+
|
| 52 |
+
| Component | Technology |
|
| 53 |
+
|-----------|-----------|
|
| 54 |
+
| Orchestration | LangGraph / LangChain |
|
| 55 |
+
| LLM Inference | Groq API (Llama-3.1-8B-Instruct) |
|
| 56 |
+
| Frontend | Streamlit |
|
| 57 |
+
| Backend API | FastAPI + Uvicorn |
|
| 58 |
+
| Vector DB | ChromaDB |
|
| 59 |
+
| Embeddings | HuggingFace `all-MiniLM-L6-v2` |
|
| 60 |
+
| Market Data | yfinance, SEC EDGAR API |
|
| 61 |
+
|
| 62 |
+
## ⚡ Performance Optimizations
|
| 63 |
+
|
| 64 |
+
This system was deliberately engineered for low-latency response times:
|
| 65 |
+
|
| 66 |
+
- **Parallel Agent Dispatch** — The Supervisor routes independent tasks to multiple specialist agents simultaneously (e.g., Quant + Sentiment + Fundamental in one batch) rather than sequentially, cutting multi-agent latency by up to 3×.
|
| 67 |
+
- **Server-Sent Event (SSE) Streaming** — Results stream live to the UI as each agent completes, so users see intermediate progress immediately instead of waiting for the full pipeline.
|
| 68 |
+
- **Groq Cloud Inference** — LLM calls use the Groq API (~200 tok/s on Llama-3.1-8B), eliminating local GPU bottlenecks and delivering sub-second per-agent response times.
|
| 69 |
+
- **Singleton Embedding Cache** — The HuggingFace embedding model is loaded once via `@lru_cache` and shared across all RAG queries (10-K, earnings, etc.), avoiding repeated 500MB+ model re-initialization.
|
| 70 |
+
- **Token Budget Tuning** — `max_tokens` is capped at 800 per LLM call to prevent Groq from reserving excessive context window, reducing queue wait times by ~40%.
|
| 71 |
+
- **Pre-Seeded Vector DB** — ChromaDB collections are embedded at Docker build time, so the app starts with zero cold-start ingestion delay.
|
| 72 |
+
- **Per-Step Latency Tracking** — Every agent step reports wall-clock latency in the UI, making performance bottlenecks immediately visible.
|
| 73 |
+
|
| 74 |
+
## 📂 Source Code
|
| 75 |
+
|
| 76 |
+
[GitHub Repository](https://github.com/devg24/financial-analysis-agent)
|
backend/__init__.py
ADDED
|
File without changes
|
backend/api.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from fastapi import FastAPI, HTTPException, Request
|
| 6 |
+
from fastapi.concurrency import run_in_threadpool
|
| 7 |
+
from fastapi.responses import StreamingResponse
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
import langchain
|
| 11 |
+
|
| 12 |
+
from core.config import Settings
|
| 13 |
+
from core.graph_builder import build_financial_graph
|
| 14 |
+
from core.runner import create_llm, run_financial_query, astream_financial_query
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application startup hook: load env config and compile the graph once.

    The settings object and the compiled LangGraph are stashed on
    ``app.state`` so request handlers can reach them without rebuilding
    anything per request.
    """
    load_dotenv()
    # Honour LANGCHAIN_DEBUG=1/true/yes for verbose LangChain logging.
    debug_flag = os.getenv("LANGCHAIN_DEBUG", "")
    langchain.debug = debug_flag.lower() in ("1", "true", "yes")

    cfg = Settings()
    app.state.settings = cfg
    app.state.graph = build_financial_graph(create_llm(cfg))
    yield


app = FastAPI(title="FinAgent", lifespan=lifespan)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class ChatRequest(BaseModel):
    """Request payload for /chat and /chat/stream: one natural-language query."""

    # Length bounds are validated by pydantic before the handler runs.
    query: str = Field(..., min_length=1, max_length=16000)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class StepOut(BaseModel):
    """One intermediate step produced while the graph executes."""

    # Name of the graph node that produced this step.
    node: str
    # Text emitted by that node.
    content: str
    # Latencies reported by the runner; None when not measured.
    step_latency: float | None = None
    total_latency: float | None = None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class ChatResponse(BaseModel):
    """Final /chat payload: the memo plus the trace of agent steps."""

    # Final investment memo; None when the graph produced no summary.
    memo: str | None = None
    # Intermediate steps collected by the runner.
    steps: list[StepOut] = Field(default_factory=list)
    # End-to-end latency reported by the runner, when available.
    total_latency: float | None = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@app.get("/health")
def health():
    """Liveness probe for containers and load balancers."""
    return {"status": "ok"}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@app.post("/chat", response_model=ChatResponse)
async def chat(request: Request, body: ChatRequest):
    """Run the full financial graph synchronously and return the final memo.

    The blocking graph execution is pushed onto the threadpool so the event
    loop stays responsive; failures surface as HTTP 503 with the error text.
    """
    graph = request.app.state.graph
    query_text = body.query.strip()
    if not query_text:
        raise HTTPException(status_code=400, detail="query must not be empty")
    try:
        result = await run_in_threadpool(run_financial_query, graph, query_text)
    except Exception as e:
        raise HTTPException(status_code=503, detail=str(e)) from e
    return ChatResponse(**result)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@app.post("/chat/stream")
async def chat_stream(request: Request, body: ChatRequest):
    """Stream graph progress to the client as Server-Sent Events."""
    query_text = body.query.strip()
    if not query_text:
        raise HTTPException(status_code=400, detail="query must not be empty")

    graph = request.app.state.graph
    stream = astream_financial_query(graph, query_text)
    return StreamingResponse(stream, media_type="text/event-stream")
|
core/__init__.py
ADDED
|
File without changes
|
core/config.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Settings(BaseSettings):
    """OpenAI-compatible LLM endpoint (e.g. Ollama at localhost:11434/v1)."""

    # Values come from the environment and an optional .env file;
    # unknown variables are ignored rather than raising.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # LLM endpoint settings (env: OPENAI_BASE_URL, OPENAI_API_KEY,
    # OPENAI_MODEL, OPENAI_TEMPERATURE). Defaults target a local Ollama
    # server exposing the OpenAI-compatible API.
    openai_base_url: str = "http://localhost:11434/v1"
    openai_api_key: str = "ollama"
    openai_model: str = "llama3.1"
    openai_temperature: float = 0.0

    # Earnings-call pipeline
    # Empty key presumably disables Alpha Vantage in favour of the SEC 8-K
    # fallback (per .env.example) — confirm against earnings_tools.
    alpha_vantage_api_key: str = ""
    earnings_chroma_path: str = "./chroma_db"  # ChromaDB persistence directory
|
core/earnings_tools.py
ADDED
|
@@ -0,0 +1,550 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Earnings-call ingest + inference tools.
|
| 3 |
+
|
| 4 |
+
Ingest layer – fetch transcript (Alpha Vantage → SEC 8-K fallback),
|
| 5 |
+
normalize into Prepared Remarks / Q&A segments,
|
| 6 |
+
extract keyword counts, and embed into ChromaDB.
|
| 7 |
+
|
| 8 |
+
Inference layer – LangGraph @tool functions for retrieval,
|
| 9 |
+
sentiment divergence, and keyword trend analysis.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import re
|
| 15 |
+
from collections import Counter
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
import requests
|
| 19 |
+
from langchain_chroma import Chroma
|
| 20 |
+
from langchain_core.documents import Document
|
| 21 |
+
from langchain_core.tools import tool
|
| 22 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 23 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 24 |
+
|
| 25 |
+
from .rag_tools import get_cached_embeddings
|
| 26 |
+
from .sec_tools import HEADERS, get_cik_from_ticker
|
| 27 |
+
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
# Constants
|
| 30 |
+
# ---------------------------------------------------------------------------
|
| 31 |
+
|
| 32 |
+
# Financial terms whose per-quarter mention counts are tracked across calls
# (matched case-insensitively with word boundaries by extract_keywords()).
TRACKED_KEYWORDS = [
    "ai", "artificial intelligence", "machine learning",
    "headwinds", "tailwinds", "guidance", "margin", "growth",
    "inflation", "recession", "tariff", "supply chain",
    "cloud", "capex", "capital expenditure", "free cash flow",
    "buyback", "dividend", "restructuring", "layoff",
    "regulation", "competition", "demand", "inventory",
]

# Markers used to split transcripts into sections
# (checked in order; the first case-insensitive hit marks the Q&A boundary).
QA_MARKERS = [
    "question-and-answer session",
    "question-and-answer",
    "q&a session",
    "q & a session",
    "operator instructions",
    "and our first question",
    "we will now begin the question",
    "we'll now begin the question",
]

# Suffix of the per-ticker directory that holds quarter metadata JSON files.
METADATA_DIR_NAME = "_earnings_meta"
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# Quarter helpers
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
|
| 59 |
+
def parse_quarter(quarter_str: str) -> tuple[int, int]:
    """Parse a quarter label such as 'Q1-2025' into ``(quarter, year)``.

    Minor variations like 'Q1 2025' or 'q1-2025' are also accepted.

    Raises:
        ValueError: if the string is malformed or the quarter is not 1-4.
    """
    match = re.match(r"[Qq](\d)\s*[-_ ]?\s*(\d{4})", quarter_str.strip())
    if match is None:
        raise ValueError(
            f"Invalid quarter format '{quarter_str}'. Expected e.g. 'Q1-2025'."
        )
    quarter_num = int(match.group(1))
    year_num = int(match.group(2))
    if not 1 <= quarter_num <= 4:
        raise ValueError(f"Quarter must be 1-4, got {quarter_num}.")
    return quarter_num, year_num
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _quarter_to_month(q: int) -> str:
|
| 73 |
+
"""Map fiscal quarter to approximate month for Alpha Vantage API."""
|
| 74 |
+
return {1: "03", 2: "06", 3: "09", 4: "12"}[q]
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ---------------------------------------------------------------------------
|
| 78 |
+
# Transcript fetchers
|
| 79 |
+
# ---------------------------------------------------------------------------
|
| 80 |
+
|
| 81 |
+
def fetch_transcript_alpha_vantage(
    ticker: str, quarter: int, year: int, api_key: str
) -> Optional[str]:
    """
    Try the Alpha Vantage EARNINGS_CALL_TRANSCRIPT endpoint.

    Returns the raw transcript text (one "Speaker: text" line per segment),
    or None on any failure (missing key, premium-only response, HTTP error).
    """
    if not api_key:
        return None
    url = (
        "https://www.alphavantage.co/query"
        f"?function=EARNINGS_CALL_TRANSCRIPT"
        f"&symbol={ticker}"
        f"&quarter={year}Q{quarter}"
        f"&apikey={api_key}"
    )
    try:
        print(f"[Earnings Ingest] Trying Alpha Vantage for {ticker} Q{quarter}-{year}...")
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        # Guard: the API may return a non-dict payload (e.g. a bare list) on
        # errors; calling .get() on it would raise AttributeError and surface
        # as a misleading generic failure. Only dicts carry a transcript or
        # an informational message.
        if not isinstance(data, dict):
            return None
        # Alpha Vantage returns a list of transcript segments on success
        if "transcript" in data:
            segments = data["transcript"]
            lines = []
            for seg in segments:
                speaker = seg.get("speaker", "Unknown")
                text = seg.get("content", "")
                lines.append(f"{speaker}: {text}")
            full = "\n".join(lines)
            # Anything shorter than ~200 chars is an empty/placeholder reply.
            if len(full) > 200:
                print(f"[Earnings Ingest] Alpha Vantage returned transcript ({len(full)} chars).")
                return full
        # Premium-required or empty response
        info = data.get("Information") or data.get("Note") or ""
        if info:
            print(f"[Earnings Ingest] Alpha Vantage: {info[:120]}")
        return None
    except Exception as e:
        print(f"[Earnings Ingest] Alpha Vantage failed: {e}")
        return None
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def fetch_transcript_sec_8k(ticker: str, quarter: int, year: int) -> Optional[str]:
    """
    Fallback: search SEC EDGAR for 8-K filings around the quarter-end date
    that mention 'earnings' or 'results of operations'.

    Returns extracted text, or None when the ticker is unknown, no 8-K falls
    in the acceptance window, or the document is too short to be a transcript.
    """
    try:
        cik = get_cik_from_ticker(ticker)
    except ValueError:
        print(f"[Earnings Ingest] Ticker {ticker} not found in SEC database.")
        return None

    try:
        print(f"[Earnings Ingest] Trying SEC 8-K fallback for {ticker} Q{quarter}-{year}...")
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        filings = resp.json()["filings"]["recent"]

        target_month = int(_quarter_to_month(quarter))

        # Acceptable (year, month) pairs: the quarter-end month through
        # 3 months after, handling year rollover (e.g., Q4 target_month=12
        # -> Dec, Jan, Feb, Mar). This is loop-invariant with respect to the
        # filings, so build it ONCE instead of once per filing (the original
        # rebuilt it inside the loop).
        acceptable = set()
        for offset in range(4):  # 0, 1, 2, 3 months after quarter end
            m = target_month + offset
            y = year
            if m > 12:
                m -= 12
                y += 1
            acceptable.add((y, m))

        best_doc_url = None
        for i, form in enumerate(filings["form"]):
            if form != "8-K":
                continue
            filed = filings["filingDate"][i]  # "2025-01-30"
            filed_year, filed_month = int(filed[:4]), int(filed[5:7])

            if (filed_year, filed_month) in acceptable:
                accession = filings["accessionNumber"][i]
                acc_clean = accession.replace("-", "")
                primary_doc = filings["primaryDocument"][i]
                best_doc_url = (
                    f"https://www.sec.gov/Archives/edgar/data/"
                    f"{cik.lstrip('0')}/{acc_clean}/{primary_doc}"
                )
                break  # Take the first matching 8-K

        if not best_doc_url:
            print(f"[Earnings Ingest] No matching SEC 8-K found for {ticker} Q{quarter}-{year}.")
            return None

        print(f"[Earnings Ingest] Downloading 8-K from {best_doc_url}...")
        doc_resp = requests.get(best_doc_url, headers=HEADERS, timeout=30)
        doc_resp.raise_for_status()

        from bs4 import BeautifulSoup

        soup = BeautifulSoup(doc_resp.text, "html.parser")
        text = soup.get_text(separator=" ", strip=True)

        # Heuristic: real transcripts/press releases are well over 500 chars.
        if len(text) > 500:
            print(f"[Earnings Ingest] SEC 8-K text extracted ({len(text)} chars).")
            return text
        print("[Earnings Ingest] SEC 8-K text too short, likely not a transcript.")
        return None

    except Exception as e:
        print(f"[Earnings Ingest] SEC 8-K fallback failed: {e}")
        return None
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# ---------------------------------------------------------------------------
|
| 200 |
+
# Transcript normalization & segmentation
|
| 201 |
+
# ---------------------------------------------------------------------------
|
| 202 |
+
|
| 203 |
+
def normalize_transcript(
    raw_text: str, ticker: str, quarter: int, year: int
) -> dict:
    """
    Split a raw transcript into Prepared Remarks and Q&A Session.

    The boundary is the first case-insensitive occurrence of any QA_MARKERS
    entry; if none is found the entire text becomes prepared remarks.

    Returns:
        {
            "ticker": ..., "quarter": ..., "year": ...,
            "prepared_remarks": str,
            "qa_session": str,
        }
    """
    # NOTE: the docstring previously advertised a "source" key; the dict has
    # never contained one (the caller attaches source to chunk metadata).
    text_lower = raw_text.lower()
    split_pos = -1
    for marker in QA_MARKERS:
        idx = text_lower.find(marker)
        if idx != -1:
            split_pos = idx
            break

    # A marker at position 0 is still a valid boundary (transcript that opens
    # directly with Q&A has no prepared remarks), so compare against -1
    # instead of requiring a strictly positive index.
    if split_pos != -1:
        prepared = raw_text[:split_pos].strip()
        qa = raw_text[split_pos:].strip()
    else:
        # Could not find Q&A boundary — treat entire text as prepared remarks
        prepared = raw_text.strip()
        qa = ""

    return {
        "ticker": ticker.upper(),
        "quarter": quarter,
        "year": year,
        "prepared_remarks": prepared,
        "qa_session": qa,
    }
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ---------------------------------------------------------------------------
|
| 242 |
+
# Keyword / entity extraction
|
| 243 |
+
# ---------------------------------------------------------------------------
|
| 244 |
+
|
| 245 |
+
def extract_keywords(text: str) -> dict[str, int]:
    """
    Count occurrences of tracked financial keywords in the text.
    Returns a dict of keyword -> count (only keywords with count > 0).
    """
    haystack = text.lower()
    # Word boundaries keep "ai" from matching inside e.g. "said".
    occurrences = {
        kw: len(re.findall(r"\b" + re.escape(kw) + r"\b", haystack))
        for kw in TRACKED_KEYWORDS
    }
    return {kw: n for kw, n in occurrences.items() if n > 0}
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# ---------------------------------------------------------------------------
|
| 260 |
+
# ChromaDB ingest
|
| 261 |
+
# ---------------------------------------------------------------------------
|
| 262 |
+
|
| 263 |
+
def _meta_path(chroma_path: str, ticker: str) -> str:
    """Return (creating it if needed) the metadata directory for a ticker."""
    meta_dir = os.path.join(chroma_path, f"{ticker.upper()}{METADATA_DIR_NAME}")
    os.makedirs(meta_dir, exist_ok=True)
    return meta_dir
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def _save_metadata(
    chroma_path: str,
    ticker: str,
    quarter: int,
    year: int,
    keywords: dict[str, int],
    status: str,
) -> None:
    """Persist per-quarter ingest metadata (status + keyword counts) as JSON.

    Written to ``<chroma_path>/<TICKER>_earnings_meta/Q<q>_<year>.json``;
    read back by ``_load_metadata`` for the keyword-trend tool.
    """
    meta_dir = _meta_path(chroma_path, ticker)
    fname = os.path.join(meta_dir, f"Q{quarter}_{year}.json")
    payload = {
        "ticker": ticker.upper(),
        "quarter": quarter,
        "year": year,
        "status": status,
        "keywords": keywords,
    }
    # Explicit encoding so the file is identical across platforms (the
    # original relied on the locale-dependent default).
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    print(f"[Earnings Ingest] Metadata saved → {fname}")
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def _load_metadata(chroma_path: str, ticker: str) -> list[dict]:
    """Load all quarter metadata files for a ticker."""
    meta_dir = _meta_path(chroma_path, ticker)
    if not os.path.isdir(meta_dir):
        return []
    loaded: list[dict] = []
    for entry in sorted(os.listdir(meta_dir)):
        if not entry.endswith(".json"):
            continue
        with open(os.path.join(meta_dir, entry)) as fh:
            loaded.append(json.load(fh))
    return loaded
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def ingest_earnings_call(
    ticker: str,
    quarter: int,
    year: int,
    api_key: str = "",
    chroma_path: str = "./chroma_db",
) -> str:
    """
    Full ingest pipeline for one ticker/quarter pair.

    Steps: fetch transcript (Alpha Vantage, then SEC 8-K fallback), segment
    it, extract keyword counts, chunk + embed into ChromaDB, and persist
    per-quarter metadata.

    Returns a status string: 'success', 'partial', 'failed', or 'exists'
    (quarter already ingested).
    """
    ticker = ticker.upper()
    collection_dir = os.path.join(chroma_path, f"{ticker}_earnings")

    # Check if already ingested (metadata file acts as the marker)
    meta_dir = _meta_path(chroma_path, ticker)
    meta_file = os.path.join(meta_dir, f"Q{quarter}_{year}.json")
    if os.path.exists(meta_file):
        print(f"[Earnings Ingest] Q{quarter}-{year} for {ticker} already ingested. Skipping.")
        return "exists"

    # 1. Fetch transcript (primary source first, then SEC fallback)
    raw_text = fetch_transcript_alpha_vantage(ticker, quarter, year, api_key)
    source = "alpha_vantage" if raw_text else None

    if not raw_text:
        raw_text = fetch_transcript_sec_8k(ticker, quarter, year)
        source = "sec_8k" if raw_text else None

    if not raw_text:
        _save_metadata(chroma_path, ticker, quarter, year, {}, "failed")
        return "failed"

    # 2. Normalize & segment
    segments = normalize_transcript(raw_text, ticker, quarter, year)

    # 3. Extract keywords from both sections
    all_text = segments["prepared_remarks"] + " " + segments["qa_session"]
    keywords = extract_keywords(all_text)

    # 4. Chunk & embed into ChromaDB. Both sections share identical metadata
    # except for the section label, so build them in one loop instead of
    # duplicating the Document-construction code per section.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = []
    for section_label, section_text in (
        ("Prepared Remarks", segments["prepared_remarks"]),
        ("Q&A Session", segments["qa_session"]),
    ):
        if not section_text:
            continue
        section_doc = Document(
            page_content=section_text,
            metadata={
                "ticker": ticker,
                "quarter": quarter,
                "year": year,
                "section": section_label,
                "source": source,
            },
        )
        docs.extend(splitter.split_documents([section_doc]))

    if not docs:
        _save_metadata(chroma_path, ticker, quarter, year, keywords, "partial")
        return "partial"

    print(f"[Earnings Ingest] Embedding {len(docs)} chunks into {collection_dir}...")
    embeddings = get_cached_embeddings()
    Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=collection_dir,
    )

    # 'partial' when the transcript had no detectable Q&A segment.
    status = "success" if segments["qa_session"] else "partial"
    _save_metadata(chroma_path, ticker, quarter, year, keywords, status)
    print(f"[Earnings Ingest] {ticker} Q{quarter}-{year} ingested ({status}).")
    return status
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
# ---------------------------------------------------------------------------
|
| 393 |
+
# Inference tools (LangGraph runtime)
|
| 394 |
+
# ---------------------------------------------------------------------------
|
| 395 |
+
|
| 396 |
+
def _get_earnings_db(ticker: str, chroma_path: str = "./chroma_db") -> Chroma:
    """Load the earnings-call Chroma collection for a ticker."""
    symbol = ticker.upper()
    persist_directory = os.path.join(chroma_path, f"{symbol}_earnings")

    # Fail fast with actionable instructions when nothing was ingested.
    if not os.path.exists(persist_directory):
        raise FileNotFoundError(
            f"Earnings data for {symbol} not ingested. "
            f"Run: python scripts/ingest_earnings_calls.py --tickers {symbol} --quarters Q<N>-<YYYY>"
        )
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=get_cached_embeddings(),
    )
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
@tool
def search_earnings_call(ticker: str, query: str) -> str:
    """
    Searches pre-ingested earnings-call transcripts for a given ticker.
    Use this to find specific management commentary, guidance, or discussion topics.
    CRITICAL: The ticker's earnings data must already be ingested.
    Pass the stock ticker (e.g. 'AAPL') and a natural-language query.
    """
    try:
        db = _get_earnings_db(ticker.upper())
        matches = db.similarity_search(query, k=3)

        if not matches:
            return f"No earnings-call matches found for '{query}' on {ticker}."

        parts = [f"EARNINGS CALL SEARCH RESULTS FOR {ticker.upper()} — '{query}':\n"]
        budget_used = 0
        for match in matches:
            meta = match.metadata
            label = f"[{meta.get('section', 'Unknown')} | Q{meta.get('quarter', '?')}-{meta.get('year', '?')}]"
            snippet = match.page_content[:700]
            budget_used += len(snippet)
            parts.append(f"{label}\n{snippet}\n")
            # Cap the total context handed back to the LLM.
            if budget_used > 2000:
                break

        return "\n".join(parts)
    except Exception as e:
        return f"Error searching earnings data: {e}"
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
@tool
def get_earnings_sentiment_divergence(ticker: str) -> str:
    """
    Retrieves evidence from both Prepared Remarks and Q&A sections of the
    most recent earnings call for a ticker. Use this to analyze whether
    management tone differs between the scripted portion and live Q&A.
    CRITICAL: The ticker's earnings data must already be ingested.
    """
    try:
        db = _get_earnings_db(ticker.upper())

        # Pull representative chunks from each transcript section.
        prepared_chunks = db.similarity_search(
            "management outlook guidance performance",
            k=3,
            filter={"section": "Prepared Remarks"},
        )
        qa_chunks = db.similarity_search(
            "analyst question concern risk challenge",
            k=3,
            filter={"section": "Q&A Session"},
        )

        # Assemble the report via a parts list instead of repeated +=.
        pieces = [f"SENTIMENT DIVERGENCE EVIDENCE FOR {ticker.upper()}:\n\n"]

        pieces.append("=== PREPARED REMARKS (scripted management commentary) ===\n")
        if prepared_chunks:
            for chunk in prepared_chunks:
                pieces.append(chunk.page_content[:600] + "\n---\n")
        else:
            pieces.append("(No Prepared Remarks data found.)\n")

        pieces.append("\n=== Q&A SESSION (live analyst questions & management responses) ===\n")
        if qa_chunks:
            for chunk in qa_chunks:
                pieces.append(chunk.page_content[:600] + "\n---\n")
        else:
            pieces.append("(No Q&A Session data found — transcript may not have contained a Q&A segment.)\n")

        pieces.append(
            "\nINSTRUCTION: Compare the tone, confidence, and specificity between "
            "Prepared Remarks and Q&A. Note any divergence where management was more "
            "cautious, evasive, or forthcoming in one section vs the other."
        )
        return "".join(pieces)

    except Exception as e:
        return f"Error retrieving divergence data: {e}"
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
@tool
def get_earnings_keyword_trends(ticker: str) -> str:
    """
    Returns quarter-over-quarter keyword frequency trends from ingested
    earnings calls for a given ticker. Shows how often key terms (AI, headwinds,
    growth, guidance, etc.) were mentioned across available quarters.
    CRITICAL: Multiple quarters must be ingested for trend comparison.
    """
    try:
        ticker = ticker.upper()
        # NOTE(review): chroma path is hardcoded here while ingest_earnings_call
        # takes it as a parameter — presumably always "./chroma_db" in deployment;
        # confirm before changing deployment layout.
        all_meta = _load_metadata("./chroma_db", ticker)

        if not all_meta:
            return (
                f"No earnings metadata found for {ticker}. "
                f"Run: python scripts/ingest_earnings_calls.py --tickers {ticker} --quarters Q<N>-<YYYY>"
            )

        # Sort by year, quarter
        all_meta.sort(key=lambda m: (m["year"], m["quarter"]))

        # Build output table
        quarters = [f"Q{m['quarter']}-{m['year']}" for m in all_meta]
        header = f"KEYWORD TRENDS FOR {ticker} ({', '.join(quarters)}):\n\n"

        # Collect all keywords across quarters
        all_kws = set()
        for m in all_meta:
            all_kws.update(m.get("keywords", {}).keys())

        if not all_kws:
            return header + "No tracked keywords found in any ingested quarter."

        # Fixed-width table: 30-char keyword column + 10-char count columns.
        rows = []
        rows.append(f"{'Keyword':<30} " + " ".join(f"{q:>10}" for q in quarters))
        rows.append("-" * (30 + 11 * len(quarters)))

        for kw in sorted(all_kws):
            vals = []
            for m in all_meta:
                c = m.get("keywords", {}).get(kw, 0)  # 0 when absent that quarter
                vals.append(f"{c:>10}")
            rows.append(f"{kw:<30} " + " ".join(vals))

        # Add trend commentary for the last two quarters
        if len(all_meta) >= 2:
            rows.append("")
            rows.append("NOTABLE CHANGES (latest vs prior quarter):")
            prev_kw = all_meta[-2].get("keywords", {})
            curr_kw = all_meta[-1].get("keywords", {})
            for kw in sorted(all_kws):
                p, c = prev_kw.get(kw, 0), curr_kw.get(kw, 0)
                if p != c:
                    direction = "↑" if c > p else "↓"
                    rows.append(f"  {kw}: {p} → {c} ({direction})")

        return header + "\n".join(rows)

    except Exception as e:
        return f"Error loading keyword trends: {e}"
|
core/graph_builder.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import operator
|
| 2 |
+
from typing import Annotated, Sequence, TypedDict, Literal, Set
|
| 3 |
+
|
| 4 |
+
import yfinance as yf
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
|
| 8 |
+
from langchain_core.messages import HumanMessage, SystemMessage, BaseMessage, AIMessage
|
| 9 |
+
from langchain_core.tools import tool
|
| 10 |
+
from langgraph.graph import StateGraph, START, END
|
| 11 |
+
from langchain.agents import create_agent
|
| 12 |
+
|
| 13 |
+
from .sec_tools import get_company_concept_xbrl
|
| 14 |
+
from .rag_tools import search_10k_filings
|
| 15 |
+
from .sentiment_tools import get_recent_news
|
| 16 |
+
from .earnings_tools import (
|
| 17 |
+
search_earnings_call,
|
| 18 |
+
get_earnings_sentiment_divergence,
|
| 19 |
+
get_earnings_keyword_trends,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@tool
def get_stock_metrics(ticker: str) -> str:
    """
    Fetches historical market data and calculates basic metrics for a stock.
    CRITICAL: You must pass the official stock ticker symbol (e.g., 'AAPL' for Apple).
    """
    try:
        symbol = ticker.upper()
        print(f"\n[System: Fetching yfinance data for {symbol}...]")

        history = yf.Ticker(symbol).history(period="1mo")
        if history.empty:
            return f"Could not find price data for ticker: {symbol}. Tell the user the data fetch failed."

        latest_close = history["Close"].iloc[-1]
        high_1mo = history["High"].max()
        low_1mo = history["Low"].min()
        mean_volume = history["Volume"].mean()

        return (
            f"Data for {symbol}:\n"
            f"- Current Price: ${latest_close:.2f}\n"
            f"- 1-Month High: ${high_1mo:.2f}\n"
            f"- 1-Month Low: ${low_1mo:.2f}\n"
            f"- Average Daily Volume: {int(mean_volume):,}"
        )
    except Exception as e:
        return f"Error fetching data: {str(e)}"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def merge_sets(a: Set, b: Set) -> Set:
    """State reducer: combine two task-id sets into their union."""
    return a.union(b)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class AgentState(TypedDict):
    # Shared LangGraph state. Annotated reducers control how parallel node
    # updates are merged.
    # Conversation history; operator.add concatenates message lists.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # Routing decision from the supervisor ("FINISH" or one/many agent names).
    next: str | list[str]
    # Step counter summed across updates; used to bound graph execution.
    steps: Annotated[int, operator.add]
    # Task IDs finished so far; merged via set union from parallel workers.
    completed_tasks: Annotated[Set[str], merge_sets]
    # Task dicts produced by the planner (agent/ticker/task_id/description).
    pending_tasks: list
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# Worker node names the supervisor may dispatch to (must match graph node ids).
members = ["Quant_Agent", "Fundamental_Agent", "Sentiment_Agent", "Earnings_Agent"]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def make_worker_node(agent, name: str):
    """Wrap a specialist agent as a graph node that claims and runs one task."""

    def node(state: AgentState):
        pending = state.get("pending_tasks", [])
        done = state.get("completed_tasks", set())

        # Claim the first unfinished task addressed to this worker.
        my_task = None
        for candidate in pending:
            if candidate["agent"] == name and candidate["task_id"] not in done:
                my_task = candidate
                break

        if not my_task:
            # Nothing for us: return an empty set so the reducer is a no-op.
            return {"completed_tasks": set()}

        prompt = HumanMessage(
            content=f"Ticker: {my_task['ticker']}. Task: {my_task['description']}"
        )
        result = agent.invoke({"messages": [prompt]})

        # Require at least one tool call — refuse hallucinated answers.
        used_tool = any(
            isinstance(msg, AIMessage) and msg.tool_calls
            for msg in result["messages"]
        )
        if used_tool:
            content = result["messages"][-1].content.strip() or f"[{name}: No data retrieved]"
        else:
            content = f"ERROR: The {name} attempted to answer without using a data tool. This analysis is unauthorized."

        return {
            "messages": [AIMessage(content=f"[{my_task['task_id']}] {content}", name=name)],
            "completed_tasks": {my_task["task_id"]},
        }

    return node
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def create_planner_node(llm):
    """Build the planner node: converts the user request into a JSON task list."""
    # System prompt steering the LLM toward a machine-parseable plan.
    planner_prompt = """You are a task planner for a financial AI system.
GOLDEN RULE: Never assume or guess a number.

CRITICAL AGENT CAPABILITY MAPPING:
1. Quant_Agent: ONLY use for current stock price, trading volume, and 52-week high/lows.
=> GROUPING RULE: If the user asks for multiple price/volume metrics for the SAME ticker, group them into EXACTLY ONE Quant_Agent task. Do NOT make separate tasks for price and volume.
2. Sentiment_Agent: ONLY use for recent news headlines and market sentiment scores.
3. Fundamental_Agent: Use for TWO things:
- SEC Financial Metrics (Revenue, Net Income, Margins, Cash Flow).
- SEC 10-K RAG Searches: Use this for ANY qualitative questions about business strategy, supply chain, manufacturing, competition, and corporate RISKS.
4. Earnings_Agent: Use for earnings-call analysis. This includes:
- Management commentary and guidance from earnings calls.
- Sentiment divergence between Prepared Remarks and Q&A sessions.
- Keyword and entity tracking across quarters (e.g., mentions of "AI", "headwinds", "growth").
=> Use this agent when the user asks about earnings calls, management tone, guidance language, or quarter-over-quarter keyword trends.

Read the user's request and output a JSON list of tasks needed to answer it.
Each task must have:
- "agent": "Quant_Agent", "Fundamental_Agent", "Sentiment_Agent", or "Earnings_Agent"
- "ticker": the stock ticker symbol (e.g. "AAPL")
- "task_id": a unique string
- "description": specific instructions on what to fetch or search

Output ONLY valid JSON. No explanation.
example output:
[
{"agent": "Quant_Agent", "ticker": "AAPL", "task_id": "Quant_AAPL", "description": "Get price and volume for AAPL"},
{"agent": "Sentiment_Agent", "ticker": "MSFT", "task_id": "Sentiment_MSFT", "description": "Get sentiment analysis for MSFT"},
{"agent": "Earnings_Agent", "ticker": "AAPL", "task_id": "Earnings_AAPL", "description": "Analyze sentiment divergence between prepared remarks and Q&A in the latest earnings call"}
]"""

    def planner_function(state: AgentState):
        # A plan already exists (re-entry) — don't plan again.
        if state.get("pending_tasks"):
            return {}

        # Plan from the first human message in the conversation.
        user_message = next(m for m in state["messages"] if isinstance(m, HumanMessage))
        response = llm.invoke(
            [
                SystemMessage(content=planner_prompt),
                HumanMessage(content=user_message.content),
            ]
        )

        import json

        # Strip markdown fences, then slice out the outermost JSON array.
        raw = response.content.strip().replace("```json", "").replace("```", "")
        start = raw.find("[")
        end = raw.rfind("]")
        try:
            tasks = json.loads(raw[start : end + 1]) if start != -1 and end != -1 else []
        except Exception:
            # Unparseable model output falls through to the refusal branch.
            tasks = []

        if not tasks:
            print("[Planner]: No valid financial tasks found.")
            return {
                "pending_tasks": [],
                "completed_tasks": set(),
                "messages": [
                    AIMessage(
                        content="I can only answer questions about stock prices, SEC filings, and market sentiment.",
                        name="Supervisor",
                    )
                ],
            }

        print(f"\n[Planner]: Created {len(tasks)} tasks: {[t['task_id'] for t in tasks]}")
        return {"pending_tasks": tasks, "completed_tasks": set()}

    return planner_function
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def create_supervisor_node(llm):
    """Build the Supervisor node: routes outstanding tasks to worker agents in parallel."""

    def supervisor_function(state: AgentState):
        # Hard circuit-breaker: after 10 supervisor passes we stop no matter what,
        # so a misbehaving plan can never loop forever.
        if state.get("steps", 0) >= 10:
            return {"next": "FINISH", "steps": 1}

        all_tasks = state.get("pending_tasks", [])
        done = state.get("completed_tasks", set())
        outstanding = [task for task in all_tasks if task["task_id"] not in done]

        if not outstanding:
            print("-> All tasks complete. Routing to Summarizer.")
            return {"next": "FINISH", "steps": 1}

        # Dispatch at most one task per unique agent this round. A dict keyed by
        # agent name keeps first-seen order while deduplicating.
        per_agent = {}
        for task in outstanding:
            per_agent.setdefault(task["agent"], task["task_id"])

        print(f"\n[Supervisor]: Dispatching tasks in parallel → {list(per_agent.values())}")
        return {
            "next": list(per_agent),
            "steps": 1,
        }

    return supervisor_function
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def create_summarizer_node(llm):
    """Build the Summarizer node: turns all specialist outputs into one Investment Memo."""

    summarizer_system = """You are a senior investment analyst drafting an internal **Investment Memo** for colleagues.

You will receive the user's original question and verbatim outputs from specialist agents (Quant_Agent, Fundamental_Agent, Sentiment_Agent, Earnings_Agent), or a single clarification/refusal if no research ran.

Write the memo using this structure and markdown headings:

# Investment Memo
## Executive Summary
2-4 sentences answering the user in plain language.

## Key Facts & Data
Bullet points. Use ONLY numbers, metrics, and quotes that appear in the specialist outputs. If a section had no data, say "No quantitative/fundamental/sentiment data provided" as appropriate.

## Earnings Call Insights
If Earnings_Agent data is present, summarize:
- Sentiment divergence between Prepared Remarks and Q&A (was management more cautious or bullish in live Q&A vs. scripted remarks?).
- Notable keyword/entity trends across quarters (e.g., increasing mentions of "AI", declining mentions of "headwinds").
If no earnings data was provided, omit this section entirely.

## Risks, Sentiment, and Context
Integrate fundamental and sentiment findings when present. If missing, state that briefly.

## Caveats
Note missing specialists, tool errors, or "unauthorized" / ERROR lines exactly as reported—do not soften them.

Rules:
- Do NOT invent tickers, prices, filings, or sentiment scores not present in the inputs.
- Do NOT cite tool names; write for a portfolio manager reader.
- Keep the tone professional and concise."""

    def summarizer_function(state: AgentState):
        """Compose the final memo from the user question plus every AIMessage so far."""
        # Original user question: first HumanMessage in history, "" if none.
        user_query = next(
            (m.content for m in state["messages"] if isinstance(m, HumanMessage)), ""
        )

        # Every AIMessage becomes a labeled markdown section (agent name as header).
        sections = [
            f"### {m.name or 'Assistant'}\n{m.content}"
            for m in state["messages"]
            if isinstance(m, AIMessage)
        ]
        specialist_blob = "\n\n".join(sections) if sections else "(No specialist outputs.)"

        prompt = (
            f"User request:\n{user_query}\n\n"
            f"Specialist outputs (verbatim):\n{specialist_blob}"
        )
        response = llm.invoke(
            [SystemMessage(content=summarizer_system), HumanMessage(content=prompt)]
        )
        memo = (response.content or "").strip()
        return {"messages": [AIMessage(content=memo, name="Summarizer")]}

    return summarizer_function
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def build_financial_graph(llm):
    """Assemble and compile the multi-agent LangGraph workflow.

    Topology: START -> Planner -> Supervisor -> (worker agents, fan-out) -> Supervisor
    ... repeating until all tasks are done, then Supervisor -> Summarizer -> END.

    Args:
        llm: A LangChain chat model shared by every node.

    Returns:
        The compiled (runnable) StateGraph.
    """
    workflow = StateGraph(AgentState)

    quant_agent = create_agent(
        model=llm,
        tools=[get_stock_metrics],
        system_prompt=(
            "You are a Quantitative Analyst. "
            "You have exactly ONE tool: get_stock_metrics(ticker). "
            "For any price, volume, or trading-range question you MUST call get_stock_metrics—do not answer from memory. "
            "NEVER invent other tool names, NEVER output JSON blocks suggesting tools that do not exist. "
            "GOLDEN RULE: After the tool returns, you must format the output gracefully so it is easy to read. "
            "Bold the labels (like **Current Price:** or **Average Volume:**) before injecting the numbers. "
            "NEVER use introductory conversational filler like 'Here is the data'. Just print the labeled metrics directly."
        ),
        name="Quant_Agent",
    )

    fundamental_agent = create_agent(
        model=llm,
        tools=[search_10k_filings, get_company_concept_xbrl],
        system_prompt=(
            "You are a Fundamental Analyst. "
            "GOLDEN RULE: You must output the EXACT DATA or TEXT returned by your tools. "
            "Do NOT explain how the tools work or what they do. "
            "CRITICAL: ONCE YOU HAVE CALLED TO THE TOOL ONCE AND RECEIVED THE DATA, YOU MUST WRITE YOUR FINAL ANSWER IMMEDIATELY. DO NOT CALL THE TOOL A SECOND TIME. "
            "Just answer the user's question using the fetched data and stop."
        ),
        name="Fundamental_Agent",
    )

    sentiment_agent = create_agent(
        model=llm,
        tools=[get_recent_news],
        system_prompt=(
            "You are a Sentiment Analyst. Fetch recent news using your tool. "
            "CRITICAL RULES: Your final response MUST be exactly five lines. "
            "Line 1: The sentiment score (a single number between -1.0 and 1.0). "
            # FIX: the adjacent literals below previously joined without a space
            # ("...articles.Include..." / "...response.Do not..."), garbling the prompt.
            "Line 2-5: Justify the sentiment score based on the news articles. "
            "Include important keywords from the news articles in your response. "
            "Do not add conversational filler. Do not ask the user follow-up questions."
        ),
        name="Sentiment_Agent",
    )

    earnings_agent = create_agent(
        model=llm,
        tools=[
            search_earnings_call,
            get_earnings_sentiment_divergence,
            get_earnings_keyword_trends,
        ],
        system_prompt=(
            "You are an Earnings Call Analyst specializing in management commentary analysis. "
            "You have THREE tools for analyzing pre-ingested earnings-call transcripts:\n"
            "1. search_earnings_call: Search transcripts for specific topics (guidance, margins, strategy, etc.).\n"
            "2. get_earnings_sentiment_divergence: Compare management tone in scripted Prepared Remarks vs live Q&A.\n"
            "3. get_earnings_keyword_trends: Track keyword frequency changes across quarters.\n\n"
            "CRITICAL RULES:\n"
            "- You MUST call at least one tool. Do NOT answer from memory.\n"
            "- If a tool returns an error about missing data, report that the earnings data for that "
            "ticker/quarter has not been ingested and suggest running the ingest script.\n"
            "- After the tool returns, write a clear, evidence-backed analysis. Bold key findings.\n"
            "- Do NOT add conversational filler. Do NOT ask follow-up questions."
        ),
        name="Earnings_Agent",
    )

    # Orchestration nodes.
    workflow.add_node("Planner", create_planner_node(llm))
    workflow.add_node("Supervisor", create_supervisor_node(llm))

    # Worker nodes — each wraps its agent so outputs are tagged with the agent name.
    workflow.add_node("Quant_Agent", make_worker_node(quant_agent, "Quant_Agent"))
    workflow.add_node(
        "Fundamental_Agent", make_worker_node(fundamental_agent, "Fundamental_Agent")
    )
    workflow.add_node("Sentiment_Agent", make_worker_node(sentiment_agent, "Sentiment_Agent"))
    workflow.add_node("Earnings_Agent", make_worker_node(earnings_agent, "Earnings_Agent"))
    workflow.add_node("Summarizer", create_summarizer_node(llm))

    # Every worker reports back to the Supervisor for the next routing decision.
    for member in members:
        workflow.add_edge(member, "Supervisor")

    workflow.add_edge(START, "Planner")
    workflow.add_edge("Planner", "Supervisor")

    # Supervisor fans out to one-or-more workers (state["next"] may be a list),
    # or routes to the Summarizer once every task is complete.
    workflow.add_conditional_edges(
        "Supervisor",
        lambda state: state["next"],
        {
            "Quant_Agent": "Quant_Agent",
            "Fundamental_Agent": "Fundamental_Agent",
            "Sentiment_Agent": "Sentiment_Agent",
            "Earnings_Agent": "Earnings_Agent",
            "FINISH": "Summarizer",
        },
    )

    workflow.add_edge("Summarizer", END)

    return workflow.compile()
|
core/rag_tools.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_core.tools import tool
|
| 3 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_chroma import Chroma
|
| 5 |
+
from langchain_core.messages import SystemMessage, HumanMessage
|
| 6 |
+
|
| 7 |
+
import functools
|
| 8 |
+
|
| 9 |
+
@functools.lru_cache(maxsize=1)
def get_cached_embeddings():
    """Return a process-wide singleton HuggingFace embedding model (all-MiniLM-L6-v2)."""
    # lru_cache(maxsize=1) means the sentence-transformer weights are loaded at
    # most once per process, instead of once per vector-store lookup.
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
| 12 |
+
|
| 13 |
+
def get_10k_vector_db(ticker: str) -> Chroma:
    """Loads a pre-computed 10-K Chroma Vector Database from disk."""
    symbol = ticker.upper()
    store_path = f"./chroma_db/{symbol}_10k"

    # Fail fast with actionable instructions if the store was never built.
    if not os.path.exists(store_path):
        raise FileNotFoundError(
            f"10-K data for {symbol} has not been ingested. "
            f"Please run the ingestion pipeline: `python ingest.py --tickers {symbol}`"
        )

    # Reuse the cached embedding model so agent loops don't reload model weights.
    return Chroma(
        persist_directory=store_path,
        embedding_function=get_cached_embeddings(),
    )
|
| 30 |
+
|
| 31 |
+
@tool
def search_10k_filings(ticker: str, query: str, llm=None) -> str:
    """Searches 10-K and returns a CONCISE summary of findings."""
    # NOTE(review): `llm` is part of the tool signature, so it may surface in the
    # generated tool schema and confuse the calling agent — confirm callers never
    # try to populate it; in practice it defaults to None and the raw-context
    # branch below is taken.
    try:
        # Raises FileNotFoundError if this ticker's 10-K store was never ingested;
        # that (and any other failure) is reported as an "Error: ..." string.
        db = get_10k_vector_db(ticker)
        results = db.similarity_search(query, k=2)

        if not results:
            return f"No info found for {query}."

        # Combine the text
        context = "\n".join([doc.page_content for doc in results])

        # We can use the LLM to 'pre-process' the data so the Supervisor stays clean
        # Note: You'll need to pass the 'llm' object into this tool or initialize a local one
        if llm:
            response = llm.invoke([
                SystemMessage(content="You are a helpful assistant."),
                HumanMessage(content=f"Summarize the following 10-K findings for {ticker} regarding {query}:\n\n{context}")
            ])
            return response.content
        else:
            # Default path: return the retrieved chunks verbatim under a header.
            return f"SUMMARY OF 10-K FINDINGS FOR {ticker} ({query}):\n\n{context}"

    except Exception as e:
        # Tool contract: errors are returned as strings, never raised to the agent.
        return f"Error: {str(e)}"
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
    # Manual smoke test; requires the TSLA 10-K Chroma store to exist on disk.
    # Test using .invoke() as we discussed for sec_tools
    # (LangChain tools are invoked with a dict matching their argument schema.)
    print(search_10k_filings.invoke({"ticker": "TSLA", "query": "marketing risks"}))
|
core/runner.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_openai import ChatOpenAI
|
| 2 |
+
from langchain_core.messages import HumanMessage
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
from .config import Settings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def create_llm(settings: Settings) -> ChatOpenAI:
    """Construct the shared ChatOpenAI client from application settings."""
    # max_tokens caps each completion; this stops Groq from 'reserving'
    # 8000+ tokens per API call.
    llm_kwargs = dict(
        model=settings.openai_model,
        api_key=settings.openai_api_key,
        base_url=settings.openai_base_url,
        temperature=settings.openai_temperature,
        max_tokens=800,
    )
    return ChatOpenAI(**llm_kwargs)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def run_financial_query(compiled_graph, user_query: str) -> dict:
    """
    Run one LangGraph turn. Returns memo (if Summarizer ran) and per-node step contents.

    Args:
        compiled_graph: Compiled LangGraph workflow (see build_financial_graph).
        user_query: The end user's natural-language question.

    Returns:
        dict with keys:
            memo: final Summarizer output (str), or None if it never ran.
            steps: list of {node, content, step_latency, total_latency} dicts
                   for every non-Summarizer node that produced output.
            total_latency: wall-clock seconds for the whole run, rounded to 2dp.
    """
    initial_state = {
        "messages": [HumanMessage(content=user_query)],
        "steps": 0,
        "completed_tasks": set(),
        "pending_tasks": [],
    }
    # Truncate long queries so the LangSmith run name stays readable.
    run_label = user_query if len(user_query) <= 80 else user_query[:77] + "..."
    stream_config = {
        "run_name": run_label,
        "tags": ["fin-agent", "langgraph"],
        "metadata": {"app": "FinAgent"},
    }
    steps: list[dict] = []
    memo: str | None = None

    # Wall-clock timing: per-step latency plus cumulative total.
    start_time = time.time()
    last_time = start_time
    total_latency = 0.0

    for output in compiled_graph.stream(initial_state, stream_config):
        for node_name, state_update in output.items():
            current_time = time.time()
            step_latency = current_time - last_time
            total_latency = current_time - start_time
            last_time = current_time

            if node_name == "Planner":
                # Planner publishes a task list, not a chat message.
                tasks = state_update.get("pending_tasks", [])
                content = f"Generated {len(tasks)} parallel task(s): {[t['task_id'] for t in tasks]}"
            elif node_name == "Supervisor":
                # Supervisor publishes routing info: "FINISH" or a list of agent names.
                next_agents = state_update.get("next", [])
                if next_agents == "FINISH":
                    content = "All tasks complete. Routing to Summarizer."
                else:
                    content = f"Dispatching tasks to: {next_agents}"
            else:
                # Worker/Summarizer nodes publish chat messages; take the latest.
                messages = state_update.get("messages", [])
                if not messages:
                    continue
                content = messages[-1].content

            if node_name == "Summarizer":
                # The Summarizer's output is the final memo, not an intermediate step.
                memo = content
            else:
                steps.append({
                    "node": node_name,
                    "content": content,
                    "step_latency": round(step_latency, 2),
                    "total_latency": round(total_latency, 2)
                })
    return {"memo": memo, "steps": steps, "total_latency": round(total_latency, 2)}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
async def astream_financial_query(compiled_graph, user_query: str):
    """
    Async generator yielding Server-Sent Events (SSE) for each graph step.
    Useful for streaming over HTTP.

    Each yielded item is one SSE frame ("data: <json>" terminated by a blank
    line) whose JSON payload has keys: node, content, step_latency,
    total_latency. Mirrors run_financial_query, but the Summarizer output is
    streamed as a regular frame rather than collected into a memo.
    """
    initial_state = {
        "messages": [HumanMessage(content=user_query)],
        "steps": 0,
        "completed_tasks": set(),
        "pending_tasks": [],
    }
    # Truncate long queries so the LangSmith run name stays readable.
    run_label = user_query if len(user_query) <= 80 else user_query[:77] + "..."
    stream_config = {
        "run_name": run_label,
        "tags": ["fin-agent", "langgraph"],
        "metadata": {"app": "FinAgent"},
    }

    # Wall-clock timing: per-step latency plus cumulative total.
    start_time = time.time()
    last_time = start_time

    async for output in compiled_graph.astream(initial_state, stream_config):
        for node_name, state_update in output.items():
            current_time = time.time()
            step_latency = current_time - last_time
            total_latency = current_time - start_time
            last_time = current_time

            if node_name == "Planner":
                # Planner publishes a task list, not a chat message.
                tasks = state_update.get("pending_tasks", [])
                content = f"Generated {len(tasks)} parallel task(s): {[t['task_id'] for t in tasks]}"
            elif node_name == "Supervisor":
                # Supervisor publishes routing info: "FINISH" or a list of agent names.
                next_agents = state_update.get("next", [])
                if next_agents == "FINISH":
                    content = "All tasks complete. Routing to Summarizer."
                else:
                    content = f"Dispatching tasks to: {next_agents}"
            else:
                # Worker/Summarizer nodes publish chat messages; take the latest.
                messages = state_update.get("messages", [])
                if not messages:
                    continue
                content = messages[-1].content

            data = {
                "node": node_name,
                "content": content,
                "step_latency": round(step_latency, 2),
                "total_latency": round(total_latency, 2)
            }
            yield f"data: {json.dumps(data)}\n\n"
|
core/sec_tools.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from langchain_core.tools import tool
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from typing import Literal
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
import functools
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
USER_AGENT = "Dev Goyal devgoyal9031@gmail.com"
|
| 11 |
+
HEADERS = {"User-Agent": USER_AGENT}
|
| 12 |
+
|
| 13 |
+
@functools.lru_cache(maxsize=1)
def _get_ticker_to_cik_mapping() -> dict[str, str]:
    """Fetches and caches the SEC ticker to CIK mapping.

    Returns:
        Mapping of uppercase ticker symbol -> zero-padded 10-digit CIK string.

    Raises:
        requests.HTTPError: if the SEC endpoint returns a non-2xx status
        (the error then stays cached for the process lifetime via lru_cache? No —
        lru_cache does not cache raised exceptions, so a failed fetch is retried
        on the next call).
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    print("[System: Fetching SEC ticker to CIK mapping...]")
    # FIX: explicit timeout — without one, a stalled SEC endpoint would hang
    # the entire agent run indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    data = response.json()

    # data.sec.gov endpoints require the CIK zero-padded to 10 digits.
    return {
        company_info["ticker"].upper(): str(company_info["cik_str"]).zfill(10)
        for company_info in data.values()
    }
|
| 26 |
+
|
| 27 |
+
def get_cik_from_ticker(ticker: str) -> str:
    """Resolve a ticker symbol to its zero-padded 10-digit SEC CIK string."""
    symbol = ticker.upper()
    cik = _get_ticker_to_cik_mapping().get(symbol)
    if cik is None:
        raise ValueError(f"Ticker {symbol} not found in SEC database.")
    return cik
|
| 33 |
+
|
| 34 |
+
def get_latest_10k_url(ticker: str) -> str:
    """Finds the URL for the most recent 10-K filing for a given ticker.

    Returns the full-text submission URL on success. On failure returns a
    human-readable "No 10-K found..." / "Error: ..." string rather than
    raising — callers rely on this string contract.
    """
    try:
        cik = get_cik_from_ticker(ticker)
        url = f"https://data.sec.gov/submissions/CIK{cik}.json"

        print(f"[System: Fetching filing history for CIK {cik}...]")
        # FIX: explicit timeout so a slow SEC endpoint cannot hang the agent run.
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()

        filings = response.json()['filings']['recent']

        # Search for the most recent 10-K.
        # NOTE(review): assumes the 'recent' arrays are newest-first, so the
        # first '10-K' hit is treated as the latest filing — confirm against
        # the SEC submissions API docs.
        for i, form in enumerate(filings['form']):
            if form == '10-K':
                accession_number = filings['accessionNumber'][i]
                # The SEC archive path strips dashes from the accession number.
                accession_no_dashes = accession_number.replace('-', '')

                # Full-text (.txt) submission URL; the CIK here is NOT zero-padded.
                return (
                    f"https://www.sec.gov/Archives/edgar/data/{cik.lstrip('0')}/"
                    f"{accession_no_dashes}/{accession_number}.txt"
                )

        return f"No 10-K found for {ticker}."

    except Exception as e:
        return f"Error: {str(e)}"
|
| 61 |
+
# 1. Define the strict Pydantic Schema
class XBRLConceptInput(BaseModel):
    # Tool-argument schema for get_company_concept_xbrl. The Literal below
    # constrains the LLM to a closed set of valid us-gaap concept names so it
    # cannot hallucinate arbitrary XBRL tags.
    ticker: str = Field(
        ...,
        description="The official uppercase ticker symbol (e.g., AAPL)."
    )
    # Case-sensitive: these strings must match the us-gaap taxonomy exactly,
    # since they are interpolated directly into the SEC companyconcept URL.
    concept: Literal[
        "Revenues",
        "NetIncomeLoss",
        "Assets",
        "Liabilities",
        "GrossProfit",
        "OperatingIncomeLoss",
        "AssetsCurrent",
        "LiabilitiesCurrent",
        "NetCashProvidedByUsedInOperatingActivities",
        "PaymentsToAcquirePropertyPlantAndEquipment",
        "EntityCommonStockSharesOutstanding"
    ] = Field(
        ...,
        description="You MUST select the exact SEC XBRL concept from this list that best matches the user's request."
    )
|
| 83 |
+
|
| 84 |
+
# 2. Bind the schema to the tool
|
| 85 |
+
# 2. Bind the schema to the tool
@tool(args_schema=XBRLConceptInput)
def get_company_concept_xbrl(ticker: str, concept: str) -> str:
    """
    Fetches official SEC accounting metrics for a company across recent quarters.
    CRITICAL INSTRUCTIONS:
    1. 'ticker': Must be the official uppercase ticker symbol (e.g., AAPL).
    2. 'concept': You MUST use one of these exact SEC XBRL concepts (case-sensitive):
        -- Core Size --
        - 'Revenues' (Total Revenue / Sales)
        - 'NetIncomeLoss' (Net Income / Profit)
        - 'Assets' (Total Assets)
        - 'Liabilities' (Total Liabilities)

        -- Margins & Liquidity --
        - 'GrossProfit' (Revenue minus Cost of Goods Sold)
        - 'OperatingIncomeLoss' (Operating Income)
        - 'AssetsCurrent' (Short-term assets like cash/inventory)
        - 'LiabilitiesCurrent' (Short-term debt)

        -- Cash Flow & Valuation --
        - 'NetCashProvidedByUsedInOperatingActivities' (Operating Cash Flow)
        - 'PaymentsToAcquirePropertyPlantAndEquipment' (Capital Expenditures / CapEx)
        - 'EntityCommonStockSharesOutstanding' (Total shares outstanding)

    Do not guess concepts. Only use the exact strings listed above.
    """
    try:
        cik = get_cik_from_ticker(ticker)
        url = f"https://data.sec.gov/api/xbrl/companyconcept/CIK{cik}/us-gaap/{concept}.json"

        print(f"[System: Fetching latest {concept} for {ticker}...]")
        # FIX: explicit timeout so a slow SEC endpoint cannot stall the run.
        response = requests.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        data = response.json()

        # NOTE(review): only USD-denominated facts are handled; share-count
        # concepts reported in 'shares' units will hit this branch — confirm
        # whether EntityCommonStockSharesOutstanding should read other units.
        if "USD" not in data.get("units", {}):
            return f"No USD data found for {concept}."

        # 1. Convert to DataFrame
        df = pd.DataFrame(data["units"]["USD"])

        # 2. Convert date strings to datetime objects
        df['end'] = pd.to_datetime(df['end'])
        df['filed'] = pd.to_datetime(df['filed'])

        # 3. Filter for standard filings to avoid "preliminary" noise
        df = df[df['form'].isin(['10-Q', '10-K', '10-K/A', '10-Q/A'])]

        # 4. CRITICAL: Deduplicate.
        # If the same period ('end') is reported multiple times, take the most recently filed one.
        df = df.sort_values(by=['end', 'filed'], ascending=[False, False])
        df = df.drop_duplicates(subset=['end'])

        # 5. Filter for the last 2 years
        current_year = datetime.now().year
        df = df[df['end'].dt.year >= (current_year - 2)]

        # 6. Take top 4 most recent periods
        df = df.head(4)

        if df.empty:
            # FIX: year range is now derived from current_year instead of the
            # previously hard-coded "(2024-2026)" text, keeping the message
            # consistent with the dynamic filter above.
            return (
                f"No recent ({current_year - 2}-{current_year}) {concept} "
                f"data available for {ticker}."
            )

        summary = f"Latest official {concept} data for {ticker}:\n"
        for _, row in df.iterrows():
            formatted_val = f"${int(row['val']):,}"
            date_str = row['end'].strftime('%Y-%m-%d')
            summary += f"- Period End: {date_str} (Filed: {row['filed'].strftime('%Y-%m-%d')}): {formatted_val}\n"

        return summary

    except Exception as e:
        return f"Error fetching XBRL data: {str(e)}"
|
| 158 |
+
|
| 159 |
+
# Quick test block for the new function
|
| 160 |
+
if __name__ == "__main__":
    # Manual smoke tests: both hit live SEC endpoints, so network access is required.
    test_ticker = "MSFT"

    # Test 1: URL fetcher
    try:
        url = get_latest_10k_url(test_ticker)
        print(f"\n10-K URL: {url}")
    except Exception as e:
        # get_latest_10k_url normally returns error strings; this guard catches
        # anything unexpected (e.g. failures before its own try block engages).
        print(f"URL Fetch Failed: {e}")

    # Test 2: XBRL fetcher (LangChain tools are invoked with a dict of args)
    test_concept = "NetIncomeLoss"
    print(get_company_concept_xbrl.invoke({"ticker": test_ticker, "concept": test_concept}))
|
core/sentiment_tools.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
from langchain_core.tools import tool
|
| 4 |
+
|
| 5 |
+
@tool
def get_recent_news(ticker: str) -> str:
    """
    Fetches the most recent news headlines for a given stock ticker.
    CRITICAL INSTRUCTIONS:
    1. 'ticker': Must be the official uppercase ticker symbol (e.g., AAPL). DO NOT pass the full company name.
    2. Use this tool to gauge current market sentiment, breaking news, and short-term catalysts.
    """
    try:
        ticker = ticker.upper()
        print(f"\n[System: Fetching latest news for {ticker} via Yahoo Finance RSS...]")

        # Hit the official Yahoo Finance RSS endpoint
        url = f"https://feeds.finance.yahoo.com/rss/2.0/headline?s={ticker}&region=US&lang=en-US"

        # A standard web browser User-Agent so Yahoo doesn't block us
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # FIX: explicit timeout — without one, an unresponsive feed would hang
        # the entire agent run indefinitely.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse the XML feed using BeautifulSoup
        soup = BeautifulSoup(response.content, features="xml")
        items = soup.find_all("item")

        if not items:
            return f"No recent news found for {ticker}."

        summary = f"Recent News Headlines for {ticker}:\n\n"

        # Grab the top 10 most recent articles
        # (FIX: comment previously said "top 5" while the slice takes 10).
        for i, item in enumerate(items[:10]):
            title = item.title.text if item.title else "No Title"
            # RSS provides nicely formatted publication dates
            pub_date = item.pubDate.text if item.pubDate else "Recent"

            summary += f"{i+1}. [{pub_date}] {title}\n"

        return summary

    except Exception as e:
        # Tool contract: errors are returned as strings, never raised to the agent.
        return f"Error fetching news for {ticker}: {str(e)}"
|
| 49 |
+
|
| 50 |
+
# Quick test block
|
| 51 |
+
if __name__ == "__main__":
    # Manual smoke test: hits the live Yahoo Finance RSS feed (network required).
    test_ticker = "NVDA"
    print("Testing News Pipeline...")
    # LangChain tools are invoked with a dict matching their argument schema.
    print(get_recent_news.invoke({"ticker": test_ticker}))
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
api:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile.api
|
| 6 |
+
ports:
|
| 7 |
+
- "8000:8000"
|
| 8 |
+
env_file:
|
| 9 |
+
- .env
|
| 10 |
+
volumes:
|
| 11 |
+
- ./:/app # Hot-reloading mapping
|
| 12 |
+
- ./chroma_db:/app/chroma_db # Persist the vector database locally
|
| 13 |
+
command: uvicorn backend.api:app --host 0.0.0.0 --port 8000 --reload
|
| 14 |
+
|
| 15 |
+
ui:
|
| 16 |
+
build:
|
| 17 |
+
context: .
|
| 18 |
+
dockerfile: Dockerfile.ui
|
| 19 |
+
ports:
|
| 20 |
+
- "8501:8501"
|
| 21 |
+
environment:
|
| 22 |
+
- API_URL=http://api:8000/chat/stream
|
| 23 |
+
volumes:
|
| 24 |
+
- ./:/app # Hot-reloading mapping
|
| 25 |
+
- ./chroma_db:/app/chroma_db
|
| 26 |
+
depends_on:
|
| 27 |
+
- api
|
frontend/streamlit_app.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import requests
import json
import os

# Configure the API URL. In local dev it is 127.0.0.1.
# Via docker-compose, this is overridden to http://api:8000/chat/stream via ENV.
API_URL = os.getenv("API_URL", "http://127.0.0.1:8000/chat/stream")

# (connect, read) timeouts in seconds for the streaming POST.  Without these a
# hung backend would freeze the UI forever.  The read timeout is generous
# because the multi-agent graph can take a while between SSE events.
REQUEST_TIMEOUT = (10, 600)

st.set_page_config(page_title="FinAgent Portfolio", page_icon="📈", layout="wide")

st.title("📈 🤖 FinAgent: Autonomous Financial AI")

with st.sidebar:
    st.markdown("### 👨💻 About this Agent")
    st.markdown(
        "This application uses **LangGraph** to construct a deterministic multi-agent state machine. "
        "The **Planner Agent** parses the query, while the **Supervisor** appropriately routes tasks "
        "to specialized **Quant, Fundamental, Sentiment, and Earnings Agents**.\n\n"
        "Finally, the **Summarizer** compiles a comprehensive Investment Memo."
    )

    st.divider()

    # Automatically survey the indexed ChromaDB vector stores so the sidebar
    # reflects exactly which tickers have been ingested on disk.
    available_tickers = []
    earnings_tickers = []
    db_root = "./chroma_db"
    if os.path.exists(db_root):
        for entry in os.listdir(db_root):
            # Only directories are real Chroma collections; ignore stray files.
            if not os.path.isdir(os.path.join(db_root, entry)):
                continue
            if entry.endswith("_10k"):
                available_tickers.append(entry.replace("_10k", ""))
            elif entry.endswith("_earnings"):
                earnings_tickers.append(entry.replace("_earnings", ""))

    if available_tickers:
        st.markdown("### 📚 Supported 10-K Data")
        st.markdown("Deep RAG (Fundamental SEC filings) currently verified & compiled for:")
        cols = st.columns(4)
        for i, t in enumerate(sorted(available_tickers)):
            cols[i % 4].code(t)

    if earnings_tickers:
        st.markdown("### 🎙️ Earnings Call Data")
        st.markdown("Ingested earnings-call transcripts available for:")
        cols = st.columns(4)
        for i, t in enumerate(sorted(earnings_tickers)):
            cols[i % 4].code(t)

    st.divider()

    st.markdown("### ⚡ Recruiter Quick-Test")
    st.markdown("Try one of these example queries to see the multi-agent graph in action:")

    if st.button("🍎 Apple Financial Overview"):
        st.session_state.example_query = "What is the price, sentiment, and recent 10-K risks for Apple (AAPL)?"

    if st.button("🏎️ Tesla Breaking Sentiment"):
        st.session_state.example_query = "What is the latest news sentiment for TSLA?"

    if st.button("💻 MSFT vs GOOGL"):
        st.session_state.example_query = "Compare the current stock performance of Microsoft and Google."

    if st.button("🎙️ Earnings Call Analysis"):
        st.session_state.example_query = "Analyze the latest earnings call for Apple (AAPL) — compare management tone in prepared remarks vs Q&A and show keyword trends."

    st.divider()
    st.caption("Powered by Llama-3.1-8B via Groq")

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat history on screen
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        if message["role"] == "assistant" and "steps" in message and message["steps"]:
            # Render past steps in a collapsed status box
            total_time = message.get("total_latency", 0)
            title = f"✅ Investment Memo Generated! (Total Latency: {total_time}s)" if total_time else "✅ Investment Memo Generated!"
            with st.status(title, expanded=False):
                for step in message["steps"]:
                    lat = step.get('step_latency', 0)
                    lat_str = f"({lat}s) " if lat else ""
                    st.write(f"**[{step['node']}]** {lat_str}{step['content']}")
        st.markdown(message["content"])

# Unconditionally render the chat_input so it NEVER disappears from the UI
chat_val = st.chat_input("Ask about any stock ticker (e.g. AAPL, TSLA, NVDA)...")

# A sidebar quick-test button takes priority over typed input; pop() both
# reads and clears the pending example query in a single step.
if st.session_state.get("example_query"):
    prompt = st.session_state.pop("example_query")
else:
    prompt = chat_val

if prompt:
    # Render user message
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        # We will stream the intermediate node outputs into a dynamic expanding status box
        status_box = st.status("🧠 Consulting Specialized AI Agents...", expanded=True)
        final_memo_placeholder = st.empty()

        try:
            # Stream the response via POST request, bounded by REQUEST_TIMEOUT
            with requests.post(API_URL, json={"query": prompt}, stream=True, timeout=REQUEST_TIMEOUT) as response:
                response.raise_for_status()

                final_memo = ""
                final_total_latency = 0
                session_steps = []

                # Consume Server-Sent Events (SSE)
                for line in response.iter_lines():
                    if not line:
                        continue
                    decoded_line = line.decode('utf-8')
                    if not decoded_line.startswith("data: "):
                        continue
                    data_str = decoded_line[len("data: "):]
                    try:
                        data = json.loads(data_str)
                    except json.JSONDecodeError:
                        # Skip malformed or partial SSE payloads
                        continue

                    node = data.get("node")
                    content = data.get("content")
                    step_latency = data.get("step_latency", 0)
                    total_latency = data.get("total_latency", 0)

                    if node == "Summarizer":
                        # The final node returns the full markdown report
                        final_memo = content
                        final_total_latency = total_latency
                        final_memo_placeholder.markdown(final_memo)
                        status_box.update(label=f"✅ Investment Memo Generated! (Total Latency: {total_latency}s)", state="complete", expanded=False)
                    else:
                        # Show what the different agents (Quant, Sentiment, etc.) are calculating
                        lat_str = f"({step_latency}s) " if step_latency else ""
                        status_box.write(f"**[{node}]** {lat_str}{content}")
                        session_steps.append({
                            "node": node,
                            "content": content,
                            "step_latency": step_latency,
                            "total_latency": total_latency
                        })

            # Save the final memo and intermediate steps to history.  Prefer the
            # Summarizer's own total_latency; fall back to the last step's value.
            if final_memo:
                if not final_total_latency and session_steps:
                    final_total_latency = session_steps[-1].get("total_latency", 0)
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": final_memo,
                    "steps": session_steps,
                    "total_latency": final_total_latency
                })

        except requests.exceptions.RequestException as e:
            status_box.update(label="❌ Connection Error", state="error", expanded=False)
            st.error(f"Failed to connect to the backend FastAPI server: {e}")
|
requirements.txt
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# AI Orchestration & Agent Framework
langchain>=0.2.0
langchain-core>=0.2.0
langchain-openai>=0.1.0
langgraph>=0.0.60
pydantic>=2.0.0
pydantic-settings>=2.0.0
fastapi>=0.115.0
uvicorn[standard]>=0.30.0

# Quantitative Data & Math
yfinance>=0.2.40
pandas>=2.2.0

# SEC API & Web Scraping (10-K / 8-K download and HTML cleanup)
requests>=2.31.0
beautifulsoup4>=4.12.0
lxml>=5.2.0

# Vector DB & Embeddings (RAG)
langchain-community>=0.0.10
langchain-text-splitters>=0.2.0
langchain-huggingface>=0.0.3
sentence-transformers>=3.0.0
chromadb>=0.5.0
langchain-chroma>=0.1.2

# Utilities (.env loading)
python-dotenv>=1.0.0

# UI
streamlit>=1.39.0
|
scripts/ingest.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import argparse
|
| 3 |
+
import requests
|
| 4 |
+
import re
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
|
| 7 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from langchain_chroma import Chroma
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
|
| 12 |
+
from core.sec_tools import get_latest_10k_url, HEADERS
|
| 13 |
+
|
| 14 |
+
def ingest_10k(ticker: str) -> None:
    """Download, clean, chunk, and embed a ticker's latest 10-K into Chroma.

    The vector store is persisted under ``./chroma_db/<TICKER>_10k``; if that
    directory already exists the ingestion is skipped entirely.

    Raises:
        requests.HTTPError: if the SEC download returns an error status.
    """
    ticker = ticker.upper()
    persist_directory = f"./chroma_db/{ticker}_10k"

    if os.path.exists(persist_directory):
        print(f"[Ingest: Vector DB for {ticker} 10-K already exists at {persist_directory}. Skipping...]")
        return

    print("\n==============================================")
    print(f"Starting Ingestion Pipeline for {ticker}")
    print("==============================================")

    # get_latest_10k_url signals failure via an error string, not an exception.
    url = get_latest_10k_url(ticker)
    if url.startswith("Error") or url.startswith("No 10-K"):
        print(f"[Error: SEC URL Fetch failed: {url}]")
        return

    print(f"[1/4] Downloading raw 10-K from SEC: {url}")
    # Bound the network call: SEC can be slow but must not hang the pipeline.
    response = requests.get(url, headers=HEADERS, timeout=60)
    response.raise_for_status()
    raw_text = response.text

    print("[2/4] Parsing HTML and isolating text payload...")
    # EDGAR full-submission files wrap each exhibit in <DOCUMENT> tags; when
    # present, keep only the first document (the 10-K itself).
    doc_match = re.search(r'<DOCUMENT>(.*?)</DOCUMENT>', raw_text, re.DOTALL | re.IGNORECASE)
    if doc_match:
        raw_text = doc_match.group(1)

    soup = BeautifulSoup(raw_text, "html.parser")
    clean_text = soup.get_text(separator=" ", strip=True)

    print("[3/4] Chunking document...")
    # Overlapping chunks keep cross-boundary sentences retrievable.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    docs = [Document(page_content=clean_text, metadata={"source": url, "ticker": ticker})]
    chunks = text_splitter.split_documents(docs)

    print(f"[4/4] Embedding {len(chunks)} chunks into Chroma DB. (This may take a minute) ...")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )

    print(f"[Success] {ticker} 10-K successfully ingested.")
|
| 62 |
+
|
| 63 |
+
if __name__ == "__main__":
    # CLI entry point: ingest one or more tickers' latest 10-K filings.
    arg_parser = argparse.ArgumentParser(description="Ingest SEC 10-K filings into Chroma DB.")
    arg_parser.add_argument(
        "--tickers",
        nargs="+",
        required=True,
        help="List of stock tickers to ingest (e.g., --tickers AAPL MSFT TSLA)"
    )
    cli_args = arg_parser.parse_args()

    # Ensure the on-disk vector-store root exists before any ingestion runs.
    os.makedirs("./chroma_db", exist_ok=True)

    # One failing ticker must not abort the rest of the batch.
    for symbol in cli_args.tickers:
        try:
            ingest_10k(symbol)
        except Exception as e:
            print(f"[Error] Failed to ingest {symbol}: {str(e)}")
|
scripts/ingest_earnings_calls.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
CLI script to ingest earnings-call transcripts into ChromaDB.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/ingest_earnings_calls.py --tickers AAPL MSFT --quarters Q4-2024 Q1-2025
|
| 7 |
+
python scripts/ingest_earnings_calls.py --tickers TSLA --quarters Q1-2025
|
| 8 |
+
|
| 9 |
+
Data sources (tried in order):
|
| 10 |
+
1. Alpha Vantage EARNINGS_CALL_TRANSCRIPT (requires premium key)
|
| 11 |
+
2. SEC EDGAR 8-K filings (free, always available)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
+
|
| 20 |
+
load_dotenv()
|
| 21 |
+
|
| 22 |
+
# Ensure project root is on sys.path so `core.*` imports work
|
| 23 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
| 24 |
+
|
| 25 |
+
from core.config import Settings
|
| 26 |
+
from core.earnings_tools import ingest_earnings_call, parse_quarter
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def main():
    """CLI driver: ingest each (ticker, quarter) pair and print a summary."""
    arg_parser = argparse.ArgumentParser(
        description="Ingest earnings-call transcripts into ChromaDB."
    )
    arg_parser.add_argument(
        "--tickers",
        nargs="+",
        required=True,
        help="Stock tickers to ingest (e.g. --tickers AAPL MSFT)",
    )
    arg_parser.add_argument(
        "--quarters",
        nargs="+",
        required=True,
        help="Quarters to ingest, format Q<N>-<YYYY> (e.g. --quarters Q4-2024 Q1-2025)",
    )
    cli = arg_parser.parse_args()

    settings = Settings()
    api_key = settings.alpha_vantage_api_key or os.getenv("ALPHA_VANTAGE_API_KEY", "")
    chroma_path = settings.earnings_chroma_path

    os.makedirs(chroma_path, exist_ok=True)

    # Validate every quarter string before doing any network work, so a single
    # malformed spec aborts the run immediately.
    parsed_quarters: list[tuple[int, int]] = []
    for spec in cli.quarters:
        try:
            parsed_quarters.append(parse_quarter(spec))
        except ValueError as e:
            print(f"[Error] {e}")
            sys.exit(1)

    results: list[dict] = []
    banner = "=" * 50

    for raw_ticker in cli.tickers:
        symbol = raw_ticker.upper()
        for quarter, year in parsed_quarters:
            print(f"\n{banner}")
            print(f"Ingesting {symbol} Q{quarter}-{year}")
            print(f"{banner}")
            # A single failed ingest should not stop the batch; record it and move on.
            try:
                outcome = ingest_earnings_call(
                    ticker=symbol,
                    quarter=quarter,
                    year=year,
                    api_key=api_key,
                    chroma_path=chroma_path,
                )
            except Exception as e:
                print(f"[Error] Failed to ingest {symbol} Q{quarter}-{year}: {e}")
                outcome = "error"

            results.append(
                {"ticker": symbol, "quarter": f"Q{quarter}-{year}", "status": outcome}
            )

    # Human-readable summary table with one icon per outcome.
    print(f"\n{banner}")
    print("INGEST SUMMARY")
    print(f"{banner}")
    status_icons = {
        "success": "✅",
        "partial": "⚠️",
        "failed": "❌",
        "exists": "⏭️",
        "error": "💥",
    }
    for row in results:
        icon = status_icons.get(row["status"], "❓")
        print(f" {icon} {row['ticker']} {row['quarter']}: {row['status']}")

    failed = [row for row in results if row["status"] in ("failed", "error")]
    if failed:
        print(f"\n{len(failed)} ingest(s) failed. Check logs above.")
        sys.exit(1)
    else:
        print(f"\nAll {len(results)} ingest(s) completed.")


if __name__ == "__main__":
    main()
|
scripts/main.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

from dotenv import load_dotenv
import langchain
from langchain_core.messages import HumanMessage

from core.config import Settings
from core.graph_builder import build_financial_graph
from core.runner import create_llm, run_financial_query

load_dotenv()
# Opt-in verbose LangChain stdout via the LANGCHAIN_DEBUG env var.
langchain.debug = os.getenv("LANGCHAIN_DEBUG", "").lower() in ("1", "true", "yes")


def main():
    """Interactive REPL: route stock questions through the multi-agent graph.

    Builds the compiled LangGraph once, then loops reading queries from stdin
    until the user types 'exit' (or closes/interrupts the input stream).
    """
    settings = Settings()
    llm = create_llm(settings)

    print("--- Multi-Agent System + Summarizer Initialized ---")
    print("Type 'exit' to quit.\n")

    compiled = build_financial_graph(llm)

    while True:
        try:
            user_query = input("\nAsk about a stock: ")
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly on Ctrl-D / Ctrl-C instead of dumping a traceback.
            print()
            break
        # Tolerate surrounding whitespace and any casing of the exit command.
        if user_query.strip().lower() == "exit":
            break

        print("\n--- Agent Workflow Started ---")
        result = run_financial_query(compiled, user_query)
        for step in result["steps"]:
            print(f"\n[{step['node']}]: {step['content']}")
        if result.get("memo") is not None:
            print("\n--- Investment Memo ---")
            print(result["memo"])
        print("\n--- Workflow Complete ---")


if __name__ == "__main__":
    main()
|
supervisord.conf
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; Runs the FastAPI backend and the Streamlit UI side by side in one container.
[supervisord]
; Stay in the foreground so supervisord can serve as the container's PID 1.
nodaemon=true
logfile=/tmp/supervisord.log
pidfile=/tmp/supervisord.pid

[program:fastapi]
command=uvicorn backend.api:app --host 0.0.0.0 --port 8000
directory=/app
autostart=true
autorestart=true
; Forward child output to the container's stdout/stderr; maxbytes=0 disables rotation.
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0

[program:streamlit]
; NOTE(review): port 7860 is presumably the port the hosting platform exposes
; (Hugging Face Spaces default) — confirm against the deployment config.
command=streamlit run frontend/streamlit_app.py --server.port=7860 --server.address=0.0.0.0 --server.headless=true --browser.gatherUsageStats=false
directory=/app
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
|