Spaces:
Running
Running
Tom
Claude
commited on
Commit
·
6466c00
1
Parent(s):
b47c9fb
Add complete RAG-powered OSINT investigation assistant
Browse filesImplements Gradio app with Supabase PGVector and HuggingFace Inference API for generating structured OSINT investigation methodologies. Features include semantic tool retrieval from 344+ tools, chat interface, and REST API endpoints.
Key components:
- Gradio ChatInterface with auto-generated API
- Supabase PGVector for 768-dim semantic search
- HuggingFace Inference Provider (Llama-3.1-8B)
- RAG pipeline with LangChain-style architecture
- Low-hallucination prompts (temp=0.2, max_tokens=600)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- .env.example +55 -0
- .gitignore +50 -0
- .mcp.json +8 -0
- QUICKSTART.md +15 -0
- README.md +275 -6
- app.py +257 -0
- requirements.txt +14 -0
- src/__init__.py +3 -0
- src/llm_client.py +195 -0
- src/prompts.py +105 -0
- src/rag_pipeline.py +195 -0
- src/vectorstore.py +280 -0
.env.example
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OSINT Investigation Assistant - Environment Variables
|
| 2 |
+
|
| 3 |
+
# =============================================================================
|
| 4 |
+
# REQUIRED: Supabase Database Connection
|
| 5 |
+
# =============================================================================
|
| 6 |
+
# PostgreSQL connection string for your Supabase database
|
| 7 |
+
# Format: postgresql://[user]:[password]@[host]:[port]/[database]
|
| 8 |
+
# Get this from: Supabase Dashboard > Project Settings > Database > Connection String
|
| 9 |
+
SUPABASE_CONNECTION_STRING=postgresql://postgres:[YOUR-PASSWORD]@db.[PROJECT-REF].supabase.co:5432/postgres
|
| 10 |
+
|
| 11 |
+
# =============================================================================
|
| 12 |
+
# REQUIRED: Hugging Face API Token
|
| 13 |
+
# =============================================================================
|
| 14 |
+
# Get your token from: https://huggingface.co/settings/tokens
|
| 15 |
+
# This is used for Inference Providers API access
|
| 16 |
+
HF_TOKEN=hf_your_token_here
|
| 17 |
+
|
| 18 |
+
# =============================================================================
|
| 19 |
+
# OPTIONAL: LLM Configuration
|
| 20 |
+
# =============================================================================
|
| 21 |
+
# Model to use for generation (default: meta-llama/Llama-3.1-8B-Instruct)
|
| 22 |
+
# Other options:
|
| 23 |
+
# - meta-llama/Meta-Llama-3-8B-Instruct
|
| 24 |
+
# - Qwen/Qwen2.5-72B-Instruct
|
| 25 |
+
# - mistralai/Mistral-7B-Instruct-v0.3
|
| 26 |
+
LLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
|
| 27 |
+
|
| 28 |
+
# Temperature for LLM generation (0.0 to 1.0, default: 0.7)
|
| 29 |
+
# Lower = more focused/deterministic, Higher = more creative/diverse
|
| 30 |
+
LLM_TEMPERATURE=0.7
|
| 31 |
+
|
| 32 |
+
# Maximum tokens to generate (default: 2000)
|
| 33 |
+
LLM_MAX_TOKENS=2000
|
| 34 |
+
|
| 35 |
+
# =============================================================================
|
| 36 |
+
# OPTIONAL: Vector Store Configuration
|
| 37 |
+
# =============================================================================
|
| 38 |
+
# Number of tools to retrieve for context (default: 5)
|
| 39 |
+
RETRIEVAL_K=5
|
| 40 |
+
|
| 41 |
+
# Embedding model for vector search (default: sentence-transformers/all-mpnet-base-v2)
|
| 42 |
+
# Note: Database uses 768-dimensional embeddings
|
| 43 |
+
EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
|
| 44 |
+
|
| 45 |
+
# =============================================================================
|
| 46 |
+
# OPTIONAL: Gradio Configuration
|
| 47 |
+
# =============================================================================
|
| 48 |
+
# Port for Gradio app (default: 7860)
|
| 49 |
+
GRADIO_PORT=7860
|
| 50 |
+
|
| 51 |
+
# Server name (default: 0.0.0.0 for all interfaces)
|
| 52 |
+
GRADIO_SERVER_NAME=0.0.0.0
|
| 53 |
+
|
| 54 |
+
# Enable Gradio sharing link (default: False)
|
| 55 |
+
GRADIO_SHARE=False
|
.gitignore
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables (contains secrets)
|
| 2 |
+
.env
|
| 3 |
+
|
| 4 |
+
# Python
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.py[cod]
|
| 7 |
+
*$py.class
|
| 8 |
+
*.so
|
| 9 |
+
.Python
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# Virtual environments
|
| 27 |
+
venv/
|
| 28 |
+
env/
|
| 29 |
+
ENV/
|
| 30 |
+
.venv
|
| 31 |
+
|
| 32 |
+
# IDE
|
| 33 |
+
.vscode/
|
| 34 |
+
.idea/
|
| 35 |
+
*.swp
|
| 36 |
+
*.swo
|
| 37 |
+
*~
|
| 38 |
+
|
| 39 |
+
# Jupyter Notebook
|
| 40 |
+
.ipynb_checkpoints
|
| 41 |
+
|
| 42 |
+
# macOS
|
| 43 |
+
.DS_Store
|
| 44 |
+
|
| 45 |
+
# Gradio
|
| 46 |
+
gradio_cached_examples/
|
| 47 |
+
flagged/
|
| 48 |
+
|
| 49 |
+
# Logs
|
| 50 |
+
*.log
|
.mcp.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mcpServers": {
|
| 3 |
+
"supabase": {
|
| 4 |
+
"type": "http",
|
| 5 |
+
"url": "https://mcp.supabase.com/mcp?project_ref=zhprqpnxpdcmsjukpurx"
|
| 6 |
+
}
|
| 7 |
+
}
|
| 8 |
+
}
|
QUICKSTART.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OSINT RAG App Quickstart
|
| 2 |
+
|
| 3 |
+
## Stack
|
| 4 |
+
- **Frontend**: Gradio 4.0+ (ChatInterface with auto API endpoints)
|
| 5 |
+
- **Database**: Supabase PGVector (768-dim embeddings, HNSW index)
|
| 6 |
+
- **LLM**: HuggingFace Inference API (Llama-3.1-8B-Instruct)
|
| 7 |
+
- **Embeddings**: HuggingFace Inference API (all-mpnet-base-v2, 768-dim)
|
| 8 |
+
- **Client**: Supabase Python client + InferenceClient (huggingface_hub)
|
| 9 |
+
|
| 10 |
+
## Key Parameters
|
| 11 |
+
- **Temperature**: 0.2 (low hallucination)
|
| 12 |
+
- **Max Tokens**: 600 (short responses)
|
| 13 |
+
- **Retrieval K**: 5 tools
|
| 14 |
+
- **Match Threshold**: 0.5 (cosine similarity)
|
| 15 |
+
- **Connection**: Transaction Pooler (port 6543)
|
README.md
CHANGED
|
@@ -1,13 +1,282 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
short_description:
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OSINT Investigation Assistant
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
short_description: RAG-powered OSINT investigation assistant with 344+ tools
|
| 11 |
+
license: mit
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# 🔍 OSINT Investigation Assistant
|
| 15 |
+
|
| 16 |
+
A RAG-powered AI assistant that helps investigators develop structured methodologies for open-source intelligence (OSINT) investigations. Built with LangChain, Supabase PGVector, and Hugging Face Inference Providers.
|
| 17 |
+
|
| 18 |
+
## ✨ Features
|
| 19 |
+
|
| 20 |
+
- **🎯 Structured Methodologies**: Generate step-by-step investigation plans tailored to your query
|
| 21 |
+
- **🛠️ 344+ OSINT Tools**: Access recommendations from a comprehensive database of curated OSINT tools
|
| 22 |
+
- **🔍 Context-Aware Retrieval**: Semantic search finds the most relevant tools for your investigation
|
| 23 |
+
- **🚀 API Access**: Built-in REST API for integration with external applications
|
| 24 |
+
- **💬 Chat Interface**: User-friendly conversational interface
|
| 25 |
+
- **🔌 MCP Support**: Can be extended to work with AI agents via MCP protocol
|
| 26 |
+
|
| 27 |
+
## 🏗️ Architecture
|
| 28 |
+
|
| 29 |
+
```
|
| 30 |
+
┌──────────────────────────────────────┐
|
| 31 |
+
│ Gradio UI + API Endpoints │
|
| 32 |
+
└──────────────┬───────────────────────┘
|
| 33 |
+
│
|
| 34 |
+
┌──────────────▼───────────────────────┐
|
| 35 |
+
│ LangChain RAG Pipeline │
|
| 36 |
+
│ • Query Understanding │
|
| 37 |
+
│ • Tool Retrieval (PGVector) │
|
| 38 |
+
│ • Response Generation (LLM) │
|
| 39 |
+
└──────────────┬───────────────────────┘
|
| 40 |
+
│
|
| 41 |
+
┌──────────┴──────────┐
|
| 42 |
+
│ │
|
| 43 |
+
┌───▼───────────┐ ┌─────▼────────────┐
|
| 44 |
+
│ Supabase │ │ HF Inference │
|
| 45 |
+
│ PGVector DB │ │ Providers │
|
| 46 |
+
│ (344 tools) │ │ (Llama 3.1) │
|
| 47 |
+
└───────────────┘ └──────────────────┘
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## 🚀 Quick Start
|
| 51 |
+
|
| 52 |
+
### Local Development
|
| 53 |
+
|
| 54 |
+
1. **Clone the repository**
|
| 55 |
+
```bash
|
| 56 |
+
git clone <your-repo-url>
|
| 57 |
+
cd osint-llm
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
2. **Install dependencies**
|
| 61 |
+
```bash
|
| 62 |
+
pip install -r requirements.txt
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
3. **Set up environment variables**
|
| 66 |
+
```bash
|
| 67 |
+
cp .env.example .env
|
| 68 |
+
# Edit .env with your credentials
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
Required variables:
|
| 72 |
+
- `SUPABASE_CONNECTION_STRING`: Your Supabase PostgreSQL connection string
|
| 73 |
+
- `HF_TOKEN`: Your Hugging Face API token
|
| 74 |
+
|
| 75 |
+
4. **Run the application**
|
| 76 |
+
```bash
|
| 77 |
+
python app.py
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
The app will be available at `http://localhost:7860`
|
| 81 |
+
|
| 82 |
+
### Hugging Face Spaces Deployment
|
| 83 |
+
|
| 84 |
+
1. **Create a new Space** on Hugging Face
|
| 85 |
+
2. **Push this repository** to your Space
|
| 86 |
+
3. **Set environment variables** in Space settings:
|
| 87 |
+
- `SUPABASE_CONNECTION_STRING`
|
| 88 |
+
- `HF_TOKEN`
|
| 89 |
+
4. **Deploy** - The Space will automatically build and launch
|
| 90 |
+
|
| 91 |
+
## 📚 Usage
|
| 92 |
+
|
| 93 |
+
### Chat Interface
|
| 94 |
+
|
| 95 |
+
Simply ask your investigation questions:
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
"How do I investigate a suspicious domain?"
|
| 99 |
+
"What tools can I use to verify an image's authenticity?"
|
| 100 |
+
"How can I trace the origin of a social media account?"
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
The assistant will provide:
|
| 104 |
+
1. Investigation overview
|
| 105 |
+
2. Step-by-step methodology
|
| 106 |
+
3. Recommended tools with descriptions and URLs
|
| 107 |
+
4. Best practices and safety considerations
|
| 108 |
+
5. Expected outcomes
|
| 109 |
+
|
| 110 |
+
### Tool Search
|
| 111 |
+
|
| 112 |
+
Use the "Tool Search" tab to directly search for OSINT tools by category or purpose.
|
| 113 |
+
|
| 114 |
+
### API Access
|
| 115 |
+
|
| 116 |
+
This app automatically exposes REST API endpoints for external integration.
|
| 117 |
+
|
| 118 |
+
**Python Client:**
|
| 119 |
+
|
| 120 |
+
```python
|
| 121 |
+
from gradio_client import Client
|
| 122 |
+
|
| 123 |
+
client = Client("your-space-url")
|
| 124 |
+
result = client.predict(
|
| 125 |
+
"How do I investigate a domain?",
|
| 126 |
+
api_name="/investigate"
|
| 127 |
+
)
|
| 128 |
+
print(result)
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
**JavaScript Client:**
|
| 132 |
+
|
| 133 |
+
```javascript
|
| 134 |
+
import { Client } from "@gradio/client";
|
| 135 |
+
|
| 136 |
+
const client = await Client.connect("your-space-url");
|
| 137 |
+
const result = await client.predict("/investigate", {
|
| 138 |
+
message: "How do I investigate a domain?"
|
| 139 |
+
});
|
| 140 |
+
console.log(result.data);
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
**cURL:**
|
| 144 |
+
|
| 145 |
+
```bash
|
| 146 |
+
curl -X POST "https://your-space.hf.space/call/investigate" \
|
| 147 |
+
-H "Content-Type: application/json" \
|
| 148 |
+
-d '{"data": ["How do I investigate a domain?"]}'
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
**Available Endpoints:**
|
| 152 |
+
- `/call/investigate` - Main investigation assistant
|
| 153 |
+
- `/call/search_tools` - Direct tool search
|
| 154 |
+
- `/gradio_api/openapi.json` - OpenAPI specification
|
| 155 |
+
|
| 156 |
+
## 🗄️ Database
|
| 157 |
+
|
| 158 |
+
The app uses Supabase with PGVector extension to store and retrieve OSINT tools.
|
| 159 |
+
|
| 160 |
+
**Database Schema:**
|
| 161 |
+
```sql
|
| 162 |
+
CREATE TABLE bellingcat_tools (
|
| 163 |
+
id BIGINT PRIMARY KEY,
|
| 164 |
+
name TEXT,
|
| 165 |
+
category TEXT,
|
| 166 |
+
content TEXT,
|
| 167 |
+
url TEXT,
|
| 168 |
+
cost TEXT,
|
| 169 |
+
details TEXT,
|
| 170 |
+
embedding VECTOR,
|
| 171 |
+
created_at TIMESTAMP WITH TIME ZONE
|
| 172 |
+
);
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
**Tool Categories:**
|
| 176 |
+
- Archiving & Preservation
|
| 177 |
+
- Social Media Investigation
|
| 178 |
+
- Image & Video Analysis
|
| 179 |
+
- Domain & Network Investigation
|
| 180 |
+
- Geolocation
|
| 181 |
+
- Data Extraction
|
| 182 |
+
- Verification & Fact-Checking
|
| 183 |
+
- And more...
|
| 184 |
+
|
| 185 |
+
## 🛠️ Technology Stack
|
| 186 |
+
|
| 187 |
+
- **UI/API**: [Gradio](https://gradio.app/) - Automatic API generation
|
| 188 |
+
- **RAG Framework**: [LangChain](https://langchain.com/) - Retrieval pipeline
|
| 189 |
+
- **Vector Database**: [Supabase](https://supabase.com/) with PGVector extension
|
| 190 |
+
- **Embeddings**: HuggingFace sentence-transformers
|
| 191 |
+
- **LLM**: [Hugging Face Inference Providers](https://huggingface.co/docs/inference-providers/) - Llama 3.1
|
| 192 |
+
- **Language**: Python 3.9+
|
| 193 |
+
|
| 194 |
+
## 📁 Project Structure
|
| 195 |
+
|
| 196 |
+
```
|
| 197 |
+
osint-llm/
|
| 198 |
+
├── app.py # Main Gradio application
|
| 199 |
+
├── requirements.txt # Python dependencies
|
| 200 |
+
├── .env.example # Environment variables template
|
| 201 |
+
├── README.md # This file
|
| 202 |
+
└── src/
|
| 203 |
+
├── __init__.py
|
| 204 |
+
├── vectorstore.py # Supabase PGVector connection
|
| 205 |
+
├── rag_pipeline.py # LangChain RAG logic
|
| 206 |
+
├── llm_client.py # Inference Provider client
|
| 207 |
+
└── prompts.py # Investigation prompt templates
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
## ⚙️ Configuration
|
| 211 |
+
|
| 212 |
+
### Environment Variables
|
| 213 |
+
|
| 214 |
+
See `.env.example` for all available configuration options.
|
| 215 |
+
|
| 216 |
+
**Required:**
|
| 217 |
+
- `SUPABASE_CONNECTION_STRING` - PostgreSQL connection string
|
| 218 |
+
- `HF_TOKEN` - Hugging Face API token
|
| 219 |
+
|
| 220 |
+
**Optional:**
|
| 221 |
+
- `LLM_MODEL` - Model to use (default: meta-llama/Llama-3.1-8B-Instruct)
|
| 222 |
+
- `LLM_TEMPERATURE` - Generation temperature (default: 0.7)
|
| 223 |
+
- `LLM_MAX_TOKENS` - Max tokens to generate (default: 2000)
|
| 224 |
+
- `RETRIEVAL_K` - Number of tools to retrieve (default: 5)
|
| 225 |
+
- `EMBEDDING_MODEL` - Embedding model (default: sentence-transformers/all-MiniLM-L6-v2)
|
| 226 |
+
|
| 227 |
+
### Supported LLM Models
|
| 228 |
+
|
| 229 |
+
- `meta-llama/Llama-3.1-8B-Instruct` (recommended)
|
| 230 |
+
- `meta-llama/Meta-Llama-3-8B-Instruct`
|
| 231 |
+
- `Qwen/Qwen2.5-72B-Instruct`
|
| 232 |
+
- `mistralai/Mistral-7B-Instruct-v0.3`
|
| 233 |
+
|
| 234 |
+
## 💰 Cost Considerations
|
| 235 |
+
|
| 236 |
+
### Hugging Face Inference Providers
|
| 237 |
+
- Free tier: $0.10/month credits
|
| 238 |
+
- PRO tier: $2.00/month credits + pay-as-you-go
|
| 239 |
+
- Typical cost: ~$0.001-0.01 per query
|
| 240 |
+
- Recommended budget: $10-50/month for moderate usage
|
| 241 |
+
|
| 242 |
+
### Supabase
|
| 243 |
+
- Free tier sufficient for most use cases
|
| 244 |
+
- PGVector operations are standard database queries
|
| 245 |
+
|
| 246 |
+
### Hugging Face Spaces
|
| 247 |
+
- Free CPU hosting available
|
| 248 |
+
- GPU upgrade: ~$0.60/hour (optional, not required)
|
| 249 |
+
|
| 250 |
+
## 🔮 Future Enhancements
|
| 251 |
+
|
| 252 |
+
- [ ] MCP server integration for AI agent tool use
|
| 253 |
+
- [ ] Multi-turn conversation with memory
|
| 254 |
+
- [ ] User authentication and query logging
|
| 255 |
+
- [ ] Additional tool databases and sources
|
| 256 |
+
- [ ] Export methodologies as PDF/markdown
|
| 257 |
+
- [ ] Tool usage examples and tutorials
|
| 258 |
+
- [ ] Community-contributed tool reviews
|
| 259 |
+
|
| 260 |
+
## 🤝 Contributing
|
| 261 |
+
|
| 262 |
+
Contributions are welcome! Please feel free to submit issues or pull requests.
|
| 263 |
+
|
| 264 |
+
## 📄 License
|
| 265 |
+
|
| 266 |
+
MIT License - See LICENSE file for details
|
| 267 |
+
|
| 268 |
+
## 🙏 Acknowledgments
|
| 269 |
+
|
| 270 |
+
- Tool data sourced from [Bellingcat's Online Investigation Toolkit](https://www.bellingcat.com/)
|
| 271 |
+
- Built with support from the OSINT community
|
| 272 |
+
|
| 273 |
+
## 📞 Support
|
| 274 |
+
|
| 275 |
+
For issues or questions:
|
| 276 |
+
- Open an issue on GitHub
|
| 277 |
+
- Check the [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces)
|
| 278 |
+
- Review the [Gradio documentation](https://gradio.app/docs/)
|
| 279 |
+
|
| 280 |
+
---
|
| 281 |
+
|
| 282 |
+
Built with ❤️ for the OSINT community
|
app.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OSINT Investigation Assistant - Gradio App
|
| 3 |
+
|
| 4 |
+
A RAG-powered assistant that helps investigators develop methodologies
|
| 5 |
+
for OSINT investigations using a database of 344+ OSINT tools.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
from src.rag_pipeline import create_pipeline
|
| 12 |
+
|
| 13 |
+
# Load environment variables
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
# Initialize the RAG pipeline
|
| 17 |
+
print("Initializing OSINT Investigation Pipeline...")
|
| 18 |
+
try:
|
| 19 |
+
pipeline = create_pipeline(
|
| 20 |
+
retrieval_k=5,
|
| 21 |
+
model=os.getenv("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct"),
|
| 22 |
+
temperature=float(os.getenv("LLM_TEMPERATURE", "0.7"))
|
| 23 |
+
)
|
| 24 |
+
print("✓ Pipeline initialized successfully")
|
| 25 |
+
except Exception as e:
|
| 26 |
+
print(f"✗ Error initializing pipeline: {e}")
|
| 27 |
+
raise
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def investigate(message: str, history: list) -> str:
|
| 31 |
+
"""
|
| 32 |
+
Main chat function for investigation queries
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
message: User's investigation query
|
| 36 |
+
history: Chat history (list of [user_msg, bot_msg] pairs)
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
Generated investigation methodology
|
| 40 |
+
"""
|
| 41 |
+
try:
|
| 42 |
+
# Generate response (non-streaming for simplicity)
|
| 43 |
+
response = pipeline.generate_methodology(message, stream=False)
|
| 44 |
+
return response
|
| 45 |
+
except Exception as e:
|
| 46 |
+
return f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_CONNECTION_STRING) and try again."
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def investigate_stream(message: str, history: list):
|
| 50 |
+
"""
|
| 51 |
+
Streaming version of investigation function
|
| 52 |
+
|
| 53 |
+
Args:
|
| 54 |
+
message: User's investigation query
|
| 55 |
+
history: Chat history
|
| 56 |
+
|
| 57 |
+
Yields:
|
| 58 |
+
Response chunks
|
| 59 |
+
"""
|
| 60 |
+
try:
|
| 61 |
+
response_stream = pipeline.generate_methodology(message, stream=True)
|
| 62 |
+
full_response = ""
|
| 63 |
+
for chunk in response_stream:
|
| 64 |
+
full_response += chunk
|
| 65 |
+
yield full_response
|
| 66 |
+
except Exception as e:
|
| 67 |
+
yield f"Error generating response: {str(e)}\n\nPlease check your environment variables (HF_TOKEN, SUPABASE_CONNECTION_STRING) and try again."
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_tool_recommendations(query: str, k: int = 5) -> str:
|
| 71 |
+
"""
|
| 72 |
+
Get tool recommendations for a query
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
query: Investigation query
|
| 76 |
+
k: Number of tools to recommend
|
| 77 |
+
|
| 78 |
+
Returns:
|
| 79 |
+
Formatted tool recommendations
|
| 80 |
+
"""
|
| 81 |
+
try:
|
| 82 |
+
tools = pipeline.get_tool_recommendations(query, k=k)
|
| 83 |
+
|
| 84 |
+
if not tools:
|
| 85 |
+
return "No relevant tools found."
|
| 86 |
+
|
| 87 |
+
output = f"## Top {len(tools)} Recommended Tools\n\n"
|
| 88 |
+
|
| 89 |
+
for i, tool in enumerate(tools, 1):
|
| 90 |
+
output += f"### {i}. {tool['name']}\n"
|
| 91 |
+
output += f"- **Category**: {tool['category']}\n"
|
| 92 |
+
output += f"- **Cost**: {tool['cost']}\n"
|
| 93 |
+
output += f"- **URL**: {tool['url']}\n"
|
| 94 |
+
output += f"- **Description**: {tool['description']}\n"
|
| 95 |
+
if tool['details'] and tool['details'] != 'N/A':
|
| 96 |
+
output += f"- **Details**: {tool['details']}\n"
|
| 97 |
+
output += "\n"
|
| 98 |
+
|
| 99 |
+
return output
|
| 100 |
+
except Exception as e:
|
| 101 |
+
return f"Error retrieving tools: {str(e)}"
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Custom CSS for better appearance
|
| 105 |
+
custom_css = """
|
| 106 |
+
.gradio-container {
|
| 107 |
+
max-width: 900px !important;
|
| 108 |
+
}
|
| 109 |
+
#component-0 {
|
| 110 |
+
max-width: 900px;
|
| 111 |
+
}
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
# Create Gradio interface
|
| 115 |
+
with gr.Blocks(
|
| 116 |
+
title="OSINT Investigation Assistant",
|
| 117 |
+
theme=gr.themes.Soft(),
|
| 118 |
+
css=custom_css
|
| 119 |
+
) as demo:
|
| 120 |
+
gr.Markdown("""
|
| 121 |
+
# 🔍 OSINT Investigation Assistant
|
| 122 |
+
|
| 123 |
+
Ask me how to investigate anything using open-source intelligence methods.
|
| 124 |
+
I'll provide you with a structured methodology and recommend specific OSINT tools
|
| 125 |
+
from a database of 344+ tools.
|
| 126 |
+
|
| 127 |
+
**Examples:**
|
| 128 |
+
- "How do I investigate a suspicious domain?"
|
| 129 |
+
- "What tools can I use to verify an image's authenticity?"
|
| 130 |
+
- "How can I trace the origin of a social media account?"
|
| 131 |
+
""")
|
| 132 |
+
|
| 133 |
+
# Main chat interface
|
| 134 |
+
chatbot = gr.ChatInterface(
|
| 135 |
+
fn=investigate_stream,
|
| 136 |
+
type="messages",
|
| 137 |
+
examples=[
|
| 138 |
+
"How do I investigate a suspicious domain?",
|
| 139 |
+
"What tools can I use to verify an image's authenticity?",
|
| 140 |
+
"How can I trace the origin of a social media account?",
|
| 141 |
+
"What's the best way to archive web content for investigation?",
|
| 142 |
+
"How do I geolocate an image from social media?"
|
| 143 |
+
],
|
| 144 |
+
cache_examples=False,
|
| 145 |
+
title="Chat Interface",
|
| 146 |
+
description="Ask your investigation questions here",
|
| 147 |
+
api_name="investigate" # This creates the /call/investigate API endpoint
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
# Additional tab for direct tool search
|
| 151 |
+
with gr.Tab("Tool Search"):
|
| 152 |
+
gr.Markdown("### Search for OSINT Tools")
|
| 153 |
+
with gr.Row():
|
| 154 |
+
tool_query = gr.Textbox(
|
| 155 |
+
label="Search Query",
|
| 156 |
+
placeholder="e.g., social media analysis, image verification, domain investigation",
|
| 157 |
+
lines=2
|
| 158 |
+
)
|
| 159 |
+
tool_count = gr.Slider(
|
| 160 |
+
minimum=1,
|
| 161 |
+
maximum=20,
|
| 162 |
+
value=5,
|
| 163 |
+
step=1,
|
| 164 |
+
label="Number of Tools"
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
tool_search_btn = gr.Button("Search Tools", variant="primary")
|
| 168 |
+
tool_output = gr.Markdown(label="Recommended Tools")
|
| 169 |
+
|
| 170 |
+
tool_search_btn.click(
|
| 171 |
+
fn=get_tool_recommendations,
|
| 172 |
+
inputs=[tool_query, tool_count],
|
| 173 |
+
outputs=tool_output,
|
| 174 |
+
api_name="search_tools" # This creates the /call/search_tools API endpoint
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
# Information tab
|
| 178 |
+
with gr.Tab("About"):
|
| 179 |
+
gr.Markdown("""
|
| 180 |
+
## About This Assistant
|
| 181 |
+
|
| 182 |
+
This OSINT Investigation Assistant helps researchers and investigators develop
|
| 183 |
+
structured methodologies for open-source intelligence investigations.
|
| 184 |
+
|
| 185 |
+
### Features
|
| 186 |
+
- 🎯 **Structured Methodologies**: Get step-by-step investigation plans
|
| 187 |
+
- 🛠️ **Tool Recommendations**: Access a database of 344+ OSINT tools
|
| 188 |
+
- 🔍 **Context-Aware**: Tools are recommended based on your specific needs
|
| 189 |
+
- 🚀 **API Access**: Use this app via API for integration with other tools
|
| 190 |
+
|
| 191 |
+
### Technology Stack
|
| 192 |
+
- **Vector Database**: Supabase with PGVector (344 OSINT tools)
|
| 193 |
+
- **LLM**: Hugging Face Inference Providers (Llama 3.1)
|
| 194 |
+
- **RAG Framework**: LangChain for retrieval-augmented generation
|
| 195 |
+
- **UI/API**: Gradio with automatic API generation
|
| 196 |
+
|
| 197 |
+
### API Usage
|
| 198 |
+
|
| 199 |
+
This app automatically exposes API endpoints. You can access them using:
|
| 200 |
+
|
| 201 |
+
**Python Client:**
|
| 202 |
+
```python
|
| 203 |
+
from gradio_client import Client
|
| 204 |
+
|
| 205 |
+
client = Client("your-space-url")
|
| 206 |
+
result = client.predict("How do I investigate a domain?", api_name="/investigate")
|
| 207 |
+
print(result)
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
**cURL:**
|
| 211 |
+
```bash
|
| 212 |
+
curl -X POST "https://your-space.hf.space/call/investigate" \\
|
| 213 |
+
-H "Content-Type: application/json" \\
|
| 214 |
+
-d '{"data": ["How do I investigate a domain?"]}'
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
View the full API documentation at the bottom of this page (click "Use via API").
|
| 218 |
+
|
| 219 |
+
### Environment Variables Required
|
| 220 |
+
- `SUPABASE_CONNECTION_STRING`: PostgreSQL connection string for Supabase
|
| 221 |
+
- `HF_TOKEN`: Hugging Face API token for Inference Providers
|
| 222 |
+
- `LLM_MODEL` (optional): Model to use (default: meta-llama/Llama-3.1-8B-Instruct)
|
| 223 |
+
- `LLM_TEMPERATURE` (optional): Temperature for generation (default: 0.7)
|
| 224 |
+
|
| 225 |
+
### Data Source
|
| 226 |
+
The tool recommendations are based on the Bellingcat OSINT Toolkit and other
|
| 227 |
+
curated sources, with 344+ tools across categories including:
|
| 228 |
+
- Social Media Investigation
|
| 229 |
+
- Image and Video Analysis
|
| 230 |
+
- Domain and Network Investigation
|
| 231 |
+
- Geolocation
|
| 232 |
+
- Archiving and Preservation
|
| 233 |
+
- And more...
|
| 234 |
+
|
| 235 |
+
---
|
| 236 |
+
|
| 237 |
+
Built with ❤️ for the OSINT community
|
| 238 |
+
""")
|
| 239 |
+
|
| 240 |
+
# Launch configuration
|
| 241 |
+
if __name__ == "__main__":
|
| 242 |
+
# Check for required environment variables
|
| 243 |
+
required_vars = ["SUPABASE_CONNECTION_STRING", "HF_TOKEN"]
|
| 244 |
+
missing_vars = [var for var in required_vars if not os.getenv(var)]
|
| 245 |
+
|
| 246 |
+
if missing_vars:
|
| 247 |
+
print(f"⚠️ Warning: Missing environment variables: {', '.join(missing_vars)}")
|
| 248 |
+
print("Please set these in your .env file or as environment variables")
|
| 249 |
+
|
| 250 |
+
# Launch the app
|
| 251 |
+
# Set mcp_server=True to enable MCP protocol for agent integration
|
| 252 |
+
demo.launch(
|
| 253 |
+
server_name="0.0.0.0",
|
| 254 |
+
server_port=7860,
|
| 255 |
+
share=False,
|
| 256 |
+
show_api=True # Show API documentation
|
| 257 |
+
)
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Gradio for UI and API
|
| 2 |
+
gradio>=4.0.0
|
| 3 |
+
|
| 4 |
+
# Supabase client for vector store
|
| 5 |
+
supabase>=2.0.0
|
| 6 |
+
|
| 7 |
+
# Hugging Face Inference (for LLM and embeddings)
|
| 8 |
+
huggingface-hub>=0.20.0
|
| 9 |
+
|
| 10 |
+
# Environment variables
|
| 11 |
+
python-dotenv>=1.0.0
|
| 12 |
+
|
| 13 |
+
# Utilities
|
| 14 |
+
pydantic>=2.0.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OSINT Investigation Assistant - Core modules"""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
src/llm_client.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM client for Hugging Face Inference API"""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Iterator, Optional
|
| 5 |
+
from huggingface_hub import InferenceClient
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class InferenceProviderClient:
|
| 9 |
+
"""Client for Hugging Face Inference API"""
|
| 10 |
+
|
| 11 |
+
def __init__(
|
| 12 |
+
self,
|
| 13 |
+
model: str = "meta-llama/Llama-3.1-8B-Instruct",
|
| 14 |
+
api_key: Optional[str] = None,
|
| 15 |
+
temperature: float = 0.2,
|
| 16 |
+
max_tokens: int = 600
|
| 17 |
+
):
|
| 18 |
+
"""
|
| 19 |
+
Initialize the Inference client
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
model: Model identifier (default: Llama-3.1-8B-Instruct)
|
| 23 |
+
api_key: HuggingFace API token (defaults to HF_TOKEN env var)
|
| 24 |
+
temperature: Sampling temperature (0.0 to 1.0)
|
| 25 |
+
max_tokens: Maximum tokens to generate
|
| 26 |
+
"""
|
| 27 |
+
self.model = model
|
| 28 |
+
self.temperature = temperature
|
| 29 |
+
self.max_tokens = max_tokens
|
| 30 |
+
|
| 31 |
+
# Get API key from parameter or environment
|
| 32 |
+
api_key = api_key or os.getenv("HF_TOKEN")
|
| 33 |
+
if not api_key:
|
| 34 |
+
raise ValueError("HF_TOKEN environment variable must be set or api_key provided")
|
| 35 |
+
|
| 36 |
+
# Initialize Hugging Face Inference Client
|
| 37 |
+
self.client = InferenceClient(token=api_key)
|
| 38 |
+
|
| 39 |
+
def generate(
|
| 40 |
+
self,
|
| 41 |
+
prompt: str,
|
| 42 |
+
system_prompt: Optional[str] = None,
|
| 43 |
+
temperature: Optional[float] = None,
|
| 44 |
+
max_tokens: Optional[int] = None
|
| 45 |
+
) -> str:
|
| 46 |
+
"""
|
| 47 |
+
Generate a response from the LLM
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
prompt: User prompt
|
| 51 |
+
system_prompt: Optional system prompt
|
| 52 |
+
temperature: Override default temperature
|
| 53 |
+
max_tokens: Override default max tokens
|
| 54 |
+
|
| 55 |
+
Returns:
|
| 56 |
+
Generated text response
|
| 57 |
+
"""
|
| 58 |
+
messages = []
|
| 59 |
+
|
| 60 |
+
if system_prompt:
|
| 61 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 62 |
+
|
| 63 |
+
messages.append({"role": "user", "content": prompt})
|
| 64 |
+
|
| 65 |
+
response = self.client.chat_completion(
|
| 66 |
+
model=self.model,
|
| 67 |
+
messages=messages,
|
| 68 |
+
temperature=temperature or self.temperature,
|
| 69 |
+
max_tokens=max_tokens or self.max_tokens
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
return response.choices[0].message.content
|
| 73 |
+
|
| 74 |
+
def generate_stream(
|
| 75 |
+
self,
|
| 76 |
+
prompt: str,
|
| 77 |
+
system_prompt: Optional[str] = None,
|
| 78 |
+
temperature: Optional[float] = None,
|
| 79 |
+
max_tokens: Optional[int] = None
|
| 80 |
+
) -> Iterator[str]:
|
| 81 |
+
"""
|
| 82 |
+
Generate a streaming response from the LLM
|
| 83 |
+
|
| 84 |
+
Args:
|
| 85 |
+
prompt: User prompt
|
| 86 |
+
system_prompt: Optional system prompt
|
| 87 |
+
temperature: Override default temperature
|
| 88 |
+
max_tokens: Override default max tokens
|
| 89 |
+
|
| 90 |
+
Yields:
|
| 91 |
+
Text chunks as they are generated
|
| 92 |
+
"""
|
| 93 |
+
messages = []
|
| 94 |
+
|
| 95 |
+
if system_prompt:
|
| 96 |
+
messages.append({"role": "system", "content": system_prompt})
|
| 97 |
+
|
| 98 |
+
messages.append({"role": "user", "content": prompt})
|
| 99 |
+
|
| 100 |
+
stream = self.client.chat_completion(
|
| 101 |
+
model=self.model,
|
| 102 |
+
messages=messages,
|
| 103 |
+
temperature=temperature or self.temperature,
|
| 104 |
+
max_tokens=max_tokens or self.max_tokens,
|
| 105 |
+
stream=True
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
for chunk in stream:
|
| 109 |
+
try:
|
| 110 |
+
if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
|
| 111 |
+
if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
|
| 112 |
+
if chunk.choices[0].delta.content is not None:
|
| 113 |
+
yield chunk.choices[0].delta.content
|
| 114 |
+
except (IndexError, AttributeError) as e:
|
| 115 |
+
# Gracefully handle malformed chunks
|
| 116 |
+
continue
|
| 117 |
+
|
| 118 |
+
def chat(
|
| 119 |
+
self,
|
| 120 |
+
messages: list[dict],
|
| 121 |
+
temperature: Optional[float] = None,
|
| 122 |
+
max_tokens: Optional[int] = None,
|
| 123 |
+
stream: bool = False
|
| 124 |
+
):
|
| 125 |
+
"""
|
| 126 |
+
Multi-turn chat completion
|
| 127 |
+
|
| 128 |
+
Args:
|
| 129 |
+
messages: List of message dicts with 'role' and 'content'
|
| 130 |
+
temperature: Override default temperature
|
| 131 |
+
max_tokens: Override default max tokens
|
| 132 |
+
stream: Whether to stream the response
|
| 133 |
+
|
| 134 |
+
Returns:
|
| 135 |
+
Response text (or iterator if stream=True)
|
| 136 |
+
"""
|
| 137 |
+
response = self.client.chat_completion(
|
| 138 |
+
model=self.model,
|
| 139 |
+
messages=messages,
|
| 140 |
+
temperature=temperature or self.temperature,
|
| 141 |
+
max_tokens=max_tokens or self.max_tokens,
|
| 142 |
+
stream=stream
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
if stream:
|
| 146 |
+
def stream_generator():
|
| 147 |
+
for chunk in response:
|
| 148 |
+
try:
|
| 149 |
+
if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
|
| 150 |
+
if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
|
| 151 |
+
if chunk.choices[0].delta.content is not None:
|
| 152 |
+
yield chunk.choices[0].delta.content
|
| 153 |
+
except (IndexError, AttributeError):
|
| 154 |
+
# Gracefully handle malformed chunks
|
| 155 |
+
continue
|
| 156 |
+
return stream_generator()
|
| 157 |
+
else:
|
| 158 |
+
return response.choices[0].message.content
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def create_llm_client(
|
| 162 |
+
model: str = "meta-llama/Llama-3.1-8B-Instruct",
|
| 163 |
+
temperature: float = 0.7,
|
| 164 |
+
max_tokens: int = 2000
|
| 165 |
+
) -> InferenceProviderClient:
|
| 166 |
+
"""
|
| 167 |
+
Factory function to create and return a configured LLM client
|
| 168 |
+
|
| 169 |
+
Args:
|
| 170 |
+
model: Model identifier
|
| 171 |
+
temperature: Sampling temperature
|
| 172 |
+
max_tokens: Maximum tokens to generate
|
| 173 |
+
|
| 174 |
+
Returns:
|
| 175 |
+
Configured InferenceProviderClient
|
| 176 |
+
"""
|
| 177 |
+
return InferenceProviderClient(
|
| 178 |
+
model=model,
|
| 179 |
+
temperature=temperature,
|
| 180 |
+
max_tokens=max_tokens
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# Available models (commonly used for OSINT tasks)
|
| 185 |
+
AVAILABLE_MODELS = {
|
| 186 |
+
"llama-3.1-8b": "meta-llama/Llama-3.1-8B-Instruct",
|
| 187 |
+
"llama-3-8b": "meta-llama/Meta-Llama-3-8B-Instruct",
|
| 188 |
+
"qwen-32b": "Qwen/Qwen2.5-72B-Instruct",
|
| 189 |
+
"mistral-7b": "mistralai/Mistral-7B-Instruct-v0.3",
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def get_model_identifier(model_name: str) -> str:
|
| 194 |
+
"""Get full model identifier from short name"""
|
| 195 |
+
return AVAILABLE_MODELS.get(model_name, AVAILABLE_MODELS["llama-3.1-8b"])
|
src/prompts.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prompt templates for OSINT investigation assistant"""
|
| 2 |
+
|
| 3 |
+
from langchain_core.prompts import PromptTemplate
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
SYSTEM_PROMPT = """You are an OSINT investigation assistant. Your responses must be SHORT and FOCUSED.
|
| 7 |
+
|
| 8 |
+
STRICT RULES:
|
| 9 |
+
1. ONLY recommend tools from the provided database - DO NOT suggest tools not in the list
|
| 10 |
+
2. Keep your response under 300 words
|
| 11 |
+
3. List 3-5 steps maximum
|
| 12 |
+
4. Include tool names and URLs from the database
|
| 13 |
+
5. NO lengthy explanations
|
| 14 |
+
6. NO additional tools beyond what's provided
|
| 15 |
+
|
| 16 |
+
Format:
|
| 17 |
+
**Investigation Steps:**
|
| 18 |
+
1. [Step] - Use [Tool Name] ([URL])
|
| 19 |
+
2. [Step] - Use [Tool Name] ([URL])
|
| 20 |
+
3. [Step] - Use [Tool Name] ([URL])
|
| 21 |
+
|
| 22 |
+
**Why these tools:** [1-2 sentences max]"""
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
INVESTIGATION_PROMPT_TEMPLATE = """USER QUESTION: {query}
|
| 26 |
+
|
| 27 |
+
AVAILABLE TOOLS FROM DATABASE:
|
| 28 |
+
{context}
|
| 29 |
+
|
| 30 |
+
INSTRUCTIONS:
|
| 31 |
+
- Provide 3-5 investigation steps ONLY
|
| 32 |
+
- Use ONLY tools from the list above
|
| 33 |
+
- Include tool name + URL for each step
|
| 34 |
+
- Keep response under 300 words
|
| 35 |
+
- Be specific and direct
|
| 36 |
+
- NO lengthy explanations
|
| 37 |
+
|
| 38 |
+
Respond with:
|
| 39 |
+
**Steps:**
|
| 40 |
+
1. [Action] using [Tool Name] ([URL])
|
| 41 |
+
2. [Action] using [Tool Name] ([URL])
|
| 42 |
+
3. [Action] using [Tool Name] ([URL])
|
| 43 |
+
|
| 44 |
+
**Notes:** [1-2 sentences explaining why these specific tools]"""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
INVESTIGATION_PROMPT = PromptTemplate(
|
| 48 |
+
template=INVESTIGATION_PROMPT_TEMPLATE,
|
| 49 |
+
input_variables=["query", "context"]
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
FOLLOWUP_PROMPT_TEMPLATE = """You are an expert OSINT investigation assistant continuing a conversation.
|
| 54 |
+
|
| 55 |
+
CONVERSATION HISTORY:
|
| 56 |
+
{chat_history}
|
| 57 |
+
|
| 58 |
+
USER FOLLOW-UP QUESTION:
|
| 59 |
+
{query}
|
| 60 |
+
|
| 61 |
+
RELEVANT OSINT TOOLS FROM DATABASE:
|
| 62 |
+
{context}
|
| 63 |
+
|
| 64 |
+
Based on the conversation history and the user's follow-up question, provide a helpful response. If they're asking for clarification or more details about a specific tool or technique, provide that information. If they're asking a new question, follow the structured investigation methodology format."""
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
FOLLOWUP_PROMPT = PromptTemplate(
|
| 68 |
+
template=FOLLOWUP_PROMPT_TEMPLATE,
|
| 69 |
+
input_variables=["chat_history", "query", "context"]
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
TOOL_RECOMMENDATION_TEMPLATE = """Based on this investigation need: {query}
|
| 74 |
+
|
| 75 |
+
Available tools:
|
| 76 |
+
{context}
|
| 77 |
+
|
| 78 |
+
Recommend the top 3-5 most relevant tools and explain why each is suitable. Format as:
|
| 79 |
+
|
| 80 |
+
1. **Tool Name** ([URL])
|
| 81 |
+
- Category: [category]
|
| 82 |
+
- Cost: [cost]
|
| 83 |
+
- Why it's useful: [explanation]
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
TOOL_RECOMMENDATION_PROMPT = PromptTemplate(
|
| 88 |
+
template=TOOL_RECOMMENDATION_TEMPLATE,
|
| 89 |
+
input_variables=["query", "context"]
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_investigation_prompt(include_system: bool = True) -> PromptTemplate:
|
| 94 |
+
"""Get the main investigation prompt template"""
|
| 95 |
+
return INVESTIGATION_PROMPT
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def get_followup_prompt() -> PromptTemplate:
|
| 99 |
+
"""Get the follow-up conversation prompt template"""
|
| 100 |
+
return FOLLOWUP_PROMPT
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def get_tool_recommendation_prompt() -> PromptTemplate:
|
| 104 |
+
"""Get the tool recommendation prompt template"""
|
| 105 |
+
return TOOL_RECOMMENDATION_PROMPT
|
src/rag_pipeline.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RAG pipeline for OSINT investigation assistant"""
|
| 2 |
+
|
| 3 |
+
from typing import Iterator, Optional, List, Tuple
|
| 4 |
+
from .vectorstore import OSINTVectorStore, create_vectorstore
|
| 5 |
+
from .llm_client import InferenceProviderClient, create_llm_client
|
| 6 |
+
from .prompts import (
|
| 7 |
+
SYSTEM_PROMPT,
|
| 8 |
+
INVESTIGATION_PROMPT,
|
| 9 |
+
get_investigation_prompt
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class OSINTInvestigationPipeline:
|
| 14 |
+
"""RAG pipeline for generating OSINT investigation methodologies"""
|
| 15 |
+
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
vectorstore: Optional[OSINTVectorStore] = None,
|
| 19 |
+
llm_client: Optional[InferenceProviderClient] = None,
|
| 20 |
+
retrieval_k: int = 5
|
| 21 |
+
):
|
| 22 |
+
"""
|
| 23 |
+
Initialize the RAG pipeline
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
vectorstore: Vector store instance (creates default if None)
|
| 27 |
+
llm_client: LLM client instance (creates default if None)
|
| 28 |
+
retrieval_k: Number of tools to retrieve for context
|
| 29 |
+
"""
|
| 30 |
+
self.vectorstore = vectorstore or create_vectorstore()
|
| 31 |
+
self.llm_client = llm_client or create_llm_client()
|
| 32 |
+
self.retrieval_k = retrieval_k
|
| 33 |
+
|
| 34 |
+
def retrieve_tools(self, query: str, k: Optional[int] = None) -> List:
|
| 35 |
+
"""
|
| 36 |
+
Retrieve relevant OSINT tools for a query
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
query: User's investigation query
|
| 40 |
+
k: Number of tools to retrieve (uses default if None)
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
List of relevant tool documents
|
| 44 |
+
"""
|
| 45 |
+
k = k or self.retrieval_k
|
| 46 |
+
return self.vectorstore.similarity_search(query, k=k)
|
| 47 |
+
|
| 48 |
+
def generate_methodology(
|
| 49 |
+
self,
|
| 50 |
+
query: str,
|
| 51 |
+
stream: bool = False
|
| 52 |
+
) -> str | Iterator[str]:
|
| 53 |
+
"""
|
| 54 |
+
Generate investigation methodology for a query
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
query: User's investigation query
|
| 58 |
+
stream: Whether to stream the response
|
| 59 |
+
|
| 60 |
+
Returns:
|
| 61 |
+
Generated methodology (string or iterator)
|
| 62 |
+
"""
|
| 63 |
+
# Retrieve relevant tools
|
| 64 |
+
relevant_tools = self.retrieve_tools(query)
|
| 65 |
+
|
| 66 |
+
# Format tools for context
|
| 67 |
+
context = self.vectorstore.format_tools_for_context(relevant_tools)
|
| 68 |
+
|
| 69 |
+
# Generate prompt
|
| 70 |
+
prompt_template = get_investigation_prompt()
|
| 71 |
+
full_prompt = prompt_template.format(query=query, context=context)
|
| 72 |
+
|
| 73 |
+
# Generate response
|
| 74 |
+
if stream:
|
| 75 |
+
return self.llm_client.generate_stream(
|
| 76 |
+
prompt=full_prompt,
|
| 77 |
+
system_prompt=SYSTEM_PROMPT
|
| 78 |
+
)
|
| 79 |
+
else:
|
| 80 |
+
return self.llm_client.generate(
|
| 81 |
+
prompt=full_prompt,
|
| 82 |
+
system_prompt=SYSTEM_PROMPT
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
def chat(
|
| 86 |
+
self,
|
| 87 |
+
message: str,
|
| 88 |
+
history: Optional[List[Tuple[str, str]]] = None,
|
| 89 |
+
stream: bool = False
|
| 90 |
+
) -> str | Iterator[str]:
|
| 91 |
+
"""
|
| 92 |
+
Handle a chat message with conversation history
|
| 93 |
+
|
| 94 |
+
Args:
|
| 95 |
+
message: User's message
|
| 96 |
+
history: Conversation history as list of (user_msg, assistant_msg) tuples
|
| 97 |
+
stream: Whether to stream the response
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
Generated response (string or iterator)
|
| 101 |
+
"""
|
| 102 |
+
# For now, treat each message as a new investigation query
|
| 103 |
+
# In the future, could implement follow-up handling
|
| 104 |
+
return self.generate_methodology(message, stream=stream)
|
| 105 |
+
|
| 106 |
+
def get_tool_recommendations(
|
| 107 |
+
self,
|
| 108 |
+
query: str,
|
| 109 |
+
k: int = 5
|
| 110 |
+
) -> List[dict]:
|
| 111 |
+
"""
|
| 112 |
+
Get tool recommendations with metadata
|
| 113 |
+
|
| 114 |
+
Args:
|
| 115 |
+
query: Investigation query
|
| 116 |
+
k: Number of tools to recommend
|
| 117 |
+
|
| 118 |
+
Returns:
|
| 119 |
+
List of tool dictionaries with metadata
|
| 120 |
+
"""
|
| 121 |
+
docs = self.retrieve_tools(query, k=k)
|
| 122 |
+
|
| 123 |
+
tools = []
|
| 124 |
+
for doc in docs:
|
| 125 |
+
tool = {
|
| 126 |
+
"name": doc.metadata.get("name", "Unknown"),
|
| 127 |
+
"category": doc.metadata.get("category", "N/A"),
|
| 128 |
+
"cost": doc.metadata.get("cost", "N/A"),
|
| 129 |
+
"url": doc.metadata.get("url", "N/A"),
|
| 130 |
+
"description": doc.page_content,
|
| 131 |
+
"details": doc.metadata.get("details", "N/A")
|
| 132 |
+
}
|
| 133 |
+
tools.append(tool)
|
| 134 |
+
|
| 135 |
+
return tools
|
| 136 |
+
|
| 137 |
+
def search_tools_by_category(
|
| 138 |
+
self,
|
| 139 |
+
category: str,
|
| 140 |
+
k: int = 10
|
| 141 |
+
) -> List[dict]:
|
| 142 |
+
"""
|
| 143 |
+
Search tools by category
|
| 144 |
+
|
| 145 |
+
Args:
|
| 146 |
+
category: Tool category (e.g., "Archiving", "Social Media")
|
| 147 |
+
k: Number of tools to return
|
| 148 |
+
|
| 149 |
+
Returns:
|
| 150 |
+
List of tool dictionaries
|
| 151 |
+
"""
|
| 152 |
+
docs = self.vectorstore.similarity_search(
|
| 153 |
+
query=category,
|
| 154 |
+
k=k,
|
| 155 |
+
filter_category=category
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
tools = []
|
| 159 |
+
for doc in docs:
|
| 160 |
+
tool = {
|
| 161 |
+
"name": doc.metadata.get("name", "Unknown"),
|
| 162 |
+
"category": doc.metadata.get("category", "N/A"),
|
| 163 |
+
"cost": doc.metadata.get("cost", "N/A"),
|
| 164 |
+
"url": doc.metadata.get("url", "N/A"),
|
| 165 |
+
"description": doc.page_content
|
| 166 |
+
}
|
| 167 |
+
tools.append(tool)
|
| 168 |
+
|
| 169 |
+
return tools
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def create_pipeline(
|
| 173 |
+
retrieval_k: int = 5,
|
| 174 |
+
model: str = "meta-llama/Llama-3.1-8B-Instruct",
|
| 175 |
+
temperature: float = 0.2
|
| 176 |
+
) -> OSINTInvestigationPipeline:
|
| 177 |
+
"""
|
| 178 |
+
Factory function to create a configured RAG pipeline
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
retrieval_k: Number of tools to retrieve
|
| 182 |
+
model: LLM model identifier
|
| 183 |
+
temperature: LLM temperature
|
| 184 |
+
|
| 185 |
+
Returns:
|
| 186 |
+
Configured OSINTInvestigationPipeline
|
| 187 |
+
"""
|
| 188 |
+
vectorstore = create_vectorstore()
|
| 189 |
+
llm_client = create_llm_client(model=model, temperature=temperature)
|
| 190 |
+
|
| 191 |
+
return OSINTInvestigationPipeline(
|
| 192 |
+
vectorstore=vectorstore,
|
| 193 |
+
llm_client=llm_client,
|
| 194 |
+
retrieval_k=retrieval_k
|
| 195 |
+
)
|
src/vectorstore.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Supabase PGVector connection and retrieval functionality"""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Dict, Any, Optional
|
| 5 |
+
from supabase import create_client, Client
|
| 6 |
+
from huggingface_hub import InferenceClient
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Document:
|
| 10 |
+
"""Simple document class to match LangChain interface"""
|
| 11 |
+
|
| 12 |
+
def __init__(self, page_content: str, metadata: dict):
|
| 13 |
+
self.page_content = page_content
|
| 14 |
+
self.metadata = metadata
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class OSINTVectorStore:
|
| 18 |
+
"""Manages connection to Supabase PGVector database with OSINT tools"""
|
| 19 |
+
|
| 20 |
+
def __init__(
|
| 21 |
+
self,
|
| 22 |
+
supabase_url: Optional[str] = None,
|
| 23 |
+
supabase_key: Optional[str] = None,
|
| 24 |
+
hf_token: Optional[str] = None,
|
| 25 |
+
embedding_model: str = "sentence-transformers/all-mpnet-base-v2"
|
| 26 |
+
):
|
| 27 |
+
"""
|
| 28 |
+
Initialize the vector store connection
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
supabase_url: Supabase project URL (defaults to SUPABASE_URL env var)
|
| 32 |
+
supabase_key: Supabase anon key (defaults to SUPABASE_KEY env var)
|
| 33 |
+
hf_token: HuggingFace API token (defaults to HF_TOKEN env var)
|
| 34 |
+
embedding_model: HuggingFace model for embeddings
|
| 35 |
+
"""
|
| 36 |
+
# Get credentials from parameters or environment
|
| 37 |
+
self.supabase_url = supabase_url or os.getenv("SUPABASE_URL")
|
| 38 |
+
self.supabase_key = supabase_key or os.getenv("SUPABASE_KEY")
|
| 39 |
+
self.hf_token = hf_token or os.getenv("HF_TOKEN")
|
| 40 |
+
|
| 41 |
+
if not self.supabase_url or not self.supabase_key:
|
| 42 |
+
raise ValueError("SUPABASE_URL and SUPABASE_KEY environment variables must be set")
|
| 43 |
+
|
| 44 |
+
if not self.hf_token:
|
| 45 |
+
raise ValueError("HF_TOKEN environment variable must be set")
|
| 46 |
+
|
| 47 |
+
# Initialize Supabase client
|
| 48 |
+
self.supabase: Client = create_client(self.supabase_url, self.supabase_key)
|
| 49 |
+
|
| 50 |
+
# Initialize HuggingFace Inference client for embeddings
|
| 51 |
+
self.embedding_model = embedding_model
|
| 52 |
+
self.hf_client = InferenceClient(token=self.hf_token)
|
| 53 |
+
|
| 54 |
+
def _generate_embedding(self, text: str) -> List[float]:
|
| 55 |
+
"""
|
| 56 |
+
Generate embedding for text using HuggingFace Inference API
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
text: Text to embed
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
List of floats representing the embedding vector (768 dimensions)
|
| 63 |
+
"""
|
| 64 |
+
try:
|
| 65 |
+
# Use feature extraction to get embeddings
|
| 66 |
+
# Note: We rely on the API's default model which returns 768-dim embeddings
|
| 67 |
+
result = self.hf_client.feature_extraction(text=text)
|
| 68 |
+
|
| 69 |
+
# Convert to list (handles numpy arrays and nested lists)
|
| 70 |
+
import numpy as np
|
| 71 |
+
|
| 72 |
+
# If it's a numpy array, convert to list
|
| 73 |
+
if isinstance(result, np.ndarray):
|
| 74 |
+
if result.ndim > 1:
|
| 75 |
+
result = result[0] # Take first row if 2D
|
| 76 |
+
return result.tolist()
|
| 77 |
+
|
| 78 |
+
# If it's a nested list, flatten if needed
|
| 79 |
+
if isinstance(result, list) and len(result) > 0:
|
| 80 |
+
if isinstance(result[0], list):
|
| 81 |
+
return result[0] # Take first embedding if batched
|
| 82 |
+
# Handle nested numpy arrays in list
|
| 83 |
+
if isinstance(result[0], np.ndarray):
|
| 84 |
+
return result[0].tolist()
|
| 85 |
+
return result
|
| 86 |
+
|
| 87 |
+
return result
|
| 88 |
+
except Exception as e:
|
| 89 |
+
raise Exception(f"Error generating embedding: {str(e)}")
|
| 90 |
+
|
| 91 |
+
def similarity_search(
|
| 92 |
+
self,
|
| 93 |
+
query: str,
|
| 94 |
+
k: int = 5,
|
| 95 |
+
filter_category: Optional[str] = None,
|
| 96 |
+
filter_cost: Optional[str] = None,
|
| 97 |
+
match_threshold: float = 0.5
|
| 98 |
+
) -> List[Document]:
|
| 99 |
+
"""
|
| 100 |
+
Perform similarity search on the OSINT tools database
|
| 101 |
+
|
| 102 |
+
Args:
|
| 103 |
+
query: Search query
|
| 104 |
+
k: Number of results to return
|
| 105 |
+
filter_category: Optional category filter
|
| 106 |
+
filter_cost: Optional cost filter (e.g., 'Free', 'Paid')
|
| 107 |
+
match_threshold: Minimum similarity threshold (0.0 to 1.0)
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
List of Document objects with relevant OSINT tools
|
| 111 |
+
"""
|
| 112 |
+
# Generate embedding for query
|
| 113 |
+
query_embedding = self._generate_embedding(query)
|
| 114 |
+
|
| 115 |
+
# Call RPC function
|
| 116 |
+
try:
|
| 117 |
+
response = self.supabase.rpc(
|
| 118 |
+
'match_bellingcat_tools',
|
| 119 |
+
{
|
| 120 |
+
'query_embedding': query_embedding,
|
| 121 |
+
'match_threshold': match_threshold,
|
| 122 |
+
'match_count': k,
|
| 123 |
+
'filter_category': filter_category,
|
| 124 |
+
'filter_cost': filter_cost
|
| 125 |
+
}
|
| 126 |
+
).execute()
|
| 127 |
+
|
| 128 |
+
# Convert results to Document objects
|
| 129 |
+
documents = []
|
| 130 |
+
for item in response.data:
|
| 131 |
+
doc = Document(
|
| 132 |
+
page_content=item.get('content', ''),
|
| 133 |
+
metadata={
|
| 134 |
+
'id': item.get('id'),
|
| 135 |
+
'name': item.get('name'),
|
| 136 |
+
'category': item.get('category'),
|
| 137 |
+
'url': item.get('url'),
|
| 138 |
+
'cost': item.get('cost'),
|
| 139 |
+
'details': item.get('details'),
|
| 140 |
+
'similarity': item.get('similarity')
|
| 141 |
+
}
|
| 142 |
+
)
|
| 143 |
+
documents.append(doc)
|
| 144 |
+
|
| 145 |
+
return documents
|
| 146 |
+
|
| 147 |
+
except Exception as e:
|
| 148 |
+
raise Exception(f"Error performing similarity search: {str(e)}")
|
| 149 |
+
|
| 150 |
+
def similarity_search_with_score(
|
| 151 |
+
self,
|
| 152 |
+
query: str,
|
| 153 |
+
k: int = 5
|
| 154 |
+
) -> List[tuple]:
|
| 155 |
+
"""
|
| 156 |
+
Perform similarity search and return documents with relevance scores
|
| 157 |
+
|
| 158 |
+
Args:
|
| 159 |
+
query: Search query
|
| 160 |
+
k: Number of results to return
|
| 161 |
+
|
| 162 |
+
Returns:
|
| 163 |
+
List of tuples (Document, score)
|
| 164 |
+
"""
|
| 165 |
+
# Generate embedding for query
|
| 166 |
+
query_embedding = self._generate_embedding(query)
|
| 167 |
+
|
| 168 |
+
# Call RPC function
|
| 169 |
+
try:
|
| 170 |
+
response = self.supabase.rpc(
|
| 171 |
+
'match_bellingcat_tools',
|
| 172 |
+
{
|
| 173 |
+
'query_embedding': query_embedding,
|
| 174 |
+
'match_threshold': 0.0, # Get all matches
|
| 175 |
+
'match_count': k,
|
| 176 |
+
'filter_category': None,
|
| 177 |
+
'filter_cost': None
|
| 178 |
+
}
|
| 179 |
+
).execute()
|
| 180 |
+
|
| 181 |
+
# Convert results to Document objects with scores
|
| 182 |
+
results = []
|
| 183 |
+
for item in response.data:
|
| 184 |
+
doc = Document(
|
| 185 |
+
page_content=item.get('content', ''),
|
| 186 |
+
metadata={
|
| 187 |
+
'id': item.get('id'),
|
| 188 |
+
'name': item.get('name'),
|
| 189 |
+
'category': item.get('category'),
|
| 190 |
+
'url': item.get('url'),
|
| 191 |
+
'cost': item.get('cost'),
|
| 192 |
+
'details': item.get('details')
|
| 193 |
+
}
|
| 194 |
+
)
|
| 195 |
+
score = item.get('similarity', 0.0)
|
| 196 |
+
results.append((doc, score))
|
| 197 |
+
|
| 198 |
+
return results
|
| 199 |
+
|
| 200 |
+
except Exception as e:
|
| 201 |
+
raise Exception(f"Error performing similarity search: {str(e)}")
|
| 202 |
+
|
| 203 |
+
def get_retriever(self, k: int = 5):
|
| 204 |
+
"""
|
| 205 |
+
Get a retriever-like object for LangChain compatibility
|
| 206 |
+
|
| 207 |
+
Args:
|
| 208 |
+
k: Number of results to return
|
| 209 |
+
|
| 210 |
+
Returns:
|
| 211 |
+
Simple retriever object with get_relevant_documents method
|
| 212 |
+
"""
|
| 213 |
+
class SimpleRetriever:
|
| 214 |
+
def __init__(self, vectorstore, k):
|
| 215 |
+
self.vectorstore = vectorstore
|
| 216 |
+
self.k = k
|
| 217 |
+
|
| 218 |
+
def get_relevant_documents(self, query: str) -> List[Document]:
|
| 219 |
+
return self.vectorstore.similarity_search(query, k=self.k)
|
| 220 |
+
|
| 221 |
+
return SimpleRetriever(self, k)
|
| 222 |
+
|
| 223 |
+
def format_tools_for_context(self, documents: List[Document]) -> str:
|
| 224 |
+
"""
|
| 225 |
+
Format retrieved tools for inclusion in LLM context
|
| 226 |
+
|
| 227 |
+
Args:
|
| 228 |
+
documents: List of retrieved Document objects
|
| 229 |
+
|
| 230 |
+
Returns:
|
| 231 |
+
Formatted string with tool information
|
| 232 |
+
"""
|
| 233 |
+
formatted_tools = []
|
| 234 |
+
|
| 235 |
+
for i, doc in enumerate(documents, 1):
|
| 236 |
+
metadata = doc.metadata
|
| 237 |
+
tool_info = f"""
|
| 238 |
+
Tool {i}: {metadata.get('name', 'Unknown')}
|
| 239 |
+
Category: {metadata.get('category', 'N/A')}
|
| 240 |
+
Cost: {metadata.get('cost', 'N/A')}
|
| 241 |
+
URL: {metadata.get('url', 'N/A')}
|
| 242 |
+
Description: {doc.page_content}
|
| 243 |
+
Details: {metadata.get('details', 'N/A')}
|
| 244 |
+
"""
|
| 245 |
+
formatted_tools.append(tool_info.strip())
|
| 246 |
+
|
| 247 |
+
return "\n\n---\n\n".join(formatted_tools)
|
| 248 |
+
|
| 249 |
+
def get_tool_categories(self) -> List[str]:
|
| 250 |
+
"""Get list of available tool categories from database"""
|
| 251 |
+
try:
|
| 252 |
+
response = self.supabase.table('bellingcat_tools')\
|
| 253 |
+
.select('category')\
|
| 254 |
+
.execute()
|
| 255 |
+
|
| 256 |
+
# Extract unique categories
|
| 257 |
+
categories = set()
|
| 258 |
+
for item in response.data:
|
| 259 |
+
if item.get('category'):
|
| 260 |
+
categories.add(item['category'])
|
| 261 |
+
|
| 262 |
+
return sorted(list(categories))
|
| 263 |
+
|
| 264 |
+
except Exception as e:
|
| 265 |
+
# Return common categories as fallback
|
| 266 |
+
return [
|
| 267 |
+
"Archiving",
|
| 268 |
+
"Social Media",
|
| 269 |
+
"Geolocation",
|
| 270 |
+
"Image Analysis",
|
| 271 |
+
"Domain Investigation",
|
| 272 |
+
"Network Analysis",
|
| 273 |
+
"Data Extraction",
|
| 274 |
+
"Verification"
|
| 275 |
+
]
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def create_vectorstore() -> OSINTVectorStore:
|
| 279 |
+
"""Factory function to create and return a configured vector store"""
|
| 280 |
+
return OSINTVectorStore()
|