Spaces:
Runtime error
Runtime error
Upload Utils and its files
Browse files- utils/chunker.py +1314 -0
- utils/export.py +1896 -0
- utils/optimizer.py +558 -0
- utils/parser.py +549 -0
- utils/scorer.py +501 -0
utils/chunker.py
ADDED
|
@@ -0,0 +1,1314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector Chunking and RAG Module
|
| 3 |
+
Handles document chunking, vector embeddings, and RAG question-answering
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import numpy as np
|
| 9 |
+
from typing import Dict, Any, List, Optional, Tuple
|
| 10 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
|
| 11 |
+
from langchain.schema import Document
|
| 12 |
+
from langchain_community.vectorstores import FAISS, Chroma
|
| 13 |
+
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
|
| 14 |
+
from langchain.memory import ConversationBufferMemory
|
| 15 |
+
from langchain.prompts import PromptTemplate
|
| 16 |
+
import tempfile
|
| 17 |
+
import shutil
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class VectorChunker:
    """Main class for document chunking and vector operations.

    Wraps LangChain text splitters, FAISS/Chroma vector stores, retrieval
    chains and similarity analysis behind a single configurable interface.
    """
|
| 22 |
+
|
| 23 |
+
def __init__(self, embeddings_model, chunk_size: int = 1000, chunk_overlap: int = 200):
    """Initialize the chunker.

    Args:
        embeddings_model: LangChain-compatible embeddings object; passed to the
            vector stores and used directly via ``embed_query`` for similarity
            analysis.
        chunk_size (int): Target chunk size in characters for the default splitters.
        chunk_overlap (int): Character overlap between consecutive chunks.
    """
    self.embeddings = embeddings_model
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    # Build the recursive/character/semantic splitters up front.
    self.setup_text_splitters()
    self.vector_stores = {}  # Cache for vector stores
|
| 29 |
+
|
| 30 |
+
def setup_text_splitters(self):
    """Create the three text-splitting strategies used by this chunker.

    After this call the instance exposes ``recursive_splitter`` (general
    purpose), ``character_splitter`` (paragraph-boundary splits) and
    ``semantic_splitter`` (smaller chunks tuned for semantic coherence).
    """
    # The recursive and character splitters share the configured size/overlap.
    shared = dict(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

    # General-purpose splitter: falls back from paragraphs to lines to words.
    self.recursive_splitter = RecursiveCharacterTextSplitter(
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
        **shared,
    )

    # Paragraph-oriented splitter: breaks only on blank lines.
    self.character_splitter = CharacterTextSplitter(separator="\n\n", **shared)

    # Smaller chunks with a sentence-level separator for semantic coherence.
    self.semantic_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
|
| 55 |
+
|
| 56 |
+
def chunk_documents(self, documents: List[Document], strategy: str = "recursive") -> List[Document]:
    """
    Chunk documents using the specified strategy.

    Args:
        documents (List[Document]): List of documents to chunk
        strategy (str): Chunking strategy ("recursive", "character",
            "semantic"); any other value falls back to "recursive"

    Returns:
        List[Document]: Chunked documents. Each chunk's metadata is extended
        with its index, sibling count, strategy name, original source, and
        character/word counts.

    Raises:
        Exception: If splitting fails; the original error is chained.
    """
    # Dispatch table instead of an if/elif chain; unknown strategies
    # intentionally fall back to the recursive splitter (original behavior).
    splitter = {
        "character": self.character_splitter,
        "semantic": self.semantic_splitter,
    }.get(strategy, self.recursive_splitter)

    try:
        chunked_docs: List[Document] = []

        for doc in documents:
            chunks = splitter.split_documents([doc])
            total = len(chunks)

            # Record provenance so retrieval hits can be traced back to
            # their position in the source document.
            for i, chunk in enumerate(chunks):
                chunk.metadata.update({
                    'chunk_index': i,
                    'total_chunks': total,
                    'chunk_strategy': strategy,
                    'original_source': doc.metadata.get('source', 'unknown'),
                    'chunk_size': len(chunk.page_content),
                    'chunk_word_count': len(chunk.page_content.split())
                })

            chunked_docs.extend(chunks)

        return chunked_docs

    except Exception as e:
        # Chain the original exception so the root cause stays visible in
        # the traceback (the bare re-raise previously discarded it).
        raise Exception(f"Document chunking failed: {str(e)}") from e
|
| 99 |
+
|
| 100 |
+
def create_vector_store(self, documents: List[Document], store_type: str = "faiss",
                        persist_directory: Optional[str] = None) -> Any:
    """
    Create vector store from documents

    Args:
        documents (List[Document]): Documents to vectorize
        store_type (str): Type of vector store ("faiss", "chroma"); anything
            other than "chroma" falls through to FAISS
        persist_directory (str): Optional directory to persist the store

    Returns:
        Vector store instance

    Raises:
        Exception: on any failure — note that the ValueError raised for empty
            input is also caught below and re-wrapped in a generic Exception.
    """
    try:
        if not documents:
            raise ValueError("No documents provided for vector store creation")

        if store_type.lower() == "chroma":
            if persist_directory:
                vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    persist_directory=persist_directory
                )
                # NOTE(review): explicit persist() is what older Chroma/langchain
                # APIs required to flush to disk — confirm against the pinned
                # langchain-community version.
                vector_store.persist()
            else:
                # In-memory Chroma store (nothing written to disk).
                vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings
                )
        else:  # Default to FAISS
            vector_store = FAISS.from_documents(
                documents=documents,
                embedding=self.embeddings
            )

            # Save FAISS index if persist directory provided
            if persist_directory:
                os.makedirs(persist_directory, exist_ok=True)
                vector_store.save_local(persist_directory)

        return vector_store

    except Exception as e:
        raise Exception(f"Vector store creation failed: {str(e)}")
|
| 145 |
+
|
| 146 |
+
def create_qa_chain(self, documents: List[Document], llm, chain_type: str = "stuff") -> RetrievalQA:
    """
    Create a Question-Answering chain from documents

    The documents are semantically chunked, indexed into an in-memory FAISS
    store, and wired to a RetrievalQA chain with a custom prompt.

    Args:
        documents (List[Document]): Documents for the knowledge base
        llm: Language model for answering questions
        chain_type (str): Type of QA chain ("stuff", "map_reduce", "refine")

    Returns:
        RetrievalQA: Configured QA chain (returns source documents with answers)

    Raises:
        Exception: if chunking, indexing, or chain assembly fails.
    """
    try:
        # Chunk documents with the semantic splitter for better context.
        chunked_docs = self.chunk_documents(documents, strategy="semantic")

        # Build an in-memory FAISS index over the chunks.
        vector_store = self.create_vector_store(chunked_docs, store_type="faiss")

        # Create retriever
        retriever = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 4}  # Retrieve top 4 most relevant chunks
        )

        # Custom prompt for GEO-focused QA
        qa_prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Focus on providing clear, accurate, and complete answers that would be suitable for AI search engines.

Context:
{context}

Question: {question}

Answer:"""

        qa_prompt = PromptTemplate(
            template=qa_prompt_template,
            input_variables=["context", "question"]
        )

        # Create QA chain; source documents are returned so callers can cite.
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type=chain_type,
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": qa_prompt}
        )

        return qa_chain

    except Exception as e:
        raise Exception(f"QA chain creation failed: {str(e)}")
|
| 201 |
+
|
| 202 |
+
def create_conversational_chain(self, documents: List[Document], llm) -> ConversationalRetrievalChain:
    """
    Create a conversational retrieval chain with memory

    Args:
        documents (List[Document]): Documents for the knowledge base
        llm: Language model for conversation

    Returns:
        ConversationalRetrievalChain: Configured conversational chain

    Raises:
        Exception: if chunking, indexing, or chain assembly fails.
    """
    try:
        # Chunk documents with the semantic splitter.
        chunked_docs = self.chunk_documents(documents, strategy="semantic")

        # Build an in-memory FAISS index over the chunks.
        vector_store = self.create_vector_store(chunked_docs, store_type="faiss")

        # Create retriever (top 3 chunks per turn).
        retriever = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3}
        )

        # Conversation memory. output_key="answer" tells the memory which
        # chain output to store, since the chain also returns source documents.
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key="answer"
        )

        # Prompt used to rewrite a follow-up question into a standalone one
        # so retrieval works without the chat history.
        condense_question_prompt = """Given the following conversation and a follow up question,
rephrase the follow up question to be a standalone question that can be understood without the chat history.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

        # Create conversational chain
        conv_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=memory,
            return_source_documents=True,
            condense_question_prompt=PromptTemplate.from_template(condense_question_prompt)
        )

        return conv_chain

    except Exception as e:
        raise Exception(f"Conversational chain creation failed: {str(e)}")
|
| 255 |
+
|
| 256 |
+
def semantic_search(self, query: str, documents: List[Document], top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Perform semantic search over the given documents.

    The documents are semantically chunked, indexed into a throwaway FAISS
    store, and queried for the ``top_k`` closest chunks.

    Args:
        query (str): Search query
        documents (List[Document]): Documents to search
        top_k (int): Number of top results to return

    Returns:
        List[Dict]: One entry per hit with content, metadata, the raw FAISS
        score (NOTE(review): FAISS returns a distance — lower is closer —
        despite the 'similarity_score' key name; confirm before ranking on it)
        and a 1-based rank in retrieval order.

    Raises:
        Exception: if chunking, indexing, or search fails.
    """
    try:
        chunks = self.chunk_documents(documents, strategy="semantic")
        index = self.create_vector_store(chunks, store_type="faiss")

        scored_hits = index.similarity_search_with_score(query, k=top_k)

        # Ranks follow retrieval order, starting at 1.
        return [
            {
                'content': hit.page_content,
                'metadata': hit.metadata,
                'similarity_score': float(raw_score),
                'relevance_rank': rank,
            }
            for rank, (hit, raw_score) in enumerate(scored_hits, start=1)
        ]

    except Exception as e:
        raise Exception(f"Semantic search failed: {str(e)}")
|
| 293 |
+
|
| 294 |
+
def analyze_document_similarity(self, documents: List[Document]) -> Dict[str, Any]:
    """
    Analyze pairwise cosine similarity between document chunks.

    The documents are semantically chunked, each chunk is embedded with the
    configured embeddings model, and cosine similarity is computed for every
    pair of chunks.

    Args:
        documents (List[Document]): Documents to analyze (at least 2 required)

    Returns:
        Dict: Similarity statistics, a score distribution, and the most and
        least similar chunk pairs — or a dict with an 'error' key on failure.
    """
    try:
        if len(documents) < 2:
            return {'error': 'Need at least 2 documents for similarity analysis'}

        # Chunk documents
        chunked_docs = self.chunk_documents(documents, strategy="semantic")

        # Embed every chunk and keep a preview for reporting.
        doc_embeddings = []
        doc_metadata = []

        for doc in chunked_docs:
            embedding = self.embeddings.embed_query(doc.page_content)
            doc_embeddings.append(embedding)
            doc_metadata.append({
                'content_preview': doc.page_content[:200] + "...",
                'metadata': doc.metadata,
                'length': len(doc.page_content)
            })

        # Calculate pairwise cosine similarities.
        similarities = []
        embeddings_array = np.array(doc_embeddings)
        # Precompute norms once (hoisted out of the O(n^2) pair loop); this
        # also lets us guard zero-length vectors below.
        norms = np.linalg.norm(embeddings_array, axis=1)

        for i in range(len(embeddings_array)):
            for j in range(i + 1, len(embeddings_array)):
                denom = norms[i] * norms[j]
                # A zero vector has no direction: define its similarity as 0
                # instead of dividing by zero (previously produced nan/inf).
                if denom:
                    similarity = np.dot(embeddings_array[i], embeddings_array[j]) / denom
                else:
                    similarity = 0.0

                similarities.append({
                    'doc_1_index': i,
                    'doc_2_index': j,
                    'similarity_score': float(similarity),
                    'doc_1_preview': doc_metadata[i]['content_preview'],
                    'doc_2_preview': doc_metadata[j]['content_preview']
                })

        # Sort by similarity score, highest first.
        similarities.sort(key=lambda x: x['similarity_score'], reverse=True)

        # Calculate statistics
        similarity_scores = [s['similarity_score'] for s in similarities]

        return {
            'total_comparisons': len(similarities),
            # Cast the numpy scalar to a plain float for JSON-friendliness,
            # consistent with the float() casts used elsewhere in this class.
            'average_similarity': float(np.mean(similarity_scores)),
            'max_similarity': max(similarity_scores),
            'min_similarity': min(similarity_scores),
            'similarity_distribution': {
                'high_similarity': len([s for s in similarity_scores if s > 0.8]),
                'medium_similarity': len([s for s in similarity_scores if 0.5 < s <= 0.8]),
                'low_similarity': len([s for s in similarity_scores if s <= 0.5])
            },
            'top_similar_pairs': similarities[:5],
            'most_dissimilar_pairs': similarities[-3:]
        }

    except Exception as e:
        return {'error': f"Similarity analysis failed: {str(e)}"}
|
| 366 |
+
|
| 367 |
+
def extract_key_passages(self, documents: List[Document], queries: List[str],
                         passages_per_query: int = 3) -> Dict[str, List[Dict[str, Any]]]:
    """
    Extract key passages from documents for each of several queries.

    A single FAISS index is built over the semantically chunked documents,
    then each query retrieves its own top passages from it.

    Args:
        documents (List[Document]): Documents to search
        queries (List[str]): Queries to search for
        passages_per_query (int): Number of passages to extract per query

    Returns:
        Dict: Passages keyed by query, or a dict with an 'error' key on failure.
    """
    try:
        chunks = self.chunk_documents(documents, strategy="semantic")
        index = self.create_vector_store(chunks, store_type="faiss")

        key_passages: Dict[str, List[Dict[str, Any]]] = {}

        for query in queries:
            hits = index.similarity_search_with_score(query, k=passages_per_query)
            # One dict per retrieved passage, tagged with the query it matched.
            key_passages[query] = [
                {
                    'content': hit.page_content,
                    'relevance_score': float(raw_score),
                    'metadata': hit.metadata,
                    'word_count': len(hit.page_content.split()),
                    'query_match': query,
                }
                for hit, raw_score in hits
            ]

        return key_passages

    except Exception as e:
        return {'error': f"Key passage extraction failed: {str(e)}"}
|
| 410 |
+
|
| 411 |
+
def optimize_chunking_strategy(self, documents: List[Document],
                               test_queries: List[str]) -> Dict[str, Any]:
    """
    Test different chunking strategies and recommend the best one.

    Each strategy chunks the documents into a FAISS index; the test queries
    are then run against it and the returned FAISS scores are averaged.
    FAISS `similarity_search_with_score` scores are L2 distances, so LOWER
    average score means better retrieval.

    Args:
        documents (List[Document]): Documents to test
        test_queries (List[str]): Queries to test retrieval performance

    Returns:
        Dict: Per-strategy metrics plus a recommendation, or an 'error' key.
    """
    try:
        strategies = ["recursive", "character", "semantic"]
        strategy_results = {}

        for strategy in strategies:
            try:
                # Test this strategy end-to-end: chunk, index, query.
                chunked_docs = self.chunk_documents(documents, strategy=strategy)
                vector_store = self.create_vector_store(chunked_docs, store_type="faiss")

                retrieval_scores = []
                for query in test_queries:
                    results = vector_store.similarity_search_with_score(query, k=3)
                    if results:
                        avg_score = sum(score for _, score in results) / len(results)
                        retrieval_scores.append(float(avg_score))

                # Hoist the chunk-length computation used by several metrics.
                chunk_lengths = [len(doc.page_content) for doc in chunked_docs]

                strategy_results[strategy] = {
                    'average_retrieval_score': np.mean(retrieval_scores) if retrieval_scores else 0,
                    'total_chunks': len(chunked_docs),
                    'average_chunk_size': np.mean(chunk_lengths),
                    'retrieval_scores': retrieval_scores,
                    'chunk_size_distribution': {
                        'min': min(chunk_lengths),
                        'max': max(chunk_lengths),
                        'std': float(np.std(chunk_lengths))
                    }
                }

            except Exception as e:
                strategy_results[strategy] = {'error': f"Strategy test failed: {str(e)}"}

        # Determine best strategy among the ones that ran cleanly.
        valid_strategies = {k: v for k, v in strategy_results.items() if 'error' not in v}

        if valid_strategies:
            # BUGFIX: FAISS scores are distances (lower = more similar), so the
            # best strategy is the one with the LOWEST average score; the
            # previous max() selected the worst-performing strategy. A strategy
            # that produced no retrieval scores at all (score 0 by convention)
            # is pushed to the back with +inf so it can never win.
            best_strategy = min(
                valid_strategies.keys(),
                key=lambda k: (valid_strategies[k]['average_retrieval_score']
                               if valid_strategies[k]['retrieval_scores'] else float('inf'))
            )

            recommendation = {
                'recommended_strategy': best_strategy,
                'reason': f"Best (lowest) average retrieval distance: {valid_strategies[best_strategy]['average_retrieval_score']:.4f}",
                'all_results': strategy_results,
                'performance_summary': {
                    strategy: result.get('average_retrieval_score', 0)
                    for strategy, result in valid_strategies.items()
                }
            }
        else:
            recommendation = {
                'recommended_strategy': 'recursive',  # Default fallback
                'reason': 'All strategies failed, using default',
                'all_results': strategy_results
            }

        return recommendation

    except Exception as e:
        return {'error': f"Chunking optimization failed: {str(e)}"}
|
| 491 |
+
|
| 492 |
+
def create_document_summary(self, documents: List[Document], llm,
                            summary_type: str = "extractive") -> Dict[str, Any]:
    """
    Summarize documents from their semantically chunked content.

    Args:
        documents (List[Document]): Documents to summarize
        llm: Language model (used only for the abstractive path)
        summary_type (str): "extractive" selects key chunks; any other value
            produces an abstractive LLM-generated summary

    Returns:
        Dict: Summary results, or a dict with an 'error' key on failure.
    """
    try:
        # Re-chunk with the semantic splitter for better processing.
        semantic_chunks = self.chunk_documents(documents, strategy="semantic")

        if summary_type == "extractive":
            return self._create_extractive_summary(semantic_chunks)

        # Any non-extractive value falls through to the abstractive path.
        return self._create_abstractive_summary(semantic_chunks, llm)

    except Exception as e:
        return {'error': f"Document summarization failed: {str(e)}"}
|
| 518 |
+
|
| 519 |
+
def _create_extractive_summary(self, chunked_docs: List[Document]) -> Dict[str, Any]:
    """Select the highest-information chunks as an extractive summary.

    Chunks are scored by information density (words per sentence) plus small
    bonuses for questions, definition language, and bullet/numbered lists;
    the top five become the summary.
    """
    try:
        def rate(text: str) -> float:
            # Information density: words per (rough, '.'-split) sentence.
            word_total = len(text.split())
            sentence_total = len([s for s in text.split('.') if s.strip()])
            bonus = 0
            if '?' in text:
                bonus += 1
            if any(term in text.lower() for term in ['define', 'definition', 'means', 'refers to']):
                bonus += 2
            if text.count('\n•') > 0 or text.count('1.') > 0:
                bonus += 1
            return word_total / max(sentence_total, 1) + bonus

        # Rank all chunks by score, best first (stable sort preserves the
        # original order among equal scores).
        ranked = sorted(
            ((doc, rate(doc.page_content)) for doc in chunked_docs),
            key=lambda pair: pair[1],
            reverse=True,
        )
        selected = ranked[:min(5, len(ranked))]

        return {
            'summary_type': 'extractive',
            'key_chunks': [
                {
                    'content': doc.page_content,
                    'score': score,
                    'metadata': doc.metadata,
                }
                for doc, score in selected
            ],
            'total_chunks_analyzed': len(chunked_docs),
            'chunks_selected': len(selected)
        }

    except Exception as e:
        return {'error': f"Extractive summary failed: {str(e)}"}
|
| 567 |
+
|
| 568 |
+
def _create_abstractive_summary(self, chunked_docs: List[Document], llm) -> Dict[str, Any]:
    """Create an abstractive summary of the leading chunks using the LLM.

    Args:
        chunked_docs: Pre-chunked documents; only the first 10 chunks (and at
            most 5000 characters of their combined text) are sent to the model.
        llm: LangChain-compatible model, invoked via LCEL piping.

    Returns:
        Dict: Summary text and processing stats, or an 'error' key on failure.
    """
    try:
        # Combine content from the leading chunks to bound prompt size.
        combined_content = "\n\n".join([doc.page_content for doc in chunked_docs[:10]])

        summary_prompt = f"""Please provide a comprehensive summary of the following content.
Focus on the main topics, key insights, and important details that would be valuable for AI search engines.

Content:
{combined_content[:5000]}

Summary:"""

        from langchain.prompts import ChatPromptTemplate

        # BUGFIX: pass the assembled prompt as a template *variable* instead
        # of splicing it into the template string. Document text containing
        # literal braces ("{" / "}") would otherwise be parsed as template
        # placeholders by ChatPromptTemplate and crash at invoke time.
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", "You are a professional content summarizer. Create clear, informative summaries."),
            ("user", "{summary_request}")
        ])

        chain = prompt_template | llm
        result = chain.invoke({"summary_request": summary_prompt})

        # Chat models return a message object with .content; plain LLMs a str.
        summary_text = result.content if hasattr(result, 'content') else str(result)

        return {
            'summary_type': 'abstractive',
            'summary': summary_text,
            'source_chunks': len(chunked_docs),
            'content_length_processed': len(combined_content)
        }

    except Exception as e:
        return {'error': f"Abstractive summary failed: {str(e)}"}
|
| 603 |
+
|
| 604 |
+
def save_vector_store(self, vector_store, directory_path: str, store_type: str = "faiss") -> bool:
    """
    Persist a vector store to disk.

    Args:
        vector_store: Vector store instance to persist
        directory_path (str): Directory to write the store into (created
            if missing)
        store_type (str): Kind of vector store ("faiss" or "chroma")

    Returns:
        bool: True on success, False when persistence fails
    """
    try:
        os.makedirs(directory_path, exist_ok=True)

        kind = store_type.lower()
        if kind == "faiss":
            vector_store.save_local(directory_path)
        elif kind == "chroma":
            # Chroma persists itself at creation time; nothing to do here.
            pass

        return True

    except Exception as e:
        print(f"Failed to save vector store: {str(e)}")
        return False
|
| 631 |
+
def load_vector_store(self, directory_path: str, store_type: str = "faiss"):
    """
    Load a previously saved vector store from disk.

    Args:
        directory_path (str): Directory containing the saved store
        store_type (str): Kind of vector store ("faiss" or "chroma")

    Returns:
        The loaded vector store instance, or None when the directory is
        missing, the store type is unrecognized, or loading fails.
    """
    try:
        if not os.path.exists(directory_path):
            return None

        kind = store_type.lower()
        if kind == "faiss":
            # allow_dangerous_deserialization is required for FAISS's
            # pickle-based index files; only load stores this app wrote.
            return FAISS.load_local(
                directory_path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
        if kind == "chroma":
            return Chroma(
                persist_directory=directory_path,
                embedding_function=self.embeddings
            )

        return None

    except Exception as e:
        print(f"Failed to load vector store: {str(e)}")
        return None
+
|
| 666 |
+
def get_chunking_stats(self, documents: List[Document], strategy: str = "recursive") -> Dict[str, Any]:
    """
    Compute detailed statistics about how documents chunk under a strategy.

    Args:
        documents (List[Document]): Documents to analyze
        strategy (str): Chunking strategy to apply

    Returns:
        Dict: Chunk/word-count summaries, a size-bucket distribution,
        overlap efficiency and content coverage — or {'error': ...}.
    """
    try:
        chunks = self.chunk_documents(documents, strategy=strategy)

        sizes = [len(chunk.page_content) for chunk in chunks]
        words = [len(chunk.page_content.split()) for chunk in chunks]

        def _describe(values):
            # Summary stats for one metric; all zeros when no chunks exist.
            if not values:
                return {'min': 0, 'max': 0, 'mean': 0, 'median': 0, 'std': 0}
            return {
                'min': min(values),
                'max': max(values),
                'mean': np.mean(values),
                'median': np.median(values),
                'std': np.std(values)
            }

        # Bucket chunk sizes into coarse readability bands (characters).
        distribution = {
            'very_small': sum(1 for s in sizes if s < 200),
            'small': sum(1 for s in sizes if 200 <= s < 500),
            'medium': sum(1 for s in sizes if 500 <= s < 1000),
            'large': sum(1 for s in sizes if 1000 <= s < 2000),
            'very_large': sum(1 for s in sizes if s >= 2000)
        }

        return {
            'strategy_used': strategy,
            'original_documents': len(documents),
            'total_chunks': len(chunks),
            'chunk_size_stats': _describe(sizes),
            'word_count_stats': _describe(words),
            'chunk_distribution': distribution,
            'overlap_efficiency': self._calculate_overlap_efficiency(chunks),
            'content_coverage': self._calculate_content_coverage(documents, chunks)
        }

    except Exception as e:
        return {'error': f"Chunking statistics failed: {str(e)}"}
+
|
| 719 |
+
def _calculate_overlap_efficiency(self, chunked_docs: List[Document]) -> float:
|
| 720 |
+
"""Calculate efficiency of chunk overlaps"""
|
| 721 |
+
try:
|
| 722 |
+
if len(chunked_docs) < 2:
|
| 723 |
+
return 1.0
|
| 724 |
+
|
| 725 |
+
total_content_length = sum(len(doc.page_content) for doc in chunked_docs)
|
| 726 |
+
unique_content = set()
|
| 727 |
+
|
| 728 |
+
# Rough estimate of content uniqueness
|
| 729 |
+
for doc in chunked_docs:
|
| 730 |
+
words = doc.page_content.split()
|
| 731 |
+
for i in range(0, len(words), 10): # Sample every 10th word
|
| 732 |
+
unique_content.add(' '.join(words[i:i+10]))
|
| 733 |
+
|
| 734 |
+
# Efficiency as ratio of unique content to total content
|
| 735 |
+
efficiency = len(unique_content) * 10 / total_content_length if total_content_length > 0 else 0
|
| 736 |
+
return min(efficiency, 1.0)
|
| 737 |
+
|
| 738 |
+
except Exception:
|
| 739 |
+
return 0.5 # Default neutral efficiency
|
| 740 |
+
|
| 741 |
+
def _calculate_content_coverage(self, original_docs: List[Document],
|
| 742 |
+
chunked_docs: List[Document]) -> float:
|
| 743 |
+
"""Calculate how well chunks cover original content"""
|
| 744 |
+
try:
|
| 745 |
+
original_content = ' '.join([doc.page_content for doc in original_docs])
|
| 746 |
+
chunked_content = ' '.join([doc.page_content for doc in chunked_docs])
|
| 747 |
+
|
| 748 |
+
# Simple coverage metric based on length
|
| 749 |
+
coverage = len(chunked_content) / len(original_content) if original_content else 0
|
| 750 |
+
return min(coverage, 1.0)
|
| 751 |
+
|
| 752 |
+
except Exception:
|
| 753 |
+
return 0.0
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
class ChunkingOptimizer:
    """Helper class for optimizing chunking parameters"""

    def __init__(self, embeddings_model):
        self.embeddings = embeddings_model

    def optimize_chunk_size(self, documents: List[Document], test_queries: List[str],
                           size_range: Tuple[int, int] = (200, 2000),
                           step_size: int = 200) -> Dict[str, Any]:
        """
        Find optimal chunk size for given documents and queries

        Args:
            documents (List[Document]): Documents to test
            test_queries (List[str]): Queries for testing retrieval
            size_range (Tuple[int, int]): Range of chunk sizes to test
            step_size (int): Step size for testing

        Returns:
            Dict: Optimization results with recommended chunk size
        """
        try:
            min_size, max_size = size_range

            # Score every candidate size; per-size failures are recorded,
            # not raised, so one bad size doesn't abort the sweep.
            results = {
                candidate: self._evaluate_chunk_size(candidate, documents, test_queries)
                for candidate in range(min_size, max_size + 1, step_size)
            }

            valid = {size: info for size, info in results.items() if 'error' not in info}

            if not valid:
                return {
                    'error': 'No valid chunk sizes could be tested',
                    'all_results': results
                }

            # NOTE(review): similarity_search_with_score polarity depends on
            # the backing store (FAISS returns distances, where lower is
            # better) — confirm max() picks the intended optimum here.
            best = max(valid, key=lambda size: valid[size]['average_retrieval_score'])

            return {
                'optimal_chunk_size': best,
                'optimal_performance': valid[best]['average_retrieval_score'],
                'all_results': results,
                'performance_trend': self._analyze_performance_trend(valid),
                'recommendation': f"Use chunk size {best} for best retrieval performance"
            }

        except Exception as e:
            return {'error': f"Chunk size optimization failed: {str(e)}"}

    def _evaluate_chunk_size(self, chunk_size, documents, test_queries):
        """Chunk, index and score retrieval quality for one candidate size."""
        chunker = VectorChunker(self.embeddings, chunk_size=chunk_size)

        try:
            chunked_docs = chunker.chunk_documents(documents)
            vector_store = chunker.create_vector_store(chunked_docs)

            scores = []
            for query in test_queries:
                hits = vector_store.similarity_search_with_score(query, k=3)
                if hits:
                    scores.append(float(sum(score for _, score in hits) / len(hits)))

            return {
                'average_retrieval_score': np.mean(scores) if scores else 0,
                'total_chunks': len(chunked_docs),
                'retrieval_scores': scores
            }

        except Exception as e:
            return {'error': str(e)}

    def _analyze_performance_trend(self, results: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
        """Summarize how retrieval performance moves across chunk sizes."""
        try:
            sizes = sorted(results)
            scores = [results[size]['average_retrieval_score'] for size in sizes]

            if len(scores) < 2:
                return {'error': 'Insufficient data for trend analysis'}

            best = max(scores)
            spread = best - min(scores)

            return {
                'trend_direction': "increasing" if scores[-1] > scores[0] else "decreasing",
                'peak_performance': best,
                'peak_size': sizes[scores.index(best)],
                'performance_range': spread,
                # A spread under 0.1 means size barely matters for retrieval.
                'stable_performance': spread < 0.1
            }

        except Exception:
            return {'error': 'Trend analysis failed'}
+
|
| 856 |
+
|
| 857 |
+
class RAGPipeline:
    """Complete RAG pipeline for document question-answering.

    Manages multiple independent pipelines keyed by a caller-supplied id;
    each pipeline pairs a FAISS vector store with a retrieval QA chain.
    """

    def __init__(self, embeddings_model, llm):
        self.embeddings = embeddings_model
        self.llm = llm
        self.chunker = VectorChunker(embeddings_model)
        # pipeline_id -> components created by create_pipeline()
        self.vector_stores = {}
        self.qa_chains = {}

    def create_pipeline(self, documents: List[Document], pipeline_id: str,
                       chunking_strategy: str = "semantic") -> Dict[str, Any]:
        """
        Create a complete RAG pipeline for documents

        Args:
            documents (List[Document]): Documents to process
            pipeline_id (str): Unique identifier for this pipeline
            chunking_strategy (str): Strategy for document chunking

        Returns:
            Dict: Pipeline creation results, or {'error': ...} on failure
        """
        try:
            # Step 1: Chunk documents
            chunked_docs = self.chunker.chunk_documents(documents, strategy=chunking_strategy)

            # Step 2: Create vector store
            vector_store = self.chunker.create_vector_store(chunked_docs, store_type="faiss")

            # Step 3: Create QA chain.
            # NOTE(review): built from the *original* documents, not
            # chunked_docs — presumably create_qa_chain re-chunks internally;
            # confirm before changing.
            qa_chain = self.chunker.create_qa_chain(documents, self.llm)

            # Store pipeline components for later queries
            self.vector_stores[pipeline_id] = vector_store
            self.qa_chains[pipeline_id] = qa_chain

            # Pipeline statistics
            stats = {
                'pipeline_id': pipeline_id,
                'documents_processed': len(documents),
                'chunks_created': len(chunked_docs),
                'chunking_strategy': chunking_strategy,
                'vector_store_type': 'faiss',
                'embedding_model': str(self.embeddings),
                'created_at': self._get_timestamp()
            }

            return {
                'success': True,
                'pipeline_stats': stats,
                'chunking_info': self.chunker.get_chunking_stats(documents, chunking_strategy)
            }

        except Exception as e:
            return {'error': f"Pipeline creation failed: {str(e)}"}

    def query_pipeline(self, pipeline_id: str, query: str,
                      return_sources: bool = True) -> Dict[str, Any]:
        """
        Query a created RAG pipeline

        Args:
            pipeline_id (str): ID of the pipeline to query
            query (str): Question to ask
            return_sources (bool): Whether to return source documents

        Returns:
            Dict: Query results with answer and (optionally) sources
        """
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            qa_chain = self.qa_chains[pipeline_id]

            # Execute query.
            # NOTE(review): calling the chain directly is deprecated in newer
            # LangChain releases in favor of .invoke(); kept for compatibility
            # with the version this project pins.
            result = qa_chain({"query": query})

            response = {
                'query': query,
                'answer': result.get('result', 'No answer generated'),
                'pipeline_id': pipeline_id,
                'query_timestamp': self._get_timestamp()
            }

            # Attach retrieved source chunks when requested
            if return_sources and 'source_documents' in result:
                sources = []
                for i, doc in enumerate(result['source_documents']):
                    sources.append({
                        'source_index': i,
                        'content': doc.page_content,
                        'metadata': doc.metadata,
                        'relevance_rank': i + 1  # results arrive ranked
                    })

                response['sources'] = sources
                response['num_sources'] = len(sources)

            return response

        except Exception as e:
            return {'error': f"Pipeline query failed: {str(e)}"}

    def batch_query_pipeline(self, pipeline_id: str, queries: List[str]) -> List[Dict[str, Any]]:
        """
        Execute multiple queries on a pipeline

        Args:
            pipeline_id (str): ID of the pipeline to query
            queries (List[str]): List of questions to ask

        Returns:
            List[Dict]: One result per query, each tagged with 'batch_index';
            per-query failures are recorded in-place, never raised.
        """
        results = []

        for i, query in enumerate(queries):
            try:
                result = self.query_pipeline(pipeline_id, query, return_sources=False)
                result['batch_index'] = i
                results.append(result)

            except Exception as e:
                results.append({
                    'batch_index': i,
                    'query': query,
                    'error': f"Batch query failed: {str(e)}"
                })

        return results

    def evaluate_pipeline(self, pipeline_id: str, test_queries: List[str],
                         expected_answers: List[str] = None) -> Dict[str, Any]:
        """
        Evaluate pipeline performance on test queries

        Args:
            pipeline_id (str): ID of the pipeline to evaluate
            test_queries (List[str]): Test questions
            expected_answers (List[str]): Optional expected answers for comparison

        Returns:
            Dict: Per-query details plus aggregate success/latency/similarity
        """
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            import time  # hoisted: was re-imported on every loop iteration

            evaluation_results = []
            response_times = []

            for i, query in enumerate(test_queries):
                start_time = time.time()

                # Execute query
                result = self.query_pipeline(pipeline_id, query, return_sources=True)

                response_time = time.time() - start_time
                response_times.append(response_time)

                # Evaluate result
                eval_result = {
                    'query_index': i,
                    'query': query,
                    'answer_generated': not result.get('error'),
                    'response_time': response_time,
                    'answer_length': len(result.get('answer', '')),
                    'sources_returned': result.get('num_sources', 0)
                }

                # If a reference answer was supplied, score the generated one
                if expected_answers and i < len(expected_answers):
                    expected = expected_answers[i]
                    generated = result.get('answer', '')
                    eval_result['answer_similarity'] = self._calculate_answer_similarity(expected, generated)
                    eval_result['expected_answer'] = expected

                evaluation_results.append(eval_result)

            # Aggregate metrics
            successful_queries = len([r for r in evaluation_results if r['answer_generated']])
            avg_response_time = np.mean(response_times) if response_times else 0

            if expected_answers:
                similarities = [r.get('answer_similarity', 0) for r in evaluation_results
                              if 'answer_similarity' in r]
                avg_similarity = np.mean(similarities) if similarities else 0
            else:
                avg_similarity = None

            return {
                'pipeline_id': pipeline_id,
                'total_queries': len(test_queries),
                'successful_queries': successful_queries,
                'success_rate': successful_queries / len(test_queries) if test_queries else 0,
                'average_response_time': avg_response_time,
                'average_answer_similarity': avg_similarity,
                'detailed_results': evaluation_results,
                'evaluation_timestamp': self._get_timestamp()
            }

        except Exception as e:
            return {'error': f"Pipeline evaluation failed: {str(e)}"}

    def _calculate_answer_similarity(self, expected: str, generated: str) -> float:
        """Jaccard similarity over lowercased word sets; 1.0 when both empty."""
        try:
            expected_words = set(expected.lower().split())
            generated_words = set(generated.lower().split())

            if not expected_words and not generated_words:
                return 1.0

            intersection = expected_words.intersection(generated_words)
            union = expected_words.union(generated_words)

            return len(intersection) / len(union) if union else 0.0

        except Exception:
            return 0.0

    def get_pipeline_info(self, pipeline_id: str) -> Dict[str, Any]:
        """Get information about a specific pipeline"""
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            # Vector count is only reported for stores that expose a FAISS
            # index; anything else stays 'unknown'.
            vector_store = self.vector_stores.get(pipeline_id)
            total_vectors = 'unknown'
            if vector_store is not None:  # was a truthiness check; an empty
                # store with __len__() == 0 would wrongly be skipped
                try:
                    total_vectors = vector_store.index.ntotal if hasattr(vector_store, 'index') else 'unknown'
                except Exception:  # narrowed from a bare except
                    total_vectors = 'unknown'

            return {
                'pipeline_id': pipeline_id,
                'has_qa_chain': pipeline_id in self.qa_chains,
                'has_vector_store': pipeline_id in self.vector_stores,
                'total_vectors': total_vectors,
                'embedding_model': str(self.embeddings),
                'llm_model': str(self.llm)
            }

        except Exception as e:
            return {'error': f"Failed to get pipeline info: {str(e)}"}

    def list_pipelines(self) -> Dict[str, Any]:
        """List all created pipelines"""
        return {
            'total_pipelines': len(self.qa_chains),
            'pipeline_ids': list(self.qa_chains.keys()),
            'vector_stores': list(self.vector_stores.keys())
        }

    def delete_pipeline(self, pipeline_id: str) -> Dict[str, Any]:
        """Delete a pipeline and free resources"""
        try:
            deleted_components = []

            if pipeline_id in self.qa_chains:
                del self.qa_chains[pipeline_id]
                deleted_components.append('qa_chain')

            if pipeline_id in self.vector_stores:
                del self.vector_stores[pipeline_id]
                deleted_components.append('vector_store')

            if deleted_components:
                return {
                    'success': True,
                    'pipeline_id': pipeline_id,
                    'deleted_components': deleted_components
                }
            else:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

        except Exception as e:
            return {'error': f"Pipeline deletion failed: {str(e)}"}

    def export_pipeline_config(self, pipeline_id: str) -> Dict[str, Any]:
        """Export pipeline configuration for recreation"""
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            config = {
                'pipeline_id': pipeline_id,
                'embedding_model_name': getattr(self.embeddings, 'model_name', 'unknown'),
                'llm_model_name': getattr(self.llm, 'model_name', 'unknown'),
                'chunker_config': {
                    'chunk_size': self.chunker.chunk_size,
                    'chunk_overlap': self.chunker.chunk_overlap
                },
                'export_timestamp': self._get_timestamp(),
                'vector_store_type': 'faiss'
            }

            return config

        except Exception as e:
            return {'error': f"Pipeline export failed: {str(e)}"}

    def _get_timestamp(self) -> str:
        """Get current wall-clock time as 'YYYY-MM-DD HH:MM:SS'."""
        from datetime import datetime
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
|
| 1178 |
+
|
| 1179 |
+
# Utility functions for the module
|
| 1180 |
+
|
| 1181 |
+
def optimize_rag_pipeline(documents: List[Document], embeddings_model, llm,
                         test_queries: List[str]) -> Dict[str, Any]:
    """
    Optimize RAG pipeline configuration for given documents and queries

    Args:
        documents (List[Document]): Documents to optimize for
        embeddings_model: Embedding model to use
        llm: Language model to use
        test_queries (List[str]): Test queries for optimization

    Returns:
        Dict: Recommended configuration, the underlying optimization
        reports, a performance evaluation, and human-readable advice.
    """
    try:
        # Sweep candidate chunking strategies, then candidate chunk sizes.
        strategy_report = VectorChunker(embeddings_model).optimize_chunking_strategy(documents, test_queries)
        size_report = ChunkingOptimizer(embeddings_model).optimize_chunk_size(documents, test_queries)

        best_strategy = strategy_report.get('recommended_strategy', 'semantic')
        best_size = size_report.get('optimal_chunk_size', 1000)
        overlap = best_size // 5  # 20% overlap

        # Build a throwaway pipeline with the winning configuration.
        pipeline = RAGPipeline(embeddings_model, llm)
        pipeline.chunker = VectorChunker(
            embeddings_model,
            chunk_size=best_size,
            chunk_overlap=overlap
        )

        test_pipeline_id = "optimization_test"
        creation = pipeline.create_pipeline(documents, test_pipeline_id, best_strategy)

        if creation.get('error'):
            evaluation = {'error': 'Could not evaluate optimized pipeline'}
        else:
            evaluation = pipeline.evaluate_pipeline(test_pipeline_id, test_queries)
            pipeline.delete_pipeline(test_pipeline_id)  # Clean up

        return {
            'optimization_complete': True,
            'recommended_config': {
                'chunking_strategy': best_strategy,
                'chunk_size': best_size,
                'chunk_overlap': overlap
            },
            'chunking_optimization': strategy_report,
            'size_optimization': size_report,
            'performance_evaluation': evaluation,
            'recommendations': [
                f"Use {best_strategy} chunking strategy",
                f"Set chunk size to {best_size} characters",
                f"Use {overlap} character overlap",
                "Monitor and adjust based on query performance"
            ]
        }

    except Exception as e:
        return {'error': f"RAG optimization failed: {str(e)}"}
+
|
| 1249 |
+
|
| 1250 |
+
def create_demo_rag_system(sample_documents: List[Document], embeddings_model, llm) -> Dict[str, Any]:
    """
    Create a demonstration RAG system with sample documents

    Args:
        sample_documents (List[Document]): Sample documents for demo
        embeddings_model: Embedding model
        llm: Language model

    Returns:
        Dict: Demo system information, sample query results and usage notes
    """
    try:
        demo_id = "demo_system"
        pipeline = RAGPipeline(embeddings_model, llm)

        # Build the pipeline; bail out with the underlying error if it fails.
        creation_result = pipeline.create_pipeline(sample_documents, demo_id, "semantic")
        if creation_result.get('error'):
            return {'error': f"Demo system creation failed: {creation_result['error']}"}

        # Canned questions that exercise the retrieval path end-to-end.
        demo_queries = [
            "What is the main topic of these documents?",
            "Can you summarize the key points?",
            "What are the most important concepts mentioned?"
        ]

        demo_results = [
            pipeline.query_pipeline(demo_id, question, return_sources=True)
            for question in demo_queries
        ]

        return {
            'demo_system_created': True,
            'pipeline_id': demo_id,
            'creation_stats': creation_result,
            'pipeline_info': pipeline.get_pipeline_info(demo_id),
            'demo_queries': demo_queries,
            'demo_results': demo_results,
            'usage_instructions': [
                f"Use pipeline.query_pipeline('{demo_id}', 'your question') to ask questions",
                "The system will return answers with source document references",
                "Sources show which parts of the documents were used for the answer"
            ]
        }

    except Exception as e:
        return {'error': f"Demo system creation failed: {str(e)}"}
+
|
| 1306 |
+
|
| 1307 |
+
# Export the main classes for use in other modules
# (public API consumed via `from utils.chunker import *`)
__all__ = [
    'VectorChunker',
    'ChunkingOptimizer',
    'RAGPipeline',
    'optimize_rag_pipeline',
    'create_demo_rag_system'
]
|
utils/export.py
ADDED
|
@@ -0,0 +1,1896 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Results Export and Reporting Module
|
| 3 |
+
Handles export of analysis results, reports, and data for external use
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import csv
|
| 8 |
+
import io
|
| 9 |
+
import zipfile
|
| 10 |
+
import tempfile
|
| 11 |
+
import os
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import Dict, Any, List, Optional, Union
|
| 14 |
+
import pandas as pd
|
| 15 |
+
from dataclasses import dataclass, asdict
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass
class GEOReport:
    """Data class for GEO analysis reports.

    Aggregates the outcome of a single website GEO (Generative Engine
    Optimization) analysis run into one exportable record.
    """
    website_url: str  # URL of the analyzed website
    analysis_date: str  # ISO-format timestamp of the analysis
    overall_score: float  # aggregate GEO score across all analyzed pages
    pages_analyzed: int  # number of pages included in the analysis
    geo_scores: Dict[str, float]  # per-metric scores keyed by metric name
    recommendations: List[str]  # textual optimization recommendations
    optimization_opportunities: List[Dict[str, Any]]  # structured opportunity records
    competitive_position: str  # qualitative competitive assessment

    def to_dict(self) -> Dict[str, Any]:
        """Convert report to dictionary (recursively, via dataclasses.asdict)."""
        return asdict(self)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class ContentAnalysis:
    """Data class for content optimization analysis.

    Captures the scores and outputs of a single content-enhancement pass.
    """
    original_content: str  # the content as submitted for analysis
    analysis_date: str  # ISO-format timestamp of the analysis
    clarity_score: float  # how clearly the content reads
    structure_score: float  # how well-structured the content is
    answerability_score: float  # how directly the content answers queries
    keywords: List[str]  # keywords identified in the content
    optimized_content: Optional[str]  # rewritten content, or None if analyze-only
    improvements_made: List[str]  # descriptions of applied improvements

    def to_dict(self) -> Dict[str, Any]:
        """Convert analysis to dictionary (recursively, via dataclasses.asdict)."""
        return asdict(self)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ResultExporter:
|
| 53 |
+
"""Main class for exporting analysis results and generating reports"""
|
| 54 |
+
|
| 55 |
+
def __init__(self):
    # Output formats the export_* methods accept.
    self.export_formats = ['json', 'csv', 'html', 'pdf', 'xlsx']
    # Analysis result categories this exporter knows how to handle.
    self.supported_types = ['geo_analysis', 'content_optimization', 'qa_results', 'batch_analysis']
|
| 58 |
+
|
| 59 |
+
def export_geo_results(self, geo_results: List[Dict[str, Any]],
                      website_url: str, format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
    """
    Export GEO analysis results in specified format

    Args:
        geo_results (List[Dict]): List of GEO analysis results
        website_url (str): URL of analyzed website
        format_type (str): Export format ('json', 'csv', 'html', 'xlsx', 'pdf')

    Returns:
        Union[str, bytes, Dict]: Exported data in requested format,
        or a dict with an 'error' key if the export fails
    """
    try:
        # Consolidate per-page results into one export payload.
        consolidated = self._prepare_geo_export_data(geo_results, website_url)

        # Dispatch table keyed by normalized format name.
        exporters = {
            'json': self._export_geo_json,
            'csv': self._export_geo_csv,
            'html': self._export_geo_html,
            'xlsx': self._export_geo_excel,
            'pdf': self._export_geo_pdf,
        }
        handler = exporters.get(format_type.lower())
        if handler is None:
            raise ValueError(f"Unsupported export format: {format_type}")
        return handler(consolidated)

    except Exception as e:
        return {'error': f"Export failed: {str(e)}"}
|
| 91 |
+
|
| 92 |
+
def export_enhancement_results(self, enhancement_result: Dict[str, Any],
                              format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
    """
    Export content enhancement results

    Args:
        enhancement_result (Dict): Content enhancement analysis result
        format_type (str): Export format ('json', 'html', 'csv'; anything
            else falls back to JSON)

    Returns:
        Union[str, bytes, Dict]: Exported data, or a dict with an 'error'
        key if the export fails
    """
    try:
        payload = self._prepare_enhancement_export_data(enhancement_result)

        fmt = format_type.lower()
        if fmt == 'html':
            return self._export_enhancement_html(payload)
        if fmt == 'csv':
            return self._export_enhancement_csv(payload)
        # 'json' and any unrecognized format both serialize to JSON text.
        return json.dumps(payload, indent=2, ensure_ascii=False)

    except Exception as e:
        return {'error': f"Enhancement export failed: {str(e)}"}
|
| 119 |
+
|
| 120 |
+
def export_qa_results(self, qa_results: List[Dict[str, Any]],
                     format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
    """
    Export Q&A session results

    Args:
        qa_results (List[Dict]): List of Q&A interactions
        format_type (str): Export format ('json', 'html', 'csv'; anything
            else falls back to JSON)

    Returns:
        Union[str, bytes, Dict]: Exported data, or a dict with an 'error'
        key if the export fails
    """
    try:
        # Interactions without an 'error' entry count as successfully answered.
        answered = [item for item in qa_results if not item.get('error')]

        payload = {
            'qa_session': {
                'session_date': datetime.now().isoformat(),
                'total_questions': len(qa_results),
                'interactions': qa_results,
            },
            'summary': {
                'successful_answers': len(answered),
                'average_response_length': self._calculate_avg_response_length(qa_results),
                'most_common_topics': self._extract_common_topics(qa_results),
            },
        }

        fmt = format_type.lower()
        if fmt == 'html':
            return self._export_qa_html(payload)
        if fmt == 'csv':
            return self._export_qa_csv(payload)
        return json.dumps(payload, indent=2, ensure_ascii=False)

    except Exception as e:
        return {'error': f"Q&A export failed: {str(e)}"}
|
| 157 |
+
|
| 158 |
+
def create_comprehensive_report(self, analysis_data: Dict[str, Any],
                               report_type: str = 'full') -> Dict[str, Any]:
    """
    Create comprehensive analysis report

    Args:
        analysis_data (Dict): Combined analysis data from multiple sources
        report_type (str): Type of report ('full', 'summary', 'executive')

    Returns:
        Dict: Comprehensive report data, or a dict with an 'error' key on failure
    """
    try:
        # Choose the section builder; unknown types fall back to the full report.
        builders = {
            'executive': self._create_executive_summary,
            'summary': self._create_summary_report,
        }
        build_sections = builders.get(report_type, self._create_full_report)

        report = {
            'report_metadata': {
                'generated_at': datetime.now().isoformat(),
                'report_type': report_type,
                'generator': 'GEO SEO AI Optimizer',
                'version': '1.0',
            }
        }
        report.update(build_sections(analysis_data))
        return report

    except Exception as e:
        return {'error': f"Report creation failed: {str(e)}"}
|
| 191 |
+
|
| 192 |
+
def export_batch_results(self, batch_results: List[Dict[str, Any]],
                        batch_metadata: Dict[str, Any],
                        format_type: str = 'xlsx') -> Union[str, bytes, Dict[str, Any]]:
    """
    Export batch analysis results

    Args:
        batch_results (List[Dict]): List of batch analysis results
        batch_metadata (Dict): Metadata about the batch process
        format_type (str): Export format ('xlsx', 'json', 'csv'; anything
            else falls back to JSON)

    Returns:
        Union[str, bytes, Dict]: Exported batch data, or a dict with an
        'error' key if the export fails
    """
    try:
        payload = {
            'batch_metadata': batch_metadata,
            'batch_results': batch_results,
            'batch_summary': self._create_batch_summary(batch_results),
            'export_timestamp': datetime.now().isoformat(),
        }

        fmt = format_type.lower()
        if fmt == 'xlsx':
            return self._export_batch_excel(payload)
        if fmt == 'csv':
            return self._export_batch_csv(payload)
        # 'json' and unknown formats both serialize to JSON text.
        return json.dumps(payload, indent=2, ensure_ascii=False)

    except Exception as e:
        return {'error': f"Batch export failed: {str(e)}"}
|
| 225 |
+
|
| 226 |
+
def create_export_package(self, analysis_data: Dict[str, Any],
                         package_name: str = "geo_analysis") -> bytes:
    """
    Create a ZIP package with multiple export formats.

    The archive is assembled entirely in memory (io.BytesIO) instead of
    round-tripping through a temporary directory on disk, and contains:
    a JSON dump of the data, an HTML report and CSV table (only when
    'geo_results' is present), and a README.

    Args:
        analysis_data (Dict): Analysis data to package
        package_name (str): Base name for the files inside the archive

    Returns:
        bytes: ZIP file content

    Raises:
        Exception: If any packaging step fails
    """
    try:
        buffer = io.BytesIO()

        with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
            # Always include the raw data as JSON.
            json_data = json.dumps(analysis_data, indent=2, ensure_ascii=False)
            archive.writestr(f"{package_name}.json", json_data)

            # HTML report and CSV table only apply to GEO analysis payloads.
            if 'geo_results' in analysis_data:
                html_data = self._export_geo_html(analysis_data)
                archive.writestr(f"{package_name}_report.html", html_data)

                csv_data = self._export_geo_csv(analysis_data)
                archive.writestr(f"{package_name}_data.csv", csv_data)

            # Human-readable description of the package contents.
            readme_content = self._generate_package_readme(analysis_data)
            archive.writestr("README.txt", readme_content)

        # getvalue() after the ZipFile closes so the central directory is written.
        return buffer.getvalue()

    except Exception as e:
        raise Exception(f"Package creation failed: {str(e)}")
|
| 268 |
+
|
| 269 |
+
def _prepare_geo_export_data(self, geo_results: List[Dict[str, Any]], website_url: str) -> Dict[str, Any]:
|
| 270 |
+
"""Prepare GEO data for export"""
|
| 271 |
+
try:
|
| 272 |
+
# Calculate aggregate metrics
|
| 273 |
+
valid_results = [r for r in geo_results if 'geo_scores' in r and not r.get('error')]
|
| 274 |
+
|
| 275 |
+
if not valid_results:
|
| 276 |
+
return {
|
| 277 |
+
'error': 'No valid GEO results to export',
|
| 278 |
+
'website_url': website_url,
|
| 279 |
+
'export_timestamp': datetime.now().isoformat()
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
# Aggregate scores
|
| 283 |
+
all_scores = {}
|
| 284 |
+
for result in valid_results:
|
| 285 |
+
for metric, score in result.get('geo_scores', {}).items():
|
| 286 |
+
if metric not in all_scores:
|
| 287 |
+
all_scores[metric] = []
|
| 288 |
+
all_scores[metric].append(score)
|
| 289 |
+
|
| 290 |
+
avg_scores = {metric: sum(scores) / len(scores) for metric, scores in all_scores.items()}
|
| 291 |
+
overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
|
| 292 |
+
|
| 293 |
+
# Collect recommendations
|
| 294 |
+
all_recommendations = []
|
| 295 |
+
all_opportunities = []
|
| 296 |
+
|
| 297 |
+
for result in valid_results:
|
| 298 |
+
all_recommendations.extend(result.get('recommendations', []))
|
| 299 |
+
all_opportunities.extend(result.get('optimization_opportunities', []))
|
| 300 |
+
|
| 301 |
+
# Remove duplicates
|
| 302 |
+
unique_recommendations = list(set(all_recommendations))
|
| 303 |
+
|
| 304 |
+
return {
|
| 305 |
+
'website_analysis': {
|
| 306 |
+
'url': website_url,
|
| 307 |
+
'analysis_date': datetime.now().isoformat(),
|
| 308 |
+
'pages_analyzed': len(valid_results),
|
| 309 |
+
'overall_geo_score': round(overall_avg, 2)
|
| 310 |
+
},
|
| 311 |
+
'aggregate_scores': avg_scores,
|
| 312 |
+
'individual_page_results': valid_results,
|
| 313 |
+
'recommendations': unique_recommendations[:10], # Top 10
|
| 314 |
+
'optimization_opportunities': all_opportunities,
|
| 315 |
+
'performance_insights': self._generate_performance_insights(avg_scores, overall_avg),
|
| 316 |
+
'export_metadata': {
|
| 317 |
+
'exported_by': 'GEO SEO AI Optimizer',
|
| 318 |
+
'export_timestamp': datetime.now().isoformat(),
|
| 319 |
+
'data_format': 'GEO Analysis Results v1.0'
|
| 320 |
+
}
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
except Exception as e:
|
| 324 |
+
return {'error': f"Data preparation failed: {str(e)}"}
|
| 325 |
+
|
| 326 |
+
def _prepare_enhancement_export_data(self, enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
|
| 327 |
+
"""Prepare content enhancement data for export"""
|
| 328 |
+
try:
|
| 329 |
+
scores = enhancement_result.get('scores', {})
|
| 330 |
+
|
| 331 |
+
return {
|
| 332 |
+
'content_analysis': {
|
| 333 |
+
'analysis_date': datetime.now().isoformat(),
|
| 334 |
+
'original_content_length': enhancement_result.get('original_length', 0),
|
| 335 |
+
'original_word_count': enhancement_result.get('original_word_count', 0),
|
| 336 |
+
'analysis_type': enhancement_result.get('optimization_type', 'standard')
|
| 337 |
+
},
|
| 338 |
+
'performance_scores': {
|
| 339 |
+
'clarity': scores.get('clarity', 0),
|
| 340 |
+
'structure': scores.get('structuredness', 0),
|
| 341 |
+
'answerability': scores.get('answerability', 0),
|
| 342 |
+
'overall_average': sum(scores.values()) / len(scores) if scores else 0
|
| 343 |
+
},
|
| 344 |
+
'optimization_results': {
|
| 345 |
+
'keywords_identified': enhancement_result.get('keywords', []),
|
| 346 |
+
'optimized_content': enhancement_result.get('optimized_text', ''),
|
| 347 |
+
'improvements_made': enhancement_result.get('optimization_suggestions', []),
|
| 348 |
+
'analyze_only': enhancement_result.get('analyze_only', False)
|
| 349 |
+
},
|
| 350 |
+
'export_metadata': {
|
| 351 |
+
'exported_by': 'GEO SEO AI Optimizer',
|
| 352 |
+
'export_timestamp': datetime.now().isoformat(),
|
| 353 |
+
'data_format': 'Content Enhancement Results v1.0'
|
| 354 |
+
}
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
except Exception as e:
|
| 358 |
+
return {'error': f"Enhancement data preparation failed: {str(e)}"}
|
| 359 |
+
|
| 360 |
+
def _export_geo_json(self, data: Dict[str, Any]) -> str:
|
| 361 |
+
"""Export GEO data as JSON"""
|
| 362 |
+
return json.dumps(data, indent=2, ensure_ascii=False)
|
| 363 |
+
|
| 364 |
+
def _export_geo_csv(self, data: Dict[str, Any]) -> str:
|
| 365 |
+
"""Export GEO data as CSV"""
|
| 366 |
+
try:
|
| 367 |
+
output = io.StringIO()
|
| 368 |
+
|
| 369 |
+
# Write aggregate scores
|
| 370 |
+
writer = csv.writer(output)
|
| 371 |
+
writer.writerow(['GEO Analysis Results'])
|
| 372 |
+
writer.writerow(['Website:', data.get('website_analysis', {}).get('url', 'Unknown')])
|
| 373 |
+
writer.writerow(['Analysis Date:', data.get('website_analysis', {}).get('analysis_date', 'Unknown')])
|
| 374 |
+
writer.writerow(['Overall Score:', data.get('website_analysis', {}).get('overall_geo_score', 0)])
|
| 375 |
+
writer.writerow([])
|
| 376 |
+
|
| 377 |
+
# Write aggregate scores
|
| 378 |
+
writer.writerow(['Metric', 'Score'])
|
| 379 |
+
for metric, score in data.get('aggregate_scores', {}).items():
|
| 380 |
+
writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
|
| 381 |
+
|
| 382 |
+
writer.writerow([])
|
| 383 |
+
writer.writerow(['Recommendations'])
|
| 384 |
+
for i, rec in enumerate(data.get('recommendations', []), 1):
|
| 385 |
+
writer.writerow([f"{i}.", rec])
|
| 386 |
+
|
| 387 |
+
# Individual page results
|
| 388 |
+
if data.get('individual_page_results'):
|
| 389 |
+
writer.writerow([])
|
| 390 |
+
writer.writerow(['Individual Page Results'])
|
| 391 |
+
|
| 392 |
+
# Header for page results
|
| 393 |
+
first_result = data['individual_page_results'][0]
|
| 394 |
+
if 'geo_scores' in first_result:
|
| 395 |
+
headers = ['Page Index', 'Page URL', 'Page Title'] + list(first_result['geo_scores'].keys())
|
| 396 |
+
writer.writerow(headers)
|
| 397 |
+
|
| 398 |
+
for i, result in enumerate(data['individual_page_results']):
|
| 399 |
+
page_data = result.get('page_data', {})
|
| 400 |
+
scores = result.get('geo_scores', {})
|
| 401 |
+
|
| 402 |
+
row = [
|
| 403 |
+
i + 1,
|
| 404 |
+
page_data.get('url', 'Unknown'),
|
| 405 |
+
page_data.get('title', 'Unknown')
|
| 406 |
+
] + [round(scores.get(metric, 0), 2) for metric in headers[3:]]
|
| 407 |
+
|
| 408 |
+
writer.writerow(row)
|
| 409 |
+
|
| 410 |
+
return output.getvalue()
|
| 411 |
+
|
| 412 |
+
except Exception as e:
|
| 413 |
+
return f"CSV export error: {str(e)}"
|
| 414 |
+
|
| 415 |
+
def _export_geo_html(self, data: Dict[str, Any]) -> str:
    """Export GEO data as a self-contained HTML report.

    Builds a single HTML page (inline CSS, no external assets) with a
    header, summary cards, per-metric score bars, and the recommendation
    list. On any failure a minimal error page is returned instead of
    raising.
    """
    try:
        website_info = data.get('website_analysis', {})
        scores = data.get('aggregate_scores', {})
        recommendations = data.get('recommendations', [])

        # Static page skeleton; double braces escape literal CSS braces
        # inside the f-string.
        html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>GEO Analysis Report - {website_info.get('url', 'Website')}</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }}
        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 30px;
            border-radius: 10px;
            margin-bottom: 30px;
            text-align: center;
        }}
        .header h1 {{
            margin: 0;
            font-size: 2.5em;
        }}
        .summary-cards {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }}
        .card {{
            background: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            text-align: center;
        }}
        .card h3 {{
            margin-top: 0;
            color: #667eea;
        }}
        .score {{
            font-size: 2em;
            font-weight: bold;
            color: #333;
        }}
        .scores-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }}
        .score-item {{
            background: white;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            display: flex;
            justify-content: space-between;
            align-items: center;
        }}
        .score-bar {{
            width: 100px;
            height: 10px;
            background: #e0e0e0;
            border-radius: 5px;
            overflow: hidden;
        }}
        .score-fill {{
            height: 100%;
            background: linear-gradient(90deg, #ff6b6b, #ffa500, #4ecdc4);
            transition: width 0.3s ease;
        }}
        .recommendations {{
            background: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }}
        .recommendations h2 {{
            color: #667eea;
            border-bottom: 2px solid #667eea;
            padding-bottom: 10px;
        }}
        .rec-item {{
            padding: 10px 0;
            border-bottom: 1px solid #eee;
        }}
        .footer {{
            text-align: center;
            color: #666;
            margin-top: 40px;
            padding-top: 20px;
            border-top: 1px solid #ddd;
        }}
    </style>
</head>
<body>
    <div class="header">
        <h1>🚀 GEO Analysis Report</h1>
        <p>Generative Engine Optimization Performance Analysis</p>
        <p><strong>Website:</strong> {website_info.get('url', 'Not specified')}</p>
        <p><strong>Analysis Date:</strong> {website_info.get('analysis_date', 'Not specified')}</p>
    </div>

    <div class="summary-cards">
        <div class="card">
            <h3>Overall GEO Score</h3>
            <div class="score">{website_info.get('overall_geo_score', 0)}/10</div>
        </div>
        <div class="card">
            <h3>Pages Analyzed</h3>
            <div class="score">{website_info.get('pages_analyzed', 0)}</div>
        </div>
        <div class="card">
            <h3>Recommendations</h3>
            <div class="score">{len(recommendations)}</div>
        </div>
    </div>

    <h2>📊 Detailed GEO Metrics</h2>
    <div class="scores-grid">
"""

        # One score bar per metric; scores are on a 0-10 scale.
        for metric, score in scores.items():
            metric_display = metric.replace('_', ' ').title()
            score_percentage = min(score * 10, 100)  # Convert to percentage

            html_content += f"""
        <div class="score-item">
            <div>
                <strong>{metric_display}</strong><br>
                <span style="color: #666;">{score:.1f}/10</span>
            </div>
            <div class="score-bar">
                <div class="score-fill" style="width: {score_percentage}%;"></div>
            </div>
        </div>
"""

        html_content += """
    </div>

    <div class="recommendations">
        <h2>💡 Optimization Recommendations</h2>
"""

        # Numbered recommendation list.
        for i, rec in enumerate(recommendations, 1):
            html_content += f'<div class="rec-item"><strong>{i}.</strong> {rec}</div>'

        html_content += f"""
    </div>

    <div class="footer">
        <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        <p>This report provides AI-first SEO optimization insights for better generative engine performance.</p>
    </div>
</body>
</html>
"""

        return html_content

    except Exception as e:
        # Fallback page so callers always receive renderable HTML.
        return f"<html><body><h1>HTML Export Error</h1><p>{str(e)}</p></body></html>"
|
| 595 |
+
|
| 596 |
+
def _export_geo_excel(self, data: Dict[str, Any]) -> bytes:
    """Export GEO analysis data as an Excel workbook.

    Builds up to four sheets from the analysis payload:
    ``Summary`` (site-level metadata), ``GEO Scores`` (aggregate metric
    scores with a qualitative performance label), ``Recommendations``
    (prioritized, categorized), and ``Individual Pages`` (one row per
    analyzed page with its per-metric scores).

    Args:
        data: Aggregated GEO analysis results, expected to contain
            'website_analysis', 'aggregate_scores', 'recommendations'
            and optionally 'individual_page_results'.

    Returns:
        The xlsx file content as bytes; on any failure, a UTF-8 encoded
        plain-text error report instead (callers should not assume the
        bytes are always a valid workbook).
    """
    try:
        output = io.BytesIO()

        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            # Summary sheet: key site-level facts in a two-column layout.
            website = data.get('website_analysis', {})
            summary_data = {
                'Metric': ['Website URL', 'Analysis Date', 'Pages Analyzed', 'Overall Score'],
                'Value': [
                    website.get('url', 'Unknown'),
                    website.get('analysis_date', 'Unknown'),
                    website.get('pages_analyzed', 0),
                    website.get('overall_geo_score', 0)
                ]
            }
            pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

            # Scores sheet: one row per aggregate metric, with a
            # human-readable performance bucket.
            scores_data = [
                {
                    'Metric': metric.replace('_', ' ').title(),
                    'Score': round(score, 2),
                    'Performance': self._get_performance_level(score)
                }
                for metric, score in data.get('aggregate_scores', {}).items()
            ]
            pd.DataFrame(scores_data).to_excel(writer, sheet_name='GEO Scores', index=False)

            # Recommendations sheet: priority is simply the 1-based
            # position in the recommendations list.
            rec_data = [
                {
                    'Priority': i,
                    'Recommendation': rec,
                    'Category': self._categorize_recommendation(rec)
                }
                for i, rec in enumerate(data.get('recommendations', []), 1)
            ]
            if rec_data:
                pd.DataFrame(rec_data).to_excel(writer, sheet_name='Recommendations', index=False)

            # Individual pages sheet: base page fields plus every GEO
            # metric score as its own column.
            if data.get('individual_page_results'):
                pages_data = []
                for i, result in enumerate(data['individual_page_results']):
                    page_data = result.get('page_data', {})
                    scores = result.get('geo_scores', {})

                    page_row = {
                        'Page_Index': i + 1,
                        'URL': page_data.get('url', 'Unknown'),
                        'Title': page_data.get('title', 'Unknown'),
                        'Word_Count': page_data.get('word_count', 0)
                    }

                    for metric, score in scores.items():
                        page_row[metric.replace('_', ' ').title()] = round(score, 2)

                    pages_data.append(page_row)

                pd.DataFrame(pages_data).to_excel(writer, sheet_name='Individual Pages', index=False)

        output.seek(0)
        return output.getvalue()

    except Exception as e:
        # Fallback: return a text error report as bytes. default=str keeps
        # json.dumps from raising a secondary TypeError when the payload
        # contains non-JSON-serializable objects (e.g. datetimes).
        error_content = f"Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2, default=str)}"
        return error_content.encode('utf-8')
|
| 666 |
+
|
| 667 |
+
def _export_enhancement_html(self, data: Dict[str, Any]) -> str:
    """Export content enhancement results as HTML.

    Renders a self-contained HTML report (inline CSS, no external
    assets) with a header, four score cards, identified keywords, the
    optimized content (when present and not analyze-only), and the list
    of improvements made.

    Args:
        data: Enhancement results with 'content_analysis',
            'performance_scores' and 'optimization_results' sections.

    Returns:
        A complete HTML document as a string; on failure a minimal HTML
        error page containing the exception message.
    """
    try:
        # The three sections of the payload this report draws from.
        analysis = data.get('content_analysis', {})
        scores = data.get('performance_scores', {})
        optimization = data.get('optimization_results', {})

        # NOTE: doubled braces ({{ }}) inside the f-string are literal
        # CSS braces; single braces interpolate Python values.
        html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Content Enhancement Report</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1000px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
        }}
        .header {{
            background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
            color: white;
            padding: 30px;
            border-radius: 10px;
            margin-bottom: 30px;
            text-align: center;
        }}
        .scores {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }}
        .score-card {{
            background: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            text-align: center;
        }}
        .content-section {{
            background: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }}
        .keywords {{
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
            margin-top: 15px;
        }}
        .keyword {{
            background: #e9ecef;
            padding: 5px 10px;
            border-radius: 20px;
            font-size: 0.9em;
        }}
        .optimized-content {{
            background: #f8f9fa;
            padding: 20px;
            border-left: 4px solid #28a745;
            border-radius: 5px;
            font-style: italic;
        }}
    </style>
</head>
<body>
    <div class="header">
        <h1>🔧 Content Enhancement Report</h1>
        <p>AI-Optimized Content Analysis Results</p>
        <p><strong>Analysis Date:</strong> {analysis.get('analysis_date', 'Unknown')}</p>
    </div>

    <div class="scores">
        <div class="score-card">
            <h3>Clarity Score</h3>
            <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                {scores.get('clarity', 0):.1f}/10
            </div>
        </div>
        <div class="score-card">
            <h3>Structure Score</h3>
            <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                {scores.get('structure', 0):.1f}/10
            </div>
        </div>
        <div class="score-card">
            <h3>Answerability Score</h3>
            <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                {scores.get('answerability', 0):.1f}/10
            </div>
        </div>
        <div class="score-card">
            <h3>Overall Average</h3>
            <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                {scores.get('overall_average', 0):.1f}/10
            </div>
        </div>
    </div>

    <div class="content-section">
        <h2>🔑 Identified Keywords</h2>
        <div class="keywords">
            {' '.join([f'<span class="keyword">{keyword}</span>' for keyword in optimization.get('keywords_identified', [])])}
        </div>
    </div>

    {'<div class="content-section"><h2>✨ Optimized Content</h2><div class="optimized-content">' + optimization.get('optimized_content', '') + '</div></div>' if optimization.get('optimized_content') and not optimization.get('analyze_only') else ''}

    <div class="content-section">
        <h2>💡 Improvements Made</h2>
        <ul>
            {' '.join([f'<li>{improvement}</li>' for improvement in optimization.get('improvements_made', [])])}
        </ul>
    </div>

    <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
        <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>
</body>
</html>
"""

        return html_content

    except Exception as e:
        # Best-effort fallback: still return valid HTML so downstream
        # file writers/viewers do not break.
        return f"<html><body><h1>Enhancement HTML Export Error</h1><p>{str(e)}</p></body></html>"
|
| 801 |
+
|
| 802 |
+
def _export_enhancement_csv(self, data: Dict[str, Any]) -> str:
|
| 803 |
+
"""Export content enhancement results as CSV"""
|
| 804 |
+
try:
|
| 805 |
+
output = io.StringIO()
|
| 806 |
+
writer = csv.writer(output)
|
| 807 |
+
|
| 808 |
+
# Header information
|
| 809 |
+
analysis = data.get('content_analysis', {})
|
| 810 |
+
scores = data.get('performance_scores', {})
|
| 811 |
+
optimization = data.get('optimization_results', {})
|
| 812 |
+
|
| 813 |
+
writer.writerow(['Content Enhancement Analysis Report'])
|
| 814 |
+
writer.writerow(['Analysis Date:', analysis.get('analysis_date', 'Unknown')])
|
| 815 |
+
writer.writerow(['Original Content Length:', analysis.get('original_content_length', 0)])
|
| 816 |
+
writer.writerow(['Original Word Count:', analysis.get('original_word_count', 0)])
|
| 817 |
+
writer.writerow([])
|
| 818 |
+
|
| 819 |
+
# Performance scores
|
| 820 |
+
writer.writerow(['Performance Scores'])
|
| 821 |
+
writer.writerow(['Metric', 'Score'])
|
| 822 |
+
for metric, score in scores.items():
|
| 823 |
+
writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
|
| 824 |
+
|
| 825 |
+
writer.writerow([])
|
| 826 |
+
writer.writerow(['Keywords Identified'])
|
| 827 |
+
for keyword in optimization.get('keywords_identified', []):
|
| 828 |
+
writer.writerow([keyword])
|
| 829 |
+
|
| 830 |
+
writer.writerow([])
|
| 831 |
+
writer.writerow(['Improvements Made'])
|
| 832 |
+
for improvement in optimization.get('improvements_made', []):
|
| 833 |
+
writer.writerow([improvement])
|
| 834 |
+
|
| 835 |
+
return output.getvalue()
|
| 836 |
+
|
| 837 |
+
except Exception as e:
|
| 838 |
+
return f"Enhancement CSV export error: {str(e)}"
|
| 839 |
+
|
| 840 |
+
def _export_qa_html(self, data: Dict[str, Any]) -> str:
    """Export Q&A results as HTML.

    Renders a self-contained HTML report with session summary cards
    (total questions, successful answers, average response length)
    followed by one styled block per Q&A interaction, including up to
    three source excerpts when available.

    Args:
        data: Q&A payload with a 'qa_session' section (holding
            'interactions') and a 'summary' section.

    Returns:
        A complete HTML document as a string; on failure a minimal HTML
        error page containing the exception message.
    """
    try:
        session = data.get('qa_session', {})
        summary = data.get('summary', {})
        interactions = session.get('interactions', [])

        # Static shell + summary cards. Doubled braces are literal CSS.
        html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Q&A Session Report</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1000px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
        }}
        .header {{
            background: linear-gradient(135deg, #6f42c1 0%, #e83e8c 100%);
            color: white;
            padding: 30px;
            border-radius: 10px;
            margin-bottom: 30px;
            text-align: center;
        }}
        .summary {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }}
        .summary-card {{
            background: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            text-align: center;
        }}
        .qa-item {{
            background: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }}
        .question {{
            background: #e9ecef;
            padding: 15px;
            border-left: 4px solid #6f42c1;
            border-radius: 5px;
            margin-bottom: 15px;
        }}
        .answer {{
            padding: 15px;
            border-left: 4px solid #28a745;
            border-radius: 5px;
            background: #f8f9fa;
        }}
        .sources {{
            margin-top: 15px;
            padding: 10px;
            background: #fff3cd;
            border-radius: 5px;
            font-size: 0.9em;
        }}
    </style>
</head>
<body>
    <div class="header">
        <h1>💬 Q&A Session Report</h1>
        <p>Document Question & Answer Analysis</p>
        <p><strong>Session Date:</strong> {session.get('session_date', 'Unknown')}</p>
    </div>

    <div class="summary">
        <div class="summary-card">
            <h3>Total Questions</h3>
            <div style="font-size: 2em; font-weight: bold; color: #6f42c1;">
                {session.get('total_questions', 0)}
            </div>
        </div>
        <div class="summary-card">
            <h3>Successful Answers</h3>
            <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                {summary.get('successful_answers', 0)}
            </div>
        </div>
        <div class="summary-card">
            <h3>Avg Response Length</h3>
            <div style="font-size: 2em; font-weight: bold; color: #17a2b8;">
                {summary.get('average_response_length', 0):.0f}
            </div>
        </div>
    </div>

    <h2>📝 Q&A Interactions</h2>
"""

        # One block per interaction. Answer may live under either
        # 'result' or 'answer' depending on the producer.
        for i, interaction in enumerate(interactions, 1):
            question = interaction.get('query', 'No question')
            answer = interaction.get('result', interaction.get('answer', 'No answer'))
            sources = interaction.get('sources', [])

            html_content += f"""
    <div class="qa-item">
        <h3>Question {i}</h3>
        <div class="question">
            <strong>Q:</strong> {question}
        </div>
        <div class="answer">
            <strong>A:</strong> {answer}
        </div>
"""

        # Append up to three source excerpts, each truncated to 200 chars.
            if sources:
                html_content += '<div class="sources"><strong>Sources:</strong><ul>'
                for source in sources[:3]:  # Limit to first 3 sources
                    content_preview = source.get('content', '')[:200] + '...' if len(source.get('content', '')) > 200 else source.get('content', '')
                    html_content += f'<li>{content_preview}</li>'
                html_content += '</ul></div>'

            html_content += '</div>'

        # Footer with generation timestamp.
        html_content += f"""

    <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
        <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>
</body>
</html>
"""

        return html_content

    except Exception as e:
        # Best-effort fallback: still emit valid HTML on error.
        return f"<html><body><h1>Q&A HTML Export Error</h1><p>{str(e)}</p></body></html>"
|
| 984 |
+
|
| 985 |
+
def _export_qa_csv(self, data: Dict[str, Any]) -> str:
|
| 986 |
+
"""Export Q&A results as CSV"""
|
| 987 |
+
try:
|
| 988 |
+
output = io.StringIO()
|
| 989 |
+
writer = csv.writer(output)
|
| 990 |
+
|
| 991 |
+
session = data.get('qa_session', {})
|
| 992 |
+
summary = data.get('summary', {})
|
| 993 |
+
interactions = session.get('interactions', [])
|
| 994 |
+
|
| 995 |
+
# Header
|
| 996 |
+
writer.writerow(['Q&A Session Report'])
|
| 997 |
+
writer.writerow(['Session Date:', session.get('session_date', 'Unknown')])
|
| 998 |
+
writer.writerow(['Total Questions:', session.get('total_questions', 0)])
|
| 999 |
+
writer.writerow(['Successful Answers:', summary.get('successful_answers', 0)])
|
| 1000 |
+
writer.writerow([])
|
| 1001 |
+
|
| 1002 |
+
# Q&A data
|
| 1003 |
+
writer.writerow(['Question Index', 'Question', 'Answer', 'Has Sources', 'Answer Length'])
|
| 1004 |
+
|
| 1005 |
+
for i, interaction in enumerate(interactions, 1):
|
| 1006 |
+
question = interaction.get('query', 'No question')
|
| 1007 |
+
answer = interaction.get('result', interaction.get('answer', 'No answer'))
|
| 1008 |
+
has_sources = 'Yes' if interaction.get('sources') else 'No'
|
| 1009 |
+
answer_length = len(answer) if answer else 0
|
| 1010 |
+
|
| 1011 |
+
writer.writerow([i, question, answer, has_sources, answer_length])
|
| 1012 |
+
|
| 1013 |
+
return output.getvalue()
|
| 1014 |
+
|
| 1015 |
+
except Exception as e:
|
| 1016 |
+
return f"Q&A CSV export error: {str(e)}"
|
| 1017 |
+
|
| 1018 |
+
def _export_batch_excel(self, data: Dict[str, Any]) -> bytes:
    """Export batch results as an Excel workbook.

    Builds three sheets: ``Batch Metadata`` (property/value pairs),
    ``Batch Summary`` (metric/value pairs), and — when results exist —
    ``Batch Results`` with each result dict flattened into columns.

    Args:
        data: Batch payload with 'batch_metadata', 'batch_summary'
            and 'batch_results' keys.

    Returns:
        The xlsx file content as bytes; on any failure, a UTF-8 encoded
        plain-text error report instead.
    """
    try:
        output = io.BytesIO()

        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            # Batch metadata sheet: one row per metadata key.
            metadata = data.get('batch_metadata', {})
            metadata_df = pd.DataFrame([
                {'Property': k, 'Value': v} for k, v in metadata.items()
            ])
            metadata_df.to_excel(writer, sheet_name='Batch Metadata', index=False)

            # Batch summary sheet: one row per summary metric.
            summary = data.get('batch_summary', {})
            summary_df = pd.DataFrame([
                {'Metric': k, 'Value': v} for k, v in summary.items()
            ])
            summary_df.to_excel(writer, sheet_name='Batch Summary', index=False)

            # Individual results sheet: flatten each nested result dict
            # so it fits a tabular layout.
            results = data.get('batch_results', [])
            if results:
                flattened_results = []
                for i, result in enumerate(results):
                    flat_result = {'Batch_Index': i}
                    self._flatten_dict(result, flat_result)
                    flattened_results.append(flat_result)

                results_df = pd.DataFrame(flattened_results)
                results_df.to_excel(writer, sheet_name='Batch Results', index=False)

        output.seek(0)
        return output.getvalue()

    except Exception as e:
        # Fallback: text error report as bytes. default=str prevents
        # json.dumps from raising a secondary TypeError on
        # non-JSON-serializable payload values.
        error_content = f"Batch Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2, default=str)}"
        return error_content.encode('utf-8')
|
| 1057 |
+
|
| 1058 |
+
def _export_batch_csv(self, data: Dict[str, Any]) -> str:
|
| 1059 |
+
"""Export batch results as CSV"""
|
| 1060 |
+
try:
|
| 1061 |
+
output = io.StringIO()
|
| 1062 |
+
writer = csv.writer(output)
|
| 1063 |
+
|
| 1064 |
+
# Batch metadata
|
| 1065 |
+
metadata = data.get('batch_metadata', {})
|
| 1066 |
+
writer.writerow(['Batch Analysis Results'])
|
| 1067 |
+
writer.writerow(['Export Timestamp:', data.get('export_timestamp', 'Unknown')])
|
| 1068 |
+
writer.writerow([])
|
| 1069 |
+
|
| 1070 |
+
writer.writerow(['Batch Metadata'])
|
| 1071 |
+
for key, value in metadata.items():
|
| 1072 |
+
writer.writerow([key, value])
|
| 1073 |
+
|
| 1074 |
+
writer.writerow([])
|
| 1075 |
+
|
| 1076 |
+
# Batch summary
|
| 1077 |
+
summary = data.get('batch_summary', {})
|
| 1078 |
+
writer.writerow(['Batch Summary'])
|
| 1079 |
+
for key, value in summary.items():
|
| 1080 |
+
writer.writerow([key, value])
|
| 1081 |
+
|
| 1082 |
+
writer.writerow([])
|
| 1083 |
+
|
| 1084 |
+
# Individual results (simplified)
|
| 1085 |
+
results = data.get('batch_results', [])
|
| 1086 |
+
if results:
|
| 1087 |
+
writer.writerow(['Individual Results'])
|
| 1088 |
+
writer.writerow(['Index', 'Status', 'Summary'])
|
| 1089 |
+
|
| 1090 |
+
for i, result in enumerate(results):
|
| 1091 |
+
status = 'Success' if not result.get('error') else 'Error'
|
| 1092 |
+
summary_text = str(result)[:100] + '...' if len(str(result)) > 100 else str(result)
|
| 1093 |
+
writer.writerow([i, status, summary_text])
|
| 1094 |
+
|
| 1095 |
+
return output.getvalue()
|
| 1096 |
+
|
| 1097 |
+
except Exception as e:
|
| 1098 |
+
return f"Batch CSV export error: {str(e)}"
|
| 1099 |
+
|
| 1100 |
+
def _export_geo_pdf(self, data: Dict[str, Any]) -> bytes:
|
| 1101 |
+
"""Export GEO data as PDF (placeholder - would need reportlab)"""
|
| 1102 |
+
try:
|
| 1103 |
+
# For now, return HTML content as bytes
|
| 1104 |
+
# In a full implementation, you'd use reportlab or weasyprint
|
| 1105 |
+
html_content = self._export_geo_html(data)
|
| 1106 |
+
return html_content.encode('utf-8')
|
| 1107 |
+
|
| 1108 |
+
except Exception as e:
|
| 1109 |
+
error_content = f"PDF export not fully implemented. Error: {str(e)}"
|
| 1110 |
+
return error_content.encode('utf-8')
|
| 1111 |
+
|
| 1112 |
+
def _create_executive_summary(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1113 |
+
"""Create executive summary report"""
|
| 1114 |
+
try:
|
| 1115 |
+
geo_results = analysis_data.get('geo_results', [])
|
| 1116 |
+
enhancement_results = analysis_data.get('enhancement_results', {})
|
| 1117 |
+
qa_results = analysis_data.get('qa_results', [])
|
| 1118 |
+
|
| 1119 |
+
# Calculate key metrics
|
| 1120 |
+
overall_performance = self._calculate_overall_performance(analysis_data)
|
| 1121 |
+
|
| 1122 |
+
return {
|
| 1123 |
+
'executive_summary': {
|
| 1124 |
+
'overall_performance_score': overall_performance,
|
| 1125 |
+
'key_findings': self._extract_key_findings(analysis_data),
|
| 1126 |
+
'priority_recommendations': self._get_priority_recommendations(analysis_data),
|
| 1127 |
+
'roi_potential': self._estimate_roi_potential(overall_performance),
|
| 1128 |
+
'implementation_timeline': self._suggest_implementation_timeline(analysis_data),
|
| 1129 |
+
'resource_requirements': self._estimate_resource_requirements(analysis_data)
|
| 1130 |
+
}
|
| 1131 |
+
}
|
| 1132 |
+
|
| 1133 |
+
except Exception as e:
|
| 1134 |
+
return {'error': f"Executive summary creation failed: {str(e)}"}
|
| 1135 |
+
|
| 1136 |
+
def _create_summary_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1137 |
+
"""Create summary report"""
|
| 1138 |
+
try:
|
| 1139 |
+
return {
|
| 1140 |
+
'summary_report': {
|
| 1141 |
+
'analysis_overview': self._create_analysis_overview(analysis_data),
|
| 1142 |
+
'performance_metrics': self._summarize_performance_metrics(analysis_data),
|
| 1143 |
+
'improvement_opportunities': self._identify_improvement_opportunities(analysis_data),
|
| 1144 |
+
'competitive_position': self._assess_competitive_position(analysis_data),
|
| 1145 |
+
'next_steps': self._recommend_next_steps(analysis_data)
|
| 1146 |
+
}
|
| 1147 |
+
}
|
| 1148 |
+
|
| 1149 |
+
except Exception as e:
|
| 1150 |
+
return {'error': f"Summary report creation failed: {str(e)}"}
|
| 1151 |
+
|
| 1152 |
+
def _create_full_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1153 |
+
"""Create full detailed report"""
|
| 1154 |
+
try:
|
| 1155 |
+
return {
|
| 1156 |
+
'full_report': {
|
| 1157 |
+
'executive_summary': self._create_executive_summary(analysis_data).get('executive_summary', {}),
|
| 1158 |
+
'detailed_analysis': {
|
| 1159 |
+
'geo_analysis_details': analysis_data.get('geo_results', []),
|
| 1160 |
+
'content_optimization_details': analysis_data.get('enhancement_results', {}),
|
| 1161 |
+
'qa_performance_details': analysis_data.get('qa_results', [])
|
| 1162 |
+
},
|
| 1163 |
+
'methodology': self._document_methodology(),
|
| 1164 |
+
'data_sources': self._document_data_sources(analysis_data),
|
| 1165 |
+
'limitations': self._document_limitations(),
|
| 1166 |
+
'appendices': self._create_appendices(analysis_data)
|
| 1167 |
+
}
|
| 1168 |
+
}
|
| 1169 |
+
|
| 1170 |
+
except Exception as e:
|
| 1171 |
+
return {'error': f"Full report creation failed: {str(e)}"}
|
| 1172 |
+
|
| 1173 |
+
def _create_batch_summary(self, batch_results: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 1174 |
+
"""Create summary of batch processing results"""
|
| 1175 |
+
try:
|
| 1176 |
+
total_items = len(batch_results)
|
| 1177 |
+
successful_items = len([r for r in batch_results if not r.get('error')])
|
| 1178 |
+
failed_items = total_items - successful_items
|
| 1179 |
+
|
| 1180 |
+
return {
|
| 1181 |
+
'total_items': total_items,
|
| 1182 |
+
'successful_items': successful_items,
|
| 1183 |
+
'failed_items': failed_items,
|
| 1184 |
+
'success_rate': (successful_items / total_items * 100) if total_items > 0 else 0,
|
| 1185 |
+
'processing_status': 'Completed',
|
| 1186 |
+
'average_processing_time': self._calculate_avg_processing_time(batch_results),
|
| 1187 |
+
'common_errors': self._identify_common_errors(batch_results)
|
| 1188 |
+
}
|
| 1189 |
+
|
| 1190 |
+
except Exception as e:
|
| 1191 |
+
return {'error': f"Batch summary creation failed: {str(e)}"}
|
| 1192 |
+
|
| 1193 |
+
def _generate_performance_insights(self, scores: Dict[str, float], overall_avg: float) -> List[str]:
|
| 1194 |
+
"""Generate performance insights from scores"""
|
| 1195 |
+
insights = []
|
| 1196 |
+
|
| 1197 |
+
try:
|
| 1198 |
+
# Overall performance insight
|
| 1199 |
+
if overall_avg >= 8.0:
|
| 1200 |
+
insights.append("Excellent overall GEO performance - content is well-optimized for AI search engines")
|
| 1201 |
+
elif overall_avg >= 6.0:
|
| 1202 |
+
insights.append("Good GEO performance with room for improvement in specific areas")
|
| 1203 |
+
elif overall_avg >= 4.0:
|
| 1204 |
+
insights.append("Moderate GEO performance - significant optimization opportunities exist")
|
| 1205 |
+
else:
|
| 1206 |
+
insights.append("Low GEO performance - comprehensive optimization needed")
|
| 1207 |
+
|
| 1208 |
+
# Specific metric insights
|
| 1209 |
+
for metric, score in scores.items():
|
| 1210 |
+
if score < 5.0:
|
| 1211 |
+
metric_name = metric.replace('_', ' ').title()
|
| 1212 |
+
insights.append(f"Low {metric_name} score ({score:.1f}) needs immediate attention")
|
| 1213 |
+
elif score >= 8.5:
|
| 1214 |
+
metric_name = metric.replace('_', ' ').title()
|
| 1215 |
+
insights.append(f"Excellent {metric_name} score ({score:.1f}) - maintain current approach")
|
| 1216 |
+
|
| 1217 |
+
return insights[:5] # Return top 5 insights
|
| 1218 |
+
|
| 1219 |
+
except Exception:
|
| 1220 |
+
return ["Unable to generate performance insights"]
|
| 1221 |
+
|
| 1222 |
+
def _generate_package_readme(self, analysis_data: Dict[str, Any]) -> str:
    """Generate the README text for an export package.

    Returns a static explanatory text (plus a generation timestamp)
    describing the files bundled in the package and how to use them.

    Args:
        analysis_data: Combined analysis payload. NOTE(review): the
            parameter is currently unused — the template references no
            analysis fields; kept for interface symmetry with the other
            report builders.

    Returns:
        The README content, or a short error string on failure.
    """
    try:
        # Only dynamic piece is the timestamp; everything else is fixed.
        readme_content = f"""
GEO SEO AI Optimizer - Analysis Package
======================================

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

This package contains the complete analysis results from the GEO SEO AI Optimizer tool.

Files Included:
- JSON file: Complete raw data in JSON format
- HTML file: Visual report for web viewing
- CSV file: Tabular data for spreadsheet analysis
- README.txt: This file

About GEO (Generative Engine Optimization):
GEO is the practice of optimizing content for AI-powered search engines and
language models. Unlike traditional SEO, GEO focuses on:

- AI search visibility
- Query intent matching
- Conversational readiness
- Citation worthiness
- Semantic richness
- Context completeness

How to Use These Files:
1. Open the HTML file in a web browser for a visual report
2. Import the CSV file into Excel or Google Sheets for analysis
3. Use the JSON file for programmatic processing or integration

For more information about GEO optimization, visit the tool documentation.

Generated by: GEO SEO AI Optimizer v1.0
"""
        return readme_content

    except Exception as e:
        return f"README generation failed: {str(e)}"
|
| 1263 |
+
|
| 1264 |
+
# Helper methods for data processing and analysis
|
| 1265 |
+
|
| 1266 |
+
def _get_performance_level(self, score: float) -> str:
|
| 1267 |
+
"""Get performance level description for a score"""
|
| 1268 |
+
if score >= 8.0:
|
| 1269 |
+
return "Excellent"
|
| 1270 |
+
elif score >= 6.0:
|
| 1271 |
+
return "Good"
|
| 1272 |
+
elif score >= 4.0:
|
| 1273 |
+
return "Fair"
|
| 1274 |
+
else:
|
| 1275 |
+
return "Needs Improvement"
|
| 1276 |
+
|
| 1277 |
+
def _categorize_recommendation(self, recommendation: str) -> str:
|
| 1278 |
+
"""Categorize a recommendation based on content"""
|
| 1279 |
+
rec_lower = recommendation.lower()
|
| 1280 |
+
|
| 1281 |
+
if any(word in rec_lower for word in ['structure', 'heading', 'format']):
|
| 1282 |
+
return "Content Structure"
|
| 1283 |
+
elif any(word in rec_lower for word in ['keyword', 'semantic', 'topic']):
|
| 1284 |
+
return "SEO & Keywords"
|
| 1285 |
+
elif any(word in rec_lower for word in ['clarity', 'readability', 'language']):
|
| 1286 |
+
return "Content Quality"
|
| 1287 |
+
elif any(word in rec_lower for word in ['technical', 'schema', 'markup']):
|
| 1288 |
+
return "Technical SEO"
|
| 1289 |
+
else:
|
| 1290 |
+
return "General"
|
| 1291 |
+
|
| 1292 |
+
def _calculate_avg_response_length(self, qa_results: List[Dict[str, Any]]) -> float:
|
| 1293 |
+
"""Calculate average response length for Q&A results"""
|
| 1294 |
+
try:
|
| 1295 |
+
response_lengths = []
|
| 1296 |
+
for result in qa_results:
|
| 1297 |
+
answer = result.get('result', result.get('answer', ''))
|
| 1298 |
+
if answer and not result.get('error'):
|
| 1299 |
+
response_lengths.append(len(answer))
|
| 1300 |
+
|
| 1301 |
+
return sum(response_lengths) / len(response_lengths) if response_lengths else 0
|
| 1302 |
+
|
| 1303 |
+
except Exception:
|
| 1304 |
+
return 0
|
| 1305 |
+
|
| 1306 |
+
def _extract_common_topics(self, qa_results: List[Dict[str, Any]]) -> List[str]:
|
| 1307 |
+
"""Extract common topics from Q&A results"""
|
| 1308 |
+
try:
|
| 1309 |
+
# Simple topic extraction based on question keywords
|
| 1310 |
+
topics = {}
|
| 1311 |
+
|
| 1312 |
+
for result in qa_results:
|
| 1313 |
+
question = result.get('query', result.get('question', ''))
|
| 1314 |
+
if question:
|
| 1315 |
+
words = question.lower().split()
|
| 1316 |
+
for word in words:
|
| 1317 |
+
if len(word) > 4: # Focus on longer words
|
| 1318 |
+
topics[word] = topics.get(word, 0) + 1
|
| 1319 |
+
|
| 1320 |
+
# Return top 5 most common topics
|
| 1321 |
+
sorted_topics = sorted(topics.items(), key=lambda x: x[1], reverse=True)
|
| 1322 |
+
return [topic for topic, count in sorted_topics[:5]]
|
| 1323 |
+
|
| 1324 |
+
except Exception:
|
| 1325 |
+
return []
|
| 1326 |
+
|
| 1327 |
+
def _flatten_dict(self, d: Dict[str, Any], parent_dict: Dict[str, Any], parent_key: str = '') -> None:
|
| 1328 |
+
"""Flatten nested dictionary for tabular export"""
|
| 1329 |
+
try:
|
| 1330 |
+
for key, value in d.items():
|
| 1331 |
+
new_key = f"{parent_key}_{key}" if parent_key else key
|
| 1332 |
+
|
| 1333 |
+
if isinstance(value, dict):
|
| 1334 |
+
self._flatten_dict(value, parent_dict, new_key)
|
| 1335 |
+
elif isinstance(value, list):
|
| 1336 |
+
parent_dict[new_key] = json.dumps(value) # Convert lists to JSON strings
|
| 1337 |
+
else:
|
| 1338 |
+
parent_dict[new_key] = value
|
| 1339 |
+
|
| 1340 |
+
except Exception:
|
| 1341 |
+
pass # Skip problematic keys
|
| 1342 |
+
|
| 1343 |
+
def _calculate_overall_performance(self, analysis_data: Dict[str, Any]) -> float:
|
| 1344 |
+
"""Calculate overall performance score across all analyses"""
|
| 1345 |
+
try:
|
| 1346 |
+
scores = []
|
| 1347 |
+
|
| 1348 |
+
# GEO scores
|
| 1349 |
+
geo_results = analysis_data.get('geo_results', [])
|
| 1350 |
+
for result in geo_results:
|
| 1351 |
+
if 'geo_scores' in result:
|
| 1352 |
+
geo_score_values = list(result['geo_scores'].values())
|
| 1353 |
+
if geo_score_values:
|
| 1354 |
+
scores.append(sum(geo_score_values) / len(geo_score_values))
|
| 1355 |
+
|
| 1356 |
+
# Enhancement scores
|
| 1357 |
+
enhancement = analysis_data.get('enhancement_results', {})
|
| 1358 |
+
if 'scores' in enhancement:
|
| 1359 |
+
enh_scores = list(enhancement['scores'].values())
|
| 1360 |
+
if enh_scores:
|
| 1361 |
+
scores.append(sum(enh_scores) / len(enh_scores))
|
| 1362 |
+
|
| 1363 |
+
return sum(scores) / len(scores) if scores else 0
|
| 1364 |
+
|
| 1365 |
+
except Exception:
|
| 1366 |
+
return 0
|
| 1367 |
+
|
| 1368 |
+
def _extract_key_findings(self, analysis_data: Dict[str, Any]) -> List[str]:
|
| 1369 |
+
"""Extract key findings from analysis data"""
|
| 1370 |
+
findings = []
|
| 1371 |
+
|
| 1372 |
+
try:
|
| 1373 |
+
# Add findings based on performance scores
|
| 1374 |
+
overall_perf = self._calculate_overall_performance(analysis_data)
|
| 1375 |
+
|
| 1376 |
+
if overall_perf >= 8.0:
|
| 1377 |
+
findings.append("Content demonstrates excellent AI search optimization")
|
| 1378 |
+
elif overall_perf <= 4.0:
|
| 1379 |
+
findings.append("Significant optimization opportunities identified")
|
| 1380 |
+
|
| 1381 |
+
# Add more specific findings based on data
|
| 1382 |
+
geo_results = analysis_data.get('geo_results', [])
|
| 1383 |
+
if geo_results:
|
| 1384 |
+
findings.append(f"Analyzed {len(geo_results)} pages for GEO performance")
|
| 1385 |
+
|
| 1386 |
+
enhancement = analysis_data.get('enhancement_results', {})
|
| 1387 |
+
if enhancement and 'keywords' in enhancement:
|
| 1388 |
+
findings.append(f"Identified {len(enhancement['keywords'])} key optimization terms")
|
| 1389 |
+
|
| 1390 |
+
return findings[:5] # Return top 5 findings
|
| 1391 |
+
|
| 1392 |
+
except Exception:
|
| 1393 |
+
return ["Unable to extract key findings"]
|
| 1394 |
+
|
| 1395 |
+
def _get_priority_recommendations(self, analysis_data: Dict[str, Any]) -> List[str]:
|
| 1396 |
+
"""Get priority recommendations from analysis"""
|
| 1397 |
+
try:
|
| 1398 |
+
recommendations = []
|
| 1399 |
+
|
| 1400 |
+
# Collect all recommendations from different analyses
|
| 1401 |
+
geo_results = analysis_data.get('geo_results', [])
|
| 1402 |
+
for result in geo_results:
|
| 1403 |
+
recommendations.extend(result.get('recommendations', []))
|
| 1404 |
+
|
| 1405 |
+
# Remove duplicates and return top priorities
|
| 1406 |
+
unique_recs = list(set(recommendations))
|
| 1407 |
+
return unique_recs[:3] # Top 3 priority recommendations
|
| 1408 |
+
|
| 1409 |
+
except Exception:
|
| 1410 |
+
return ["Review and implement GEO best practices"]
|
| 1411 |
+
|
| 1412 |
+
def _estimate_roi_potential(self, performance_score: float) -> str:
|
| 1413 |
+
"""Estimate ROI potential based on performance score"""
|
| 1414 |
+
if performance_score <= 4.0:
|
| 1415 |
+
return "High - Significant improvement potential"
|
| 1416 |
+
elif performance_score <= 6.0:
|
| 1417 |
+
return "Medium - Moderate improvement opportunities"
|
| 1418 |
+
else:
|
| 1419 |
+
return "Low - Already well-optimized"
|
| 1420 |
+
|
| 1421 |
+
def _suggest_implementation_timeline(self, analysis_data: Dict[str, Any]) -> str:
|
| 1422 |
+
"""Suggest implementation timeline"""
|
| 1423 |
+
try:
|
| 1424 |
+
overall_perf = self._calculate_overall_performance(analysis_data)
|
| 1425 |
+
|
| 1426 |
+
if overall_perf <= 4.0:
|
| 1427 |
+
return "3-6 months for comprehensive optimization"
|
| 1428 |
+
elif overall_perf <= 6.0:
|
| 1429 |
+
return "1-3 months for targeted improvements"
|
| 1430 |
+
else:
|
| 1431 |
+
return "Ongoing maintenance and monitoring"
|
| 1432 |
+
|
| 1433 |
+
except Exception:
|
| 1434 |
+
return "Timeline assessment unavailable"
|
| 1435 |
+
|
| 1436 |
+
def _estimate_resource_requirements(self, analysis_data: Dict[str, Any]) -> Dict[str, str]:
|
| 1437 |
+
"""Estimate resource requirements"""
|
| 1438 |
+
return {
|
| 1439 |
+
'content_team': 'Required for content optimization',
|
| 1440 |
+
'technical_team': 'Required for technical implementations',
|
| 1441 |
+
'timeline': self._suggest_implementation_timeline(analysis_data),
|
| 1442 |
+
'budget': 'Varies based on scope of optimizations'
|
| 1443 |
+
}
|
| 1444 |
+
|
| 1445 |
+
def _create_analysis_overview(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1446 |
+
"""Create analysis overview"""
|
| 1447 |
+
try:
|
| 1448 |
+
return {
|
| 1449 |
+
'analyses_performed': list(analysis_data.keys()),
|
| 1450 |
+
'total_items_analyzed': sum(len(v) if isinstance(v, list) else 1 for v in analysis_data.values()),
|
| 1451 |
+
'analysis_scope': 'Comprehensive GEO and content optimization analysis',
|
| 1452 |
+
'key_focus_areas': ['AI Search Optimization', 'Content Enhancement', 'Performance Analysis']
|
| 1453 |
+
}
|
| 1454 |
+
|
| 1455 |
+
except Exception:
|
| 1456 |
+
return {'error': 'Overview creation failed'}
|
| 1457 |
+
|
| 1458 |
+
def _summarize_performance_metrics(self, analysis_data: Dict[str, Any]) -> Dict[str, float]:
|
| 1459 |
+
"""Summarize performance metrics"""
|
| 1460 |
+
try:
|
| 1461 |
+
return {
|
| 1462 |
+
'overall_performance': self._calculate_overall_performance(analysis_data),
|
| 1463 |
+
'optimization_potential': 10 - self._calculate_overall_performance(analysis_data),
|
| 1464 |
+
'completion_rate': 100.0 # Assuming analysis completed successfully
|
| 1465 |
+
}
|
| 1466 |
+
|
| 1467 |
+
except Exception:
|
| 1468 |
+
return {}
|
| 1469 |
+
|
| 1470 |
+
    def _identify_improvement_opportunities(self, analysis_data: Dict[str, Any]) -> List[str]:
        """Identify improvement opportunities.

        Currently delegates to the priority-recommendation extraction, so
        the two lists are identical by construction.
        """
        return self._get_priority_recommendations(analysis_data)
|
| 1473 |
+
|
| 1474 |
+
def _assess_competitive_position(self, analysis_data: Dict[str, Any]) -> str:
|
| 1475 |
+
"""Assess competitive position"""
|
| 1476 |
+
try:
|
| 1477 |
+
overall_perf = self._calculate_overall_performance(analysis_data)
|
| 1478 |
+
|
| 1479 |
+
if overall_perf >= 8.0:
|
| 1480 |
+
return "Strong - Above average GEO performance"
|
| 1481 |
+
elif overall_perf >= 6.0:
|
| 1482 |
+
return "Competitive - Meeting industry standards"
|
| 1483 |
+
elif overall_perf >= 4.0:
|
| 1484 |
+
return "Below Average - Improvement needed"
|
| 1485 |
+
else:
|
| 1486 |
+
return "Weak - Significant optimization required"
|
| 1487 |
+
|
| 1488 |
+
except Exception:
|
| 1489 |
+
return "Assessment unavailable"
|
| 1490 |
+
|
| 1491 |
+
def _recommend_next_steps(self, analysis_data: Dict[str, Any]) -> List[str]:
|
| 1492 |
+
"""Recommend next steps"""
|
| 1493 |
+
steps = [
|
| 1494 |
+
"Review detailed analysis results",
|
| 1495 |
+
"Prioritize recommendations by impact",
|
| 1496 |
+
"Develop implementation plan",
|
| 1497 |
+
"Monitor performance improvements"
|
| 1498 |
+
]
|
| 1499 |
+
|
| 1500 |
+
# Add specific steps based on performance
|
| 1501 |
+
overall_perf = self._calculate_overall_performance(analysis_data)
|
| 1502 |
+
if overall_perf <= 4.0:
|
| 1503 |
+
steps.insert(1, "Focus on fundamental GEO optimization")
|
| 1504 |
+
|
| 1505 |
+
return steps
|
| 1506 |
+
|
| 1507 |
+
def _document_methodology(self) -> Dict[str, str]:
|
| 1508 |
+
"""Document analysis methodology"""
|
| 1509 |
+
return {
|
| 1510 |
+
'geo_analysis': 'AI-powered content analysis using specialized GEO metrics',
|
| 1511 |
+
'content_optimization': 'LLM-based content enhancement and scoring',
|
| 1512 |
+
'performance_scoring': 'Multi-dimensional scoring system for AI search optimization',
|
| 1513 |
+
'data_collection': 'Automated content parsing and analysis',
|
| 1514 |
+
'validation': 'Cross-referenced metrics and quality assurance checks'
|
| 1515 |
+
}
|
| 1516 |
+
|
| 1517 |
+
def _document_data_sources(self, analysis_data: Dict[str, Any]) -> List[str]:
|
| 1518 |
+
"""Document data sources used in analysis"""
|
| 1519 |
+
sources = []
|
| 1520 |
+
|
| 1521 |
+
if 'geo_results' in analysis_data:
|
| 1522 |
+
sources.append("Website content analysis")
|
| 1523 |
+
if 'enhancement_results' in analysis_data:
|
| 1524 |
+
sources.append("Content optimization analysis")
|
| 1525 |
+
if 'qa_results' in analysis_data:
|
| 1526 |
+
sources.append("Document Q&A interactions")
|
| 1527 |
+
|
| 1528 |
+
sources.extend([
|
| 1529 |
+
"AI-powered content scoring",
|
| 1530 |
+
"GEO performance metrics",
|
| 1531 |
+
"Industry best practices database"
|
| 1532 |
+
])
|
| 1533 |
+
|
| 1534 |
+
return sources
|
| 1535 |
+
|
| 1536 |
+
def _document_limitations(self) -> List[str]:
|
| 1537 |
+
"""Document analysis limitations"""
|
| 1538 |
+
return [
|
| 1539 |
+
"Analysis based on current content snapshot",
|
| 1540 |
+
"Performance may vary with search engine algorithm updates",
|
| 1541 |
+
"Recommendations require human review for implementation",
|
| 1542 |
+
"Results depend on quality of input content",
|
| 1543 |
+
"AI model performance may vary across different content types"
|
| 1544 |
+
]
|
| 1545 |
+
|
| 1546 |
+
def _create_appendices(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1547 |
+
"""Create report appendices"""
|
| 1548 |
+
try:
|
| 1549 |
+
return {
|
| 1550 |
+
'technical_details': {
|
| 1551 |
+
'models_used': ['GPT-based content analysis', 'Semantic similarity scoring'],
|
| 1552 |
+
'processing_time': 'Variable based on content volume',
|
| 1553 |
+
'confidence_intervals': 'Scores provided with ±0.5 accuracy'
|
| 1554 |
+
},
|
| 1555 |
+
'glossary': {
|
| 1556 |
+
'GEO': 'Generative Engine Optimization - optimization for AI search engines',
|
| 1557 |
+
'AI Search Visibility': 'Likelihood of content appearing in AI search results',
|
| 1558 |
+
'Citation Worthiness': 'Probability of content being cited by AI systems',
|
| 1559 |
+
'Conversational Readiness': 'Suitability for AI chat responses'
|
| 1560 |
+
},
|
| 1561 |
+
'references': [
|
| 1562 |
+
'GEO Best Practices Guide',
|
| 1563 |
+
'AI Search Engine Optimization Standards',
|
| 1564 |
+
'Content Performance Benchmarks'
|
| 1565 |
+
]
|
| 1566 |
+
}
|
| 1567 |
+
|
| 1568 |
+
except Exception:
|
| 1569 |
+
return {}
|
| 1570 |
+
|
| 1571 |
+
def _calculate_avg_processing_time(self, batch_results: List[Dict[str, Any]]) -> float:
|
| 1572 |
+
"""Calculate average processing time for batch results"""
|
| 1573 |
+
try:
|
| 1574 |
+
processing_times = []
|
| 1575 |
+
|
| 1576 |
+
for result in batch_results:
|
| 1577 |
+
if 'processing_time' in result:
|
| 1578 |
+
processing_times.append(result['processing_time'])
|
| 1579 |
+
|
| 1580 |
+
return sum(processing_times) / len(processing_times) if processing_times else 0
|
| 1581 |
+
|
| 1582 |
+
except Exception:
|
| 1583 |
+
return 0
|
| 1584 |
+
|
| 1585 |
+
def _identify_common_errors(self, batch_results: List[Dict[str, Any]]) -> List[str]:
|
| 1586 |
+
"""Identify common errors in batch processing"""
|
| 1587 |
+
try:
|
| 1588 |
+
error_counts = {}
|
| 1589 |
+
|
| 1590 |
+
for result in batch_results:
|
| 1591 |
+
if result.get('error'):
|
| 1592 |
+
error_msg = str(result['error'])[:50] # First 50 chars
|
| 1593 |
+
error_counts[error_msg] = error_counts.get(error_msg, 0) + 1
|
| 1594 |
+
|
| 1595 |
+
# Return top 3 most common errors
|
| 1596 |
+
sorted_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)
|
| 1597 |
+
return [error for error, count in sorted_errors[:3]]
|
| 1598 |
+
|
| 1599 |
+
except Exception:
|
| 1600 |
+
return []
|
| 1601 |
+
|
| 1602 |
+
|
| 1603 |
+
class DataValidator:
    """Validates analysis data structures before they are exported."""

    @staticmethod
    def validate_geo_data(geo_results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Validate the structure of GEO analysis results.

        Missing sections are reported as warnings; an empty input or an
        out-of-range / non-numeric score is an error and marks the data
        invalid. Returns {'valid', 'errors', 'warnings'}.
        """
        report = {'valid': True, 'errors': [], 'warnings': []}

        try:
            if not geo_results:
                report['errors'].append("No GEO results provided")
                report['valid'] = False
                return report

            for index, entry in enumerate(geo_results):
                # Missing sections are tolerated, but flagged.
                if 'geo_scores' not in entry:
                    report['warnings'].append(f"Result {index} missing geo_scores")
                if 'page_data' not in entry:
                    report['warnings'].append(f"Result {index} missing page_data")

                # Scores must be numeric and on the 0-10 scale.
                for metric, score in entry.get('geo_scores', {}).items():
                    numeric = isinstance(score, (int, float))
                    if not numeric or score < 0 or score > 10:
                        report['errors'].append(f"Invalid score for {metric} in result {index}")
                        report['valid'] = False

            return report

        except Exception as exc:
            report['errors'].append(f"Validation failed: {str(exc)}")
            report['valid'] = False
            return report

    @staticmethod
    def validate_enhancement_data(enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate the structure of content-enhancement output.

        Warns when the 'scores' section or an expected score field is
        missing; a non-numeric score value is an error. Returns
        {'valid', 'errors', 'warnings'}.
        """
        report = {'valid': True, 'errors': [], 'warnings': []}

        try:
            if 'scores' not in enhancement_result:
                report['warnings'].append("Enhancement result missing scores")

            if 'scores' in enhancement_result:
                scores = enhancement_result['scores']
                for expected in ('clarity', 'structuredness', 'answerability'):
                    if expected not in scores:
                        report['warnings'].append(f"Missing {expected} score")
                    elif not isinstance(scores[expected], (int, float)):
                        report['errors'].append(f"Invalid {expected} score type")
                        report['valid'] = False

            return report

        except Exception as exc:
            report['errors'].append(f"Enhancement validation failed: {str(exc)}")
            report['valid'] = False
            return report
|
| 1675 |
+
|
| 1676 |
+
|
| 1677 |
+
class ExportManager:
    """High-level export management class.

    Wraps a ResultExporter with pre-export validation (via DataValidator)
    and keeps an in-memory log of every export attempt in export_history.
    """

    def __init__(self):
        self.exporter = ResultExporter()    # performs the actual format conversion
        self.validator = DataValidator()    # structural validation before export
        self.export_history = []            # one dict per export attempt (success or failure)

    def export_with_validation(self, data: Dict[str, Any], data_type: str,
                             format_type: str = 'json') -> Dict[str, Any]:
        """Validate *data* for its *data_type*, then export it in *format_type*.

        Args:
            data: Payload to export. For 'geo_analysis' it should contain
                'geo_results' (and optionally 'website_url'); for
                'content_optimization' the enhancement result itself.
            data_type: One of 'geo_analysis', 'content_optimization', or any
                other string (exported as plain JSON without validation).
            format_type: Target format passed through to the exporter.

        Returns:
            {'success': True, 'data': ..., 'validation': ...} on success,
            or {'success': False, 'error': ..., ...} on validation or export
            failure. Every attempt is appended to export_history.
        """
        try:
            # Validate data first; unknown data types pass trivially.
            if data_type == 'geo_analysis':
                validation = self.validator.validate_geo_data(data.get('geo_results', []))
            elif data_type == 'content_optimization':
                validation = self.validator.validate_enhancement_data(data)
            else:
                validation = {'valid': True, 'errors': [], 'warnings': []}

            # Proceed with export if validation passes
            if validation['valid']:
                if data_type == 'geo_analysis':
                    result = self.exporter.export_geo_results(
                        data.get('geo_results', []),
                        data.get('website_url', 'unknown'),
                        format_type
                    )
                elif data_type == 'content_optimization':
                    result = self.exporter.export_enhancement_results(data, format_type)
                else:
                    # Fallback: raw JSON dump for unrecognized data types.
                    result = json.dumps(data, indent=2, ensure_ascii=False)

                # Log export (warnings are kept even on success).
                self.export_history.append({
                    'timestamp': datetime.now().isoformat(),
                    'data_type': data_type,
                    'format_type': format_type,
                    'validation_warnings': validation.get('warnings', []),
                    'success': True
                })

                return {
                    'success': True,
                    'data': result,
                    'validation': validation
                }
            else:
                # NOTE: failed validation is returned but NOT logged to history.
                return {
                    'success': False,
                    'error': 'Data validation failed',
                    'validation': validation
                }

        except Exception as e:
            # Unexpected failures are logged with the error message attached.
            self.export_history.append({
                'timestamp': datetime.now().isoformat(),
                'data_type': data_type,
                'format_type': format_type,
                'success': False,
                'error': str(e)
            })

            return {
                'success': False,
                'error': f"Export failed: {str(e)}"
            }

    def get_export_history(self) -> List[Dict[str, Any]]:
        """Return the export-attempt log (live list, not a copy)."""
        return self.export_history

    def clear_export_history(self) -> None:
        """Empty the export-attempt log in place."""
        self.export_history.clear()

    def get_supported_formats(self) -> Dict[str, List[str]]:
        """Return the supported export formats for each data type.

        NOTE(review): this list is informational only — nothing here
        enforces that a requested format is in it; confirm against the
        exporter's actual capabilities.
        """
        return {
            'geo_analysis': ['json', 'csv', 'html', 'xlsx', 'pdf'],
            'content_optimization': ['json', 'html', 'csv'],
            'qa_results': ['json', 'html', 'csv'],
            'batch_analysis': ['json', 'xlsx', 'csv']
        }

    def create_multi_format_export(self, data: Dict[str, Any], data_type: str,
                                 formats: List[str] = None) -> Dict[str, Any]:
        """Export the same payload in several formats at once.

        Args:
            data: Payload forwarded to export_with_validation.
            data_type: Data type forwarded to export_with_validation.
            formats: Formats to generate; defaults to ['json', 'html', 'csv'].

        Returns:
            Dict with the per-format results ('multi_format_export'), all
            attempted formats ('formats_generated'), and the subset that
            succeeded ('successful_formats'). Per-format failures are
            recorded as {'error': ...} entries rather than raised.
        """
        if formats is None:
            formats = ['json', 'html', 'csv']

        results = {}

        for format_type in formats:
            try:
                export_result = self.export_with_validation(data, data_type, format_type)
                if export_result['success']:
                    results[format_type] = export_result['data']
                else:
                    results[format_type] = {'error': export_result['error']}

            except Exception as e:
                results[format_type] = {'error': str(e)}

        return {
            'multi_format_export': results,
            'formats_generated': list(results.keys()),
            # A format counts as successful when its result is not an error dict.
            'successful_formats': [fmt for fmt, data in results.items() if 'error' not in data]
        }
|
| 1786 |
+
|
| 1787 |
+
|
| 1788 |
+
# Utility functions for the export module
|
| 1789 |
+
|
| 1790 |
+
def create_export_template(data_type: str) -> Dict[str, Any]:
    """Create export template for different data types.

    Returns a sample payload illustrating the expected input structure for
    'geo_analysis', 'content_optimization', or 'qa_results'. An unknown
    data_type yields an empty dict. Used by export_demo_data and the
    module's __main__ smoke test.
    """
    templates = {
        'geo_analysis': {
            'website_url': 'https://example.com',
            'geo_results': [
                {
                    'page_data': {
                        'url': 'https://example.com/page1',
                        'title': 'Example Page',
                        'word_count': 500
                    },
                    # Scores use the 0-10 scale expected by DataValidator.
                    'geo_scores': {
                        'ai_search_visibility': 7.5,
                        'query_intent_matching': 6.8,
                        'conversational_readiness': 8.2,
                        'citation_worthiness': 7.1
                    },
                    'recommendations': [
                        'Improve content structure',
                        'Add more specific examples'
                    ]
                }
            ]
        },
        'content_optimization': {
            'scores': {
                'clarity': 7.5,
                'structuredness': 6.8,
                'answerability': 8.2
            },
            'keywords': ['example', 'optimization', 'content'],
            'optimized_text': 'This is the optimized version of the content...',
            'optimization_suggestions': [
                'Improve sentence structure',
                'Add more specific keywords'
            ]
        },
        'qa_results': [
            {
                'query': 'What is the main topic?',
                'result': 'The main topic is content optimization for AI systems.',
                'sources': [
                    {
                        'content': 'Source document content...',
                        'metadata': {'source': 'document1.pdf'}
                    }
                ]
            }
        ]
    }

    return templates.get(data_type, {})
|
| 1843 |
+
|
| 1844 |
+
|
| 1845 |
+
def export_demo_data() -> Dict[str, Any]:
    """Build demonstration payloads for each supported template type."""
    # One demo entry per template, keyed as '<type>_demo'.
    template_types = ('geo_analysis', 'content_optimization', 'qa_results')
    return {
        f"{name}_demo": create_export_template(name)
        for name in template_types
    }
|
| 1854 |
+
|
| 1855 |
+
|
| 1856 |
+
# Export the main classes and functions
|
| 1857 |
+
__all__ = [
    'ResultExporter',
    'GEOReport',           # NOTE(review): not visible in this chunk - confirm it is defined above
    'ContentAnalysis',     # NOTE(review): not visible in this chunk - confirm it is defined above
    'DataValidator',
    'ExportManager',
    'create_export_template',
    'export_demo_data'
]
|
| 1866 |
+
|
| 1867 |
+
|
| 1868 |
+
# Example usage for testing
|
| 1869 |
+
if __name__ == "__main__":
    exporter = ResultExporter()

    # Build demo GEO data and export it in two formats.
    demo_geo_data = create_export_template('geo_analysis')

    json_export = exporter.export_geo_results(
        demo_geo_data['geo_results'],
        demo_geo_data['website_url'],
        'json'
    )
    html_export = exporter.export_geo_results(
        demo_geo_data['geo_results'],
        demo_geo_data['website_url'],
        'html'
    )

    def _preview(payload):
        # Truncate long payloads for console display.
        return payload[:200] + "..." if len(str(payload)) > 200 else payload

    print("JSON Export:", _preview(json_export))
    print("\nHTML Export:", _preview(html_export))

    # Exercise the enhancement exporter with its demo template.
    demo_enhancement = create_export_template('content_optimization')
    enhancement_export = exporter.export_enhancement_results(demo_enhancement, 'json')
    print("\nEnhancement Export:", _preview(enhancement_export))
|
utils/optimizer.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Content Optimization Module
|
| 3 |
+
Enhances content for better AI/LLM performance and GEO scores
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
from typing import Dict, Any, List, Optional
|
| 9 |
+
from langchain.prompts import ChatPromptTemplate
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ContentOptimizer:
|
| 13 |
+
"""Main class for optimizing content for AI search engines"""
|
| 14 |
+
|
| 15 |
+
    def __init__(self, llm):
        """Create an optimizer bound to an LLM client.

        Args:
            llm: chat model used by every optimization chain — presumably a
                LangChain-compatible runnable (it is piped with `|` and
                `.invoke()`d downstream); confirm against callers.
        """
        self.llm = llm          # LLM backing all optimization chains
        self.setup_prompts()    # pre-build the system prompts once
|
| 18 |
+
|
| 19 |
+
    def setup_prompts(self):
        """Initialize optimization prompts.

        Defines three system prompts used by the optimization methods:
        enhancement_prompt (standard), seo_style_prompt (SEO), and
        competitive_analysis_prompt (competitive). NOTE: _standard_optimization
        rewrites enhancement_prompt via exact-substring .replace() calls, so
        the literal phrases "Rewrite the text to improve:",
        '"optimized_text": "..."' and the keywords line must not be edited
        without updating that method too.
        """

        # Main content enhancement prompt
        self.enhancement_prompt = """You are an AI Content Enhancement Specialist. Your purpose is to optimize user-provided text to maximize its effectiveness for large language models (LLMs) in search, question-answering, and conversational AI systems.

Evaluate the input text based on the following criteria, assigning a score from 1–10 for each:

Clarity: How easily can the content be understood?
Structuredness: How well-organized and coherent is the content?
LLM Answerability: How easily can an LLM extract precise answers from the content?

Identify the most salient keywords.

Rewrite the text to improve:
- Clarity and precision
- Logical structure and flow
- Suitability for LLM-based information retrieval

Present your analysis and optimized text in the following JSON format:

```json
{
    "scores": {
        "clarity": 8.5,
        "structuredness": 7.0,
        "answerability": 9.0
    },
    "keywords": ["example", "installation", "setup"],
    "optimized_text": "..."
}
```"""

        # SEO-style optimization prompt
        self.seo_style_prompt = """You are an AI-first SEO specialist. Optimize this content for AI search engines and LLM systems.

Focus on:
1. Semantic keyword optimization
2. Question-answer format enhancement
3. Factual accuracy and authority signals
4. Conversational readiness
5. Citation-worthy structure

Provide analysis and optimization in JSON:

```json
{
    "seo_analysis": {
        "keyword_density": "analysis of current keywords",
        "semantic_gaps": ["missing semantic terms"],
        "readability_score": 8.5,
        "authority_signals": ["credentials", "citations"]
    },
    "optimized_content": {
        "title_suggestions": ["optimized title 1", "optimized title 2"],
        "meta_description": "AI-optimized meta description",
        "enhanced_content": "full optimized content...",
        "structured_data_suggestions": ["schema markup recommendations"]
    },
    "improvement_summary": {
        "changes_made": ["change 1", "change 2"],
        "expected_impact": "description of expected improvements"
    }
}
```"""

        # Competitive content analysis prompt
        # NOTE: this one is a plain format string with a {content} slot,
        # filled via .format() in _competitive_optimization.
        self.competitive_analysis_prompt = """Compare this content against best practices for AI search optimization. Identify gaps and opportunities.

Original Content: {content}

Analyze against these AI search factors:
- Entity recognition and linking
- Question coverage completeness
- Factual statement clarity
- Conversational flow
- Semantic relationship mapping

Provide competitive analysis in JSON format with specific recommendations."""
|
| 98 |
+
|
| 99 |
+
def optimize_content(self, content: str, analyze_only: bool = False,
|
| 100 |
+
include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
|
| 101 |
+
"""
|
| 102 |
+
Main content optimization function
|
| 103 |
+
|
| 104 |
+
Args:
|
| 105 |
+
content (str): Content to optimize
|
| 106 |
+
analyze_only (bool): If True, only analyze without rewriting
|
| 107 |
+
include_keywords (bool): Whether to include keyword analysis
|
| 108 |
+
optimization_type (str): Type of optimization ("standard", "seo", "competitive")
|
| 109 |
+
|
| 110 |
+
Returns:
|
| 111 |
+
Dict: Optimization results with scores and enhanced content
|
| 112 |
+
"""
|
| 113 |
+
try:
|
| 114 |
+
# Choose optimization approach
|
| 115 |
+
if optimization_type == "seo":
|
| 116 |
+
return self._seo_style_optimization(content, analyze_only)
|
| 117 |
+
elif optimization_type == "competitive":
|
| 118 |
+
return self._competitive_optimization(content)
|
| 119 |
+
else:
|
| 120 |
+
return self._standard_optimization(content, analyze_only, include_keywords)
|
| 121 |
+
|
| 122 |
+
except Exception as e:
|
| 123 |
+
return {'error': f"Optimization failed: {str(e)}"}
|
| 124 |
+
|
| 125 |
+
    def _standard_optimization(self, content: str, analyze_only: bool, include_keywords: bool) -> Dict[str, Any]:
        """Standard content optimization using enhancement prompt.

        Adapts the shared enhancement prompt via exact-substring replacement
        (analyze-only mode swaps the rewrite instruction and JSON field;
        keyword analysis can be stripped), runs it through the LLM, and
        returns the parsed result plus metadata about the original content.

        Args:
            content: Text to optimize (truncated to 6000 chars for the LLM).
            analyze_only: If True, ask only for suggestions, not a rewrite.
            include_keywords: If False, drop the keywords field from the
                requested JSON output.

        Returns:
            Parsed optimization dict (shape defined by
            _parse_optimization_result — defined elsewhere in this file)
            augmented with 'optimization_type', 'analyze_only',
            'original_length', and 'original_word_count'; or an 'error' dict.
        """
        try:
            # Modify prompt based on options.
            # NOTE: the .replace() targets must match the literals in
            # setup_prompts byte-for-byte or the substitution silently no-ops.
            prompt_text = self.enhancement_prompt

            if analyze_only:
                prompt_text = prompt_text.replace(
                    "Rewrite the text to improve:",
                    "Analyze the text for potential improvements in:"
                ).replace(
                    '"optimized_text": "..."',
                    '"optimization_suggestions": ["suggestion 1", "suggestion 2"]'
                )

            if not include_keywords:
                prompt_text = prompt_text.replace(
                    '"keywords": ["example", "installation", "setup"],',
                    ''
                )

            # Create and run chain
            prompt_template = ChatPromptTemplate.from_messages([
                ("system", prompt_text),
                ("user", content[:6000])  # Limit content length
            ])

            chain = prompt_template | self.llm
            result = chain.invoke({})

            # Parse result (LLM may wrap JSON in markdown fences).
            result_content = result.content if hasattr(result, 'content') else str(result)
            parsed_result = self._parse_optimization_result(result_content)

            # Add metadata about the original, pre-truncation content.
            parsed_result.update({
                'optimization_type': 'standard',
                'analyze_only': analyze_only,
                'original_length': len(content),
                'original_word_count': len(content.split())
            })

            return parsed_result

        except Exception as e:
            return {'error': f"Standard optimization failed: {str(e)}"}
|
| 171 |
+
|
| 172 |
+
def _seo_style_optimization(self, content: str, analyze_only: bool) -> Dict[str, Any]:
    """SEO-focused optimization for AI search engines.

    Args:
        content: Text to optimize (truncated to 6000 chars).
        analyze_only: Echoed into the result metadata; the SEO prompt itself
            is not modified by this flag.

    Returns:
        Parsed optimization result dict, or {'error': ...} on failure.
    """
    try:
        # BUGFIX: braces in the embedded content would be parsed as template
        # variables by ChatPromptTemplate and break invoke({}); escape them.
        safe_content = content[:6000].replace("{", "{{").replace("}", "}}")

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", self.seo_style_prompt),
            ("user", f"Optimize this content for AI search engines:\n\n{safe_content}")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        result_content = result.content if hasattr(result, 'content') else str(result)
        parsed_result = self._parse_optimization_result(result_content)

        # Add SEO-specific metadata
        parsed_result.update({
            'optimization_type': 'seo',
            'analyze_only': analyze_only,
            'seo_focused': True
        })

        return parsed_result

    except Exception as e:
        return {'error': f"SEO optimization failed: {str(e)}"}
|
| 197 |
+
|
| 198 |
+
def _competitive_optimization(self, content: str) -> Dict[str, Any]:
    """Run the competitive-analysis prompt over the content.

    Returns the parsed recommendations dict tagged with competitive-analysis
    metadata, or {'error': ...} on failure.
    """
    try:
        # Substitute the (truncated) content into the stored analysis prompt.
        system_message = self.competitive_analysis_prompt.format(content=content[:5000])

        analysis_chain = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("user", "Perform the competitive analysis and provide optimization recommendations.")
        ]) | self.llm

        response = analysis_chain.invoke({})
        response_text = response.content if hasattr(response, 'content') else str(response)

        analysis = self._parse_optimization_result(response_text)
        analysis.update({
            'optimization_type': 'competitive',
            'competitive_analysis': True
        })

        return analysis

    except Exception as e:
        return {'error': f"Competitive optimization failed: {str(e)}"}
|
| 223 |
+
|
| 224 |
+
def batch_optimize_content(self, content_list: List[str], optimization_type: str = "standard") -> List[Dict[str, Any]]:
    """
    Optimize multiple pieces of content in batch.

    Args:
        content_list (List[str]): List of content pieces to optimize
        optimization_type (str): Type of optimization to apply

    Returns:
        List[Dict]: One result dict per input, each carrying its 'batch_index';
        a failed item yields an error dict instead of aborting the batch.
    """
    optimized: List[Dict[str, Any]] = []

    for index, item in enumerate(content_list):
        try:
            outcome = self.optimize_content(
                item,
                optimization_type=optimization_type
            )
            outcome['batch_index'] = index
        except Exception as exc:
            # Record the failure for this item and keep going.
            outcome = {
                'batch_index': index,
                'error': f"Batch optimization failed: {str(exc)}"
            }

        optimized.append(outcome)

    return optimized
|
| 253 |
+
|
| 254 |
+
def generate_content_variations(self, content: str, num_variations: int = 3) -> List[Dict[str, Any]]:
    """
    Generate multiple optimized variations of the same content.

    Args:
        content (str): Original content
        num_variations (int): Number of variations to generate (capped at the
            number of predefined variation styles)

    Returns:
        List[Dict]: List of content variations with analysis; a failed
        variation is reported as an error dict at its index.
    """
    variation_prompts = [
        "Create a more conversational version optimized for AI chat responses",
        "Create a more authoritative version optimized for citations",
        "Create a more structured version optimized for question-answering"
    ]

    variations = []

    # Slice with max(..., 0) so a negative request yields no variations,
    # matching range(min(n, len(prompts))) semantics.
    for index, directive in enumerate(variation_prompts[:max(num_variations, 0)]):
        try:
            custom_prompt = f"""You are optimizing content for AI systems. {directive}.

Original content: {content[:4000]}

Provide the optimized variation in JSON format:
```json
{{
"variation_type": "conversational/authoritative/structured",
"optimized_content": "the rewritten content...",
"key_changes": ["change 1", "change 2"],
"target_use_case": "description of ideal use case"
}}
```"""

            variation_chain = ChatPromptTemplate.from_messages([
                ("system", custom_prompt),
                ("user", "Generate the variation.")
            ]) | self.llm

            response = variation_chain.invoke({})
            response_text = response.content if hasattr(response, 'content') else str(response)

            parsed = self._parse_optimization_result(response_text)
            parsed.update({
                'variation_index': index,
                'variation_prompt': directive
            })

            variations.append(parsed)

        except Exception as e:
            variations.append({
                'variation_index': index,
                'error': f"Variation generation failed: {str(e)}"
            })

    return variations
|
| 314 |
+
|
| 315 |
+
def analyze_content_readability(self, content: str) -> Dict[str, Any]:
    """
    Analyze content readability for AI systems.

    Args:
        content (str): Content to analyze

    Returns:
        Dict: Basic metrics, complexity indicators, an AI readability score
        and recommendations, or {'error': ...} on failure.
    """
    try:
        # Tokenize into words, sentences and paragraphs.
        words = content.split()
        sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        # Averages, guarded against empty denominators.
        avg_words_per_sentence = len(words) / len(sentences) if sentences else 0
        avg_sentences_per_paragraph = len(sentences) / len(paragraphs) if paragraphs else 0
        avg_word_length = sum(len(word) for word in words) / len(words) if words else 0

        # Complexity indicators; percentages computed once and reused below.
        long_sentences = [s for s in sentences if len(s.split()) > 20]
        complex_words = [w for w in words if len(w) > 6]
        long_sentences_pct = len(long_sentences) / len(sentences) * 100 if sentences else 0
        complex_words_pct = len(complex_words) / len(words) * 100 if words else 0

        return {
            'basic_metrics': {
                'total_words': len(words),
                'total_sentences': len(sentences),
                'total_paragraphs': len(paragraphs),
                'avg_words_per_sentence': avg_words_per_sentence,
                'avg_sentences_per_paragraph': avg_sentences_per_paragraph,
                'avg_word_length': avg_word_length
            },
            'complexity_indicators': {
                'long_sentences_count': len(long_sentences),
                'long_sentences_percentage': long_sentences_pct,
                'complex_words_count': len(complex_words),
                'complex_words_percentage': complex_words_pct
            },
            'ai_readability_score': self._calculate_ai_readability_score({
                'avg_words_per_sentence': avg_words_per_sentence,
                'avg_word_length': avg_word_length,
                'complex_words_percentage': complex_words_pct
            }),
            'recommendations': self._generate_readability_recommendations({
                'avg_words_per_sentence': avg_words_per_sentence,
                'long_sentences_percentage': long_sentences_pct,
                'complex_words_percentage': complex_words_pct
            })
        }

    except Exception as e:
        return {'error': f"Readability analysis failed: {str(e)}"}
|
| 373 |
+
|
| 374 |
+
def extract_key_entities(self, content: str) -> Dict[str, Any]:
    """
    Extract key entities and topics for optimization.

    Args:
        content (str): Content to analyze (truncated to 5000 chars)

    Returns:
        Dict: Extracted entities and topics, or {'error': ...} on failure.
    """
    try:
        entity_prompt = """Extract key entities, topics, and concepts from this content for AI optimization.

Content: {content}

Identify:
1. Named entities (people, places, organizations)
2. Key concepts and topics
3. Technical terms and jargon
4. Potential semantic keywords
5. Question-answer opportunities

Format as JSON:
```json
{{
"named_entities": ["entity1", "entity2"],
"key_topics": ["topic1", "topic2"],
"technical_terms": ["term1", "term2"],
"semantic_keywords": ["keyword1", "keyword2"],
"question_opportunities": ["What is...", "How does..."],
"entity_relationships": ["relationship descriptions"]
}}
```"""

        # BUGFIX: str.format() collapses the escaped {{ }} of the JSON example
        # back to single braces, and ChatPromptTemplate then re-parses the
        # message as a template — the braces (plus any in the user content)
        # become undefined template variables and invoke({}) fails.
        # Re-escape all braces so the message is passed literally.
        formatted = entity_prompt.format(content=content[:5000])
        safe_prompt = formatted.replace("{", "{{").replace("}", "}}")

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", safe_prompt),
            ("user", "Extract the entities and topics.")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        result_content = result.content if hasattr(result, 'content') else str(result)
        return self._parse_optimization_result(result_content)

    except Exception as e:
        return {'error': f"Entity extraction failed: {str(e)}"}
|
| 421 |
+
|
| 422 |
+
def optimize_for_voice_search(self, content: str) -> Dict[str, Any]:
    """
    Optimize content specifically for voice search and conversational AI.

    Args:
        content (str): Content to optimize (truncated to 4000 chars)

    Returns:
        Dict: Voice search optimization results, or {'error': ...} on failure.
    """
    try:
        voice_prompt = """Optimize this content for voice search and conversational AI systems.

Focus on:
1. Natural language patterns
2. Question-based structure
3. Conversational tone
4. Clear, direct answers
5. Featured snippet optimization

Original content: {content}

Provide optimization in JSON:
```json
{{
"voice_optimized_content": "conversational version...",
"question_answer_pairs": [
{{"question": "What is...", "answer": "Direct answer..."}},
{{"question": "How does...", "answer": "Step by step..."}}
],
"featured_snippet_candidates": ["snippet 1", "snippet 2"],
"natural_language_improvements": ["improvement 1", "improvement 2"],
"conversational_score": 8.5
}}
```"""

        # BUGFIX: after .format() the JSON example's braces (and any braces in
        # the content) are single again, so ChatPromptTemplate would treat
        # them as template variables and fail on invoke({}). Re-escape them.
        formatted = voice_prompt.format(content=content[:4000])
        safe_prompt = formatted.replace("{", "{{").replace("}", "}}")

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", safe_prompt),
            ("user", "Optimize for voice search.")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        result_content = result.content if hasattr(result, 'content') else str(result)
        parsed_result = self._parse_optimization_result(result_content)

        parsed_result.update({
            'optimization_type': 'voice_search',
            'voice_optimized': True
        })

        return parsed_result

    except Exception as e:
        return {'error': f"Voice search optimization failed: {str(e)}"}
|
| 478 |
+
|
| 479 |
+
def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
    """Parse an LLM response and extract structured results.

    Locates the outermost {...} span in the response and decodes it as JSON.
    Returns a fallback dict carrying the raw response, a 'parsing_error'
    message, and zeroed scores when no valid JSON is present.
    """
    try:
        # Find JSON content in the response
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1

        # BUGFIX: rfind() returns -1 when no '}' exists, making json_end 0 —
        # it can never be -1, so the old `json_end != -1` check always
        # passed. Require a closing brace located after the opening one.
        if json_start != -1 and json_end > json_start:
            json_str = response_text[json_start:json_end]
            parsed = json.loads(json_str)

            # Ensure consistent structure: some prompts emit 'score'
            # instead of 'scores'.
            if 'scores' not in parsed and 'score' in parsed:
                parsed['scores'] = parsed['score']

            return parsed
        else:
            # If no JSON found, return raw response with error flag
            return {
                'raw_response': response_text,
                'parsing_error': 'No JSON structure found in response',
                'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
            }

    except json.JSONDecodeError as e:
        return {
            'raw_response': response_text,
            'parsing_error': f'JSON decode error: {str(e)}',
            'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
        }
    except Exception as e:
        return {
            'raw_response': response_text,
            'parsing_error': f'Unexpected parsing error: {str(e)}',
            'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
        }
|
| 515 |
+
|
| 516 |
+
def _calculate_ai_readability_score(self, metrics: Dict[str, float]) -> float:
    """Score how digestible the text is for AI consumers on a 0-10 scale.

    Each metric is compared to an empirically chosen optimum; the penalty
    grows linearly with distance from that optimum and the partial scores
    are blended with fixed weights. A malformed metrics dict yields the
    neutral score 5.0.
    """
    try:
        # (metric key, optimal value, distance penalty factor, blend weight)
        rubric = (
            ('avg_words_per_sentence', 15, 0.5, 0.4),   # sweet spot for AI processing
            ('avg_word_length', 5, 2, 0.3),             # balance of complexity and clarity
            ('complex_words_percentage', 15, 0.3, 0.3)  # some complexity conveys authority
        )

        total = 0.0
        for key, optimal, penalty, weight in rubric:
            partial = max(0, 10 - abs(metrics[key] - optimal) * penalty)
            total += partial * weight

        return round(total, 1)

    except Exception:
        return 5.0  # Default neutral score
|
| 536 |
+
|
| 537 |
+
def _generate_readability_recommendations(self, metrics: Dict[str, float]) -> List[str]:
    """Turn readability metrics into concrete improvement advice.

    Returns a (possibly empty) list of suggestions, or a single generic
    message when an expected metric is missing.
    """
    tips: List[str] = []

    try:
        words_per_sentence = metrics['avg_words_per_sentence']
        if words_per_sentence > 20:
            tips.append("Break down long sentences for better AI processing")
        elif words_per_sentence < 8:
            tips.append("Consider combining very short sentences for better context")

        if metrics['long_sentences_percentage'] > 30:
            tips.append("Reduce the number of complex sentences (>20 words)")

        complex_share = metrics['complex_words_percentage']
        if complex_share > 25:
            tips.append("Simplify vocabulary where possible for broader accessibility")
        elif complex_share < 5:
            tips.append("Add more specific terminology to establish authority")

        return tips

    except Exception:
        return ["Unable to generate specific recommendations"]
|
utils/parser.py
ADDED
|
@@ -0,0 +1,549 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Content Parsing Module
|
| 3 |
+
Handles extraction of content from PDFs, text, and webpages
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
from urllib.parse import urljoin, urlparse
|
| 9 |
+
from typing import List, Dict, Any
|
| 10 |
+
import time
|
| 11 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 12 |
+
from langchain.schema import Document
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BaseParser:
    """Common interface shared by every content parser in this module."""

    def __init__(self):
        # Extensions / schemes a concrete parser accepts; subclasses fill this in.
        self.supported_formats = []

    def parse(self, source: str) -> List[Document]:
        """Parse content from source and return LangChain Documents."""
        raise NotImplementedError("Subclasses must implement parse method")

    def validate_source(self, source: str) -> bool:
        """Validate if the source can be processed."""
        return True
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class PDFParser(BaseParser):
    """Parser for PDF documents, backed by LangChain's PyPDFLoader."""

    def __init__(self):
        super().__init__()
        self.supported_formats = ['.pdf']

    def parse(self, pdf_path: str) -> List[Document]:
        """
        Parse PDF file and return list of Document objects.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Document]: One Document per split produced by PyPDFLoader,
            each annotated with positional metadata.

        Raises:
            Exception: If loading or splitting the file fails.
        """
        try:
            documents = PyPDFLoader(pdf_path).load_and_split()

            # Annotate each piece with its position within the file.
            # NOTE(review): load_and_split() returns text splits, which may not
            # map 1:1 to physical pages — 'page_number' here is the split
            # ordinal; confirm against downstream consumers.
            total = len(documents)
            for index, document in enumerate(documents):
                document.metadata.update({
                    'source_type': 'pdf',
                    'page_number': index + 1,
                    'total_pages': total,
                    'parser': 'PDFParser'
                })

            return documents

        except Exception as e:
            raise Exception(f"Error parsing PDF: {str(e)}")

    def get_pdf_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Summarize page and word counts for a PDF file."""
        try:
            pages = PyPDFLoader(pdf_path).load()

            page_count = len(pages)
            word_count = sum(len(page.page_content.split()) for page in pages)

            return {
                'total_pages': page_count,
                'total_words': word_count,
                'average_words_per_page': word_count / page_count if page_count > 0 else 0,
                'file_type': 'PDF',
                'parser_used': 'PyPDFLoader'
            }

        except Exception as e:
            return {'error': f"Could not extract metadata: {str(e)}"}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class TextParser(BaseParser):
    """Parser for plain text content."""

    def __init__(self):
        super().__init__()
        self.supported_formats = ['.txt', 'plain_text']
        self.chunk_size = 1000  # Default chunk size (chars) for long texts

    def parse(self, text_content: str, chunk_size: int = None) -> List[Document]:
        """
        Parse text content and return list of Document objects.

        Args:
            text_content (str): Raw text content
            chunk_size (int): Optional chunk size for splitting long texts;
                falls back to self.chunk_size.

        Returns:
            List[Document]: A single document for short texts, otherwise
            sentence-aligned chunks, each with count/position metadata.

        Raises:
            Exception: If the text is empty or processing fails.
        """
        try:
            if not text_content.strip():
                raise ValueError("Empty text content provided")

            chunk_size = chunk_size or self.chunk_size

            # Short texts become a single chunk; long texts are split on
            # sentence boundaries. Both paths share the same metadata shape.
            if len(text_content) <= chunk_size:
                chunks = [text_content]
            else:
                chunks = self._split_text_into_chunks(text_content, chunk_size)

            documents = []
            for i, chunk in enumerate(chunks):
                documents.append(Document(
                    page_content=chunk,
                    metadata={
                        'source_type': 'text',
                        'word_count': len(chunk.split()),
                        'char_count': len(chunk),
                        'chunk_index': i,
                        'total_chunks': len(chunks),
                        'parser': 'TextParser'
                    }
                ))

            return documents

        except Exception as e:
            raise Exception(f"Error parsing text: {str(e)}")

    def _split_text_into_chunks(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks while preserving sentence boundaries.

        A single sentence longer than chunk_size becomes its own oversized
        chunk rather than being cut mid-sentence.
        """
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Add sentence to current chunk if it fits
            test_chunk = current_chunk + sentence + ". "

            if len(test_chunk) <= chunk_size:
                current_chunk = test_chunk
            else:
                # Start new chunk if current chunk has content
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        # Add final chunk if it has content
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def analyze_text_structure(self, text_content: str) -> Dict[str, Any]:
        """Analyze the structure and characteristics of text content.

        Returns counts, averages, an estimated reading time and a complexity
        score, or {'error': ...} if analysis fails.
        """
        try:
            lines = text_content.split('\n')
            words = text_content.split()
            # BUGFIX: the previous version computed the averages against the
            # raw str.split('.') list — which includes empty trailing/blank
            # fragments — while 'total_sentences' filtered them out, so the
            # reported metrics disagreed with each other. Use the filtered
            # sentence list consistently everywhere.
            sentences = [s for s in text_content.split('.') if s.strip()]
            paragraphs = [p.strip() for p in text_content.split('\n\n') if p.strip()]

            return {
                'total_words': len(words),
                'total_sentences': len(sentences),
                'total_lines': len(lines),
                'total_paragraphs': len(paragraphs),
                'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
                'average_sentences_per_paragraph': len(sentences) / len(paragraphs) if paragraphs else 0,
                'character_count': len(text_content),
                'reading_time_minutes': len(words) / 200,  # Assuming 200 words per minute
                'complexity_score': self._calculate_text_complexity(text_content)
            }

        except Exception as e:
            return {'error': f"Could not analyze text structure: {str(e)}"}

    def _calculate_text_complexity(self, text: str) -> float:
        """Calculate a simple text complexity score on a 0-10 scale."""
        words = text.split()
        sentences = [s for s in text.split('.') if s.strip()]

        if not sentences:
            return 0.0

        # Average words per sentence (higher = more complex)
        avg_words_per_sentence = len(words) / len(sentences)

        # Average characters per word (higher = more complex)
        avg_chars_per_word = sum(len(word) for word in words) / len(words) if words else 0

        # Simple complexity score (normalized to a 0-10 scale)
        complexity = (avg_words_per_sentence * 0.1) + (avg_chars_per_word * 0.5)
        return min(complexity, 10.0)
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class WebpageParser(BaseParser):
|
| 218 |
+
"""Parser for web content"""
|
| 219 |
+
|
| 220 |
+
def __init__(self):
    """Initialize HTTP defaults used by all page fetches."""
    super().__init__()
    self.supported_formats = ['http', 'https']
    # Present a mainstream browser User-Agent so simple bot filters
    # are less likely to reject the request.
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    self.timeout = 10      # seconds allowed per HTTP request
    self.max_retries = 3   # fetch attempts before giving up on a page
|
| 228 |
+
|
| 229 |
+
def parse_website(self, url: str, max_pages: int = 1, include_subpages: bool = False) -> List[Dict[str, Any]]:
    """
    Parse website content and return structured data.

    Args:
        url (str): Website URL to parse
        max_pages (int): Maximum number of pages to parse
        include_subpages (bool): Whether to include subpages

    Returns:
        List[Dict]: One dict of content and metadata per fetched page.

    Raises:
        Exception: If crawling fails entirely.
    """
    try:
        candidates = [url]

        # Optionally widen the crawl with discovered subpage URLs.
        if include_subpages and max_pages > 1:
            candidates += self._find_subpages(url, max_pages - 1)

        results = []
        visited = set()

        for page_url in candidates[:max_pages]:
            if page_url in visited:
                continue

            parsed = self._parse_single_page(page_url)
            if parsed:
                results.append(parsed)
                visited.add(page_url)

            # Small delay between requests to be respectful to the server.
            time.sleep(1)

        return results

    except Exception as e:
        raise Exception(f"Error parsing website: {str(e)}")
|
| 268 |
+
|
| 269 |
+
def _parse_single_page(self, url: str) -> Dict[str, Any]:
    """Fetch one webpage and extract its content and metadata.

    Fetches with retry + exponential backoff, strips non-content markup,
    then pulls the main text, title, description, headings, links and
    images via the class's helper methods.

    Returns:
        Dict with extracted fields, or {'url': ..., 'error': ...} when the
        page could not be fetched or parsed. May return None if no response
        was obtained without an exception being raised.
    """
    try:
        # Make request with retries; back off 1s, 2s, 4s... between attempts
        # and re-raise the last error once max_retries is exhausted.
        response = None
        for attempt in range(self.max_retries):
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
                response.raise_for_status()
                break
            except requests.RequestException as e:
                if attempt == self.max_retries - 1:
                    raise e
                time.sleep(2 ** attempt)  # Exponential backoff

        if not response:
            return None

        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove navigation/boilerplate elements so the extracted text is
        # mostly article content (decompose() mutates the soup in place).
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()

        # Extract main content
        main_content = self._extract_main_content(soup)

        # Extract metadata
        title = self._extract_title(soup)
        description = self._extract_description(soup)
        headings = self._extract_headings(soup)
        links = self._extract_links(soup, url)

        # Clean and process text
        cleaned_text = self._clean_text_content(main_content)

        return {
            'url': url,
            'title': title,
            'description': description,
            'content': cleaned_text,
            'headings': headings,
            'internal_links': links['internal'],
            'external_links': links['external'],
            'word_count': len(cleaned_text.split()),
            'char_count': len(cleaned_text),
            'meta_keywords': self._extract_meta_keywords(soup),
            'images': self._extract_images(soup, url),
            'parser': 'WebpageParser',
            'parsed_at': time.strftime('%Y-%m-%d %H:%M:%S')
        }

    except Exception as e:
        # Any failure (network or parsing) is reported per-page rather than
        # aborting a multi-page crawl.
        return {'url': url, 'error': f"Failed to parse page: {str(e)}"}
|
| 324 |
+
|
| 325 |
+
def _extract_main_content(self, soup: BeautifulSoup) -> str:
    """Return the page's primary text content.

    Tries a sequence of common "main content" CSS selectors first (most
    specific wins), then falls back to <body>, and finally to the whole
    document.
    """
    # Ordered from most to least preferred; the first non-empty match wins.
    preferred_selectors = (
        'main',
        'article',
        '[role="main"]',
        '.content',
        '.main-content',
        '#content',
        '#main',
        '.post-content',
        '.entry-content',
    )

    for css in preferred_selectors:
        node = soup.select_one(css)
        if node:
            return node.get_text(separator=' ', strip=True)

    # No recognizable content container: fall back to the body, then the
    # whole parsed document.
    body = soup.find('body')
    if body:
        return body.get_text(separator=' ', strip=True)
    return soup.get_text(separator=' ', strip=True)
|
| 351 |
+
|
| 352 |
+
def _extract_title(self, soup: BeautifulSoup) -> str:
    """Return the page title, preferring <title> over the first <h1>."""
    # Check candidates in priority order; first truthy tag wins.
    for tag_name in ('title', 'h1'):
        tag = soup.find(tag_name)
        if tag:
            return tag.get_text().strip()
    return "No Title Found"
|
| 364 |
+
|
| 365 |
+
def _extract_description(self, soup: BeautifulSoup) -> str:
    """Return the meta description, falling back to the Open Graph one."""
    # Standard <meta name="description"> first, then og:description.
    for attrs in ({'name': 'description'}, {'property': 'og:description'}):
        tag = soup.find('meta', attrs=attrs)
        if tag and tag.get('content'):
            return tag['content'].strip()
    return "No Description Found"
|
| 377 |
+
|
| 378 |
+
def _extract_headings(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
    """Collect every non-empty h1–h6 heading.

    Results are grouped by level (all h1s first, then h2s, ...), each entry
    carrying the heading level, text, id and class attributes.
    """
    collected: List[Dict[str, Any]] = []

    # Iterate level by level to keep the original level-grouped ordering.
    for level in range(1, 7):
        for node in soup.find_all(f'h{level}'):
            label = node.get_text(strip=True)
            if not label:
                continue  # skip headings with no visible text
            collected.append({
                'level': level,
                'text': label,
                'id': node.get('id', ''),
                'class': node.get('class', []),
            })

    return collected
|
| 394 |
+
|
| 395 |
+
def _extract_links(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
    """Partition all anchor hrefs into same-domain and off-domain URLs.

    Relative hrefs are resolved against *base_url*; both result lists are
    deduplicated.
    """
    site_domain = urlparse(base_url).netloc
    internal: set = set()
    external: set = set()

    for anchor in soup.find_all('a', href=True):
        absolute = urljoin(base_url, anchor['href'])
        link_domain = urlparse(absolute).netloc
        if link_domain == site_domain:
            internal.add(absolute)
        elif link_domain:
            # Only count links that actually point at another domain;
            # fragment/mailto-style hrefs with no netloc are dropped.
            external.add(absolute)

    return {
        'internal': list(internal),
        'external': list(external),
    }
|
| 415 |
+
|
| 416 |
+
def _extract_meta_keywords(self, soup: BeautifulSoup) -> List[str]:
    """Return the comma-separated meta keywords as a cleaned list (may be empty)."""
    tag = soup.find('meta', attrs={'name': 'keywords'})
    if not (tag and tag.get('content')):
        return []
    # Split on commas and drop blank entries after stripping whitespace.
    return [part.strip() for part in tag['content'].split(',') if part.strip()]
|
| 423 |
+
|
| 424 |
+
def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
    """Describe every <img> that has a src attribute.

    Each entry holds the absolute URL (resolved against *base_url*) plus
    the alt and title attributes (empty string when absent).
    """
    found: List[Dict[str, str]] = []

    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue  # images without a src are not useful
        found.append({
            'src': urljoin(base_url, src),
            'alt': img.get('alt', ''),
            'title': img.get('title', ''),
        })

    return found
|
| 439 |
+
|
| 440 |
+
def _clean_text_content(self, text: str) -> str:
|
| 441 |
+
"""Clean and normalize text content"""
|
| 442 |
+
if not text:
|
| 443 |
+
return ""
|
| 444 |
+
|
| 445 |
+
# Split into lines and clean each line
|
| 446 |
+
lines = text.split('\n')
|
| 447 |
+
cleaned_lines = []
|
| 448 |
+
|
| 449 |
+
for line in lines:
|
| 450 |
+
line = line.strip()
|
| 451 |
+
if line and len(line) > 1: # Skip empty lines and single characters
|
| 452 |
+
cleaned_lines.append(line)
|
| 453 |
+
|
| 454 |
+
# Join lines with single spaces
|
| 455 |
+
cleaned_text = ' '.join(cleaned_lines)
|
| 456 |
+
|
| 457 |
+
# Remove multiple spaces
|
| 458 |
+
while ' ' in cleaned_text:
|
| 459 |
+
cleaned_text = cleaned_text.replace(' ', ' ')
|
| 460 |
+
|
| 461 |
+
return cleaned_text
|
| 462 |
+
|
| 463 |
+
def _find_subpages(self, url: str, max_subpages: int) -> List[str]:
    """Collect up to *max_subpages* same-domain page links found on *url*.

    Links to obvious binary assets (.pdf/.jpg/.png/.gif/.zip) and the page
    itself are skipped. Best effort: any network or parsing failure yields
    an empty list.
    """
    try:
        resp = requests.get(url, headers=self.headers, timeout=self.timeout)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, 'html.parser')
        site_domain = urlparse(url).netloc
        discovered = set()
        binary_markers = ('.pdf', '.jpg', '.png', '.gif', '.zip')

        for anchor in soup.find_all('a', href=True):
            candidate = urljoin(url, anchor['href'])

            same_site = urlparse(candidate).netloc == site_domain
            is_self = candidate == url
            looks_binary = any(marker in candidate.lower() for marker in binary_markers)

            if same_site and not is_self and not looks_binary:
                discovered.add(candidate)
                if len(discovered) >= max_subpages:
                    break

        return list(discovered)[:max_subpages]

    except Exception:
        # Deliberately best-effort: subpage discovery failing should not
        # abort the caller's crawl.
        return []
|
| 492 |
+
|
| 493 |
+
def validate_url(self, url: str) -> bool:
    """Return True if a HEAD request to *url* answers with HTTP 200.

    Uses a short 5-second timeout so callers (e.g. UI validation) are not
    blocked for long. Any request failure counts as "not valid".
    """
    try:
        response = requests.head(url, headers=self.headers, timeout=5)
        return response.status_code == 200
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # are no longer swallowed; any request-level error means invalid.
        return False
|
| 500 |
+
|
| 501 |
+
def get_website_info(self, url: str) -> Dict[str, Any]:
    """Fetch *url* and return basic page and HTTP metadata.

    Returns a dict with title/description/meta flags plus response details;
    on any failure returns {'url': ..., 'error': ...} instead of raising.
    """
    try:
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Bug fix: the lang attribute lives on the <html> element. Calling
        # .get('lang') on the BeautifulSoup document object always returned
        # the default, so 'language' was permanently 'unknown'.
        html_tag = soup.find('html')
        language = html_tag.get('lang', 'unknown') if html_tag else 'unknown'

        return {
            'url': url,
            'title': self._extract_title(soup),
            'description': self._extract_description(soup),
            'meta_keywords': self._extract_meta_keywords(soup),
            'has_robots_meta': bool(soup.find('meta', attrs={'name': 'robots'})),
            'has_viewport_meta': bool(soup.find('meta', attrs={'name': 'viewport'})),
            'language': language,
            'status_code': response.status_code,
            'content_type': response.headers.get('content-type', 'unknown'),
            'server': response.headers.get('server', 'unknown')
        }

    except Exception as e:
        return {'url': url, 'error': f"Could not get website info: {str(e)}"}
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
class ParserFactory:
|
| 527 |
+
"""Factory class to create appropriate parsers"""
|
| 528 |
+
|
| 529 |
+
@staticmethod
|
| 530 |
+
def get_parser(source_type: str):
|
| 531 |
+
"""Get the appropriate parser for the source type"""
|
| 532 |
+
parsers = {
|
| 533 |
+
'pdf': PDFParser(),
|
| 534 |
+
'text': TextParser(),
|
| 535 |
+
'webpage': WebpageParser(),
|
| 536 |
+
'url': WebpageParser()
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
return parsers.get(source_type.lower())
|
| 540 |
+
|
| 541 |
+
@staticmethod
|
| 542 |
+
def detect_source_type(source: str) -> str:
|
| 543 |
+
"""Detect the type of content source"""
|
| 544 |
+
if source.startswith(('http://', 'https://')):
|
| 545 |
+
return 'webpage'
|
| 546 |
+
elif source.endswith('.pdf'):
|
| 547 |
+
return 'pdf'
|
| 548 |
+
else:
|
| 549 |
+
return 'text'
|
utils/scorer.py
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GEO Scoring Module
|
| 3 |
+
Analyzes content for Generative Engine Optimization (GEO) performance
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
from typing import Dict, Any, List
|
| 8 |
+
from langchain.prompts import ChatPromptTemplate
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class GEOScorer:
    """Main class for calculating GEO scores and analysis.

    Wraps an injected LangChain chat model (*llm*) with prompt templates for
    detailed/quick per-page scoring, pairwise comparison, aggregation across
    pages, and report generation.
    """

    def __init__(self, llm):
        # llm: any LangChain chat model supporting `prompt | llm` chains.
        self.llm = llm
        self.setup_prompts()

    def setup_prompts(self):
        """Initialize prompts for different types of analysis"""

        # Main GEO analysis prompt. NOTE: these prompt strings contain
        # literal '{'/'}' (the JSON examples), so they must never be passed
        # to ChatPromptTemplate as template *strings* — see analyze_page_geo.
        self.geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided content for its effectiveness in AI-powered search engines and LLM systems.

Evaluate the content based on these GEO criteria (score 1-10 each):

1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
2. **Query Intent Matching**: How well does the content match common user queries?
3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
4. **Conversational Readiness**: How suitable is the content for AI chat responses?
5. **Semantic Richness**: How well does the content use relevant semantic keywords?
6. **Context Completeness**: Does the content provide complete, self-contained answers?
7. **Citation Worthiness**: How likely are AI systems to cite this content?
8. **Multi-Query Coverage**: Does the content answer multiple related questions?

Also identify:
- Primary topics and entities
- Missing information gaps
- Optimization opportunities
- Specific enhancement recommendations

Format your response as JSON:

```json
{
    "geo_scores": {
        "ai_search_visibility": 7.5,
        "query_intent_matching": 8.0,
        "factual_accuracy": 9.0,
        "conversational_readiness": 6.5,
        "semantic_richness": 7.0,
        "context_completeness": 8.5,
        "citation_worthiness": 7.8,
        "multi_query_coverage": 6.0
    },
    "overall_geo_score": 7.5,
    "primary_topics": ["topic1", "topic2"],
    "entities": ["entity1", "entity2"],
    "missing_gaps": ["gap1", "gap2"],
    "optimization_opportunities": [
        {
            "type": "semantic_enhancement",
            "description": "Add more related terms",
            "priority": "high"
        }
    ],
    "recommendations": [
        "Specific actionable recommendation 1",
        "Specific actionable recommendation 2"
    ]
}
```"""

        # Quick scoring prompt for faster analysis
        self.quick_score_prompt = """Analyze this content for AI search optimization. Provide scores (1-10) for:

1. AI Search Visibility
2. Query Intent Matching
3. Conversational Readiness
4. Citation Worthiness

Respond in JSON format:
```json
{
    "scores": {
        "ai_search_visibility": 7.5,
        "query_intent_matching": 8.0,
        "conversational_readiness": 6.5,
        "citation_worthiness": 7.8
    },
    "overall_score": 7.5,
    "top_recommendation": "Most important improvement needed"
}
```"""

        # Competitive analysis prompt; {content_a}/{content_b} are filled in
        # by compare_content_geo via str.replace (not str.format, which
        # would choke on the literal JSON braces below).
        self.competitive_prompt = """Compare these content pieces for GEO performance. Identify which performs better for AI search and why.

Content A: {content_a}

Content B: {content_b}

Provide analysis in JSON:
```json
{
    "winner": "A" or "B",
    "score_comparison": {
        "content_a_score": 7.5,
        "content_b_score": 8.2
    },
    "key_differences": ["difference1", "difference2"],
    "improvement_suggestions": {
        "content_a": ["suggestion1"],
        "content_b": ["suggestion1"]
    }
}
```"""

    def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
        """
        Analyze a single page for GEO performance

        Args:
            content (str): Page content to analyze
            title (str): Page title
            detailed (bool): Whether to perform detailed analysis

        Returns:
            Dict: GEO analysis results (parsed JSON plus metadata), or
            {'error': ...} on failure
        """
        try:
            # Choose prompt and content budget based on detail level.
            if detailed:
                system_prompt = self.geo_analysis_prompt
                user_content = f"Title: {title}\n\nContent: {content[:8000]}"  # limit content length
            else:
                system_prompt = self.quick_score_prompt
                user_content = f"Title: {title}\n\nContent: {content[:4000]}"

            # Bug fix: pass the texts as template *values*. The previous code
            # embedded them directly as template strings, so every literal
            # '{'/'}' in the prompts (the JSON examples) or in the page
            # content was interpreted as a template variable and invoke()
            # raised a KeyError on every call.
            prompt_template = ChatPromptTemplate.from_messages([
                ("system", "{system_prompt}"),
                ("user", "{user_content}")
            ])

            # Run analysis
            chain = prompt_template | self.llm
            result = chain.invoke({
                "system_prompt": system_prompt,
                "user_content": user_content,
            })

            # Extract and parse result
            result_content = result.content if hasattr(result, 'content') else str(result)
            parsed_result = self._parse_llm_response(result_content)

            # Add metadata
            parsed_result.update({
                'analyzed_title': title,
                'content_length': len(content),
                'word_count': len(content.split()),
                'analysis_type': 'detailed' if detailed else 'quick'
            })

            return parsed_result

        except Exception as e:
            return {'error': f"GEO analysis failed: {str(e)}"}

    def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
        """
        Analyze multiple pages and return consolidated results

        Args:
            pages_data (List[Dict]): List of page data with content and metadata
            detailed (bool): Whether to perform detailed analysis

        Returns:
            List[Dict]: One analysis result per input page; failed pages get
            an {'error': ...} entry instead of aborting the batch
        """
        results = []

        for i, page_data in enumerate(pages_data):
            try:
                content = page_data.get('content', '')
                title = page_data.get('title', f'Page {i+1}')

                analysis = self.analyze_page_geo(content, title, detailed)

                # Add page-specific metadata
                analysis.update({
                    'page_url': page_data.get('url', ''),
                    'page_index': i,
                    'source_word_count': page_data.get('word_count', 0)
                })

                results.append(analysis)

            except Exception as e:
                results.append({
                    'page_index': i,
                    'page_url': page_data.get('url', ''),
                    'error': f"Analysis failed: {str(e)}"
                })

        return results

    def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
        """
        Compare two pieces of content for GEO performance

        Args:
            content_a (str): First content to compare
            content_b (str): Second content to compare
            titles (tuple): Optional titles for the content pieces

        Returns:
            Dict: Comparison analysis results, or {'error': ...} on failure
        """
        try:
            title_a, title_b = titles if titles else ("Content A", "Content B")

            # Bug fix: use str.replace rather than str.format — the prompt
            # contains literal JSON braces that .format() would reject.
            # (Also removed an unused ChatPromptTemplate that was built here
            # and never invoked.)
            formatted_prompt = (
                self.competitive_prompt
                .replace("{content_a}", f"Title: {title_a}\nContent: {content_a[:4000]}")
                .replace("{content_b}", f"Title: {title_b}\nContent: {content_b[:4000]}")
            )

            # Pass the filled-in prompt as a template value (see
            # analyze_page_geo for why it must not be a template string).
            chain = ChatPromptTemplate.from_messages([
                ("system", "{system_prompt}"),
                ("user", "{user_content}")
            ]) | self.llm

            result = chain.invoke({
                "system_prompt": formatted_prompt,
                "user_content": "Perform the comparison analysis.",
            })
            result_content = result.content if hasattr(result, 'content') else str(result)

            return self._parse_llm_response(result_content)

        except Exception as e:
            return {'error': f"Comparison analysis failed: {str(e)}"}

    def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Calculate aggregate GEO scores from multiple page analyses

        Args:
            individual_results (List[Dict]): List of individual page analysis results

        Returns:
            Dict: Aggregate scores and insights, or {'error': ...} when no
            result carries 'geo_scores'
        """
        try:
            # Only aggregate results that actually contain detailed scores.
            valid_results = [r for r in individual_results if 'geo_scores' in r and not r.get('error')]

            if not valid_results:
                return {'error': 'No valid results to aggregate'}

            # Average each metric across pages (metrics taken from the first
            # valid result; pages missing a metric are skipped for it).
            score_keys = list(valid_results[0]['geo_scores'].keys())
            avg_scores = {}

            for key in score_keys:
                scores = [r['geo_scores'][key] for r in valid_results if key in r['geo_scores']]
                avg_scores[key] = sum(scores) / len(scores) if scores else 0

            overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0

            # Collect all recommendations and opportunities
            all_recommendations = []
            all_opportunities = []
            all_topics = []
            all_entities = []

            for result in valid_results:
                all_recommendations.extend(result.get('recommendations', []))
                all_opportunities.extend(result.get('optimization_opportunities', []))
                all_topics.extend(result.get('primary_topics', []))
                all_entities.extend(result.get('entities', []))

            # Remove duplicates (set order is arbitrary but deterministic
            # per run) and cap the lists below.
            unique_recommendations = list(set(all_recommendations))
            unique_topics = list(set(all_topics))
            unique_entities = list(set(all_entities))

            # Find highest and lowest performing areas
            best_score = max(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
            worst_score = min(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)

            return {
                'aggregate_scores': avg_scores,
                'overall_score': overall_avg,
                'pages_analyzed': len(valid_results),
                'best_performing_metric': {
                    'metric': best_score[0],
                    'score': best_score[1]
                },
                'lowest_performing_metric': {
                    'metric': worst_score[0],
                    'score': worst_score[1]
                },
                'consolidated_recommendations': unique_recommendations[:10],
                'all_topics': unique_topics,
                'all_entities': unique_entities,
                'high_priority_opportunities': [
                    opp for opp in all_opportunities
                    if opp.get('priority') == 'high'
                ][:5],
                'score_distribution': self._calculate_score_distribution(avg_scores)
            }

        except Exception as e:
            return {'error': f"Aggregation failed: {str(e)}"}

    def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
        """
        Generate a comprehensive GEO report

        Args:
            analysis_results (Dict): Results from calculate_aggregate_scores
            website_url (str): Optional website URL for context

        Returns:
            Dict: Comprehensive GEO report, or {'error': ...} on failure
        """
        try:
            report = {
                'report_metadata': {
                    'generated_at': self._get_timestamp(),
                    'website_url': website_url,
                    'analysis_type': 'GEO Performance Report'
                },
                'executive_summary': self._generate_executive_summary(analysis_results),
                'detailed_scores': analysis_results.get('aggregate_scores', {}),
                'performance_insights': self._generate_performance_insights(analysis_results),
                'actionable_recommendations': self._prioritize_recommendations(
                    analysis_results.get('consolidated_recommendations', [])
                ),
                'optimization_roadmap': self._create_optimization_roadmap(analysis_results),
                'competitive_position': self._assess_competitive_position(analysis_results),
                'technical_details': {
                    'pages_analyzed': analysis_results.get('pages_analyzed', 0),
                    'overall_score': analysis_results.get('overall_score', 0),
                    'score_distribution': analysis_results.get('score_distribution', {})
                }
            }

            return report

        except Exception as e:
            return {'error': f"Report generation failed: {str(e)}"}

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
        """Parse LLM response and extract the first-to-last brace JSON span.

        Falls back to {'raw_response': ..., 'parsing_error': ...} when no
        JSON object can be located or decoded.
        """
        try:
            # Take the widest span between the first '{' and the last '}'.
            json_start = response_text.find('{')
            json_end = response_text.rfind('}') + 1

            # Bug fix: rfind returns -1 when no '}' exists, making json_end
            # 0 (never -1), so the old `json_end != -1` check could never
            # fail. Require the closing brace to come after the opening one.
            if json_start != -1 and json_end > json_start:
                json_str = response_text[json_start:json_end]
                return json.loads(json_str)
            else:
                # If no JSON found, return the raw response
                return {'raw_response': response_text, 'parsing_error': 'No JSON found'}

        except json.JSONDecodeError as e:
            return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
        except Exception as e:
            return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}

    def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
        """Summarize a metric->score mapping (min/max/avg/range and band counts)."""
        if not scores:
            return {}

        score_values = list(scores.values())

        return {
            'highest_score': max(score_values),
            'lowest_score': min(score_values),
            'average_score': sum(score_values) / len(score_values),
            'score_range': max(score_values) - min(score_values),
            'scores_above_7': len([s for s in score_values if s >= 7.0]),
            'scores_below_5': len([s for s in score_values if s < 5.0])
        }

    def _generate_executive_summary(self, analysis_results: Dict[str, Any]) -> str:
        """Generate a one-paragraph executive summary from aggregate results."""
        overall_score = analysis_results.get('overall_score', 0)
        pages_analyzed = analysis_results.get('pages_analyzed', 0)

        # Map the numeric score to a qualitative performance label.
        if overall_score >= 8.0:
            performance = "excellent"
        elif overall_score >= 6.5:
            performance = "good"
        elif overall_score >= 5.0:
            performance = "moderate"
        else:
            performance = "needs improvement"

        return f"Analysis of {pages_analyzed} pages shows {performance} GEO performance with an overall score of {overall_score:.1f}/10. Key opportunities exist in {analysis_results.get('lowest_performing_metric', {}).get('metric', 'multiple areas')}."

    def _generate_performance_insights(self, analysis_results: Dict[str, Any]) -> List[str]:
        """Derive short insight sentences from best/worst metrics and spread."""
        insights = []

        best_metric = analysis_results.get('best_performing_metric', {})
        worst_metric = analysis_results.get('lowest_performing_metric', {})

        if best_metric.get('score', 0) >= 8.0:
            insights.append(f"Strong performance in {best_metric.get('metric', 'unknown')} (score: {best_metric.get('score', 0):.1f})")

        # Default 10 so a missing worst-metric entry does not trigger this.
        if worst_metric.get('score', 10) < 6.0:
            insights.append(f"Significant improvement needed in {worst_metric.get('metric', 'unknown')} (score: {worst_metric.get('score', 0):.1f})")

        score_dist = analysis_results.get('score_distribution', {})
        if score_dist.get('score_range', 0) > 3.0:
            insights.append("High variability in scores indicates inconsistent optimization across metrics")

        return insights

    def _prioritize_recommendations(self, recommendations: List[str]) -> List[Dict[str, Any]]:
        """Tag each recommendation with a high/medium/low priority and sort.

        Priority is a simple keyword heuristic; sort is stable, so original
        order is preserved within each priority band.
        """
        prioritized = []

        # Simple prioritization based on keywords
        high_impact_keywords = ['semantic', 'structure', 'authority', 'factual']
        medium_impact_keywords = ['readability', 'clarity', 'format']

        for i, rec in enumerate(recommendations):
            priority = 'low'
            if any(keyword in rec.lower() for keyword in high_impact_keywords):
                priority = 'high'
            elif any(keyword in rec.lower() for keyword in medium_impact_keywords):
                priority = 'medium'

            prioritized.append({
                'recommendation': rec,
                'priority': priority,
                'order': i + 1
            })

        # Sort by priority
        priority_order = {'high': 1, 'medium': 2, 'low': 3}
        prioritized.sort(key=lambda x: priority_order[x['priority']])

        return prioritized

    def _create_optimization_roadmap(self, analysis_results: Dict[str, Any]) -> Dict[str, List[str]]:
        """Create a phased (immediate/short-term/long-term) optimization roadmap."""
        roadmap = {
            'immediate_actions': [],
            'short_term_goals': [],
            'long_term_strategy': []
        }

        overall_score = analysis_results.get('overall_score', 0)
        worst_metric = analysis_results.get('lowest_performing_metric', {})

        # Immediate actions based on worst performing metric
        if worst_metric.get('score', 10) < 5.0:
            roadmap['immediate_actions'].append(f"Address critical issues in {worst_metric.get('metric', 'low-scoring areas')}")

        # Short-term goals
        if overall_score < 7.0:
            roadmap['short_term_goals'].append("Improve overall GEO score to above 7.0")
            roadmap['short_term_goals'].append("Enhance content structure and semantic richness")

        # Long-term strategy (always included)
        roadmap['long_term_strategy'].append("Establish consistent GEO optimization process")
        roadmap['long_term_strategy'].append("Monitor and track AI search performance")

        return roadmap

    def _assess_competitive_position(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
        """Map the overall score onto a qualitative competitive position."""
        overall_score = analysis_results.get('overall_score', 0)

        if overall_score >= 8.5:
            position = "market_leader"
            description = "Content is highly optimized for AI search engines"
        elif overall_score >= 7.0:
            position = "competitive"
            description = "Content performs well but has room for improvement"
        elif overall_score >= 5.5:
            position = "average"
            description = "Content meets basic standards but lacks optimization"
        else:
            position = "needs_work"
            description = "Content requires significant optimization for AI search"

        return {
            'position': position,
            'description': description,
            'score': overall_score,
            'percentile_estimate': min(overall_score * 10, 100)  # Rough percentile estimate
        }

    def _get_timestamp(self) -> str:
        """Get current timestamp as 'YYYY-MM-DD HH:MM:SS'."""
        from datetime import datetime
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|