Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| 🧬 CLAK DIGITAL CLONE PROFILER | |
| =============================== | |
| Analyserer dine sendte mails og beskeder for at skabe en digital klon | |
| af din kommunikationsstil, tænkemåde og viden. | |
| Features: | |
| - Harvest ALLE sendte mails og beskeder | |
| - Analysér kommunikationsstil (tone, ordvalg, struktur) | |
| - Identificér ekspertiseområder og viden | |
| - Byg personlighedsprofil | |
| - Generér "clone prompts" til AI-modeller | |
| - Gem som embeddings i Neo4j for RAG | |
| Output: | |
| - CloneProfile node i Neo4j | |
| - Communication patterns | |
| - Knowledge domains | |
| - Writing style analysis | |
| - Ready-to-use system prompt | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import hashlib | |
| import re | |
| from pathlib import Path | |
| from datetime import datetime, timedelta | |
| from dataclasses import dataclass, asdict, field | |
| from typing import List, Dict, Any, Optional, Tuple | |
| from collections import Counter, defaultdict | |
| import statistics | |
| # Neo4j | |
| from neo4j import GraphDatabase | |
| # ============================================================ | |
| # CONFIGURATION | |
| # ============================================================ | |
# Neo4j connection settings.
# SECURITY: the database password used to be hard-coded here. Credentials must
# never live in source control, so it is now read from the environment. The
# previously committed password should be considered leaked and be rotated.
# URI and user keep their former values as defaults; the password has none.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Home directory of the current user (USERPROFILE on Windows, "~" otherwise).
USER_HOME = Path(os.environ.get("USERPROFILE", os.path.expanduser("~")))

# Identity of the person being profiled.
CLONE_IDENTITY = {
    "name": "Claus Vesterlund Hansen",
    "email_patterns": ["claus", "cvh", "clha", "vesterlund"],
    "role": "Cyber Security Specialist / AI Strategist",
    "organization": "TDC",
}
| # ============================================================ | |
| # DATA CLASSES | |
| # ============================================================ | |
@dataclass
class SentMessage:
    """A single message sent by the profiled user.

    Declared as a dataclass because instances are built with keyword
    arguments and ``attachments`` relies on ``field(default_factory=list)``,
    which only has an effect on dataclasses.
    """
    id: str
    source: str  # origin system, e.g. "outlook" or "teams"
    recipients: List[str]
    subject: str
    body: str
    timestamp: str
    thread_id: Optional[str] = None
    is_reply: bool = False
    attachments: List[str] = field(default_factory=list)  # fresh list per instance
@dataclass
class CommunicationPattern:
    """Aggregated communication habits across a set of messages.

    Declared as a dataclass so it can be built with keyword arguments and
    serialised with ``dataclasses.asdict``.
    """
    avg_message_length: int
    avg_sentence_length: float
    greeting_style: List[str]
    closing_style: List[str]
    common_phrases: List[Tuple[str, int]]  # (phrase, occurrences)
    punctuation_style: Dict[str, int]
    emoji_usage: int
    formality_score: float  # 0-1, 0=casual, 1=formal
    response_patterns: List[str]
@dataclass
class KnowledgeDomain:
    """A subject area detected in the messages, with supporting evidence.

    Declared as a dataclass so it can be built with keyword arguments and
    serialised with ``dataclasses.asdict``.
    """
    domain: str
    keywords: List[str]  # the domain keywords that were actually found
    message_count: int
    confidence: float
    sample_contexts: List[str]
@dataclass
class WritingStyle:
    """Numeric and lexical description of how the user writes.

    Declared as a dataclass so it can be built with keyword arguments and
    serialised with ``dataclasses.asdict``.
    """
    vocabulary_richness: float
    avg_word_length: float
    sentence_starters: List[Tuple[str, int]]  # (starter, occurrences)
    transition_words: List[str]
    question_frequency: float
    exclamation_frequency: float
    danish_vs_english: float  # 0=all Danish, 1=all English
    technical_density: float
    action_orientation: float  # how action-oriented the wording is
@dataclass
class CloneProfile:
    """The complete clone profile assembled from all analyses.

    Declared as a dataclass because it is constructed with keyword arguments.
    """
    identity: Dict[str, str]
    communication: CommunicationPattern
    knowledge_domains: List[KnowledgeDomain]
    writing_style: WritingStyle
    personality_traits: List[str]
    expertise_areas: List[str]
    common_topics: List[Tuple[str, int]]  # (word, occurrences)
    message_stats: Dict[str, int]
    system_prompt: str
    created_at: str
| # ============================================================ | |
| # TEXT ANALYSIS | |
| # ============================================================ | |
class TextAnalyzer:
    """Heuristic text metrics used to profile a person's writing.

    Every helper is a static method, so it can be called on the class or on
    an instance. Indicator words are matched as whole words rather than raw
    substrings, so that e.g. "hi" is not found inside "this" and "tak" is not
    found inside "kontakt".
    """

    # Danish and English stop words
    STOP_WORDS = {
        'og', 'i', 'at', 'er', 'det', 'en', 'til', 'på', 'for', 'med', 'af',
        'den', 'de', 'som', 'har', 'jeg', 'vi', 'du', 'kan', 'vil', 'skal',
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'must', 'shall', 'to', 'of', 'in', 'for',
        'on', 'with', 'at', 'by', 'from', 'or', 'as', 'this', 'that', 'it',
        'ikke', 'så', 'men', 'om', 'fra', 'var', 'være', 'blevet', 'have',
        'bliver', 'eller', 'også', 'hvis', 'når', 'hvad', 'hvor', 'hvordan'
    }

    # Formality indicators
    FORMAL_INDICATORS = [
        'venlig hilsen', 'med venlig hilsen', 'mvh', 'best regards',
        'kind regards', 'regards', 'sincerely', 'hereby', 'hermed',
        'vedrørende', 'angående', 'concerning', 'regarding'
    ]
    INFORMAL_INDICATORS = [
        'hej', 'hi', 'hey', 'tak', 'thanks', 'thx', 'cool', 'nice',
        'super', 'fedt', 'awesome', 'great', ':-)', ':)', '👍'
    ]

    # Technical terms
    TECH_TERMS = [
        'api', 'cloud', 'azure', 'aws', 'docker', 'kubernetes', 'k8s',
        'cyber', 'security', 'soc', 'mdr', 'nis2', 'gdpr', 'compliance',
        'ai', 'ml', 'llm', 'rag', 'embedding', 'vector', 'neo4j',
        'python', 'javascript', 'react', 'node', 'sql', 'database',
        'endpoint', 'firewall', 'vulnerability', 'threat', 'incident'
    ]

    # Action words
    ACTION_WORDS = [
        'gør', 'lav', 'send', 'tjek', 'undersøg', 'analyser', 'implementer',
        'do', 'make', 'send', 'check', 'investigate', 'analyze', 'implement',
        'create', 'build', 'deploy', 'test', 'review', 'approve', 'schedule'
    ]

    # Set copies of the public lists for O(1) membership tests per word.
    _TECH_TERM_SET = frozenset(TECH_TERMS)
    _ACTION_WORD_SET = frozenset(ACTION_WORDS)

    # Function words whose presence marks a text as Danish or English.
    _DANISH_MARKERS = ('og', 'er', 'det', 'en', 'af', 'til', 'på', 'med', 'har', 'jeg', 'vi', 'kan')
    _ENGLISH_MARKERS = ('the', 'is', 'are', 'and', 'or', 'with', 'for', 'have', 'has', 'can', 'will')

    # Openers recognised on the first line. "hejsa" is listed explicitly
    # because whole-word matching no longer lets "hej" cover it.
    _GREETINGS = ('hej', 'hejsa', 'hi', 'hey', 'kære', 'dear', 'godmorgen',
                  'good morning', 'hello')
    _GREETING_RE = re.compile(
        r'(?:' + '|'.join(re.escape(g) for g in _GREETINGS) + r')(?!\w)'
    )

    # Words that mark a sign-off line.
    _CLOSING_WORDS = ('mvh', 'vh', 'hilsen', 'regards', 'best', 'tak', 'thanks')

    @staticmethod
    def _has_term(text_lower: str, term: str) -> bool:
        """Return True if ``term`` occurs in the already lower-cased text.

        Terms that begin and end with a letter or digit must appear as whole
        words. Emoticons and emoji have no word boundaries and are matched as
        plain substrings.
        """
        if term[0].isalnum() and term[-1].isalnum():
            pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
            return re.search(pattern, text_lower) is not None
        return term in text_lower

    @staticmethod
    def extract_sentences(text: str) -> List[str]:
        """Split text on runs of '.', '!' and '?'.

        Fragments of five characters or fewer (after stripping) are dropped.
        """
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if len(s.strip()) > 5]

    @staticmethod
    def extract_words(text: str) -> List[str]:
        """Return lower-cased words of two or more letters (a-z plus æ, ø, å)."""
        return re.findall(r'\b[a-zA-ZæøåÆØÅ]{2,}\b', text.lower())

    @staticmethod
    def extract_phrases(text: str, n: int = 3) -> List[str]:
        """Return all n-word phrases that are not made up solely of stop words."""
        words = TextAnalyzer.extract_words(text)
        phrases = []
        for i in range(len(words) - n + 1):
            window = words[i:i + n]
            if not all(w in TextAnalyzer.STOP_WORDS for w in window):
                phrases.append(' '.join(window))
        return phrases

    @staticmethod
    def calculate_formality(text: str) -> float:
        """Return a formality score in [0, 1]; 0.5 when no indicator is found.

        The score is the share of formal indicators among all indicators
        (formal + informal) present in the text. Each indicator counts once.
        """
        text_lower = text.lower()
        formal_count = sum(
            1 for ind in TextAnalyzer.FORMAL_INDICATORS
            if TextAnalyzer._has_term(text_lower, ind)
        )
        informal_count = sum(
            1 for ind in TextAnalyzer.INFORMAL_INDICATORS
            if TextAnalyzer._has_term(text_lower, ind)
        )
        total = formal_count + informal_count
        if total == 0:
            return 0.5
        return formal_count / total

    @staticmethod
    def calculate_technical_density(text: str) -> float:
        """Return the share of technical terms among all words, x10, capped at 1.0."""
        words = TextAnalyzer.extract_words(text)
        if not words:
            return 0.0
        tech_count = sum(1 for w in words if w in TextAnalyzer._TECH_TERM_SET)
        return min(tech_count / len(words) * 10, 1.0)

    @staticmethod
    def calculate_action_orientation(text: str) -> float:
        """Return the share of action words among all words, x20, capped at 1.0."""
        words = TextAnalyzer.extract_words(text)
        if not words:
            return 0.0
        action_count = sum(1 for w in words if w in TextAnalyzer._ACTION_WORD_SET)
        return min(action_count / len(words) * 20, 1.0)

    @staticmethod
    def detect_language_ratio(text: str) -> float:
        """Estimate the Danish/English mix (0 = Danish, 1 = English).

        Each marker word counts once if present. Every æ/ø/å character adds
        two points to the Danish side. Returns 0.5 when nothing is detected.
        """
        danish_chars = len(re.findall(r'[æøåÆØÅ]', text))
        # Tokenise instead of searching for " word " so that words next to
        # newlines or punctuation are recognised too.
        tokens = set(TextAnalyzer.extract_words(text))
        danish_count = sum(1 for w in TextAnalyzer._DANISH_MARKERS if w in tokens)
        english_count = sum(1 for w in TextAnalyzer._ENGLISH_MARKERS if w in tokens)
        danish_count += danish_chars * 2  # Danish letters weigh extra
        total = danish_count + english_count
        if total == 0:
            return 0.5
        return english_count / total

    @staticmethod
    def extract_greeting(text: str) -> Optional[str]:
        """Return the first line (max 50 chars) if it opens with a greeting word."""
        first_line = text.strip().split('\n')[0].strip()
        if TextAnalyzer._GREETING_RE.match(first_line.lower()):
            return first_line[:50]
        return None

    @staticmethod
    def extract_closing(text: str) -> Optional[str]:
        """Return the first sign-off line among the last three non-empty lines.

        A line counts as a sign-off when it contains a closing word as a whole
        word, or starts with "/" (as in "/Claus"). A "/" elsewhere in the line
        (URLs, dates) is ignored. Messages with fewer than two non-empty lines
        have no closing.
        """
        lines = [l.strip() for l in text.strip().split('\n') if l.strip()]
        if len(lines) < 2:
            return None
        for line in lines[-3:]:
            lowered = line.lower()
            if lowered.startswith('/') or any(
                TextAnalyzer._has_term(lowered, word)
                for word in TextAnalyzer._CLOSING_WORDS
            ):
                return line[:50]
        return None
| # ============================================================ | |
| # SENT MESSAGE HARVESTER | |
| # ============================================================ | |
class SentMessageHarvester:
    """Collects the user's sent messages; only Outlook is harvested here."""

    def __init__(self):
        self.messages: List[SentMessage] = []
        self.stats = {"outlook_sent": 0, "teams_sent": 0, "total": 0}

    def harvest_outlook_sent(self, days_back: int = 365, max_items: int = 2000) -> List[SentMessage]:
        """Read mail items from the Outlook "Sent Items" folder via COM.

        Args:
            days_back: Only items sent within this many days are collected.
            max_items: Upper bound on the number of items collected.

        Returns:
            ``self.messages`` with the harvested items appended. Any failure
            to reach Outlook (including a missing ``win32com``) is printed
            and leaves the list unchanged.
        """
        print(" 📤 Harvester sendte Outlook emails...")
        try:
            import win32com.client
            import pythoncom

            # NOTE(review): CoInitialize is not paired with CoUninitialize;
            # confirm whether that matters for the calling thread.
            pythoncom.CoInitialize()
            outlook = win32com.client.Dispatch("Outlook.Application")
            namespace = outlook.GetNamespace("MAPI")
            # Sent Items folder (5 = olFolderSentMail)
            sent_folder = namespace.GetDefaultFolder(5)
            items = sent_folder.Items
            # Newest first, so iteration can stop at the first too-old item.
            items.Sort("[SentOn]", True)

            cutoff = datetime.now() - timedelta(days=days_back)
            count = 0
            skipped = 0
            for item in items:
                try:
                    if item.Class != 43:  # MailItem
                        continue

                    sent_time = item.SentOn
                    if hasattr(sent_time, 'year'):
                        # Rebuild a naive datetime with the time of day, so
                        # mail sent on the cutoff date but after the cutoff
                        # time is not discarded.
                        item_date = datetime(
                            sent_time.year, sent_time.month, sent_time.day,
                            getattr(sent_time, 'hour', 0),
                            getattr(sent_time, 'minute', 0),
                            getattr(sent_time, 'second', 0),
                        )
                        if item_date < cutoff:
                            break

                    # Recipients (COM collections are 1-based)
                    recipients = []
                    for i in range(1, item.Recipients.Count + 1):
                        try:
                            recipients.append(str(item.Recipients.Item(i).Address))
                        except Exception:
                            # Best effort: one bad recipient must not drop the mail.
                            continue

                    subject = str(item.Subject or "")
                    is_reply = subject.lower().startswith(('re:', 'sv:', 'aw:'))

                    # Attachment file names (1-based as well)
                    attachments = []
                    for i in range(1, item.Attachments.Count + 1):
                        try:
                            attachments.append(str(item.Attachments.Item(i).FileName))
                        except Exception:
                            continue

                    msg = SentMessage(
                        id=item.EntryID,
                        source="outlook",
                        recipients=recipients,
                        subject=subject,
                        body=str(item.Body or ""),
                        timestamp=sent_time.strftime("%Y-%m-%d %H:%M") if hasattr(sent_time, 'strftime') else str(sent_time),
                        thread_id=str(item.ConversationID) if hasattr(item, 'ConversationID') else None,
                        is_reply=is_reply,
                        attachments=attachments
                    )
                    self.messages.append(msg)
                    count += 1
                    if count >= max_items:  # Limit
                        break
                except Exception:
                    # A single unreadable item is skipped, but counted so the
                    # loss is visible instead of silent.
                    skipped += 1
                    continue

            self.stats["outlook_sent"] = count
            print(f" ✅ {count} sendte emails harvested")
            if skipped:
                print(f" ⚠️ {skipped} emails sprunget over pga. fejl")
        except Exception as e:
            print(f" ❌ Outlook fejl: {e}")
        return self.messages

    def get_all_sent(self, days_back: int = 365) -> List[SentMessage]:
        """Run every harvester and record the total in ``self.stats``."""
        self.harvest_outlook_sent(days_back)
        self.stats["total"] = len(self.messages)
        return self.messages
| # ============================================================ | |
| # CLONE PROFILE BUILDER | |
| # ============================================================ | |
class CloneProfileBuilder:
    """Builds a CloneProfile from a list of sent messages."""

    # Keywords up to this length must match as whole words; longer keywords
    # only need to start a word, so inflected forms ("projektet") still count.
    _SHORT_KEYWORD_LEN = 3

    def __init__(self, messages: List[SentMessage]):
        self.messages = messages
        self.analyzer = TextAnalyzer()
        # Aggregated text used by the corpus-wide analyses.
        self.all_bodies = " ".join([m.body for m in messages])
        self.all_subjects = " ".join([m.subject for m in messages])
        self.all_text = f"{self.all_subjects} {self.all_bodies}"

    @staticmethod
    def _keyword_pattern(keyword: str):
        """Compile a case-insensitive pattern that finds ``keyword`` at a word start.

        Plain substring search counted "ai" inside "mail" and "ml" inside
        "html", which inflated domain scores.
        """
        suffix = r'(?!\w)' if len(keyword) <= CloneProfileBuilder._SHORT_KEYWORD_LEN else ''
        return re.compile(r'(?<!\w)' + re.escape(keyword) + suffix, re.IGNORECASE)

    def analyze_communication_patterns(self) -> CommunicationPattern:
        """Measure message/sentence length, greetings, closings, phrases and tone."""
        print(" 🔍 Analyserer kommunikationsmønstre...")
        bodies = [m.body for m in self.messages]

        # Message lengths (empty bodies are ignored)
        message_lengths = [len(body) for body in bodies if body]
        avg_length = int(statistics.mean(message_lengths)) if message_lengths else 0

        # Sentence lengths in words
        sentence_lengths = [
            len(sentence.split())
            for body in bodies
            for sentence in TextAnalyzer.extract_sentences(body)
        ]
        avg_sentence = statistics.mean(sentence_lengths) if sentence_lengths else 0

        # Most frequent greetings and closings
        greeting_counter = Counter(
            g for g in (TextAnalyzer.extract_greeting(body) for body in bodies) if g
        )
        top_greetings = [g for g, _ in greeting_counter.most_common(5)]
        closing_counter = Counter(
            c for c in (TextAnalyzer.extract_closing(body) for body in bodies) if c
        )
        top_closings = [c for c, _ in closing_counter.most_common(5)]

        # Common phrases (3-grams)
        phrase_counter = Counter()
        for body in bodies:
            phrase_counter.update(TextAnalyzer.extract_phrases(body, 3))
        common_phrases = phrase_counter.most_common(20)

        # Punctuation style
        punct_counts = {
            'exclamation': self.all_text.count('!'),
            'question': self.all_text.count('?'),
            'ellipsis': self.all_text.count('...'),
            'dash': self.all_text.count(' - '),
            'colon': self.all_text.count(':'),
        }

        # Emoji usage
        emoji_pattern = re.compile(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]')
        emoji_count = len(emoji_pattern.findall(self.all_text))

        # Formality: average of the per-message scores
        formality_scores = [TextAnalyzer.calculate_formality(body) for body in bodies if body]
        avg_formality = statistics.mean(formality_scores) if formality_scores else 0.5

        # Response patterns: the first sentence of each reply
        response_starters = []
        for m in self.messages:
            if m.is_reply and m.body:
                sentences = TextAnalyzer.extract_sentences(m.body)
                if sentences:
                    response_starters.append(sentences[0][:100])
        top_responses = [r for r, _ in Counter(response_starters).most_common(10)]

        return CommunicationPattern(
            avg_message_length=avg_length,
            avg_sentence_length=round(avg_sentence, 1),
            greeting_style=top_greetings,
            closing_style=top_closings,
            common_phrases=common_phrases,
            punctuation_style=punct_counts,
            emoji_usage=emoji_count,
            formality_score=round(avg_formality, 2),
            response_patterns=top_responses
        )

    def analyze_knowledge_domains(self) -> List[KnowledgeDomain]:
        """Score predefined knowledge domains by keyword hits in the corpus.

        A domain is reported when its keywords occur more than 10 times in
        total. Confidence is 60% keyword coverage plus 40% frequency (capped
        at 100 hits). ``message_count`` holds the number of keyword hits.
        The result is sorted by confidence, highest first.
        """
        print(" 🧠 Identificerer vidensdomæner...")
        # Domain definitions
        domain_definitions = {
            "Cybersecurity": ["cyber", "security", "soc", "mdr", "threat", "vulnerability", "incident", "firewall", "endpoint", "nis2"],
            "Cloud & Infrastructure": ["cloud", "azure", "aws", "docker", "kubernetes", "infrastructure", "server", "hosting", "devops"],
            "AI & Machine Learning": ["ai", "ml", "llm", "gpt", "copilot", "machine learning", "neural", "embedding", "rag", "model"],
            "Business Strategy": ["strategi", "strategy", "roadmap", "budget", "forecast", "business", "plan", "goals", "kpi"],
            "Customer Relations": ["kunde", "customer", "klient", "client", "account", "partner", "relation", "service"],
            "Compliance & Governance": ["compliance", "gdpr", "nis2", "audit", "policy", "governance", "risk", "regulation"],
            "Project Management": ["projekt", "project", "deadline", "milestone", "delivery", "sprint", "agile", "task"],
            "Data & Analytics": ["data", "analytics", "database", "sql", "neo4j", "graph", "analysis", "insight", "dashboard"],
        }
        # Sample contexts are only taken from the first 100 messages.
        sample_bodies = [m.body for m in self.messages[:100]]

        domains = []
        for domain_name, keywords in domain_definitions.items():
            total_count = 0
            matched_keywords = []
            sample_contexts = []
            for kw in keywords:
                pattern = self._keyword_pattern(kw)
                count = len(pattern.findall(self.all_text))
                if count == 0:
                    continue
                total_count += count
                matched_keywords.append(kw)

                # At most three sample contexts per domain, one per keyword.
                if len(sample_contexts) >= 3:
                    continue
                for body in sample_bodies:
                    match = pattern.search(body)
                    if match is None:
                        continue
                    # Up to 50 characters on either side of the keyword.
                    start = max(0, match.start() - 50)
                    end = min(len(body), match.end() + 50)
                    context = body[start:end].replace('\n', ' ').strip()
                    if context:
                        sample_contexts.append(f"...{context}...")
                    break

            if total_count > 10:  # Minimum threshold
                keyword_coverage = len(matched_keywords) / len(keywords)
                frequency_score = min(total_count / 100, 1.0)
                confidence = (keyword_coverage * 0.6 + frequency_score * 0.4)
                domains.append(KnowledgeDomain(
                    domain=domain_name,
                    keywords=matched_keywords,
                    message_count=total_count,
                    confidence=round(confidence, 2),
                    sample_contexts=sample_contexts
                ))

        domains.sort(key=lambda x: x.confidence, reverse=True)
        return domains

    def analyze_writing_style(self) -> WritingStyle:
        """Compute vocabulary, sentence-starter, punctuation and language metrics."""
        print(" ✍️ Analyserer skrivestil...")
        all_words = TextAnalyzer.extract_words(self.all_text)
        unique_words = set(all_words)

        # Vocabulary richness (type-token ratio)
        vocab_richness = len(unique_words) / len(all_words) if all_words else 0
        # Average word length
        avg_word_len = statistics.mean([len(w) for w in all_words]) if all_words else 0

        # Sentence starters: the first two words of every sentence
        sentence_starters = []
        for m in self.messages:
            for sentence in TextAnalyzer.extract_sentences(m.body):
                words = sentence.split()
                if words:
                    sentence_starters.append(' '.join(words[:2]).lower())
        top_starters = Counter(sentence_starters).most_common(15)

        # Transition words, matched against whole words so that "men" is not
        # found inside "comment".
        transition_patterns = [
            'derfor', 'desuden', 'derudover', 'men', 'dog', 'imidlertid',
            'therefore', 'however', 'moreover', 'furthermore', 'additionally',
            'først', 'derefter', 'så', 'endelig', 'first', 'then', 'finally'
        ]
        found_transitions = [t for t in transition_patterns if t in unique_words]

        # Question and exclamation frequency per sentence
        total_sentences = len(TextAnalyzer.extract_sentences(self.all_text))
        question_freq = self.all_text.count('?') / total_sentences if total_sentences else 0
        exclamation_freq = self.all_text.count('!') / total_sentences if total_sentences else 0

        lang_ratio = TextAnalyzer.detect_language_ratio(self.all_text)
        tech_density = TextAnalyzer.calculate_technical_density(self.all_text)
        action_orient = TextAnalyzer.calculate_action_orientation(self.all_text)

        return WritingStyle(
            vocabulary_richness=round(vocab_richness, 3),
            avg_word_length=round(avg_word_len, 1),
            sentence_starters=top_starters,
            transition_words=found_transitions,
            question_frequency=round(question_freq, 3),
            exclamation_frequency=round(exclamation_freq, 3),
            danish_vs_english=round(lang_ratio, 2),
            technical_density=round(tech_density, 2),
            action_orientation=round(action_orient, 2)
        )

    def infer_personality_traits(self, comm: CommunicationPattern, style: WritingStyle, domains: List[KnowledgeDomain]) -> List[str]:
        """Translate the numeric metrics into human-readable trait labels."""
        print(" 🎭 Udleder personlighedstræk...")
        traits = []

        # Formality
        if comm.formality_score > 0.6:
            traits.append("Professional and formal communicator")
        elif comm.formality_score < 0.4:
            traits.append("Casual and approachable communicator")
        else:
            traits.append("Balanced formal/informal communicator")

        # Message length
        if comm.avg_message_length > 500:
            traits.append("Thorough and detailed in explanations")
        elif comm.avg_message_length < 150:
            traits.append("Concise and to-the-point")

        # Technical density
        if style.technical_density > 0.3:
            traits.append("Highly technical and precise")
        # Action orientation
        if style.action_orientation > 0.3:
            traits.append("Action-oriented and decisive")
        # Question frequency
        if style.question_frequency > 0.15:
            traits.append("Inquisitive and engaged")
        # Vocabulary richness
        if style.vocabulary_richness > 0.4:
            traits.append("Articulate with diverse vocabulary")

        # Language mix
        if style.danish_vs_english > 0.6:
            traits.append("Primarily English communicator")
        elif style.danish_vs_english < 0.3:
            traits.append("Primarily Danish communicator")
        else:
            traits.append("Bilingual (Danish/English)")

        # Emoji usage
        if comm.emoji_usage > 50:
            traits.append("Expressive with visual elements")

        # Strongest domain (the list arrives sorted by confidence)
        if domains:
            traits.append(f"Deep expertise in {domains[0].domain}")
        return traits

    def generate_system_prompt(self, profile_data: dict) -> str:
        """Render the (Danish) system prompt from a profile dictionary.

        ``profile_data`` is read through the keys ``identity``,
        ``communication``, ``writing_style``, ``knowledge_domains`` and
        ``personality_traits``; missing keys fall back to defaults.
        """
        print(" 📝 Genererer system prompt...")
        identity = profile_data.get('identity', {})
        comm = profile_data.get('communication', {})
        style = profile_data.get('writing_style', {})
        domains = profile_data.get('knowledge_domains', [])
        traits = profile_data.get('personality_traits', [])

        expertise = [d['domain'] for d in domains[:5]] if domains else []
        # Common phrases for authenticity
        phrases = [p[0] for p in comm.get('common_phrases', [])[:5]]
        # `or` so the fallback also applies when the key holds an empty
        # list, which `.get(key, default)` alone does not cover.
        greetings = (comm.get('greeting_style') or ['Hej'])[:2]
        closings = (comm.get('closing_style') or ['Mvh'])[:2]

        trait_lines = "\n".join('- ' + t for t in traits)
        expertise_lines = "\n".join('- ' + e for e in expertise)
        phrase_lines = "\n".join('- "' + p + '"' for p in phrases[:5])

        formality = comm.get('formality_score', 0.5)
        avg_message_length = comm.get('avg_message_length', 200)
        lang_ratio = style.get('danish_vs_english', 0.5)
        if lang_ratio > 0.6:
            language = 'Primært engelsk'
        elif lang_ratio < 0.3:
            language = 'Primært dansk'
        else:
            language = 'Blanding af dansk og engelsk'
        tone = 'formel' if formality > 0.6 else 'afslappet'
        depth = 'grundigt og detaljeret' if avg_message_length > 400 else 'kortfattet og præcist'

        # The prompt lines start at column 0 on purpose: any indentation
        # would become part of the generated text.
        prompt = f"""Du er en AI-klon af {identity.get('name', 'bruger')}, {identity.get('role', 'specialist')} hos {identity.get('organization', 'virksomhed')}.
## Personlighed og Kommunikationsstil
{trait_lines}
## Ekspertiseområder
{expertise_lines}
## Skrivestil
- Gennemsnitlig beskedlængde: {avg_message_length} tegn
- Sætningslængde: {comm.get('avg_sentence_length', 15)} ord
- Formalitetsniveau: {round(formality * 100)}% formel
- Teknisk densitet: {round(style.get('technical_density', 0.2) * 100)}%
- Sprog: {language}
## Typiske fraser og udtryk
{phrase_lines}
## Hilsner og afslutniger
- Start ofte med: {', '.join(greetings)}
- Afslut ofte med: {', '.join(closings)}
## Instruktioner
1. Kommunikér som {identity.get('name', 'brugeren')} ville gøre
2. Brug samme tone, ordvalg og struktur
3. Træk på viden inden for ekspertiseområderne
4. Vær {tone} men professionel
5. Svar {depth}
6. Inkludér tekniske detaljer når relevant
7. Vær handlingsorienteret og løsningsfokuseret"""
        return prompt

    def build_profile(self) -> CloneProfile:
        """Run all analyses and assemble the complete CloneProfile."""
        print("\n" + "=" * 60)
        print("🧬 BUILDING CLONE PROFILE")
        print("=" * 60)
        print(f" 📨 Analyserer {len(self.messages)} sendte beskeder...")

        # Run analyses
        communication = self.analyze_communication_patterns()
        domains = self.analyze_knowledge_domains()
        writing_style = self.analyze_writing_style()

        # Dictionary form consumed by the prompt generator
        profile_data = {
            'identity': CLONE_IDENTITY,
            'communication': asdict(communication),
            'knowledge_domains': [asdict(d) for d in domains],
            'writing_style': asdict(writing_style),
        }
        traits = self.infer_personality_traits(communication, writing_style, domains)
        profile_data['personality_traits'] = traits
        system_prompt = self.generate_system_prompt(profile_data)

        # Common topics: frequent non-stop-words longer than three letters
        all_words = TextAnalyzer.extract_words(self.all_text)
        word_counts = Counter(w for w in all_words if w not in TextAnalyzer.STOP_WORDS and len(w) > 3)
        common_topics = word_counts.most_common(30)

        message_stats = {
            "total_messages": len(self.messages),
            "replies": sum(1 for m in self.messages if m.is_reply),
            "with_attachments": sum(1 for m in self.messages if m.attachments),
            "total_recipients": len(set(r for m in self.messages for r in m.recipients)),
            "total_words": len(all_words),
            "unique_words": len(set(all_words)),
        }

        # Expertise areas (simplified): names of the top domains
        expertise = [d.domain for d in domains[:7]]

        return CloneProfile(
            identity=CLONE_IDENTITY,
            communication=communication,
            knowledge_domains=domains,
            writing_style=writing_style,
            personality_traits=traits,
            expertise_areas=expertise,
            common_topics=common_topics,
            message_stats=message_stats,
            system_prompt=system_prompt,
            created_at=datetime.now().isoformat()
        )
| # ============================================================ | |
| # NEO4J STORAGE | |
| # ============================================================ | |
class CloneProfileStorage:
    """Writes a CloneProfile to Neo4j as a profile node with related nodes."""

    def __init__(self):
        credentials = (NEO4J_USER, NEO4J_PASSWORD)
        self.driver = GraphDatabase.driver(NEO4J_URI, auth=credentials)

    def save_profile(self, profile: CloneProfile):
        """Upsert the profile node and link its domains, phrases and topics.

        The profile node is keyed by an MD5 hash of "clone:<name>". Each
        statement is sent with its own ``session.run`` call.
        """
        print("\n 💾 Gemmer profil i Neo4j...")
        identity = profile.identity
        profile_hash = hashlib.md5(f"clone:{identity['name']}".encode()).hexdigest()

        with self.driver.session() as session:
            # Main CloneProfile node
            profile_params = {
                "hash": profile_hash,
                "name": identity['name'],
                "role": identity['role'],
                "org": identity['organization'],
                "prompt": profile.system_prompt,
                "traits": profile.personality_traits,
                "expertise": profile.expertise_areas,
                "avgLen": profile.communication.avg_message_length,
                "formality": profile.communication.formality_score,
                "techDensity": profile.writing_style.technical_density,
                "vocabRich": profile.writing_style.vocabulary_richness,
                "totalMsgs": profile.message_stats['total_messages'],
                "totalWords": profile.message_stats['total_words'],
                "uniqueWords": profile.message_stats['unique_words'],
            }
            session.run("""
                MERGE (p:CloneProfile {profileHash: $hash})
                ON CREATE SET
                p.name = $name,
                p.role = $role,
                p.organization = $org,
                p.createdAt = datetime()
                ON MATCH SET
                p.updatedAt = datetime()
                SET
                p.systemPrompt = $prompt,
                p.personalityTraits = $traits,
                p.expertiseAreas = $expertise,
                p.avgMessageLength = $avgLen,
                p.formalityScore = $formality,
                p.technicalDensity = $techDensity,
                p.vocabularyRichness = $vocabRich,
                p.totalMessages = $totalMsgs,
                p.totalWords = $totalWords,
                p.uniqueWords = $uniqueWords
                """,
                **profile_params
            )

            # One KnowledgeDomain node per detected domain
            for entry in profile.knowledge_domains:
                domain_params = {
                    "name": entry.domain,
                    "keywords": entry.keywords,
                    "hash": profile_hash,
                    "confidence": entry.confidence,
                    "count": entry.message_count,
                }
                session.run("""
                    MERGE (d:KnowledgeDomain {name: $name})
                    ON CREATE SET d.keywords = $keywords
                    WITH d
                    MATCH (p:CloneProfile {profileHash: $hash})
                    MERGE (p)-[r:HAS_EXPERTISE]->(d)
                    SET r.confidence = $confidence, r.messageCount = $count
                    """,
                    **domain_params
                )

            # CommonPhrase nodes (for RAG), keyed by a short hash of the text
            for text, frequency in profile.communication.common_phrases[:20]:
                short_hash = hashlib.md5(text.encode()).hexdigest()[:12]
                phrase_params = {
                    "phash": short_hash,
                    "phrase": text,
                    "hash": profile_hash,
                    "count": frequency,
                }
                session.run("""
                    MERGE (ph:CommonPhrase {hash: $phash})
                    ON CREATE SET ph.phrase = $phrase
                    WITH ph
                    MATCH (p:CloneProfile {profileHash: $hash})
                    MERGE (p)-[r:USES_PHRASE]->(ph)
                    SET r.frequency = $count
                    """,
                    **phrase_params
                )

            # CloneTopic nodes for the most frequent words
            for word, frequency in profile.common_topics[:30]:
                topic_params = {
                    "topic": word,
                    "hash": profile_hash,
                    "count": frequency,
                }
                session.run("""
                    MERGE (t:CloneTopic {name: $topic})
                    WITH t
                    MATCH (p:CloneProfile {profileHash: $hash})
                    MERGE (p)-[r:DISCUSSES]->(t)
                    SET r.frequency = $count
                    """,
                    **topic_params
                )

        print(" ✅ Profil gemt i Neo4j!")

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()
| # ============================================================ | |
| # MAIN | |
| # ============================================================ | |
class ClakCloneProfiler:
    """Orchestrates harvesting, profile building, storage and reporting."""

    def __init__(self):
        # Output goes to a directory relative to the current working directory.
        self.output_dir = Path("data/clone_profile")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def run(self, days_back: int = 365, save_to_neo4j: bool = True):
        """Run the complete profiling pipeline.

        Args:
            days_back: How many days of sent messages to analyse.
            save_to_neo4j: When true, the profile is also written to Neo4j.

        Returns:
            The built CloneProfile, or ``None`` when no sent messages were found.
        """
        print("\n" + "=" * 60)
        print("🧬 CLAK DIGITAL CLONE PROFILER")
        print("=" * 60)
        print(f" 👤 Target: {CLONE_IDENTITY['name']}")
        print(f" 📅 Periode: Sidste {days_back} dage")
        print("=" * 60)

        # Harvest sent messages
        print("\n📤 HARVESTING SENT MESSAGES")
        harvester = SentMessageHarvester()
        messages = harvester.get_all_sent(days_back)
        if not messages:
            print("❌ Ingen sendte beskeder fundet!")
            return None
        print("\n 📊 Stats:")
        print(f" Outlook sendt: {harvester.stats['outlook_sent']}")
        print(f" Total: {harvester.stats['total']}")

        # Build profile
        builder = CloneProfileBuilder(messages)
        profile = builder.build_profile()

        # Save to Neo4j
        if save_to_neo4j:
            storage = CloneProfileStorage()
            try:
                storage.save_profile(profile)
            finally:
                # Always release the driver, even when the save fails.
                storage.close()

        # Save to JSON (timestamped file name)
        output_file = self.output_dir / f"clone_profile_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        # Convert profile to a serializable dict
        profile_dict = {
            "identity": profile.identity,
            "communication": asdict(profile.communication),
            "knowledge_domains": [asdict(d) for d in profile.knowledge_domains],
            "writing_style": asdict(profile.writing_style),
            "personality_traits": profile.personality_traits,
            "expertise_areas": profile.expertise_areas,
            "common_topics": profile.common_topics,
            "message_stats": profile.message_stats,
            "system_prompt": profile.system_prompt,
            "created_at": profile.created_at
        }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(profile_dict, f, indent=2, ensure_ascii=False)

        # Save system prompt separately (overwritten on every run)
        prompt_file = self.output_dir / "system_prompt.txt"
        with open(prompt_file, 'w', encoding='utf-8') as f:
            f.write(profile.system_prompt)

        # Print summary
        self._print_summary(profile)
        print("\n💾 Filer gemt:")
        print(f" 📄 {output_file}")
        print(f" 📝 {prompt_file}")
        return profile

    def _print_summary(self, profile: CloneProfile):
        """Print a human-readable summary of the profile to stdout."""
        print("\n" + "=" * 60)
        print("📊 CLONE PROFILE SUMMARY")
        print("=" * 60)

        print("\n👤 IDENTITY")
        print(f" Navn: {profile.identity['name']}")
        print(f" Rolle: {profile.identity['role']}")
        print(f" Organisation: {profile.identity['organization']}")

        print("\n📝 COMMUNICATION STYLE")
        print(f" Gns. beskedlængde: {profile.communication.avg_message_length} tegn")
        print(f" Gns. sætningslængde: {profile.communication.avg_sentence_length} ord")
        print(f" Formalitet: {round(profile.communication.formality_score * 100)}%")
        print(f" Emoji brug: {profile.communication.emoji_usage}")

        print("\n✍️ WRITING STYLE")
        print(f" Ordforråd-rigdom: {round(profile.writing_style.vocabulary_richness * 100)}%")
        print(f" Teknisk densitet: {round(profile.writing_style.technical_density * 100)}%")
        print(f" Action-orientering: {round(profile.writing_style.action_orientation * 100)}%")
        print(f" Sprog mix: {'Primært engelsk' if profile.writing_style.danish_vs_english > 0.6 else 'Primært dansk' if profile.writing_style.danish_vs_english < 0.3 else 'Blanding'}")

        print("\n🧠 KNOWLEDGE DOMAINS")
        for domain in profile.knowledge_domains[:5]:
            # One bar segment per 10% confidence
            confidence_bar = "█" * int(domain.confidence * 10)
            print(f" {domain.domain}: {confidence_bar} ({round(domain.confidence * 100)}%)")

        print("\n🎭 PERSONALITY TRAITS")
        for trait in profile.personality_traits:
            print(f" • {trait}")

        print("\n📈 MESSAGE STATS")
        print(f" Total beskeder: {profile.message_stats['total_messages']}")
        print(f" Replies: {profile.message_stats['replies']}")
        print(f" Med vedhæftninger: {profile.message_stats['with_attachments']}")
        print(f" Unikke modtagere: {profile.message_stats['total_recipients']}")
        print(f" Total ord: {profile.message_stats['total_words']}")
        print(f" Unikt ordforråd: {profile.message_stats['unique_words']}")

        print("\n🏷️ TOP TOPICS")
        for topic, count in profile.common_topics[:10]:
            print(f" {topic}: {count}")

        print("\n" + "=" * 60)
        print("📝 SYSTEM PROMPT (første 500 tegn):")
        print("-" * 60)
        print(profile.system_prompt[:500] + "...")
        print("=" * 60)
def main():
    """Command-line entry point: parse the options and run the profiler."""
    import argparse

    cli = argparse.ArgumentParser(description="CLAK Digital Clone Profiler")
    cli.add_argument("--days", type=int, default=365, help="Dage tilbage at analysere")
    cli.add_argument("--no-neo4j", action="store_true", help="Skip Neo4j storage")
    options = cli.parse_args()

    # The flag is negative ("--no-neo4j"), the parameter positive.
    use_neo4j = not options.no_neo4j
    ClakCloneProfiler().run(days_back=options.days, save_to_neo4j=use_neo4j)


if __name__ == "__main__":
    main()