| #!/usr/bin/env python3 | |
| """ | |
| 🏢 TDC SharePoint Harvester - MSAL Device Code Flow | |
| Simpel, holdbar løsning der virker med enhver Microsoft konto | |
| """ | |
import hashlib
import json
import os
import webbrowser
from datetime import datetime
from pathlib import Path

from neo4j import GraphDatabase
| try: | |
| import msal | |
| import requests | |
| except ImportError: | |
| print("Installing required packages...") | |
| import subprocess | |
| subprocess.run(["pip", "install", "msal", "requests", "--quiet"]) | |
| import msal | |
| import requests | |
class TDCSharePointHarvester:
    """SharePoint harvester using MSAL device-code authentication.

    Signs in with the well-known Microsoft Graph Explorer public client id,
    so no app registration is required; results are written to Neo4j and a
    JSON dump on disk.
    """

    # Microsoft Graph public client (no app registration needed)
    CLIENT_ID = "14d82eec-204b-4c2f-b7e8-296a70dab67e"  # Microsoft Graph Explorer
    AUTHORITY = "https://login.microsoftonline.com/common"
    # NOTE(review): SCOPES is not referenced anywhere in this file — the
    # delegated scopes are hard-coded inside authenticate(). Kept unchanged
    # for backward compatibility.
    SCOPES = ["https://graph.microsoft.com/.default"]
    GRAPH_URL = "https://graph.microsoft.com/v1.0"

    # Neo4j connection. Each value can be overridden via an environment
    # variable of the same name; the in-source fallbacks are kept only for
    # backward compatibility. SECURITY: the fallback password is a live
    # credential committed to source — rotate it and rely on the env vars.
    NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://054eff27.databases.neo4j.io")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get(
        "NEO4J_PASSWORD", "Qrt37mkb0xBZ7_ts5tG1J70K2mVDGPMF2L7Njlm7cg8"
    )

    # Search terms (Danish business/IT topics to query SharePoint for)
    SEARCH_TERMS = [
        "strategi TDC",
        "cybersikkerhed",
        "cloud strategi",
        "AI strategi",
        "Columbus ERP",
        "budget 2025",
        "kundeliste",
        "rammeaftale",
        "produktkatalog",
        "SOC MDR",
        "NIS2",
        "IT arkitektur"
    ]
| def __init__(self): | |
| self.output_dir = Path("data/sharepoint_harvest") | |
| self.output_dir.mkdir(parents=True, exist_ok=True) | |
| self.access_token = None | |
| self.documents = [] | |
| self.stats = {"searches": 0, "documents": 0, "sites": 0} | |
| # Token cache | |
| self.token_cache_file = self.output_dir / "token_cache.json" | |
| # Neo4j | |
| self.neo4j = GraphDatabase.driver( | |
| self.NEO4J_URI, auth=(self.NEO4J_USER, self.NEO4J_PASSWORD) | |
| ) | |
| print("🏢 TDC SharePoint Harvester") | |
| print("=" * 50) | |
    def authenticate(self):
        """Authenticate against Microsoft Graph via the device code flow.

        Order of attempts:
          1. Silent acquisition from the MSAL token cache persisted in
             ``token_cache.json`` (no user interaction).
          2. Interactive device-code flow: prints a URL and one-time code,
             opens the browser, and blocks until sign-in completes or the
             flow expires.

        On success stores the bearer token in ``self.access_token`` and
        returns True; returns False on any failure.
        """
        print("\n🔐 AUTHENTICATION")
        print("-" * 40)
        # Load cached token if exists
        cache = msal.SerializableTokenCache()
        if self.token_cache_file.exists():
            cache.deserialize(self.token_cache_file.read_text())
        app = msal.PublicClientApplication(
            self.CLIENT_ID,
            authority=self.AUTHORITY,
            token_cache=cache
        )
        # Try silent auth first.
        # NOTE(review): the delegated scopes here are hard-coded and differ
        # from the class-level SCOPES constant (".default"), which this
        # method does not use.
        accounts = app.get_accounts()
        if accounts:
            print(f" Found cached account: {accounts[0].get('username', 'Unknown')}")
            result = app.acquire_token_silent(
                ["Sites.Read.All", "Files.Read.All", "User.Read"],
                account=accounts[0]
            )
            if result and "access_token" in result:
                self.access_token = result["access_token"]
                # NOTE(review): a silently refreshed token is NOT written back
                # to token_cache.json on this path — cache.has_state_changed is
                # only checked after the interactive flow below. Confirm
                # whether persistence is wanted here too.
                print(" ✅ Using cached token")
                return True
        # Device code flow
        print("\n 📱 Device Code Authentication:")
        flow = app.initiate_device_flow(
            scopes=["Sites.Read.All", "Files.Read.All", "User.Read"]
        )
        # A flow dict without "user_code" is treated as an initiation error.
        if "user_code" not in flow:
            print(f" ❌ Error: {flow.get('error_description', 'Unknown')}")
            return False
        print(f"\n 🔗 Gå til: {flow['verification_uri']}")
        print(f" 📝 Indtast kode: {flow['user_code']}")
        print("\n Venter på login...")
        # Open browser for the user; the code still works if this fails,
        # since the URL was printed above.
        webbrowser.open(flow['verification_uri'])
        # Blocks, polling Azure AD until the user finishes or the flow expires.
        result = app.acquire_token_by_device_flow(flow)
        if "access_token" in result:
            self.access_token = result["access_token"]
            # Persist the updated token cache for the next run
            if cache.has_state_changed:
                self.token_cache_file.write_text(cache.serialize())
            # Fetch the signed-in user's profile for the confirmation message.
            # NOTE(review): no timeout on this request — consider adding one.
            headers = {"Authorization": f"Bearer {self.access_token}"}
            user = requests.get(f"{self.GRAPH_URL}/me", headers=headers).json()
            print(f"\n ✅ Logget ind som: {user.get('displayName', 'Unknown')}")
            print(f" Email: {user.get('mail', user.get('userPrincipalName', 'Unknown'))}")
            return True
        else:
            print(f" ❌ Auth failed: {result.get('error_description', 'Unknown')}")
            return False
| def api_get(self, endpoint: str) -> dict: | |
| """Make authenticated API call""" | |
| headers = {"Authorization": f"Bearer {self.access_token}"} | |
| response = requests.get(f"{self.GRAPH_URL}{endpoint}", headers=headers) | |
| if response.status_code == 200: | |
| return response.json() | |
| return {} | |
| def api_post(self, endpoint: str, data: dict) -> dict: | |
| """Make authenticated POST call""" | |
| headers = { | |
| "Authorization": f"Bearer {self.access_token}", | |
| "Content-Type": "application/json" | |
| } | |
| response = requests.post(f"{self.GRAPH_URL}{endpoint}", headers=headers, json=data) | |
| if response.status_code == 200: | |
| return response.json() | |
| return {} | |
| def search(self, query: str) -> list: | |
| """Search SharePoint via Graph API""" | |
| results = [] | |
| search_body = { | |
| "requests": [{ | |
| "entityTypes": ["driveItem", "listItem", "site"], | |
| "query": {"queryString": query}, | |
| "from": 0, | |
| "size": 25 | |
| }] | |
| } | |
| data = self.api_post("/search/query", search_body) | |
| for result_set in data.get("value", []): | |
| for container in result_set.get("hitsContainers", []): | |
| for hit in container.get("hits", []): | |
| resource = hit.get("resource", {}) | |
| doc = { | |
| "id": resource.get("id", ""), | |
| "title": resource.get("name", "") or resource.get("displayName", ""), | |
| "url": resource.get("webUrl", ""), | |
| "summary": hit.get("summary", "")[:500], | |
| "type": resource.get("@odata.type", "").split(".")[-1], | |
| "modified": resource.get("lastModifiedDateTime", ""), | |
| "query": query | |
| } | |
| if doc["title"] and doc["url"]: | |
| results.append(doc) | |
| return results | |
| def get_sites(self) -> list: | |
| """Get accessible SharePoint sites""" | |
| sites = [] | |
| data = self.api_get("/sites?search=*") | |
| for site in data.get("value", []): | |
| sites.append({ | |
| "id": site.get("id"), | |
| "name": site.get("displayName"), | |
| "url": site.get("webUrl"), | |
| "description": site.get("description", "") | |
| }) | |
| self.stats["sites"] += 1 | |
| return sites | |
| def get_my_drive(self) -> list: | |
| """Get OneDrive files""" | |
| files = [] | |
| data = self.api_get("/me/drive/root/children") | |
| for item in data.get("value", []): | |
| files.append({ | |
| "name": item.get("name"), | |
| "url": item.get("webUrl"), | |
| "type": "folder" if item.get("folder") else "file", | |
| "size": item.get("size", 0), | |
| "modified": item.get("lastModifiedDateTime", "") | |
| }) | |
| return files | |
| def save_to_neo4j(self, doc: dict): | |
| """Save document to Neo4j""" | |
| content_hash = hashlib.md5( | |
| f"{doc.get('title','')}:{doc.get('url','')}".encode() | |
| ).hexdigest() | |
| with self.neo4j.session() as session: | |
| session.run(""" | |
| MERGE (d:SharePointDocument {contentHash: $hash}) | |
| SET d.title = $title, | |
| d.url = $url, | |
| d.summary = $summary, | |
| d.docType = $type, | |
| d.searchQuery = $query, | |
| d.modified = $modified, | |
| d.harvestedAt = datetime() | |
| MERGE (ds:DataSource {name: 'TDC_SharePoint'}) | |
| MERGE (d)-[:HARVESTED_FROM]->(ds) | |
| """, | |
| hash=content_hash, | |
| title=doc.get('title', '')[:200], | |
| url=doc.get('url', ''), | |
| summary=doc.get('summary', '')[:1000], | |
| type=doc.get('type', ''), | |
| query=doc.get('query', ''), | |
| modified=doc.get('modified', '') | |
| ) | |
| self.stats["documents"] += 1 | |
    def run(self):
        """Run the full harvest: auth → sites → searches → OneDrive → JSON dump.

        Side effects: writes documents to Neo4j, writes
        ``sharepoint_harvest.json`` under the output directory, prints a
        progress report, and closes the Neo4j driver when done. Returns
        early (without cleanup) if authentication fails.
        """
        if not self.authenticate():
            return
        # 1. Get sites
        print("\n📍 SHAREPOINT SITES")
        print("-" * 40)
        sites = self.get_sites()
        print(f" Found {len(sites)} sites")
        for site in sites[:10]:
            print(f" • {site['name']}: {site['url']}")
        # 2. Search documents
        print("\n🔍 SEARCHING DOCUMENTS")
        print("-" * 40)
        all_docs = []
        seen_urls = set()  # de-duplicates hits returned by multiple queries
        for query in self.SEARCH_TERMS:
            print(f" Søger: {query}", end="")
            results = self.search(query)
            self.stats["searches"] += 1
            new_count = 0
            for doc in results:
                # Persist a URL only the first time any query returns it
                if doc["url"] not in seen_urls:
                    seen_urls.add(doc["url"])
                    all_docs.append(doc)
                    self.save_to_neo4j(doc)
                    new_count += 1
            print(f" → {len(results)} results ({new_count} new)")
        # 3. OneDrive (listed in the JSON dump only, not saved to Neo4j)
        print("\n📁 ONEDRIVE FILES")
        print("-" * 40)
        my_files = self.get_my_drive()
        print(f" Found {len(my_files)} items")
        # 4. Summary
        print("\n" + "=" * 50)
        print("📊 HARVEST COMPLETE")
        print("=" * 50)
        print(f" 🔍 Searches: {self.stats['searches']}")
        print(f" 📍 Sites: {self.stats['sites']}")
        print(f" 📄 Documents: {self.stats['documents']}")
        # Save a JSON snapshot of everything harvested this run
        output = {
            "timestamp": datetime.now().isoformat(),
            "stats": self.stats,
            "sites": sites,
            "documents": all_docs,
            "onedrive": my_files
        }
        output_file = self.output_dir / "sharepoint_harvest.json"
        # ensure_ascii=False keeps Danish characters readable in the dump
        output_file.write_text(json.dumps(output, indent=2, ensure_ascii=False))
        print(f"\n📁 Saved: {output_file}")
        self.neo4j.close()
| if __name__ == "__main__": | |
| harvester = TDCSharePointHarvester() | |
| harvester.run() | |