import os import streamlit as st import chardet import aiohttp import asyncio import pandas as pd from io import BytesIO from langchain_community.document_loaders import WebBaseLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import FAISS from langchain_community.embeddings import SentenceTransformerEmbeddings from langchain.prompts import PromptTemplate from langchain.chains import RetrievalQA from langchain_community.llms import HuggingFaceHub from langchain.schema import Document import bs4 from PyPDF2 import PdfReader import logging import base64 from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM from langchain import HuggingFacePipeline from langchain_community.embeddings import HuggingFaceEmbeddings # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Initialize the HuggingFace Mistral-7B model llm = HuggingFaceHub( repo_id="mistralai/Mistral-7B-Instruct-v0.3", model_kwargs={"temperature": 0.7, "max_length": 512} ) # Initialize the HuggingFace model with caching @st.cache_resource def load_model(repo_id): return HuggingFaceHub( repo_id=repo_id, model_kwargs={"temperature": 0.7, "max_length": 512} ) # List of available open-source models open_source_models = { "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.3", "Llama-2-7B": "meta-llama/Llama-2-7b-chat-hf", "Zephyr-7B": "HuggingFaceH4/zephyr-7b-beta" } # Asynchronous fetching and processing URLs async def fetch_and_process_url(session, url): documents = [] try: async with session.get(url) as response: if response.status != 200: logger.error(f"Failed to fetch URL {url}: HTTP {response.status}") return documents content = await response.read() content_type = response.headers.get('Content-Type', '') if 'application/pdf' in content_type: reader = PdfReader(BytesIO(content)) documents.extend([Document(page_content=page.extract_text(), metadata={"source": url, "page": i+1}) for i, page in enumerate(reader.pages) if page.extract_text()]) else: text = content.decode('utf-8', errors='ignore') soup = bs4.BeautifulSoup(text, 'html.parser') title = soup.find('title').text if soup.find('title') else "" author = soup.find('meta', attrs={'name': 'author'})['content'] if soup.find('meta', attrs={'name': 'author'}) else "" publication_date = soup.find('meta', attrs={'name': 'publication-date'})['content'] if soup.find('meta', attrs={'name': 'publication-date'}) else "" cleaned_text = soup.get_text() documents.append(Document(page_content=cleaned_text, metadata={"source": url, "title": title, "author": author, "publication_date": publication_date})) except Exception as e: logger.error(f"Failed to fetch or process URL {url}: {e}") return documents async def load_data_async(file_paths): documents = [] async with aiohttp.ClientSession() as session: tasks = [fetch_and_process_url(session, file_path) for file_path in file_paths] #st.write(tasks) results = await asyncio.gather(*tasks, return_exceptions=True) #st.write(results) for result in results: if isinstance(result, Exception): logger.error(f"Error processing URL: {result}") elif result: documents.extend(result) logger.info(f"Total documents loaded: {len(documents)}") #st.write(documents) return documents # Initialize embeddings and vector store with caching @st.cache_resource def initialize_embeddings(_documents): if not _documents: logger.error("No documents available to embed.") raise ValueError("No documents available to embed.") embedding_model = SentenceTransformerEmbeddings( model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True} ) text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) split_documents = text_splitter.split_documents(_documents) if not split_documents: logger.error("No documents available after splitting.") raise ValueError("No documents available after splitting.") vectorstore = FAISS.from_documents(split_documents, embedding_model) return vectorstore, split_documents # Default URLs default_urls=['https://www.protectuk.police.uk/threat-risk/threat-analysis/complex-attacks', 'https://threatconnect.com/blog/preparing-organizations-to-stop-ransomware-in-2023/', 'https://arstechnica.com/security/2013/04/more-then-30-mmorpg-companies-targeted-in-ongoing-malware-attack/', 'https://www.mandiant.com/resources/insights/uncategorized-unc-threat-groups', 'https://attack.mitre.org/techniques/T1071', 'https://blog.talosintelligence.com/vulnerability-roundup-august-9-23/', 'https://cybersecurity.att.com/blogs', 'https://www.threatintelligence.com/blog/australia-data-breaches', 'https://www.netresec.com/?page=NetworkMiner', 'https://www.justice.gov/opa/pr/justice-department-announces-actions-disrupt-advanced-persistent-threat-28-botnet-infected', 'https://blog.morphisec.com/in2al5d-p3in4er', 'https://www.mandiant.com/resources/apt29-domain-frontin', 'https://blog.talosintelligence.com/threat-source-newsletter-july-20-2023/', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/08/global-ransomware-attacks-at-an-all-time-high-shows-latest-2023-state-of-ransomware-report', 'https://www.humansecurity.com/tech-engineering-blog/exfiltrating-users-private-data-using-google-analytics-to-bypass-csp', 'https://www.secureworks.com/blog/secureworks-achieves-100-visibility-and-95-detection-in-inaugural-mitre-evaluation-for-taegis', 'https://cybersecurity.att.com/blogs/labs-research/seroxen-rat-for-sale', 'https://attack.mitre.org/techniques/T1027', 'https://usa.kaspersky.com/resource-center/threats/what-is-a-honeypot', 'https://www.microsoft.com/security/blog/2021/03/02/hafnium-targeting-exchange-servers/', 'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/3am-ransomware-lockbit', 'https://blog.rsisecurity.com/how-to-identify-signs-of-ransomware-attacks/', 'https://www.gov.uk/government/news/russia-behind-cyber-attack-with-europe-wide-impact-an-hour-before-ukraine-invasion', 'https://www.reversemode.com/2022/03/satcom-terminals-under-attack-in-europe.html', 'https://ddanchev.blogspot.com', 'https://securelist.com/the-cozyduke-apt/69731/', 'https://www.viasat.com/about/newsroom/blog/ka-sat-network-cyber-attack-overview/', 'https://www.malwarebytes.com/blog/news/2023/06/cl0p-ransomware-gang-claims-first-victims-of-the-moveit-vulnerability', 'https://www.digitalguardian.com/blog/what-are-indicators-compromise', 'https://www.malwarebytes.com/business/endpoint-protection', 'https://krebsonsecurity.com/2015/07/online-cheating-site-ashleymadison-hacked/', 'https://github.com/trendmicro/tlsh', 'https://www.malwarebytes.com/business/vulnerability-patch-management', 'https://www.malwarebytes.com/premium', 'https://blog.google/threat-analysis-group/', 'https://www.trendmicro.com/vinfo/us/security/research-and-analysis/predictions/2017', 'https://www.linkedin.com/legal/privacy-policy?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fgroups%2F3054767%2F&trk=registration-frontend_join-form-privacy-policy', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/04/ransomware-review-april-2023', 'https://attack.mitre.org/techniques/T1562/006/', 'https://attack.mitre.org/techniques/T1548', 'https://www.mandiant.com/resources/how-mandiant-tracks-uncategorized-threat-actors', 'https://nakedsecurity.sophos.com', 'https://en.wikipedia.org/wiki/False_imprisonment', 'https://blog.talosintelligence.com/2018/09/vpnfilter-part-3.html', 'https://www.trendmicro.com/en_us/research/17/d/pigs-malware-examining-possible-member-winnti-group.html', 'https://www.malwarebytes.com/blog/news/2021/07/ransomwares-russia-problem', 'https://cybersecurity.att.com/blogs/security-essentials/understanding-cyber-attacker-motivations-to-best-apply-controls', 'https://www.att.com/privacy', 'https://www.itgovernance.eu/blog/en/how-to-defend-against-man-in-the-middle-attacks', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/04/kritec-art', 'https://www.musl-libc.org/', 'https://www.cisa.gov/uscert/ncas/alerts/aa22-076a', 'https://www.faronics.com/news/blog/malware-becoming-more-sophisticated-majority-of-organizations-infected', 'https://securelist.com/bad-magic-apt/109087/', 'https://www.mandiant.com/resources/blog/irongate-ics-malware', 'https://github.com/cobbr/Covenant', 'https://attack.mitre.org/techniques/T1571', 'https://www.trendmicro.com/vinfo/us/security/definition/targeted-attacks', 'https://www.malwarebytes.com/blog/news/2023/06/vice-society', 'https://www.state.gov/attribution-of-russias-malicious-cyber-activity-against-ukraine/', 'https://libevent.org/', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/socgholish-copycat-delivers-netsupport-rat', 'https://securelist.com/the-miniduke-mystery-pdf-0-day-government-spy-assembler-0x29a-micro-backdoor/31112/', 'https://www.virustotal.com/gui/file/301e819008e19b9803ad8b75ecede9ecfa5b11a3ecd8df0316914588b95371c8', 'https://blog.talosintelligence.com/need-to-know-commodity-malware/', 'https://www.reuters.com/business/media-telecom/exclusive-hackers-who-crippled-viasat-modems-ukraine-are-still-active-company-2022-03-30/', 'https://www.mandiant.com/resources/blog/detection-response-to-exploitation-of-microsoft-exchange-zero-day-vulnerabilities', 'https://gridinsoft.com/blogs/rorschach-ransomware-analysis/', 'https://www.malwarebytes.com/blog/news/2022/03/blunting-rdp-brute-force-attacks-with-rate-limiting', 'https://www.malwarebytes.com/business/edr', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/criminals-target-businesses-with-malicious-extension-for-metas-ads-manager-and-accidentally-leak-stolen-accounts', 'https://medium.com/@crovax/ducktail-multi-stage-analysis-39c2a7d9675d', 'https://www.mandiant.com/resources/evasive-attacker-leverages-solarwinds-supply-chain-compromises-with-sunburst-backdoor', 'https://www.secureworks.com/blog/qa-top-cybersecurity-eperts-weigh-in-on-incident-response-best-practices', 'https://engineering.fb.com/2023/05/03/security/malware-nodestealer-ducktail/', 'https://www.coretech.us/blog/6-motivations-of-cyber-criminals', 'https://www.webroot.com/blog', 'https://www.whitehouse.gov/briefing-room/statements-releases/2021/04/15/fact-sheet-imposing-costs-for-harmful-foreign-activities-by-the-russian-government/', 'https://www.brighttalk.com/webcast/7451/462719', 'https://www.malwarebytes.com/blog/news/2023/03/clop-ransomware-is-victimizing-goanywhere-mft-customers', 'https://krebsonsecurity.com/2023/07/top-suspect-in-2015-ashley-madison-hack-committed-suicide-in-2014/', 'https://www.malwarebytes.com/blog/news/2023/06/rewards-up-to-10-million-for-information-about-cl0p-ransomware-operation', 'https://nvd.nist.gov/vuln/detail/CVE-2022-24682', 'https://blog.talosintelligence.com/talos-ir-q2-2023-quarterly-recap/', 'https://www.forbes.com/sites/forbestechcouncil/2022/01/05/ransomware-attacks-are-not-increasing-in-sophistication/', 'https://www.kaspersky.com/enterprise-security/mitre/apt29', 'https://www.malwarebytes.com/business/contact-us/', 'https://www.mandiant.com/services/incident-response', 'https://www.virusbulletin.com/conference/vb2018/abstracts/anatomy-attack-detecting-and-defeating-crashoverride/', 'https://www.securin.io/articles/all-about-lockbit-ransomware/', 'https://attack.mitre.org/techniques/T1059/001', 'https://www.trendmicro.com/en_us/research/21/a/vpnfilter-two-years-later-routers-still-compromised-.html', 'https://www.welivesecurity.com/2017/06/12/industroyer-biggest-threat-industrial-control-systems-since-stuxnet/', 'https://attack.mitre.org/techniques/T0807/', 'https://www.linkedin.com/legal/cookie-policy?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fgroups%2F3054767%2F&trk=registration-frontend_join-form-cookie-policy', 'https://attack.mitre.org/techniques/T1112', 'https://knowledge.threatconnect.com/docs/the-threatconnect-data-model', 'https://www.ironnet.com/blog/iranian-cyber-attack-updates', 'https://www.justice.gov/usao-nj/pr/russian-national-charged-conspiring-commit-lockbit-ransomware-attacks-against-us-and', 'https://attack.mitre.org/techniques/T1059', 'https://intel471.com/blog/malvertising-surges-to-distribute-malware', 'https://securelist.com', 'https://www.mandiant.com/advantage/threat-intelligence/free-version', 'https://www.mandiant.com/sites/default/files/2021-09/rpt-apt29-hammertoss-1-1.pdf', 'https://www.mandiant.com/sites/default/files/2021-11/wp-m-unc2452-000343.pdf', 'https://www.malwarebytes.com/business/managed-detection-and-response', 'http://www.trendmicro.com/cloud-content/us/pdfs/security-intelligence/reports/rpt_fakeav-growing-problem.pdf', 'https://try.malwarebytes.com/business-2023-state-of-ransomware/?utm_source=blog&utm_medium=social&utm_campaign=b2b_ws_state_of_ransomware_2023_169048562376', 'https://lolol.farm/', 'https://www.state.gov/u-s-support-for-connectivity-and-cybersecurity-in-ukraine/', 'https://attack.mitre.org/techniques/T1548/002', 'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/critical-infrastructure-attacks', 'https://www.mandiant.com/resources/sunburst-additional-technical-details', 'https://www.washingtonpost.com/politics/2022/03/07/putins-invasion-ukraine-didnt-rely-cyber-warfare-heres-why/', 'https://www.mandiant.com/resources/blog/incontroller-state-sponsored-ics-tool', 'https://crayondata.ai/cybersecurity-the-motivation-behind-cyber-hacks-infographic/', 'https://www.threatintelligence.com/blog/data-breach-reporting', 'https://www.hackmageddon.com/', 'https://blog.talosintelligence.com/lazarus-magicrat/', 'https://web.archive.org/web/20100623223703/http://www.marriage-playground.com:80/', 'https://threatconnect.com/blog/healthcare-email-threats-are-growing-combat-them-with-phishing-analysis-response/', 'https://blog.google/threat-analysis-group/fog-of-war-how-the-ukraine-conflict-transformed-the-cyber-threat-landscape/', 'https://www.malwarebytes.com/blog/news/2018/04/fakeupdates-campaign-leverages-multiple-website-platforms', 'https://tadviser.com/index.php/Project:Cyber_%E2%80%8B_%E2%80%8B_police_of_Russia_for_information_security_training', 'https://www.splunk.com/en_us/blog/learn/ioa-indicators-of-attack.html', 'https://taosecurity.blogspot.com/2022/08/the-humble-hub.html', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/malicious-ad-for-usps-phishes-for-jpmorgan-chase-credentials', 'https://www.mandiant.com/resources/russian-targeting-gov-business', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/06/thousands-of-malicious-google-cloud-run-instances-deployed-to-scam-facebook-users', 'https://www.airforcemag.com/hackers-attacked-satellite-terminals-through-management-network-viasat-officials-say/', 'https://blog.malwarebytes.com', 'https://securelist.com/miniduke-web-based-infection-vector/57622/', 'https://www.sentinelone.com/labs/hermetic-wiper-ukraine-under-attack/', 'https://www.reuters.com/business/energy/satellite-outage-knocks-out-control-enercon-wind-turbines-2022-02-28/', 'https://www.crowdstrike.com/blog/owassrf-exploit-analysis-and-recommendations/', 'https://attack.mitre.org/techniques/T0831/', 'https://securelist.com/minidionis-one-more-apt-with-a-usage-of-cloud-drives/71443/', 'https://blog.trendmicro.com/', 'https://www.mandiant.com/resources/blog/chinese-actors-exploit-fortios-flaw', 'https://attack.mitre.org/techniques/T1480/', 'https://www.fireeye.com/content/dam/fireeye-www/global/en/current-threats/pdfs/rpt-china-chopper.pdf', 'https://blog.talosintelligence.com/rhysida-ransomware/', 'https://www.businesswire.com/news/home/20201030005321/en/20-Billion-Cyber-Insurance-Market---Global-Forecast-to-2025---ResearchAndMarkets.com', 'https://developer.mozilla.org/en-US/docs/Web/CSS/:modal', 'https://www.facebook.com/business/tools/ads-manager', 'https://news.sophos.com/en-us/2022/07/14/rapid-response-the-ngrok-incident-guide/', 'https://attack.mitre.org/techniques/T1564', 'https://www.mandiant.com/resources/blog/cosmicenergy-ot-malware-russian-response', 'https://github.com/CrowdStrike/OWASSRF', 'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/budworm-tool-update-telecoms-govt', 'https://www.linkedin.com/groups/3054767/', 'https://taosecurity.blogspot.com', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/04/ransomware-review-france', 'https://www.mandiant.com/resources/blog/zero-day-exploits-in-sonicwall-email-security-lead-to-compromise', 'https://www.linkedin.com/posts/jeromesegura_malvertising-putty-batloader-activity-7079485115545899008-7ovx?utm_source=share&utm_medium=member_desktop', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/03/new-kritec-skimmer', 'https://cloud.google.com/run/docs/overview/what-is-cloud-run', 'https://blog.talosintelligence.com/implementing-an-iso-compliant-threat-intelligence-program/', 'https://blog.talosintelligence.com/2018/06/vpnfilter-update.html?m=1', 'https://www.payplug.com/blog/dalenys-joins-forces-with-payplug-to-transform-your-payments/', 'https://www.mimecast.com/blog/zero-trust-may-hold-the-key-to-cybersecurity-in-apac/', 'https://forumspb.com/en/programme/partner-events-extra/99796/', 'https://www.anomali.com/blog/anomali-cyber-watch-pseudomanuscrypt-mass-spyware-campaign-targets-35k-systems-apt31-intrusion-set-campaign-description-countermeasures-and-code-state-sponsored-hackers-abuse-slack-api-to-steal', 'https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/302', 'https://mandiant.widen.net/s/zvmfw5fnjs/apt43-report', 'https://try.malwarebytes.com/2023-state-of-malware/?utm_source=blog&utm_medium=social&utm_campaign=b2b_ws_global_som_167578574700', 'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/qakbot-takedown-disruption', 'https://www.mandiant.com/resources/blog/unc2452-merged-into-apt29', 'https://www.mandiant.com/resources/blog/live-off-the-land-an-overview-of-unc1945', 'https://www.mimecast.com/blog/analyzing-the-integration-of-python-in-microsoft-excel/', 'https://securelist.com/miniduke-is-back-nemesis-gemina-and-the-botgen-studio/64107/', 'https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/cookies/getAll', 'https://www.withsecure.com/en/expertise/research-and-innovation/research/ducktail-an-infostealer-malware', 'https://www.cyfirma.com/outofband/windows-internet-key-exchange-ike-remote-code-execution-vulnerability-analysis/', 'https://www.rapid7.com/fundamentals/types-of-attacks/', 'https://www.hexacorn.com/blog', 'https://data.worldbank.org/indicator/NY.GDP.MKTP.CD', 'https://cybersecurity.att.com/blogs/labs-research/mac-systems-turned-into-proxy-exit-nodes-by-adload', 'https://www.crowdstrike.com/blog/patch-tuesday-analysis-november-2022/', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/05/malvertising-its-a-jungle-out-there', 'https://twitter.com/juanandres_gs', 'https://www.reuters.com/world/ukraine-annexation-votes-end-amid-russian-mobilisation-exodus-2022-09-26/', 'https://www.mandiant.com/advantage/threat-intelligence', 'https://www.malwarebytes.com/blog/news/2019/09/new-social-engineering-toolkit-draws-inspiration-from-previous-web-campaigns', 'https://www.mandiant.com/resources/blog/attackers-deploy-new-ics-attack-framework-triton', 'https://www.mandiant.com/resources/blog/Mandiant-approach-to-operational-technology-security', 'https://www.cisa.gov/uscert/ncas/alerts/aa22-054a', 'https://www.malwarebytes.com/browserguard', 'https://krebsonsecurity.com/', 'https://www.wired.com/story/viasat-internet-hack-ukraine-russia/', 'https://attack.mitre.org/techniques/T1140/', 'https://attack.mitre.org/techniques/T1059/003', 'https://stratixsystems.com/what-are-the-motivations-for-cyber-attacks/', 'https://www.mandiant.com/resources/blog/updates-on-chinese-apt-compromising-pulse-secure-vpn-devices', 'https://attack.mitre.org/techniques/T1082', 'https://www.mandiant.com/resources/blog/cyber-operations-russian-vulkan', 'https://attack.mitre.org/groups/G0016/', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/05/redstinger', 'https://techcrunch.com/2023/05/05/hacked-verified-facebook-pages-impersonating-meta-are-buying-ads-from-meta/', 'https://news.itmo.ru/en/university_live/achievements/news/12646/', 'https://attack.mitre.org/techniques/T0809/', 'https://krebsonsecurity.com/2023/07/seo-expert-hired-and-fired-by-ashley-madison-turned-on-company-promising-revenge/#more-64197', 'https://www.eset.com/int/industroyer/', 'https://www.wolfssl.com/', 'https://docs.fortinet.com/document/fortigate/7.2.3/administration-guide/109991/virtual-domains', 'https://www.linkedin.com/legal/user-agreement?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fgroups%2F3054767%2F&trk=registration-frontend_join-form-user-agreement', 'https://en.wikipedia.org/wiki/Ukrop#:~:text=Ukrop%20(Russian%3A%20%D0%A3%D0%BA%D1%80%D0%BE%D0%BF%3B%20literally,the%20Russian%20word%20for%20Ukrainians.', 'https://www.malwarebytes.com/blog/news/2023/06/lockbit-ransomware-advisory-from-cisa-provides-interesting-insights', 'https://www.mandiant.com/resources/blog/industroyer-v2-old-malware-new-tricks', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/06/the-2023-state-of-ransomware-in-education-84-increase-in-known-attacks-over-6-month-period', 'https://www.sentinelone.com/labs/acidrain-a-modem-wiper-rains-down-on-europe/', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/05/fake-system-update-drops-new-highly-evasive-loader', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/ransomware-review-july-2023', 'https://github.com/besimorhino/powercat', 'https://blog.talosintelligence.com/code-leaks-new-ransomware-actors/', 'https://www.bleepingcomputer.com/news/security/op-sharpshooter-connected-to-north-koreas-lazarus-group/', 'https://blog.talosintelligence.com/what-is-commercial-spyware/', 'https://blog.talosintelligence.com/malicious-campaigns-target-entities-in-ukraine-poland/', 'https://www.hexacorn.com/blog/2023/06/22/the-myth-of-knowing-your-org-know_your_org-docx/', 'https://github.com/samratashok/nishang', 'https://www.cyberuk.uk/', 'https://www.malwarebytes.com/blog/news/2023/06/moveit-discloses-yet-another-vulnerability-three-times-a-charm', 'https://blog.talosintelligence.com/2018/05/VPNFilter.html', 'https://www.proofpoint.com/us/blog/threat-insight/part-1-socgholish-very-real-threat-very-fake-update', 'https://attack.mitre.org/techniques/T1218', 'https://cybergeeks.tech/a-deep-dive-into-brute-ratel-c4-payloads/', 'https://www.malwarebytes.com/blog/threat-intelligence/2023/06/ransomware-review-june-2023', 'https://www.mandiant.com/resources/blog/kegtap-and-singlemalt-with-a-ransomware-chaser', 'https://attack.mitre.org/techniques/T0855/', 'https://www.brighttalk.com/webcast/7451/475010?utm_source=FireEye&utm_medium=brighttalk&utm_campaign=475010', 'https://www.malwarebytes.com/blog/news/2023/06/update-now-moveit-transfer-vulnerability-actively-exploited', 'https://d01a.github.io/pikabot/'] # Synchronous function to load initial data asynchronously @st.cache_data def load_initial_data(): return asyncio.run(load_data_async(default_urls)) # Main logic to initialize embeddings and vector store initial_data = load_initial_data() if not initial_data: raise ValueError("No documents were loaded from the provided URLs.") vectorstore_local, all_documents = initialize_embeddings(initial_data) # Define zero-shot and few-shot prompt templates for each question type def get_zero_shot_prompt(question_type): templates = { "factual": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Answer: """, "contrastive": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Answer: """, "opinion": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Answer: """, "inferential": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Answer: """ } return templates[question_type] def get_few_shot_prompt(question_type): templates = { "factual": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Example: Context: In recent years, cyber-attacks have increased significantly. Question: What are the main reasons for the rise in cyber-attacks? Answer: The main reasons include increased connectivity, outdated security infrastructure, and sophisticated attack methods. Answer: """, "contrastive": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Example: Context: Both phishing and malware attacks are common cyber threats. Question: How do phishing attacks differ from malware attacks? Answer: Phishing attacks involve tricking individuals into revealing sensitive information, while malware attacks involve malicious software designed to damage or gain unauthorized access to systems. Answer: """, "opinion": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context:{context} Question: {question} Example: Context: Many experts believe that AI can improve cybersecurity defenses. Question: Do you think AI can effectively combat cyber-attacks? Answer: Yes, AI can help identify patterns and anomalies that indicate potential cyber-attacks, making it an effective tool for enhancing cybersecurity defenses. Answer: """, "inferential": """ You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context. Context: {context} Question: {question} Example: Context: An organization faced a significant data breach last year. Question: What measures can the organization take to prevent future breaches? Answer: The organization can implement stronger access controls, regular security audits, employee training, and advanced threat detection systems. Answer: """ } return templates[question_type] # Function to identify question type def identify_question_type(question): question = question.lower() if any(keyword in question for keyword in ["what", "when", "who", "how many", "how much"]): return "factual" elif any(keyword in question for keyword in ["compare", "difference", "similar"]): return "contrastive" elif any(keyword in question for keyword in ["opinion", "feel", "think", "believe"]): return "opinion" elif any(keyword in question for keyword in ["why", "how", "cause", "reason"]): return "inferential" else: return "factual" # Default to factual if no keywords match # Function to filter documents based on metadata def filter_documents_by_metadata(query, documents): query_terms = query.lower().split() def doc_matches_query(doc): metadata = doc.metadata for term in query_terms: if any(term in str(value).lower() for key, value in metadata.items() if isinstance(value, str)): return True return False filtered_documents = [doc for doc in documents if doc_matches_query(doc)] return filtered_documents # Retrieve and generate zero-shot answers def retrieve_and_generate_zero_shot_answers(query, vectorstore, question_type): prompt_template = get_zero_shot_prompt(question_type) qa_chain = RetrievalQA.from_chain_type( llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3}), return_source_documents=True, chain_type_kwargs={"prompt": PromptTemplate(template=prompt_template, input_variables=["context", "question"])} ) response = qa_chain.invoke({"query": query}) if not response['result'].strip(): # Check if the result is empty or only whitespace return "Sorry, I don't know.", [], [] sources = [Document(page_content=doc.page_content, metadata=doc.metadata) for doc in response['source_documents']] contexts = [doc.page_content for doc in sources] return response['result'], sources, contexts # Retrieve and generate few-shot answers def retrieve_and_generate_few_shot_answers(query, vectorstore, question_type): prompt_template = get_few_shot_prompt(question_type) qa_chain = RetrievalQA.from_chain_type( llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3}), return_source_documents=True, chain_type_kwargs={"prompt": PromptTemplate(template=prompt_template, input_variables=["context", "question"])} ) response = qa_chain.invoke({"query": query}) if not response['result'].strip(): # Check if the result is empty or only whitespace return "Sorry, I don't know.", [], [] sources = [Document(page_content=doc.page_content, metadata=doc.metadata) for doc in response['source_documents']] contexts = [doc.page_content for doc in sources] return response['result'], sources, contexts # Function to generate answer using selected LLM def generate_answer_with_llm(query, selected_model, question_type=None): model = load_model(selected_model) prompt_template_zero_shot = get_zero_shot_prompt(question_type) prompt_template_few_shot = get_few_shot_prompt(question_type) context = "This is a placeholder context. Replace with actual context if available." # Replace with actual context if available formatted_query_zero_shot = prompt_template_zero_shot.format(context=context, question=query) formatted_query_few_shot = prompt_template_few_shot.format(context=context, question=query) response_zero_shot = model.generate(prompts=[formatted_query_zero_shot]) response_few_shot = model.generate(prompts=[formatted_query_few_shot]) return response_zero_shot.generations[0][0].text, response_few_shot.generations[0][0].text # Handle user query or file paths def handle_user_query_or_urls(query_mode, input_text, question_type, selected_model=None): if query_mode == "external data": #st.write(input_text) file_paths = [line.strip() for line in input_text.split('\n') if line.strip()] #st.write(file_paths) external_data = asyncio.run(load_data_async(file_paths)) #st.write(external_data) if not external_data: raise ValueError("No valid documents found from the provided file paths.") vectorstore, _ = initialize_embeddings(external_data) elif query_mode == "specific document": documents = [Document(page_content=input_text, metadata={"source": "user"})] vectorstore, _ = initialize_embeddings(documents) elif query_mode == "metadata": filtered_documents = filter_documents_by_metadata(input_text, all_documents) vectorstore, split_documents = initialize_embeddings(filtered_documents) else: # entire dataset vectorstore = vectorstore_local # Use precomputed vectorstore for the entire dataset if selected_model: zero_shot_answer, few_shot_answer = generate_answer_with_llm(input_text, selected_model, question_type=question_type) zero_shot_sources = few_shot_sources = [] zero_shot_contexts = few_shot_contexts = [] else: zero_shot_answer, zero_shot_sources, zero_shot_contexts = retrieve_and_generate_zero_shot_answers(input_text, vectorstore, question_type) few_shot_answer, few_shot_sources, few_shot_contexts = retrieve_and_generate_few_shot_answers(input_text, vectorstore, question_type) return zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts def process_csv_file(file_path, vectorstore, save_path, selected_model=None, query_mode="entire dataset"): try: # Detect file encoding with open(file_path, 'rb') as f: result = chardet.detect(f.read()) encoding = result['encoding'] df = pd.read_csv(file_path, encoding=encoding) except Exception as e: st.error(f"An error occurred while reading the CSV file: {e}") return question_types = [] zero_shot_answers = [] few_shot_answers = [] zero_shot_sources_list = [] few_shot_sources_list = [] zero_shot_contexts_list = [] few_shot_contexts_list = [] for idx, row in df.iterrows(): question = row['questions'] ground_truth = row.get('answers', None) # Assuming there is an 'answers' column for ground truth question_type = identify_question_type(question) question_types.append(question_type) if query_mode == "specific document": specific_document = row['specific document'] if not specific_document: st.error(f"Content for the specific document is missing in the CSV file for question: {question}") return # Exit function if content is missing for any specific document combined_input = specific_document + " " + question zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts = handle_user_query_or_urls(query_mode, combined_input, question_type, selected_model) else: zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts = handle_user_query_or_urls(query_mode, question, question_type, selected_model) zero_shot_answers.append(zero_shot_answer) few_shot_answers.append(few_shot_answer) zero_shot_sources_list.append("; ".join([f"source: {doc.metadata.get('source')}, page: {doc.metadata.get('page')}, title: {doc.metadata.get('title')}, author: {doc.metadata.get('author')}, date_published: {doc.metadata.get('date_published')}, chunk: {doc.metadata.get('chunk')}" for doc in zero_shot_sources])) few_shot_sources_list.append("; ".join([f"source: {doc.metadata.get('source')}, page: {doc.metadata.get('page')}, title: {doc.metadata.get('title')}, author: {doc.metadata.get('author')}, date_published: {doc.metadata.get('date_published')}, chunk: {doc.metadata.get('chunk')}" for doc in few_shot_sources])) zero_shot_contexts_list.append(" ".join(zero_shot_contexts)) few_shot_contexts_list.append(" ".join(few_shot_contexts)) result_df = pd.DataFrame({ 'questions': df['questions'], 'question_type': question_types, 'zero_shot_answers': zero_shot_answers, 'zero_shot_sources': zero_shot_sources_list, 'zero_shot_contexts': zero_shot_contexts_list, 'few_shot_answers': few_shot_answers, 'few_shot_sources': few_shot_sources_list, 'few_shot_contexts': few_shot_contexts_list }) try: if save_path: result_df.to_csv(save_path, index=False, encoding='utf-8') st.success(f"CSV file processed and saved to {save_path} successfully.") else: result_df.to_csv(file_path, index=False, encoding='utf-8') st.success("CSV file processed and updated successfully.") csv = result_df.to_csv(index=False, encoding='utf-8') b64 = base64.b64encode(csv.encode()).decode() # some strings <-> bytes conversions necessary here href = f'Download processed CSV file' st.markdown(href, unsafe_allow_html=True) except Exception as e: st.error(f"An error occurred while saving the CSV file: {e}") def format_source_document(doc): source_html = f"""

Source: {doc.metadata.get('source')}

Page: {doc.metadata.get('page', 'None')}

Title: {doc.metadata.get('title', 'None')}

Author: {doc.metadata.get('author', 'None')}

Date Published: {doc.metadata.get('date_published', 'None')}

Venue: {doc.metadata.get('venue', 'None')}

Chunk: {doc.metadata.get('chunk', 'None')}

""" return source_html # Create Streamlit interface def main(): st.title("Cybersecurity QA System") st.write("Ask any questions related to cybersecurity, including trends, attack techniques, and investigative methods.") # Step 1: Choose between RAG and LLM approach = st.selectbox("Select approach:", ["Retrieval Augmentation Generation (RAG)", "Large Language Model (LLM)"]) # Step 2: Choose single query or bulk queries query_type = st.selectbox("Select query type:", ["Single query", "Bulk queries (CSV file)"]) if approach == "Retrieval Augmentation Generation (RAG)": if query_type == "Single query": query_mode = st.selectbox("Select query mode:", ["entire dataset", "specific document", "metadata", "external data"]) question_type = st.selectbox("Select question type:", ["factual", "contrastive", "opinion", "inferential"]) if query_mode == "entire dataset": input_text = st.text_area("Enter your question to ask it from entire dataset") elif query_mode == "metadata": st.write("Enter query with metadata filters (e.g., 'title: Cybersecurity, author: John Doe, date: 2023'):") input_text = st.text_area("Enter your question with specific metadata:") elif query_mode == "specific document": st.write("Enter content from a specific document along with question") input_text = st.text_area("Enter your question from entered content:") elif query_mode == "external data": st.write("Enter external URLs/file paths along with question") input_text = st.text_area("Enter your question with URLs/file paths:") if st.button("Get Answer"): try: zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts = handle_user_query_or_urls(query_mode, input_text, question_type) st.write(f"**Question Type:** {question_type.capitalize()}") st.write("**Zero-Shot Answer:**") st.markdown(f"
{zero_shot_answer}
", unsafe_allow_html=True) st.write("**Zero-Shot Context:**") st.markdown(f"
{' '.join(zero_shot_contexts)}
", unsafe_allow_html=True) st.write("**Zero-Shot Source Documents:**") for doc in zero_shot_sources: st.markdown(format_source_document(doc), unsafe_allow_html=True) st.write("**Few-Shot Answer:**") st.markdown(f"
{few_shot_answer}
", unsafe_allow_html=True) st.write("**Few-Shot Context:**") st.markdown(f"
{' '.join(few_shot_contexts)}
", unsafe_allow_html=True) st.write("**Few-Shot Source Documents:**") for doc in few_shot_sources: st.markdown(format_source_document(doc), unsafe_allow_html=True) except Exception as e: st.error(f"An error occurred: {e}") else: # Bulk queries query_mode_csv = st.selectbox("Select query mode for CSV processing:", ["entire dataset", "specific document", "metadata", "external data"]) uploaded_file = st.file_uploader("Choose a CSV file with questions", type="csv") if st.button("Process CSV"): try: file_path = uploaded_file.name with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) save_path = st.text_input("Enter the path to save the processed CSV results file:") process_csv_file(file_path, vectorstore_local, save_path, query_mode=query_mode_csv) except Exception as e: st.error(f"An error occurred while processing the CSV file: {e}") else: # Large Language Model (LLM) selected_model = st.selectbox("Select Open-Source LLM:", ["None"] + list(open_source_models.keys())) if query_type == "Single query": question_type = st.selectbox("Select question type:", ["factual", "contrastive", "opinion", "inferential"]) input_text = st.text_area("Enter your question:") if st.button("Get Answer"): try: selected_model_repo = open_source_models[selected_model] if selected_model != "None" else None zero_shot_answer, few_shot_answer = generate_answer_with_llm(input_text, selected_model_repo, question_type=question_type) st.write(f"**Question Type:** {question_type.capitalize()}") st.write("**Zero-Shot Answer:**") st.markdown(f"
{zero_shot_answer}
", unsafe_allow_html=True) st.write("**Few-Shot Answer:**") st.markdown(f"
{few_shot_answer}
", unsafe_allow_html=True) except Exception as e: st.error(f"An error occurred: {e}") else: # Bulk queries uploaded_file = st.file_uploader("Choose a CSV file with questions", type="csv") if st.button("Process CSV"): try: file_path = uploaded_file.name with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) save_path = st.text_input("Enter the path to save the processed CSV results file:") process_csv_file(file_path, vectorstore_local, save_path, selected_model=open_source_models[selected_model]) except Exception as e: st.error(f"An error occurred while processing the CSV file: {e}") if __name__ == '__main__': main()