import asyncio
import base64
import logging
import os
import re
from io import BytesIO

import aiohttp
import bs4
import chardet
import pandas as pd
import streamlit as st
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM


# Setup logging
# Configure the root logger at INFO level and create this module's logger,
# which the loader and embedding functions below use to report failures.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the HuggingFace Mistral-7B model
# Module-level default LLM; the retrieve_and_generate_* functions read this
# global directly when building their RetrievalQA chains.
# NOTE(review): unlike load_model() below, this instance is created outside
# any Streamlit cache, so it is presumably rebuilt on every script rerun --
# confirm whether it should be obtained through load_model() instead.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_kwargs={"temperature": 0.7, "max_length": 512}
)

# Initialize the HuggingFace model with caching
@st.cache_resource
def load_model(repo_id):
    """Return a HuggingFaceHub LLM for ``repo_id``.

    The result is wrapped in ``st.cache_resource``, so repeated calls with the
    same ``repo_id`` reuse the cached object instead of constructing a new one.

    Args:
        repo_id: Hugging Face Hub repository id of the model to use.

    Returns:
        A ``HuggingFaceHub`` instance configured with temperature 0.7 and a
        ``max_length`` of 512.
    """
    generation_settings = {"temperature": 0.7, "max_length": 512}
    return HuggingFaceHub(repo_id=repo_id, model_kwargs=generation_settings)

# List of available open-source models
# Maps a short display name to a Hugging Face Hub repository id. The
# "Mistral-7B" entry is the same repo id used for the module-level `llm`.
# NOTE(review): presumably these values are what callers pass to
# load_model(repo_id) -- confirm against the model-selection UI code.
open_source_models = {
    "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.3",
    "Llama-2-7B": "meta-llama/Llama-2-7b-chat-hf",
    "Zephyr-7B": "HuggingFaceH4/zephyr-7b-beta"
}

# Asynchronous fetching and processing URLs
def _documents_from_pdf(content, url):
    """Return one Document per PDF page that yields extractable text."""
    reader = PdfReader(BytesIO(content))
    documents = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Extract once per page: text extraction is the expensive step and
        # calling it twice (filter + value) doubles the work.
        page_text = page.extract_text()
        if page_text:
            documents.append(Document(page_content=page_text, metadata={"source": url, "page": page_number}))
    return documents


def _meta_content(soup, name):
    """Return the ``content`` of ``<meta name=...>``, or "" when unavailable."""
    tag = soup.find('meta', attrs={'name': name})
    # A <meta> tag can exist without a content attribute; indexing it would
    # raise KeyError and cause the whole page to be discarded by the caller.
    return tag.get('content', '') if tag else ""


def _document_from_html(content, url, charset=None):
    """Parse an HTML body into a single Document with basic page metadata."""
    try:
        # Prefer the charset declared by the server; fall back to UTF-8.
        text = content.decode(charset or 'utf-8', errors='ignore')
    except LookupError:
        # The declared charset is not a codec Python knows about.
        text = content.decode('utf-8', errors='ignore')
    soup = bs4.BeautifulSoup(text, 'html.parser')
    title_tag = soup.find('title')
    metadata = {
        "source": url,
        "title": title_tag.text if title_tag else "",
        "author": _meta_content(soup, 'author'),
        "publication_date": _meta_content(soup, 'publication-date'),
    }
    return Document(page_content=soup.get_text(), metadata=metadata)


async def fetch_and_process_url(session, url):
    """Fetch ``url`` and convert its body into a list of Documents.

    Responses whose Content-Type contains ``application/pdf`` produce one
    Document per page with text; every other response is parsed as HTML and
    produces a single Document carrying title/author/publication-date
    metadata.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        url: The address to download.

    Returns:
        A list of Documents. The list is empty when the server does not answer
        with HTTP 200 or when fetching/parsing fails; failures are logged and
        never raised, so one bad URL cannot abort a batch.
    """
    documents = []
    try:
        async with session.get(url) as response:
            if response.status != 200:
                logger.error(f"Failed to fetch URL {url}: HTTP {response.status}")
                return documents

            content = await response.read()
            content_type = response.headers.get('Content-Type', '')

            if 'application/pdf' in content_type:
                documents.extend(_documents_from_pdf(content, url))
            else:
                documents.append(_document_from_html(content, url, response.charset))
    except Exception as e:
        # Deliberate best-effort boundary: log and return what we have.
        logger.error(f"Failed to fetch or process URL {url}: {e}")

    return documents

async def load_data_async(file_paths):
    """Fetch every URL in ``file_paths`` concurrently and merge the Documents.

    A single ``aiohttp.ClientSession`` is shared by all requests. Fetches run
    through ``asyncio.gather`` with ``return_exceptions=True``, so an exception
    from one URL is logged and skipped rather than cancelling the others.

    Args:
        file_paths: Iterable of URLs to download.

    Returns:
        A flat list with the Documents produced for all URLs, in input order.
    """
    collected = []
    async with aiohttp.ClientSession() as session:
        outcomes = await asyncio.gather(
            *(fetch_and_process_url(session, path) for path in file_paths),
            return_exceptions=True,
        )
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                logger.error(f"Error processing URL: {outcome}")
                continue
            if outcome:
                collected.extend(outcome)
    logger.info(f"Total documents loaded: {len(collected)}")
    return collected

# Initialize embeddings and vector store with caching
@st.cache_resource
def initialize_embeddings(_documents):
    """Split the documents into chunks and index them in a FAISS store.

    Chunks are 500 characters with a 50-character overlap and are embedded on
    the CPU with the ``all-MiniLM-L6-v2`` sentence-transformer, with embedding
    normalisation enabled.

    Args:
        _documents: Documents to index.
            NOTE(review): per the Streamlit caching docs, a leading underscore
            excludes a parameter from the cache key, so the cached result would
            not change when different documents are passed -- confirm this is
            intended.

    Returns:
        A ``(vectorstore, split_documents)`` tuple: the FAISS index and the
        list of chunks it was built from.

    Raises:
        ValueError: If ``_documents`` is empty, or if splitting yields no
            chunks.
    """
    if not _documents:
        logger.error("No documents available to embed.")
        raise ValueError("No documents available to embed.")

    embedder = SentenceTransformerEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(_documents)
    if not chunks:
        logger.error("No documents available after splitting.")
        raise ValueError("No documents available after splitting.")

    return FAISS.from_documents(chunks, embedder), chunks

# Default source URLs (HTML pages and PDFs) downloaded by load_initial_data() to build the knowledge base
default_urls=['https://www.protectuk.police.uk/threat-risk/threat-analysis/complex-attacks', 
             'https://threatconnect.com/blog/preparing-organizations-to-stop-ransomware-in-2023/', 
             'https://arstechnica.com/security/2013/04/more-then-30-mmorpg-companies-targeted-in-ongoing-malware-attack/', 
             'https://www.mandiant.com/resources/insights/uncategorized-unc-threat-groups',
             'https://attack.mitre.org/techniques/T1071', 
             'https://blog.talosintelligence.com/vulnerability-roundup-august-9-23/',
             'https://cybersecurity.att.com/blogs', 
             'https://www.threatintelligence.com/blog/australia-data-breaches',
             'https://www.netresec.com/?page=NetworkMiner', 
             'https://www.justice.gov/opa/pr/justice-department-announces-actions-disrupt-advanced-persistent-threat-28-botnet-infected',
             'https://blog.morphisec.com/in2al5d-p3in4er',
             'https://www.mandiant.com/resources/apt29-domain-frontin',
             'https://blog.talosintelligence.com/threat-source-newsletter-july-20-2023/',
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/08/global-ransomware-attacks-at-an-all-time-high-shows-latest-2023-state-of-ransomware-report', 
             'https://www.humansecurity.com/tech-engineering-blog/exfiltrating-users-private-data-using-google-analytics-to-bypass-csp',
             'https://www.secureworks.com/blog/secureworks-achieves-100-visibility-and-95-detection-in-inaugural-mitre-evaluation-for-taegis',
             'https://cybersecurity.att.com/blogs/labs-research/seroxen-rat-for-sale',
             'https://attack.mitre.org/techniques/T1027',
             'https://usa.kaspersky.com/resource-center/threats/what-is-a-honeypot', 
             'https://www.microsoft.com/security/blog/2021/03/02/hafnium-targeting-exchange-servers/', 
             'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/3am-ransomware-lockbit', 
             'https://blog.rsisecurity.com/how-to-identify-signs-of-ransomware-attacks/', 
             'https://www.gov.uk/government/news/russia-behind-cyber-attack-with-europe-wide-impact-an-hour-before-ukraine-invasion', 
             'https://www.reversemode.com/2022/03/satcom-terminals-under-attack-in-europe.html', 
             'https://ddanchev.blogspot.com', 
             'https://securelist.com/the-cozyduke-apt/69731/',
             'https://www.viasat.com/about/newsroom/blog/ka-sat-network-cyber-attack-overview/',
             'https://www.malwarebytes.com/blog/news/2023/06/cl0p-ransomware-gang-claims-first-victims-of-the-moveit-vulnerability', 
             'https://www.digitalguardian.com/blog/what-are-indicators-compromise',
             'https://www.malwarebytes.com/business/endpoint-protection', 
             'https://krebsonsecurity.com/2015/07/online-cheating-site-ashleymadison-hacked/',
             'https://github.com/trendmicro/tlsh', 
             'https://www.malwarebytes.com/business/vulnerability-patch-management', 
             'https://www.malwarebytes.com/premium',
             'https://blog.google/threat-analysis-group/', 
             'https://www.trendmicro.com/vinfo/us/security/research-and-analysis/predictions/2017',
             'https://www.linkedin.com/legal/privacy-policy?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fgroups%2F3054767%2F&trk=registration-frontend_join-form-privacy-policy',
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/04/ransomware-review-april-2023', 
             'https://attack.mitre.org/techniques/T1562/006/', 
             'https://attack.mitre.org/techniques/T1548', 
             'https://www.mandiant.com/resources/how-mandiant-tracks-uncategorized-threat-actors',
             'https://nakedsecurity.sophos.com',
             'https://en.wikipedia.org/wiki/False_imprisonment',
             'https://blog.talosintelligence.com/2018/09/vpnfilter-part-3.html', 
             'https://www.trendmicro.com/en_us/research/17/d/pigs-malware-examining-possible-member-winnti-group.html', 
             'https://www.malwarebytes.com/blog/news/2021/07/ransomwares-russia-problem', 
             'https://cybersecurity.att.com/blogs/security-essentials/understanding-cyber-attacker-motivations-to-best-apply-controls', 
             'https://www.att.com/privacy', 
             'https://www.itgovernance.eu/blog/en/how-to-defend-against-man-in-the-middle-attacks',
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/04/kritec-art', 
             'https://www.musl-libc.org/',
             'https://www.cisa.gov/uscert/ncas/alerts/aa22-076a', 
             'https://www.faronics.com/news/blog/malware-becoming-more-sophisticated-majority-of-organizations-infected',
             'https://securelist.com/bad-magic-apt/109087/',
             'https://www.mandiant.com/resources/blog/irongate-ics-malware', 
             'https://github.com/cobbr/Covenant', 
             'https://attack.mitre.org/techniques/T1571',
             'https://www.trendmicro.com/vinfo/us/security/definition/targeted-attacks',
             'https://www.malwarebytes.com/blog/news/2023/06/vice-society', 
             'https://www.state.gov/attribution-of-russias-malicious-cyber-activity-against-ukraine/', 
             'https://libevent.org/',
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/socgholish-copycat-delivers-netsupport-rat', 
             'https://securelist.com/the-miniduke-mystery-pdf-0-day-government-spy-assembler-0x29a-micro-backdoor/31112/',
             'https://www.virustotal.com/gui/file/301e819008e19b9803ad8b75ecede9ecfa5b11a3ecd8df0316914588b95371c8',
             'https://blog.talosintelligence.com/need-to-know-commodity-malware/',
             'https://www.reuters.com/business/media-telecom/exclusive-hackers-who-crippled-viasat-modems-ukraine-are-still-active-company-2022-03-30/', 
             'https://www.mandiant.com/resources/blog/detection-response-to-exploitation-of-microsoft-exchange-zero-day-vulnerabilities', 
             'https://gridinsoft.com/blogs/rorschach-ransomware-analysis/', 
             'https://www.malwarebytes.com/blog/news/2022/03/blunting-rdp-brute-force-attacks-with-rate-limiting', 
             'https://www.malwarebytes.com/business/edr', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/criminals-target-businesses-with-malicious-extension-for-metas-ads-manager-and-accidentally-leak-stolen-accounts',
             'https://medium.com/@crovax/ducktail-multi-stage-analysis-39c2a7d9675d', 'https://www.mandiant.com/resources/evasive-attacker-leverages-solarwinds-supply-chain-compromises-with-sunburst-backdoor',
             'https://www.secureworks.com/blog/qa-top-cybersecurity-eperts-weigh-in-on-incident-response-best-practices', 
             'https://engineering.fb.com/2023/05/03/security/malware-nodestealer-ducktail/', 
             'https://www.coretech.us/blog/6-motivations-of-cyber-criminals', 
             'https://www.webroot.com/blog',
             'https://www.whitehouse.gov/briefing-room/statements-releases/2021/04/15/fact-sheet-imposing-costs-for-harmful-foreign-activities-by-the-russian-government/', 
             'https://www.brighttalk.com/webcast/7451/462719',
             'https://www.malwarebytes.com/blog/news/2023/03/clop-ransomware-is-victimizing-goanywhere-mft-customers',
             'https://krebsonsecurity.com/2023/07/top-suspect-in-2015-ashley-madison-hack-committed-suicide-in-2014/',
             'https://www.malwarebytes.com/blog/news/2023/06/rewards-up-to-10-million-for-information-about-cl0p-ransomware-operation',
             'https://nvd.nist.gov/vuln/detail/CVE-2022-24682',
             'https://blog.talosintelligence.com/talos-ir-q2-2023-quarterly-recap/',
             'https://www.forbes.com/sites/forbestechcouncil/2022/01/05/ransomware-attacks-are-not-increasing-in-sophistication/',
             'https://www.kaspersky.com/enterprise-security/mitre/apt29', 
             'https://www.malwarebytes.com/business/contact-us/', 
             'https://www.mandiant.com/services/incident-response',
             'https://www.virusbulletin.com/conference/vb2018/abstracts/anatomy-attack-detecting-and-defeating-crashoverride/', 
             'https://www.securin.io/articles/all-about-lockbit-ransomware/', 
             'https://attack.mitre.org/techniques/T1059/001', 
             'https://www.trendmicro.com/en_us/research/21/a/vpnfilter-two-years-later-routers-still-compromised-.html',
             'https://www.welivesecurity.com/2017/06/12/industroyer-biggest-threat-industrial-control-systems-since-stuxnet/', 
             'https://attack.mitre.org/techniques/T0807/', 
             'https://www.linkedin.com/legal/cookie-policy?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fgroups%2F3054767%2F&trk=registration-frontend_join-form-cookie-policy',
             'https://attack.mitre.org/techniques/T1112', 
             'https://knowledge.threatconnect.com/docs/the-threatconnect-data-model', 
             'https://www.ironnet.com/blog/iranian-cyber-attack-updates',
             'https://www.justice.gov/usao-nj/pr/russian-national-charged-conspiring-commit-lockbit-ransomware-attacks-against-us-and', 
             'https://attack.mitre.org/techniques/T1059',
             'https://intel471.com/blog/malvertising-surges-to-distribute-malware',
             'https://securelist.com',
             'https://www.mandiant.com/advantage/threat-intelligence/free-version', 
             'https://www.mandiant.com/sites/default/files/2021-09/rpt-apt29-hammertoss-1-1.pdf', 
             'https://www.mandiant.com/sites/default/files/2021-11/wp-m-unc2452-000343.pdf',
             'https://www.malwarebytes.com/business/managed-detection-and-response',
             'http://www.trendmicro.com/cloud-content/us/pdfs/security-intelligence/reports/rpt_fakeav-growing-problem.pdf', 
             'https://try.malwarebytes.com/business-2023-state-of-ransomware/?utm_source=blog&utm_medium=social&utm_campaign=b2b_ws_state_of_ransomware_2023_169048562376',
             'https://lolol.farm/', 
             'https://www.state.gov/u-s-support-for-connectivity-and-cybersecurity-in-ukraine/',
             'https://attack.mitre.org/techniques/T1548/002', 
             'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/critical-infrastructure-attacks',
             'https://www.mandiant.com/resources/sunburst-additional-technical-details',
             'https://www.washingtonpost.com/politics/2022/03/07/putins-invasion-ukraine-didnt-rely-cyber-warfare-heres-why/',
             'https://www.mandiant.com/resources/blog/incontroller-state-sponsored-ics-tool',
             'https://crayondata.ai/cybersecurity-the-motivation-behind-cyber-hacks-infographic/', 
             'https://www.threatintelligence.com/blog/data-breach-reporting', 
             'https://www.hackmageddon.com/', 
             'https://blog.talosintelligence.com/lazarus-magicrat/', 
             'https://web.archive.org/web/20100623223703/http://www.marriage-playground.com:80/', 
             'https://threatconnect.com/blog/healthcare-email-threats-are-growing-combat-them-with-phishing-analysis-response/',
             'https://blog.google/threat-analysis-group/fog-of-war-how-the-ukraine-conflict-transformed-the-cyber-threat-landscape/',
             'https://www.malwarebytes.com/blog/news/2018/04/fakeupdates-campaign-leverages-multiple-website-platforms', 
             'https://tadviser.com/index.php/Project:Cyber_%E2%80%8B_%E2%80%8B_police_of_Russia_for_information_security_training', 
             'https://www.splunk.com/en_us/blog/learn/ioa-indicators-of-attack.html', 
             'https://taosecurity.blogspot.com/2022/08/the-humble-hub.html',
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/malicious-ad-for-usps-phishes-for-jpmorgan-chase-credentials',
             'https://www.mandiant.com/resources/russian-targeting-gov-business', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/06/thousands-of-malicious-google-cloud-run-instances-deployed-to-scam-facebook-users',
             'https://www.airforcemag.com/hackers-attacked-satellite-terminals-through-management-network-viasat-officials-say/', 
             'https://blog.malwarebytes.com', 'https://securelist.com/miniduke-web-based-infection-vector/57622/', 
             'https://www.sentinelone.com/labs/hermetic-wiper-ukraine-under-attack/',
             'https://www.reuters.com/business/energy/satellite-outage-knocks-out-control-enercon-wind-turbines-2022-02-28/', 
             'https://www.crowdstrike.com/blog/owassrf-exploit-analysis-and-recommendations/',
             'https://attack.mitre.org/techniques/T0831/', 
             'https://securelist.com/minidionis-one-more-apt-with-a-usage-of-cloud-drives/71443/',
             'https://blog.trendmicro.com/', 
             'https://www.mandiant.com/resources/blog/chinese-actors-exploit-fortios-flaw', 
             'https://attack.mitre.org/techniques/T1480/',
             'https://www.fireeye.com/content/dam/fireeye-www/global/en/current-threats/pdfs/rpt-china-chopper.pdf',
             'https://blog.talosintelligence.com/rhysida-ransomware/', 
             'https://www.businesswire.com/news/home/20201030005321/en/20-Billion-Cyber-Insurance-Market---Global-Forecast-to-2025---ResearchAndMarkets.com', 
             'https://developer.mozilla.org/en-US/docs/Web/CSS/:modal', 
             'https://www.facebook.com/business/tools/ads-manager',
             'https://news.sophos.com/en-us/2022/07/14/rapid-response-the-ngrok-incident-guide/', 
             'https://attack.mitre.org/techniques/T1564', 
             'https://www.mandiant.com/resources/blog/cosmicenergy-ot-malware-russian-response', 
             'https://github.com/CrowdStrike/OWASSRF', 
             'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/budworm-tool-update-telecoms-govt',
             'https://www.linkedin.com/groups/3054767/',
             'https://taosecurity.blogspot.com', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/04/ransomware-review-france', 
             'https://www.mandiant.com/resources/blog/zero-day-exploits-in-sonicwall-email-security-lead-to-compromise', 
             'https://www.linkedin.com/posts/jeromesegura_malvertising-putty-batloader-activity-7079485115545899008-7ovx?utm_source=share&utm_medium=member_desktop', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/03/new-kritec-skimmer', 
             'https://cloud.google.com/run/docs/overview/what-is-cloud-run', 
             'https://blog.talosintelligence.com/implementing-an-iso-compliant-threat-intelligence-program/', 
             'https://blog.talosintelligence.com/2018/06/vpnfilter-update.html?m=1', 
             'https://www.payplug.com/blog/dalenys-joins-forces-with-payplug-to-transform-your-payments/', 
             'https://www.mimecast.com/blog/zero-trust-may-hold-the-key-to-cybersecurity-in-apac/', 
             'https://forumspb.com/en/programme/partner-events-extra/99796/', 
             'https://www.anomali.com/blog/anomali-cyber-watch-pseudomanuscrypt-mass-spyware-campaign-targets-35k-systems-apt31-intrusion-set-campaign-description-countermeasures-and-code-state-sponsored-hackers-abuse-slack-api-to-steal',
             'https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/302', 
             'https://mandiant.widen.net/s/zvmfw5fnjs/apt43-report', 
             'https://try.malwarebytes.com/2023-state-of-malware/?utm_source=blog&utm_medium=social&utm_campaign=b2b_ws_global_som_167578574700', 
             'https://symantec-enterprise-blogs.security.com/blogs/threat-intelligence/qakbot-takedown-disruption', 
             'https://www.mandiant.com/resources/blog/unc2452-merged-into-apt29', 
             'https://www.mandiant.com/resources/blog/live-off-the-land-an-overview-of-unc1945', 
             'https://www.mimecast.com/blog/analyzing-the-integration-of-python-in-microsoft-excel/',
             'https://securelist.com/miniduke-is-back-nemesis-gemina-and-the-botgen-studio/64107/',
             'https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/cookies/getAll', 
             'https://www.withsecure.com/en/expertise/research-and-innovation/research/ducktail-an-infostealer-malware', 
             'https://www.cyfirma.com/outofband/windows-internet-key-exchange-ike-remote-code-execution-vulnerability-analysis/', 
             'https://www.rapid7.com/fundamentals/types-of-attacks/', 'https://www.hexacorn.com/blog', 
             'https://data.worldbank.org/indicator/NY.GDP.MKTP.CD', 
             'https://cybersecurity.att.com/blogs/labs-research/mac-systems-turned-into-proxy-exit-nodes-by-adload', 
             'https://www.crowdstrike.com/blog/patch-tuesday-analysis-november-2022/', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/05/malvertising-its-a-jungle-out-there',
             'https://twitter.com/juanandres_gs',
             'https://www.reuters.com/world/ukraine-annexation-votes-end-amid-russian-mobilisation-exodus-2022-09-26/', 
             'https://www.mandiant.com/advantage/threat-intelligence',
             'https://www.malwarebytes.com/blog/news/2019/09/new-social-engineering-toolkit-draws-inspiration-from-previous-web-campaigns', 
             'https://www.mandiant.com/resources/blog/attackers-deploy-new-ics-attack-framework-triton',
             'https://www.mandiant.com/resources/blog/Mandiant-approach-to-operational-technology-security',
             'https://www.cisa.gov/uscert/ncas/alerts/aa22-054a',
             'https://www.malwarebytes.com/browserguard', 
             'https://krebsonsecurity.com/', 
             'https://www.wired.com/story/viasat-internet-hack-ukraine-russia/', 
             'https://attack.mitre.org/techniques/T1140/', 
             'https://attack.mitre.org/techniques/T1059/003', 
             'https://stratixsystems.com/what-are-the-motivations-for-cyber-attacks/',
             'https://www.mandiant.com/resources/blog/updates-on-chinese-apt-compromising-pulse-secure-vpn-devices',
             'https://attack.mitre.org/techniques/T1082', 
             'https://www.mandiant.com/resources/blog/cyber-operations-russian-vulkan',
             'https://attack.mitre.org/groups/G0016/', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/05/redstinger', 
             'https://techcrunch.com/2023/05/05/hacked-verified-facebook-pages-impersonating-meta-are-buying-ads-from-meta/',
             'https://news.itmo.ru/en/university_live/achievements/news/12646/', 
             'https://attack.mitre.org/techniques/T0809/', 
             'https://krebsonsecurity.com/2023/07/seo-expert-hired-and-fired-by-ashley-madison-turned-on-company-promising-revenge/#more-64197', 
             'https://www.eset.com/int/industroyer/', 'https://www.wolfssl.com/', 'https://docs.fortinet.com/document/fortigate/7.2.3/administration-guide/109991/virtual-domains',
             'https://www.linkedin.com/legal/user-agreement?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fgroups%2F3054767%2F&trk=registration-frontend_join-form-user-agreement',
             'https://en.wikipedia.org/wiki/Ukrop#:~:text=Ukrop%20(Russian%3A%20%D0%A3%D0%BA%D1%80%D0%BE%D0%BF%3B%20literally,the%20Russian%20word%20for%20Ukrainians.',
             'https://www.malwarebytes.com/blog/news/2023/06/lockbit-ransomware-advisory-from-cisa-provides-interesting-insights', 
             'https://www.mandiant.com/resources/blog/industroyer-v2-old-malware-new-tricks', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/06/the-2023-state-of-ransomware-in-education-84-increase-in-known-attacks-over-6-month-period',
             'https://www.sentinelone.com/labs/acidrain-a-modem-wiper-rains-down-on-europe/', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/05/fake-system-update-drops-new-highly-evasive-loader', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/07/ransomware-review-july-2023', 
             'https://github.com/besimorhino/powercat', 
             'https://blog.talosintelligence.com/code-leaks-new-ransomware-actors/',
             'https://www.bleepingcomputer.com/news/security/op-sharpshooter-connected-to-north-koreas-lazarus-group/',
             'https://blog.talosintelligence.com/what-is-commercial-spyware/', 
             'https://blog.talosintelligence.com/malicious-campaigns-target-entities-in-ukraine-poland/', 
             'https://www.hexacorn.com/blog/2023/06/22/the-myth-of-knowing-your-org-know_your_org-docx/', 
             'https://github.com/samratashok/nishang', 'https://www.cyberuk.uk/',
             'https://www.malwarebytes.com/blog/news/2023/06/moveit-discloses-yet-another-vulnerability-three-times-a-charm',
             'https://blog.talosintelligence.com/2018/05/VPNFilter.html', 
             'https://www.proofpoint.com/us/blog/threat-insight/part-1-socgholish-very-real-threat-very-fake-update', 
             'https://attack.mitre.org/techniques/T1218', 
             'https://cybergeeks.tech/a-deep-dive-into-brute-ratel-c4-payloads/', 
             'https://www.malwarebytes.com/blog/threat-intelligence/2023/06/ransomware-review-june-2023',
             'https://www.mandiant.com/resources/blog/kegtap-and-singlemalt-with-a-ransomware-chaser', 
             'https://attack.mitre.org/techniques/T0855/', 
             'https://www.brighttalk.com/webcast/7451/475010?utm_source=FireEye&utm_medium=brighttalk&utm_campaign=475010',
             'https://www.malwarebytes.com/blog/news/2023/06/update-now-moveit-transfer-vulnerability-actively-exploited', 
             'https://d01a.github.io/pikabot/']


# Synchronous function to load initial data asynchronously
@st.cache_data
def load_initial_data():
    """Download and parse every URL in ``default_urls``.

    Runs the asynchronous loader to completion on a fresh event loop via
    ``asyncio.run``; the returned value is wrapped in ``st.cache_data``.

    Returns:
        The list of Documents produced by ``load_data_async``.
    """
    loading_job = load_data_async(default_urls)
    return asyncio.run(loading_job)

# Main logic to initialize embeddings and vector store
# Runs at module level, i.e. whenever this script is executed: load the
# default corpus, fail fast with ValueError if nothing could be fetched, then
# build the FAISS index. `vectorstore_local` is the index and `all_documents`
# is the list of split chunks returned by initialize_embeddings().
initial_data = load_initial_data()
if not initial_data:
    raise ValueError("No documents were loaded from the provided URLs.")
vectorstore_local, all_documents = initialize_embeddings(initial_data)

# Define zero-shot and few-shot prompt templates for each question type
def get_zero_shot_prompt(question_type):
    """Return the zero-shot prompt template for ``question_type``.

    The template contains the ``{context}`` and ``{question}`` placeholders
    expected by ``PromptTemplate``. All four supported types currently share
    the same wording; the per-type mapping is kept so they can diverge later.

    Args:
        question_type: One of "factual", "contrastive", "opinion" or
            "inferential". ``None`` or any unrecognised value falls back to
            the "factual" template (the same default used by
            ``identify_question_type``) instead of raising ``KeyError``.

    Returns:
        The prompt template string.
    """
    zero_shot_template = """
        You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context.
        Context: {context}
        Question: {question}
        Answer:
        """
    supported_types = ("factual", "contrastive", "opinion", "inferential")
    templates = dict.fromkeys(supported_types, zero_shot_template)
    # Callers may pass question_type=None (it is an optional argument of
    # generate_answer_with_llm), so a plain templates[...] lookup is unsafe.
    return templates.get(question_type, templates["factual"])

def get_few_shot_prompt(question_type):
    """Return the few-shot prompt template for ``question_type``.

    Each template contains the ``{context}`` and ``{question}`` placeholders
    expected by ``PromptTemplate`` plus one worked example that is specific to
    the question type.

    Args:
        question_type: One of "factual", "contrastive", "opinion" or
            "inferential". ``None`` or any unrecognised value falls back to
            the "factual" template (the same default used by
            ``identify_question_type``) instead of raising ``KeyError``.

    Returns:
        The prompt template string.
    """
    # NOTE(review): in every template the worked example comes *after* the
    # real context/question and is followed by a second "Answer:" line.
    # Few-shot examples usually precede the real question -- confirm the
    # ordering is intentional before changing these runtime strings.
    templates = {
        "factual": """
        You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context.
        Context: {context}
        Question: {question}
        Example:
        Context: In recent years, cyber-attacks have increased significantly. 
        Question: What are the main reasons for the rise in cyber-attacks?
        Answer: The main reasons include increased connectivity, outdated security infrastructure, and sophisticated attack methods.
        Answer:
        """,
        "contrastive": """
        You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context.
        Context: {context}
        Question: {question}
        Example:
        Context: Both phishing and malware attacks are common cyber threats.
        Question: How do phishing attacks differ from malware attacks?
        Answer: Phishing attacks involve tricking individuals into revealing sensitive information, while malware attacks involve malicious software designed to damage or gain unauthorized access to systems.
        Answer:
        """,
        "opinion": """
        You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context.
        Context:{context}
        Question: {question}
        Example:
        Context: Many experts believe that AI can improve cybersecurity defenses.
        Question: Do you think AI can effectively combat cyber-attacks?
        Answer: Yes, AI can help identify patterns and anomalies that indicate potential cyber-attacks, making it an effective tool for enhancing cybersecurity defenses.
        Answer:
        """,
        "inferential": """
        You are a Cybersecurity expert focusing on the latest trends and investigative techniques in cyber-attacks. Provide a concise answer based on the following context.
        Context: {context}
        Question: {question}
        Example:
        Context: An organization faced a significant data breach last year.
        Question: What measures can the organization take to prevent future breaches?
        Answer: The organization can implement stronger access controls, regular security audits, employee training, and advanced threat detection systems.
        Answer:
        """
    }
    # Callers may pass question_type=None (it is an optional argument of
    # generate_answer_with_llm), so a plain templates[...] lookup is unsafe.
    return templates.get(question_type, templates["factual"])

# Ordered (question type, pattern) rules; the first pattern that matches wins.
# Interrogatives must match as whole words, so "show" is not read as "how" and
# "whole" is not read as "who". Topic keywords only need to start a word, so
# "differences" and "reasons" still match while "because" no longer counts as
# "cause".
_QUESTION_TYPE_RULES = (
    ("factual", re.compile(r"\b(?:what|when|who|how\s+many|how\s+much)\b")),
    ("contrastive", re.compile(r"\b(?:compare|difference|similar)")),
    ("opinion", re.compile(r"\b(?:opinion|feel|think|believe)")),
    ("inferential", re.compile(r"\b(?:why|how)\b|\b(?:cause|reason)")),
)


# Function to identify question type
def identify_question_type(question):
    """Classify ``question`` by keyword so a matching prompt can be chosen.

    The categories are checked in priority order: factual, contrastive,
    opinion, inferential. Matching is case-insensitive and anchored at word
    boundaries, so keywords embedded inside unrelated words are ignored.

    Args:
        question: The user's question text.

    Returns:
        One of "factual", "contrastive", "opinion" or "inferential";
        "factual" when no keyword matches.
    """
    lowered = question.lower()
    for question_type, pattern in _QUESTION_TYPE_RULES:
        if pattern.search(lowered):
            return question_type
    return "factual"  # Default to factual if no keywords match

# Function to filter documents based on metadata
def filter_documents_by_metadata(query, documents):
    """Return the documents whose string metadata mentions any query term.

    The query is lower-cased and split on whitespace. A document is kept when
    at least one term occurs as a substring of at least one string-valued
    metadata entry (compared case-insensitively). Non-string metadata values
    are ignored, and an empty query selects nothing.

    Args:
        query: Free-text query.
        documents: Iterable of objects exposing a ``metadata`` mapping.

    Returns:
        A new list with the matching documents, in their original order.
    """
    terms = query.lower().split()
    selected = []
    for doc in documents:
        meta = doc.metadata
        mentioned = any(
            term in str(value).lower()
            for term in terms
            for value in meta.values()
            if isinstance(value, str)
        )
        if mentioned:
            selected.append(doc)
    return selected

# Retrieve and generate zero-shot answers
def retrieve_and_generate_zero_shot_answers(query, vectorstore, question_type):
    """Answer *query* with a "stuff" RetrievalQA chain and the zero-shot prompt.

    Args:
        query: The question passed to the chain under the "query" key.
        vectorstore: Store whose similarity retriever (k=3) supplies context.
        question_type: Key handed to ``get_zero_shot_prompt`` to pick the template.

    Returns:
        A ``(answer, sources, contexts)`` tuple. ``sources`` are fresh
        ``Document`` copies of the retrieved documents and ``contexts`` their
        page contents. When the chain's result is empty or whitespace-only the
        tuple is ``("Sorry, I don't know.", [], [])``.
    """
    template_text = get_zero_shot_prompt(question_type)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    prompt = PromptTemplate(template=template_text, input_variables=["context", "question"])
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    outcome = chain.invoke({"query": query})
    answer = outcome['result']
    if not answer.strip():
        # Nothing usable came back from the model.
        return "Sorry, I don't know.", [], []

    sources = []
    contexts = []
    for retrieved in outcome['source_documents']:
        copied = Document(page_content=retrieved.page_content, metadata=retrieved.metadata)
        sources.append(copied)
        contexts.append(copied.page_content)
    return answer, sources, contexts

# Retrieve and generate few-shot answers
def retrieve_and_generate_few_shot_answers(query, vectorstore, question_type):
    """Answer *query* with a "stuff" RetrievalQA chain and the few-shot prompt.

    Args:
        query: The question passed to the chain under the "query" key.
        vectorstore: Store whose similarity retriever (k=3) supplies context.
        question_type: Key handed to ``get_few_shot_prompt`` to pick the template.

    Returns:
        A ``(answer, sources, contexts)`` tuple. ``sources`` are fresh
        ``Document`` copies of the retrieved documents and ``contexts`` their
        page contents. When the chain's result is empty or whitespace-only the
        tuple is ``("Sorry, I don't know.", [], [])``.
    """
    few_shot_template = get_few_shot_prompt(question_type)
    top_k_retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )
    chain_kwargs = {
        "prompt": PromptTemplate(
            template=few_shot_template,
            input_variables=["context", "question"],
        )
    }
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=top_k_retriever,
        return_source_documents=True,
        chain_type_kwargs=chain_kwargs,
    )
    reply = qa.invoke({"query": query})

    # An empty or whitespace-only result is reported as "don't know".
    if reply['result'].strip():
        copies = [
            Document(page_content=found.page_content, metadata=found.metadata)
            for found in reply['source_documents']
        ]
        texts = [copy.page_content for copy in copies]
        return reply['result'], copies, texts
    return "Sorry, I don't know.", [], []

# Function to generate answer using selected LLM
def generate_answer_with_llm(query, selected_model, question_type=None):
    """Answer *query* directly with an LLM, once per prompt style.

    No retrieval happens here: both the zero-shot and the few-shot template
    are filled with a fixed placeholder context and sent to the model.

    Args:
        query: The question text.
        selected_model: Repo id handed to ``load_model``. When falsy (the UI
            passes ``None`` if no model is picked) the module-level default
            ``llm`` is used instead of calling ``load_model(None)``.
        question_type: Template key ("factual", "contrastive", "opinion",
            "inferential"). When ``None`` it is derived from the question via
            ``identify_question_type``; previously the ``None`` default raised
            ``KeyError`` in the template lookup.

    Returns:
        A ``(zero_shot_text, few_shot_text)`` tuple holding the text of the
        first generation for each prompt.
    """
    if question_type is None:
        # The prompt getters index a dict by question type; None is not a key.
        question_type = identify_question_type(query)

    # Fall back to the default model rather than loading a repo id of None.
    model = load_model(selected_model) if selected_model else llm

    prompt_template_zero_shot = get_zero_shot_prompt(question_type)
    prompt_template_few_shot = get_few_shot_prompt(question_type)
    context = "This is a placeholder context. Replace with actual context if available."  # Replace with actual context if available
    formatted_query_zero_shot = prompt_template_zero_shot.format(context=context, question=query)
    formatted_query_few_shot = prompt_template_few_shot.format(context=context, question=query)

    response_zero_shot = model.generate(prompts=[formatted_query_zero_shot])
    response_few_shot = model.generate(prompts=[formatted_query_few_shot])

    # generations is indexed [prompt][candidate]; one prompt was sent per call.
    return response_zero_shot.generations[0][0].text, response_few_shot.generations[0][0].text

# Handle user query or file paths
def _build_vectorstore_for_mode(query_mode, input_text):
    """Return the vector store that backs retrieval for *query_mode*."""
    if query_mode == "external data":
        # Every non-blank line of the input is treated as a URL / file path.
        file_paths = [line.strip() for line in input_text.split('\n') if line.strip()]
        external_data = asyncio.run(load_data_async(file_paths))
        if not external_data:
            raise ValueError("No valid documents found from the provided file paths.")
        vectorstore, _ = initialize_embeddings(external_data)
    elif query_mode == "specific document":
        # The user's text itself becomes the single document to search.
        documents = [Document(page_content=input_text, metadata={"source": "user"})]
        vectorstore, _ = initialize_embeddings(documents)
    elif query_mode == "metadata":
        filtered_documents = filter_documents_by_metadata(input_text, all_documents)
        vectorstore, _ = initialize_embeddings(filtered_documents)
    else:  # entire dataset
        vectorstore = vectorstore_local  # Use precomputed vectorstore for the entire dataset
    return vectorstore


# Handle user query or file paths
def handle_user_query_or_urls(query_mode, input_text, question_type, selected_model=None):
    """Produce zero-shot and few-shot answers for one query.

    With ``selected_model`` set, the answers come straight from
    ``generate_answer_with_llm`` and no sources or contexts are returned. In
    that case no vector store is built, since it would go unused.
    Otherwise a vector store is chosen according to ``query_mode`` and both
    retrieval functions are run against it.

    Args:
        query_mode: "external data", "specific document", "metadata", or
            anything else for the entire precomputed dataset.
        input_text: The query; in "external data" mode also the source of the
            newline-separated paths, in "specific document" mode also the
            document content.
        question_type: Prompt template key passed to the answer generators.
        selected_model: Optional model repo id; truthy selects the LLM-only path.

    Returns:
        ``(zero_shot_answer, zero_shot_sources, zero_shot_contexts,
        few_shot_answer, few_shot_sources, few_shot_contexts)``.

    Raises:
        ValueError: In "external data" mode (retrieval path) when no documents
            could be loaded from the given paths.
    """
    if selected_model:
        zero_shot_answer, few_shot_answer = generate_answer_with_llm(input_text, selected_model, question_type=question_type)
        # Independent lists so a caller mutating one does not affect the others.
        zero_shot_sources, few_shot_sources = [], []
        zero_shot_contexts, few_shot_contexts = [], []
    else:
        vectorstore = _build_vectorstore_for_mode(query_mode, input_text)
        zero_shot_answer, zero_shot_sources, zero_shot_contexts = retrieve_and_generate_zero_shot_answers(input_text, vectorstore, question_type)
        few_shot_answer, few_shot_sources, few_shot_contexts = retrieve_and_generate_few_shot_answers(input_text, vectorstore, question_type)

    return zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts

def _summarize_sources(source_docs):
    """Join selected metadata fields of each source document with '; '."""
    return "; ".join(
        f"source: {doc.metadata.get('source')}, page: {doc.metadata.get('page')}, title: {doc.metadata.get('title')}, author: {doc.metadata.get('author')}, date_published: {doc.metadata.get('date_published')}, chunk: {doc.metadata.get('chunk')}"
        for doc in source_docs
    )


def process_csv_file(file_path, vectorstore, save_path, selected_model=None, query_mode="entire dataset"):
    """Answer every question in a CSV file and write the results back out.

    The file must contain a 'questions' column; in "specific document" mode it
    must also contain a 'specific document' column whose content is prepended
    to each question. Results are written to ``save_path`` when given,
    otherwise over ``file_path``, and a download link is rendered in the app.

    Args:
        file_path: Path of the CSV file to read (its encoding is auto-detected).
        vectorstore: Accepted for interface compatibility; not used here.
        save_path: Destination path for the results; falsy overwrites ``file_path``.
        selected_model: Optional model repo id forwarded to ``handle_user_query_or_urls``.
        query_mode: Query mode forwarded to ``handle_user_query_or_urls``.

    Returns:
        None. Read/save failures and missing document content are reported
        through ``st.error``.
    """
    try:
        # Detect file encoding
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
        encoding = result['encoding']

        df = pd.read_csv(file_path, encoding=encoding)
    except Exception as e:
        st.error(f"An error occurred while reading the CSV file: {e}")
        return

    question_types = []
    zero_shot_answers = []
    few_shot_answers = []
    zero_shot_sources_list = []
    few_shot_sources_list = []
    zero_shot_contexts_list = []
    few_shot_contexts_list = []

    for _, row in df.iterrows():
        question = row['questions']
        question_type = identify_question_type(question)
        question_types.append(question_type)
        if query_mode == "specific document":
            specific_document = row['specific document']
            # Blank CSV cells are loaded as NaN, which is truthy, so a plain
            # truthiness test would let them through to the concatenation.
            if pd.isna(specific_document) or not specific_document:
                st.error(f"Content for the specific document is missing in the CSV file for question: {question}")
                return  # Exit function if content is missing for any specific document
            query_input = specific_document + " " + question
        else:
            query_input = question

        zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts = handle_user_query_or_urls(query_mode, query_input, question_type, selected_model)

        zero_shot_answers.append(zero_shot_answer)
        few_shot_answers.append(few_shot_answer)
        zero_shot_sources_list.append(_summarize_sources(zero_shot_sources))
        few_shot_sources_list.append(_summarize_sources(few_shot_sources))
        zero_shot_contexts_list.append(" ".join(zero_shot_contexts))
        few_shot_contexts_list.append(" ".join(few_shot_contexts))

    result_df = pd.DataFrame({
        'questions': df['questions'],
        'question_type': question_types,
        'zero_shot_answers': zero_shot_answers,
        'zero_shot_sources': zero_shot_sources_list,
        'zero_shot_contexts': zero_shot_contexts_list,
        'few_shot_answers': few_shot_answers,
        'few_shot_sources': few_shot_sources_list,
        'few_shot_contexts': few_shot_contexts_list
    })

    try:
        if save_path:
            result_df.to_csv(save_path, index=False, encoding='utf-8')
            st.success(f"CSV file processed and saved to {save_path} successfully.")
        else:
            # No destination given: the input file is replaced by the results.
            result_df.to_csv(file_path, index=False, encoding='utf-8')
            st.success("CSV file processed and updated successfully.")

        # Offer the same results as an in-page download via a base64 data URI.
        csv = result_df.to_csv(index=False, encoding='utf-8')
        b64 = base64.b64encode(csv.encode()).decode()  # some strings <-> bytes conversions necessary here
        href = f'<a href="data:file/csv;base64,{b64}" download="processed_results.csv">Download processed CSV file</a>'
        st.markdown(href, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"An error occurred while saving the CSV file: {e}")

def format_source_document(doc):
    """Build an HTML card describing a source document's metadata.

    The metadata can originate from fetched web pages and PDFs and the result
    is rendered with ``unsafe_allow_html=True``, so every value is
    HTML-escaped before being placed in the markup.

    Args:
        doc: Object exposing a ``metadata`` mapping.

    Returns:
        An HTML string showing source (as a link), page, title, author, date
        published, venue and chunk. Missing fields are shown as "None".
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def field(key, default=None):
        # quote=True also escapes quotes so the value is safe inside href="...".
        return escape(str(doc.metadata.get(key, default)), quote=True)

    source = field('source')
    source_html = f"""
    <div style='padding: 10px; border-radius: 5px; margin-bottom: 10px; background-color: #e8f4ff;'>
        <p><strong>Source:</strong> <a href="{source}" target="_blank">{source}</a></p>
        <p><strong>Page:</strong> {field('page', 'None')}</p>
        <p><strong>Title:</strong> {field('title', 'None')}</p>
        <p><strong>Author:</strong> {field('author', 'None')}</p>
        <p><strong>Date Published:</strong> {field('date_published', 'None')}</p>
        <p><strong>Venue:</strong> {field('venue', 'None')}</p>
        <p><strong>Chunk:</strong> {field('chunk', 'None')}</p>
    </div>
    """
    return source_html

# Create Streamlit interface
def _render_boxed_text(text, background_color):
    """Show *text* in a rounded, coloured box, HTML-escaped before rendering."""
    from html import escape

    # Answers and retrieved contexts may contain markup characters; escape them
    # because the surrounding div is rendered with unsafe_allow_html=True.
    safe_text = escape(str(text), quote=False)
    st.markdown(
        f"<div style='background-color: {background_color}; padding: 10px; border-radius: 5px;'>{safe_text}</div>",
        unsafe_allow_html=True,
    )


def _render_rag_result(label, answer, contexts, sources):
    """Show the answer, joined context and source cards for one prompt style."""
    st.write(f"**{label} Answer:**")
    _render_boxed_text(answer, "#f0f0f0")
    st.write(f"**{label} Context:**")
    _render_boxed_text(' '.join(contexts), "#e8f4ff")
    st.write(f"**{label} Source Documents:**")
    for doc in sources:
        st.markdown(format_source_document(doc), unsafe_allow_html=True)


def _run_bulk_csv(uploaded_file, save_path, **process_kwargs):
    """Write the uploaded CSV to disk and run ``process_csv_file`` on it."""
    if uploaded_file is None:
        # Without this guard the click fails with an opaque attribute error.
        st.error("Please upload a CSV file before processing.")
        return
    try:
        file_path = uploaded_file.name
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        process_csv_file(file_path, vectorstore_local, save_path, **process_kwargs)
    except Exception as e:
        st.error(f"An error occurred while processing the CSV file: {e}")


# Create Streamlit interface
def main():
    """Render the Streamlit UI and dispatch to the RAG or LLM-only pipeline.

    The user picks an approach (RAG or plain LLM) and a query type (single
    question or a CSV of questions). Single queries show zero-shot and
    few-shot answers in the page; bulk queries are handed to
    ``process_csv_file``.
    """
    question_type_options = ["factual", "contrastive", "opinion", "inferential"]
    query_mode_options = ["entire dataset", "specific document", "metadata", "external data"]

    st.title("Cybersecurity QA System")
    st.write("Ask any questions related to cybersecurity, including trends, attack techniques, and investigative methods.")

    # Step 1: Choose between RAG and LLM
    approach = st.selectbox("Select approach:", ["Retrieval Augmentation Generation (RAG)", "Large Language Model (LLM)"])

    # Step 2: Choose single query or bulk queries
    query_type = st.selectbox("Select query type:", ["Single query", "Bulk queries (CSV file)"])

    if approach == "Retrieval Augmentation Generation (RAG)":
        if query_type == "Single query":
            query_mode = st.selectbox("Select query mode:", query_mode_options)
            question_type = st.selectbox("Select question type:", question_type_options)

            if query_mode == "entire dataset":
                input_text = st.text_area("Enter your question to ask it from entire dataset")

            elif query_mode == "metadata":
                st.write("Enter query with metadata filters (e.g., 'title: Cybersecurity, author: John Doe, date: 2023'):")
                input_text = st.text_area("Enter your question with specific metadata:")

            elif query_mode == "specific document":
                st.write("Enter content from a specific document along with question")
                input_text = st.text_area("Enter your question from entered content:")

            elif query_mode == "external data":
                st.write("Enter external URLs/file paths along with question")
                input_text = st.text_area("Enter your question with URLs/file paths:")

            if st.button("Get Answer"):
                try:
                    zero_shot_answer, zero_shot_sources, zero_shot_contexts, few_shot_answer, few_shot_sources, few_shot_contexts = handle_user_query_or_urls(query_mode, input_text, question_type)
                    st.write(f"**Question Type:** {question_type.capitalize()}")
                    _render_rag_result("Zero-Shot", zero_shot_answer, zero_shot_contexts, zero_shot_sources)
                    _render_rag_result("Few-Shot", few_shot_answer, few_shot_contexts, few_shot_sources)
                except Exception as e:
                    st.error(f"An error occurred: {e}")

        else:  # Bulk queries
            query_mode_csv = st.selectbox("Select query mode for CSV processing:", query_mode_options)
            uploaded_file = st.file_uploader("Choose a CSV file with questions", type="csv")
            # Created outside the button branch: a widget that only exists
            # during the click run always reports its empty default, and
            # typing into it triggers a rerun in which the button is no
            # longer pressed.
            save_path = st.text_input("Enter the path to save the processed CSV results file:")
            if st.button("Process CSV"):
                _run_bulk_csv(uploaded_file, save_path, query_mode=query_mode_csv)

    else:  # Large Language Model (LLM)
        selected_model = st.selectbox("Select Open-Source LLM:", ["None"] + list(open_source_models.keys()))
        # "None" is not a key of open_source_models, so it maps to None here;
        # both the single and the bulk branch share this lookup.
        selected_model_repo = open_source_models.get(selected_model)
        if query_type == "Single query":
            question_type = st.selectbox("Select question type:", question_type_options)
            input_text = st.text_area("Enter your question:")

            if st.button("Get Answer"):
                try:
                    zero_shot_answer, few_shot_answer = generate_answer_with_llm(input_text, selected_model_repo, question_type=question_type)
                    st.write(f"**Question Type:** {question_type.capitalize()}")
                    st.write("**Zero-Shot Answer:**")
                    _render_boxed_text(zero_shot_answer, "#f0f0f0")
                    st.write("**Few-Shot Answer:**")
                    _render_boxed_text(few_shot_answer, "#f0f0f0")
                except Exception as e:
                    st.error(f"An error occurred: {e}")

        else:  # Bulk queries
            uploaded_file = st.file_uploader("Choose a CSV file with questions", type="csv")
            # See the RAG bulk branch: the input must live outside the button block.
            save_path = st.text_input("Enter the path to save the processed CSV results file:")
            if st.button("Process CSV"):
                # With no model selected this passes None, which
                # handle_user_query_or_urls treats as "use retrieval".
                _run_bulk_csv(uploaded_file, save_path, selected_model=selected_model_repo)

# Build the UI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()