Basti8499 committed on
Commit
579ab0b
1 Parent(s): 2701e36

Adds all necessary files

Files changed (44)
  1. .env +2 -0
  2. .gitattributes +1 -0
  3. .gitignore +12 -0
  4. Dockerfile +52 -0
  5. README_PROJECT.md +73 -0
  6. app/.chainlit/config.toml +97 -0
  7. app/.chainlit/translations/en-US.json +155 -0
  8. app/app.py +99 -0
  9. app/chainlit.md +7 -0
  10. app/helper.py +217 -0
  11. app/prompts.py +39 -0
  12. app/public/logo_dark.png +0 -0
  13. app/public/logo_light.png +0 -0
  14. chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/data_level0.bin +3 -0
  15. chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/header.bin +3 -0
  16. chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/length.bin +3 -0
  17. chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/link_lists.bin +3 -0
  18. chroma/chroma.sqlite3 +3 -0
  19. index_preparation/build_index.ipynb +250 -0
  20. index_preparation/create_QA_set_documents.ipynb +84 -0
  21. index_preparation/create_pdf_documents.ipynb +139 -0
  22. index_preparation/create_template_documents.ipynb +140 -0
  23. index_preparation/create_web_documents.ipynb +176 -0
  24. index_preparation/preprocess_data.ipynb +229 -0
  25. init_embedding_model.py +4 -0
  26. input_data/PDF/documents/all_documents +0 -0
  27. input_data/PDF/documents/new_documents +0 -0
  28. input_data/QA_dataset/all_documents +0 -0
  29. input_data/QA_dataset/golden_qa_set.json +0 -0
  30. input_data/Templates/documents/all_documents +5 -0
  31. input_data/Templates/documents/new_documents +0 -0
  32. input_data/Templates/template_files/processed/Backup policy.docx +0 -0
  33. input_data/Templates/template_files/processed/Change management policy.docx +0 -0
  34. input_data/Templates/template_files/processed/Encryption policy.docx +0 -0
  35. input_data/Templates/template_files/processed/IC-ISO-27001-Controls-Checklist.xlsx +0 -0
  36. input_data/Templates/template_files/processed/IC-ISO-27001-Risk-Assessment.xlsx +0 -0
  37. input_data/Web/URLs/cleaned_urls.txt +62 -0
  38. input_data/Web/URLs/uncleaned_urls.txt +0 -0
  39. input_data/Web/documents/all_documents +0 -0
  40. input_data/Web/documents/new_documents +0 -0
  41. requirements.txt +0 -0
  42. requirements_Docker.txt +0 -0
  43. setup.sh +16 -0
  44. sparse_index/sparse_1536_264 +0 -0
.env ADDED
@@ -0,0 +1,2 @@
1
+ TESSERACT_PATH=C:\Program Files\Tesseract-OCR\tesseract.exe
2
+ CHROMA_PATH=./../chroma
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
1
+ .DS_Store
2
+ .vscode
3
+ venv/
4
+ evaluationResults/*
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # Jupyter Notebook
12
+ .ipynb_checkpoints
Dockerfile ADDED
@@ -0,0 +1,52 @@
1
+ # The Python base image, used to build the virtual environment
2
+ FROM python:3.11-slim-buster
3
+
4
+ # Install system dependencies
5
+ RUN apt-get update && apt-get install -y --no-install-recommends git && \
6
+ rm -rf /var/lib/apt/lists/*
7
+
8
+ # Create a user to run the app
9
+ RUN useradd -m -u 1000 user
10
+
11
+ # Switch to user and set environment variables
12
+ USER user
13
+ ENV HOME=/home/user \
14
+ PATH="/home/user/.local/bin:$PATH" \
15
+ VIRTUAL_ENV=/home/user/venv \
16
+ LISTEN_PORT=7860 \
17
+ HOST=0.0.0.0
18
+
19
+ # Set the working directory in the container
20
+ WORKDIR $HOME
21
+
22
+ # Create a virtual environment to isolate our package dependencies locally
23
+ RUN python -m venv $VIRTUAL_ENV
24
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
25
+
26
+ # Copy necessary files to container directory
27
+ COPY --chown=user ./app ./app/app
28
+ COPY --chown=user ./chroma ./chroma
29
+ COPY --chown=user ./embedding_model ./embedding_model
30
+ COPY --chown=user ./sparse_index ./sparse_index
31
+ COPY --chown=user ./.env ./app/.env
32
+ COPY --chown=user ./app/chainlit.md ./app/chainlit.md
33
+ COPY --chown=user ./app/.chainlit ./app/.chainlit
34
+ COPY --chown=user ./app/public ./app/public
35
+ COPY --chown=user ./input_data/Templates/template_files ./input_data/Templates/template_files
36
+ COPY --chown=user ./requirements_Docker.txt ./app/requirements_Docker.txt
37
+ COPY --chown=user ./init_embedding_model.py ./init_embedding_model.py
38
+
39
+ WORKDIR $HOME/app
40
+
41
+ # Install Python dependencies
42
+ RUN pip install --upgrade pip && \
43
+ pip install -r ./requirements_Docker.txt
44
+
45
+ # Run the script to initialize and cache the fine-tuned embedding model
46
+ RUN python ../init_embedding_model.py
47
+
48
+ # Expose the port the app runs on
49
+ EXPOSE $LISTEN_PORT
50
+
51
+ # Run the chainlit app
52
+ CMD ["chainlit", "run", "app/app.py", "--host", "0.0.0.0", "--port", "7860"]
README_PROJECT.md ADDED
@@ -0,0 +1,73 @@
1
+ # General Information
2
+
3
+ ## 1. Project Initialization
4
+ - After pulling the project, do the following to initialize it:
5
+ - Make sure that a Python version >= 3.11 is installed
6
+ - Run the following command to execute the initialization script: "source setup.sh"
7
+ - If you want to insert new PDF documents and update the document base, you first need to install Tesseract, the OCR engine used in this code:
8
+ - Download Tesseract Installer for Windows: https://github.com/UB-Mannheim/tesseract/wiki
9
+ - For other operating systems, see: https://tesseract-ocr.github.io/tessdoc/Installation.html
10
+ - Create a .env file at the root directory level with the following keys (see the example below):
11
+ - TESSERACT_PATH={path}
12
+ - Set the path to the Tesseract installation, e.g. "C:\Program Files\Tesseract-OCR\tesseract.exe"
13
+ - CHROMA_PATH=./../chroma
14
+
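For reference, the .env committed in this repository (also shown at the top of this diff) contains exactly these two keys; adjust TESSERACT_PATH to your own installation path:

```
TESSERACT_PATH=C:\Program Files\Tesseract-OCR\tesseract.exe
CHROMA_PATH=./../chroma
```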
15
+ ## 2. Using the Chatbot locally
16
+ - In the app/helper.py file, comment out lines 8 to 10 if you are not on a Linux machine
17
+ - To start the chatbot locally, run "cd app" and "chainlit run app.py -w"
18
+ - To use the chatbot, you need two API keys, which you can create via the following links:
19
+ - [OpenAI](https://openai.com/blog/openai-api)
20
+ - [Cohere](https://dashboard.cohere.com/api-keys)
21
+
22
+ ## 3. Using Docker
23
+ - Go to app/helper.py and make sure the three pysqlite3 import lines (lines 8 to 10) are uncommented. This is necessary to use Chroma within the container
24
+ - Build the Docker image: "docker build -t iso_27001_chatbot ."
25
+ - Run the Docker container: "docker run -p 7860:7860 iso_27001_chatbot"
26
+ - Access at: http://localhost:7860
27
+ - Note that the Dockerfile uses requirements_Docker.txt, which does not include CUDA support, as the free tier of HF Spaces does not come with GPU availability. If you want to include CUDA support, you need to integrate a CUDA-enabled torch installation command into the Dockerfile.
28
+
29
+ # Project Structure
30
+
31
+ ## app
32
+ Contains the chatbot web application, created with Chainlit. It also includes modules for the prompts and helper functions.
33
+
34
+ ## chroma
35
+ The chroma folder contains all the indices that were created with the notebooks inside the index_preparation folder.
36
+
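For reference, a minimal sketch of how this persisted collection is opened at runtime, mirroring `get_index_vector_db` in `app/helper.py`. The `./chroma` path and the query string are illustrative assumptions (the app itself reads the path from `CHROMA_PATH`):

```python
import chromadb
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Open the persisted Chroma client (the app reads this path from CHROMA_PATH)
client = chromadb.PersistentClient(path="./chroma")

# Fine-tuned embedding model used to build the index (see app/helper.py)
embedding_model = HuggingFaceEmbeddings(model_name="Basti8499/bge-large-en-v1.5-ISO-27001")

vectordb = Chroma(
    client=client,
    collection_name="ISO_27001_Collection",
    embedding_function=embedding_model,
)

# Example dense retrieval against the index
docs = vectordb.similarity_search("What is ISO 27001?", k=4)
```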
37
+ ## embedding_model
38
+ This folder contains the embedding model fine-tuned on an ISO 27001 text corpus. It is based on [bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) and can be accessed and downloaded on [HuggingFace](https://huggingface.co/Basti8499/bge-large-en-v1.5-ISO-27001).
39
+
40
+ ## index_preparation
41
+ Stores all Jupyter notebooks needed to create the vector database that stores the ISO 27001 documents. Before creating the index with build_index.ipynb, the documents for PDFs, Web and Templates need to be created with the other notebooks.
42
+
43
+ ## input_data
44
+
45
+ ### PDF Files (/PDF)
46
+ - Directory structure:
47
+ - PDF/files: After manually cleaning the PDFs (removing pages), move them to this folder.
48
+ - PDF/documents
49
+ - /all_documents: JSON file for all processed PDF documents
50
+ - /new_documents: JSON file for newly processed PDF documents
51
+ - PDF/PDF_images: Empty folder in which images are stored during OCR and deleted afterwards.
52
+
53
+ ### Web Files (/Web)
54
+ - Directory structure:
55
+ - Web/documents:
56
+ - /all_documents: JSON file for all processed web documents
57
+ - /new_documents: JSON file for newly processed web documents
58
+ - Web/URLs:
59
+ - /cleaned_urls.txt: .txt file for URLs that have already been processed and for which documents exist
60
+ - /uncleaned_urls.txt: .txt file for URLs that have not been processed
61
+
62
+ ### Template Files (/Templates)
63
+ - Directory structure:
64
+ - Templates/documents:
65
+ - /all_documents: JSON file for all processed template documents
66
+ - /new_documents: JSON file for all newly processed template documents
67
+ - Templates/template_files:
68
+ - /new: Not yet processed template files
69
+ - /processed: Already processed template files
70
+ - For templates it is important that the actual template files are stored under Templates/template_files/new for processing, as the paths are used in the chatbot.
71
+
72
+ ## sparse_index
73
+ Stores the chunked documents that were created in the build_index.ipynb in a .txt file for later sparse retrieval.
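For reference, a minimal sketch of how this file is consumed at runtime, mirroring `_load_documents` and `retrieve_contexts` in `app/helper.py`: it is a JSON-lines dump of LangChain `Document` objects that is loaded into a BM25 retriever for the sparse half of the hybrid retrieval. The path and the value of `k` are illustrative assumptions:

```python
import json

from langchain.docstore.document import Document
from langchain_community.retrievers import BM25Retriever

# Load the chunked documents from the JSON-lines sparse index file
documents = []
with open("./sparse_index/sparse_1536_264", "r") as jsonl_file:
    for line in jsonl_file:
        documents.append(Document(**json.loads(line)))

# Build the BM25 retriever used for sparse retrieval before Cohere re-ranking
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 25  # number of sparse candidates handed to the re-ranker
sparse_hits = bm25_retriever.get_relevant_documents("What is ISO 27001?")
```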
app/.chainlit/config.toml ADDED
@@ -0,0 +1,97 @@
1
+ [project]
2
+ # Whether to enable telemetry (default: true). No personal data is collected.
3
+ enable_telemetry = true
4
+
5
+
6
+ # List of environment variables to be provided by each user to use the app.
7
+ user_env = ["OPENAI_API_KEY","COHERE_API_KEY"]
8
+
9
+ # Duration (in seconds) during which the session is saved when the connection is lost
10
+ session_timeout = 3600
11
+
12
+ # Enable third parties caching (e.g LangChain cache)
13
+ cache = false
14
+
15
+ # Authorized origins
16
+ allow_origins = ["*"]
17
+
18
+ # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
19
+ # follow_symlink = false
20
+
21
+ [features]
22
+ # Show the prompt playground
23
+ prompt_playground = false
24
+
25
+ # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
26
+ unsafe_allow_html = false
27
+
28
+ # Process and display mathematical expressions. This can clash with "$" characters in messages.
29
+ latex = false
30
+
31
+ # Authorize users to upload files with messages
32
+ multi_modal = false
33
+
34
+ # Allows user to use speech to text
35
+ [features.speech_to_text]
36
+ enabled = false
37
+ # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
38
+ # language = "en-US"
39
+
40
+ [UI]
41
+ # Name of the app and chatbot.
42
+ name = "ISO 27001 Chatbot"
43
+
44
+ # Show the readme while the thread is empty.
45
+ show_readme_as_default = true
46
+
47
+ # Description of the app and chatbot. This is used for HTML tags.
48
+ # description = ""
49
+
50
+ # Large size content are by default collapsed for a cleaner ui
51
+ default_collapse_content = true
52
+
53
+ # The default value for the expand messages settings.
54
+ default_expand_messages = false
55
+
56
+ # Hide the chain of thought details from the user in the UI.
57
+ hide_cot = true
58
+
59
+ # Link to your github repo. This will add a github button in the UI's header.
60
+ # github = ".."
61
+
62
+ # Specify a CSS file that can be used to customize the user interface.
63
+ # The CSS file can be served from the public directory or via an external link.
64
+ # custom_css = "/public/test.css"
65
+
66
+ # Specify a Javascript file that can be used to customize the user interface.
67
+ # The Javascript file can be served from the public directory.
68
+ # custom_js = "/public/test.js"
69
+
70
+ # Specify a custom font url.
71
+ # custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"
72
+
73
+ # Override default MUI light theme. (Check theme.ts)
74
+ [UI.theme]
75
+ #font_family = "Inter, sans-serif"
76
+ [UI.theme.light]
77
+ #background = "#FAFAFA"
78
+ #paper = "#FFFFFF"
79
+
80
+ [UI.theme.light.primary]
81
+ #main = "#F80061"
82
+ #dark = "#980039"
83
+ #light = "#FFE7EB"
84
+
85
+ # Override default MUI dark theme. (Check theme.ts)
86
+ [UI.theme.dark]
87
+ #background = "#FAFAFA"
88
+ #paper = "#FFFFFF"
89
+
90
+ [UI.theme.dark.primary]
91
+ #main = "#F80061"
92
+ #dark = "#980039"
93
+ #light = "#FFE7EB"
94
+
95
+
96
+ [meta]
97
+ generated_by = "1.0.401"
app/.chainlit/translations/en-US.json ADDED
@@ -0,0 +1,155 @@
1
+ {
2
+ "components": {
3
+ "atoms": {
4
+ "buttons": {
5
+ "userButton": {
6
+ "menu": {
7
+ "settings": "Settings",
8
+ "settingsKey": "S",
9
+ "APIKeys": "API Keys",
10
+ "logout": "Logout"
11
+ }
12
+ }
13
+ }
14
+ },
15
+ "molecules": {
16
+ "newChatButton": {
17
+ "newChat": "New Chat"
18
+ },
19
+ "tasklist": {
20
+ "TaskList": {
21
+ "title": "\ud83d\uddd2\ufe0f Task List",
22
+ "loading": "Loading...",
23
+ "error": "An error occured"
24
+ }
25
+ },
26
+ "attachments": {
27
+ "cancelUpload": "Cancel upload",
28
+ "removeAttachment": "Remove attachment"
29
+ },
30
+ "newChatDialog": {
31
+ "createNewChat": "Create new chat?",
32
+ "clearChat": "This will clear the current messages and start a new chat.",
33
+ "cancel": "Cancel",
34
+ "confirm": "Confirm"
35
+ },
36
+ "settingsModal": {
37
+ "expandMessages": "Expand Messages",
38
+ "hideChainOfThought": "Hide Chain of Thought",
39
+ "darkMode": "Dark Mode"
40
+ }
41
+ },
42
+ "organisms": {
43
+ "chat": {
44
+ "history": {
45
+ "index": {
46
+ "lastInputs": "Last Inputs",
47
+ "noInputs": "Such empty...",
48
+ "loading": "Loading..."
49
+ }
50
+ },
51
+ "inputBox": {
52
+ "input": {
53
+ "placeholder": "Type your message here..."
54
+ },
55
+ "speechButton": {
56
+ "start": "Start recording",
57
+ "stop": "Stop recording"
58
+ },
59
+ "SubmitButton": {
60
+ "sendMessage": "Send message",
61
+ "stopTask": "Stop Task"
62
+ },
63
+ "UploadButton": {
64
+ "attachFiles": "Attach files"
65
+ },
66
+ "waterMark": {
67
+ "text": "Built with"
68
+ }
69
+ },
70
+ "Messages": {
71
+ "index": {
72
+ "running": "Running",
73
+ "executedSuccessfully": "executed successfully",
74
+ "failed": "failed",
75
+ "feedbackUpdated": "Feedback updated",
76
+ "updating": "Updating"
77
+ }
78
+ },
79
+ "dropScreen": {
80
+ "dropYourFilesHere": "Drop your files here"
81
+ },
82
+ "index": {
83
+ "failedToUpload": "Failed to upload",
84
+ "cancelledUploadOf": "Cancelled upload of",
85
+ "couldNotReachServer": "Could not reach the server",
86
+ "continuingChat": "Continuing previous chat"
87
+ },
88
+ "settings": {
89
+ "settingsPanel": "Settings panel",
90
+ "reset": "Reset",
91
+ "cancel": "Cancel",
92
+ "confirm": "Confirm"
93
+ }
94
+ },
95
+ "threadHistory": {
96
+ "sidebar": {
97
+ "filters": {
98
+ "FeedbackSelect": {
99
+ "feedbackAll": "Feedback: All",
100
+ "feedbackPositive": "Feedback: Positive",
101
+ "feedbackNegative": "Feedback: Negative"
102
+ },
103
+ "SearchBar": {
104
+ "search": "Search"
105
+ }
106
+ },
107
+ "DeleteThreadButton": {
108
+ "confirmMessage": "This will delete the thread as well as it's messages and elements.",
109
+ "cancel": "Cancel",
110
+ "confirm": "Confirm",
111
+ "deletingChat": "Deleting chat",
112
+ "chatDeleted": "Chat deleted"
113
+ },
114
+ "index": {
115
+ "pastChats": "Past Chats"
116
+ },
117
+ "ThreadList": {
118
+ "empty": "Empty..."
119
+ },
120
+ "TriggerButton": {
121
+ "closeSidebar": "Close sidebar",
122
+ "openSidebar": "Open sidebar"
123
+ }
124
+ },
125
+ "Thread": {
126
+ "backToChat": "Go back to chat",
127
+ "chatCreatedOn": "This chat was created on"
128
+ }
129
+ },
130
+ "header": {
131
+ "chat": "Chat",
132
+ "readme": "Readme"
133
+ }
134
+ }
135
+ },
136
+ "hooks": {
137
+ "useLLMProviders": {
138
+ "failedToFetchProviders": "Failed to fetch providers:"
139
+ }
140
+ },
141
+ "pages": {
142
+ "Design": {},
143
+ "Env": {
144
+ "savedSuccessfully": "Saved successfully",
145
+ "requiredApiKeys": "Required API Keys",
146
+ "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
147
+ },
148
+ "Page": {
149
+ "notPartOfProject": "You are not part of this project."
150
+ },
151
+ "ResumeButton": {
152
+ "resumeChat": "Resume Chat"
153
+ }
154
+ }
155
+ }
app/app.py ADDED
@@ -0,0 +1,99 @@
1
+ import chainlit as cl
2
+ from helper import HelperMethods
3
+ from pydantic.v1.error_wrappers import ValidationError
4
+ from cohere.error import CohereAPIError
5
+
6
+ COLLECTION_NAME = "ISO_27001_Collection"
7
+
8
+
9
+ @cl.on_chat_start
10
+ async def on_chat_start():
11
+ """
12
+ Is called when a new chat session is created. Adds an initial message and sets important objects into session state.
13
+ """
14
+
15
+ await cl.sleep(1)
16
+
17
+ msg = cl.Message(author="ISO 27001 - Assistant", content="Hello, do you have questions on ISO 27001? Feel free to ask me.")
18
+ await msg.send()
19
+
20
+ helper = HelperMethods()
21
+
22
+ try:
23
+ llm, MAX_CONTEXT_SIZE = await helper.get_LLM()
24
+ except ValidationError as e:
25
+ error_message = cl.ErrorMessage(
26
+ author="ISO 27001 - Assistant",
27
+ content="A validation error occurred. Please ensure the Open API_KEY is correctly set. You can navigate to the profile icon and then reset the keys. After that reload the page and try to ask the question again.",
28
+ )
29
+ await error_message.send()
30
+ return
31
+
32
+ state = {"llm": llm, "max_context_size": MAX_CONTEXT_SIZE, "vectordb": helper.get_index_vector_db(COLLECTION_NAME)}
33
+ cl.user_session.set("state_ISO", state)
34
+
35
+
36
+ @cl.on_message
37
+ async def on_message(message: cl.Message):
38
+ """
39
+ Is called when a new message is sent by the user. Executes the RAG pipeline (check English, retrieve contexts, check relevancy, check context size, prompt the LLM)
40
+ """
41
+
42
+ state = cl.user_session.get("state_ISO")
43
+ helper = HelperMethods()
44
+ query = message.content
45
+
46
+ if helper.check_if_english(query):
47
+
48
+ try:
49
+ docs = helper.retrieve_contexts(state["vectordb"], query)
50
+ except CohereAPIError as e:
51
+ error_message = cl.ErrorMessage(
52
+ author="ISO 27001 - Assistant",
53
+ content="A validation error occurred. Please ensure the Cohere API_KEY is correctly set. You can navigate to the profile icon and then reset the keys. After that reload the page and try to ask the question again.",
54
+ )
55
+ await error_message.send()
56
+ return
57
+
58
+ if helper.check_if_relevant(docs):
59
+ if helper.is_context_size_valid(docs, query, state["max_context_size"]):
60
+
61
+ msg = cl.Message(author="ISO 27001 - Assistant", content="")
62
+ await msg.send()
63
+
64
+ full_prompt, sources, template_path, template_source = helper.get_full_prompt_sources_and_template(docs, state["llm"], query)
65
+
66
+ try:
67
+ stream = state["llm"].astream(full_prompt)
68
+ except ValidationError as e:
69
+ error_message = cl.ErrorMessage(
70
+ author="ISO 27001 - Assistant",
71
+ content="A validation error occurred. Please ensure the Open API_KEY is correctly set. You can navigate to the profile icon and then reset the keys. After that reload the page and try to ask the question again.",
72
+ )
73
+ await error_message.send()
74
+ return
75
+
76
+ async for part in stream:
77
+ await msg.stream_token(part.content)
78
+
79
+ if template_path == "":
80
+ sources_str = "\n\nSources: \n" + sources
81
+ msg.content += sources_str
82
+ await msg.update()
83
+ else:
84
+ sources_str = "\n\nSources: \n" + sources
85
+ elements = [cl.File(name=template_source, path=template_path, display="inline")]
86
+ msg.content += sources_str
87
+ msg.elements = elements
88
+ await msg.update()
89
+ else:
90
+ await cl.Message(
91
+ author="ISO 27001 - Assistant",
92
+ content="I am sorry. I cannot process your question, as it would exceed my token limit. Please try to reformulate your question, or ask something else.",
93
+ ).send()
94
+ else:
95
+ await cl.Message(author="ISO 27001 - Assistant", content="I am sorry. I cannot process your question, as it is not related to ISO 27001.").send()
96
+ else:
97
+ await cl.Message(
98
+ author="ISO 27001 - Assistant", content="I am sorry. I cannot process your question, as I can only answer questions written in English."
99
+ ).send()
app/chainlit.md ADDED
@@ -0,0 +1,7 @@
1
+ # Welcome to the ISO 27001 Chatbot! 🤖
2
+
3
+ Hello, this Chainlit application lets you chat with a Retrieval Augmented Generation (RAG) pipeline focused on ISO 27001. It will only answer English questions related to this topic. Please be aware that this is not a production-ready system; do not fully trust the answers, as they can still contain errors.
4
+
5
+ The RAG pipeline needs to load the underlying models, so you can start as soon as the Assistant greets you. However, to use the chatbot, you need two API keys, which you can create via the following links: [OpenAI](https://openai.com/blog/openai-api) (for the generation) and [Cohere](https://dashboard.cohere.com/api-keys) (for the retrieval re-ranking). After creating both keys, assign them by clicking on the user profile and setting the keys, then reload the page.
6
+
7
+ Please be advised that this HF Space runs on the free tier, so no GPU is available. Because of this, answering a question can take up to 40 seconds; with GPU support, it would be reduced to below 10 seconds.
app/helper.py ADDED
@@ -0,0 +1,217 @@
1
+ from langchain_community.embeddings import HuggingFaceEmbeddings
2
+ from langchain_openai import ChatOpenAI
3
+ from langchain.retrievers.document_compressors import CohereRerank
4
+ from langchain_community.retrievers import BM25Retriever
5
+ import tiktoken
6
+
7
+ # ONLY USE WITH DOCKER; comment out the following three lines when running locally on a non-Linux machine (see README_PROJECT.md)
8
+ import pysqlite3
9
+ import sys
10
+ sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
11
+
12
+ import chromadb
13
+ import chainlit as cl
14
+ from langdetect import detect
15
+ from langchain_community.vectorstores import Chroma
16
+ from typing import List
17
+ from typing import Tuple
18
+ import os
19
+ import json
20
+ from langchain.docstore.document import Document
21
+ from prompts import get_system_prompt, get_human_prompt, get_system_prompt_template, get_full_prompt
22
+
23
+ class HelperMethods:
24
+ """
25
+ Helper class with all important methods for the RAG pipeline.
26
+ """
27
+
28
+ def __init__(self):
29
+ pass
30
+
31
+ def _get_embedding_model(self):
32
+ """
33
+ Gets the finetuned embedding model based on bge-large-en-v1.5
34
+ """
35
+ path = "Basti8499/bge-large-en-v1.5-ISO-27001"
36
+ model = HuggingFaceEmbeddings(model_name=path)
37
+ return model
38
+
39
+
40
+ async def get_LLM(self):
41
+ """
42
+ Initializes the gpt-3.5-turbo LLM with a 16k context window
43
+ """
44
+ llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0, max_tokens=680, streaming=True, api_key=cl.user_session.get("env")["OPENAI_API_KEY"])
45
+ max_context_size = 16385
46
+ return llm, max_context_size
47
+
48
+
49
+ def get_index_vector_db(self, collection_name: str):
50
+ """
51
+ Gets the index vector DB based on the collection name, if it exists.
52
+ """
53
+ new_client = chromadb.PersistentClient(path=os.environ.get("CHROMA_PATH"))
54
+
55
+ # Check if collection already exists
56
+ collection_exists = True
57
+ try:
58
+ new_client.get_collection(collection_name)
59
+ except ValueError as e:
60
+ collection_exists = False
61
+
62
+ if not collection_exists:
63
+ raise Exception("Error, raised exception: Collection does not exist.")
64
+ else:
65
+ embedding_model = self._get_embedding_model()
66
+ vectordb = Chroma(client=new_client, collection_name=collection_name, embedding_function=embedding_model)
67
+
68
+ return vectordb
69
+
70
+
71
+ def _load_documents(self, file_path: str) -> List[Document]:
72
+ documents = []
73
+ with open(file_path, "r") as jsonl_file:
74
+ for line in jsonl_file:
75
+ data = json.loads(line)
76
+ obj = Document(**data)
77
+ documents.append(obj)
78
+ return documents
79
+
80
+ def check_if_english(self, query: str) -> bool:
81
+ """
82
+ Uses the langdetect library based on Google's language-detection library to check which language the query is in.
83
+ Returns True if it is English.
84
+ """
85
+ language = detect(query)
86
+ return language == "en"
87
+
88
+ def check_if_relevant(self, docs: List[Document]) -> bool:
89
+
90
+ relevance_scores = [doc.metadata["relevance_score"] for doc in docs]
91
+ avg_score = sum(relevance_scores) / len(relevance_scores)
92
+ return avg_score > 0.75
93
+
94
+ def retrieve_contexts(self, vectordb, query: str, k: int = 8, rerank_k: int = 50, dense_percent: float = 0.5) -> List[Document]:
95
+ """
96
+ Retrieves the documents from the vector database by using a hybrid approach (dense (similarity search) + sparse (BM25)) and the Cohere re-ranking endpoint.
97
+ """
98
+ dense_k = int(rerank_k * dense_percent)
99
+ sparse_k = rerank_k - dense_k
100
+
101
+ # Sparse Retrieval
102
+ sparse_documents = self._load_documents("./../sparse_index/sparse_1536_264")
103
+ bm25_retriever = BM25Retriever.from_documents(sparse_documents)
104
+ bm25_retriever.k = sparse_k
105
+ result_documents_BM25 = bm25_retriever.get_relevant_documents(query)
106
+
107
+ # Dense Retrieval
108
+ result_documents_Dense = vectordb.similarity_search(query, k=dense_k)
109
+
110
+ result_documents_all = []
111
+ result_documents_all.extend(result_documents_BM25)
112
+ result_documents_all.extend(result_documents_Dense)
113
+
114
+ # Only get unique documents and remove duplicates that were retrieved in both sparse and dense
115
+ unique_documents_dict = {}
116
+ for doc in result_documents_all:
117
+ if doc.page_content not in unique_documents_dict:
118
+ unique_documents_dict[doc.page_content] = doc
119
+ result_documents_unique = list(unique_documents_dict.values())
120
+
121
+ # Re-ranking with Cohere
122
+ compressor = CohereRerank(top_n=k, user_agent="langchain", cohere_api_key=cl.user_session.get("env")["COHERE_API_KEY"])
123
+ result_documents = compressor.compress_documents(documents=result_documents_unique, query=query)
124
+
125
+ return result_documents
126
+
127
+
128
+ def is_context_size_valid(self, contexts: List[Document], query: str, max_context_size: int) -> bool:
129
+ """
130
+ Checks if the context size is valid with the cl100k tokenizer, which is used for OpenAI LLMs.
131
+ """
132
+ # Transform List[Document] into List[str]
133
+ concatenated_contexts = ""
134
+ for index, document in enumerate(contexts):
135
+ original_text = document.metadata.get("original_text", "")
136
+ # Replace curly brackets, as otherwise problems can be encountered with formatting the prompt
137
+ original_text = original_text.replace("{", "").replace("}", "")
138
+ concatenated_contexts += f"{index+1}. {original_text}\n\n"
139
+
140
+ if not query.endswith("?"):
141
+ query = query + "?"
142
+
143
+ # Get the prompts
144
+ system_str, system_prompt = get_system_prompt()
145
+ human_str, human_prompt = get_human_prompt(concatenated_contexts, query)
146
+ full_prompt = system_str + "\n" + human_str
147
+
148
+ # Count token length
149
+ tokenizer = tiktoken.get_encoding("cl100k_base")
150
+ token_length = len(tokenizer.encode(full_prompt))
151
+
152
+ if token_length <= max_context_size:
153
+ return True
154
+ else:
155
+ return False
156
+
157
+ def get_full_prompt_sources_and_template(self, contexts: List[Document], llm, prompt: str) -> Tuple[str, str, str, str]:
158
+
159
+ # Check if the query is aimed at a template and check if the context documents also have a template
160
+ # If it is a template question the query and system prompt has to be altered
161
+ # Only check first two because otherwise the re-ranked score is not high enough to assume that the retrieved template is valid for that question
162
+ is_template_question = False
163
+ template_path = ""
164
+ template_source = ""
165
+ if "template" in prompt.lower():
166
+ for context in contexts[:2]:
167
+ if "template_path" in context.metadata:
168
+ is_template_question = True
169
+ template_path = context.metadata["template_path"]
170
+ template_source = context.metadata["source"]
171
+ break
172
+
173
+ # Concatenate all document texts and sources
174
+ concatenated_contexts = ""
175
+ concatenated_sources = ""
176
+ seen_sources = set()
177
+ if is_template_question:
178
+
179
+ for index, document in enumerate(contexts[:2]):
180
+ original_text = document.metadata.get('original_text', '')
181
+ # Replace curly brackets, as otherwise problems can be encountered with formatting the prompt
182
+ original_text = original_text.replace("{", "").replace("}", "")
183
+ concatenated_contexts += f"{index+1}. {original_text}\n\n"
184
+
185
+ source = document.metadata.get('source', '')
186
+ if source not in seen_sources:
187
+ concatenated_sources += f"{len(seen_sources) + 1}. {source}\n"
188
+ seen_sources.add(source)
189
+
190
+ else:
191
+ for index, document in enumerate(contexts):
192
+ original_text = document.metadata.get('original_text', '')
193
+ # Replace curly brackets, as otherwise problems can be encountered with formatting the prompt
194
+ original_text = original_text.replace("{", "").replace("}", "")
195
+ concatenated_contexts += f"{index+1}. {original_text}\n\n"
196
+
197
+ source = document.metadata.get('source', '')
198
+ if source not in seen_sources:
199
+ concatenated_sources += f"{len(seen_sources) + 1}. {source}\n"
200
+ seen_sources.add(source)
201
+
202
+ # Check if question mark is at the end of the prompt
203
+ if not prompt.endswith("?"):
204
+ prompt = prompt + "?"
205
+
206
+ if is_template_question:
207
+ system_str, system_prompt = get_system_prompt_template()
208
+ human_str, human_prompt = get_human_prompt(concatenated_contexts, prompt)
209
+ full_prompt = get_full_prompt(system_prompt, human_prompt)
210
+ #answer = llm.invoke(full_prompt).content
211
+ else:
212
+ system_str, system_prompt = get_system_prompt()
213
+ human_str, human_prompt = get_human_prompt(concatenated_contexts, prompt)
214
+ full_prompt = get_full_prompt(system_prompt, human_prompt)
215
+ #answer = llm.invoke(full_prompt).content
216
+
217
+ return full_prompt, concatenated_sources, template_path, template_source
app/prompts.py ADDED
@@ -0,0 +1,39 @@
1
+ from typing import Tuple
2
+ from typing import List
3
+ from langchain.prompts.chat import (
4
+ ChatPromptTemplate,
5
+ SystemMessagePromptTemplate,
6
+ HumanMessagePromptTemplate,
7
+ )
8
+ from langchain_core.messages import BaseMessage
9
+ from langchain.docstore.document import Document
10
+
11
+ """
12
+ Python file for getting the prompts and the respective templates.
13
+ """
14
+
15
+ def get_system_prompt() -> Tuple[str, SystemMessagePromptTemplate]:
16
+
17
+ prompt_str = """You are an expert in information security, especially for ISO 27001 certifications. Answer the following question as truthfully as possible, using the provided context and not prior knowledge. If the answer is not contained within the context or the question is not related to the topic of information security or ISO 27001 or the question is not written in English, respond with 'I am sorry. I do not have knowledge on that topic'. Write a maximum of 400 words."""
18
+ template = SystemMessagePromptTemplate.from_template(prompt_str)
19
+ return prompt_str, template
20
+
21
+ def get_system_prompt_template() -> Tuple[str, SystemMessagePromptTemplate]:
22
+
23
+ prompt_str = f"""Answer the following question with that you can provide a template to the user and say that it is attached to this message. After that end your answer."""
24
+ template = SystemMessagePromptTemplate.from_template(prompt_str)
25
+
26
+ return prompt_str, template
27
+
28
+ def get_human_prompt(contexts: List[str], question: str) -> Tuple[str, HumanMessagePromptTemplate]:
29
+
30
+ prompt_str = f"""Question: {question} \n Context: {contexts}"""
31
+ template = HumanMessagePromptTemplate.from_template(prompt_str)
32
+ return prompt_str, template
33
+
34
+ def get_full_prompt(system_prompt: SystemMessagePromptTemplate, human_prompt: HumanMessagePromptTemplate) -> List[BaseMessage]:
35
+
36
+ full_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
37
+ prompt_messages = full_prompt.format_prompt().to_messages()
38
+
39
+ return prompt_messages
app/public/logo_dark.png ADDED
app/public/logo_light.png ADDED
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/data_level0.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95042e844cfb77b20e578cf65635282a99d7c4dd20e589ac062f38bc389f8e58
3
+ size 4236000
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/header.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcc596bc1909f7cc610d5839236c90513b4fbad06776c253fa1b21bfd712e940
3
+ size 100
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/length.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
3
+ size 4000
chroma/28f27476-0d16-464f-b0fa-5ebbcf277b95/link_lists.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491712cb0724ea8cd14691afba9f5c0ae3f07b780ec11e0a383b24c6cd711fe6
3
+ size 10010624
index_preparation/build_index.ipynb ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Notebook for creating/updating the dense and sparse indices"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from ipynb.fs.defs.preprocess_data import preprocess_data\n",
17
+ "from ipynb.fs.defs.preprocess_data import get_documents_from_files\n",
18
+ "from ipynb.fs.defs.preprocess_data import split_docs\n",
19
+ "from ipynb.fs.defs.preprocess_data import clean_and_process_chunked_documents\n",
20
+ "from ipynb.fs.defs.preprocess_data import store_documents\n",
21
+ "import chromadb\n",
22
+ "from langchain.vectorstores import Chroma\n",
23
+ "from langchain.docstore.document import Document\n",
24
+ "from typing import List\n",
25
+ "import os\n",
26
+ "\n",
27
+ "\n",
28
+ "def build_or_update_index_vector_db(documents: List[Document], embeddings, collection_name: str, dist_function: str, collection_metadata: dict):\n",
29
+ " '''\n",
30
+ " Builds the index vector DB from documents with the specified embeddings and collection_name\n",
31
+ " If it already exists, updates the index with the new documents\n",
32
+ " '''\n",
33
+ " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
34
+ "\n",
35
+ " print(\"Starting to build index for: \", collection_metadata)\n",
36
+ "\n",
37
+ " # Check if collection already exists\n",
38
+ " collection_exists = True\n",
39
+ " try:\n",
40
+ " collection = new_client.get_collection(collection_name)\n",
41
+ " except ValueError as e:\n",
42
+ " collection_exists = False\n",
43
+ "\n",
44
+ " if not collection_exists:\n",
45
+ " print(\"Collection is new\")\n",
46
+ " # If collection does not exist, create it\n",
47
+ " collection = new_client.create_collection(collection_name)\n",
48
+ " # Each document needs an ID\n",
49
+ " ids = [str(i) for i in range(1, len(documents) + 1)]\n",
50
+ "\n",
51
+ " # Store the text of the document and metadata separately in order to insert it into Chroma\n",
52
+ " texts = []\n",
53
+ " metadata_docs = []\n",
54
+ " for document in documents:\n",
55
+ " texts.append(document.page_content)\n",
56
+ " metadata_docs.append(document.metadata)\n",
57
+ "\n",
58
+ " # Add them in batches (otherwise Chroma error)\n",
59
+ " for start_idx in range(0, len(embeddings), 1000):\n",
60
+ " end_idx = start_idx + 1000\n",
61
+ " # Ensure not to go out of bounds\n",
62
+ " embeddings_batch = embeddings[start_idx : min(end_idx, len(embeddings))]\n",
63
+ " texts_batch = texts[start_idx : min(end_idx, len(embeddings))]\n",
64
+ " ids_batch = ids[start_idx : min(end_idx, len(embeddings))]\n",
65
+ " metadatas_batch = metadata_docs[start_idx : min(end_idx, len(embeddings))]\n",
66
+ "\n",
67
+ " collection.add(embeddings=embeddings_batch, documents=texts_batch, ids=ids_batch, metadatas=metadatas_batch)\n",
68
+ " print(f\"Added embeddings from {start_idx} to {min(end_idx, len(embeddings))-1}\")\n",
69
+ "\n",
70
+ " vectordb = Chroma(\n",
71
+ " client=new_client,\n",
72
+ " collection_name=collection_name,\n",
73
+ " collection_metadata={\n",
74
+ " \"embedding_model_provider\": collection_metadata[\"embedding_model_provider\"],\n",
75
+ " \"embedding_model_name\": collection_metadata[\"embedding_model_name\"],\n",
76
+ " \"chunk_size\": collection_metadata[\"chunk_size\"],\n",
77
+ " \"chunk_overlap\": collection_metadata[\"chunk_overlap\"],\n",
78
+ " \"hnsw:space\": dist_function, # either \"l2\" or \"ip\" or \"cosine\"\n",
79
+ " },\n",
80
+ " )\n",
81
+ " print(f\"Collection {collection_name} successfully created.\")\n",
82
+ " print(\"There are\", vectordb._collection.count(), \"entries in the collection.\")\n",
83
+ "\n",
84
+ " return new_client, vectordb\n",
85
+ "\n",
86
+ " else:\n",
87
+ " print(\"Collection already exists\")\n",
88
+ " vectordb = Chroma(client=new_client, collection_name=collection_name)\n",
89
+ "\n",
90
+ " collection_count = vectordb._collection.count()\n",
91
+ " print(f\"There are {collection_count} entries in the collection {collection_name} prior to updating.\")\n",
92
+ "\n",
93
+ " # Continue the IDs from the last ID\n",
94
+ " ids = [str(i) for i in range(collection_count + 1, collection_count + len(documents) + 1)]\n",
95
+ " # Store the text of the document and metadata separately in order to insert it into Chroma\n",
96
+ " texts = []\n",
97
+ " metadata_docs = []\n",
98
+ " for document in documents:\n",
99
+ " texts.append(document.page_content)\n",
100
+ " metadata_docs.append(document.metadata)\n",
101
+ "\n",
102
+ " # Add them in batches (otherwise Chroma error)\n",
103
+ " for start_idx in range(0, len(embeddings), 1000):\n",
104
+ " end_idx = start_idx + 1000\n",
105
+ " # Ensure not to go out of bounds\n",
106
+ " embeddings_batch = embeddings[start_idx : min(end_idx, len(embeddings))]\n",
107
+ " texts_batch = texts[start_idx : min(end_idx, len(embeddings))]\n",
108
+ " ids_batch = ids[start_idx : min(end_idx, len(embeddings))]\n",
109
+ " metadatas_batch = metadata_docs[start_idx : min(end_idx, len(embeddings))]\n",
110
+ "\n",
111
+ " collection.add(embeddings=embeddings_batch, documents=texts_batch, ids=ids_batch, metadatas=metadatas_batch)\n",
112
+ " print(f\"Added embeddings from {start_idx} to {min(end_idx, len(embeddings))-1}\")\n",
113
+ "\n",
114
+ " collection_count = vectordb._collection.count()\n",
115
+ " print(f\"There are {collection_count} entries in the collection {collection_name} after updating.\")\n",
116
+ " return new_client, 0"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": null,
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "chunk_size = 1536\n",
126
+ "chunk_overlap = 264\n",
127
+ "# If update is needed, set to False\n",
128
+ "all_docs = True\n",
129
+ "\n",
130
+ "documents, embedding_model, embeddings = preprocess_data(chunk_size, chunk_overlap, all_docs)\n",
131
+ "collection_name = \"ISO_27001_Collection\"\n",
132
+ "collection_metadata = {\n",
133
+ "\"embedding_model_provider\": \"Fine-tuned\",\n",
134
+ "\"embedding_model_name\": \"finetuned-BGE-large-ISO-27001\",\n",
135
+ "\"chunk_size\": str(chunk_size),\n",
136
+ "\"chunk_overlap\": str(chunk_overlap),\n",
137
+ "}\n",
138
+ "\n",
139
+ "build_or_update_index_vector_db(documents, embeddings, collection_name, \"l2\", collection_metadata)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "def store_documents_for_sparse_retrieval(chunk_size: int, chunk_overlap: int):\n",
149
+ " \"\"\"\n",
150
+ " Stores the documents for sparse retrieval in a basic text file\n",
151
+ " \"\"\"\n",
152
+ " documents = get_documents_from_files(True)\n",
153
+ " chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
154
+ " chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n",
155
+ "\n",
156
+ " store_documents(chunked_cleaned_documents, f\"./../sparse_index/sparse_1536_264\")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "# Create the actual sparse index\n",
166
+ "store_documents_for_sparse_retrieval(chunk_size, chunk_overlap)"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "metadata": {},
172
+ "source": [
173
+ "#### Helper methods for Chroma"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "# Returns the vectorDB based on the collection name if it exists\n",
183
+ "def get_index_vector_db(collection_name: str):\n",
184
+ " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
185
+ "\n",
186
+ " # Check if collection already exists\n",
187
+ " collection_exists = True\n",
188
+ " try:\n",
189
+ " new_client.get_collection(collection_name)\n",
190
+ " except ValueError as e:\n",
191
+ " collection_exists = False\n",
192
+ "\n",
193
+ " if not collection_exists:\n",
194
+ " raise Exception(\"Error, raised exception: Collection does not exist.\")\n",
195
+ " else:\n",
196
+ " vectordb = Chroma(client=new_client, collection_name=collection_name)\n",
197
+ "\n",
198
+ " return new_client, vectordb"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "def delete_collection(collection_name: str):\n",
208
+ " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
209
+ "\n",
210
+ " try:\n",
211
+ " new_client.delete_collection(collection_name)\n",
212
+ " except ValueError as e:\n",
213
+ " print(\"Collection could not be deleted.\")"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "def return_collections():\n",
223
+ " new_client = chromadb.PersistentClient(path=os.environ.get(\"CHROMA_PATH\"))\n",
224
+ " collections = new_client.list_collections()\n",
225
+ " return collections"
226
+ ]
227
+ }
228
+ ],
229
+ "metadata": {
230
+ "kernelspec": {
231
+ "display_name": "venv",
232
+ "language": "python",
233
+ "name": "python3"
234
+ },
235
+ "language_info": {
236
+ "codemirror_mode": {
237
+ "name": "ipython",
238
+ "version": 3
239
+ },
240
+ "file_extension": ".py",
241
+ "mimetype": "text/x-python",
242
+ "name": "python",
243
+ "nbconvert_exporter": "python",
244
+ "pygments_lexer": "ipython3",
245
+ "version": "3.11.5"
246
+ }
247
+ },
248
+ "nbformat": 4,
249
+ "nbformat_minor": 2
250
+ }
index_preparation/create_QA_set_documents.ipynb ADDED
@@ -0,0 +1,84 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Notebook for creating the documents based on the curated QA pair dataset"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from ipynb.fs.defs.preprocess_data import store_documents\n",
17
+ "from langchain.docstore.document import Document\n",
18
+ "import json\n",
19
+ "\n",
20
+ "# Load QA dataset\n",
21
+ "with open(\"./../input_data/QA_dataset/golden_qa_set.json\", 'r') as file:\n",
22
+ " golden_qa_set = json.load(file)\n",
23
+ "\n",
24
+ "# Remove duplicate answers (Kersten + Secondary Literature) and template answers\n",
25
+ "indices_to_remove = list(range(102, 121)) + list(range(122, 133)) + list(range(134, 157))\n",
26
+ "indices_to_remove = sorted(set(indices_to_remove), reverse=True)\n",
27
+ "for index in indices_to_remove:\n",
28
+ " del golden_qa_set['qa_set'][index]\n",
29
+ "\n",
30
+ "question_set = [qa['question'] for qa in golden_qa_set['qa_set']]\n",
31
+ "golden_answer_set = [qa['golden_answer'] for qa in golden_qa_set['qa_set']]"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "# Create one document for each question\n",
41
+ "all_qa_dataset_documents = []\n",
42
+ "for q, a in zip(question_set, golden_answer_set):\n",
43
+ "\n",
44
+ " document = Document(\n",
45
+ " page_content=f\"{q} \\n {a}\", \n",
46
+ " metadata={\n",
47
+ " \"source\": \"QA Dataset\",\n",
48
+ " \"title\": \"QA Dataset\"\n",
49
+ " })\n",
50
+ " all_qa_dataset_documents.append(document)"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "store_documents(all_qa_dataset_documents, \"./../input_data/QA_dataset/all_documents\")"
60
+ ]
61
+ }
62
+ ],
63
+ "metadata": {
64
+ "kernelspec": {
65
+ "display_name": "venv",
66
+ "language": "python",
67
+ "name": "python3"
68
+ },
69
+ "language_info": {
70
+ "codemirror_mode": {
71
+ "name": "ipython",
72
+ "version": 3
73
+ },
74
+ "file_extension": ".py",
75
+ "mimetype": "text/x-python",
76
+ "name": "python",
77
+ "nbconvert_exporter": "python",
78
+ "pygments_lexer": "ipython3",
79
+ "version": "3.8.5"
80
+ }
81
+ },
82
+ "nbformat": 4,
83
+ "nbformat_minor": 2
84
+ }
index_preparation/create_pdf_documents.ipynb ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Notebook for updating the PDF document"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from ipynb.fs.defs.preprocess_data import store_documents\n",
17
+ "from ipynb.fs.defs.preprocess_data import load_documents\n",
18
+ "from langchain.docstore.document import Document\n",
19
+ "import pypdfium2 as pdfium\n",
20
+ "import cv2\n",
21
+ "import os\n",
22
+ "import pytesseract\n",
23
+ "from typing import List\n",
24
+ "import shutil\n",
25
+ "\n",
26
+ "pytesseract_path = os.environ.get(\"TESSERACT_PATH\")\n",
27
+ "pytesseract.pytesseract.tesseract_cmd = pytesseract_path\n",
28
+ "\n",
29
+ "\n",
30
+ "def update_pdf_documents() -> List[Document]:\n",
31
+ " \"\"\"\n",
32
+ " Method for processing and updating documents based on the PDFs stored in input_data/PDF/documents. For that the PDFs, that were not processed yet, are converted to images and then transformed to texts. For each PDF one document is then created with all text from all pages. In the end the filename is changed, so that it is clear that it was already processed.\n",
33
+ " \"\"\"\n",
34
+ "\n",
35
+ " # List for either all documents or only new ones\n",
36
+ " documents_PDF = []\n",
37
+ " # List for all documents\n",
38
+ " already_processed_documents = load_documents(\"./../input_data/PDF/documents/all_documents\")\n",
39
+ "\n",
40
+ " PDF_images_path = \"./../input_data/PDF/PDF_Images\"\n",
41
+ " directory_path = \"./../input_data/PDF/files\"\n",
42
+ "\n",
43
+ " # Go through each PDF file in the directory\n",
44
+ " for file in os.listdir(directory_path):\n",
45
+ " if \"Tesseract_processed\" not in file:\n",
46
+ " file_path = os.path.join(directory_path, file)\n",
47
+ " pdf = pdfium.PdfDocument(file_path)\n",
48
+ " n_pages = len(pdf)\n",
49
+ " # Create directory to store the image\n",
50
+ " os.makedirs(PDF_images_path + f\"/{file}\")\n",
51
+ " complete_text = \"\"\n",
52
+ " # Go through each page of the PDF and save the according image\n",
53
+ " for page_number in range(n_pages):\n",
54
+ " page = pdf.get_page(page_number)\n",
55
+ " pil_image = page.render(\n",
56
+ " scale=300 / 72,\n",
57
+ " rotation=0,\n",
58
+ " crop=(0, 0, 0, 0),\n",
59
+ " ).to_pil()\n",
60
+ " pil_image_path = PDF_images_path + f\"/{file}/image_{page_number+1}.png\"\n",
61
+ " pil_image.save(pil_image_path)\n",
62
+ " img = cv2.imread(pil_image_path)\n",
63
+ " # Convert image to grayscale\n",
64
+ " gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
65
+ " # Apply threshold to convert to binary image\n",
66
+ " threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]\n",
67
+ " # Pass the image through pytesseract and add the text to the whole document text\n",
68
+ " complete_text += pytesseract.image_to_string(threshold_img) + \"\\n\"\n",
69
+ " # Remove the image as it is already processed\n",
70
+ " os.remove(pil_image_path)\n",
71
+ "\n",
72
+ " file_name_without_pdf = file\n",
73
+ " if file.endswith(\".pdf\"):\n",
74
+ " file_name_without_pdf = file[:-4]\n",
75
+ " # Create a document based on the whole text and metadata\n",
76
+ " document_PDF = Document(page_content=complete_text, metadata={\"source\": file, \"title\": file_name_without_pdf})\n",
77
+ " documents_PDF.append(document_PDF)\n",
78
+ " already_processed_documents.append(document_PDF)\n",
79
+ "\n",
80
+ " # Change the filename, so that in future calls the PDF is not processed again\n",
81
+ " new_filename = file.replace(\".pdf\", \"_Tesseract_processed.pdf\")\n",
82
+ " new_pdf_path = os.path.join(directory_path, new_filename)\n",
83
+ " print(new_pdf_path)\n",
84
+ " pdf.close()\n",
85
+ " os.rename(file_path, new_pdf_path)\n",
86
+ "\n",
87
+ " # Store docs if new documents were processed\n",
88
+ " if len(documents_PDF) > 0:\n",
89
+ " # Store all documents, including the new ones\n",
90
+ " store_documents(already_processed_documents, \"./../input_data/PDF/documents/all_documents\")\n",
91
+ " # Store the new documents\n",
92
+ " store_documents(documents_PDF, \"./../input_data/PDF/documents/new_documents\")\n",
93
+ "\n",
94
+ " # Delete the empty folders inside the images folder\n",
95
+ " target_dir = \"./../input_data/PDF/PDF_images\"\n",
96
+ "\n",
97
+ " # Check if the target directory exists to avoid errors\n",
98
+ " if os.path.exists(target_dir):\n",
99
+ " # List all the items in the directory\n",
100
+ " for item in os.listdir(target_dir):\n",
101
+ " item_path = os.path.join(target_dir, item)\n",
102
+ " if os.path.isdir(item_path):\n",
103
+ " # Use shutil.rmtree to delete the directory and all its contents\n",
104
+ " shutil.rmtree(item_path)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "# Uncomment update needed because of new unprocessed files\n",
114
+ "# update_pdf_documents()"
115
+ ]
116
+ }
117
+ ],
118
+ "metadata": {
119
+ "kernelspec": {
120
+ "display_name": "venv",
121
+ "language": "python",
122
+ "name": "python3"
123
+ },
124
+ "language_info": {
125
+ "codemirror_mode": {
126
+ "name": "ipython",
127
+ "version": 3
128
+ },
129
+ "file_extension": ".py",
130
+ "mimetype": "text/x-python",
131
+ "name": "python",
132
+ "nbconvert_exporter": "python",
133
+ "pygments_lexer": "ipython3",
134
+ "version": "3.8.5"
135
+ }
136
+ },
137
+ "nbformat": 4,
138
+ "nbformat_minor": 2
139
+ }
index_preparation/create_template_documents.ipynb ADDED
@@ -0,0 +1,140 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Notebook for creating the template documents"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "from ipynb.fs.defs.preprocess_data import get_template_documents\n",
17
+ "from ipynb.fs.defs.preprocess_data import store_documents\n",
18
+ "from langchain.docstore.document import Document\n",
19
+ "import os\n",
20
+ "import shutil\n",
21
+ "\n",
22
+ "# Load all already existing documents and store paths of new documents to be processed\n",
23
+ "all_template_documents = get_template_documents(True)\n",
24
+ "template_paths = os.listdir(\"./../input_data/Templates/template_files/new\")\n",
25
+ "\n",
26
+ "print(all_template_documents)\n",
27
+ "print(\"\\n\")\n",
28
+ "print(template_paths)"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "# Manually create the documents for each template\n",
38
+ "full_path = \"./../input_data/Templates/template_files/processed\"\n",
39
+ "\n",
40
+ "template_document_1 = Document(\n",
41
+ " page_content=f\"You can find a possible template for the backup policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the backup policy.\", \n",
42
+ " metadata={\n",
43
+ " \"template_path\": full_path + \"/\" + template_paths[0], \n",
44
+ " \"source\": template_paths[0],\n",
45
+ " })\n",
46
+ "\n",
47
+ "template_document_2 = Document(\n",
48
+ " page_content=f\"You can find a possible template for the change management policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content, procedures, risk management and more for the change management policy.\", \n",
49
+ " metadata={\n",
50
+ " \"template_path\": full_path + \"/\" + template_paths[1], \n",
51
+ " \"source\": template_paths[1],\n",
52
+ " })\n",
53
+ "\n",
54
+ "\n",
55
+ "template_document_3 = Document(\n",
56
+ " page_content=f\"You can find a possible template for the encryption policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the encryption policy.\", \n",
57
+ " metadata={\n",
58
+ " \"template_path\": full_path + \"/\" + template_paths[2], \n",
59
+ " \"source\": template_paths[2],\n",
60
+ " })\n",
61
+ "\n",
62
+ "\n",
63
+ "template_document_4 = Document(\n",
64
+ " page_content=f\"You can find a possible template for a checklist for all ISO-27001 controls (Version 2013) attached to this message. It contains a simple checklist for the ISO 27001 controls 5 to 18.\", \n",
65
+ " metadata={\n",
66
+ " \"template_path\": full_path + \"/\" + template_paths[3], \n",
67
+ " \"source\": template_paths[3],\n",
68
+ " })\n",
69
+ "\n",
70
+ "template_document_5 = Document(\n",
71
+ " page_content=f\"You can find a possible template for a risk assessment needed for the ISO-27001 certification attached to this message. It contains a simple checklist of selected controls for which a risk assessment is needed.\", \n",
72
+ " metadata={\n",
73
+ " \"template_path\": full_path + \"/\" + template_paths[4], \n",
74
+ " \"source\": template_paths[4],\n",
75
+ " })\n",
76
+ "\n",
84
+ "new_template_documents = []\n",
85
+ "new_template_documents.append(template_document_1)\n",
86
+ "new_template_documents.append(template_document_2)\n",
87
+ "new_template_documents.append(template_document_3)\n",
88
+ "new_template_documents.append(template_document_4)\n",
89
+ "new_template_documents.append(template_document_5)"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "# Store the new templates and all templates\n",
99
+ "store_documents(new_template_documents, \"./../input_data/Templates/documents/new_documents\")\n",
100
+ "\n",
101
+ "all_template_documents.extend(new_template_documents)\n",
102
+ "store_documents(new_template_documents, \"./../input_data/Templates/documents/all_documents\")"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "# Move new templates to processed templates\n",
112
+ "for path in template_paths:\n",
113
+ " source_file = f\"./../input_data/Templates/template_files/new/{path}\"\n",
114
+ " destination_folder = \"./../input_data/Templates/template_files/processed\"\n",
115
+ " shutil.move(source_file, destination_folder)"
116
+ ]
117
+ }
118
+ ],
119
+ "metadata": {
120
+ "kernelspec": {
121
+ "display_name": "venv",
122
+ "language": "python",
123
+ "name": "python3"
124
+ },
125
+ "language_info": {
126
+ "codemirror_mode": {
127
+ "name": "ipython",
128
+ "version": 3
129
+ },
130
+ "file_extension": ".py",
131
+ "mimetype": "text/x-python",
132
+ "name": "python",
133
+ "nbconvert_exporter": "python",
134
+ "pygments_lexer": "ipython3",
135
+ "version": "3.11.5"
136
+ }
137
+ },
138
+ "nbformat": 4,
139
+ "nbformat_minor": 2
140
+ }
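A note on the positional indexing in create_template_documents.ipynb above: the five Document descriptions are matched to template_paths[0] through template_paths[4], which only works while the directory listing returns the files in alphabetical order. A small sketch of a more robust alternative, assuming the processed file names listed later in this commit, maps each file name to its description explicitly:

# Sketch only: build the template Documents from an explicit filename -> description map,
# so the result does not depend on directory listing order.
# File names are taken from input_data/Templates/template_files/processed in this commit.
from langchain.docstore.document import Document

TEMPLATE_DESCRIPTIONS = {
    "Backup policy.docx": "a possible template for the backup policy from the Annex A of ISO 27001",
    "Change management policy.docx": "a possible template for the change management policy from the Annex A of ISO 27001",
    "Encryption policy.docx": "a possible template for the encryption policy from the Annex A of ISO 27001",
    "IC-ISO-27001-Controls-Checklist.xlsx": "a possible template for a checklist for all ISO-27001 controls (Version 2013)",
    "IC-ISO-27001-Risk-Assessment.xlsx": "a possible template for a risk assessment needed for the ISO-27001 certification",
}

def build_template_documents(full_path: str) -> list:
    # One Document per known template file, with the same metadata keys the notebook uses.
    return [
        Document(
            page_content=f"You can find {description} attached to this message.",
            metadata={"template_path": f"{full_path}/{file_name}", "source": file_name},
        )
        for file_name, description in TEMPLATE_DESCRIPTIONS.items()
    ]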
index_preparation/create_web_documents.ipynb ADDED
@@ -0,0 +1,176 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Notebook for cleaning, creating and updating the web documents\n",
8
+ "\n",
9
+ "First import the documents from the uncleaned URLs and store the text into separate files. Then manually clean files and update it. After that, get all already cleaned documents and store both, only the new documents and all documents (old cleaned + new cleaned)."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from ipynb.fs.defs.preprocess_data import get_web_documents\n",
19
+ "from ipynb.fs.defs.preprocess_data import store_documents\n",
20
+ "from langchain.docstore.document import Document\n",
21
+ "from langchain.document_loaders import AsyncHtmlLoader\n",
22
+ "from langchain.document_transformers import Html2TextTransformer\n",
23
+ "from typing import List\n",
24
+ "\n",
25
+ "def get_web_documents_for_cleaning() -> List[Document]:\n",
26
+ " \"\"\"\n",
27
+ " Method for returning documents based on the URLs. Looks at the .txt file with all uncleaned urls and uses the AsyncHTMLoader and HTML2TextTransformer to get the texts.\n",
28
+ " \"\"\"\n",
29
+ " directory_path_web = \"./../input_data/Web/URLs/uncleaned_urls.txt\"\n",
30
+ "\n",
31
+ " imported_urls = []\n",
32
+ " with open(directory_path_web, \"r\") as file:\n",
33
+ " for line in file:\n",
34
+ " imported_urls.append(line.strip())\n",
35
+ "\n",
36
+ " loader_web = AsyncHtmlLoader(imported_urls)\n",
37
+ " documents_web = loader_web.load()\n",
38
+ "\n",
39
+ " html2text = Html2TextTransformer()\n",
40
+ " documents_web_transformed = html2text.transform_documents(documents_web)\n",
41
+ " print(\"Number of documents: \" + str(len(documents_web_transformed)) + \"\\n\")\n",
42
+ "\n",
43
+ " return documents_web_transformed\n",
44
+ "\n",
45
+ "documents = get_web_documents_for_cleaning()\n",
46
+ "already_cleaned_documents = get_web_documents(True)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# Loop over the array and store each string in a separate txt file\n",
56
+ "counter = 1\n",
57
+ "for doc in documents:\n",
58
+ " # Specify the file name for each string (e.g., file0.txt, file1.txt, ...)\n",
59
+ " file_name = f\"file_{counter}.txt\"\n",
60
+ " counter += 1\n",
61
+ " \n",
62
+ " # Open the file in write mode\n",
63
+ " with open(file_name, 'w', encoding='utf-8') as file:\n",
64
+ " # Write the string to the file\n",
65
+ " file.write(doc.page_content)\n",
66
+ "\n",
67
+ " print(f'The string has been successfully stored in {file_name}.')"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "# NOW MANUALLY CLEAN"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": null,
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "cleaned_texts = []\n",
86
+ "\n",
87
+ "counter = 1\n",
88
+ "for doc in documents:\n",
89
+ " # Specify the file name for each string (e.g., file0.txt, file1.txt, ...)\n",
90
+ " file_name = f\"file_{counter}.txt\"\n",
91
+ " counter += 1\n",
92
+ " \n",
93
+ " # Open the file in write mode\n",
94
+ " with open(file_name, 'r', encoding='utf-8') as file:\n",
95
+ " # Write the string to the file\n",
96
+ " text = file.read()\n",
97
+ " cleaned_texts.append(text)"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "# Set the new cleaned texts\n",
107
+ "if len(documents) == len(cleaned_texts):\n",
108
+ " for i in range(len(documents)):\n",
109
+ " documents[i].page_content = cleaned_texts[i]\n",
110
+ "else:\n",
111
+ " raise Exception(\"Error.\")"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "# Store only the new documents and all documents\n",
121
+ "store_documents(documents, \"./../input_data/Web/documents/new_documents\")\n",
122
+ "\n",
123
+ "already_cleaned_documents.extend(documents)\n",
124
+ "store_documents(already_cleaned_documents, \"./../input_data/Web/documents/all_documents\")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "# Update the URLs list for cleaned and uncleaned\n",
134
+ "uncleaned_url_file_path = \"./../input_data/Web/URLs/uncleaned_urls.txt\"\n",
135
+ "cleaned_url_file_path = \"./../input_data/Web/URLs/cleaned_urls.txt\"\n",
136
+ "\n",
137
+ "# Read URLs from the source file and store them in a list\n",
138
+ "with open(uncleaned_url_file_path, \"r\") as source_file:\n",
139
+ " urls = source_file.readlines()\n",
140
+ "\n",
141
+ "# Open the destination file in append mode and write the URLs to it\n",
142
+ "with open(cleaned_url_file_path, \"a\") as destination_file:\n",
143
+ " destination_file.writelines(urls)\n",
144
+ "\n",
145
+ "# Remove the URLs from the source file\n",
146
+ "with open(uncleaned_url_file_path, \"w\") as source_file:\n",
147
+ " source_file.write(\"\")\n",
148
+ "\n",
149
+ "# Print moved urls\n",
150
+ "for url in urls:\n",
151
+ " print(\"Moved URL:\", url.strip())"
152
+ ]
153
+ }
154
+ ],
155
+ "metadata": {
156
+ "kernelspec": {
157
+ "display_name": "venv",
158
+ "language": "python",
159
+ "name": "python3"
160
+ },
161
+ "language_info": {
162
+ "codemirror_mode": {
163
+ "name": "ipython",
164
+ "version": 3
165
+ },
166
+ "file_extension": ".py",
167
+ "mimetype": "text/x-python",
168
+ "name": "python",
169
+ "nbconvert_exporter": "python",
170
+ "pygments_lexer": "ipython3",
171
+ "version": "3.8.5"
172
+ }
173
+ },
174
+ "nbformat": 4,
175
+ "nbformat_minor": 2
176
+ }
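The manual-cleaning round trip in create_web_documents.ipynb above writes file_1.txt, file_2.txt, … into the notebook's working directory and later reads them back. A minimal sketch that keeps both steps symmetric and confined to one scratch folder; the folder name "cleaning_workspace" is an assumption and not part of the committed layout:

# Sketch only: dump page texts for manual cleaning, then read the cleaned texts back in.
# The folder name "cleaning_workspace" is illustrative and not used elsewhere in this commit.
import os
from typing import List
from langchain.docstore.document import Document

WORKSPACE = "./cleaning_workspace"

def dump_for_cleaning(documents: List[Document]) -> None:
    os.makedirs(WORKSPACE, exist_ok=True)
    for i, doc in enumerate(documents, start=1):
        with open(f"{WORKSPACE}/file_{i}.txt", "w", encoding="utf-8") as file:
            file.write(doc.page_content)

def load_cleaned(documents: List[Document]) -> List[Document]:
    # Assumes the files were cleaned in place and still match the document order.
    for i, doc in enumerate(documents, start=1):
        with open(f"{WORKSPACE}/file_{i}.txt", "r", encoding="utf-8") as file:
            doc.page_content = file.read()
    return documents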
index_preparation/preprocess_data.ipynb ADDED
@@ -0,0 +1,229 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### Notebook for processing the text data (chunking, cleaning, embeddings)"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import json\n",
17
+ "from typing import Iterable\n",
18
+ "from langchain.docstore.document import Document\n",
19
+ "from typing import List\n",
20
+ "\n",
21
+ "# Helper methods for storing and loading already generated documents\n",
22
+ "def store_documents(documents, file_path: str) -> None:\n",
23
+ " with open(file_path, \"w\") as jsonl_file:\n",
24
+ " for doc in documents:\n",
25
+ " jsonl_file.write(doc.json() + \"\\n\")\n",
26
+ "\n",
27
+ "\n",
28
+ "def load_documents(file_path: str) -> List[Document]:\n",
29
+ " documents = []\n",
30
+ " with open(file_path, \"r\") as jsonl_file:\n",
31
+ " for line in jsonl_file:\n",
32
+ " data = json.loads(line)\n",
33
+ " obj = Document(**data)\n",
34
+ " documents.append(obj)\n",
35
+ " return documents"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "def get_pdf_documents(all_docs: bool):\n",
45
+ " \"\"\"\n",
46
+ " Method for returning the documents of the PDFs. Processing and updating takes place in update_pdf_documents.\n",
47
+ " all_docs parameter defines whether to load all documents or only new ones. Only new ones can be used if the index is already build and new documents should be added.\n",
48
+ " \"\"\"\n",
49
+ " pdf_documents = []\n",
50
+ " if all_docs:\n",
51
+ " pdf_documents = load_documents(\"./../input_data/PDF/documents/all_documents\")\n",
52
+ " else:\n",
53
+ " pdf_documents = load_documents(\"./../input_data/PDF/documents/new_documents\")\n",
54
+ "\n",
55
+ " return pdf_documents\n",
56
+ "\n",
57
+ "def get_web_documents(all_docs: bool) -> List[Document]:\n",
58
+ " \"\"\"\n",
59
+ " Method for returning the already processed documents. FIRST need to call get_web_docs_for_cleaning and clean manually. As it is a manual cleaning process, the methods are need to be called asynchronously.\n",
60
+ " \"\"\"\n",
61
+ " web_documents = []\n",
62
+ " if all_docs:\n",
63
+ " web_documents = load_documents(\"./../input_data/Web/documents/all_documents\")\n",
64
+ " else:\n",
65
+ " web_documents = load_documents(\"./../input_data/Web/documents/new_documents\")\n",
66
+ "\n",
67
+ " return web_documents\n",
68
+ "\n",
69
+ "def get_template_documents(all_docs: bool) -> List[Document]:\n",
70
+ " \"\"\"\n",
71
+ " Method for returning the documents of the templates.\n",
72
+ " \"\"\"\n",
73
+ " template_documents = []\n",
74
+ " if all_docs:\n",
75
+ " template_documents = load_documents(\"./../input_data/Templates/documents/all_documents\")\n",
76
+ " else:\n",
77
+ " template_documents = load_documents(\"./../input_data/Templates/documents/new_documents\")\n",
78
+ "\n",
79
+ " return template_documents\n",
80
+ "\n",
81
+ "def get_dataset_documents() -> List[Document]:\n",
82
+ " \"\"\"\n",
83
+ " Method for returning the documents of the templates.\n",
84
+ " \"\"\"\n",
85
+ " template_documents = []\n",
86
+ " template_documents = load_documents(\"./../input_data/QA_dataset/all_documents\")\n",
87
+ "\n",
88
+ " return template_documents"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 4,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "def get_documents_from_files(all_docs: bool):\n",
98
+ " \"\"\"\n",
99
+ " Gets documents from all document types.\n",
100
+ " \"\"\"\n",
101
+ " documents_all = []\n",
102
+ " documents_PDF = get_pdf_documents(all_docs)\n",
103
+ " document_web = get_web_documents(all_docs)\n",
104
+ " document_template = get_template_documents(all_docs)\n",
105
+ " document_dataset = get_dataset_documents()\n",
106
+ " \n",
107
+ " documents_all.extend(documents_PDF)\n",
108
+ " documents_all.extend(document_web)\n",
109
+ " documents_all.extend(document_template)\n",
110
+ " documents_all.extend(document_dataset)\n",
111
+ " \n",
112
+ " print(\"Number of documents: \" + str(len(documents_all)) + \"\\n\")\n",
113
+ " return documents_all"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 5,
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
123
+ "\n",
124
+ "def split_docs(documents: List[Document], chunk_size: int, chunk_overlap: int):\n",
125
+ "\n",
126
+ " text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[\" \"])\n",
127
+ " chunkedDocuments = text_splitter.split_documents(documents)\n",
128
+ " return chunkedDocuments"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 6,
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": [
137
+ "import re\n",
138
+ "\n",
139
+ "def clean_text(text: str) -> str:\n",
140
+ " # Replace multiple whitespaces (except newlines) with a single space\n",
141
+ " text = re.sub(r\"(?!\\n)\\s+\", \" \", text)\n",
142
+ " # Replace multiple newlines with a single newline\n",
143
+ " text = re.sub(r\"\\n+\", \"\\n\", text)\n",
144
+ " # Remove leading and trailing whitespace\n",
145
+ " text = text.strip()\n",
146
+ " return text\n",
147
+ "\n",
148
+ "def clean_and_process_chunked_documents(chunkedDocuments: List[Document]) -> List[Document]:\n",
149
+ " counter = 1\n",
150
+ " for i in chunkedDocuments:\n",
151
+ " i.page_content = clean_text(i.page_content)\n",
152
+ " i.metadata[\"original_text\"] = i.page_content\n",
153
+ " i.metadata[\"doc_ID\"] = counter\n",
154
+ " counter += 1\n",
155
+ "\n",
156
+ " i.page_content = i.page_content.lower() \n",
157
+ "\n",
158
+ " return chunkedDocuments"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 7,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
168
+ "\n",
169
+ "def get_embedding_model():\n",
170
+ " path = \"Basti8499/bge-large-en-v1.5-ISO-27001\"\n",
171
+ " model = HuggingFaceEmbeddings(model_name=path)\n",
172
+ " return model"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 8,
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "def create_embedding_vectors(embedding_model, documents: List[Document]):\n",
182
+ " texts = []\n",
183
+ " for document in documents:\n",
184
+ " texts.append(document.page_content)\n",
185
+ "\n",
186
+ " embeddings = embedding_model.embed_documents(texts)\n",
187
+ "\n",
188
+ " return embeddings"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 1,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "def preprocess_data(chunk_size: int, chunk_overlap: int, all_docs: bool):\n",
198
+ " documents = get_documents_from_files(all_docs)\n",
199
+ " chunked_documents = split_docs(documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
200
+ " chunked_cleaned_documents = clean_and_process_chunked_documents(chunked_documents)\n",
201
+ " embedding_model = get_embedding_model()\n",
202
+ " embeddings = create_embedding_vectors(embedding_model, chunked_cleaned_documents)\n",
203
+ "\n",
204
+ " return chunked_cleaned_documents, embedding_model, embeddings"
205
+ ]
206
+ }
207
+ ],
208
+ "metadata": {
209
+ "kernelspec": {
210
+ "display_name": "venv",
211
+ "language": "python",
212
+ "name": "python3"
213
+ },
214
+ "language_info": {
215
+ "codemirror_mode": {
216
+ "name": "ipython",
217
+ "version": 3
218
+ },
219
+ "file_extension": ".py",
220
+ "mimetype": "text/x-python",
221
+ "name": "python",
222
+ "nbconvert_exporter": "python",
223
+ "pygments_lexer": "ipython3",
224
+ "version": "3.11.5"
225
+ }
226
+ },
227
+ "nbformat": 4,
228
+ "nbformat_minor": 2
229
+ }
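For orientation, preprocess_data above is presumably consumed by the index-building notebook roughly as follows; chunk_size=1536 and chunk_overlap=264 are inferred from the file name sparse_index/sparse_1536_264 and are an assumption, not a documented requirement:

# Sketch only: run the preprocessing pipeline and inspect its outputs.
# The chunk parameters are inferred from the sparse index file name in this commit.
from ipynb.fs.defs.preprocess_data import preprocess_data

chunked_documents, embedding_model, embeddings = preprocess_data(chunk_size=1536, chunk_overlap=264, all_docs=True)

print(f"{len(chunked_documents)} chunks, {len(embeddings)} embedding vectors")
print(f"Embedding dimension: {len(embeddings[0])}")  # 1024 for bge-large-en-v1.5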
init_embedding_model.py ADDED
@@ -0,0 +1,4 @@
1
+ from langchain_community.embeddings import HuggingFaceEmbeddings
2
+
3
+ path = "Basti8499/bge-large-en-v1.5-ISO-27001"
4
+ model = HuggingFaceEmbeddings(model_name=path)
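init_embedding_model.py above presumably exists only to pre-download and cache the fine-tuned embedding model (for example during the Docker build). Once cached, querying it looks roughly like this; the query string is just an example:

# Sketch only: embed a single query with the cached fine-tuned model.
from langchain_community.embeddings import HuggingFaceEmbeddings

model = HuggingFaceEmbeddings(model_name="Basti8499/bge-large-en-v1.5-ISO-27001")
query_vector = model.embed_query("Which controls does Annex A of ISO 27001 cover?")
print(len(query_vector))  # 1024 dimensions for a bge-large model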
input_data/PDF/documents/all_documents ADDED
File without changes
input_data/PDF/documents/new_documents ADDED
File without changes
input_data/QA_dataset/all_documents ADDED
The diff for this file is too large to render. See raw diff
 
input_data/QA_dataset/golden_qa_set.json ADDED
The diff for this file is too large to render. See raw diff
 
input_data/Templates/documents/all_documents ADDED
@@ -0,0 +1,5 @@
1
+ {"page_content": "You can find a possible template for the backup policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the backup policy.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/Backup policy.docx", "source": "Backup policy.docx"}, "type": "Document"}
2
+ {"page_content": "You can find a possible template for the change management policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content, procedures, risk management and more for the change management policy.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/Change management policy.docx", "source": "Change management policy.docx"}, "type": "Document"}
3
+ {"page_content": "You can find a possible template for the encryption policy from the Annex A of ISO 27001 attached to this message. It contains pre-written texts for purpose, scope, content and more for the encryption policy.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/Encryption policy.docx", "source": "Encryption policy.docx"}, "type": "Document"}
4
+ {"page_content": "You can find a possible template for a checklist for all ISO-27001 controls (Version 2013) attached to this message. It contains a simple checklist for the ISO 27001 controls 5 to 18.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/IC-ISO-27001-Controls-Checklist.xlsx", "source": "IC-ISO-27001-Controls-Checklist.xlsx"}, "type": "Document"}
5
+ {"page_content": "You can find a possible template for a risk assessment needed for the ISO-27001 certification attached to this message. It contains a simple checklist of selected controls for which a risk assessment is needed.", "metadata": {"template_path": "./../input_data/Templates/template_files/processed/IC-ISO-27001-Risk-Assessment.xlsx", "source": "IC-ISO-27001-Risk-Assessment.xlsx"}, "type": "Document"}
input_data/Templates/documents/new_documents ADDED
File without changes
input_data/Templates/template_files/processed/Backup policy.docx ADDED
Binary file (12.7 kB). View file
 
input_data/Templates/template_files/processed/Change management policy.docx ADDED
Binary file (13.9 kB). View file
 
input_data/Templates/template_files/processed/Encryption policy.docx ADDED
Binary file (13.2 kB). View file
 
input_data/Templates/template_files/processed/IC-ISO-27001-Controls-Checklist.xlsx ADDED
Binary file (314 kB). View file
 
input_data/Templates/template_files/processed/IC-ISO-27001-Risk-Assessment.xlsx ADDED
Binary file (48.5 kB). View file
 
input_data/Web/URLs/cleaned_urls.txt ADDED
@@ -0,0 +1,62 @@
1
+ https://www.vanta.com/resources/who-needs-iso-27001-certification
2
+ https://www.vanta.com/resources/iso-27001-compliance-checklist
3
+ https://www.vanta.com/resources/how-long-does-it-take-to-get-iso-certified
4
+ https://www.strongdm.com/blog/iso-27001-controls
5
+ https://www.itgovernance.eu/blog/en/a-9-step-guide-to-implementing-iso-27001
6
+ https://www.itgovernance.eu/blog/en/benefits-of-iso-27001-certification
7
+ https://www.itgovernance.eu/blog/en/why-are-so-many-organisations-getting-certified-to-iso-27001
8
+ https://www.itgovernance.eu/blog/en/what-you-need-to-know-about-iso-270012022
9
+ https://www.itgovernance.eu/blog/en/how-iso-27001-can-boost-your-cloud-security
10
+ https://www.vanta.com/resources/the-ultimate-iso-27001-guide-powered-by-vanta-and-aprio
11
+ https://www.drata.com/blog/iso-27001-compliance
12
+ https://www.drata.com/blog/iso-27001-risk-assessment
13
+ https://www.drata.com/blog/iso-27001-statement-of-applicability
14
+ https://www.drata.com/blog/ask-an-auditor-demystifying-iso-27001
15
+ https://www.drata.com/blog/iso-27001-vs-iso-27002
16
+ https://www.drata.com/blog/iso-27001-2022-update
17
+ https://www.drata.com/blog/iso-27001-certification-cost
18
+ https://www.dataguard.co.uk/knowledge/iso-27001/2022-version-transition-guide/
19
+ https://www.dataguard.co.uk/knowledge/iso-27001-certification/
20
+ https://www.dataguard.co.uk/knowledge/iso-27001-controls-annex-a/
21
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-1-requirements-of-interested-parties/
22
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-2-needs-and-expectations-of-key-parties/
23
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-3-how-to-determine-the-scope-of-your-isms/
24
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-4-4-information-security-management-system/
25
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-5-1-leadership-and-commitment/
26
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-5-2-information-security-policy/
27
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-5-3-organisational-roles-responsibilities-and-authorities/
28
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-6-1-actions-to-address-risks-and-opportunities/
29
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-6-2-information-security-objectives/
30
+ https://www.british-assessment.co.uk/insights/a-complete-guide-to-iso-iec-270012022/
31
+ https://secureframe.com/blog/iso-27001-2022
32
+ https://www.dataguard.co.uk/blog/iso-27001-risk-treatment-plan-what-you-need-to-know
33
+ https://www.creative-n.com/blog/how-much-does-it-cost-to-implement-iso-27001/
34
+ https://www.secfix.com/post/when-is-an-iso-27001-certification-required
35
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-1-resources-for-isms/
36
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-2-competence/
37
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-3-awareness/
38
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-4-communication/
39
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-7-5-documented-information/
40
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-8-1-operational-planning-and-control/
41
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-8-2-information-security-risk-assessment/
42
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-8-3-information-security-risk-treatment/
43
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-9-1-monitoring-measurement-analysis-and-evaluation/
44
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-9-2-internal-audit/
45
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-9-3-management-review/
46
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-10-1-continual-improvement/
47
+ https://www.dataguard.co.uk/knowledge/iso-27001/clause-10-2-nonconformity-and-corrective-action/
48
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a5-information-security-policies
49
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.6-organisation-information-security/
50
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.7-human-resource-security/
51
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.8-asset-management
52
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.9-access-control/
53
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.10-cryptography
54
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.11-physical-and-environmental-security/
55
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.12-operations-security
56
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.13-communications-security/
57
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.14-system-acquisition-development-and-maintenance/
58
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.15-supplier-relationships/
59
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.16-information-security-incident-management/
60
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.17-information-security-aspects-of-business-continuity-management/
61
+ https://www.dataguard.co.uk/blog/iso-27001-annex-a.18-compliance/
62
+ https://reciprocity.com/difference-between-gdpr-and-iso-27001/
input_data/Web/URLs/uncleaned_urls.txt ADDED
File without changes
input_data/Web/documents/all_documents ADDED
File without changes
input_data/Web/documents/new_documents ADDED
File without changes
requirements.txt ADDED
Binary file (7.27 kB). View file
 
requirements_Docker.txt ADDED
Binary file (7.27 kB). View file
 
setup.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+ python -m venv venv
3
+ source venv/bin/activate
4
+ python -m pip install --upgrade pip
5
+ python -m pip install -r requirements.txt
6
+ python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
7
+ echo "Initialization completed."
8
+
9
+ # Creating all necessary directories at once
10
+ mkdir -p ./inputData/PDF/files \
11
+ ./inputData/PDF/PDF_images \
12
+ ./inputData/Templates/template_files/new \
13
+ ./inputData/Templates/template_files/processed \
14
+ ./chroma
15
+
16
+ echo "Directories and necessary files created."
sparse_index/sparse_1536_264 ADDED
The diff for this file is too large to render. See raw diff