jdenis-insn committed
Commit 737f55b · 1 Parent(s): de91b1c

init commit for build

.gitignore ADDED
@@ -0,0 +1,168 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env*
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # BDD
+ qdrant_storage/
+ memory
+
+ *.pdf
Dockerfile ADDED
@@ -0,0 +1,43 @@
+ # Step 1: Base image and system dependencies
+ FROM python:3.12-slim AS base
+
+ RUN apt-get update && apt-get install -y \
+     nginx \
+     supervisor \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Step 2: Create the 'user' account with UID 1000
+ RUN useradd -m -u 1000 user
+
+ # Step 3: Set the environment variables and working directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ # Step 4: Install 'uv' (Python project manager)
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+ # Step 5: Copy the configuration files with the appropriate permissions
+ COPY --chown=user:user nginx.conf /etc/nginx/nginx.conf
+ COPY --chown=user:user supervisord.conf /etc/supervisor/conf.d/supervisord.conf
+
+ # Step 6: Copy the application files with the appropriate permissions
+ COPY --chown=user:user . $HOME/app
+
+ # Step 7: Install the Python dependencies
+ COPY --chown=user:user pyproject.toml uv.lock ./
+ RUN uv sync --no-dev --frozen --no-cache
+
+ # Step 8: Download and install Qdrant
+ RUN curl -fsSL https://github.com/qdrant/qdrant/releases/latest/download/qdrant-linux-x86_64 -o /usr/local/bin/qdrant \
+     && chmod +x /usr/local/bin/qdrant
+
+ # Step 9: Expose the required ports
+ EXPOSE 80 6333
+
+ # Step 10: Switch to the 'user' account
+ USER user
+
+ # Step 11: Launch the application with supervisord
+ CMD ["supervisord", "-n", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: CV
+ title: CV_JBDENIS
  emoji: 👁
  colorFrom: indigo
- colorTo: pink
+ colorTo: red
  sdk: docker
  pinned: false
  license: unlicense
backend/app/__init__.py ADDED
@@ -0,0 +1,3 @@
+ def hello() -> str:
+     """Return greetings."""
+     return "Hello from my-app!"
backend/app/internal/__init__.py ADDED
File without changes
backend/app/internal/bdd_manager.py ADDED
@@ -0,0 +1,129 @@
+ from langchain_qdrant import QdrantVectorStore
+ from qdrant_client import QdrantClient
+ from qdrant_client.http.models import Distance, VectorParams
+ from qdrant_client.http.exceptions import UnexpectedResponse
+ from qdrant_client.http.models import CollectionStatus
+ from langchain.vectorstores.base import VectorStoreRetriever
+ from langchain.retrievers import EnsembleRetriever
+ from langchain_core.vectorstores import VectorStore
+ from app.settings import settings
+
+
+ try:
+     client = QdrantClient(url=settings.qdrant_url)
+
+ except Exception as e:
+     raise Exception(f"Error connecting to Qdrant: {e}")
+
+
+ def create_collection(collection_name: str):
+     """
+     Create a collection in Qdrant if it does not already exist.
+
+     Args:
+         collection_name (str): The name of the collection to be created.
+
+     Returns:
+         str: A message indicating the result of the operation.
+
+     Raises:
+         Exception: If there is an error during the collection creation process.
+     """
+     try:
+         existing_collections = client.get_collections()
+         if any(col.name == collection_name for col in existing_collections.collections):
+             return f"Collection '{collection_name}' already exists."
+
+         client.create_collection(
+             collection_name=collection_name,
+             vectors_config=VectorParams(size=768, distance=Distance.COSINE),
+         )
+         return f"Collection '{collection_name}' created successfully."
+
+     except Exception as e:
+         raise Exception(f"Error creating collection '{collection_name}': {e}")
+
+
+ def get_vector_store(embeddings, collection_name):
+     """
+     Retrieve or initialize a Qdrant vector store for a given collection.
+
+     Args:
+         embeddings: The embedding model or function to be used for vectorization.
+         collection_name (str): The name of the Qdrant collection.
+
+     Returns:
+         QdrantVectorStore: A Qdrant vector store object tied to the specified collection.
+
+     Raises:
+         Exception: If the collection does not exist or there is an issue accessing it.
+     """
+     try:
+         collection_info = client.get_collection(collection_name)
+
+         if collection_info.status != CollectionStatus.GREEN:
+             raise Exception(
+                 f"Collection '{collection_name}' is not active (status: {collection_info.status})."
+             )
+
+         return QdrantVectorStore(
+             client=client, collection_name=collection_name, embedding=embeddings
+         )
+
+     except UnexpectedResponse as e:
+         raise Exception(
+             f"Collection '{collection_name}' does not exist or could not be accessed: {e}"
+         )
+
+     except Exception as e:
+         raise Exception(
+             f"An error occurred while retrieving the vector store for '{collection_name}': {e}"
+         )
+
+
+ def get_retriever(vector_store: VectorStore) -> VectorStoreRetriever:
+     """
+     Converts a vector store into a retriever instance.
+
+     Args:
+         vector_store: An object that represents the vector store. It must have an `as_retriever` method.
+
+     Returns
+     -------
+     VectorStoreRetriever: An instance of VectorStoreRetriever for querying the vector store.
+
+     Raises
+     ------
+     AttributeError: If the provided vector store does not have an `as_retriever` method.
+     """  # noqa: D401
+     if not hasattr(vector_store, "as_retriever"):
+         raise AttributeError(
+             "The provided vector store does not have an 'as_retriever' method."
+         )
+
+     return vector_store.as_retriever(
+         search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.7}
+     )
+
+
+ def get_ensemble_retriever(
+     retriever_doc: VectorStoreRetriever, retriever_user: VectorStoreRetriever
+ ) -> EnsembleRetriever:
+     """
+     Create an ensemble retriever that combines two retrievers with specified weights.
+
+     Args:
+         retriever_doc (VectorStoreRetriever): The first retriever,
+             typically for document retrieval.
+         retriever_user (VectorStoreRetriever): The second retriever,
+             typically for user-specific retrieval.
+
+     Returns:
+         EnsembleRetriever: An instance of `EnsembleRetriever` combining the two retrievers
+         with the specified weights (0.2 for `retriever_doc` and 0.8 for `retriever_user`).
+     """
+     return EnsembleRetriever(
+         retrievers=[retriever_doc, retriever_user], weights=[0.2, 0.8]
+     )
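For reference, a minimal usage sketch of how these helpers compose (not part of the commit; the collection names match the defaults in app/settings.py, and a running Qdrant instance is assumed):

from app.internal.bdd_manager import (
    create_collection,
    get_ensemble_retriever,
    get_retriever,
    get_vector_store,
)
from app.internal.embedder import get_embedder

create_collection("Doc_Ademe_collection")
create_collection("User_Ademe_collection")

embedder = get_embedder(provider="hf_api")
doc_store = get_vector_store(embedder, "Doc_Ademe_collection")
user_store = get_vector_store(embedder, "User_Ademe_collection")

# Each retriever only returns chunks above the 0.7 similarity threshold; the
# ensemble weights user uploads (0.8) over the reference corpus (0.2).
retriever = get_ensemble_retriever(get_retriever(doc_store), get_retriever(user_store))
docs = retriever.invoke("Quelles aides pour la rénovation énergétique ?")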
backend/app/internal/embedder.py ADDED
@@ -0,0 +1,26 @@
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+ from app.settings import settings
+
+
+ def get_embedder(provider: str = "hf_api"):
+     if provider == "hf_local":
+         return HuggingFaceEmbeddings(
+             model_name=settings.embedding_model_name,
+         )
+
+     if provider == "hf_api":
+         return HuggingFaceInferenceAPIEmbeddings(
+             model_name=settings.embedding_model_name,
+             api_key=settings.hf_token,
+         )
+
+     if provider == "openai":
+         return OpenAIEmbeddings(
+             openai_api_key=settings.scw_api_key,
+             openai_api_base=settings.scw_generative_apis_endpoint,
+             model=settings.embedding_model_name,
+             tiktoken_enabled=False,
+         )
+     return None
backend/app/internal/export_report.py ADDED
@@ -0,0 +1,95 @@
+ import logging
+ import re
+ from typing import List
+ from fpdf import FPDF
+ from datetime import datetime
+
+ # Logger initialization
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def create_pdf_report(output_path: str, logo_path: str, report_text: str):
+     """
+     Creates a PDF report with a logo, the current date, and a given text.
+
+     Args:
+         output_path (str): The path where the generated PDF will be saved.
+         logo_path (str): The path to the logo image to include in the report.
+         report_text (str): The text content to include in the report.
+
+     Returns
+     -------
+     None: The function saves the PDF to the specified output path.
+
+     Raises
+     ------
+     FileNotFoundError: If the logo file does not exist.
+     ValueError: If the provided paths or text are invalid.
+     """
+     pdf = FPDF()
+     pdf.add_page()
+
+     # Set font for the document
+     pdf.set_font("Arial", size=12)
+
+     # Add logo
+     try:
+         pdf.image(logo_path, x=10, y=8, w=30)
+     except FileNotFoundError:
+         raise FileNotFoundError(f"Logo file not found at: {logo_path}")  # noqa: B904
+
+     # Add title
+     pdf.set_font("Arial", style="B", size=16)
+     pdf.cell(200, 10, txt="Rapport de conversation avec Dis-ADEME", ln=True, align="C")
+
+     # Add date
+     pdf.set_font("Arial", size=12)
+     creation_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     pdf.ln(10)  # Add some space
+     pdf.cell(
+         200,
+         10,
+         txt=f"Date de création : {creation_date}",
+         ln=True,
+         align="R",
+     )
+
+     # Add content
+     pdf.ln(20)  # Add some space
+     pdf.set_font("Arial", size=12)
+     pdf.multi_cell(0, 10, txt=report_text)
+
+     # Save the PDF
+     try:
+         pdf.output(output_path)
+         logger.info(f"PDF report created successfully at: {output_path}")
+     except Exception as e:  # noqa: BLE001
+         raise ValueError(f"Failed to save PDF: {e}")  # noqa: B904
+
+
+ def extract_pdf_references(conversation: List[dict]) -> List[str]:
+     """
+     Extract unique PDF references from the chatbot's responses in the conversation.
+
+     Args:
+         conversation (List[dict]): List of dictionaries representing the conversation.
+             Each dictionary contains 'role' ('user' or 'assistant')
+             and 'content' (message string).
+
+     Returns:
+         List[str]: A list of unique PDF references mentioned in the chatbot's responses.
+     """
+     pdf_references = set()
+
+     for message in conversation:
+         if (
+             message.get("role") == "assistant"
+             and "Consultez les documents suivants pour plus d'information:"
+             in message.get("content", "")
+         ):
+             # Extract all PDF file names using regex
+             matches = re.findall(r"[\w\s-]+\.pdf", message["content"])
+             pdf_references.update(matches)
+     return sorted(pdf_references)
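A short usage sketch for these two helpers (not part of the commit; the paths and messages are illustrative only):

from app.internal.export_report import create_pdf_report, extract_pdf_references

conversation = [
    {"role": "user", "content": "Quelles aides existent ?"},
    {
        "role": "assistant",
        "content": "Voir le guide.\n\nConsultez les documents suivants pour plus d'information:\n\nguide-aides.pdf",
    },
]
# Only assistant messages containing the marker phrase are scanned for *.pdf names;
# the result is the deduplicated, sorted reference list.
references = extract_pdf_references(conversation)
create_pdf_report("export.pdf", "logo_ademe.png", "Résumé de la conversation...")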
backend/app/internal/llm_chat.py ADDED
@@ -0,0 +1,346 @@
+ import uuid
+ from typing import Any, Callable, Dict, List
+
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
+ from langchain.chains.base import Chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+ from langchain_community.chat_message_histories import ChatMessageHistory
+ from langchain_core.chat_history import BaseChatMessageHistory
+ from langchain_core.messages import AIMessage, HumanMessage
+ from langchain_core.language_models.chat_models import BaseChatModel
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.runnables.history import RunnableWithMessageHistory
+ from langchain_openai import ChatOpenAI
+ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
+
+ from app.internal.export_report import extract_pdf_references
+ from app.internal.template_prompt import summary_system_prompt
+ from app.settings import settings
+
+
+ def get_chat_llm() -> BaseChatModel:
+     """
+     Initializes and returns a chat model configured with the provided settings.
+
+     Returns:
+         BaseChatModel: A chat model bound to the configured provider, endpoint, and API key.
+
+     Raises:
+         ValueError: If any of the required settings (endpoint, API key, or model name) is missing.
+     """
+     try:
+         if settings.provider == "hf_local":
+             pass
+
+         if settings.provider == "hf_api":
+             if not settings.hf_token:
+                 raise ValueError("The HuggingFace API token is not set.")
+
+             llm = HuggingFaceEndpoint(
+                 repo_id=settings.llm_model_name,
+                 task="text-generation",
+                 max_new_tokens=settings.max_length,
+                 do_sample=False,
+                 repetition_penalty=1.03,
+                 temperature=settings.temperature,
+                 # huggingfacehub_api_token=settings.hf_token,
+             )
+
+             return ChatHuggingFace(llm=llm)
+
+         if settings.provider == "openai":
+             if not settings.scw_generative_apis_endpoint:
+                 raise ValueError("The SCW Generative APIs endpoint is not set.")
+             if not settings.scw_api_key:
+                 raise ValueError("The SCW API key is not set.")
+             if not settings.llm_model_name:
+                 raise ValueError("The LLM model name is not set.")
+
+             return ChatOpenAI(
+                 base_url=settings.scw_generative_apis_endpoint,
+                 api_key=settings.scw_api_key,
+                 model=settings.llm_model_name,
+                 temperature=settings.temperature,
+             )
+     except Exception as e:
+         raise RuntimeError(f"Failed to initialize the chat model: {e}")
+
+
+ def get_history_retriever(llm, retriever, contextualize_q_prompt) -> object:
+     """
+     Creates a history-aware retriever using the provided LLM, retriever, and contextualization prompt.
+
+     Args:
+         llm: The language model used for generating context-aware queries.
+         retriever: The retriever instance for querying a vector store or similar.
+         contextualize_q_prompt: A prompt template for contextualizing queries.
+
+     Returns:
+         object: A history-aware retriever instance.
+
+     Raises:
+         ValueError: If any of the required inputs are None or invalid.
+     """
+     if not llm or not retriever or not contextualize_q_prompt:
+         raise ValueError(
+             "LLM, retriever, and contextualize_q_prompt must all be provided."
+         )
+
+     try:
+         return create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
+     except Exception as e:
+         raise RuntimeError(f"Failed to create history-aware retriever: {e}")
+
+
+ def get_system_prompt_chain(llm, qa_prompt) -> object:
+     """
+     Creates a prompt chain for processing system-level instructions with a question-answering prompt.
+
+     Args:
+         llm: The language model used for processing the system prompt.
+         qa_prompt: The prompt template for question-answering tasks.
+
+     Returns:
+         object: A chain instance for system prompt processing.
+
+     Raises:
+         ValueError: If either `llm` or `qa_prompt` is None.
+     """
+     if not llm or not qa_prompt:
+         raise ValueError("LLM and qa_prompt must both be provided.")
+
+     try:
+         return create_stuff_documents_chain(llm, qa_prompt)
+     except Exception as e:
+         raise RuntimeError(f"Failed to create system prompt chain: {e}")
+
+
+ def get_rag_chain(history_aware_retriever, question_answer_chain) -> object:
+     """
+     Creates a Retrieval-Augmented Generation (RAG) chain using a history-aware retriever and a Q&A chain.
+
+     Args:
+         history_aware_retriever: A retriever configured to incorporate conversation history into queries.
+         question_answer_chain: A chain for handling question-answering tasks.
+
+     Returns:
+         object: A RAG chain instance.
+
+     Raises:
+         ValueError: If either `history_aware_retriever` or `question_answer_chain` is None.
+     """
+     if not history_aware_retriever or not question_answer_chain:
+         raise ValueError(
+             "Both history_aware_retriever and question_answer_chain must be provided."
+         )
+
+     try:
+         return create_retrieval_chain(history_aware_retriever, question_answer_chain)
+     except Exception as e:
+         raise RuntimeError(f"Failed to create RAG chain: {e}")
+
+
+ def get_session_history(session_id: str, history_store: dict) -> BaseChatMessageHistory:
+     """
+     Retrieves or initializes the chat history for a given session ID.
+
+     Args:
+         session_id (str): The unique identifier for the session.
+         history_store (dict): A dictionary to store session histories.
+
+     Returns:
+         BaseChatMessageHistory: The chat message history for the session.
+
+     Raises:
+         ValueError: If `session_id` is not provided.
+     """
+     if not session_id:
+         raise ValueError("A valid session_id must be provided.")
+
+     if session_id not in history_store:
+         history_store[session_id] = ChatMessageHistory()
+
+     return history_store[session_id]
+
+
+ def get_conversational_rag_chain(
+     rag_chain: Chain,
+     session_history_func: Callable[[str], BaseChatMessageHistory],
+ ) -> RunnableWithMessageHistory:
+     """
+     Creates a conversational Retrieval-Augmented Generation (RAG) chain with session history.
+
+     Args:
+         rag_chain (Chain): The RAG chain for handling retrieval and generation tasks.
+         session_history_func (Callable): A function to retrieve or initialize session history.
+
+     Returns:
+         RunnableWithMessageHistory: A chain that maintains message history and processes input/output.
+
+     Raises:
+         ValueError: If `rag_chain` or `session_history_func` is not provided.
+     """
+     if not rag_chain:
+         raise ValueError("A valid rag_chain must be provided.")
+     if not session_history_func:
+         raise ValueError("A valid session history function must be provided.")
+
+     return RunnableWithMessageHistory(
+         rag_chain,
+         session_history_func,
+         input_messages_key="input",
+         history_messages_key="chat_history",
+         output_messages_key="answer",
+     )
+
+
+ def question_to_conversational_rag_chain(
+     user_query: str, conversational_rag_chain: Any, session_id: str = None
+ ) -> Dict[str, Any]:
+     """
+     Sends a user query to a conversational RAG chain and retrieves the response.
+
+     Args:
+         user_query (str): The query from the user.
+         conversational_rag_chain (Any): The conversational RAG chain instance.
+         session_id (str, optional): A unique identifier for the session. If not provided, a new session_id is generated.
+
+     Returns:
+         Dict[str, Any]: The response from the conversational RAG chain.
+
+     Raises:
+         ValueError: If the user query is empty or the RAG chain is not provided.
+         RuntimeError: If an error occurs during the invocation of the RAG chain.
+     """
+     if not user_query:
+         raise ValueError("The user query must be a non-empty string.")
+     if not conversational_rag_chain:
+         raise ValueError("A valid conversational RAG chain must be provided.")
+
+     # Generate a session_id if none is provided
+     if not session_id:
+         session_id = str(uuid.uuid4())
+
+     try:
+         # Invoke the conversational RAG chain
+         return conversational_rag_chain.invoke(
+             {"input": user_query}, config={"configurable": {"session_id": session_id}}
+         )
+     except Exception as e:
+         raise RuntimeError(f"Failed to process the query with the RAG chain: {e}")
+
+
+ def get_documents_retrieve(output: Dict[str, Any], max_docs: int = 3) -> List[str]:
+     """
+     Retrieves the titles of the documents from the output context.
+
+     Args:
+         output (Dict[str, Any]): The output containing context and metadata.
+         max_docs (int): The maximum number of document titles to retrieve. Default is 3.
+
+     Returns:
+         List[str]: A list of document titles, or None if the output has no 'context' key.
+     """
+     if "context" not in output:
+         return None
+
+     return [
+         output["context"][i].metadata.get("Title", "Untitled Document")
+         for i in range(min(len(output["context"]), max_docs))
+     ]  # TODO: add a filter on the type of documents to return
+
+
+ def get_llm_answer(output: Dict[str, Any]) -> str:
+     """
+     Extracts the answer generated by the LLM from the output.
+
+     Args:
+         output (Dict[str, Any]): The output containing the answer.
+
+     Returns:
+         str: The LLM-generated answer.
+
+     Raises:
+         ValueError: If the 'answer' key is missing or empty in the output.
+     """
+     if "answer" not in output or not output["answer"]:
+         raise ValueError("The output does not contain a valid 'answer'.")
+
+     return output["answer"]
+
+
+ def get_format_output(answer: str, context: List[str]) -> str:
+     """
+     Formats the LLM answer with a list of related document titles.
+
+     Args:
+         answer (str): The LLM-generated answer.
+         context (List[str]): A list of document titles related to the answer.
+
+     Returns:
+         str: A formatted string containing the answer and document references.
+
+     Raises:
+         ValueError: If the answer is empty or None.
+     """
+     if not answer:
+         raise ValueError("The 'answer' must be a non-empty string.")
+
+     formatted_output = f"{answer}"
+     if context:
+         uniques_doc = set(context)
+         formatted_output += (
+             "\n\nConsultez les documents suivants pour plus d'information:\n\n"
+         )
+         formatted_output += "\n\n".join(uniques_doc)
+
+     return formatted_output
+
+
+ def clean_output(answer):  # TODO add clean process for output
+     pass
+
+
+ def generate_summary(llm, conversation: List[dict]) -> str:
+     """
+     Generate a summary of the conversation with LangChain and append PDF references at the end.
+
+     Args:
+         conversation (List[dict]): List of dictionaries representing the conversation.
+             Each dictionary contains 'role' ('user' or 'assistant')
+             and 'content' (message string).
+         llm: The chat model used to generate the summary.
+
+     Returns:
+         str: The generated summary with PDF references appended.
+     """
+     # Extract unique PDF references
+     pdf_references = extract_pdf_references(conversation)
+
+     # Prepare the messages (copy the template so repeated calls do not mutate it)
+     messages = list(summary_system_prompt)
+
+     for message in conversation:
+         if message["role"] == "user":
+             messages.append(HumanMessage(content=message["content"]))
+
+         elif message["role"] == "assistant":
+             messages.append(AIMessage(content=message["content"]))
+
+     # Generate the summary
+     summary_prompt = ChatPromptTemplate.from_messages(messages).format()
+
+     summary = llm.invoke(summary_prompt)
+
+     # Append the PDF references
+     summary_text = summary.content
+     if pdf_references:
+         summary_text += (
+             "\n\nDocuments pdf à consulter pour plus d'information:\n"
+             + "\n".join(sorted(pdf_references))
+         )
+
+     return summary_text
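For orientation, a sketch of how these builders chain together (not part of the commit; it mirrors the startup wiring in backend/app/routers/chatting.py and assumes the Qdrant collections already exist):

from app.internal.bdd_manager import get_retriever, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.llm_chat import (
    get_chat_llm,
    get_conversational_rag_chain,
    get_history_retriever,
    get_rag_chain,
    get_session_history,
    get_system_prompt_chain,
    question_to_conversational_rag_chain,
)
from app.internal.template_prompt import contextualize_q_prompt, qa_prompt

llm = get_chat_llm()
retriever = get_retriever(get_vector_store(get_embedder(), "Doc_Ademe_collection"))

history_retriever = get_history_retriever(llm, retriever, contextualize_q_prompt)
qa_chain = get_system_prompt_chain(llm, qa_prompt)
rag_chain = get_rag_chain(history_retriever, qa_chain)

# Per-session chat histories live in this dict, keyed by session id.
store: dict = {}
chain = get_conversational_rag_chain(rag_chain, lambda sid: get_session_history(sid, store))

response = question_to_conversational_rag_chain("Qu'est-ce que l'ADEME ?", chain)
print(response["answer"])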
backend/app/internal/parser.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ from typing import Iterator, List, Union
+
+ import openparse
+ from docling.document_converter import DocumentConverter
+ from langchain_core.document_loaders import BaseLoader
+ from langchain_core.documents import Document as LCDocument
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+ from app.settings import Settings
+
+
+ def get_pdf_paths(directory_or_file: Union[str, os.PathLike]) -> List[str]:
+     """
+     Retrieve all PDF file paths from a given directory, including its subdirectories, or from a single file.
+
+     Args:
+         directory_or_file (Union[str, os.PathLike]): Path to a directory or a single file.
+
+     Returns:
+         List[str]: A list of file paths to PDF files.
+
+     Raises:
+         FileNotFoundError: If the given path does not exist.
+         ValueError: If the input path is neither a directory nor a PDF file.
+     """
+     if not os.path.exists(directory_or_file):
+         raise FileNotFoundError(f"The path '{directory_or_file}' does not exist.")
+
+     pdf_paths = []
+
+     if os.path.isdir(directory_or_file):
+         for root, _, files in os.walk(directory_or_file):
+             for file in files:
+                 if file.lower().endswith(".pdf"):
+                     pdf_paths.append(os.path.join(root, file))
+
+     elif os.path.isfile(directory_or_file):
+         if directory_or_file.lower().endswith(".pdf"):
+             pdf_paths.append(directory_or_file)
+         else:
+             raise ValueError(f"The file '{directory_or_file}' is not a PDF.")
+     else:
+         raise ValueError(
+             f"The path '{directory_or_file}' is neither a directory nor a valid file."
+         )
+
+     return pdf_paths
+
+
+ settings = Settings()
+
+
+ def parse_document(doc_path, parser=settings.parser):
+     if parser == "openparse":
+         # Use a separate local name so the `parser` argument is not shadowed.
+         doc_parser = openparse.DocumentParser()
+         parsed_basic_doc = doc_parser.parse(doc_path)
+
+         parsed_doc = [
+             node.text.replace("<br><br>", "\n") for node in parsed_basic_doc.nodes
+         ]
+
+     if parser == "docling":  # FIXME
+         converter = DocumentConverter()
+         parsed_doc = converter.convert(doc_path)
+
+         # loader = DoclingPDFLoader(file_path=doc_path)
+         # parsed_doc = loader.load()
+
+     return parsed_doc
+
+
+ def split_documents(text_splitter, docs):
+     return text_splitter.split_documents(docs)
+
+
+ def get_text_chunker():
+     return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+
+
+ # class DoclingPDFLoader(BaseLoader):
+
+ #     def __init__(self, file_path: str | list[str]) -> None:
+ #         self._file_paths = file_path if isinstance(
+ #             file_path, list) else [file_path]
+ #         self._converter = DocumentConverter()
+
+ #     def lazy_load(self) -> Iterator[LCDocument]:
+ #         for source in self._file_paths:
+ #             dl_doc = self._converter.convert(source).document
+ #             text = dl_doc.export_to_markdown()
+ #             yield LCDocument(page_content=text)
+
+
+ # loader = DoclingPDFLoader(file_path=path)
+ # text_splitter = RecursiveCharacterTextSplitter(
+ #     chunk_size=1000,
+ #     chunk_overlap=200,
+ # )
+
+ # docs = loader.load()
+ # splits = text_splitter.split_documents(docs)
+
+ # splits
backend/app/internal/template_prompt.py ADDED
@@ -0,0 +1,79 @@
+ from langchain_core.messages import SystemMessage  # noqa: D100
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+ ### Contextualize question ###
+ contextualize_q_system_prompt = """Based on the provided chat history and the most
+ recent user question, your task is to reformulate the latest question
+ into a fully standalone version.
+
+ Ensure the reformulated question is clear, self-contained, and does not rely
+ on any prior context from the chat history to be understood.
+ If the latest question already functions as a standalone question,
+ return it unchanged.
+ Do NOT provide an answer to the question or interpret the user’s intent
+ beyond making the question self-contained.
+ Retain all technical details, key terms, and precision from the original
+ question in your reformulation.
+ Your sole output should be the reformulated standalone question,
+ or the original question if no reformulation is required."""
+
+ contextualize_q_prompt = ChatPromptTemplate.from_messages(
+     [
+         ("system", contextualize_q_system_prompt),
+         MessagesPlaceholder("chat_history"),
+         ("human", "{input}"),
+     ]
+ )
+
+
+ ### Answer question ###
+ system_prompt = """You are an intelligent and professional assistant named 'Dis-ADEME',
+ created by the ADEME organization to assist with question-answering tasks related
+ to ecological transition, sustainable practices, and technical inquiries.
+
+ Use the provided retrieved context to answer the user's question accurately
+ and concisely.
+ If the retrieved context does not contain the necessary information,
+ explicitly state:
+ "Je suis désolé, je ne dispose pas des informations nécessaires
+ pour répondre à cette question."
+ Limit your response to a maximum of three sentences while maintaining clarity
+ and relevance. Ensure that your tone is formal and professional,
+ as your responses are intended for official use.
+ Do not speculate or provide information that is not explicitly supported
+ by the retrieved context.
+ Context:
+ {context}"""
+
+ qa_prompt = ChatPromptTemplate.from_messages(
+     [
+         ("system", system_prompt),
+         MessagesPlaceholder("chat_history"),
+         ("human", "{input}"),
+     ]
+ )
+
+ ### Conversation summary ###
+ summary_report_system_prompt = """
+ You are a knowledgeable and professional French assistant named 'Dis-ADEME',
+ created by the ADEME organization.
+ Your task is to summarize in French the following conversation between a user and
+ an assistant, providing a structured, comprehensive, and detailed summary.
+
+ Focus exclusively on the content and technical details discussed in the conversation,
+ omitting any reference to the roles of the participants
+ (e.g., "user" or "assistant").
+ Present the information in clear, concise, and professional language,
+ suitable for inclusion in an official administrative report.
+ Emphasize critical technical details, key points of discussion,
+ and any actionable insights or conclusions derived from the conversation.
+ Organize the summary into sections or paragraphs if appropriate,
+ ensuring clarity and logical flow.
+ If the conversation references external documents or resources (e.g., PDFs),
+ include their titles or descriptions in a dedicated section at the end of the summary.
+ Do not include any conversational or informal elements; maintain
+ a formal and neutral tone throughout.
+ Output your response as a structured report in French, ready for official use.
+ """
+
+ summary_system_prompt = [SystemMessage(content=summary_report_system_prompt)]
backend/app/main.py ADDED
@@ -0,0 +1,32 @@
+ """Main module."""
+
+ import logging
+ from typing import Any
+ import uvicorn
+ from fastapi import FastAPI
+
+ from app.routers.chatting import chat_router
+ from app.routers.embedding import embedding_router
+
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+ )
+
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI()
+
+ app.include_router(embedding_router)
+ app.include_router(chat_router)
+
+
+ @app.get("/")
+ async def root() -> Any:  # noqa: ANN401
+     """Return greetings."""
+     return {"message": "Hello ADEME!"}
+
+
+ if __name__ == "__main__":
+     uvicorn.run(app, log_level="info")
backend/app/resources/logo_ademe.png ADDED
backend/app/routers/__init__.py ADDED
File without changes
backend/app/routers/chatting.py ADDED
@@ -0,0 +1,215 @@
+ import logging
+ from typing import Any, Dict, List
+
+ from fastapi import APIRouter, HTTPException
+ from pydantic import BaseModel
+
+ from app.internal.bdd_manager import (
+     create_collection,
+     get_ensemble_retriever,
+     get_retriever,
+     get_vector_store,
+ )
+ from app.internal.embedder import get_embedder
+ from app.internal.export_report import create_pdf_report
+ from app.internal.llm_chat import (
+     generate_summary,
+     get_chat_llm,
+     get_conversational_rag_chain,
+     get_documents_retrieve,
+     get_format_output,
+     get_history_retriever,
+     get_llm_answer,
+     get_rag_chain,
+     get_session_history,
+     get_system_prompt_chain,
+     question_to_conversational_rag_chain,
+ )
+ from app.internal.template_prompt import contextualize_q_prompt, qa_prompt
+ from app.settings import settings
+
+ # Logger initialization
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ chat_router = APIRouter(
+     prefix="/chatting",
+     tags=["question_answer"],
+     responses={404: {"description": "Not found"}},
+ )
+
+
+ class QueryRequest(BaseModel):
+     user_query: str
+     session_id: str = settings.session_id
+
+
+ class ResponseOutput(BaseModel):
+     answer: str
+     context: List[str]
+     formatted_output: str
+
+
+ class Conversation(BaseModel):
+     messages: List[Any]
+
+
+ class ResponseOutputSum(BaseModel):
+     summary: str
+
+
+ # Resource initialization
+ user_collection_name = settings.user_collection_name
+ logger.info("Initializing collection: %s", user_collection_name)
+ create_collection(user_collection_name)
+
+ doc_collection_name = settings.doc_collection_name
+ logger.info("Initializing collection: %s", doc_collection_name)
+ create_collection(doc_collection_name)
+
+ embedder = get_embedder(provider=settings.provider)
+ logger.info("Embedder initialized.")
+
+ doc_vector_store = get_vector_store(embedder, doc_collection_name)
+ logger.info("Vector store initialized with collection: %s", doc_collection_name)
+
+ user_vector_store = get_vector_store(embedder, user_collection_name)
+ logger.info("Vector store initialized with collection: %s", user_collection_name)
+
+ logger.info("Initializing LLM and retrievers...")
+ llm = get_chat_llm()
+ user_retriever = get_retriever(user_vector_store)
+ doc_retriever = get_retriever(doc_vector_store)
+ retriever = get_ensemble_retriever(doc_retriever, user_retriever)
+
+ logger.info("Creating history-aware retriever...")
+ history_retriever = get_history_retriever(llm, retriever, contextualize_q_prompt)
+
+ logger.info("Creating system prompt chain...")
+ qa_chain = get_system_prompt_chain(llm, qa_prompt)
+
+ logger.info("Creating RAG chain...")
+ rag_chain = get_rag_chain(history_retriever, qa_chain)
+
+ logger.info("Initializing conversational RAG chain...")
+ conversational_chain = get_conversational_rag_chain(
+     rag_chain,
+     # Use the caller's session id so each session keeps its own history.
+     lambda sid: get_session_history(sid, settings.history_store),
+ )
+
+
+ @chat_router.post("/chat", response_model=ResponseOutput)
+ async def chat_with_rag_chain(request: QueryRequest):
+     """
+     Endpoint for interacting with the RAG (Retrieval-Augmented Generation) chain.
+     """
+     logger.info("Received chat request with session_id: %s", request.session_id)
+     logger.info("User query: %s", request.user_query)
+
+     try:
+         logger.info("Processing user query...")
+         response = question_to_conversational_rag_chain(
+             request.user_query, conversational_chain, request.session_id
+         )
+         logger.info("LLM response received: %s", response)
+
+         answer = get_llm_answer(response)
+         documents = get_documents_retrieve(response)
+
+         logger.info("Formatting output...")
+         formatted_output = get_format_output(answer, documents)
+
+         logger.info(
+             "Successfully processed chat request for session_id: %s", request.session_id
+         )
+         return {
+             "answer": answer,
+             "context": documents,
+             "formatted_output": formatted_output,
+         }
+
+     except ValueError as e:
+         logger.error("Validation error: %s", str(e))
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         logger.error("Internal server error: %s", str(e))
+         raise HTTPException(status_code=500, detail=f"Internal server error: {e}")
+
+
+ @chat_router.get("/history/{session_id}")
+ async def get_chat_history(session_id: str):
+     """
+     Endpoint for retrieving the message history of a given session.
+     """
+     logger.info("Fetching chat history for session_id: %s", session_id)
+
+     try:
+         history = get_session_history(session_id, settings.history_store)
+         logger.info(
+             "Successfully retrieved chat history for session_id: %s", session_id
+         )
+         return {"session_id": session_id, "history": history.messages}
+     except ValueError as e:
+         logger.error("Validation error: %s", str(e))
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         logger.error("Internal server error while fetching history: %s", str(e))
+         raise HTTPException(status_code=500, detail=f"Internal server error: {e}")
+
+
+ @chat_router.post("/summary", response_model=ResponseOutputSum)
+ async def summarize_conversation(conversation: Conversation):
+     """
+     Generate a summary of the conversation and list the referenced PDF documents.
+
+     Args:
+         conversation (Conversation): Object containing the conversation messages.
+
+     Returns:
+         dict: Summary of the conversation and the list of referenced PDF documents.
+     """
+     output_path = "../Shared_data/export.pdf"
+     # output_path = r"C:\Users\jeanb\Documents\kzs-team\Shared_data\export.pdf"
+     logo_path = "app/resources/logo_ademe.png"
+     summary_text = generate_summary(llm, conversation.messages)
+
+     create_pdf_report(output_path, logo_path, summary_text)
+
+     return {"summary": summary_text}
backend/app/routers/embedding.py ADDED
@@ -0,0 +1,169 @@
+ """Embedding tools."""
+
+ import logging
+ import os
+ from typing import List
+ from fastapi import APIRouter, HTTPException
+ from pydantic import BaseModel
+
+ from app.internal.bdd_manager import create_collection, get_vector_store
+ from app.internal.embedder import get_embedder
+ from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document
+ from app.settings import settings
+
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ embedding_router = APIRouter(
+     prefix="/embeddings",
+     tags=["documents"],
+     responses={404: {"description": "Not found"}},
+ )
+
+ user_collection_name = settings.user_collection_name
+ logger.info("Initializing collection: %s", user_collection_name)
+ create_collection(user_collection_name)
+
+ doc_collection_name = settings.doc_collection_name
+ logger.info("Initializing collection: %s", doc_collection_name)
+ create_collection(doc_collection_name)
+
+ embedder = get_embedder(provider=settings.provider)
+ logger.info("Embedder initialized.")
+
+ doc_vector_store = get_vector_store(embedder, doc_collection_name)
+ logger.info("Vector store initialized with collection: %s", doc_collection_name)
+
+ user_vector_store = get_vector_store(embedder, user_collection_name)
+ logger.info("Vector store initialized with collection: %s", user_collection_name)
+
+ text_splitter = get_text_chunker()
+ logger.info("Text splitter initialized.")
+
+
+ def get_vectorstore(vectorstor_type):
+     if vectorstor_type == "user":
+         return user_vector_store
+
+     if vectorstor_type == "doc":
+         return doc_vector_store
+     return None
+
+
+ class DocPathsInput(BaseModel):  # TODO move to schema.py
+     doc_paths: str
+     vectorstor_type: str
+
+
+ @embedding_router.post("/embedded/")
+ async def embedding(doc_paths_input: DocPathsInput):
+     """
+     Embeds documents provided via file paths and adds them to the vector store.
+
+     Args:
+         doc_paths_input (DocPathsInput): A Pydantic model containing
+             a list of document file paths.
+
+     Returns:
+         dict: A response containing the number of documents added to the vector store.
+
+     Raises:
+         HTTPException: If the document parsing or embedding process fails.
+     """
+     logger.info("Received request to embed documents: %s", doc_paths_input.doc_paths)
+     vector_store = get_vectorstore(doc_paths_input.vectorstor_type)
+
+     try:
+         folder_path = doc_paths_input.doc_paths
+         logger.info(folder_path)
+         doc_paths = get_pdf_paths(folder_path)
+         logger.info(doc_paths)
+         for path in doc_paths:
+             try:
+                 logger.info("Parsing document at path: %s", path)
+                 parsed_documents = parse_document(path)
+                 # basename works on both Windows and POSIX paths
+                 doc_title = os.path.basename(path)
+                 logger.info("Document parsed: %s", doc_title)
+
+                 documents = text_splitter.create_documents(
+                     parsed_documents,
+                     metadatas=[
+                         {"Title": doc_title} for _ in range(len(parsed_documents))
+                     ],
+                 )
+                 logger.info(
+                     "Created %d document chunks for: %s", len(documents), doc_title
+                 )
+
+                 vector_store.add_documents(documents)
+
+                 logger.info("Documents added to vector store: %s", doc_title)
+
+             except Exception as e:
+                 logger.error(
+                     "An error occurred during the parsing of the file %s: %s", path, e
+                 )
+
+         logger.info("All documents successfully processed and embedded.")
+         return {
+             "message": "Documents successfully embedded and stored",
+             "documents_added": len(doc_paths),
+         }
+
+     except Exception as e:
+         logger.error("An error occurred during the embedding process: %s", e)
+         raise HTTPException(status_code=500, detail=f"An error occurred: {e!s}")
+
+
+ class SearchQuery(BaseModel):  # TODO move to schema.py
+     vectorstor_type: str
+     query: str
+     k: int = 2
+
+
+ @embedding_router.post("/similarity_search/")
+ async def search_documents(search_query: SearchQuery):
+     """
+     Search for documents in the vector store based on a query.
+
+     Args:
+         search_query (SearchQuery): A Pydantic model containing the query string and the number of results (k).
+
+     Returns:
+         List[dict]: A list of documents matching the query, including their content and metadata.
+
+     Raises:
+         HTTPException: If the search process fails or no documents are found.
+     """
+     logger.info("Received similarity search query: %s", search_query.query)
+
+     vector_store = get_vectorstore(search_query.vectorstor_type)
+
+     try:
+         found_docs = vector_store.similarity_search(
+             search_query.query, k=search_query.k
+         )
+         logger.info(
+             "Found %d documents for query: %s", len(found_docs), search_query.query
+         )
+
+         if not found_docs:
+             logger.warning("No documents found for query: %s", search_query.query)
+             raise HTTPException(
+                 status_code=404, detail="No documents found for the given query."
+             )
+
+         logger.info("Returning results for query: %s", search_query.query)
+         return [
+             {
+                 "content": doc.page_content,
+                 "metadata": doc.metadata if hasattr(doc, "metadata") else None,
+             }
+             for doc in found_docs
+         ]
+     except Exception as e:
+         logger.error("An error occurred during the similarity search: %s", e)
+         raise HTTPException(
+             status_code=500, detail=f"An error occurred during the search: {e}"
+         )
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
+
5
+ class Settings(BaseSettings):
6
+ model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
7
+ llm_model_name: str = "HuggingFaceH4/zephyr-7b-beta"
8
+ context_window_size: int = 5
9
+ retrieval_top_k: int = 3
10
+ temperature: float = 0.2
11
+ max_length: int = 2048
12
+ hf_token: str = os.getenv("HF_TOKEN")
13
+
14
+ if not hf_token:
15
+ raise ValueError(
16
+ "ERREUR : Le token Hugging Face (HF_TOKEN) n'est pas défini ! Ajoute-le dans les variables d'environnement Hugging Face Spaces."
17
+ )
18
+
19
+ embedding_model_name: str = "sentence-transformers/sentence-t5-xxl"
20
+ # qdrant_url: str = "http://qdrant:6333"
21
+ qdrant_url: str = "http://localhost:6333"
22
+ parser: str = "openparse"
23
+ history_store: dict = {}
24
+ session_id: str = "user012025"
25
+ user_collection_name: str = "User_Ademe_collection"
26
+ doc_collection_name: str = "Doc_Ademe_collection"
27
+ provider: str = "hf_api"
28
+
29
+
30
+ settings = Settings()
backend/test/test_main.py ADDED
@@ -0,0 +1,6 @@
+ """Basic test."""
+
+
+ def test_basic() -> None:
+     """Test that 1 + 1 equals 2."""
+     assert 1 + 1 == 2
dockerignore ADDED
@@ -0,0 +1,17 @@
+ dist/
+ env/
+ presentation/
+ __pycache__/
+ .coverage/
+ .doit.db
+ .git/
+ .gitignore
+ .idea/
+ .mypy_cache/
+ .pytest_cache/
+ .ruff_cache/
+ .venv/
+ .gitlab-ci.yml
+ renovate.json
+ Dockerfile
+ dodo.py
frontend/app/__init__.py ADDED
File without changes
frontend/app/main.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import base64
+ import os
+ from pathlib import Path
+
+ import requests
+ import streamlit as st
+
+ # `settings` is imported for future use; the URLs below are still hardcoded.
+ from settings import settings  # noqa: F401
+
+
+ BASE_DIR = str(Path(__file__).resolve().parent)
+ # Direct backend URLs (bypassing the nginx /api/ prefix):
+ # API_URL_CHAT = "http://localhost:8088/chatting/chat"
+ # API_URL_EMBEDDING = "http://localhost:8088/embeddings/embedded"
+ # API_URL_SUM = "http://localhost:8088/chatting/summary"
+ API_URL_CHAT = "http://localhost/api/chatting/chat"
+ API_URL_EMBEDDING = "http://localhost/api/embeddings/embedded"
+ API_URL_SUM = "http://localhost/api/chatting/summary"
+
+ st.set_page_config(
+     page_title="CV_JBDENIS",
+     page_icon="🧊",
+ )
+
+ # Helper functions for the page background
+
+
+ def get_base64_of_bin_file(bin_file):  # noqa: ANN001, ANN201, D103
+     with open(bin_file, "rb") as f:
+         data = f.read()
+     return base64.b64encode(data).decode()
+
+
+ def set_png_as_page_bg(png_file) -> None:  # noqa: ANN001, D103
+     bin_str = get_base64_of_bin_file(png_file)
+     page_bg_img = (
+         """
+     <style>
+     .stApp {
+         background-image: url("data:image/png;base64,%s");
+         background-size: cover;
+     }
+     </style>
+     """  # noqa: UP031
+         % bin_str
+     )
+     st.markdown(page_bg_img, unsafe_allow_html=True)
+
+
+ # Set background (forward slashes and the file's actual .JPG extension, so the
+ # path also resolves on the Linux container)
+ set_png_as_page_bg(png_file="app/resources/aide-financiere-ademe.JPG")
+
+ logo_path = "app/resources/logo_ademe.png"
+
+ col1, col2 = st.columns([3, 2])
+ with col1:
+     st.image(logo_path, width=400)
+ with col2:
+     st.title("Dis-ADEME")
+     st.write("Bienvenue dans votre application de chat.")
+
+ # Navigation
+ st.sidebar.title("Menu")
+ page = st.sidebar.radio("Navigation", ["Accueil", "Admin"])
+
+
+ def save_uploaded_files(uploaded_files: list):  # noqa: ANN201, D103
+     save_dir = os.path.join(BASE_DIR, "uploaded_files", "user")
+     # save_dir = r"\Shared_data\uploaded_files"
+     os.makedirs(save_dir, exist_ok=True)
+
+     saved_file_paths = []
+     for uploaded_file in uploaded_files:
+         file_path = os.path.join(save_dir, uploaded_file.name)
+         with open(file_path, "wb") as f:
+             f.write(uploaded_file.getbuffer())
+         saved_file_paths.append(file_path)
+         st.session_state.uploaded_files.append(file_path)
+
+     return saved_file_paths
+
+
+ # Home page
+ if page == "Accueil":
+     if "uploaded_files" not in st.session_state:
+         st.session_state.uploaded_files = []
+
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+
+     saved_paths = []
+     with st.sidebar:
+         st.header("Uploader des fichiers PDF")
+         uploaded_files = st.file_uploader(
+             "Choisissez des fichiers PDF",
+             type="pdf",
+             accept_multiple_files=True,
+             key="pdf_uploader",
+         )
+
+         if uploaded_files:
+             saved_paths = save_uploaded_files(uploaded_files)
+             st.success(f"Fichier sauvegardé : {saved_paths[-1]}, analyse en cours...")
+
+         if saved_paths:
+             try:
+                 # Only the most recent upload is sent for embedding.
+                 response = requests.post(
+                     API_URL_EMBEDDING,
+                     json={"doc_paths": saved_paths[-1], "vectorstor_type": "user"},
+                 )
+                 response.raise_for_status()
+                 embedded = response.json().get(
+                     "message",
+                     "Désolé, une erreur s'est produite durant la lecture du fichier.",
+                 )
+                 st.success(f"Analyse du fichier {saved_paths[-1]} terminée.")
+                 st.info(embedded)  # surface the backend's status message
+                 saved_paths = []
+             except requests.RequestException as e:
+                 st.error(f"Erreur lors de la communication avec l'API : {e}")
+
+         if st.session_state.messages:
+             st.write("")
+             st.divider()
+             st.write("")
+             st.header("Rapport de conversation")
+             if st.button("Générer le rapport de conversation"):
+                 response = None  # avoids a NameError if the request below raises
+                 try:
+                     response = requests.post(
+                         API_URL_SUM, json={"messages": st.session_state.messages}
+                     )
+                     response.raise_for_status()
+                     summary = response.json().get("summary", "Résumé non disponible.")
+                     st.subheader("Résumé généré")
+                     st.text_area("Rapport", summary, height=200)
+                 except requests.RequestException as e:
+                     st.error(f"Erreur lors de l'appel de l'API : {e}")
+                 if response:
+                     with open("../Shared_data/export.pdf", "rb") as pdf_file:
+                         # with open(r"C:\Users\jeanb\Documents\kzs-team\Shared_data\export.pdf", "rb") as pdf_file:
+                         pdf_bytes = pdf_file.read()
+
+                     if pdf_bytes:
+                         st.download_button(
+                             label="Télécharger le rapport de conversation",
+                             data=pdf_bytes,
+                             file_name="Conversation_Dis_ADEME.pdf",
+                             mime="application/pdf",
+                         )
+
+     # Chatbot
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"], avatar=message["avatar"]):
+             st.write(message["content"])
+
+     if prompt := st.chat_input("Comment puis-je vous aider ?"):
+         st.session_state.messages.append(
+             {"role": "user", "content": prompt, "avatar": "👤"}
+         )
+         with st.chat_message("user", avatar="👤"):
+             st.write(prompt)
+
+         try:
+             response = requests.post(API_URL_CHAT, json={"user_query": prompt})
+             response.raise_for_status()
+             data = response.json()
+             answer = data.get(
+                 "formatted_output", "Désolé, je n'ai pas de réponse à cette question."
+             )
+         except requests.RequestException as e:
+             answer = f"Erreur lors de la communication avec l'API : {e}"
+
+         st.session_state.messages.append(
+             {"role": "assistant", "content": answer, "avatar": "🤖"}
+         )
+         with st.chat_message("assistant", avatar="🤖"):
+             st.write(answer)
+
+ # Admin page
+ elif page == "Admin":
+     st.title("Admin - Ajouter des documents à la base de données")
+
+     doc_path = st.text_input("Entrez le chemin du document ou du dossier à ajouter")
+
+     if st.button("Ajouter les documents PDF à la base de données"):
+         if doc_path:
+             print("SAVED DOC:", doc_path)  # noqa: T201
+             try:
+                 response = requests.post(
+                     API_URL_EMBEDDING,
+                     json={"doc_paths": doc_path, "vectorstor_type": "doc"},
+                 )
+                 response.raise_for_status()
+                 st.success("Documents ajoutés à la base de données avec succès.")
+             except requests.RequestException as e:
+                 st.error(f"Erreur lors de l'ajout des documents : {e}")
+         else:
+             st.warning("Veuillez entrer un chemin valide.")
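Note that the home page only embeds the most recent upload (`saved_paths[-1]`). If the backend's `doc_paths` field accepts a list, which the plural name suggests but this diff does not confirm, every upload could be sent in one request. A minimal sketch under that assumption, with hypothetical file paths:

    import requests

    API_URL_EMBEDDING = "http://localhost/api/embeddings/embedded"
    # Hypothetical paths; in the app these come from save_uploaded_files().
    saved_paths = ["uploaded_files/user/doc1.pdf", "uploaded_files/user/doc2.pdf"]

    # Assumption: the backend accepts a list for "doc_paths" (not confirmed here).
    response = requests.post(
        API_URL_EMBEDDING,
        json={"doc_paths": saved_paths, "vectorstor_type": "user"},
        timeout=120,
    )
    response.raise_for_status()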
frontend/app/resources/aide-financiere-ademe.JPG ADDED
frontend/app/resources/logo_ademe.png ADDED
frontend/app/settings.py ADDED
@@ -0,0 +1,11 @@
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     # model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
+     api_url_chat: str = "http://backend/chatting/chat"
+     api_url_embedding: str = "http://backend/embeddings/embedded"
+     api_url_sum: str = "http://backend/chatting/summary"
+
+
+ settings = Settings()
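Because `Settings` subclasses pydantic-settings' `BaseSettings`, each field can be overridden by an environment variable with the same (case-insensitive) name, without touching the code. A minimal sketch, assuming `frontend/app` is on `sys.path`:

    import os

    # Override one field before Settings is instantiated.
    os.environ["API_URL_CHAT"] = "http://localhost:8088/chatting/chat"

    from settings import Settings  # assumes frontend/app is importable

    print(Settings().api_url_chat)  # -> http://localhost:8088/chatting/chat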
frontend/test/test_main.py ADDED
@@ -0,0 +1,6 @@
+ """Basic test."""
+
+
+ def test_basic() -> None:
+     """Test that 1 + 1 equals 2."""
+     assert 1 + 1 == 2
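Beyond this placeholder, one cheap test would pin the default URLs declared in settings.py. A sketch, assuming the test run can import from `frontend/app` (not guaranteed by the pytest config shown below) and that none of the variables are overridden in the environment:

    """Sketch: pin the Settings defaults (assumes frontend/app is on sys.path)."""

    from settings import Settings


    def test_settings_defaults() -> None:
        """Defaults should target the in-container backend service."""
        s = Settings()
        assert s.api_url_chat == "http://backend/chatting/chat"
        assert s.api_url_embedding == "http://backend/embeddings/embedded"
        assert s.api_url_sum == "http://backend/chatting/summary"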
nginx.conf ADDED
@@ -0,0 +1,36 @@
+ user user;
+ worker_processes 1;
+
+ events {
+     worker_connections 1024;
+ }
+
+ http {
+     server {
+         listen 80;
+
+         # FastAPI backend: the trailing slash on proxy_pass strips the /api/
+         # prefix, so /api/chatting/chat reaches the backend as /chatting/chat.
+         location /api/ {
+             proxy_pass http://127.0.0.1:8000/;
+             proxy_set_header Host $host;
+             proxy_set_header X-Real-IP $remote_addr;
+             proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+             proxy_set_header X-Forwarded-Proto $scheme;
+         }
+
+         # Streamlit frontend (catch-all). Streamlit streams over WebSockets,
+         # so the Upgrade/Connection headers are needed here.
+         location / {
+             proxy_pass http://127.0.0.1:8501/;
+             proxy_http_version 1.1;
+             proxy_set_header Upgrade $http_upgrade;
+             proxy_set_header Connection "upgrade";
+             proxy_set_header Host $host;
+             proxy_set_header X-Real-IP $remote_addr;
+             proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+             proxy_set_header X-Forwarded-Proto $scheme;
+         }
+
+         # Qdrant REST API
+         location /qdrant/ {
+             proxy_pass http://127.0.0.1:6333/;
+             proxy_set_header Host $host;
+             proxy_set_header X-Real-IP $remote_addr;
+             proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+             proxy_set_header X-Forwarded-Proto $scheme;
+         }
+     }
+ }
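With this config, the `/api/` location forwards to FastAPI on port 8000 and drops the prefix, which is why the frontend above posts to `http://localhost/api/chatting/chat`. A quick check from inside the container, with a hypothetical query:

    import requests

    # /api/chatting/chat is rewritten to /chatting/chat on the backend.
    resp = requests.post(
        "http://localhost/api/chatting/chat",
        json={"user_query": "Quelles aides pour la rénovation ?"},  # hypothetical query
        timeout=30,
    )
    resp.raise_for_status()
    print(resp.json().get("formatted_output"))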
pyproject.toml ADDED
@@ -0,0 +1,144 @@
+ [project]
+ name = "kzs-rag"
+ version = "0.1.0"
+ description = "RAG project by kaizen"
+ readme = "readme.md"
+ requires-python = ">=3.12,<3.13"
+ dependencies = [
+     "pydantic-settings>=2.6.1",
+ ]
+
+ [dependency-groups]
+ frontend = [
+     "streamlit>=1.40.1",
+ ]
+
+ backend = [
+     "docling>=2.8.1",
+     "fastapi[standard]>=0.115.4",
+     "langchain-community>=0.3.8",
+     "langchain-openai>=0.2.10",
+     "langchain-qdrant>=0.2.0",
+     "langgraph>=0.2.53",
+     "qdrant-client>=1.12.1",
+     "sentence-transformers>=3.3.1",
+     "openparse>=0.7.0",
+     # "fpdf>=1.7.2",
+     "fpdf2>=2.8.1",
+ ]
+
+ dev = [
+     "mypy>=1.13.0",
+     "pytest>=8.3.3",
+     "ruff>=0.7.1",
+     "pytest-forked>=1.6.0",
+     "pytest-gitignore>=1.3",
+     "pytest-html>=4.1.1",
+     "pytest-xdist>=3.6.1",
+     "pandas>=2.2.3",
+     "pandas-stubs>=2.2.3.241009",
+     # "gitlabci-local>=10.2.0",
+     "plotly>=5.24.1",
+     "ipykernel>=6.29.5",
+ ]
+
+ [tool.ruff]
+ target-version = "py312"
+ fix = false
+ line-length = 88  # Same as Black
+ exclude = [
+     ".git",
+     ".git-rewrite",
+     ".mypy_cache",
+     ".pytype",
+     ".ruff_cache",
+     "__pypackages__",
+     ".venv",
+ ]
+
+ [tool.ruff.lint]
+ fixable = ["ALL"]  # Allow autofix for all enabled rules
+ unfixable = []
+ # Rule selection
+ select = [
+     "F", "E", "C90", "N", "D", "UP", "YTT", "ANN", "ASYNC", "S", "BLE",
+     "FBT", "B", "A", "C4", "DTZ", "T10", "DJ", "EXE", "FA", "ISC",
+     "ICN", "G", "INP", "PIE", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SLOT", "SIM",
+     "TID", "TCH", "INT", "ARG", "PTH", "TD", "FIX", "ERA", "PD", "PGH", "PL", "TRY",
+     "FLY", "NPY", "AIR", "PERF", "RUF", "T20", "I",
+ ]
+ # Not selected:
+ # - CPY (flake8-copyright): no need for a copyright notice per file
+ # - COM (flake8-commas): handled by the ruff formatter
+ # - EM (flake8-errmsg): too little gain for the cost
+
+ ignore = [
+     "D203",    # 1 blank line required before class docstring
+     "D212",    # Multi-line docstring summary should start at the first line
+     "TRY003",  # Avoid specifying long messages outside the exception class
+     "ANN101",  # Missing type annotation for self in method
+     "ANN102",  # Missing type annotation for cls in classmethod
+     "G004",    # Logging statement uses f-string
+     "PD013",   # `.melt` is preferred to `.stack`; provides same functionality (WRONG!)
+ ]
+
+
+ [tool.ruff.lint.per-file-ignores]
+ "__init__.py" = ["D104"]  # Ignore "missing docstring in public package" in all `__init__.py` files
+ "test/**/*.py" = [  # Ignore rules unnecessary for tests
+     "INP001",   # Ignore "File is part of an implicit namespace package. Add an `__init__.py`."
+     "S101",     # Ignore "Use of `assert` detected" because pytest relies on assert
+     "N802",     # Ignore "Function name should be lowercase" because test function names are non-standard
+     # "ARG",    # Unused function args -> fixtures nevertheless are functionally relevant...
+     # "FBT",    # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize()
+     "PLR2004",  # Ignore "Magic value used in comparison"
+     # "S311",   # Standard pseudo-random generators are not suitable for cryptographic purposes
+ ]
+
+
+ [tool.ruff.lint.flake8-annotations]
+ mypy-init-return = true
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "numpy"
+
+
+ [tool.mypy]
+ python_version = "3.12"
+ exclude = [
+ ]
+
+
+ # Per-module overrides, left commented until a concrete module needs them
+ # (the original listed an empty module name, which mypy cannot match).
+ # [[tool.mypy.overrides]]
+ # module = ["<untyped_third_party_package>"]  # placeholder, not a real module
+ # ignore_missing_imports = true
+
+ [tool.pytest.ini_options]
+
+ addopts = [
+     "--import-mode=prepend",
+     "-vv",
+     "--exitfirst",
+     "--capture=no",
+     "--showlocals",
+     # "--forked",
+     # "--cov-config=.coverage/coveragerc",
+     # "--cov=src",
+     # "--cov=app",
+     # "--cov-report=html",
+     "--html=.pytest_cache/report.html",
+ ]
+
+ python_files = "*.py"
+
+ norecursedirs = [
+     "dist",
+     "doc",
+     "__pycache__",
+ ]
+
+ [tool.pymarkdown]
+ # plugins.line-length.line_length = 88
+ # plugins.ul-style.style = "sublist"
+ # extensions.front-matter.enabled = true
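The `[dependency-groups]` table follows PEP 735; with uv, a group is installed via `uv sync --group frontend` (flag name assumed from current uv releases). A small sketch that just enumerates the groups declared above, using only the standard library:

    import tomllib
    from pathlib import Path

    # Parse pyproject.toml and list each PEP 735 dependency group with its size.
    data = tomllib.loads(Path("pyproject.toml").read_text(encoding="utf-8"))
    for group, deps in data["dependency-groups"].items():
        print(f"{group}: {len(deps)} packages")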
supervisord.conf ADDED
@@ -0,0 +1,28 @@
+ [supervisord]
+ nodaemon=true
+
+ [program:nginx]
+ command=/usr/sbin/nginx -g "daemon off;"
+ autostart=true
+ autorestart=true
+ user=user
+
+ [program:qdrant]
+ command=/usr/local/bin/qdrant
+ autostart=true
+ autorestart=true
+ user=user
+
+ [program:backend]
+ ; The FastAPI CLI takes a `run` subcommand and a file path; the original
+ ; `uv run fastapi app.main:app ...` mixed in uvicorn's module:app syntax.
+ command=uv run fastapi run app/main.py --host 0.0.0.0 --port 8000
+ directory=/home/user/app/backend
+ autostart=true
+ autorestart=true
+ user=user
+
+ [program:frontend]
+ command=uv run streamlit run app/main.py --server.port 8501 --server.address=0.0.0.0
+ directory=/home/user/app/frontend
+ autostart=true
+ autorestart=true
+ user=user
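Once supervisord has all four programs up, a smoke check can poke each proxied service on its internal port. Ports come from nginx.conf and supervisord.conf above; the `/docs` and `/collections` endpoints are FastAPI's and Qdrant's defaults, assumed to be enabled here:

    import requests

    SERVICES = {
        "frontend (streamlit)": "http://127.0.0.1:8501/",
        "backend (fastapi)": "http://127.0.0.1:8000/docs",
        "qdrant": "http://127.0.0.1:6333/collections",
    }

    # Report HTTP status per service, or the connection error if it is down.
    for name, url in SERVICES.items():
        try:
            status = requests.get(url, timeout=5).status_code
            print(f"{name}: HTTP {status}")
        except requests.RequestException as exc:
            print(f"{name}: unreachable ({exc})")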
uv.lock ADDED
The diff for this file is too large to render. See raw diff