adrien.aribaut-gaudin committed
Commit
2e3ba97
1 Parent(s): 0ac6117

first push from pages 9 to 25

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,170 @@
+config_key.py
+
+
+# Test folder
+data/Test/
+
+# database folder
+database/
+Ilumio_chatbot/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
app.py ADDED
@@ -0,0 +1,52 @@
+import pandas as pd
+import os
+import time
+import chromadb
+
+from config import *
+from src.tools.reader import get_pdf_title_styles
+from src.tools.llm import LlmAgent
+import src.view.view as view
+from src.tools.pretty_print import pretty_print_container_structure, pretty_printer_paragraphs
+from src.model.container import Container
+from src.control.control import Chatbot
+from src.tools.retriever import Retriever
+from src.model.doc import Doc
+from src.tools.test_read import pdf_manager
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+if "OPENAI_API_KEY" not in os.environ:
+    from config_key import OPENAI_API_KEY
+    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
+
+# start_time = time.time()
+
+# doc = Doc(path=content_en_path_real)
+# print("--- %s seconds ---" % (time.time() - start_time))
+# check if the database is empty
+# pdf_manager(pdf_path=content_en_path_real)
+# pretty_printer_paragraphs(doc.container.paragraphs)
+# pretty_print_container_structure(doc.container)
+
+if not os.path.exists("database/"):
+    os.makedirs("database/")
+
+client_db = chromadb.PersistentClient(path="database/")
+
+try:
+    client_db.get_collection(name="illumio_database")
+    llm = LlmAgent(model="TheBloke/Llama-2-7b-Chat-GPTQ")
+    retriever = Retriever(client_db, None, "illumio_database", llmagent=llm)
+except Exception:  # the collection does not exist yet: parse the PDF and build it
+    print("Database is empty")
+    doc = Doc(path=content_en_path_real)
+    llm = LlmAgent(model="TheBloke/Llama-2-7b-Chat-GPTQ")
+    retriever = Retriever(client_db, doc, "illumio_database", llmagent=llm)  # pass the Doc itself: Retriever needs doc.blocks (string ids) and doc.title
+
+
+chat = Chatbot(llm_agent=llm, retriever=retriever)
+
+ilumio_qna = view.run(ctrl=chat, config=view_config)
+
+ilumio_qna.queue().launch()
config.py ADDED
@@ -0,0 +1,13 @@
+content_language = 'en'
+plan_language = 'en'
+content_en_path_real = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
+content_test = "Ilumio_chatbot/data/Test/Test_children.pdf"
+
+examples = {"Question banale?": "Pourquoi le ciel est bleu?",
+            }
+
+
+view_config = {
+    'title': '# Ilumio Q&A',
+    'examples': examples,
+}
data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8821bd9530837f23a99e6b5d17d1e893f74d91ac6112c861d4ecd3f830e42479
+size 4115867
requirements.txt ADDED
Binary file (6.89 kB)
 
src/__init__.py ADDED
File without changes
src/control/__init__.py ADDED
File without changes
src/control/control.py ADDED
@@ -0,0 +1,73 @@
+import pandas as pd
+
+from src.tools.retriever import Retriever
+from src.tools.llm import LlmAgent
+from src.model.block import Block
+
+
+class Chatbot:
+    def __init__(self, llm_agent: LlmAgent, retriever: Retriever):
+        self.retriever = retriever
+        self.llm = llm_agent
+
+    def get_response(self, query, histo):
+        histo_conversation, histo_queries = self._get_histo(histo)
+        language_of_query = self.llm.detect_language(query).lower()
+        queries = self.llm.translate(text=histo_queries)
+        block_sources = self.retriever.similarity_search(query=queries)
+        block_sources = self._select_best_sources(block_sources)
+        sources_contents = [s.content for s in block_sources]
+        context = '\n'.join(sources_contents)
+        answer = self.llm.generate_paragraph(query=queries, histo=histo_conversation, context=context, language=language_of_query)
+        answer = self.llm.generate_answer(answer=answer, query=query, histo=histo_conversation, context=context, language=language_of_query)
+        # print(answer.split('bot:')[1].strip())
+        # print("*************")
+        # answer = self._clean_answer(answer)
+        return answer, block_sources
+
+
+
+    @staticmethod
+    def _select_best_sources(sources: [Block], delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> [Block]:
+        """
+        Select the best sources: not far from the very best, not far from the last selected, and not too bad per se
+        """
+        best_sources = []
+        for idx, s in enumerate(sources):
+            if idx == 0 \
+                    or (s.distance - sources[idx - 1].distance < delta_1_2
+                        and s.distance - sources[0].distance < delta_1_n) \
+                    or s.distance < absolute:
+                best_sources.append(s)
+                delta_1_2 *= alpha
+                delta_1_n *= alpha
+                absolute *= alpha
+            else:
+                break
+        return best_sources
+
+
+    @staticmethod
+    def _get_histo(histo: [(str, str)]) -> (str, str):
+        histo_conversation = ""
+        histo_queries = ""
+
+        for (query, answer) in histo[-5:]:
+            histo_conversation += f'user: {query} \n bot: {answer}\n'
+            histo_queries += query + '\n'
+        return histo_conversation[:-1], histo_queries
+
+
+    @staticmethod
+    def _clean_answer(answer: str) -> str:
+        print(answer)
+        answer = answer.removeprefix('bot:')  # str.strip('bot:') would strip characters, not the prefix
+        while answer and answer[-1] in {"'", '"', " ", "`"}:
+            answer = answer[:-1]
+        while answer and answer[0] in {"'", '"', " ", "`"}:
+            answer = answer[1:]
+        answer = answer.removeprefix('bot:')
+        if answer:
+            if answer[-1] != ".":
+                answer += "."
+        return answer
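A quick sanity check, not part of the commit, of how `_select_best_sources` applies its decaying thresholds (the distances are made up; the imports assume the project's dependencies are installed):

    from src.model.block import Block
    from src.control.control import Chatbot

    # Toy sources sorted by ascending distance, as Chroma returns them.
    sources = [Block(distance=d) for d in (0.50, 0.60, 0.95, 1.40)]
    best = Chatbot._select_best_sources(sources)
    # 0.50 is kept (idx == 0); 0.60 is kept (0.10 below both deltas);
    # 0.95 is kept only through the absolute cutoff (1.2 * 0.9**2 ~ 0.97);
    # 1.40 fails every test, so the loop breaks.
    print([s.distance for s in best])  # [0.5, 0.6, 0.95]

Each accepted source shrinks all three thresholds by alpha, so the filter gets stricter the longer the list it keeps.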
src/model/__init__.py ADDED
File without changes
src/model/block.py ADDED
@@ -0,0 +1,34 @@
+class Block:
+    def __init__(self, doc: str = '', title: str = '', content: str = '',
+                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
+        self.doc = doc
+        self.title = title
+        self.content = content
+        self.index = index
+        self.rank = rank
+        self.level = level
+        self.distance = distance
+
+    def to_dict(self) -> {}:
+        block_dict = {'doc': self.doc,
+                      'title': self.title,
+                      'content': self.content,
+                      'index': self.index,
+                      'rank': self.rank,
+                      'level': self.level,
+                      'distance': self.distance}
+        return block_dict
+
+    def from_dict(self, block_dict: {}):
+        self.doc = block_dict['doc']
+        self.title = block_dict['title']
+        self.content = block_dict['content']
+        self.index = block_dict['index']
+        self.rank = block_dict['rank']
+        self.level = block_dict['level']
+        self.distance = block_dict['distance']
+        return self
+
+    @property
+    def distance_str(self) -> str:
+        return format(self.distance, '.2f')
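`to_dict` and `from_dict` are what `Retriever` uses to stash blocks as Chroma metadata and rebuild them from query results. A minimal round-trip sketch, not in the commit:

    from src.model.block import Block

    b = Block(doc='guide.pdf', title='Intro', content='Hello', index='1.2', rank=3, level=2, distance=0.4242)
    same = Block().from_dict(b.to_dict())  # from_dict returns self, so it chains
    assert same.to_dict() == b.to_dict()
    print(same.distance_str)  # '0.42'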
src/model/container.py ADDED
@@ -0,0 +1,192 @@
+from .paragraph import Paragraph
+from .block import Block
+
+INFINITE = 99999
+
+class Container:
+
+    def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None, father=None, id_=0):
+        if index is None:
+            index = []
+        self.level = level
+        self.title = title
+        self.paragraphs = []
+        self.children = []
+        self.index = index
+        self.father = father
+        self.id_ = int(str(1) + str(father.id_) + str(id_))
+        if paragraphs:
+            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
+        self.containers = [self]
+        for child in self.children:
+            self.containers += child.containers
+        self.blocks = self.get_blocks()
+
+
+    def get_blocks(self):
+        block = Block(level=self.level, index=self.index)
+        if self.title:
+            block.title = self.title.text
+        for p in self.paragraphs:
+            if not p.blank:
+                block.content += p.text
+        blocks = [block] if block.content else []
+        for child in self.children:
+            blocks += child.blocks
+        return blocks
+
+    def create_children(self, paragraphs, level, rank) -> ([], []):
+        """
+        Creates children containers or directly attached content
+        and returns the list of attached paragraphs and the list of children containers of level+1.
+        """
+        attached_paragraphs = []
+        container_paragraphs = []
+        container_title = None
+        children = []
+        in_children = False
+        level = INFINITE
+        child_id = 0
+
+        while paragraphs:
+            p = paragraphs.pop(0)
+            if not in_children and not p.is_structure:
+                attached_paragraphs.append(p)
+            else:
+                in_children = True
+                if p.is_structure and not p.blank and p.level <= level:  # p is higher than or equal to the current level in the hierarchy
+                    if container_paragraphs or container_title:
+                        children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
+                        child_id += 1
+                    container_paragraphs = []
+                    container_title = p
+                    level = p.level
+
+                else:  # p is strictly lower in the hierarchy
+                    container_paragraphs.append(p)
+
+        if container_paragraphs or container_title:
+            children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
+            child_id += 1
+
+        return attached_paragraphs, children
+
+
+    # Earlier implementations of create_children, kept for reference:
+
+    # def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
+    #     """
+    #     Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
+    #     The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
+    #     :return: List of Content or Container
+    #     """
+    #     attached_paragraphs = []
+    #     children = []
+    #     in_children = False
+    #     level = INFINITE
+    #     container_paragraphs = []
+    #     container_title = None
+
+    #     while paragraphs:
+    #         p = paragraphs.pop(0)
+
+    #         if not in_children and not p.is_structure:
+    #             attached_paragraphs.append(p)
+    #         else:
+    #             in_children = True
+    #             if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
+    #                 if container_paragraphs or container_title:
+    #                     if level <= len(index):
+    #                         index = index[:level]
+    #                         index[-1] += 1
+    #                     else:
+    #                         for i in range(level-len(index)):
+    #                             index.append(1)
+    #                     children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
+    #                 container_paragraphs = []
+    #                 container_title = p
+    #                 level = p.level
+    #             else:  # p is normal text or strictly lower in hierarchy, then the child continues to grow
+    #                 container_paragraphs.append(p)
+    #     if container_paragraphs or container_title:
+    #         if level <= len(index):
+    #             index = index[:level]
+    #             index[-1] += 1
+    #         else:
+    #             for i in range(level - len(index)):
+    #                 index.append(1)
+    #         children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
+
+    #     return attached_paragraphs, children
+
+
+
+    # def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
+    #     """
+    #     Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
+    #     The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
+    #     :return: List of Content or Container
+    #     """
+    #     attached_paragraphs = []
+    #     children = []
+    #     in_children = False
+    #     level = INFINITE
+    #     # container_paragraphs = []
+    #     # container_title = None
+
+    #     while paragraphs:
+    #         p = paragraphs.pop(0)
+
+    #         if not in_children and p.is_structure and level != INFINITE:
+    #             paragraphs.insert(0, p)
+    #             children.append(Container(paragraphs, title=p, level=p.level, children=children, index=index.copy(), father=self))
+    #         else:
+    #             in_children = True
+    #             if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
+    #                 level = p.level
+    #                 if len(index) == level:
+    #                     index[-1] += 1
+    #                 elif len(index) < level:
+    #                     if self.children != []:
+    #                         index = self.children[-1].index.copy()
+    #                         index[-1] += 1
+    #                     else:
+    #                         index.append(1)
+    #                 else:
+    #                     index = index[:level]
+    #                     index[-1] += 1
+    #                 while paragraphs:
+    #                     p = paragraphs.pop(0)
+    #                     if p.is_structure:
+    #                         paragraphs.insert(0, p)
+    #                         break
+    #                     else:
+    #                         attached_paragraphs.append(p)
+    #                 if paragraphs and p.level > level:
+    #                     in_children = False
+    #                     children.append(Container(paragraphs, title=p, level=p.level, index=index.copy(), father=self))
+    #                 else:
+    #                     break
+    #     return attached_paragraphs, children
+
+    @property
+    def structure(self):
+        # NOTE: relies on self.rank and Paragraph.structure, neither of which is defined yet in this commit
+        self_structure = {str(self.id_): {
+            'index': str(self.id_),
+            'canMove': True,
+            'isFolder': True,
+            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
+            'canRename': True,
+            'data': {},
+            'level': self.level,
+            'rank': self.rank,
+            'title': self.title.text if self.title else 'root'
+        }}
+        paragraphs_structure = [p.structure for p in self.paragraphs]
+        structure = [self_structure] + paragraphs_structure
+        for child in self.children:
+            structure += child.structure
+        return structure
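A sketch, not in the commit, of how title levels turn into nested containers and `Block`s; `_Root` is a hypothetical stand-in for the `Doc` father, of which `Container` only reads `id_`:

    from src.model.paragraph import Paragraph
    from src.model.container import Container

    class _Root:  # hypothetical father object
        id_ = 0

    root = Container([
        Paragraph('Introduction', 'title1', id_=0, page_id=9),
        Paragraph('Some body text.', 'content', id_=1, page_id=9),
        Paragraph('Details', 'title2', id_=2, page_id=9),
        Paragraph('More text.', 'content', id_=3, page_id=9),
    ], father=_Root())
    print([(b.level, b.title) for b in root.blocks])  # [(1, 'Introduction'), (2, 'Details')]

The 'title2' paragraph opens a child container one level below 'title1', so each titled section becomes its own Block carrying the body text that follows it.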
src/model/doc.py ADDED
@@ -0,0 +1,78 @@
+from src.model.container import Container
+from src.model.paragraph import Paragraph
+from src.tools.index_creation import set_good_indexes
+from src.tools.reader import get_pdf_title_styles
+
+
+class Doc:
+
+    def __init__(self, path='', id_=None):
+
+        self.title = self.get_good_title(path)
+        self.id_ = id(self)
+        self.path = path
+        paragraphs = get_pdf_title_styles(path)
+        self.container = Container(paragraphs, father=self)
+        set_good_indexes(self.container)
+        self.blocks = self.get_blocks()
+
+
+    def get_good_title(self, path):
+        if '/' in path:
+            res = path.split('/')[-1]
+        elif '\\' in path:  # elif, so a Unix-style path is not overwritten by the else branch
+            res = path.split('\\')[-1]
+        else:
+            res = path
+        return res
+
+    @property
+    def structure(self):
+
+        return self.container.structure
+
+    def get_blocks(self):
+
+        def from_list_to_str(index_list):
+            index_str = str(index_list[0])
+            for el in index_list[1:]:
+                index_str += '.' + str(el)
+            return index_str
+
+        blocks = self.container.blocks
+        # blocks = self.delete_duplicate()
+        for block in blocks:
+            block.doc = self.title
+            block.index = from_list_to_str(block.index)
+            print(block.index + ' ' + block.title)
+        return blocks
+
+
+    def delete_duplicate(self):
+        while self.found_duplicates(self.container.blocks):
+            for i in range(len(self.container.blocks) - 1):
+                if self.container.blocks[i].index == self.container.blocks[i + 1].index:
+                    if self.container.blocks[i].index != []:
+                        self.container.blocks[i].index.pop()
+        return self.container.blocks
+
+    def found_duplicates(self, blocks):
+        for i in range(len(blocks) - 1):
+            if blocks[i].index == blocks[i + 1].index:
+                return True
+        return False
+
+    """
+    current_level = len(current_index)
+    if 0 < block.level:
+        if block.level == current_level:
+            current_index[-1] += 1
+        elif current_level < block.level:
+            current_index.append(1)
+        elif block.level < current_level:
+            current_index = current_index[:block.level]
+            current_index[-1] += 1
+        block.index = from_list_to_str(current_index)
+    else:
+        block.index = "0"
+    """
src/model/paragraph.py ADDED
@@ -0,0 +1,20 @@
+import string
+
+INFINITE = 10000
+
+class Paragraph:
+    def __init__(self, text: str, font_style: str, id_: int, page_id: int):
+        self.font_style = font_style
+        self.id_ = int(str(2) + str(page_id) + str(id_))
+        self.page_id = page_id
+        self.level = int(font_style[-1]) if 'title' in font_style else INFINITE
+        self.is_structure = self.level < INFINITE
+        self.text = text
+
+    @property
+    def blank(self):
+        """
+        Checks if the paragraph is blank, i.e. carries no signal (it may then be ignored).
+        """
+        text = self.text.replace('\n', '')
+        return set(text).isdisjoint(string.ascii_letters)
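A short sketch, not in the commit, of how `level`, `is_structure` and `blank` behave:

    from src.model.paragraph import Paragraph

    p1 = Paragraph('  \n--\n', 'content', id_=0, page_id=9)
    p2 = Paragraph('GET /api/v2/labels', 'title3', id_=1, page_id=9)
    print(p1.blank, p2.blank)         # True False: p1 contains no ASCII letters
    print(p2.level, p2.is_structure)  # 3 True: the level is the trailing digit of 'title3'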
src/tools/__init__.py ADDED
File without changes
src/tools/index_creation.py ADDED
@@ -0,0 +1,39 @@
+from src.model.container import Container
+
+def create_dic_levels(c: Container, dict_of_levels: dict = None):
+    if dict_of_levels is None:  # a mutable {} default would be shared across calls
+        dict_of_levels = {}
+    if c.level == 0:
+        dict_of_levels[c.level] = [1]
+    for child in c.children:
+        if child.level not in dict_of_levels:
+            dict_of_levels[child.level] = [1 for _ in range(child.level)]
+        create_dic_levels(child, dict_of_levels)
+    return dict_of_levels
+
+def create_good_indexes(c: Container, dict_of_levels: dict):
+    actual_level = c.level
+    c.index = dict_of_levels[actual_level].copy()
+    actual_len = len(dict_of_levels[actual_level])
+    temp_update = dict_of_levels[actual_level][-1]
+    dict_of_levels[actual_level][-1] += 1
+    for i in dict_of_levels.values():
+        if len(i) > actual_len:
+            i[actual_len - 1] = temp_update
+    for child in c.children:
+        c_lvl = child.level
+        for i in dict_of_levels.values():
+            if len(i) > c_lvl:
+                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
+        create_good_indexes(child, dict_of_levels)  # apply the function recursively to all children
+
+def set_good_block_indexes(c: Container):
+    for i in c.containers:
+        for b in i.blocks:
+            b.index = i.index
+
+
+def set_good_indexes(c: Container):
+    dict_levels = create_dic_levels(c)
+    create_good_indexes(c, dict_levels)
+    set_good_block_indexes(c)
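Applied to the toy tree from the container.py sketch above (rebuilt here so the snippet stands alone), `set_good_indexes` assigns a hierarchical index to every Block; `Doc.get_blocks` later joins the parts with dots ('1', '1.1', ...). A sketch, not part of the commit:

    from src.model.paragraph import Paragraph
    from src.model.container import Container
    from src.tools.index_creation import set_good_indexes

    class _Root:  # hypothetical father object; Container only reads father.id_
        id_ = 0

    root = Container([
        Paragraph('Introduction', 'title1', id_=0, page_id=9),
        Paragraph('Some body text.', 'content', id_=1, page_id=9),
        Paragraph('Details', 'title2', id_=2, page_id=9),
        Paragraph('More text.', 'content', id_=3, page_id=9),
    ], father=_Root())
    set_good_indexes(root)
    print([(b.index, b.title) for b in root.blocks])  # [([1], 'Introduction'), ([1, 1], 'Details')]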
src/tools/llm.py ADDED
@@ -0,0 +1,111 @@
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+
+class LlmAgent:
+
+    def __init__(self, model: str = "TheBloke/Llama-2-7b-Chat-GPTQ"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model,
+                                                          device_map="auto",
+                                                          trust_remote_code=False,  # to change depending on the model; the default works for Llama 2
+                                                          revision="main")
+        self.pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
+
+    def generate_paragraph(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
+        """Generates the answer."""
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+        template = (f"You are a conversation bot designed to answer to the query from users delimited by "
+                    f"triple backticks: "
+                    f"\\n ``` {query} ```\\n"
+                    f"Your answer is based on the context delimited by triple backticks: "
+                    f"\\n ``` {context} ```\\n"
+                    f" You are consistent and avoid redundancies with the rest of the initial conversation "
+                    f"delimited by triple backticks: "
+                    f"\\n ``` {histo} ```\\n"
+                    f"Your response shall be in {language} and shall be concise")
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm)
+        p = llm_chain.predict()
+        # print("****************")
+        # print(template)
+        # print("----")
+        # print(p)
+        return p
+
+    def translate(self, text: str, language="en") -> str:
+        """Translates."""
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+
+        # languages = "French to English" if language == "en" else "English to French"
+
+        template = (f" Your task consists in translating in English\\n"
+                    f" the following text delimited by triple backticks: ```{text}```\n"
+                    )
+
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm, verbose=True)
+        p = llm_chain.predict()
+        return p
+
+    def generate_answer(self, query: str, answer: str, histo: str, context: str, language: str) -> str:
+        """Provides the final answer in {language} based on the initial query and the answer in English."""
+        def _cut_unfinished_sentence(s: str):
+            return '.'.join(s.split('.')[:-1])
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+        template = (f"Your task consists in translating the answer in {language}, if it's not already the case, to the query "
+                    f"delimited by triple backticks: ```{query}``` \\n"
+                    f"You are given the answer in {language} delimited by triple backticks: ```{answer}```"
+                    f"\\n You don't add new content to the answer but: "
+                    f"\\n 1 You can use some vocabulary from the context delimited by triple backticks: "
+                    f"```{context}```"
+                    f"\\n 2 You are consistent and avoid redundancies with the rest of the initial"
+                    f" conversation delimited by triple backticks: ```{histo}```"
+                    )
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm, verbose=True)
+        p = llm_chain.predict()
+        # p = _cut_unfinished_sentence(p)
+        return p
+
+
+    def transform_parahraph_into_question(self, prompt: str, title_doc: str = '', title_para: str = '') -> str:
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        max_tokens = 45
+
+        prompt_template = f'''[INST] <<SYS>>
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+Your job is to create a question about a paragraph of a document untitled "{title_doc}".
+The paragraph title is "{title_para}".
+If you see that the question that you are creating will not respect {max_tokens} tokens, find a way to make it shorter.
+If you see that the document paragraph seems to be code flattened, try to analyze it and create a question about it.
+If you see that the paragraph is a table, try to create a question about it.
+If you can't create a question about the paragraph, just rephrase {title_para} so that it becomes a question.
+Your response shall only contain one question, shall be concise and shall respect the following format:
+"Question: <question>"
+The paragraph you need to create a question about is the following:
+<</SYS>>
+{prompt}[/INST]
+
+'''
+        input_ids = self.tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+        output = self.model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=max_tokens, num_return_sequences=1)
+
+        res1 = self.tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
+        print(res1)
+        print("-" * len(res1))
+        return res1
+
+    def detect_language(self, text: str) -> str:
+        """Detects the language."""
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+        template = (f"Your task consists in detecting the language of the following text delimited by triple backticks: "
+                    f"```{text}```"
+                    f" Your answer shall be the two letters code of the language"
+                    )
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm, verbose=True)
+        p = llm_chain.predict()
+        return p
src/tools/pretty_print.py ADDED
@@ -0,0 +1,25 @@
+from src.model.paragraph import Paragraph
+from src.model.container import Container
+
+
+# Function that pretty-prints the paragraphs
+def pretty_printer_paragraphs(paragraphs):
+    for p in paragraphs:
+        if p.font_style == "title1":
+            print(f"Titre 1 {p.text}")
+        elif p.font_style == "title2":
+            print(f"---> Titre 2 {p.text}")
+        elif p.font_style == "title3":
+            print(f"-------> Titre 3 {p.text}")
+        # elif p.font_style == "title4":
+        #     print(f"-----------> Titre 4 {p.text}")
+        # elif p.font_style == "content":
+        #     print(f"---------------> {p.text}")
+
+def pretty_print_container_structure(container):
+    if container.title:
+        print(f"{'-'*container.level} {container.title.text}")
+    for p in container.paragraphs:
+        print(f"{'-'*container.level} {p.text}")
+    for c in container.children:
+        pretty_print_container_structure(c)
src/tools/reader.py ADDED
@@ -0,0 +1,102 @@
+import os
+import pdfplumber as pdfp
+from src.model.paragraph import Paragraph
+import asyncio
+
+def skip_header(dictionary):
+    i = 0
+    if not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
+        i += 2
+    return i
+
+
+def get_style_of_line(size: float):
+    if size >= 9 and size < 11.5:
+        return "content"
+    elif size >= 11.5 and size <= 12.7:
+        return "title5"
+    elif size >= 12.8 and size <= 13.5:
+        return "title4"
+    elif size > 13.5 and size <= 15.5:
+        return "title3"
+    elif size > 15.5 and size <= 18.5:
+        return "title2"
+    elif size > 19 and size < 30:
+        return "title1"
+    # elif size >= 12 and size <= 14.5:
+    #     return "title2"
+    # elif size > 14.5 and size <= 16.5:
+    #     return "title1"
+    else:
+        return "unknown"
+
+def get_pdf_title_styles(path):
+    pdf_to_read = extract_all_lines_from_the_doc(path)
+    paragraphs = []
+    j = 0
+    while j < len(pdf_to_read):
+        dictionary = pdf_to_read[j]["content"]
+        i = skip_header(dictionary)
+        while i < len(dictionary):
+            # print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
+            if dictionary[i]["text"].startswith("RESTAPIDeveloperGuide"):
+                i += 1
+                continue
+            p = Paragraph(dictionary[i]["text"], font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]), id_=i, page_id=pdf_to_read[j]["page_number"])
+            if i != len(dictionary) - 1:
+                while i + 1 < len(dictionary) and dictionary[i + 1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]:  # bound check added: i can reach the last line while merging
+                    p.text += " " + dictionary[i + 1]["text"]
+                    i += 1
+                # if(i == len(dictionary)-1):
+                #     if(j == len(pdf_to_read)-1):
+                #         break
+                #     else:
+                #         if(dictionary[i]["chars"][0]["size"] == pdf_to_read[j+1]["content"][0]["chars"][0]["size"]):
+                #             j += 1
+                #             p.text += " " + pdf_to_read[j]["content"][0]["text"]
+                #             dictionary = pdf_to_read[j]["content"]
+                #             i = 0
+                #         else:
+                #             break
+            else:
+                p.text = dictionary[i]["text"]
+                # print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
+            i += 1
+            # print(f'{p.page_id} : {p.font_style} ->>>>> {p.text}')
+            paragraphs.append(p)
+        j += 1
+    return paragraphs
+
+
+def test_get_font_sizes_of_a_page(page: int, path):
+    with open(os.path.abspath(path), 'rb') as f:  # binary mode: pdfplumber needs a bytes stream
+        reader = pdfp.PDF(f)
+        page = reader.pages[page]
+        dictionary = page.extract_text_lines()
+        for i in range(len(dictionary)):
+            print(f'{i} : {dictionary[i]["chars"][0]["size"]} ->>>>> {dictionary[i]["text"]}')
+
+
+def extract_all_lines_from_the_doc(path):
+    lines_of_doc = []
+    with open(path, 'rb') as f:
+        reader = pdfp.PDF(f)
+        skip_table_of_contents = reader.pages[8:16]
+        j = 0
+        while j < len(skip_table_of_contents):
+            lines_of_doc.append({"page_number": j + 9, "content": skip_table_of_contents[j].extract_text_lines()})
+            j += 1
+    return lines_of_doc
+
+
+
+
+# path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
+# get_pdf_title_styles(os.path.abspath(path))
+# print("--------------------------------------------------")
+# print("--------------------------------------------------")
+# print(test_get_font_sizes_of_a_page(8))
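The font-size thresholds in `get_style_of_line` are tuned to this particular guide's layout. A quick sketch, not in the commit, of the size-to-style mapping:

    from src.tools.reader import get_style_of_line

    for size in (10.0, 12.0, 13.0, 14.0, 16.0, 20.0):
        print(size, get_style_of_line(size))
    # 10.0 content, 12.0 title5, 13.0 title4, 14.0 title3, 16.0 title2, 20.0 title1

Sizes that fall in a gap between two ranges (e.g. 18.75) come back as "unknown".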
src/tools/retriever.py ADDED
@@ -0,0 +1,31 @@
+from src.model.container import Container
+from src.model.block import Block
+from src.model.doc import Doc
+from src.tools.llm import LlmAgent
+
+class Retriever:
+    def __init__(self, db_client, doc: Doc = None, collection_name: str = "illumio_database", llmagent: LlmAgent = None):
+        if doc is not None:
+            self.collection = db_client.create_collection(name=collection_name)
+            blocks_good_format: [Block] = doc.blocks
+            self.collection.add(
+                documents=[llmagent.transform_parahraph_into_question(block.content, title_doc=doc.title, title_para=block.title) for block in blocks_good_format],
+                ids=[block.index for block in blocks_good_format],
+                metadatas=[block.to_dict() for block in blocks_good_format]
+            )
+        else:
+            self.collection = db_client.get_collection(name=collection_name)
+
+
+
+    def similarity_search(self, query: str) -> {}:
+        res = self.collection.query(query_texts=query)
+        block_dict_sources = res['metadatas'][0]
+        distances = res['distances'][0]
+        blocks = []
+        for bd, d in zip(block_dict_sources, distances):
+            b = Block().from_dict(bd)
+            b.distance = d
+            blocks.append(b)
+        return blocks
+
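Hypothetical usage, not in the commit, once the Chroma collection exists (names match app.py; with doc=None the Retriever reuses the stored collection, so no LlmAgent is needed):

    import chromadb
    from src.tools.retriever import Retriever

    client_db = chromadb.PersistentClient(path="database/")
    retriever = Retriever(client_db, None, "illumio_database", llmagent=None)
    blocks = retriever.similarity_search(query="How do I authenticate against the REST API?")
    for b in blocks[:3]:
        print(b.index, b.title, b.distance_str)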
src/tools/test_read.py ADDED
@@ -0,0 +1,209 @@
+# To read the PDF
+import PyPDF2
+# To analyze the PDF layout and extract text
+from pdfminer.high_level import extract_pages, extract_text
+from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+# To extract text from tables in PDF
+import pdfplumber
+# To extract the images from the PDFs
+from PIL import Image
+from pdf2image import convert_from_path
+# To perform OCR to extract text from images
+import pytesseract
+# To remove the additional created files
+import os
+
+def text_extraction(element):
+    # Extracting the text from the in-line text element
+    line_text = element.get_text()
+
+    # Find the formats of the text
+    # Initialize the list with all the formats that appeared in the line of text
+    line_formats = []
+    for text_line in element:
+        if isinstance(text_line, LTTextContainer):
+            # Iterating through each character in the line of text
+            for character in text_line:
+                if isinstance(character, LTChar):
+                    # Append the font name of the character
+                    line_formats.append(character.fontname)
+                    # Append the font size of the character
+                    line_formats.append(character.size)
+    # Find the unique font sizes and names in the line
+    format_per_line = list(set(line_formats))
+
+    # Return a tuple with the text in each line along with its format
+    return (line_text, format_per_line)
+
+
+def crop_image(element, pageObj):
+    # Get the coordinates to crop the image from the PDF
+    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
+    # Crop the page using coordinates (left, bottom, right, top)
+    pageObj.mediabox.lower_left = (image_left, image_bottom)
+    pageObj.mediabox.upper_right = (image_right, image_top)
+    # Save the cropped page to a new PDF
+    cropped_pdf_writer = PyPDF2.PdfWriter()
+    cropped_pdf_writer.add_page(pageObj)
+    # Save the cropped PDF to a new file
+    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
+        cropped_pdf_writer.write(cropped_pdf_file)
+
+# Create a function to convert the PDF to images
+def convert_to_images(input_file):
+    images = convert_from_path(input_file, poppler_path=r'C:\Program Files\poppler-23.08.0\Library\bin')
+    image = images[0]
+    output_file = "PDF_image.png"
+    image.save(output_file, "PNG")
+
+# Create a function to read text from images
+def image_to_text(image_path):
+    # Read the image
+    img = Image.open(image_path)
+    # Extract the text from the image
+    text = pytesseract.image_to_string(img)
+    return text
+
+
+def extract_table(pdf_path, page_num, table_num):
+    # Open the pdf file
+    pdf = pdfplumber.open(pdf_path)
+    # Find the examined page
+    table_page = pdf.pages[page_num]
+    # Extract the appropriate table
+    table = table_page.extract_tables()[table_num]
+    return table
+
+# Convert table into the appropriate format
+def table_converter(table):
+    table_string = ''
+    # Iterate through each row of the table
+    for row_num in range(len(table)):
+        row = table[row_num]
+        # Remove the line breaker from the wrapped texts
+        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
+        # Convert the table into a string
+        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
+    # Removing the last line break
+    table_string = table_string[:-1]
+    return table_string
+
+
+
+def pdf_manager(pdf_path):
+    # create a PDF file object
+    pdfFileObj = open(pdf_path, 'rb')
+    # create a PDF reader object
+    pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+
+    # Create the dictionary to extract text from each image
+    text_per_page = {}
+    # We extract the pages from the PDF
+    for pagenum, page in enumerate(extract_pages(pdf_path)):
+
+        # Initialize the variables needed for the text extraction from the page
+        pageObj = pdfReaded.pages[pagenum]
+        page_text = []
+        line_format = []
+        text_from_images = []
+        text_from_tables = []
+        page_content = []
+        # Initialize the number of the examined tables
+        table_num = 0
+        first_element = True
+        table_extraction_flag = False
+        # Open the pdf file
+        pdf = pdfplumber.open(pdf_path)
+        # Find the examined page
+        page_tables = pdf.pages[pagenum]
+        # Find the number of tables on the page
+        tables = page_tables.find_tables()
+
+
+        # Find all the elements
+        page_elements = [(element.y1, element) for element in page._objs]
+        # Sort all the elements as they appear in the page
+        page_elements.sort(key=lambda a: a[0], reverse=True)
+
+        # Find the elements that composed a page
+        for i, component in enumerate(page_elements):
+            # Extract the position of the top side of the element in the PDF
+            pos = component[0]
+            # Extract the element of the page layout
+            element = component[1]
+
+            # Check if the element is a text element
+            if isinstance(element, LTTextContainer):
+                # Check if the text appeared in a table
+                if not table_extraction_flag:
+                    # Use the function to extract the text and format for each text element
+                    (line_text, format_per_line) = text_extraction(element)
+                    # Append the text of each line to the page text
+                    page_text.append(line_text)
+                    # Append the format for each line containing text
+                    line_format.append(format_per_line)
+                    page_content.append(line_text)
+                else:
+                    # Omit the text that appeared in a table
+                    pass
+
+            # Check the elements for images
+            if isinstance(element, LTFigure):
+                # Crop the image from the PDF
+                crop_image(element, pageObj)
+                # Convert the cropped pdf to an image
+                convert_to_images('cropped_image.pdf')
+                # Extract the text from the image
+                image_text = image_to_text('PDF_image.png')
+                text_from_images.append(image_text)
+                page_content.append(image_text)
+                # Add a placeholder in the text and format lists
+                page_text.append('image')
+                line_format.append('image')
+
+            # Check the elements for tables
+            if isinstance(element, LTRect):
+                # If the first rectangular element
+                if first_element and (table_num + 1) <= len(tables):
+                    # Find the bounding box of the table
+                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
+                    upper_side = element.y1
+                    # Extract the information from the table
+                    table = extract_table(pdf_path, pagenum, table_num)
+                    # Convert the table information in structured string format
+                    table_string = table_converter(table)
+                    # Append the table string into a list
+                    text_from_tables.append(table_string)
+                    page_content.append(table_string)
+                    # Set the flag as True to avoid the content again
+                    table_extraction_flag = True
+                    # Make it another element
+                    first_element = False
+                    # Add a placeholder in the text and format lists
+                    page_text.append('table')
+                    line_format.append('table')
+
+                # Check if we already extracted the tables from the page
+                if element.y0 >= lower_side and element.y1 <= upper_side:
+                    pass
+                elif not isinstance(page_elements[i + 1][1], LTRect):
+                    table_extraction_flag = False
+                    first_element = True
+                    table_num += 1
+
+
+        # Create the key of the dictionary
+        dctkey = 'Page_' + str(pagenum)
+        # Add the list of list as the value of the page key
+        text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
+
+    # Closing the pdf file object
+    pdfFileObj.close()
+
+    # Deleting the additional files created
+    os.remove('cropped_image.pdf')
+    os.remove('PDF_image.png')
+
+    # Display the content of the page
+    result = ''.join(text_per_page['Page_0'][4])
+    print(result)
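A hypothetical invocation, not in the commit; it needs Tesseract and Poppler installed locally (the Poppler path in convert_to_images is hard-coded for Windows):

    from src.tools.test_read import pdf_manager

    pdf_manager("data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf")  # prints the parsed content of page 0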
src/view/view.py ADDED
@@ -0,0 +1,112 @@
+import gradio as gr
+from src.control.control import Chatbot
+
+
+def run(ctrl: Chatbot, config: {}):
+    with gr.Blocks() as qna:
+        with gr.Row():
+            with gr.Column():
+                pass
+
+            with gr.Column(scale=10):
+
+                gr.Markdown(config['title'])
+
+                histo_text_comp = gr.Chatbot(
+                    visible=False,
+                    value=[],
+                )
+                input_text_comp = gr.Textbox(
+                    label="",
+                    lines=1,
+                    max_lines=3,
+                    interactive=True,
+                    placeholder="Posez votre question ici",
+                )
+                clear_btn = gr.Button("Clear")
+                input_example_comp = gr.Radio(
+                    label="Examples",
+                    choices=list(config['examples'].values()),
+                    value="",
+                )
+                source_text_comp = []
+                for i in range(4):
+                    source_text_comp.append(gr.Textbox(
+                        lines=4,
+                        max_lines=4,
+                        interactive=False,
+                        visible=False,
+                    ))
+
+            with gr.Column():
+                pass
+
+        def input_text_fn1(input_text_, histo_text_):
+            histo_text_.append((input_text_, None))
+            update_ = {
+                histo_text_comp: gr.update(visible=True, value=histo_text_),
+                input_example_comp: gr.update(visible=False),
+            }
+            for i in range(4):
+                update_[source_text_comp[i]] = gr.update(visible=False)
+            return update_
+
+        def input_text_fn2(input_text_, histo_text_):
+            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
+            histo_text_[-1] = (input_text_, answer)
+            update_ = {
+                histo_text_comp: gr.update(value=histo_text_),
+                input_text_comp: gr.update(value=''),
+            }
+            for i in range(min(len(sources), 3)):
+                s = sources[i]
+                source_label = f'{s.index} {s.title} score = {s.distance_str}'
+                source_text = s.content
+                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
+            return update_
+
+        def input_example_fn(input_example_, histo_text_):
+            histo_text_.append((input_example_, None))
+            update_ = {
+                input_text_comp: gr.update(value=input_example_),
+                histo_text_comp: gr.update(visible=True, value=histo_text_),
+                input_example_comp: gr.update(visible=False, value=''),
+            }
+            for i in range(4):
+                update_[source_text_comp[i]] = gr.update(visible=False)
+            return update_
+
+        def clear_fn():
+            update_ = {
+                input_text_comp: gr.update(value=''),
+                histo_text_comp: gr.update(value='', visible=False),
+                input_example_comp: gr.update(value='', visible=True),
+            }
+            for i in range(4):
+                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
+            return update_
+
+        input_text_comp \
+            .submit(input_text_fn1,
+                    inputs=[input_text_comp, histo_text_comp],
+                    outputs=[histo_text_comp, input_example_comp,
+                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]]) \
+            .then(input_text_fn2,
+                  inputs=[input_text_comp, histo_text_comp],
+                  outputs=[input_text_comp, histo_text_comp,
+                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+        input_example_comp \
+            .input(input_example_fn,
+                   inputs=[input_example_comp, histo_text_comp],
+                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
+                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]]) \
+            .then(input_text_fn2,
+                  inputs=[input_text_comp, histo_text_comp],
+                  outputs=[input_text_comp, histo_text_comp,
+                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+        clear_btn.click(clear_fn,
+                        inputs=None,
+                        outputs=[input_text_comp, histo_text_comp, input_example_comp,
+                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+
+    return qna
test.py ADDED
@@ -0,0 +1,7 @@
+from src.model.doc import Doc
+from config import *
+from src.tools.llm import LlmAgent
+
+llmagent = LlmAgent(model="TheBloke/Llama-2-7b-Chat-GPTQ")
+doc = Doc(path=content_en_path_real)
+[llmagent.transform_parahraph_into_question(block.content, title_doc=doc.title, title_para=block.title) for block in doc.blocks]