Quent1Fvr commited on
Commit
e2e8616
1 Parent(s): 91c90f8
Files changed (47) hide show
  1. .gitattributes +3 -0
  2. .gitignore +179 -0
  3. __pycache__/config.cpython-311.pyc +0 -0
  4. __pycache__/config_key.cpython-311.pyc +0 -0
  5. app.py +27 -0
  6. config.py +19 -0
  7. config_key.py +1 -0
  8. requirements.txt +0 -0
  9. src/__init__.py +0 -0
  10. src/__pycache__/__init__.cpython-311.pyc +0 -0
  11. src/__pycache__/config_key.cpython-311.pyc +0 -0
  12. src/control/__init__.py +0 -0
  13. src/control/__pycache__/__init__.cpython-311.pyc +0 -0
  14. src/control/__pycache__/control.cpython-311.pyc +0 -0
  15. src/control/control.py +116 -0
  16. src/model/__init__.py +0 -0
  17. src/model/__pycache__/__init__.cpython-311.pyc +0 -0
  18. src/model/__pycache__/block.cpython-311.pyc +0 -0
  19. src/model/__pycache__/container.cpython-311.pyc +0 -0
  20. src/model/__pycache__/doc.cpython-311.pyc +0 -0
  21. src/model/__pycache__/paragraph.cpython-311.pyc +0 -0
  22. src/model/block.py +58 -0
  23. src/model/container.py +112 -0
  24. src/model/doc.py +79 -0
  25. src/model/paragraph.py +39 -0
  26. src/tools/__init__.py +0 -0
  27. src/tools/__pycache__/__init__.cpython-311.pyc +0 -0
  28. src/tools/__pycache__/index_creation.cpython-311.pyc +0 -0
  29. src/tools/__pycache__/llm.cpython-311.pyc +0 -0
  30. src/tools/__pycache__/reader_html.cpython-311.pyc +0 -0
  31. src/tools/__pycache__/reader_pdf_tools.cpython-311.pyc +0 -0
  32. src/tools/__pycache__/reader_word.cpython-311.pyc +0 -0
  33. src/tools/__pycache__/readers_pdf.cpython-311.pyc +0 -0
  34. src/tools/__pycache__/retriever.cpython-311.pyc +0 -0
  35. src/tools/__pycache__/table_converter.cpython-311.pyc +0 -0
  36. src/tools/index_creation.py +67 -0
  37. src/tools/llm.py +149 -0
  38. src/tools/pretty_print.py +33 -0
  39. src/tools/reader_html.py +118 -0
  40. src/tools/reader_pdf_tools.py +56 -0
  41. src/tools/reader_word.py +106 -0
  42. src/tools/readers_pdf.py +428 -0
  43. src/tools/retriever.py +49 -0
  44. src/tools/table_converter.py +14 -0
  45. src/view/__pycache__/view.cpython-311.pyc +0 -0
  46. src/view/view.py +262 -0
  47. styles.txt +18 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ env/lib/python3.10/site-packages/*.so filter=lfs diff=lfs merge=lfs -text
37
+ env/lib/python3.10/site-packages/*.dylib filter=lfs diff=lfs merge=lfs -text
38
+ env/lib/python3.10/site-packages/**/*.js.map filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ config_key.py
3
+
4
+
5
+ #library package
6
+ sqlite_updated/
7
+
8
+ #Test folder + files
9
+ data/Test/
10
+ test.py
11
+ test_read.py
12
+ styles.txt
13
+
14
+ #database folder
15
+ database/
16
+ database_structure/
17
+ database_word/
18
+ Ilumio_chatbot/
19
+
20
+ # Byte-compiled / optimized / DLL files
21
+ __pycache__/
22
+ *.py[cod]
23
+ *$py.class
24
+
25
+ # C extensions
26
+ *.so
27
+
28
+ # Distribution / packaging
29
+ .Python
30
+ build/
31
+ develop-eggs/
32
+ dist/
33
+ downloads/
34
+ eggs/
35
+ .eggs/
36
+ lib/
37
+ lib64/
38
+ parts/
39
+ sdist/
40
+ var/
41
+ wheels/
42
+ share/python-wheels/
43
+ *.egg-info/
44
+ .installed.cfg
45
+ *.egg
46
+ MANIFEST
47
+
48
+ # PyInstaller
49
+ # Usually these files are written by a python script from a template
50
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
51
+ *.manifest
52
+ *.spec
53
+
54
+ # Installer logs
55
+ pip-log.txt
56
+ pip-delete-this-directory.txt
57
+
58
+ # Unit test / coverage reports
59
+ htmlcov/
60
+ .tox/
61
+ .nox/
62
+ .coverage
63
+ .coverage.*
64
+ .cache
65
+ nosetests.xml
66
+ coverage.xml
67
+ *.cover
68
+ *.py,cover
69
+ .hypothesis/
70
+ .pytest_cache/
71
+ cover/
72
+
73
+ # Translations
74
+ *.mo
75
+ *.pot
76
+
77
+ # Django stuff:
78
+ *.log
79
+ local_settings.py
80
+ db.sqlite3
81
+ db.sqlite3-journal
82
+
83
+ # Flask stuff:
84
+ instance/
85
+ .webassets-cache
86
+
87
+ # Scrapy stuff:
88
+ .scrapy
89
+ .env
90
+ # Sphinx documentation
91
+ docs/_build/
92
+
93
+ # PyBuilder
94
+ .pybuilder/
95
+ target/
96
+
97
+ # Jupyter Notebook
98
+ .ipynb_checkpoints
99
+
100
+ # IPython
101
+ profile_default/
102
+ ipython_config.py
103
+
104
+ # pyenv
105
+ # For a library or package, you might want to ignore these files since the code is
106
+ # intended to run in multiple environments; otherwise, check them in:
107
+ # .python-version
108
+
109
+ # pipenv
110
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
111
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
112
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
113
+ # install all needed dependencies.
114
+ #Pipfile.lock
115
+
116
+ # poetry
117
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
118
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
119
+ # commonly ignored for libraries.
120
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
121
+ #poetry.lock
122
+
123
+ # pdm
124
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
125
+ #pdm.lock
126
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
127
+ # in version control.
128
+ # https://pdm.fming.dev/#use-with-ide
129
+ .pdm.toml
130
+
131
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
132
+ __pypackages__/
133
+
134
+ # Celery stuff
135
+ celerybeat-schedule
136
+ celerybeat.pid
137
+
138
+ # SageMath parsed files
139
+ *.sage.py
140
+
141
+ # Environments
142
+ .env
143
+ .venv
144
+ env/
145
+ venv/
146
+ ENV/
147
+ env.bak/
148
+ venv.bak/
149
+
150
+ # Spyder project settings
151
+ .spyderproject
152
+ .spyproject
153
+
154
+ # Rope project settings
155
+ .ropeproject
156
+
157
+ # mkdocs documentation
158
+ /site
159
+
160
+ # mypy
161
+ .mypy_cache/
162
+ .dmypy.json
163
+ dmypy.json
164
+
165
+ # Pyre type checker
166
+ .pyre/
167
+
168
+ # pytype static type analyzer
169
+ .pytype/
170
+
171
+ # Cython debug symbols
172
+ cython_debug/
173
+
174
+ # PyCharm
175
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
176
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
177
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
178
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
179
+ #.idea/
__pycache__/config.cpython-311.pyc ADDED
Binary file (1.41 kB). View file
 
__pycache__/config_key.cpython-311.pyc ADDED
Binary file (239 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from config import *
3
+ from src.tools.llm import LlmAgent
4
+ import src.view.view as view
5
+ from src.control.control import Chatbot
6
+ import chromadb
7
+ from src.tools.retriever import Retriever
8
+
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
10
+
11
+ if not "OPENAI_API_KEY" in os.environ:
12
+ from config_key import OPENAI_API_KEY
13
+ os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
14
+
15
+ llm_model = "gpt-4"
16
+ llm = LlmAgent(llm_model=llm_model)
17
+
18
+ if not os.path.exists("database_structure/"):
19
+ os.makedirs("database_structure/")
20
+
21
+ client_db = chromadb.PersistentClient("database_structure/")
22
+
23
+ chat = Chatbot(client_db=client_db,retriever=Retriever(llmagent=llm),llm_agent=llm)
24
+
25
+ ilumio_qna = view.run(ctrl=chat, config=view_config)
26
+
27
+ ilumio_qna.queue().launch()
config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ content_language = 'en'
2
+ plan_language = 'en'
3
+ content_en_path_real = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
4
+ content_test = "data/Test/Illumio_product_brief.pdf"
5
+ content_python = "data/cours-python_crop.docx"
6
+ content_html = "data/Test/list.html"
7
+ content_data_analyst = "data/Test/Data_Analyst_chez_Stockly.pdf"
8
+ content_test_epita = "data/Test/Test_epita.pdf"
9
+
10
+ examples = {"Question_1": "What is the max_results parameter for async traffic queries ?",
11
+ "Question_2": "How can I use the Public Experimental Provisioning API to determine if a specific set of objects can be provisioned?",
12
+ "Question_3": "Explain the potential challenges and workarounds when using json-query with the curl -i option. Why might this combination lead to errors?",
13
+ }
14
+
15
+
16
+ view_config = {
17
+ 'title': "<h1 style=text-align:center;font-size:4.5em;background-image:linear-gradient(45deg,#f3ec78,#af4261);background-color:red;background-size:100%;background-repeat:repeat;-webkit-background-clip:text;-webkit-text-fill-color:transparent;-moz-background-clip:text;-moz-text-fill-color:transparent;font-weight:bold;margin-top:4%;padding-bottom:1%>Document QnA</h1>",
18
+ 'examples': examples,
19
+ }
config_key.py ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY = "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"
requirements.txt ADDED
Binary file (5.51 kB). View file
 
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
src/__pycache__/config_key.cpython-311.pyc ADDED
Binary file (269 Bytes). View file
 
src/control/__init__.py ADDED
File without changes
src/control/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (180 Bytes). View file
 
src/control/__pycache__/control.cpython-311.pyc ADDED
Binary file (7.87 kB). View file
 
src/control/control.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from src.tools.retriever import Retriever
4
+ from src.tools.llm import LlmAgent
5
+ from src.model.block import Block
6
+ from src.model.doc import Doc
7
+ from chromadb.utils import embedding_functions
8
+ import gradio as gr
9
+
10
+
11
+ class Chatbot:
12
+ def __init__(self, llm_agent : LlmAgent = None, retriever: Retriever = None, client_db=None):
13
+ self.retriever = retriever
14
+ self.llm = llm_agent
15
+ self.client_db = client_db
16
+
17
+ def get_response(self, query, histo):
18
+ histo_conversation, histo_queries = self._get_histo(histo)
19
+ language_of_query = self.llm.detect_language_v2(query).lower()
20
+ queries = self.llm.translate_v2(histo_queries)
21
+ if "en" in language_of_query:
22
+ language_of_query = "en"
23
+ else:
24
+ language_of_query = "fr"
25
+ block_sources = self.retriever.similarity_search(queries=queries)
26
+ block_sources = self._select_best_sources(block_sources)
27
+ sources_contents = [f"Paragraph title : {s.title}\n-----\n{s.content}" if s.title else f"Paragraph {s.index}\n-----\n{s.content}" for s in block_sources]
28
+ context = '\n'.join(sources_contents)
29
+ i = 1
30
+ while (len(context) + len(histo_conversation) > 15000) and i < len(sources_contents):
31
+ context = "\n".join(sources_contents[:-i])
32
+ i += 1
33
+ answer = self.llm.generate_paragraph_v2(query=query, histo=histo_conversation, context=context, language=language_of_query)
34
+ answer = self._clean_chatgpt_answer(answer)
35
+ return answer, block_sources
36
+
37
+
38
+
39
+ @staticmethod
40
+ def _select_best_sources(sources: [Block], delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> [Block]:
41
+ """
42
+ Select the best sources: not far from the very best, not far from the last selected, and not too bad per se
43
+ """
44
+ best_sources = []
45
+ for idx, s in enumerate(sources):
46
+ if idx == 0 \
47
+ or (s.distance - sources[idx - 1].distance < delta_1_2
48
+ and s.distance - sources[0].distance < delta_1_n) \
49
+ or s.distance < absolute:
50
+ best_sources.append(s)
51
+ delta_1_2 *= alpha
52
+ delta_1_n *= alpha
53
+ absolute *= alpha
54
+ else:
55
+ break
56
+ return best_sources
57
+
58
+
59
+ @staticmethod
60
+ def _get_histo(histo: [(str, str)]) -> (str, str):
61
+ histo_conversation = ""
62
+ histo_queries = ""
63
+
64
+ for (query, answer) in histo[-5:]:
65
+ histo_conversation += f'user: {query} \n bot: {answer}\n'
66
+ histo_queries += query + '\n'
67
+ return histo_conversation[:-1], histo_queries
68
+
69
+
70
+ @staticmethod
71
+ def _clean_answer(answer: str) -> str:
72
+ print(answer)
73
+ answer = answer.strip('bot:')
74
+ while answer and answer[-1] in {"'", '"', " ", "`"}:
75
+ answer = answer[:-1]
76
+ while answer and answer[0] in {"'", '"', " ", "`"}:
77
+ answer = answer[1:]
78
+ answer = answer.strip('bot:')
79
+ if answer:
80
+ if answer[-1] != ".":
81
+ answer += "."
82
+ return answer
83
+
84
+ def _clean_chatgpt_answer(self,answer: str) -> str:
85
+ answer = answer.strip('bot:')
86
+ answer = answer.strip('Answer:')
87
+ answer = answer.strip('Réponse:')
88
+ while answer and answer[-1] in {"'", '"', " ", "`"}:
89
+ answer = answer[:-1]
90
+ return answer
91
+
92
+ def upload_doc(self,input_doc,include_images_,actual_page_start):
93
+ title = Doc.get_title(Doc,input_doc.name)
94
+ extension = title.split('.')[-1]
95
+ if extension and (extension == 'docx' or extension == 'pdf' or extension == 'html'):
96
+ open_ai_embedding = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ['OPENAI_API_KEY'], model_name="text-embedding-ada-002")
97
+ coll_name = "".join([c if c.isalnum() else "_" for c in title])
98
+ collection = self.client_db.get_or_create_collection(name=coll_name,embedding_function=open_ai_embedding)
99
+
100
+ if collection.count() == 0:
101
+ gr.Info("Please wait while your document is being analysed")
102
+ print("Database is empty")
103
+ doc = Doc(path=input_doc.name,include_images=include_images_,actual_first_page=actual_page_start)
104
+
105
+ # for block in doc.blocks: #DEBUG PART
106
+ # print(f"{block.index} : {block.content}")
107
+
108
+ retriever = Retriever(doc.container, collection=collection,llmagent=self.llm)
109
+ else:
110
+ print("Database is not empty")
111
+ retriever = Retriever(collection=collection,llmagent=self.llm)
112
+
113
+ self.retriever = retriever
114
+ else:
115
+ return False
116
+ return True
src/model/__init__.py ADDED
File without changes
src/model/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (178 Bytes). View file
 
src/model/__pycache__/block.cpython-311.pyc ADDED
Binary file (3.04 kB). View file
 
src/model/__pycache__/container.cpython-311.pyc ADDED
Binary file (5.77 kB). View file
 
src/model/__pycache__/doc.cpython-311.pyc ADDED
Binary file (4.05 kB). View file
 
src/model/__pycache__/paragraph.cpython-311.pyc ADDED
Binary file (2.64 kB). View file
 
src/model/block.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ class Block:
4
+ def __init__(self, doc: str= '',title: str = '', content: str = '',
5
+ index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
6
+ self.doc = doc
7
+ self.title = title
8
+ self.content = content
9
+ self.index = index
10
+ self.rank = rank
11
+ self.level = level
12
+ self.distance = distance
13
+
14
+ @property
15
+ def distance_str(self) -> str:
16
+ return format(self.distance, '.2f')
17
+
18
+ def separate_1_block_in_n(self, max_size=4500):
19
+ """
20
+ Separate a block in n blocks of equal size
21
+ """
22
+ content_length = len(self.content)
23
+ n = math.ceil(content_length / max_size)
24
+ block_size = content_length // n
25
+ new_blocks = []
26
+ for i in range(n):
27
+ start = i * block_size
28
+ end = (i + 1) * block_size if i < n - 1 else None
29
+ new_blocks.append(Block(doc=self.doc,
30
+ title=self.title + f"_part{i}",
31
+ content=self.content[start:end],
32
+ index=self.index + f"_{i}",
33
+ rank=self.rank,
34
+ level=self.level))
35
+ return new_blocks
36
+
37
+
38
+ def to_dict(self) -> {}:
39
+ block_dict = {'doc': self.doc,
40
+ 'title': self.title,
41
+ 'content': self.content,
42
+ 'index': self.index,
43
+ 'rank': self.rank,
44
+ 'level': self.level,
45
+ 'distance': self.distance}
46
+ return block_dict
47
+
48
+ def from_dict(self, block_dict: {}):
49
+ self.doc = block_dict['doc']
50
+ self.title = block_dict['title']
51
+ self.content = block_dict['content']
52
+ self.index = block_dict['index']
53
+ self.rank = block_dict['rank']
54
+ self.level = block_dict['level']
55
+ self.distance = block_dict['distance']
56
+ return self
57
+
58
+
src/model/container.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .paragraph import Paragraph
2
+ from .block import Block
3
+
4
+ INFINITE = 99999
5
+
6
+ class Container:
7
+
8
+ def __init__(self, paragraphs : [Paragraph], title : Paragraph=None, level: int = 0, index: [int] = None , father=None, id_ = 0):
9
+ if index is None:
10
+ index = []
11
+ self.level = level
12
+ self.title = title
13
+ self.paragraphs = []
14
+ self.children = []
15
+ self.index = index
16
+ self.father = father
17
+ self.id_ = int(str(1) + str(father.id_) + str(id_))
18
+ if paragraphs:
19
+ self.paragraphs, self.children = self.create_children(paragraphs, level, index)
20
+ self.containers = [self]
21
+ for child in self.children:
22
+ self.containers += child.containers
23
+ self.blocks = self.get_blocks()
24
+
25
+
26
+ def get_blocks(self):
27
+ block = Block(level=self.level, index=self.index)
28
+ if self.title:
29
+ self.title.text = self.title.text.replace('\r', '').replace('\n', '')
30
+ block.title = self.title.text
31
+ block.content = self.title.text + '/'
32
+ temp_father = self.father
33
+ while temp_father and type(temp_father) == Container:
34
+ if temp_father.title:
35
+ temp_father.title.text = temp_father.title.text.replace('\r', '').replace('\n', '')
36
+ block.content = temp_father.title.text + '/' + block.content
37
+ temp_father = temp_father.father
38
+ block.content += " :\n\n"
39
+ i = 0
40
+ for p in self.paragraphs:
41
+ if not p.blank:
42
+ i = 1
43
+ block.content += p.text
44
+ if i == 0:
45
+ blocks = []
46
+ else:
47
+ blocks = [block]
48
+ for child in self.children:
49
+ blocks += child.blocks
50
+ return blocks
51
+
52
+
53
+ def create_children(self, paragraphs, level, rank) -> ([], []):
54
+ """
55
+ creates children containers or directly attached content
56
+ and returns the list of containers and contents of level+1
57
+ :return:
58
+ [Content or Container]
59
+ """
60
+ attached_paragraphs = []
61
+ container_paragraphs = []
62
+ container_title = None
63
+ children = []
64
+ in_children = False
65
+ level = INFINITE
66
+ child_id = 0
67
+
68
+ while paragraphs:
69
+ p = paragraphs.pop(0)
70
+ if not in_children and not p.is_structure:
71
+ attached_paragraphs.append(p)
72
+ else:
73
+ in_children = True
74
+ if p.blank:
75
+ continue
76
+ if p.is_structure and p.level <= level: # if p is higher or equal in hierarchy
77
+ if container_paragraphs or container_title:
78
+ children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
79
+ child_id += 1
80
+ container_paragraphs = []
81
+ container_title = p
82
+ level = p.level
83
+
84
+ else: # p is strictly lower in hierarchy
85
+ container_paragraphs.append(p)
86
+
87
+ if container_paragraphs or container_title:
88
+ children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
89
+ child_id += 1
90
+
91
+ return attached_paragraphs, children
92
+
93
+
94
+ @property
95
+ def structure(self):
96
+
97
+ self_structure = {str(self.id_): {
98
+ 'index': str(self.id_),
99
+ 'canMove': True,
100
+ 'isFolder': True,
101
+ 'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
102
+ 'canRename': True,
103
+ 'data': {},
104
+ 'level': self.level,
105
+ 'rank': self.rank,
106
+ 'title': self.title.text if self.title else 'root'
107
+ }}
108
+ paragraphs_structure = [p.structure for p in self.paragraphs]
109
+ structure = [self_structure] + paragraphs_structure
110
+ for child in self.children:
111
+ structure += child.structure
112
+ return structure
src/model/doc.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.model.container import Container
2
+ from src.tools.index_creation import set_indexes
3
+ from src.tools.reader_word import WordReader
4
+ from src.tools.readers_pdf import Reader, Reader_illumio
5
+ from src.tools.reader_html import Reader_HTML
6
+ from src.model.paragraph import Paragraph
7
+
8
+
9
+ class Doc:
10
+
11
+ def __init__(self, path='', include_images=True, actual_first_page=1):
12
+
13
+ self.title = self.get_title(path)
14
+ self.extension = self.title.split('.')[-1]
15
+ self.id_ = id(self)
16
+ self.path = path
17
+ paragraphs = []
18
+ if self.extension == 'docx':
19
+ paragraphs = WordReader(path).paragraphs
20
+ elif self.extension == 'pdf':
21
+ if "Illumio_Core_REST_API_Developer_Guide_23.3" in self.title:
22
+ paragraphs = Reader_illumio(path).paragraphs
23
+ else:
24
+ paragraphs = Reader(path, actual_first_page, include_images).paragraphs
25
+ else:
26
+ paragraphs = Reader_HTML(path).paragraphs
27
+ self.container = Container(paragraphs, father=self, title=self.set_first_container_title(self.title.split(".")[0],self.extension))
28
+ set_indexes(self.container)
29
+ self.blocks = self.get_blocks()
30
+
31
+
32
+ def get_title(self,path) -> str:
33
+ if '/' not in path and '\\' not in path:
34
+ res = path
35
+ if '/' in path:
36
+ res = path.split('/')[-1]
37
+ if '\\' in path:
38
+ res = path.split('\\')[-1]
39
+ return res
40
+
41
+ @property
42
+ def structure(self):
43
+ return self.container.structure
44
+
45
+ def get_blocks(self):
46
+
47
+ def from_list_to_str(index_list):
48
+ index_str = str(index_list[0])
49
+ for el in index_list[1:]:
50
+ index_str += '.' + str(el)
51
+ return index_str
52
+
53
+ blocks = self.container.blocks
54
+ for block in blocks:
55
+ block.doc = self.title
56
+ block.index = from_list_to_str(block.index)
57
+ return blocks
58
+
59
+ def set_first_container_title(self,title,extension) -> Paragraph:
60
+ if extension == 'pdf':
61
+ return Paragraph(text=title,font_style='title0',id_=0,page_id=0)
62
+ elif extension == 'docx':
63
+ return Paragraph(text=title,font_style='title0',id_=0,page_id=1)
64
+ else:
65
+ return Paragraph(text=title,font_style='h0',id_=0,page_id=1)
66
+ """
67
+ current_level = len(current_index)
68
+ if 0 < block.level:
69
+ if block.level == current_level:
70
+ current_index[-1] += 1
71
+ elif current_level < block.level:
72
+ current_index.append(1)
73
+ elif block.level < current_level:
74
+ current_index = current_index[:block.level]
75
+ current_index[-1] += 1
76
+ block.index = from_list_to_str(current_index)
77
+ else:
78
+ block.index = "0"
79
+ """
src/model/paragraph.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+
3
+ INFINITE = 10000
4
+
5
+ class Paragraph:
6
+ def __init__(self, text : str, font_style : str, id_ : int, page_id : int):
7
+ self.font_style = font_style
8
+ self.id_ = int(str(2)+str(page_id)+str(id_))
9
+ self.page_id = page_id
10
+ self.level = self.handle_levels(font_style)
11
+ self.is_structure = self.level < INFINITE
12
+ self.text = text
13
+
14
+ @property
15
+ def blank(self):
16
+ """
17
+ checks if the paragraph is blank: i.e. it brings some signal (it may otherwise be ignored)
18
+ """
19
+ text = self.text.replace('\n', '')
20
+ return set(text).isdisjoint(string.ascii_letters)
21
+
22
+ def rearrange_paragraph(self):
23
+ """
24
+ rearrange the paragraph to have a better structure
25
+ """
26
+ if self.font_style == "code":
27
+ self.text = "\n\nCode :```\n" + self.text + "\n```\n\n"
28
+ elif self.font_style == "table":
29
+ self.text = "\n\nTable :\n" + self.text + "\n\n"
30
+ return self
31
+
32
+ def handle_levels(self, font_style : str):
33
+ if len(font_style) != 5 and 'title' in font_style:
34
+ return int(font_style[-1])
35
+ elif len(font_style) == 2 and font_style[0] == 'h':
36
+ return int(font_style[-1])
37
+ else:
38
+ return INFINITE
39
+
src/tools/__init__.py ADDED
File without changes
src/tools/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (178 Bytes). View file
 
src/tools/__pycache__/index_creation.cpython-311.pyc ADDED
Binary file (4.83 kB). View file
 
src/tools/__pycache__/llm.cpython-311.pyc ADDED
Binary file (11.7 kB). View file
 
src/tools/__pycache__/reader_html.cpython-311.pyc ADDED
Binary file (8.28 kB). View file
 
src/tools/__pycache__/reader_pdf_tools.cpython-311.pyc ADDED
Binary file (3.64 kB). View file
 
src/tools/__pycache__/reader_word.cpython-311.pyc ADDED
Binary file (4.72 kB). View file
 
src/tools/__pycache__/readers_pdf.cpython-311.pyc ADDED
Binary file (25.1 kB). View file
 
src/tools/__pycache__/retriever.cpython-311.pyc ADDED
Binary file (3.24 kB). View file
 
src/tools/__pycache__/table_converter.cpython-311.pyc ADDED
Binary file (1.03 kB). View file
 
src/tools/index_creation.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.model.container import Container
2
+
3
+ INFINITE = 99999
4
+
5
+ def create_dic_levels(c:Container,dict_of_levels : dict = {}):
6
+ if c.level == 0:
7
+ dict_of_levels[c.level] = [0]
8
+ for child in c.children:
9
+ if child.level not in dict_of_levels:
10
+ dict_of_levels[child.level] = [1 for _ in range(child.level)]
11
+ create_dic_levels(child, dict_of_levels)
12
+ if INFINITE in dict_of_levels.keys():
13
+ dict_of_levels[INFINITE] = [1]
14
+ return dict_of_levels
15
+
16
+
17
+ def create_good_indexes(c:Container, dict_of_levels : dict):
18
+ actual_level = c.level
19
+ c.index = dict_of_levels[actual_level].copy()
20
+ actual_len = len(dict_of_levels[actual_level])
21
+ temp_update = dict_of_levels[actual_level][-1]
22
+ dict_of_levels[actual_level][-1] += 1
23
+ for i in dict_of_levels.values():
24
+ if len(i) > actual_len:
25
+ i[actual_len - 1] = temp_update
26
+ for child in c.children:
27
+ c_lvl = child.level
28
+ for i in dict_of_levels.values():
29
+ if len(i) > c_lvl:
30
+ i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
31
+ create_good_indexes(child, dict_of_levels) # Apply the function recursively to all children
32
+
33
+
34
+ def create_good_indexes_not_ordered_titles(c:Container, dict_of_levels : dict):
35
+ actual_level = c.level
36
+ c.index = dict_of_levels[actual_level].copy()
37
+ actual_len = len(dict_of_levels[actual_level])
38
+ temp_update = dict_of_levels[actual_level][-1]
39
+ dict_of_levels[actual_level][-1] += 1
40
+ for i in dict_of_levels.values():
41
+ if len(i) > actual_len:
42
+ i[actual_len - 1] = temp_update
43
+ for child in c.children:
44
+ c_lvl = child.level
45
+ for i in dict_of_levels.values():
46
+ if len(i) > c_lvl:
47
+ i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
48
+ create_good_indexes(child, dict_of_levels) # Apply the function recursively to all children
49
+
50
+
51
+ def set_good_block_indexes(c:Container):
52
+ for i in c.containers:
53
+ for b in i.blocks:
54
+ b.index = i.index
55
+
56
+
57
+ def set_indexes(c:Container):
58
+ dict_levels = create_dic_levels(c)
59
+ myKeys = list(dict_levels.keys())
60
+ myKeys.sort()
61
+ dict_levels = {key: dict_levels[key] for key in myKeys}
62
+ if c.children and c.children[0] and (c.children[0].level > min(list(dict_levels.keys())[1:])):
63
+ c.children[0].level = min(list(dict_levels.keys())[1:])
64
+ create_good_indexes_not_ordered_titles(c, dict_levels)
65
+ else:
66
+ create_good_indexes(c, dict_levels)
67
+ set_good_block_indexes(c)
src/tools/llm.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+
3
+ class LlmAgent:
4
+
5
+ def __init__(self, llm_model: str):
6
+ self.llm = llm_model
7
+
8
+ def generate_paragraph(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
9
+ """generates the answer"""
10
+ template = (f"You are a conversation bot designed to answer to the query from users."
11
+ f"Your answer is based on the context delimited by triple backticks :\n ``` {context} ```\n"
12
+ f"You are consistent and avoid redundancies with the rest of the initial conversation delimited by triple backticks :\n ``` {histo} ```\n"
13
+ f"Your response shall be in {language} and shall be concise."
14
+ f"You shall only provide the answer, nothing else before and after."
15
+ f"Here is the query you are given :\n"
16
+ f"``` {query} ```")
17
+ generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"user","content":template}])
18
+ res = generation.choices[0].message.content
19
+ print("****************")
20
+ print(res)
21
+ print("----")
22
+ return str(res)
23
+
24
+ def generate_paragraph_v2(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
25
+ """generates the answer"""
26
+ context_for_the_ai = (f"You are a conversation bot designed to answer to the query from users."
27
+ f"Your answer is based on the context delimited by triple backticks :\n ``` {context} ```\n"
28
+ f"You are consistent and avoid redundancies with the rest of the initial conversation delimited by triple backticks :\n ``` {histo} ```\n"
29
+ f"Your response shall be in {language} and shall be concise.")
30
+ generation = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k", messages=[{"role":"system","content":context_for_the_ai},{"role":"user","content":query}])
31
+ res = generation.choices[0].message.content
32
+ print("****************")
33
+ print(res)
34
+ print("----")
35
+ return str(res)
36
+
37
+ def translate(self, text: str) -> str:
38
+ """translates"""
39
+ template = (f"Your task consists in translating in English the following text delimited by triple backticks: ``` {text} ```\n"
40
+ f"If the text is already in English, just return it !\n"
41
+ f"You must not provide an answer to the text, just translate it.\n")
42
+ generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"user","content":template}])
43
+ res = generation.choices[0].message.content
44
+ print("****************")
45
+ print(res)
46
+ print("----TRANSLATE----")
47
+ return res
48
+
49
+ def translate_v2(self, text: str) -> str:
50
+ """translates"""
51
+ task = "Translate in english the text. If it is already in english, just return the text."
52
+ generation = openai.ChatCompletion.create(model="gpt-4", messages=[{"role":"system","content":task},{"role":"user","content":text}])
53
+ res = generation.choices[0].message.content
54
+ print("****************")
55
+ print(res)
56
+ print("----TRANSLATE V2----")
57
+ return res
58
+
59
    def generate_answer(self, query: str, answer: str, histo: str, context: str,language : str) -> str:
        """Translate the (English) `answer` to `query` into `language`.

        The model is asked to translate `answer` so that it responds to
        `query` in the target language, reusing vocabulary from `context`
        and staying consistent with the conversation `histo`.

        :return: the translated answer, stripped of surrounding whitespace.
        """
        template = (f"Your task consists in translating the answer in {language}, if its not already the case, to the query "
                    f"delimited by triple backticks: ```{query}``` \n"
                    f"You don't add new content to the answer but: "
                    f"1 You can use some vocabulary from the context delimited by triple backticks:\n"
                    f"```{context}```\n"
                    f"2 You are consistent and avoid redundancies with the rest of the initial"
                    f"conversation delimited by triple backticks: ```{histo}```\n"
                    f"Your response shall respect the following format:<response>\n"
                    f"Here is the answer you are given in {language}:"
                    f"{answer}")
        generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"user","content":template}])
        res = generation.choices[0].message.content
        # Debug trace of the raw model output.
        print("****************")
        print(res)
        print("----")
        return str(res).strip()
77
+
78
+ def summarize_paragraph(self, prompt : str, title_doc : str = '',title_para : str = ''):
79
+ max_tokens = 700
80
+ """summarizes the paragraph"""
81
+ template = (f"Your task consists in summarizing the paragraph of the document untitled ```{title_doc}```."
82
+ f"The paragraph title is ```{title_para}```."
83
+ f"Your response shall be concise and shall respect the following format:"
84
+ f"<summary>"
85
+ f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter."
86
+ f"The paragraph you need to summarize is the following :"
87
+ f"{prompt}")
88
+ generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"user","content":template}])
89
+ res = generation.choices[0].message.content
90
+ print("****************")
91
+ print(res)
92
+ print("----")
93
+ return str(res).strip()
94
+
95
+ def summarize_paragraph_v2(self, prompt : str, title_doc : str = '', title_para : str = ''):
96
+ max_tokens = 850
97
+ location_of_the_paragraph = prompt.split(" :")[0]
98
+ """summarizes the paragraph"""
99
+ task = (f"Your task consists in summarizing in English the paragraph of the document untitled ```{title_doc}``` located in the ```{location_of_the_paragraph}``` section of the document."
100
+ f"The paragraph title is ```{title_para}```."
101
+ f"Your response shall be concise and shall respect the following format:"
102
+ f"<summary>"
103
+ f"If you see that the summary that you are creating will not respect ```{max_tokens}``` tokens, find a way to make it shorter.")
104
+ generation = openai.ChatCompletion.create(model="gpt-3.5-turbo-16k", messages=[{"role":"system","content":task},{"role":"user","content":prompt}])
105
+ res = generation.choices[0].message.content
106
+ print("****************")
107
+ print(res)
108
+ print("----")
109
+ return str(res).strip()
110
+
111
+ def transform_paragraph_into_question(self, prompt : str, title_doc : str = '',title_para : str = '') -> (str, str):
112
+ max_tokens = 150
113
+
114
+ prompt_template=(f"Your job is to create two questions about a paragraph of a document untitled ```{title_doc}```."
115
+ f"The paragraph title is ```{title_para}```."
116
+ f"If you see that the questions that you are creating will not respect ```{max_tokens}``` tokens, find a way to make them shorter."
117
+ f"If you can't create a question about the paragraph, just rephrase ```{title_para}``` so that it becomes a question."
118
+ f"Your response shall contains two questions, shall be concise and shall respect the following format:"
119
+ f"`<question1>!=;<question2>`"
120
+ f"You should not answer to the questions, just create them. Moreover, you shall include the title of the paragraph in the questions."
121
+ f"The paragraph you need to create questions about is the following :"
122
+ f"{prompt}")
123
+ generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"user","content":prompt_template}])
124
+ res = generation.choices[0].message.content
125
+ print("****************")
126
+ res = str(res).split("!=;")
127
+ if len(res) == 1:
128
+ return (res[0],"")
129
+ elif len(res) == 2:
130
+ return (res[0],res[1])
131
+ else:
132
+ return ("","")
133
+
134
+ def detect_language(self, text: str) -> str:
135
+ """detects the language"""
136
+ template = (f"Your task consists in detecting the language of the last question or sentence of the text."
137
+ f"You should only give the two letters code of the language detected, nothing else."
138
+ f"Here is the text you are given delimited by triple backticks : ```{text}```")
139
+ generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"user","content":template}])
140
+ res = generation.choices[0].message.content
141
+ return str(res).strip()
142
+
143
+ def detect_language_v2(self, text: str) -> str:
144
+ """detects the language"""
145
+ task = (f"Your task consists in detecting the language of the last question or sentence of the text."
146
+ f"You should only give the two letters code of the language detected, nothing else.")
147
+ generation = openai.ChatCompletion.create(model=self.llm, messages=[{"role":"system","content":task},{"role":"user","content":text}])
148
+ res = generation.choices[0].message.content
149
+ return str(res).strip()
src/tools/pretty_print.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.model.paragraph import Paragraph
2
+ from src.model.container import Container
3
+
4
+
5
# Pretty-print paragraphs, one line each, prefixed according to font style.
def pretty_printer_paragraphs(paragraphs):
    """Print each paragraph with an arrow prefix reflecting its font style.

    Paragraphs with an unrecognized font_style are silently skipped,
    matching the original if/elif chain.
    """
    prefixes = {
        "title1": "Titre 1 ",
        "title2": "---> Titre 2 ",
        "title3": "-------> Titre 3 ",
        "title4": "-----------> Titre 4 ",
        "content": "---------------> ",
        "code": "----------code------------> ",
        "table": "----------table------------> ",
    }
    for paragraph in paragraphs:
        prefix = prefixes.get(paragraph.font_style)
        if prefix is not None:
            print(f"{prefix}{paragraph.text}")
22
+
23
def pretty_print_container_structure(container):
    """Recursively print a container tree, one '-' per nesting level."""
    dashes = '-' * container.level
    if container.title:
        print(f"{dashes} {container.title.text}")
    for paragraph in container.paragraphs:
        print(f"{dashes} {paragraph.text}")
    for child in container.children:
        pretty_print_container_structure(child)
30
+
31
def print_all_block_indexes(container):
    """Print 'index : title' for every block held by *container* (empty title for falsy)."""
    for block in container.blocks:
        title = block.title if block.title else ""
        print(f'{block.index} : {title}')
src/tools/reader_html.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pyquery import PyQuery as pq
2
+ from src.model.paragraph import Paragraph
3
+ from bs4 import BeautifulSoup
4
+ from src.tools.readers_pdf import Reader_illumio
5
+ from src.tools.table_converter import table_converter
6
+
7
class Reader_HTML:
    """Parse an HTML file into a list of Paragraph objects.

    Tables (th/td runs) are collapsed into one "table" paragraph via
    table_converter, and list items (ul/ol/li) into one "list" paragraph.
    """
    def __init__(self, path):
        self.path = path
        self.paragraphs = self.read_html_2(path)

    #without beautifulsoup but doesn't work fine
    def read_html(self, path):
        """Legacy pyquery-based extraction; kept for reference only."""
        with open(path, 'r') as html_file:
            doc = pq(html_file.read())

        # Remove script and style elements
        doc('script').remove()
        doc('style').remove()

        paragraphs = []
        for index, elem in enumerate(doc('*')):
            # Check if the element is a leaf (does not contain other elements)
            if not pq(elem).find('*'):
                text = pq(elem).text().strip()
                if text:
                    paragraphs.append(Paragraph(text=text, font_style=elem.tag, id_ = index, page_id=1))
        return paragraphs

    #with beautifulsoup
    def read_html_2(self, path):
        """Extract paragraphs with BeautifulSoup, merging tables and lists.

        :param path: path of the HTML file to parse.
        :return: list of Paragraph objects, rearranged per-paragraph.
        """
        # FIX: use a context manager so the file handle is always closed
        # (the previous version leaked the open file object).
        with open(path, "r") as html_file:
            reader = html_file.read()
        # Creating a BeautifulSoup object and specifying the parser
        S = BeautifulSoup(reader, 'html.parser')
        # Drop non-content tags before extraction.
        for tag in S(['style', 'script', 'footer', 'header', 'nav', 'aside', 'form']):
            tag.decompose()

        # Get all elements that do not contain other elements
        leaf_elements = [elem for elem in S.body.descendants if elem.name is not None and not elem.find_all()]
        paragraphs = []
        for index, elem in enumerate(leaf_elements):
            text = elem.get_text(strip=True, separator='\n')
            if text:
                p = Paragraph(text=text, font_style=elem.name, id_ = index, page_id=1)
                paragraphs.append(p)
        paragraphs = self.concatenate_paragraphs_with_same_font_style(paragraphs)
        paragraphs = [p.rearrange_paragraph() for p in paragraphs]
        return paragraphs

    def concatenate_paragraphs_with_same_font_style(self, paragraphs: list):
        """Merge consecutive same-style paragraphs; fold th/td into tables and li into lists."""
        i = 0
        while i < len(paragraphs)-1:
            if paragraphs[i].font_style == "th":
                paragraphs = self.create_table(paragraphs, i)
                i += 1
            elif paragraphs[i].font_style == "li":
                paragraphs, i = self.create_list(paragraphs, i)
                i += 1
            elif paragraphs[i].font_style == paragraphs[i+1].font_style:
                paragraphs[i].text += "\n" + paragraphs[i+1].text
                paragraphs.pop(i+1)
            else:
                i += 1
        return paragraphs

    def create_table(self, paragraphs, i: int):
        """Collapse a run of th/td paragraphs starting at *i* into one table paragraph."""
        table = []
        titles = []
        content = []
        # Header row: consume consecutive <th> cells.
        while i < len(paragraphs) and paragraphs[i].font_style == "th":
            titles.append(paragraphs[i].text)
            paragraphs.pop(i)
        table.append(titles)
        length = len(titles)
        temp = 0
        # Body rows: <td> cells are grouped `length` per row.
        while i < len(paragraphs) and paragraphs[i].font_style == "td":
            if temp == length:
                temp = 0
                content.append(paragraphs[i].text)
                table.append(content)
                content = []
            else:
                content.append(paragraphs[i].text)
            paragraphs.pop(i)
            temp += 1
        table.append(content)
        paragraphs.insert(i, Paragraph(table_converter(table), font_style="table", id_=i, page_id=1))
        return paragraphs

    def create_list(self, paragraphs, i: int):
        """Collapse a run of ul/ol/li paragraphs starting at *i* into one list paragraph."""
        list_content = []
        while i < len(paragraphs) and paragraphs[i].font_style in ["ul", "ol", "li"]:
            if paragraphs[i].font_style == "li":
                list_content.append(paragraphs[i].text)
                paragraphs.pop(i)
            elif paragraphs[i].font_style in ["ul", "ol"]:
                # Nested list: recurse and keep the sub-list as one entry.
                sublist, i = self.create_list(paragraphs, i+1)
                list_content.append(sublist)
            else:
                i += 1
        list_paragraph = Paragraph(text=self.format_list(list_content), font_style="list", id_=i, page_id=1)
        paragraphs.insert(i, list_paragraph)
        return paragraphs, i

    def format_list(self, list_content):
        """Render a (possibly nested) list of strings as numbered lines."""
        res = ""
        for i in range(len(list_content)):
            if type(list_content[i]) == str:
                res += f"{i+1}. {list_content[i]}\n"
            else:
                res += f"{i+1}. {self.format_list(list_content[i])}\n"
        return res
117
+
118
+
src/tools/reader_pdf_tools.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def flatten(S):
    """Return a flat list of every non-list element of nested list *S*.

    Iterative depth-first traversal. The recursive original was O(n^2)
    because of repeated list concatenation and could hit Python's
    recursion limit on deeply nested input.
    """
    flat = []
    stack = [iter(S)]
    while stack:
        try:
            item = next(stack[-1])
        except StopIteration:
            stack.pop()
            continue
        if isinstance(item, list):
            stack.append(iter(item))
        else:
            flat.append(item)
    return flat
7
+
8
def keep_int_and_floats_in_list(S):
    """Remove every string from *S* in place and return it.

    The original popped elements one at a time (O(n^2)); slice
    assignment keeps the observable in-place mutation with one pass.
    """
    S[:] = [item for item in S if not isinstance(item, str)]
    return S
16
+
17
def group_formats(formats : list) -> list:
    """Group sorted font sizes into runs of close values.

    A new group starts whenever a size differs from the first size of the
    current group by more than 0.20. (The original comment claimed 0.5;
    the code has always used 0.20.)

    :param formats: font sizes (ints/floats), in any order.
    :return: list of lists of sizes, ascending; [] for empty input.
    """
    if not formats:
        # FIX: the original crashed on an empty list (formats[0]).
        return []
    formats = sorted(formats)
    groups = []
    current_group = []
    current_format = formats[0]
    for format in formats:
        if format - current_format <= 0.20:
            current_group.append(format)
        else:
            groups.append(current_group)
            current_group = [format]
            current_format = format
    groups.append(current_group)
    return groups
32
+
33
def find_max_list(lists):
    """Return the index of the LAST longest sub-list in *lists*.

    (The parameter was renamed from `list`, which shadowed the builtin;
    all call sites in this package call it positionally.)
    """
    lengths = [len(sub) for sub in lists]
    return len(lists) - 1 - lengths[::-1].index(max(lengths))
36
+
37
def find_good_key_in_dict(mapping : dict, value) -> str:
    """Return the first key whose value collection contains *value*, else None.

    (The parameter was renamed from `dict`, which shadowed the builtin;
    iterating .items() also avoids the per-key double lookup.)
    """
    for key, values in mapping.items():
        if value in values:
            return key
    return None
42
+
43
def create_dict_and_assign_styles_from_format(formats : list) -> dict:
    """Map style names ("content", "titleN", ...) to sorted font-size lists.

    The group with the most members (by find_max_list) is assumed to be
    body content; every group before it is merged into it, and each
    remaining (larger-size) group becomes a title level, with the largest
    sizes getting "title1".

    NOTE(review): this mutates `formats` while iterating over a slice of
    it, and the shifting index (`content_format_index - i`) appears to
    rely on those deletions — behavior preserved exactly as written.
    """
    #create a dictionary with the format as key and the style as value
    styles = {}
    content_format_index = find_max_list(formats)
    i = 0
    for l in formats[:content_format_index]:
        # Fold each smaller-size group into the (shifting) content group.
        formats[content_format_index - i] += l
        del formats[formats.index(l)]
        i+=1
    number_of_styles = len(formats)
    styles["content"] = sorted(list(set(formats[0])))
    for i in range(1,len(formats)):
        styles["title"+str(number_of_styles-i)] = sorted(list(set(formats[i])))
    return styles
src/tools/reader_word.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import docx
2
+ import os
3
+ # sys.path.append('path to app')
4
+ # import docx
5
+ # import os
6
+ # import sys
7
+
8
+ from src.model.paragraph import Paragraph
9
+
10
class WordReader:
    """Read a .docx file and expose its content as Paragraph objects."""

    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_word_paragraphs()

    def get_word_paragraphs(self):
        """
        Fetches paragraphs from a Word document.

        Returns:
            list: List of Paragraph objects from the document.

        Raises:
            FileNotFoundError: if self.path does not exist.
            ValueError: if the document cannot be opened/parsed.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")

        try:
            doc = docx.Document(self.path)
            paragraphs = self.to_paragraph_objects(doc.paragraphs)  # Convert to Paragraph objects
            return paragraphs
        except Exception as e:
            raise ValueError(f"Error reading the .docx file. Original error: {str(e)}")

    def determine_style(self, paragraph):
        """
        Determines the style of the paragraph based on its attributes.

        Heading styles map to "titleN"; otherwise the runs' fonts are
        inspected (mirroring the PDF reader's size buckets) and the
        default is "content".

        Returns:
            str: Style of the paragraph.
        """
        # Check for heading styles first
        style_name = paragraph.style.name
        if style_name.startswith('Heading 1'):
            return "title1"
        elif style_name.startswith('Heading 2'):
            return "title2"
        elif style_name.startswith('Heading 3'):
            return "title3"
        elif style_name.startswith('Heading 4'):
            return "title4"
        elif style_name.startswith('Heading 5'):
            return "title5"

        # If not a heading, check the runs within the paragraph
        for run in paragraph.runs:
            font = run.font
            fontname = font.name
            size = font.size

            if fontname == "XFQKGD+Consolas":
                return "code"
            # FIX: the original referenced size_in_points even when
            # font.size was None, raising NameError (or silently reusing a
            # stale value from a previous run). Guard the size check.
            size_in_points = size.pt if size is not None else None
            if fontname == "Wingdings-Regular" or (
                size_in_points is not None and 9 <= size_in_points < 11.5
            ):
                return "content"
        # If none of the above conditions match, default to 'content'
        return "content"

    def to_paragraph_objects(self, doc_paragraphs):
        """
        Convert docx paragraphs to Paragraph objects for further processing.
        """
        paragraph_objects = []
        for idx, paragraph in enumerate(doc_paragraphs):
            style = self.determine_style(paragraph)
            # Assuming page_id is always 1 for simplicity, change as needed.
            p_obj = Paragraph(text=paragraph.text, font_style=style, id_=idx, page_id=1)
            paragraph_objects.append(p_obj)
        paragraphs = self.rearrange_paragraphs(paragraph_objects)
        return paragraphs

    def rearrange_paragraphs(self, paragraphs : list):
        """Normalize each Paragraph in place via its rearrange_paragraph() hook."""
        i = 0
        while i < len(paragraphs):
            paragraphs[i] = paragraphs[i].rearrange_paragraph()
            i += 1
        return paragraphs

    def display_paragraphs(self):
        """
        Prints the paragraphs from the document to the console.
        """
        for paragraph in self.paragraphs:
            print(paragraph.text)
            print('-' * 40)  # separator for clarity
103
+
104
+ # if __name__ == '__main__':
105
+ # reader = WordReader("Illumio_Core_REST_API_Developer_Guide_23.3.docx")
106
+ # reader.display_paragraphs()
src/tools/readers_pdf.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import PyPDF2
3
+ # To analyze the PDF layout and extract text
4
+ from pdfminer.high_level import extract_pages, extract_text
5
+ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
6
+ # To extract text from tables in PDF
7
+ import pdfplumber
8
+ # To extract the images from the PDFs
9
+ from PIL import Image
10
+ from pdf2image import convert_from_path
11
+ # To perform OCR to extract text from images
12
+ import pytesseract
13
+ # To remove the additional created files
14
+ import os
15
+ import pdfplumber as pdfp
16
+
17
+ from src.model.paragraph import Paragraph
18
+ from src.tools.table_converter import table_converter
19
+ from src.tools.reader_pdf_tools import *
20
+ import gradio as gr
21
+
22
+
23
def get_style_of_line(size : float, fontname : str):
    """Map a character's font size and name to a coarse style label.

    The boundaries mirror the original elif chain exactly — including the
    uncovered gaps (e.g. (12.7, 12.8) and (18.5, 19]) and sizes >= 30,
    all of which yield "unknown".
    """
    if fontname == "XFQKGD+Consolas":
        return "code"
    if fontname == "CRRYJU+Wingdings-Regular" or (9 <= size < 11.5):
        return "content"
    if 11.5 <= size <= 12.7:
        return "title5"
    if 12.8 <= size <= 13.5:
        return "title4"
    if 13.5 < size <= 15.5:
        return "title3"
    if 15.5 < size <= 18.5:
        return "title2"
    if 19 < size < 30:
        return "title1"
    return "unknown"
40
+
41
class Reader:
    """Extract styled Paragraph objects from a PDF file.

    Pipeline per page: pdfminer yields layout elements with per-character
    font data; pdfplumber detects and extracts tables; figure regions are
    cropped with PyPDF2, rasterized with pdf2image and OCR'd with
    pytesseract. Afterwards the font sizes observed across the whole
    document are clustered into style buckets ("content", "title1", ...)
    used to tag each paragraph.
    """
    def __init__(self, path,actual_first_page_=0, include_images=True):
        # path: PDF file path.
        # actual_first_page_: 1-based page to start extraction at.
        # include_images: when True, OCR text inside figures as well.
        self.path = path
        self.paragraphs = self.pdf_manager(path, actual_first_page_, include_images=include_images)


    def most_occuring_fonts(self, line_formats : list):
        """Drop font sizes seen fewer than 3 times in the line; keep all font names."""
        if line_formats != []:
            min_freq = 3
            font_size_freq = {i: line_formats.count(i) for i in set(line_formats) if isinstance(i, float)}
            most_occuring_font_sizes = [size for size, freq in font_size_freq.items() if freq >= min_freq]
            line_formats = [i for i in line_formats if i in most_occuring_font_sizes or isinstance(i, str)]
        return line_formats


    def text_extraction(self,element):
        """Return (line_text, unique formats) for one pdfminer text element."""
        # Extracting the text from the in line text element
        line_text = element.get_text()
        # Find the formats of the text
        # Initialize the list with all the formats appeared in the line of text
        line_formats = []
        for text_line in element:
            if isinstance(text_line, LTTextContainer):
                # Iterating through each character in the line of text
                for character in text_line:
                    if isinstance(character, LTChar):
                        # Append the font name of the character
                        line_formats.append(character.fontname)
                        # Append the font size of the character
                        line_formats.append(character.size)
        #find the most occuring font size and keep it. If there are more than one, keep all of them.
        line_formats = self.most_occuring_fonts(line_formats)
        # Find the unique font sizes and names in the line and delete the None values
        format_per_line = list(set(line_formats))
        # Return a tuple with the text in each line along with its format
        return (line_text, format_per_line)

    # Extracting tables from the page
    def extract_table(self, pdf_path, page_num, table_num):
        """Return table `table_num` of page `page_num` as nested lists (pdfplumber)."""
        # Open the pdf file
        pdf = pdfplumber.open(pdf_path)
        # Find the examined page
        table_page = pdf.pages[page_num]
        # Extract the appropriate table
        table = table_page.extract_tables()[table_num]

        return table

    # Create a function to check if the element is in any tables present in the page
    def is_element_inside_any_table(self, element, page ,tables):
        """True if the element's bbox lies inside any of the given table bboxes."""
        x0, y0up, x1, y1up = element.bbox
        # Change the cordinates because the pdfminer counts from the botton to top of the page
        y0 = page.bbox[3] - y1up
        y1 = page.bbox[3] - y0up
        for table in tables:
            tx0, ty0, tx1, ty1 = table.bbox
            if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
                return True
        return False

    # Function to find the table for a given element
    def find_table_for_element(self, element, page ,tables):
        """Return the index of the table whose bbox contains the element, or None."""
        x0, y0up, x1, y1up = element.bbox
        # Change the cordinates because the pdfminer counts from the botton to top of the page
        y0 = page.bbox[3] - y1up
        y1 = page.bbox[3] - y0up
        for i, table in enumerate(tables):
            tx0, ty0, tx1, ty1 = table.bbox
            if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
                return i  # Return the index of the table
        return None

    # Create a function to crop the image elements from PDFs
    def crop_image(self, element, pageObj):
        """Crop the figure's bbox out of the page into 'cropped_image.pdf'."""
        # Get the coordinates to crop the image from PDF
        [image_left, image_top, image_right, image_bottom] = [element.x0,element.y0,element.x1,element.y1]
        # Crop the page using coordinates (left, bottom, right, top)
        pageObj.mediabox.lower_left = (image_left, image_bottom)
        pageObj.mediabox.upper_right = (image_right, image_top)
        # Save the cropped page to a new PDF
        cropped_pdf_writer = PyPDF2.PdfWriter()
        cropped_pdf_writer.add_page(pageObj)
        # Save the cropped PDF to a new file
        with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
            cropped_pdf_writer.write(cropped_pdf_file)

    # Create a function to convert the PDF to images
    def convert_to_images(self, input_file,):
        """Rasterize the first page of `input_file` to 'PDF_image.png'."""
        images = convert_from_path(input_file)
        image = images[0]
        output_file = 'PDF_image.png'
        image.save(output_file, 'PNG')

    # Create a function to read text from images
    def image_to_text(self, image_path):
        """OCR the image at `image_path` and return the recognized text."""
        # Read the image
        img = Image.open(image_path)
        # Extract the text from the image
        text = pytesseract.image_to_string(img)
        return text

    def pdf_manager(self, pdf_path, actual_first_page=0, include_images=True):
        """Drive the full extraction and return the list of Paragraphs.

        Also appends the derived style buckets to 'styles.txt' for
        inspection.

        NOTE(review): pdfplumber.open() is called once per page inside the
        loop and never closed — consider opening the document once.
        """
        # create a PDF file object
        pdfFileObj = open(pdf_path, 'rb')
        # create a PDF reader object
        pdfReaded = PyPDF2.PdfReader(pdfFileObj)
        number_of_pages = len(pdfReaded.pages)
        # Create the dictionary to extract text from each image
        text_per_page = {}
        # Create a boolean variable for image detection
        image_flag = False
        actual_first_page = int(actual_first_page)
        if actual_first_page > number_of_pages:
            gr.Warning("The number of pages you want to skip is greater than the number of pages in the document. We will extract all the pages.")
            page_numbers = None
        else:
            page_numbers = [i for i in range(actual_first_page - 1,number_of_pages)]
        # We extract the pages from the PDF
        for pagenum, page in enumerate(extract_pages(pdf_path,page_numbers=page_numbers)):
            # Initialize the page object
            pagenum = page_numbers[pagenum] if page_numbers else pagenum
            pageObj = pdfReaded.pages[pagenum]
            # Initialize the variables needed for the text extraction from the page
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []
            # Initialize the number of the examined tables
            table_in_page= -1
            # Open the pdf file
            pdf = pdfplumber.open(pdf_path)
            # Find the examined page
            page_tables = pdf.pages[pagenum]
            # Find the number of tables in the page

            tables = page_tables.find_tables()
            if len(tables)!=0:
                table_in_page = 0

            # Extracting the tables of the page
            for table_num in range(len(tables)):
                # Extract the information of the table
                table = self.extract_table(pdf_path, pagenum, table_num)
                # Convert the table information in structured string format
                table_string = table_converter(table)
                # Append the table string into a list
                text_from_tables.append(table_string)

            # Find all the elements
            page_elements = [(element.y1, element) for element in page._objs]
            # Sort all the element as they appear in the page
            page_elements.sort(key=lambda a: a[0], reverse=True)


            # Find the elements that composed a page
            for i,component in enumerate(page_elements):
                # Extract the element of the page layout
                element = component[1]

                # Check the elements for tables
                if table_in_page == -1:
                    pass
                else:
                    if self.is_element_inside_any_table(element, page ,tables):
                        table_found = self.find_table_for_element(element,page ,tables)
                        if table_found == table_in_page and table_found != None:
                            page_content.append(text_from_tables[table_in_page])
                            page_text.append('table')
                            line_format.append('table')
                            table_in_page+=1
                        # Pass this iteration because the content of this element was extracted from the tables
                        continue

                if not self.is_element_inside_any_table(element,page,tables):

                    # Check if the element is text element
                    if isinstance(element, LTTextContainer):
                        # Use the function to extract the text and format for each text element
                        (line_text, format_per_line) = self.text_extraction(element)
                        # Append the text of each line to the page text
                        page_text.append(line_text)
                        # Append the format for each line containing text
                        line_format.append(format_per_line)
                        page_content.append(line_text)


                    #Check the elements for images
                    if include_images:
                        if isinstance(element, LTFigure):
                            # Crop the image from PDF
                            self.crop_image(element, pageObj)
                            # Convert the croped pdf to image
                            self.convert_to_images('cropped_image.pdf')
                            # Extract the text from image
                            image_text = self.image_to_text('PDF_image.png')
                            text_from_images.append(image_text)
                            page_content.append(image_text)
                            # Add a placeholder in the text and format lists
                            page_text.append('image')
                            line_format.append('image')
                            # Update the flag for image detection
                            image_flag = True

            # Create the key of the dictionary
            dctkey = 'Page_'+str(pagenum)
            # Add the list of list as value of the page key
            text_per_page[dctkey]= [page_text, line_format, text_from_images, text_from_tables, page_content]


        # Close the pdf file object
        pdfFileObj.close()

        # Create a list of formats for all the pages
        formats = []
        for p in text_per_page.values():
            formats.append(p[1])

        #flatten the list of lists
        formats = flatten(formats)

        #keep only the font sizes in the list
        formats = keep_int_and_floats_in_list(formats)

        #group the formats in lists of similar formats
        grouped_formats = group_formats(formats)

        #create a dictionary with the format as key and the style as value
        styles = create_dict_and_assign_styles_from_format(grouped_formats)

        #display the result on a separate file as a JSON with some indentation for better visualization
        with open(file="styles.txt", mode='a') as fp:
            if fp.tell() == 0:
                fp.write('Document title: ' + pdf_path.split('/')[-1] + '\n') if '/' in pdf_path else fp.write('Document title: ' + pdf_path.split('\\')[-1] + '\n')
            else:
                fp.write('\nDocument title: ' + pdf_path.split('/')[-1] + '\n') if '/' in pdf_path else fp.write('\nDocument title: ' + pdf_path.split('\\')[-1] + '\n')
            json.dump(styles, fp, indent=4)

        # Delete the additional files created if image is detected
        if image_flag:
            os.remove('cropped_image.pdf')
            os.remove('PDF_image.png')

        #beginning of the paragraph extraction
        paragraphs = []
        for index, page in enumerate(text_per_page.values()):
            content_format = page[1]
            j = 0
            while j+1 < len(content_format):
                actual_format = content_format[j]
                n_of_fontsizes = len(list(i for i in actual_format if isinstance(i, int) or isinstance(i, float)))
                if n_of_fontsizes > 1:
                    # Several sizes on the line: classify by the largest one.
                    actual_format = max(keep_int_and_floats_in_list(actual_format))
                    actual_format = find_good_key_in_dict(styles,actual_format)
                elif n_of_fontsizes == 1:
                    actual_format = keep_int_and_floats_in_list(actual_format)[0]
                    actual_format = find_good_key_in_dict(styles,actual_format)
                elif n_of_fontsizes == 0 and actual_format == "table":
                    actual_format = "table"
                else:
                    actual_format = "content"
                #try to find the good format if the current result seems wrong
                #changes depending on the document
                if len(page[4][j]) > 150 and "title" in actual_format:
                    actual_format = "content"
                paragraph = Paragraph(text=page[4][j],font_style=actual_format,id_=j,page_id=index)
                paragraphs.append(paragraph)
                j+=1

        paragraphs = self.concatenate_paragraphs(paragraphs, pdf_path.split('/')[-1]) if '/' in pdf_path else self.concatenate_paragraphs(paragraphs, pdf_path.split('\\')[-1])
        return paragraphs

    def concatenate_paragraphs(self, paragraphs, doc_title):
        """Merge consecutive paragraphs and filter headers/footers.

        Contains document-specific rules for the Illumio REST API guide
        (page-header skipping and boilerplate line removal).
        """
        concatenated_paragraphs = []
        i = 0
        actual_page_id = paragraphs[0].page_id
        while i < len(paragraphs):
            p = paragraphs[i]
            # Skip blank paragraphs and known boilerplate lines.
            if p.blank or "REST API Developer Guide 23.3" in p.text or "x! illumio" in p.text:
                i+=1
                continue
            # On a page break in the Illumio guide, skip the 2-line header.
            if (p.page_id != actual_page_id) and doc_title == "Illumio_Core_REST_API_Developer_Guide_23.3.pdf" and (not p.font_style == "table" and not "title" in p.font_style):
                i+=2
                actual_page_id = p.page_id
                continue
            if not concatenated_paragraphs:
                concatenated_paragraphs.append(p)
            elif p.font_style != concatenated_paragraphs[-1].font_style:
                # Tables and surrounding content are merged together.
                if (p.font_style == "table" and concatenated_paragraphs[-1].font_style == "content") \
                    or (p.font_style == "content" and concatenated_paragraphs[-1].font_style == "table"):
                    concatenated_paragraphs[-1].text += '\n' + p.text
                else:
                    concatenated_paragraphs.append(p)
            else:
                if "title" in p.font_style:
                    # Consecutive title lines become one "a : b" title.
                    concatenated_paragraphs[-1].text += ' : ' + p.text
                    concatenated_paragraphs[-1].text = concatenated_paragraphs[-1].text.replace('\n','').replace('\r','')
                else:
                    concatenated_paragraphs[-1].text += '\n' + p.text
            i+=1
        return concatenated_paragraphs
342
+
343
+
344
class Reader_illumio:
    """Line-based PDF reader (pdfplumber) specialized for the Illumio REST API guide."""
    def __init__(self, path):
        self.path = path
        self.paragraphs = self.get_pdf_paragraphs(path)

    def skip_header(self, dictionary):
        """Return the index of the first content line of a page.

        For the Illumio guide, skip the 2-line page header unless the page
        starts with a title-sized character (size in (19, 30)).
        """
        i = 0
        if "Illumio_Core_REST_API_Developer_Guide_23.3" in self.path and not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
            i+=2
        return i


    def get_pdf_paragraphs(self,path):
        """Build Paragraph objects from the document's text lines and tables.

        Consecutive lines with the same font size/name are merged into one
        paragraph; lines inside a detected table are replaced by a single
        "table" paragraph rendered with table_converter.
        """
        pdf_to_read = self.extract_all_lines_from_the_doc(path)
        paragraphs = []
        j = 0
        while j < len(pdf_to_read):
            dictionary = pdf_to_read[j]["content"]
            tables = pdf_to_read[j]["tables"]
            i = self.skip_header(dictionary)
            table_count = 0
            while i < len(dictionary):
                # print(f"{dictionary[i]['chars'][0]}")
                if(dictionary[i]["text"].startswith("RESTAPIDeveloperGuide")):
                    # Boilerplate running header — skip.
                    i+=1
                    continue
                if (self.check_if_already_in_table(dictionary[i]['chars'][0],tables) == False):
                    p = Paragraph(dictionary[i]["text"],font_style=get_style_of_line(dictionary[i]["chars"][0]["size"],dictionary[i]["chars"][0]["fontname"]),id_=i,page_id=pdf_to_read[j]["page_number"])
                    if(i != len(dictionary)-1):
                        # Merge following lines that share the same font size and name.
                        while((dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][-1]["size"] and dictionary[i+1]["chars"][0]["fontname"] == dictionary[i]["chars"][-1]["fontname"]) and self.check_if_already_in_table(dictionary[i+1]['chars'][0],tables) == False):
                            p.text += " " + dictionary[i+1]["text"]
                            i += 1
                    else:
                        p.text = dictionary[i]["text"]
                    #print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
                    i += 1
                    # print(f'{p.page_id} : {p.font_style} ->>>>> {p.text}')
                    paragraphs.append(p)
                else:
                    # Line belongs to a table: emit the whole table once and
                    # jump past its remaining lines.
                    p = Paragraph(table_converter(tables[table_count].extract()),font_style="table",id_=i,page_id=pdf_to_read[j]["page_number"])
                    paragraphs.append(p)
                    i = self.skip_out_table(dictionary,i,tables[table_count])
                    table_count += 1
            j += 1
        paragraphs = self.rearrange_paragraphs(paragraphs)
        return paragraphs

    def rearrange_paragraphs(self, paragraphs : [Paragraph]):
        """Normalize each Paragraph in place via its rearrange_paragraph() hook."""
        #associate paragraphs with the same font style
        i = 0
        while i < len(paragraphs):
            paragraphs[i] = paragraphs[i].rearrange_paragraph()
            i+=1
        return paragraphs

    def extract_all_lines_from_the_doc(self,path):
        """Return per-page dicts with page number, text lines and tables.

        For the Illumio guide, the first 8 pages (table of contents) are
        skipped and page numbers are re-based accordingly.
        """
        lines_of_doc = []
        with open(path, 'rb') as f:
            reader = pdfp.PDF(f)
            if "Illumio_Core_REST_API_Developer_Guide_23.3" in path:
                skip_table_of_contents = reader.pages[8:]
                j = 0
                while j < len(skip_table_of_contents):
                    lines_of_doc.append({"page_number": j+9, "content": skip_table_of_contents[j].extract_text_lines(), "tables": skip_table_of_contents[j].find_tables()})
                    j += 1
            else:
                for page in reader.pages:
                    lines_of_doc.append({"page_number": page.page_number, "content": page.extract_text_lines(), "tables": page.find_tables()})
        return lines_of_doc

    def check_if_already_in_table(self,line,tables):
        """True if the line's vertical position falls inside any table's bbox."""
        for table in tables:
            if table.bbox[1] <= line["top"] <= table.bbox[3]:
                return True
        return False

    def skip_out_table(self,dictionary,index,table):
        """Advance the line index past every line belonging to *table*."""
        i = index
        while i < len(dictionary):
            if self.check_if_already_in_table(dictionary[i]['chars'][0],tables=[table]) == True:
                i += 1
            else:
                break
        return i
428
+
src/tools/retriever.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.model.block import Block
2
+ from src.model.doc import Doc
3
+ from src.tools.llm import LlmAgent
4
+ import gradio as gr
5
+
6
class Retriever:
    """Wraps a Chroma collection.

    When a Doc is supplied, every block is summarised by the LLM agent and
    stored in the collection (oversized blocks are split first); when no Doc
    is supplied, the Retriever simply queries the existing collection.
    """

    # Blocks longer than this are split before summarisation so each chunk
    # fits the summariser's context.
    _MAX_BLOCK_SIZE = 4500

    def __init__(self, doc: Doc = None, collection=None, llmagent: LlmAgent = None):
        self.collection = collection
        if doc is not None:
            gr.Info("Please wait while the database is being created")
            for block in doc.blocks:
                if len(block.content) > self._MAX_BLOCK_SIZE:
                    for sub_block in block.separate_1_block_in_n(max_size=self._MAX_BLOCK_SIZE):
                        self._summarize_and_add(sub_block, doc.title, block.title, llmagent)
                else:
                    self._summarize_and_add(block, doc.title, block.title, llmagent)
            gr.Info(f"The collection {collection.name} has been added to the database")

    def _summarize_and_add(self, block, doc_title, para_title, llmagent):
        """Summarise one block with the LLM agent and store it in the collection."""
        summary = llmagent.summarize_paragraph_v2(prompt=block.content,
                                                  title_doc=doc_title,
                                                  title_para=para_title)
        if "<summary>" in summary:
            # Keep only the text after the model's <summary> tag.
            summary = summary.split("<summary>")[1]
        self.collection.add(
            documents=[summary],
            ids=[block.index],
            metadatas=[block.to_dict()]
        )

    def similarity_search(self, queries: str) -> list:
        """Return the 6 blocks nearest to *queries*, each with its distance set.

        Note: the original annotated the return as ``{}`` (an empty dict
        literal); the method actually returns a list of Block objects.
        """
        res = self.collection.query(query_texts=queries, n_results=6)
        block_dict_sources = res['metadatas'][0]
        distances = res['distances'][0]
        blocks = []
        for bd, d in zip(block_dict_sources, distances):
            b = Block().from_dict(bd)
            b.distance = d
            blocks.append(b)
        return blocks
49
+
src/tools/table_converter.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert an extracted table into a pipe-delimited text format.

def table_converter(table):
    """Render *table* (a list of rows of cells) as plain text.

    Each row becomes ``|cell|cell|...|`` on its own line.  ``None`` cells are
    rendered as the literal string ``'None'`` and line breaks inside wrapped
    cell text are flattened to spaces.  An empty table yields ''.
    """
    rows = []
    for row in table:
        # None -> 'None'; otherwise flatten embedded newlines (a no-op when
        # the cell has none, which the original guarded with an extra branch).
        cleaned_row = ['None' if item is None else item.replace('\n', ' ') for item in row]
        rows.append('|' + '|'.join(cleaned_row) + '|')
    return '\n'.join(rows)
src/view/__pycache__/view.cpython-311.pyc ADDED
Binary file (17.5 kB). View file
 
src/view/view.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.control.control import Chatbot
3
+ from chromadb.utils import embedding_functions
4
+ import os
5
+
6
def run(ctrl: Chatbot, config: dict):
    """Build and return the Gradio Q&A interface.

    ctrl   -- application controller: document upload, collection management
              and response generation (see src.control.control.Chatbot).
    config -- view configuration; this function reads the keys 'title'
              (markdown heading) and 'examples' (dict of example questions).
    """
    with gr.Blocks() as qna:
        with gr.Row():
            with gr.Column():
                # Empty spacer column on the left.
                pass

            with gr.Column(scale=10):
                gr.Markdown(config['title'])
                page_start_warning = gr.Markdown("<center>⚠️ If your document starts with a front cover and/or a table of contents, please enter the page number of the ⚠️ first page with real content.<center/>")
                actual_page_start = gr.Number(
                    label="Start page (default = 1)",
                    visible=True,
                    interactive=True,
                    container=True,
                    value=1,
                )

                include_images_btn = gr.Checkbox(
                    label="Analyse text from images. This option is definitely slower, particularly on big documents. (ONLY for .pdf)",
                    value=False,
                    visible=True,
                    container=True,
                )

                input_doc_comp = gr.File(
                    label="Upload a file",
                    scale=1,
                    min_width=100,
                )

                # Chat history; hidden until a document has been loaded.
                histo_text_comp = gr.Chatbot(
                    visible=False,
                    value=[],
                )
                input_text_comp = gr.Textbox(
                    label="",
                    lines=1,
                    visible=False,
                    max_lines=3,
                    interactive=True,
                    placeholder="Posez votre question ici",
                )

                clear_btn = gr.Button("Clear Chat", visible=False)

                input_example_comp = gr.Radio(
                    label="Examples",
                    choices=config['examples'].values(),
                    value="",
                    visible=False,
                )

                # Up to 4 text boxes showing the source passages behind an answer.
                source_text_comp = []
                for i in range(4):
                    source_text_comp.append(gr.Textbox(
                        lines=4,
                        max_lines=4,
                        interactive=False,
                        visible=False,
                    ))
                upload_another_doc_btn = gr.Button("Upload another document", visible=False)

                # Embedding function used when (re)opening a Chroma collection.
                open_ai_embedding = embedding_functions.OpenAIEmbeddingFunction(api_key=os.environ['OPENAI_API_KEY'], model_name="text-embedding-ada-002")
            with gr.Column(scale=7):
                collections_list = gr.Radio(choices=[a.name for a in ctrl.client_db.list_collections()],
                                            label="Current collections in the database",
                                            visible=True,
                                            info="Choose a collection to query."
                                            )
                delete_database_btn = gr.Button("Delete current collection", visible=False)

        def input_doc_fn(input_doc_, include_images_, actual_page_start_):
            """Ingest an uploaded document and switch the UI into chat mode."""
            result = ctrl.upload_doc(input_doc_, include_images_, actual_page_start_)
            if result == True:
                return {
                    input_doc_comp: gr.update(visible=False),
                    input_text_comp: gr.update(visible=True),
                    input_example_comp: gr.update(visible=True),
                    clear_btn: gr.update(visible=True),
                    include_images_btn: gr.update(visible=False, value=include_images_),
                    delete_database_btn: gr.update(visible=True),
                    upload_another_doc_btn: gr.update(visible=True),
                    collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()], value=ctrl.retriever.collection.name),
                    page_start_warning: gr.update(visible=False),
                    actual_page_start: gr.update(visible=False),
                }
            else:
                # Unsupported extension: keep the upload UI and warn the user.
                gr.Warning("File extension not supported. Only .docx, .pdf and .html are supported.")
                return {
                    input_doc_comp: gr.update(visible=True),
                    input_text_comp: gr.update(visible=False),
                    input_example_comp: gr.update(visible=False),
                    clear_btn: gr.update(visible=False),
                    include_images_btn: gr.update(visible=True, value=include_images_),
                    page_start_warning: gr.update(visible=True),
                    actual_page_start: gr.update(visible=True, value=1),
                }

        def input_file_clear():
            """Reset the whole UI back to the initial upload state."""
            update_ = {
                input_doc_comp: gr.update(visible=True, value=None),
                clear_btn: gr.update(visible=False),
                input_text_comp: gr.update(value='', visible=False),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=False),
                include_images_btn: gr.update(visible=True),
                upload_another_doc_btn: gr.update(visible=False),
                delete_database_btn: gr.update(visible=True),
                page_start_warning: gr.update(visible=True),
                actual_page_start: gr.update(visible=True, value=1),
                collections_list: gr.update(value=None, choices=[a.name for a in ctrl.client_db.list_collections()]),
            }
            for i in range(4):
                # NOTE(review): placeholder value 'hello' is never shown
                # (the boxes are hidden) — presumably a leftover debug value.
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        def input_text_fn1(input_text_, histo_text_):
            """Phase 1 of a question: echo it into the history, hide sources."""
            histo_text_.append((input_text_, None))
            update_ = {
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False,),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def input_text_fn2(input_text_, histo_text_):
            """Phase 2: query the controller and display answer + sources."""
            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
            histo_text_[-1] = (input_text_, answer)
            update_ = {
                histo_text_comp: gr.update(value=histo_text_),
                input_text_comp: gr.update(value=''),
            }
            # Show at most 3 sources, skipping immediate duplicates by index.
            for i in range(min(len(sources), 3)):
                s = sources[i]
                if i != 0:
                    prev = sources[i - 1]
                    if prev.index == s.index:
                        continue
                source_label = f'{s.index} {s.title} score = {s.distance_str}'
                source_text = s.content
                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
            return update_

        def input_example_fn(input_example_, histo_text_):
            """Handle a click on an example question (same phase-1 flow)."""
            histo_text_.append((input_example_, None))
            update_ = {
                input_text_comp: gr.update(value=input_example_),
                histo_text_comp: gr.update(visible=True, value=histo_text_),
                input_example_comp: gr.update(visible=False, value=''),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False)
            return update_

        def clear_fn():
            """Clear the chat but keep the current collection loaded."""
            update_ = {
                input_text_comp: gr.update(value=''),
                histo_text_comp: gr.update(value='', visible=False),
                input_example_comp: gr.update(value='', visible=True),
                upload_another_doc_btn: gr.update(visible=True),
            }
            for i in range(4):
                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
            return update_

        def list_all_chroma_collections():
            """Refresh the radio listing the collections in the database."""
            update = {
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
            }
            return update

        def change_collection(collection_name):
            """Switch the retriever to another existing collection."""
            ctrl.retriever.collection = ctrl.client_db.get_collection(collection_name, embedding_function=open_ai_embedding)
            return {
                delete_database_btn: gr.update(visible=True),
                input_doc_comp: gr.update(visible=False, value=None),
                input_text_comp: gr.update(visible=True, value=''),
                input_example_comp: gr.update(visible=True),
                clear_btn: gr.update(visible=True),
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
                include_images_btn: gr.update(visible=False),
                histo_text_comp: gr.update(visible=False, value=''),
                upload_another_doc_btn: gr.update(visible=True),
                actual_page_start: gr.update(visible=False),
                page_start_warning: gr.update(visible=False),
            }

        def delete_curr_database():
            """Delete the active collection and return to the upload state."""
            ctrl.client_db.delete_collection(ctrl.retriever.collection.name)
            gr.Info(f"Collection {ctrl.retriever.collection.name} deleted from the database")
            return {
                delete_database_btn: gr.update(visible=False),
                input_doc_comp: gr.update(visible=True, value=None),
                input_text_comp: gr.update(visible=False, value=''),
                input_example_comp: gr.update(visible=False),
                clear_btn: gr.update(visible=False),
                collections_list: gr.update(choices=[a.name for a in ctrl.client_db.list_collections()]),
                include_images_btn: gr.update(visible=True),
                histo_text_comp: gr.update(visible=False, value=''),
                upload_another_doc_btn: gr.update(visible=False),
                actual_page_start: gr.update(visible=True, value=1),
                page_start_warning: gr.update(visible=True),
            }

        # --- Event wiring.  Each outputs list must match the keys of the
        # --- update dict returned by the corresponding handler.
        upload_another_doc_btn.click(input_file_clear,
                                     inputs=None,
                                     outputs=[collections_list, page_start_warning, actual_page_start, input_doc_comp, input_text_comp, input_example_comp, clear_btn, include_images_btn, histo_text_comp, delete_database_btn, upload_another_doc_btn, source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

        delete_database_btn.click(delete_curr_database,
                                  inputs=None,
                                  outputs=[page_start_warning, actual_page_start, delete_database_btn, input_doc_comp, input_text_comp, input_example_comp, clear_btn, collections_list, include_images_btn, histo_text_comp, upload_another_doc_btn])

        collections_list.input(change_collection,
                               inputs=[collections_list],
                               outputs=[actual_page_start, page_start_warning, collections_list, input_text_comp, input_example_comp, clear_btn, include_images_btn, histo_text_comp, input_doc_comp, delete_database_btn, upload_another_doc_btn])

        input_doc_comp \
            .upload(input_doc_fn,
                    inputs=[input_doc_comp, include_images_btn, actual_page_start],
                    outputs=[page_start_warning, actual_page_start, input_doc_comp, input_text_comp, upload_another_doc_btn,
                             input_example_comp, include_images_btn, clear_btn, histo_text_comp, delete_database_btn, collections_list, source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(list_all_chroma_collections,
                  inputs=None,
                  outputs=[collections_list])

        input_doc_comp \
            .clear(input_file_clear,
                   inputs=None,
                   outputs=[page_start_warning, actual_page_start, input_doc_comp, clear_btn, upload_another_doc_btn, input_text_comp, histo_text_comp, input_example_comp, include_images_btn, delete_database_btn,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

        # Two-phase submit: show the question immediately, then fill in the answer.
        input_text_comp \
            .submit(input_text_fn1,
                    inputs=[input_text_comp, histo_text_comp],
                    outputs=[histo_text_comp, input_example_comp,
                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        input_example_comp \
            .input(input_example_fn,
                   inputs=[input_example_comp, histo_text_comp],
                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])\
            .then(input_text_fn2,
                  inputs=[input_text_comp, histo_text_comp],
                  outputs=[input_text_comp, histo_text_comp,
                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
        clear_btn.click(clear_fn,
                        inputs=None,
                        outputs=[input_text_comp, histo_text_comp, input_example_comp, upload_another_doc_btn,
                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])

    return qna
styles.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Document title: Defoe_RobinsonCrusoe1.pdf
2
+ {
3
+ "content": [
4
+ 11.0,
5
+ 13.300000000000011,
6
+ 15.999999999999943,
7
+ 15.999999999999986,
8
+ 16.0,
9
+ 16.000000000000007,
10
+ 16.00000000000003
11
+ ],
12
+ "title2": [
13
+ 23.0
14
+ ],
15
+ "title1": [
16
+ 27.600000000000023
17
+ ]
18
+ }