adrien.aribaut-gaudin committed
Commit
2e3ba97
1 Parent(s): 0ac6117

first push from pages 9 to 25

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,170 @@
+config_key.py
+
+
+# Test folder
+data/Test/
+
+# database folder
+database/
+Ilumio_chatbot/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
app.py ADDED
@@ -0,0 +1,52 @@
+import pandas as pd
+import os
+import time
+import chromadb
+
+from config import *
+from src.tools.reader import get_pdf_title_styles
+from src.tools.llm import LlmAgent
+import src.view.view as view
+from src.tools.pretty_print import pretty_print_container_structure, pretty_printer_paragraphs
+from src.model.container import Container
+from src.control.control import Chatbot
+from src.tools.retriever import Retriever
+from src.model.doc import Doc
+from src.tools.test_read import pdf_manager
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+if "OPENAI_API_KEY" not in os.environ:
+    from config_key import OPENAI_API_KEY
+    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
+
+# start_time = time.time()
+
+# doc = Doc(path=content_en_path_real)
+# print("--- %s seconds ---" % (time.time() - start_time))
+# check if the database is empty
+# pdf_manager(pdf_path=content_en_path_real)
+# pretty_printer_paragraphs(doc.container.paragraphs)
+# pretty_print_container_structure(doc.container)
+
+if not os.path.exists("database/"):
+    os.makedirs("database/")
+
+client_db = chromadb.PersistentClient(path="database/")
+
+try:
+    client_db.get_collection(name="illumio_database")
+    llm = LlmAgent(model="TheBloke/Llama-2-7b-Chat-GPTQ")
+    retriever = Retriever(client_db, None, "illumio_database", llmagent=llm)
+except Exception:  # the collection does not exist yet: parse the PDF and build it
+    print("Database is empty")
+    doc = Doc(path=content_en_path_real)
+    llm = LlmAgent(model="TheBloke/Llama-2-7b-Chat-GPTQ")
+    retriever = Retriever(client_db, doc, "illumio_database", llmagent=llm)  # pass the Doc itself: Retriever needs doc.blocks (string ids) and doc.title
+
+
+chat = Chatbot(llm_agent=llm, retriever=retriever)
+
+ilumio_qna = view.run(ctrl=chat, config=view_config)
+
+ilumio_qna.queue().launch()
config.py ADDED
@@ -0,0 +1,13 @@
+content_language = 'en'
+plan_language = 'en'
+content_en_path_real = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
+content_test = "Ilumio_chatbot/data/Test/Test_children.pdf"
+
+examples = {"Question banale?": "Pourquoi le ciel est bleu?",
+            }
+
+
+view_config = {
+    'title': '# Ilumio Q&A',
+    'examples': examples,
+}
data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8821bd9530837f23a99e6b5d17d1e893f74d91ac6112c861d4ecd3f830e42479
+size 4115867
requirements.txt ADDED
Binary file (6.89 kB)
 
src/__init__.py ADDED
File without changes
src/control/__init__.py ADDED
File without changes
src/control/control.py ADDED
@@ -0,0 +1,73 @@
+import pandas as pd
+
+from src.tools.retriever import Retriever
+from src.tools.llm import LlmAgent
+from src.model.block import Block
+
+
+class Chatbot:
+    def __init__(self, llm_agent: LlmAgent, retriever: Retriever):
+        self.retriever = retriever
+        self.llm = llm_agent
+
+    def get_response(self, query, histo):
+        histo_conversation, histo_queries = self._get_histo(histo)
+        language_of_query = self.llm.detect_language(query).lower()
+        queries = self.llm.translate(text=histo_queries)
+        block_sources = self.retriever.similarity_search(query=queries)
+        block_sources = self._select_best_sources(block_sources)
+        sources_contents = [s.content for s in block_sources]
+        context = '\n'.join(sources_contents)
+        answer = self.llm.generate_paragraph(query=queries, histo=histo_conversation, context=context, language=language_of_query)
+        answer = self.llm.generate_answer(answer=answer, query=query, histo=histo_conversation, context=context, language=language_of_query)
+        # print(answer.split('bot:')[1].strip())
+        # print("*************")
+        # answer = self._clean_answer(answer)
+        return answer, block_sources
+
+
+
+    @staticmethod
+    def _select_best_sources(sources: [Block], delta_1_2=0.15, delta_1_n=0.3, absolute=1.2, alpha=0.9) -> [Block]:
+        """
+        Select the best sources: not far from the very best, not far from the last selected, and not too bad per se
+        """
+        best_sources = []
+        for idx, s in enumerate(sources):
+            if idx == 0 \
+                    or (s.distance - sources[idx - 1].distance < delta_1_2
+                        and s.distance - sources[0].distance < delta_1_n) \
+                    or s.distance < absolute:
+                best_sources.append(s)
+                delta_1_2 *= alpha
+                delta_1_n *= alpha
+                absolute *= alpha
+            else:
+                break
+        return best_sources
+
+
+    @staticmethod
+    def _get_histo(histo: [(str, str)]) -> (str, str):
+        histo_conversation = ""
+        histo_queries = ""
+
+        for (query, answer) in histo[-5:]:
+            histo_conversation += f'user: {query} \n bot: {answer}\n'
+            histo_queries += query + '\n'
+        return histo_conversation[:-1], histo_queries
+
+
+    @staticmethod
+    def _clean_answer(answer: str) -> str:
+        print(answer)
+        answer = answer.removeprefix('bot:')  # str.strip('bot:') would strip characters, not the prefix
+        while answer and answer[-1] in {"'", '"', " ", "`"}:
+            answer = answer[:-1]
+        while answer and answer[0] in {"'", '"', " ", "`"}:
+            answer = answer[1:]
+        answer = answer.removeprefix('bot:')
+        if answer:
+            if answer[-1] != ".":
+                answer += "."
+        return answer
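A quick sanity check, not part of the commit, of how `_select_best_sources` applies its decaying thresholds (the distances are made up; the imports assume the project's dependencies are installed):

    from src.model.block import Block
    from src.control.control import Chatbot

    # Toy sources sorted by ascending distance, as Chroma returns them.
    sources = [Block(distance=d) for d in (0.50, 0.60, 0.95, 1.40)]
    best = Chatbot._select_best_sources(sources)
    # 0.50 is kept (idx == 0); 0.60 is kept (0.10 below both deltas);
    # 0.95 is kept only through the absolute cutoff (1.2 * 0.9**2 ~ 0.97);
    # 1.40 fails every test, so the loop breaks.
    print([s.distance for s in best])  # [0.5, 0.6, 0.95]

Each accepted source shrinks all three thresholds by alpha, so the filter gets stricter the longer the list it keeps.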
src/model/__init__.py ADDED
File without changes
src/model/block.py ADDED
@@ -0,0 +1,34 @@
+class Block:
+    def __init__(self, doc: str = '', title: str = '', content: str = '',
+                 index: str = '', rank: int = 0, level: int = 0, distance: float = 99999):
+        self.doc = doc
+        self.title = title
+        self.content = content
+        self.index = index
+        self.rank = rank
+        self.level = level
+        self.distance = distance
+
+    def to_dict(self) -> {}:
+        block_dict = {'doc': self.doc,
+                      'title': self.title,
+                      'content': self.content,
+                      'index': self.index,
+                      'rank': self.rank,
+                      'level': self.level,
+                      'distance': self.distance}
+        return block_dict
+
+    def from_dict(self, block_dict: {}):
+        self.doc = block_dict['doc']
+        self.title = block_dict['title']
+        self.content = block_dict['content']
+        self.index = block_dict['index']
+        self.rank = block_dict['rank']
+        self.level = block_dict['level']
+        self.distance = block_dict['distance']
+        return self
+
+    @property
+    def distance_str(self) -> str:
+        return format(self.distance, '.2f')
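`to_dict` and `from_dict` are what `Retriever` uses to stash blocks as Chroma metadata and rebuild them from query results. A minimal round-trip sketch, not in the commit:

    from src.model.block import Block

    b = Block(doc='guide.pdf', title='Intro', content='Hello', index='1.2', rank=3, level=2, distance=0.4242)
    same = Block().from_dict(b.to_dict())  # from_dict returns self, so it chains
    assert same.to_dict() == b.to_dict()
    print(same.distance_str)  # '0.42'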
src/model/container.py ADDED
@@ -0,0 +1,192 @@
+from .paragraph import Paragraph
+from .block import Block
+
+INFINITE = 99999
+
+class Container:
+
+    def __init__(self, paragraphs: [Paragraph], title: Paragraph = None, level: int = 0, index: [int] = None, father=None, id_=0):
+        if index is None:
+            index = []
+        self.level = level
+        self.title = title
+        self.paragraphs = []
+        self.children = []
+        self.index = index
+        self.father = father
+        self.id_ = int(str(1) + str(father.id_) + str(id_))
+        if paragraphs:
+            self.paragraphs, self.children = self.create_children(paragraphs, level, index)
+        self.containers = [self]
+        for child in self.children:
+            self.containers += child.containers
+        self.blocks = self.get_blocks()
+
+
+    def get_blocks(self):
+        block = Block(level=self.level, index=self.index)
+        if self.title:
+            block.title = self.title.text
+        for p in self.paragraphs:
+            if not p.blank:
+                block.content += p.text
+        blocks = [block] if block.content else []
+        for child in self.children:
+            blocks += child.blocks
+        return blocks
+
+    def create_children(self, paragraphs, level, rank) -> ([], []):
+        """
+        Creates children containers or directly attached content
+        and returns the list of attached paragraphs and the list of children containers of level+1.
+        """
+        attached_paragraphs = []
+        container_paragraphs = []
+        container_title = None
+        children = []
+        in_children = False
+        level = INFINITE
+        child_id = 0
+
+        while paragraphs:
+            p = paragraphs.pop(0)
+            if not in_children and not p.is_structure:
+                attached_paragraphs.append(p)
+            else:
+                in_children = True
+                if p.is_structure and not p.blank and p.level <= level:  # p is higher than or equal to the current level in the hierarchy
+                    if container_paragraphs or container_title:
+                        children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
+                        child_id += 1
+                    container_paragraphs = []
+                    container_title = p
+                    level = p.level
+
+                else:  # p is strictly lower in the hierarchy
+                    container_paragraphs.append(p)
+
+        if container_paragraphs or container_title:
+            children.append(Container(container_paragraphs, container_title, level, rank, self, child_id))
+            child_id += 1
+
+        return attached_paragraphs, children
+
+
+    # Earlier implementations of create_children, kept for reference:
+
+    # def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
+    #     """
+    #     Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
+    #     The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
+    #     :return: List of Content or Container
+    #     """
+    #     attached_paragraphs = []
+    #     children = []
+    #     in_children = False
+    #     level = INFINITE
+    #     container_paragraphs = []
+    #     container_title = None
+
+    #     while paragraphs:
+    #         p = paragraphs.pop(0)
+
+    #         if not in_children and not p.is_structure:
+    #             attached_paragraphs.append(p)
+    #         else:
+    #             in_children = True
+    #             if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
+    #                 if container_paragraphs or container_title:
+    #                     if level <= len(index):
+    #                         index = index[:level]
+    #                         index[-1] += 1
+    #                     else:
+    #                         for i in range(level-len(index)):
+    #                             index.append(1)
+    #                     children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
+    #                 container_paragraphs = []
+    #                 container_title = p
+    #                 level = p.level
+    #             else:  # p is normal text or strictly lower in hierarchy, then the child continues to grow
+    #                 container_paragraphs.append(p)
+    #     if container_paragraphs or container_title:
+    #         if level <= len(index):
+    #             index = index[:level]
+    #             index[-1] += 1
+    #         else:
+    #             for i in range(level - len(index)):
+    #                 index.append(1)
+    #         children.append(Container(container_paragraphs, container_title, level, index.copy(), self))
+
+    #     return attached_paragraphs, children
+
+
+
+    # def create_children(self, paragraphs: [Paragraph], level: int, index: [int]):
+    #     """
+    #     Creates children containers and/or directly attached content and returns the list of attached content and the list of children containers.
+    #     The indexes correspond to the indexes of the paragraphs in the content and also on the structure.
+    #     :return: List of Content or Container
+    #     """
+    #     attached_paragraphs = []
+    #     children = []
+    #     in_children = False
+    #     level = INFINITE
+    #     # container_paragraphs = []
+    #     # container_title = None
+
+    #     while paragraphs:
+    #         p = paragraphs.pop(0)
+
+    #         if not in_children and p.is_structure and level != INFINITE:
+    #             paragraphs.insert(0, p)
+    #             children.append(Container(paragraphs, title=p, level=p.level, children=children, index=index.copy(), father=self))
+    #         else:
+    #             in_children = True
+    #             if p.is_structure and p.level <= level:  # if p is higher in hierarchy, then the child is completed
+    #                 level = p.level
+    #                 if len(index) == level:
+    #                     index[-1] += 1
+    #                 elif len(index) < level:
+    #                     if self.children != []:
+    #                         index = self.children[-1].index.copy()
+    #                         index[-1] += 1
+    #                     else:
+    #                         index.append(1)
+    #                 else:
+    #                     index = index[:level]
+    #                     index[-1] += 1
+    #                 while paragraphs:
+    #                     p = paragraphs.pop(0)
+    #                     if p.is_structure:
+    #                         paragraphs.insert(0, p)
+    #                         break
+    #                     else:
+    #                         attached_paragraphs.append(p)
+    #                 if paragraphs and p.level > level:
+    #                     in_children = False
+    #                     children.append(Container(paragraphs, title=p, level=p.level, index=index.copy(), father=self))
+    #                 else:
+    #                     break
+    #     return attached_paragraphs, children
+
+    @property
+    def structure(self):
+        # NOTE: relies on self.rank and Paragraph.structure, neither of which is defined yet in this commit
+        self_structure = {str(self.id_): {
+            'index': str(self.id_),
+            'canMove': True,
+            'isFolder': True,
+            'children': [p.id_ for p in self.paragraphs] + [child.id_ for child in self.children],
+            'canRename': True,
+            'data': {},
+            'level': self.level,
+            'rank': self.rank,
+            'title': self.title.text if self.title else 'root'
+        }}
+        paragraphs_structure = [p.structure for p in self.paragraphs]
+        structure = [self_structure] + paragraphs_structure
+        for child in self.children:
+            structure += child.structure
+        return structure
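A sketch, not in the commit, of how title levels turn into nested containers and `Block`s; `_Root` is a hypothetical stand-in for the `Doc` father, of which `Container` only reads `id_`:

    from src.model.paragraph import Paragraph
    from src.model.container import Container

    class _Root:  # hypothetical father object
        id_ = 0

    root = Container([
        Paragraph('Introduction', 'title1', id_=0, page_id=9),
        Paragraph('Some body text.', 'content', id_=1, page_id=9),
        Paragraph('Details', 'title2', id_=2, page_id=9),
        Paragraph('More text.', 'content', id_=3, page_id=9),
    ], father=_Root())
    print([(b.level, b.title) for b in root.blocks])  # [(1, 'Introduction'), (2, 'Details')]

The 'title2' paragraph opens a child container one level below 'title1', so each titled section becomes its own Block carrying the body text that follows it.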
src/model/doc.py ADDED
@@ -0,0 +1,78 @@
+from src.model.container import Container
+from src.model.paragraph import Paragraph
+from src.tools.index_creation import set_good_indexes
+from src.tools.reader import get_pdf_title_styles
+
+
+class Doc:
+
+    def __init__(self, path='', id_=None):
+
+        self.title = self.get_good_title(path)
+        self.id_ = id(self)
+        self.path = path
+        paragraphs = get_pdf_title_styles(path)
+        self.container = Container(paragraphs, father=self)
+        set_good_indexes(self.container)
+        self.blocks = self.get_blocks()
+
+
+    def get_good_title(self, path):
+        if '/' in path:
+            res = path.split('/')[-1]
+        elif '\\' in path:  # elif, so a Unix-style path is not overwritten by the else branch
+            res = path.split('\\')[-1]
+        else:
+            res = path
+        return res
+
+    @property
+    def structure(self):
+
+        return self.container.structure
+
+    def get_blocks(self):
+
+        def from_list_to_str(index_list):
+            index_str = str(index_list[0])
+            for el in index_list[1:]:
+                index_str += '.' + str(el)
+            return index_str
+
+        blocks = self.container.blocks
+        # blocks = self.delete_duplicate()
+        for block in blocks:
+            block.doc = self.title
+            block.index = from_list_to_str(block.index)
+            print(block.index + ' ' + block.title)
+        return blocks
+
+
+    def delete_duplicate(self):
+        while self.found_duplicates(self.container.blocks):
+            for i in range(len(self.container.blocks) - 1):
+                if self.container.blocks[i].index == self.container.blocks[i + 1].index:
+                    if self.container.blocks[i].index != []:
+                        self.container.blocks[i].index.pop()
+        return self.container.blocks
+
+    def found_duplicates(self, blocks):
+        for i in range(len(blocks) - 1):
+            if blocks[i].index == blocks[i + 1].index:
+                return True
+        return False
+
+    """
+    current_level = len(current_index)
+    if 0 < block.level:
+        if block.level == current_level:
+            current_index[-1] += 1
+        elif current_level < block.level:
+            current_index.append(1)
+        elif block.level < current_level:
+            current_index = current_index[:block.level]
+            current_index[-1] += 1
+        block.index = from_list_to_str(current_index)
+    else:
+        block.index = "0"
+    """
src/model/paragraph.py ADDED
@@ -0,0 +1,20 @@
+import string
+
+INFINITE = 10000
+
+class Paragraph:
+    def __init__(self, text: str, font_style: str, id_: int, page_id: int):
+        self.font_style = font_style
+        self.id_ = int(str(2) + str(page_id) + str(id_))
+        self.page_id = page_id
+        self.level = int(font_style[-1]) if 'title' in font_style else INFINITE
+        self.is_structure = self.level < INFINITE
+        self.text = text
+
+    @property
+    def blank(self):
+        """
+        Checks if the paragraph is blank, i.e. carries no signal (it may then be ignored).
+        """
+        text = self.text.replace('\n', '')
+        return set(text).isdisjoint(string.ascii_letters)
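A short sketch, not in the commit, of how `level`, `is_structure` and `blank` behave:

    from src.model.paragraph import Paragraph

    p1 = Paragraph('  \n--\n', 'content', id_=0, page_id=9)
    p2 = Paragraph('GET /api/v2/labels', 'title3', id_=1, page_id=9)
    print(p1.blank, p2.blank)         # True False: p1 contains no ASCII letters
    print(p2.level, p2.is_structure)  # 3 True: the level is the trailing digit of 'title3'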
src/tools/__init__.py ADDED
File without changes
src/tools/index_creation.py ADDED
@@ -0,0 +1,39 @@
+from src.model.container import Container
+
+def create_dic_levels(c: Container, dict_of_levels: dict = None):
+    if dict_of_levels is None:  # a mutable {} default would be shared across calls
+        dict_of_levels = {}
+    if c.level == 0:
+        dict_of_levels[c.level] = [1]
+    for child in c.children:
+        if child.level not in dict_of_levels:
+            dict_of_levels[child.level] = [1 for _ in range(child.level)]
+        create_dic_levels(child, dict_of_levels)
+    return dict_of_levels
+
+def create_good_indexes(c: Container, dict_of_levels: dict):
+    actual_level = c.level
+    c.index = dict_of_levels[actual_level].copy()
+    actual_len = len(dict_of_levels[actual_level])
+    temp_update = dict_of_levels[actual_level][-1]
+    dict_of_levels[actual_level][-1] += 1
+    for i in dict_of_levels.values():
+        if len(i) > actual_len:
+            i[actual_len - 1] = temp_update
+    for child in c.children:
+        c_lvl = child.level
+        for i in dict_of_levels.values():
+            if len(i) > c_lvl:
+                i[c_lvl:] = [1 for _ in range(len(i[c_lvl:]))]
+        create_good_indexes(child, dict_of_levels)  # apply the function recursively to all children
+
+def set_good_block_indexes(c: Container):
+    for i in c.containers:
+        for b in i.blocks:
+            b.index = i.index
+
+
+def set_good_indexes(c: Container):
+    dict_levels = create_dic_levels(c)
+    create_good_indexes(c, dict_levels)
+    set_good_block_indexes(c)
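Applied to the toy tree from the container.py sketch above (rebuilt here so the snippet stands alone), `set_good_indexes` assigns a hierarchical index to every Block; `Doc.get_blocks` later joins the parts with dots ('1', '1.1', ...). A sketch, not part of the commit:

    from src.model.paragraph import Paragraph
    from src.model.container import Container
    from src.tools.index_creation import set_good_indexes

    class _Root:  # hypothetical father object; Container only reads father.id_
        id_ = 0

    root = Container([
        Paragraph('Introduction', 'title1', id_=0, page_id=9),
        Paragraph('Some body text.', 'content', id_=1, page_id=9),
        Paragraph('Details', 'title2', id_=2, page_id=9),
        Paragraph('More text.', 'content', id_=3, page_id=9),
    ], father=_Root())
    set_good_indexes(root)
    print([(b.index, b.title) for b in root.blocks])  # [([1], 'Introduction'), ([1, 1], 'Details')]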
src/tools/llm.py ADDED
@@ -0,0 +1,111 @@
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+
+class LlmAgent:
+
+    def __init__(self, model: str = "TheBloke/Llama-2-7b-Chat-GPTQ"):
+        self.tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model,
+                                                          device_map="auto",
+                                                          trust_remote_code=False,  # to change depending on the model; the default works for Llama 2
+                                                          revision="main")
+        self.pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)
+
+    def generate_paragraph(self, query: str, context: {}, histo: [(str, str)], language='fr') -> str:
+        """Generates the answer."""
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+        template = (f"You are a conversation bot designed to answer to the query from users delimited by "
+                    f"triple backticks: "
+                    f"\\n ``` {query} ```\\n"
+                    f"Your answer is based on the context delimited by triple backticks: "
+                    f"\\n ``` {context} ```\\n"
+                    f" You are consistent and avoid redundancies with the rest of the initial conversation "
+                    f"delimited by triple backticks: "
+                    f"\\n ``` {histo} ```\\n"
+                    f"Your response shall be in {language} and shall be concise")
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm)
+        p = llm_chain.predict()
+        # print("****************")
+        # print(template)
+        # print("----")
+        # print(p)
+        return p
+
+    def translate(self, text: str, language="en") -> str:
+        """Translates."""
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+
+        # languages = "French to English" if language == "en" else "English to French"
+
+        template = (f" Your task consists in translating in English\\n"
+                    f" the following text delimited by triple backticks: ```{text}```\n"
+                    )
+
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm, verbose=True)
+        p = llm_chain.predict()
+        return p
+
+    def generate_answer(self, query: str, answer: str, histo: str, context: str, language: str) -> str:
+        """Provides the final answer in {language} based on the initial query and the answer in English."""
+        def _cut_unfinished_sentence(s: str):
+            return '.'.join(s.split('.')[:-1])
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+        template = (f"Your task consists in translating the answer in {language}, if it's not already the case, to the query "
+                    f"delimited by triple backticks: ```{query}``` \\n"
+                    f"You are given the answer in {language} delimited by triple backticks: ```{answer}```"
+                    f"\\n You don't add new content to the answer but: "
+                    f"\\n 1 You can use some vocabulary from the context delimited by triple backticks: "
+                    f"```{context}```"
+                    f"\\n 2 You are consistent and avoid redundancies with the rest of the initial"
+                    f" conversation delimited by triple backticks: ```{histo}```"
+                    )
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm, verbose=True)
+        p = llm_chain.predict()
+        # p = _cut_unfinished_sentence(p)
+        return p
+
+
+    def transform_parahraph_into_question(self, prompt: str, title_doc: str = '', title_para: str = '') -> str:
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        max_tokens = 45
+
+        prompt_template = f'''[INST] <<SYS>>
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+Your job is to create a question about a paragraph of a document untitled "{title_doc}".
+The paragraph title is "{title_para}".
+If you see that the question that you are creating will not respect {max_tokens} tokens, find a way to make it shorter.
+If you see that the document paragraph seems to be code flattened, try to analyze it and create a question about it.
+If you see that the paragraph is a table, try to create a question about it.
+If you can't create a question about the paragraph, just rephrase {title_para} so that it becomes a question.
+Your response shall only contain one question, shall be concise and shall respect the following format:
+"Question: <question>"
+The paragraph you need to create a question about is the following:
+<</SYS>>
+{prompt}[/INST]
+
+'''
+        input_ids = self.tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+        output = self.model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=max_tokens, num_return_sequences=1)
+
+        res1 = self.tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
+        print(res1)
+        print("-" * len(res1))
+        return res1
+
+    def detect_language(self, text: str) -> str:
+        """Detects the language."""
+        locallm = HuggingFacePipeline(pipeline=self.pipe)
+        template = (f"Your task consists in detecting the language of the following text delimited by triple backticks: "
+                    f"```{text}```"
+                    f" Your answer shall be the two letters code of the language"
+                    )
+        prompt = PromptTemplate(input_variables=[], template=template)
+        llm_chain = LLMChain(prompt=prompt, llm=locallm, verbose=True)
+        p = llm_chain.predict()
+        return p
src/tools/pretty_print.py ADDED
@@ -0,0 +1,25 @@
+from src.model.paragraph import Paragraph
+from src.model.container import Container
+
+
+# Function that pretty-prints the paragraphs
+def pretty_printer_paragraphs(paragraphs):
+    for p in paragraphs:
+        if p.font_style == "title1":
+            print(f"Titre 1 {p.text}")
+        elif p.font_style == "title2":
+            print(f"---> Titre 2 {p.text}")
+        elif p.font_style == "title3":
+            print(f"-------> Titre 3 {p.text}")
+        # elif p.font_style == "title4":
+        #     print(f"-----------> Titre 4 {p.text}")
+        # elif p.font_style == "content":
+        #     print(f"---------------> {p.text}")
+
+def pretty_print_container_structure(container):
+    if container.title:
+        print(f"{'-'*container.level} {container.title.text}")
+    for p in container.paragraphs:
+        print(f"{'-'*container.level} {p.text}")
+    for c in container.children:
+        pretty_print_container_structure(c)
src/tools/reader.py ADDED
@@ -0,0 +1,102 @@
+import os
+import pdfplumber as pdfp
+from src.model.paragraph import Paragraph
+import asyncio
+
+def skip_header(dictionary):
+    i = 0
+    if not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
+        i += 2
+    return i
+
+
+def get_style_of_line(size: float):
+    if size >= 9 and size < 11.5:
+        return "content"
+    elif size >= 11.5 and size <= 12.7:
+        return "title5"
+    elif size >= 12.8 and size <= 13.5:
+        return "title4"
+    elif size > 13.5 and size <= 15.5:
+        return "title3"
+    elif size > 15.5 and size <= 18.5:
+        return "title2"
+    elif size > 19 and size < 30:
+        return "title1"
+    # elif size >= 12 and size <= 14.5:
+    #     return "title2"
+    # elif size > 14.5 and size <= 16.5:
+    #     return "title1"
+    else:
+        return "unknown"
+
+def get_pdf_title_styles(path):
+    pdf_to_read = extract_all_lines_from_the_doc(path)
+    paragraphs = []
+    j = 0
+    while j < len(pdf_to_read):
+        dictionary = pdf_to_read[j]["content"]
+        i = skip_header(dictionary)
+        while i < len(dictionary):
+            # print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
+            if dictionary[i]["text"].startswith("RESTAPIDeveloperGuide"):
+                i += 1
+                continue
+            p = Paragraph(dictionary[i]["text"], font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]), id_=i, page_id=pdf_to_read[j]["page_number"])
+            if i != len(dictionary) - 1:
+                while i + 1 < len(dictionary) and dictionary[i + 1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]:  # bound check added: i can reach the last line while merging
+                    p.text += " " + dictionary[i + 1]["text"]
+                    i += 1
+                # if(i == len(dictionary)-1):
+                #     if(j == len(pdf_to_read)-1):
+                #         break
+                #     else:
+                #         if(dictionary[i]["chars"][0]["size"] == pdf_to_read[j+1]["content"][0]["chars"][0]["size"]):
+                #             j += 1
+                #             p.text += " " + pdf_to_read[j]["content"][0]["text"]
+                #             dictionary = pdf_to_read[j]["content"]
+                #             i = 0
+                #         else:
+                #             break
+            else:
+                p.text = dictionary[i]["text"]
+                # print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
+            i += 1
+            # print(f'{p.page_id} : {p.font_style} ->>>>> {p.text}')
+            paragraphs.append(p)
+        j += 1
+    return paragraphs
+
+
+def test_get_font_sizes_of_a_page(page: int, path):
+    with open(os.path.abspath(path), 'rb') as f:  # binary mode: pdfplumber needs a bytes stream
+        reader = pdfp.PDF(f)
+        page = reader.pages[page]
+        dictionary = page.extract_text_lines()
+        for i in range(len(dictionary)):
+            print(f'{i} : {dictionary[i]["chars"][0]["size"]} ->>>>> {dictionary[i]["text"]}')
+
+
+def extract_all_lines_from_the_doc(path):
+    lines_of_doc = []
+    with open(path, 'rb') as f:
+        reader = pdfp.PDF(f)
+        skip_table_of_contents = reader.pages[8:16]
+        j = 0
+        while j < len(skip_table_of_contents):
+            lines_of_doc.append({"page_number": j + 9, "content": skip_table_of_contents[j].extract_text_lines()})
+            j += 1
+    return lines_of_doc
+
+
+
+
+# path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
+# get_pdf_title_styles(os.path.abspath(path))
+# print("--------------------------------------------------")
+# print("--------------------------------------------------")
+# print(test_get_font_sizes_of_a_page(8))
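The font-size thresholds in `get_style_of_line` are tuned to this particular guide's layout. A quick sketch, not in the commit, of the size-to-style mapping:

    from src.tools.reader import get_style_of_line

    for size in (10.0, 12.0, 13.0, 14.0, 16.0, 20.0):
        print(size, get_style_of_line(size))
    # 10.0 content, 12.0 title5, 13.0 title4, 14.0 title3, 16.0 title2, 20.0 title1

Sizes that fall in a gap between two ranges (e.g. 18.75) come back as "unknown".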
src/tools/retriever.py ADDED
@@ -0,0 +1,31 @@
+from src.model.container import Container
+from src.model.block import Block
+from src.model.doc import Doc
+from src.tools.llm import LlmAgent
+
+class Retriever:
+    def __init__(self, db_client, doc: Doc = None, collection_name: str = "illumio_database", llmagent: LlmAgent = None):
+        if doc is not None:
+            self.collection = db_client.create_collection(name=collection_name)
+            blocks_good_format: [Block] = doc.blocks
+            self.collection.add(
+                documents=[llmagent.transform_parahraph_into_question(block.content, title_doc=doc.title, title_para=block.title) for block in blocks_good_format],
+                ids=[block.index for block in blocks_good_format],
+                metadatas=[block.to_dict() for block in blocks_good_format]
+            )
+        else:
+            self.collection = db_client.get_collection(name=collection_name)
+
+
+
+    def similarity_search(self, query: str) -> {}:
+        res = self.collection.query(query_texts=query)
+        block_dict_sources = res['metadatas'][0]
+        distances = res['distances'][0]
+        blocks = []
+        for bd, d in zip(block_dict_sources, distances):
+            b = Block().from_dict(bd)
+            b.distance = d
+            blocks.append(b)
+        return blocks
+
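Hypothetical usage, not in the commit, once the Chroma collection exists (names match app.py; with doc=None the Retriever reuses the stored collection, so no LlmAgent is needed):

    import chromadb
    from src.tools.retriever import Retriever

    client_db = chromadb.PersistentClient(path="database/")
    retriever = Retriever(client_db, None, "illumio_database", llmagent=None)
    blocks = retriever.similarity_search(query="How do I authenticate against the REST API?")
    for b in blocks[:3]:
        print(b.index, b.title, b.distance_str)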
src/tools/test_read.py ADDED
@@ -0,0 +1,209 @@
+# To read the PDF
+import PyPDF2
+# To analyze the PDF layout and extract text
+from pdfminer.high_level import extract_pages, extract_text
+from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
+# To extract text from tables in PDF
+import pdfplumber
+# To extract the images from the PDFs
+from PIL import Image
+from pdf2image import convert_from_path
+# To perform OCR to extract text from images
+import pytesseract
+# To remove the additional created files
+import os
+
+def text_extraction(element):
+    # Extracting the text from the in-line text element
+    line_text = element.get_text()
+
+    # Find the formats of the text
+    # Initialize the list with all the formats that appeared in the line of text
+    line_formats = []
+    for text_line in element:
+        if isinstance(text_line, LTTextContainer):
+            # Iterating through each character in the line of text
+            for character in text_line:
+                if isinstance(character, LTChar):
+                    # Append the font name of the character
+                    line_formats.append(character.fontname)
+                    # Append the font size of the character
+                    line_formats.append(character.size)
+    # Find the unique font sizes and names in the line
+    format_per_line = list(set(line_formats))
+
+    # Return a tuple with the text in each line along with its format
+    return (line_text, format_per_line)
+
+
+def crop_image(element, pageObj):
+    # Get the coordinates to crop the image from the PDF
+    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
+    # Crop the page using coordinates (left, bottom, right, top)
+    pageObj.mediabox.lower_left = (image_left, image_bottom)
+    pageObj.mediabox.upper_right = (image_right, image_top)
+    # Save the cropped page to a new PDF
+    cropped_pdf_writer = PyPDF2.PdfWriter()
+    cropped_pdf_writer.add_page(pageObj)
+    # Save the cropped PDF to a new file
+    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
+        cropped_pdf_writer.write(cropped_pdf_file)
+
+# Create a function to convert the PDF to images
+def convert_to_images(input_file):
+    images = convert_from_path(input_file, poppler_path=r'C:\Program Files\poppler-23.08.0\Library\bin')
+    image = images[0]
+    output_file = "PDF_image.png"
+    image.save(output_file, "PNG")
+
+# Create a function to read text from images
+def image_to_text(image_path):
+    # Read the image
+    img = Image.open(image_path)
+    # Extract the text from the image
+    text = pytesseract.image_to_string(img)
+    return text
+
+
+def extract_table(pdf_path, page_num, table_num):
+    # Open the pdf file
+    pdf = pdfplumber.open(pdf_path)
+    # Find the examined page
+    table_page = pdf.pages[page_num]
+    # Extract the appropriate table
+    table = table_page.extract_tables()[table_num]
+    return table
+
+# Convert table into the appropriate format
+def table_converter(table):
+    table_string = ''
+    # Iterate through each row of the table
+    for row_num in range(len(table)):
+        row = table[row_num]
+        # Remove the line breaker from the wrapped texts
+        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
+        # Convert the table into a string
+        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
+    # Removing the last line break
+    table_string = table_string[:-1]
+    return table_string
+
+
+
+def pdf_manager(pdf_path):
+    # create a PDF file object
+    pdfFileObj = open(pdf_path, 'rb')
+    # create a PDF reader object
+    pdfReaded = PyPDF2.PdfReader(pdfFileObj)
+
+    # Create the dictionary to extract text from each image
+    text_per_page = {}
+    # We extract the pages from the PDF
+    for pagenum, page in enumerate(extract_pages(pdf_path)):
+
+        # Initialize the variables needed for the text extraction from the page
+        pageObj = pdfReaded.pages[pagenum]
+        page_text = []
+        line_format = []
+        text_from_images = []
+        text_from_tables = []
+        page_content = []
+        # Initialize the number of the examined tables
+        table_num = 0
+        first_element = True
+        table_extraction_flag = False
+        # Open the pdf file
+        pdf = pdfplumber.open(pdf_path)
+        # Find the examined page
+        page_tables = pdf.pages[pagenum]
+        # Find the number of tables on the page
+        tables = page_tables.find_tables()
+
+
+        # Find all the elements
+        page_elements = [(element.y1, element) for element in page._objs]
+        # Sort all the elements as they appear in the page
+        page_elements.sort(key=lambda a: a[0], reverse=True)
+
+        # Find the elements that composed a page
+        for i, component in enumerate(page_elements):
+            # Extract the position of the top side of the element in the PDF
+            pos = component[0]
+            # Extract the element of the page layout
+            element = component[1]
+
+            # Check if the element is a text element
+            if isinstance(element, LTTextContainer):
+                # Check if the text appeared in a table
+                if not table_extraction_flag:
+                    # Use the function to extract the text and format for each text element
+                    (line_text, format_per_line) = text_extraction(element)
+                    # Append the text of each line to the page text
+                    page_text.append(line_text)
+                    # Append the format for each line containing text
+                    line_format.append(format_per_line)
+                    page_content.append(line_text)
+                else:
+                    # Omit the text that appeared in a table
+                    pass
+
+            # Check the elements for images
+            if isinstance(element, LTFigure):
+                # Crop the image from the PDF
+                crop_image(element, pageObj)
+                # Convert the cropped pdf to an image
+                convert_to_images('cropped_image.pdf')
+                # Extract the text from the image
+                image_text = image_to_text('PDF_image.png')
+                text_from_images.append(image_text)
+                page_content.append(image_text)
+                # Add a placeholder in the text and format lists
+                page_text.append('image')
+                line_format.append('image')
+
+            # Check the elements for tables
+            if isinstance(element, LTRect):
+                # If the first rectangular element
+                if first_element and (table_num + 1) <= len(tables):
+                    # Find the bounding box of the table
+                    lower_side = page.bbox[3] - tables[table_num].bbox[3]
+                    upper_side = element.y1
+                    # Extract the information from the table
+                    table = extract_table(pdf_path, pagenum, table_num)
+                    # Convert the table information in structured string format
+                    table_string = table_converter(table)
+                    # Append the table string into a list
+                    text_from_tables.append(table_string)
+                    page_content.append(table_string)
+                    # Set the flag as True to avoid the content again
+                    table_extraction_flag = True
+                    # Make it another element
+                    first_element = False
+                    # Add a placeholder in the text and format lists
+                    page_text.append('table')
+                    line_format.append('table')
+
+                # Check if we already extracted the tables from the page
+                if element.y0 >= lower_side and element.y1 <= upper_side:
+                    pass
+                elif not isinstance(page_elements[i + 1][1], LTRect):
+                    table_extraction_flag = False
+                    first_element = True
+                    table_num += 1
+
+
+        # Create the key of the dictionary
+        dctkey = 'Page_' + str(pagenum)
+        # Add the list of list as the value of the page key
+        text_per_page[dctkey] = [page_text, line_format, text_from_images, text_from_tables, page_content]
+
+    # Closing the pdf file object
+    pdfFileObj.close()
+
+    # Deleting the additional files created
+    os.remove('cropped_image.pdf')
+    os.remove('PDF_image.png')
+
+    # Display the content of the page
+    result = ''.join(text_per_page['Page_0'][4])
+    print(result)
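A hypothetical invocation, not in the commit; it needs Tesseract and Poppler installed locally (the Poppler path in convert_to_images is hard-coded for Windows):

    from src.tools.test_read import pdf_manager

    pdf_manager("data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf")  # prints the parsed content of page 0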
src/view/view.py ADDED
@@ -0,0 +1,112 @@
+import gradio as gr
+from src.control.control import Chatbot
+
+
+def run(ctrl: Chatbot, config: {}):
+    with gr.Blocks() as qna:
+        with gr.Row():
+            with gr.Column():
+                pass
+
+            with gr.Column(scale=10):
+
+                gr.Markdown(config['title'])
+
+                histo_text_comp = gr.Chatbot(
+                    visible=False,
+                    value=[],
+                )
+                input_text_comp = gr.Textbox(
+                    label="",
+                    lines=1,
+                    max_lines=3,
+                    interactive=True,
+                    placeholder="Posez votre question ici",
+                )
+                clear_btn = gr.Button("Clear")
+                input_example_comp = gr.Radio(
+                    label="Examples",
+                    choices=list(config['examples'].values()),
+                    value="",
+                )
+                source_text_comp = []
+                for i in range(4):
+                    source_text_comp.append(gr.Textbox(
+                        lines=4,
+                        max_lines=4,
+                        interactive=False,
+                        visible=False,
+                    ))
+
+            with gr.Column():
+                pass
+
+        def input_text_fn1(input_text_, histo_text_):
+            histo_text_.append((input_text_, None))
+            update_ = {
+                histo_text_comp: gr.update(visible=True, value=histo_text_),
+                input_example_comp: gr.update(visible=False),
+            }
+            for i in range(4):
+                update_[source_text_comp[i]] = gr.update(visible=False)
+            return update_
+
+        def input_text_fn2(input_text_, histo_text_):
+            answer, sources = ctrl.get_response(query=input_text_, histo=histo_text_)
+            histo_text_[-1] = (input_text_, answer)
+            update_ = {
+                histo_text_comp: gr.update(value=histo_text_),
+                input_text_comp: gr.update(value=''),
+            }
+            for i in range(min(len(sources), 3)):
+                s = sources[i]
+                source_label = f'{s.index} {s.title} score = {s.distance_str}'
+                source_text = s.content
+                update_[source_text_comp[i]] = gr.update(visible=True, value=source_text, label=source_label)
+            return update_
+
+        def input_example_fn(input_example_, histo_text_):
+            histo_text_.append((input_example_, None))
+            update_ = {
+                input_text_comp: gr.update(value=input_example_),
+                histo_text_comp: gr.update(visible=True, value=histo_text_),
+                input_example_comp: gr.update(visible=False, value=''),
+            }
+            for i in range(4):
+                update_[source_text_comp[i]] = gr.update(visible=False)
+            return update_
+
+        def clear_fn():
+            update_ = {
+                input_text_comp: gr.update(value=''),
+                histo_text_comp: gr.update(value='', visible=False),
+                input_example_comp: gr.update(value='', visible=True),
+            }
+            for i in range(4):
+                update_[source_text_comp[i]] = gr.update(visible=False, value='hello')
+            return update_
+
+        input_text_comp \
+            .submit(input_text_fn1,
+                    inputs=[input_text_comp, histo_text_comp],
+                    outputs=[histo_text_comp, input_example_comp,
+                             source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]]) \
+            .then(input_text_fn2,
+                  inputs=[input_text_comp, histo_text_comp],
+                  outputs=[input_text_comp, histo_text_comp,
+                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+        input_example_comp \
+            .input(input_example_fn,
+                   inputs=[input_example_comp, histo_text_comp],
+                   outputs=[input_text_comp, histo_text_comp, input_example_comp,
+                            source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]]) \
+            .then(input_text_fn2,
+                  inputs=[input_text_comp, histo_text_comp],
+                  outputs=[input_text_comp, histo_text_comp,
+                           source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+        clear_btn.click(clear_fn,
+                        inputs=None,
+                        outputs=[input_text_comp, histo_text_comp, input_example_comp,
+                                 source_text_comp[0], source_text_comp[1], source_text_comp[2], source_text_comp[3]])
+
+    return qna
test.py ADDED
@@ -0,0 +1,7 @@
+from src.model.doc import Doc
+from config import *
+from src.tools.llm import LlmAgent
+
+llmagent = LlmAgent(model="TheBloke/Llama-2-7b-Chat-GPTQ")
+doc = Doc(path=content_en_path_real)
+[llmagent.transform_parahraph_into_question(block.content, title_doc=doc.title, title_para=block.title) for block in doc.blocks]