Upload folder using huggingface_hub
- .gitattributes +4 -0
- .gitignore +236 -0
- PDF/.gitignore +2 -0
- PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf +3 -0
- PDF/deu-2023.pdf +3 -0
- PDF/memo_risques_physiques_focus_batiment_2022.pdf +3 -0
- app.py +336 -0
- assets/Logo.png +0 -0
- assets/axionable.svg +24 -0
- assets/download.png +0 -0
- assets/logo4.png +0 -0
- climateqa/__init__.py +0 -0
- climateqa/engine/__init__.py +0 -0
- climateqa/engine/embeddings.py +29 -0
- climateqa/engine/keywords.py +30 -0
- climateqa/engine/llm/__init__.py +8 -0
- climateqa/engine/llm/openai.py +25 -0
- climateqa/engine/old/chains.py +83 -0
- climateqa/engine/old/chat.py +39 -0
- climateqa/engine/old/custom_retrieval_chain.py +63 -0
- climateqa/engine/prompts.py +80 -0
- climateqa/engine/rag.py +121 -0
- climateqa/engine/reformulation.py +42 -0
- climateqa/engine/retriever.py +166 -0
- climateqa/engine/text_retriever.py +44 -0
- climateqa/engine/utils.py +69 -0
- climateqa/engine/vectorstore.py +171 -0
- climateqa/engine/vectorstore_annoy.py +187 -0
- logs/.gitignore +2 -0
- requirements.txt +15 -0
- setup.py +1 -0
- style.css +462 -0
- test +32 -0
- utils.py +12 -0
- vectors/.gitignore +2 -0
- vectors/index.annoy +3 -0
- vectors/index.pkl +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf filter=lfs diff=lfs merge=lfs -text
+PDF/deu-2023.pdf filter=lfs diff=lfs merge=lfs -text
+PDF/memo_risques_physiques_focus_batiment_2022.pdf filter=lfs diff=lfs merge=lfs -text
+vectors/index.annoy filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,236 @@
setAPIKEY.sh

# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Chatbot conversation history
*.json

# Icon must end with two \r
Icon

# files for RAG
sources/*
categories.csv

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
PDF/.gitignore
ADDED
@@ -0,0 +1,2 @@
*
!.gitignore
PDF/Anticiper-les-effets-de-l-adaptation-dun-rechauffement-climatique-de-plus-4-degres-quels-couts-de-l-adaptation.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:be9d2d29a6545fc1949b10eb8428e6fac632aa84020fa61f4f76600817a21cd5
size 2079496
PDF/deu-2023.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09ea20da6494b2de2ae4d1f45dd309ee72700acf676a3d5dfdbf4f2cec8408bb
size 9714830
PDF/memo_risques_physiques_focus_batiment_2022.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c3f8c224d1e3d269e7688b1a49cff025f24a67bfa156306ce94ed5d3ede0720
size 5330523
app.py
ADDED
@@ -0,0 +1,336 @@
# , get_pinecone_vectorstore, find_similar_vectors
from climateqa.engine.vectorstore import build_vectores_stores, get_PDF_Names_from_GCP, get_categories_files
from climateqa.engine.text_retriever import ClimateQARetriever
from climateqa.engine.rag import make_rag_chain
from climateqa.engine.llm import get_llm
from utils import create_user_id
from datetime import datetime
import json
import re
import gradio as gr
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

# Load environment variables in local mode
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception as e:
    pass

# Set up the Gradio theme
theme = gr.themes.Soft(
    primary_hue="yellow",
    secondary_hue="orange",
    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif",
          "system-ui", "sans-serif"],
)


init_prompt = ""

system_template = {
    "role": "system",
    "content": init_prompt,
}

user_id = create_user_id()

list_categorie = get_categories_files()
categories = list_categorie["AllCat"]


def parse_output_llm_with_sources(output):
    # Split the content into a list of text and "[Doc X]" references
    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
    parts = []
    for part in content_parts:
        if part.startswith("Doc"):
            subparts = part.split(",")
            subparts = [subpart.lower().replace("doc", "").strip()
                        for subpart in subparts]
            subparts = [f"""<a href="#doc{subpart}" class="a-doc-ref" target="_self"><span class='doc-ref'><sup style="color:#FFC000 !important;">({subpart})</sup></span></a>""" for subpart in subparts]
            parts.append("".join(subparts))
        else:
            parts.append(part)
    content_parts = "".join(parts)
    return content_parts


def serialize_docs(docs):
    new_docs = []
    for doc in docs:
        new_doc = {}
        new_doc["page_content"] = doc.page_content
        new_doc["metadata"] = doc.metadata
        new_docs.append(new_doc)
    return new_docs


# Create vectorstore and retriever
vectorstore = build_vectores_stores("./sources")
llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)


async def chat(query, history, categories, src_nb_max, src_pertinence):
    """Taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
    (messages in gradio format, HTML of the source documents)"""

    print(f">> NEW QUESTION : {query} -> sources max:{src_nb_max} - pertinence: {src_pertinence}")

    filter = None
    if len(categories):
        filter = {"$or": []}
        for cat in categories:
            for fich in list_categorie[cat]:
                filter["$or"].append({"ax_name": fich})

    print(">> Filter :" + str(filter))
    print(">> nb sources :" + str(src_nb_max))
    print(">> pertinence :" + str(src_pertinence))

    retriever = ClimateQARetriever(
        vectorstore=vectorstore, sources=["Custom"], reports=[],
        threshold=src_pertinence, k_total=src_nb_max, filter=filter
    )
    rag_chain = make_rag_chain(retriever, llm)

    inputs = {"query": query, "audience": None}
    result = rag_chain.astream_log(inputs)

    path_reformulation = "/logs/reformulation/final_output"
    path_keywords = "/logs/keywords/final_output"
    path_retriever = "/logs/find_documents/final_output"
    path_answer = "/logs/answer/streamed_output_str/-"

    docs = []  # initialized so the logging dict below never hits an undefined name
    docs_html = ""
    output_query = ""
    output_language = ""
    output_keywords = ""
    gallery = []

    try:
        async for op in result:

            op = op.ops[0]

            if op['path'] == path_reformulation:  # reformulated question
                try:
                    output_language = op['value']["language"]  # str
                    output_query = op["value"]["question"]
                except Exception as e:
                    raise gr.Error(f"ClimateQ&A Error: {e} - The error has been noted, try another question and if the error remains, you can contact us :)")

            if op["path"] == path_keywords:
                try:
                    output_keywords = op['value']["keywords"]  # str
                    output_keywords = " AND ".join(output_keywords)
                except Exception as e:
                    pass

            elif op['path'] == path_retriever:  # documents
                try:
                    docs = op['value']['docs']  # List[Document]
                    docs_html = []
                    for i, d in enumerate(docs, 1):
                        docs_html.append(make_html_source(d, i))
                    docs_html = "".join(docs_html)
                except TypeError:
                    print("No documents found")
                    print("op: ", op)
                    continue

            elif op['path'] == path_answer:  # final answer
                new_token = op['value']  # str
                # time.sleep(0.01)
                previous_answer = history[-1][1]
                previous_answer = previous_answer if previous_answer is not None else ""
                answer_yet = previous_answer + new_token
                answer_yet = parse_output_llm_with_sources(answer_yet)
                history[-1] = (query, answer_yet)

            else:
                continue

            history = [tuple(x) for x in history]
            # Only the chatbot and the sources panel are wired as outputs in the UI below
            yield history, docs_html

    except Exception as e:
        raise gr.Error(f"{e}")

    timestamp = str(datetime.now().timestamp())
    log_file = "logs/" + timestamp + ".json"
    prompt = history[-1][0]
    logs = {
        "user_id": str(user_id),
        "prompt": prompt,
        "query": prompt,
        "question": output_query,
        "sources": ["Custom"],
        "docs": serialize_docs(docs),
        "answer": history[-1][1],
        "time": timestamp,
    }
    # log_locally(log_file, logs)

    yield history, docs_html


def make_html_source(source, i):
    # Build the HTML card for a text extract
    text_content = source.page_content.strip()
    meta = source.metadata
    # Source name
    name = f"<b>Document {i}</b>"

    # HTML content of the card
    card = f"""
    <div class="card" id="doc{i}">
        <div class="card-content">
            <div>
                <div style="float:right;width:10%;position:relative;top:0px">
                    <a href='{meta['ax_url']}' target='_blank'><img style="width:20px" src='/file/assets/download.png' /></a>
                </div>
                <div>
                    <h2>Extrait {i} (Score:{float(meta['similarity_score'])})</h2>
                    <h2> {meta['ax_name']} - Page {int(meta['ax_page'])}</h2>
                </div>
            </div>
            <p>{text_content}</p>

        </div>
        <!-- <div class="card-footer">
            <span>{name}</span>
        </div> -->
    </div>
    """

    return card

def log_locally(file, logs):
    # Serialize the logs to JSON
    logs_json = json.dumps(logs)

    # Write the logs to a local file
    with open(file, 'w') as f:
        f.write(logs_json)


# --------------------------------------------------------------------
# Gradio
# --------------------------------------------------------------------

init_prompt = """
Hello, I am Clara, an AI Assistant created by Axionable. My purpose is to answer your questions using the provided extracted passages, context, and guidelines.

❓ How to interact with Clara

Ask your question: You can ask me anything you want to know. I'll provide an answer based on the extracted passages and other relevant sources.
Response structure: I aim to provide clear and structured answers using the given data.
Guidelines: I follow specific guidelines to ensure that my responses are accurate and useful.
⚠️ Limitations
Though I do my best to help, there might be times when my responses are incorrect or incomplete. If that happens, please feel free to ask for more information or provide feedback to help improve my performance.

What would you like to know today?
"""


with gr.Blocks(title="CLARA", css="style.css", theme=theme, elem_id="main-component", elem_classes="ax_background") as demo:

    gr.HTML("""
    <img style="width:100px" src="file/assets/axionable.svg"/>
    """, elem_classes="logo-axio ")

    # TAB Clara
    with gr.Tab("CLARA"):

        with gr.Row(elem_id="chatbot-row"):
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
                    value=[(None, init_prompt)],
                    show_copy_button=True, show_label=False, elem_id="chatbot", layout="panel",
                    avatar_images=(None, "assets/logo4.png"))

                with gr.Row(elem_id="input-message"):
                    textbox = gr.Textbox(placeholder="Posez votre question", show_label=False,
                                         scale=7, lines=1, interactive=True, elem_id="input-textbox")

            with gr.Column(scale=1, variant="panel", elem_id="right-panel"):

                # with gr.Column(scale=1, elem_id="tab-citations"):
                #     gr.HTML("<p>Sources</p>")
                #     slider = gr.Slider(1, 10, value=src_nb_max, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
                #     slider_p = gr.Slider(0.0, 1.0, value=src_pertinence, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")
                #     sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
                #     docs_textbox = gr.State("")

                # The tabs object is currently required: it seems to be used to freeze
                # the tab contents while the AI generates an answer.
                with gr.Tabs() as tabs:
                    # None

                    with gr.Tab("sources"):
                        sources_textbox = gr.HTML(
                            show_label=False, elem_id="sources-textbox")
                        docs_textbox = gr.State("")

                    with gr.Tab("filtres"):

                        cat_sel = gr.CheckboxGroup(categories, label="Catégories")

                        slider = gr.Slider(1, 10, value=7, step=1, label="nb max", interactive=True, elem_id="source-nb-max")
                        slider_p = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="pertinence", interactive=True, elem_id="source-pertinence")

    # TAB À propos
    with gr.Tab("À propos", elem_classes="max-height other-tabs"):
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(
                    ("CLARA (Climate LLM for Adaptation & Risks Answers) by [Axionable](https://www.axionable.com/)"
                     "– Fork de [ClimateQ&A](https://huggingface.co/spaces/Ekimetrics/climate-question-answering/tree/main)"), elem_classes="a-propos")


    # # TAB Configuration
    # with gr.Tab("Configuration"):
    #
    #     with gr.Row(elem_id="config-row"):
    #         with gr.Column(scale=1):
    #
    #             for pdfName in get_PDF_Names_from_GCP():
    #                 gr.Markdown(pdfName, elem_classes="a-propos")

    def start_chat(query, history):

        history = history + [(query, None)]
        history = [tuple(x) for x in history]
        return (gr.update(interactive=False), gr.update(selected=1), history)

    def finish_chat():
        return (gr.update(interactive=True, value=""))

    (textbox
     .submit(start_chat, [textbox, chatbot], [textbox, tabs, chatbot], queue=False, api_name="start_chat_textbox")
     .then(chat, [textbox, chatbot, cat_sel, slider, slider_p], [chatbot, sources_textbox], concurrency_limit=8, api_name="chat_textbox")
     .then(finish_chat, None, [textbox], api_name="finish_chat_textbox")
     )


demo.queue()


demo.launch(allowed_paths=["assets/download.png",
                           "assets/logo4.png",
                           "assets/axionable.svg"], favicon_path="assets/logo4.png")
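
A standalone sketch (not part of the commit) of the citation-splitting step used by parse_output_llm_with_sources above: the regex isolates "[Doc i]" and "[Doc i, Doc j]" groups so each reference can be rewritten as an HTML anchor. The example string is illustrative.

import re

output = "Sea levels are rising [Doc 1, Doc 2] and warming continues [Doc 3]."
# Split into alternating plain-text and "Doc ..." reference parts
parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
print(parts)
# ['Sea levels are rising ', 'Doc 1, Doc 2', ' and warming continues ', 'Doc 3', '.']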
assets/Logo.png
ADDED
assets/axionable.svg
ADDED
assets/download.png
ADDED
assets/logo4.png
ADDED
climateqa/__init__.py
ADDED
File without changes
climateqa/engine/__init__.py
ADDED
File without changes
climateqa/engine/embeddings.py
ADDED
@@ -0,0 +1,29 @@
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

def get_embeddings_function(version="v1.2"):

    if version == "v1.2":

        # https://huggingface.co/BAAI/bge-base-en-v1.5
        # Best embedding model at a reasonable size at the moment (2023-11-22)
        # model_name = "BAAI/bge-base-en-v1.5"

        # https://huggingface.co/BAAI/bge-m3
        # A better one from 2024-04
        model_name = "BAAI/bge-m3"

        encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
        print("Loading embeddings model: ", model_name)
        embeddings_function = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            encode_kwargs=encode_kwargs,
            query_instruction="Represent this sentence for searching relevant passages: "
        )

    else:

        embeddings_function = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")

    return embeddings_function
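
A hedged usage sketch for get_embeddings_function (assumes langchain_community and sentence-transformers are installed and the BGE-M3 weights can be downloaded; the query string is illustrative):

from climateqa.engine.embeddings import get_embeddings_function

embeddings = get_embeddings_function(version="v1.2")  # loads BAAI/bge-m3
vector = embeddings.embed_query("What are the physical climate risks for buildings?")
print(len(vector))  # dimensionality of the embedding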
climateqa/engine/keywords.py
ADDED
@@ -0,0 +1,30 @@
from typing import List
from typing import Literal
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

class KeywordsOutput(BaseModel):
    """Analyzing the user query to get keywords for a search engine"""

    keywords: list = Field(
        description="""
        Generate 1 or 2 relevant keywords from the user query to ask a search engine for scientific research papers.

        Example:
        - "What is the impact of deep sea mining ?" -> ["deep sea mining"]
        - "How will El Nino be impacted by climate change" -> ["el nino"]
        - "Is climate change a hoax" -> ["climate change", "hoax"]
        """
    )


def make_keywords_chain(llm):

    functions = [convert_to_openai_function(KeywordsOutput)]
    llm_functions = llm.bind(functions=functions, function_call={"name": "KeywordsOutput"})

    chain = llm_functions | JsonOutputFunctionsParser()
    return chain
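
A hedged usage sketch for the keywords chain (assumes an OpenAI key is configured; the output shape follows the KeywordsOutput schema, the exact keywords depend on the model):

from climateqa.engine.llm import get_llm
from climateqa.engine.keywords import make_keywords_chain

llm = get_llm(provider="openai")
chain = make_keywords_chain(llm)
result = chain.invoke("How will El Nino be impacted by climate change?")
print(result)  # e.g. {"keywords": ["el nino", "climate change"]}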
climateqa/engine/llm/__init__.py
ADDED
@@ -0,0 +1,8 @@
from climateqa.engine.llm.openai import get_llm as get_openai_llm


def get_llm(provider="openai", **kwargs):
    if provider == "openai":
        return get_openai_llm(**kwargs)
    else:
        raise ValueError(f"Unknown provider: {provider}")
climateqa/engine/llm/openai.py
ADDED
@@ -0,0 +1,25 @@
from langchain_openai import ChatOpenAI
import os

try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass
# gpt-3.5-turbo-0125


def get_llm(model="gpt-3.5-turbo", max_tokens=1024, temperature=0.0,
            streaming=True, timeout=30, **kwargs):

    llm = ChatOpenAI(
        model=model,
        api_key=os.environ.get("OPENAI_API_KEY", None),
        max_tokens=max_tokens,
        streaming=streaming,
        temperature=temperature,
        timeout=timeout,
        **kwargs,
    )

    return llm
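
A hedged usage sketch (assumes OPENAI_API_KEY is set in the environment or a .env file; streaming=True is the default here, so tokens arrive incrementally):

from climateqa.engine.llm.openai import get_llm

llm = get_llm(model="gpt-3.5-turbo", temperature=0.0, max_tokens=256)
for chunk in llm.stream("In one sentence, what is climate adaptation?"):
    print(chunk.content, end="", flush=True)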
climateqa/engine/old/chains.py
ADDED
@@ -0,0 +1,83 @@
# https://python.langchain.com/docs/modules/chains/how_to/custom_chain
# Including reformulation of the question in the chain
import json

from langchain import PromptTemplate, LLMChain
from langchain.chains import RetrievalQAWithSourcesChain, QAWithSourcesChain
from langchain.chains import TransformChain, SequentialChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

from climateqa.prompts import answer_prompt, reformulation_prompt, audience_prompts
from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain


def load_combine_documents_chain(llm):
    prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question", "audience", "language"])
    qa_chain = load_qa_with_sources_chain(llm, chain_type="stuff", prompt=prompt)
    return qa_chain

def load_qa_chain_with_docs(llm):
    """Load a QA chain with documents.
    Useful when you already have retrieved docs

    To be called with this input

    ```
    output = chain({
        "question":query,
        "audience":"experts climate scientists",
        "docs":docs,
        "language":"English",
    })
    ```
    """

    qa_chain = load_combine_documents_chain(llm)
    chain = QAWithSourcesChain(
        input_docs_key="docs",
        combine_documents_chain=qa_chain,
        return_source_documents=True,
    )
    return chain


def load_qa_chain_with_text(llm):

    prompt = PromptTemplate(
        template=answer_prompt,
        input_variables=["question", "audience", "language", "summaries"],
    )
    qa_chain = LLMChain(llm=llm, prompt=prompt)
    return qa_chain


def load_qa_chain_with_retriever(retriever, llm):
    qa_chain = load_combine_documents_chain(llm)

    # This could be improved by providing a document prompt to avoid modifying page_content in the docs
    # See here https://github.com/langchain-ai/langchain/issues/3523

    answer_chain = CustomRetrievalQAWithSourcesChain(
        combine_documents_chain=qa_chain,
        retriever=retriever,
        return_source_documents=True,
        verbose=True,
        fallback_answer="**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**",
    )
    return answer_chain


def load_climateqa_chain(retriever, llm_reformulation, llm_answer):

    reformulation_chain = load_reformulation_chain(llm_reformulation)
    answer_chain = load_qa_chain_with_retriever(retriever, llm_answer)

    climateqa_chain = SequentialChain(
        chains=[reformulation_chain, answer_chain],
        input_variables=["query", "audience"],
        output_variables=["answer", "question", "language", "source_documents"],
        return_all=True,
        verbose=True,
    )
    return climateqa_chain
climateqa/engine/old/chat.py
ADDED
@@ -0,0 +1,39 @@
# LANGCHAIN IMPORTS
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain


# CLIMATEQA
from climateqa.retriever import ClimateQARetriever
from climateqa.vectorstore import get_pinecone_vectorstore
from climateqa.chains import load_climateqa_chain


class ClimateQA:
    def __init__(self, hf_embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                 show_progress_bar=False, batch_size=1, max_tokens=1024, **kwargs):

        self.llm = self.get_llm(max_tokens=max_tokens, **kwargs)
        self.embeddings_function = HuggingFaceEmbeddings(
            model_name=hf_embedding_model,
            encode_kwargs={"show_progress_bar": show_progress_bar, "batch_size": batch_size}
        )


    def get_vectorstore(self):
        pass


    def reformulate(self):
        pass


    def retrieve(self):
        pass


    def ask(self):
        pass
climateqa/engine/old/custom_retrieval_chain.py
ADDED
@@ -0,0 +1,63 @@
from __future__ import annotations
import inspect
from typing import Any, Dict, List, Optional

from pydantic import Extra

from langchain.schema.language_model import BaseLanguageModel
from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.prompts.base import BasePromptTemplate

from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain
from langchain.docstore.document import Document
from langchain.pydantic_v1 import Field
from langchain.schema import BaseRetriever

from langchain.chains import RetrievalQAWithSourcesChain

from langchain.chains.router.llm_router import LLMRouterChain

class CustomRetrievalQAWithSourcesChain(RetrievalQAWithSourcesChain):

    fallback_answer: str = "No sources available to answer this question."

    def _call(self, inputs, run_manager=None):
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        accepts_run_manager = (
            "run_manager" in inspect.signature(self._get_docs).parameters
        )
        if accepts_run_manager:
            docs = self._get_docs(inputs, run_manager=_run_manager)
        else:
            docs = self._get_docs(inputs)  # type: ignore[call-arg]

        if len(docs) == 0:
            answer = self.fallback_answer
            sources = []
        else:
            answer = self.combine_documents_chain.run(
                input_documents=docs, callbacks=_run_manager.get_child(), **inputs
            )
            answer, sources = self._split_sources(answer)

        result: Dict[str, Any] = {
            self.answer_key: answer,
            self.sources_answer_key: sources,
        }
        if self.return_source_documents:
            result["source_documents"] = docs
        return result
climateqa/engine/prompts.py
ADDED
@@ -0,0 +1,80 @@
# If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A

reformulation_prompt_template = """
Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
---
query: La technologie nous sauvera-t-elle ?
->
'question': 'Can technology help humanity mitigate the effects of climate change?',
'language': 'French',
---
query: what are our reserves in fossil fuel?
->
'question': 'What are the current reserves of fossil fuels and how long will they last?',
'language': 'English',
---
query: what are the main causes of climate change?
->
'question': 'What are the main causes of climate change in the last century?',
'language': 'English'
---

{format_instructions}

Reformulate the question in English and detect the language of the original message
Output the result as json with two keys "question" and "language"
query: {query}
->
```json
"""

answer_prompt_without_docs_template = """
You are Clara, an AI Assistant created by Axionable. Your role is to explain climate-related questions using info from the documents.
Always stay true to climate science and do not make up information. If you do not know the answer, just say you do not know.

Guidelines:
- If it's a conversational question such as "hello", "who are you", ..., you can answer directly
- Start by explaining clearly that you could not find any passages to answer in the documents, but it can be because of the search engine, and not because it's not there. So your answer is based on your own knowledge and must be taken with great caution because it's AI generated and you prefer to use sources to answer.
- Also explain that the user should reformulate the question to be more specific to the documents
- Because your role is to explain climate science, you can never be skeptical about climate change.
- If the question is not related to environmental issues, never never answer it. Say it's not your role.
- Make paragraphs by starting new lines to make your answers more readable.

Question: {question}
Answer in {language}:
"""


audience_prompts = {
    "children": "6 year old children that don't know anything about science and climate change and need metaphors to learn",
    "general": "the general public who know the basics in science and climate change and want to learn more about it without technical terms. Still use references to passages.",
    "experts": "expert and climate scientists that are not afraid of technical terms",
}


answer_prompt_template_custom = """
You are Clara, an AI Assistant created by Axionable. You are given a question and extracted passages. Provide a clear and structured answer based on the passages provided, the context and the guidelines.

Guidelines:
- If the passages have useful facts or numbers, use them in your answer.
- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the name of the document and page if you know it.
- Do not use the sentence 'Doc i says ...' to say where information came from.
- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
- If it makes sense, use bullet points and lists to make your answers easier to understand.
- You do not need to use every passage. Only use the ones that help answer the question.
- If the documents do not have the information needed to answer the question, just say you do not have enough information.
- Consider by default that the question is about the past century unless it is specified otherwise.
- If the passage is the caption of a picture, you can still use it as part of your answer as any other document.

-----------------------
Passages:
{context}

-----------------------
Question: {question} - Explained to {audience}
Answer in {language} with the passages citations:
"""
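
A hedged sketch of how rag.py consumes answer_prompt_template_custom (the input values are illustrative placeholders):

from langchain_core.prompts import ChatPromptTemplate
from climateqa.engine.prompts import answer_prompt_template_custom

prompt = ChatPromptTemplate.from_template(answer_prompt_template_custom)
messages = prompt.format_messages(
    context="Doc 1: Physical climate risks for buildings include flooding and heat waves.",
    question="What are the main physical climate risks for buildings?",
    audience="the general public who know the basics in science and climate change",
    language="English",
)
print(messages[0].content)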
climateqa/engine/rag.py
ADDED
@@ -0,0 +1,121 @@
from operator import itemgetter

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.prompts.base import format_document

from climateqa.engine.reformulation import make_reformulation_chain
from climateqa.engine.prompts import answer_prompt_template_custom, answer_prompt_without_docs_template
from climateqa.engine.utils import pass_values, flatten_dict, prepare_chain, rename_chain
from climateqa.engine.keywords import make_keywords_chain

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
):

    doc_strings = []

    for i, doc in enumerate(docs):
        chunk_type = "Doc"
        if isinstance(doc, str):
            doc_formatted = doc
        else:
            doc_formatted = format_document(doc, document_prompt)
        doc_string = f"{chunk_type} {i+1}: " + doc_formatted
        doc_string = doc_string.replace("\n", " ")
        doc_strings.append(doc_string)

    return sep.join(doc_strings)


def make_rag_chain(retriever, llm):

    # Construct the prompts
    prompt = ChatPromptTemplate.from_template(answer_prompt_template_custom)
    prompt_without_docs = ChatPromptTemplate.from_template(answer_prompt_without_docs_template)

    # ------- CHAIN 0 - Reformulation
    reformulation = make_reformulation_chain(llm)
    reformulation = prepare_chain(reformulation, "reformulation")

    # ------- Find all keywords from the reformulated query
    keywords = make_keywords_chain(llm)
    keywords = {"keywords": itemgetter("question") | keywords}
    keywords = prepare_chain(keywords, "keywords")

    # ------- CHAIN 1
    # Retrieved documents
    find_documents = {"docs": itemgetter("question") | retriever} | RunnablePassthrough()
    find_documents = prepare_chain(find_documents, "find_documents")

    # ------- CHAIN 2
    # Construct inputs for the llm
    input_documents = {
        "context": lambda x: _combine_documents(x["docs"]),
        **pass_values(["question", "audience", "language", "keywords"])
    }

    # ------- CHAIN 3
    # Bot answer
    llm_final = rename_chain(llm, "answer")

    answer_with_docs = {
        "answer": input_documents | prompt | llm_final | StrOutputParser(),
        **pass_values(["question", "audience", "language", "query", "docs", "keywords"]),
    }

    answer_without_docs = {
        "answer": prompt_without_docs | llm_final | StrOutputParser(),
        **pass_values(["question", "audience", "language", "query", "docs", "keywords"]),
    }

    # Fall back to the no-documents prompt when the retriever returned nothing
    answer = RunnableBranch(
        (lambda x: len(x["docs"]) > 0, answer_with_docs),
        answer_without_docs,
    )

    # ------- FINAL CHAIN
    # Build the final chain
    rag_chain = reformulation | keywords | find_documents | answer

    return rag_chain


def make_rag_papers_chain(llm):

    # prompt = ChatPromptTemplate.from_template(papers_prompt_template)

    input_documents = {
        "context": lambda x: _combine_documents(x["docs"]),
        **pass_values(["question", "language"])
    }

    chain = input_documents | llm | StrOutputParser()
    chain = rename_chain(chain, "answer")

    return chain


def make_illustration_chain(llm):

    # prompt_with_images = ChatPromptTemplate.from_template(answer_prompt_images_template)

    # NOTE: relies on a get_image_docs helper that is not defined in this module
    input_description_images = {
        "images": lambda x: _combine_documents(get_image_docs(x["docs"])),
        **pass_values(["question", "audience", "language", "answer"]),
    }

    illustration_chain = input_description_images | llm | StrOutputParser()
    return illustration_chain
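
A hedged end-to-end sketch mirroring what app.py wires together (assumes a ./sources folder and an OpenAI key; invoke runs the whole pipeline synchronously, whereas app.py streams it with astream_log):

from climateqa.engine.vectorstore import build_vectores_stores
from climateqa.engine.text_retriever import ClimateQARetriever
from climateqa.engine.llm import get_llm
from climateqa.engine.rag import make_rag_chain

vectorstore = build_vectores_stores("./sources")
retriever = ClimateQARetriever(vectorstore=vectorstore, sources=["Custom"],
                               threshold=0.5, k_total=7)
llm = get_llm(provider="openai", max_tokens=1024, temperature=0.0)
rag_chain = make_rag_chain(retriever, llm)

result = rag_chain.invoke({"query": "Quels sont les risques physiques pour le bâtiment ?",
                           "audience": None})
print(result["answer"])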
climateqa/engine/reformulation.py
ADDED
@@ -0,0 +1,42 @@
from langchain.output_parsers.structured import StructuredOutputParser, ResponseSchema
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda, RunnableBranch

from climateqa.engine.prompts import reformulation_prompt_template
from climateqa.engine.utils import pass_values, flatten_dict


response_schemas = [
    ResponseSchema(name="language", description="The detected language of the input message"),
    ResponseSchema(name="question", description="The reformulated question always in English")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

def fallback_default_values(x):
    if x["question"] is None:
        x["question"] = x["query"]
        x["language"] = "english"

    return x

def make_reformulation_chain(llm):

    prompt = PromptTemplate(
        template=reformulation_prompt_template,
        input_variables=["query"],
        partial_variables={"format_instructions": format_instructions}
    )

    chain = (prompt | llm.bind(stop=["```"]) | output_parser)

    reformulation_chain = (
        {"reformulation": chain, **pass_values(["query"])}
        | RunnablePassthrough()
        | flatten_dict
        | fallback_default_values
    )

    return reformulation_chain
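
A hedged usage sketch for the reformulation chain (assumes an OpenAI key; the output keys come from the two ResponseSchema entries above, and the example answer depends on the model):

from climateqa.engine.llm import get_llm
from climateqa.engine.reformulation import make_reformulation_chain

llm = get_llm(provider="openai")
chain = make_reformulation_chain(llm)
out = chain.invoke({"query": "La technologie nous sauvera-t-elle ?"})
print(out["question"], "|", out["language"])
# e.g. "Can technology help humanity mitigate the effects of climate change?" | "French"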
climateqa/engine/retriever.py
ADDED
@@ -0,0 +1,166 @@
# https://github.com/langchain-ai/langchain/issues/8623

import pandas as pd

from langchain_core.retrievers import BaseRetriever
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_core.documents.base import Document
from langchain_core.vectorstores import VectorStore
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun

from typing import List
from pydantic import Field

class ClimateQARetriever(BaseRetriever):
    vectorstore: VectorStore
    sources: list = ["IPCC", "IPBES", "IPOS"]
    reports: list = []
    threshold: float = 0.6
    k_summary: int = 3
    k_total: int = 10
    namespace: str = "vectors"
    min_size: int = 200


    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:

        # Check if all elements in the list are either IPCC or IPBES
        assert isinstance(self.sources, list)
        assert all([x in ["IPCC", "IPBES", "IPOS"] for x in self.sources])
        assert self.k_total > self.k_summary, "k_total should be greater than k_summary"

        # Prepare base search kwargs
        filters = {}

        if len(self.reports) > 0:
            filters["short_name"] = {"$in": self.reports}
        else:
            filters["source"] = {"$in": self.sources}

        # Search for k_summary documents in the summaries dataset
        filters_summaries = {
            **filters,
            "report_type": {"$in": ["SPM"]},
        }

        # built with pinecone
        # docs_summaries = self.vectorstore.similarity_search_with_score(query=query, filter=filters_summaries, k=self.k_summary)
        docs_summaries = self.vectorstore.similarity_search_with_score(query=query, k=self.k_summary)
        docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]

        # Search for k_total - k_summary documents in the full reports dataset
        filters_full = {
            **filters,
            "report_type": {"$nin": ["SPM"]},
        }
        k_full = self.k_total - len(docs_summaries)
        # docs_full = self.vectorstore.similarity_search_with_score(query=query, filter=filters_full, k=k_full)
        docs_full = self.vectorstore.similarity_search_with_score(query=query, k=k_full)

        # Concatenate documents
        docs = docs_summaries + docs_full

        # Filter out documents that are too short (score-threshold filtering is commented out)
        docs = [x for x in docs if len(x[0].page_content) > self.min_size]
        # docs = [x for x in docs if x[1] > self.threshold]

        # Add score to metadata
        results = []
        for i, (doc, score) in enumerate(docs):
            doc.metadata["similarity_score"] = score
            doc.metadata["content"] = doc.page_content
            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
            # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
            results.append(doc)

        # Sort by score
        # results = sorted(results, key=lambda x: x.metadata["similarity_score"], reverse=True)

        return results




# def filter_summaries(df, k_summary=3, k_total=10):
#     # assert source in ["IPCC","IPBES","ALL"], "source arg should be in (IPCC,IPBES,ALL)"
#
#     # # Filter by source
#     # if source == "IPCC":
#     #     df = df.loc[df["source"]=="IPCC"]
#     # elif source == "IPBES":
#     #     df = df.loc[df["source"]=="IPBES"]
#     # else:
#     #     pass
#
#     # Separate summaries and full reports
#     df_summaries = df.loc[df["report_type"].isin(["SPM","TS"])]
#     df_full = df.loc[~df["report_type"].isin(["SPM","TS"])]
#
#     # Find passages from summaries dataset
#     passages_summaries = df_summaries.head(k_summary)
#
#     # Find passages from full reports dataset
#     passages_fullreports = df_full.head(k_total - len(passages_summaries))
#
#     # Concatenate passages
#     passages = pd.concat([passages_summaries, passages_fullreports], axis=0, ignore_index=True)
#     return passages


# def retrieve_with_summaries(query, retriever, k_summary=3, k_total=10, sources=["IPCC","IPBES"], max_k=100, threshold=0.555, as_dict=True, min_length=300):
#     assert max_k > k_total
#
#     validated_sources = ["IPCC","IPBES"]
#     sources = [x for x in sources if x in validated_sources]
#     filters = {
#         "source": { "$in": sources },
#     }
#     print(filters)
#
#     # Retrieve documents
#     docs = retriever.retrieve(query, top_k=max_k, filters=filters)
#
#     # Filter by score
#     docs = [{**x.meta, "score": x.score, "content": x.content} for x in docs if x.score > threshold]
#
#     if len(docs) == 0:
#         return []
#     res = pd.DataFrame(docs)
#     passages_df = filter_summaries(res, k_summary, k_total)
#     if as_dict:
#         contents = passages_df["content"].tolist()
#         meta = passages_df.drop(columns=["content"]).to_dict(orient="records")
#         passages = []
#         for i in range(len(contents)):
#             passages.append({"content": contents[i], "meta": meta[i]})
#         return passages
#     else:
#         return passages_df


# def retrieve(query, sources=["IPCC"], threshold=0.555, k=10):
#
#     print("hellooooo")
#
#     # Reformulate queries
#     reformulated_query, language = reformulate(query)
#
#     print(reformulated_query)
#
#     # Retrieve documents
#     passages = retrieve_with_summaries(reformulated_query, retriever, k_total=k, k_summary=3, as_dict=True, sources=sources, threshold=threshold)
#     response = {
#         "query": query,
#         "reformulated_query": reformulated_query,
#         "language": language,
#         "sources": passages,
#         "prompts": {"init_prompt": init_prompt, "sources_prompt": sources_prompt},
#     }
#     return response
climateqa/engine/text_retriever.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.documents.base import Document
+from langchain_core.vectorstores import VectorStore
+from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
+from typing import List
+
+class ClimateQARetriever(BaseRetriever):
+    vectorstore: VectorStore
+    sources: list = []
+    reports: list = []
+    threshold: float = 0.01
+    k_summary: int = 3
+    k_total: int = 7
+    min_size: int = 200
+    filter: dict = None
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        # The sources argument must be a list (e.g. ["IPCC", "IPBES"])
+        assert isinstance(self.sources, list)
+        # assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
+
+        # Prepare base search kwargs from the requested sources
+        filters = {}
+        filters["source"] = {"$in": self.sources}
+
+        # NOTE: the `filters` dict built above is never forwarded to the search
+        # call; only the static `self.filter` attribute is passed.
+        docs = self.vectorstore.similarity_search_with_score(query=query, k=self.k_total, filter=self.filter)
+
+        # Add score to metadata
+        results = []
+        for i, (doc, score) in enumerate(docs):
+            # Drop sources scoring below the threshold
+            if score < self.threshold:
+                continue
+            doc.metadata["similarity_score"] = score
+            doc.metadata["content"] = doc.page_content
+            doc.metadata["chunk_type"] = "text"
+            doc.metadata["page_number"] = 1
+            results.append(doc)
+        return results
+
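A minimal usage sketch for this retriever, assuming an existing LangChain VectorStore instance such as the Annoy index built by build_vectores_stores further down (the query string and printed fields are illustrative):

    # Sketch only — `vectorstore` must be a LangChain VectorStore
    # (e.g. the Annoy index loaded elsewhere in this commit).
    retriever = ClimateQARetriever(
        vectorstore=vectorstore,
        sources=["IPCC", "IPBES"],  # note: not forwarded to the search call, see comment above
        threshold=0.01,             # documents scoring below this are dropped
        k_total=7,                  # number of chunks requested from the store
    )
    docs = retriever.invoke("Quels sont les risques physiques pour le bâtiment ?")
    for doc in docs:
        print(doc.metadata["similarity_score"], doc.page_content[:80])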
climateqa/engine/utils.py
ADDED
@@ -0,0 +1,69 @@
+from operator import itemgetter
+from typing import Any, Dict, Iterable, Tuple
+from langchain_core.runnables import RunnablePassthrough
+
+
+def pass_values(x):
+    if not isinstance(x, list):
+        x = [x]
+    return {k: itemgetter(k) for k in x}
+
+
+def prepare_chain(chain, name):
+    chain = propagate_inputs(chain)
+    chain = rename_chain(chain, name)
+    return chain
+
+
+def propagate_inputs(chain):
+    chain_with_values = {
+        "outputs": chain,
+        "inputs": RunnablePassthrough()
+    } | RunnablePassthrough() | flatten_dict
+    return chain_with_values
+
+def rename_chain(chain, name):
+    return chain.with_config({"run_name": name})
+
+
+# Drawn from langchain utils and modified to remove the parent key
+def _flatten_dict(
+    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Iterable[Tuple[str, Any]]:
+    """
+    Generator that yields flattened items from a nested dictionary for a flat dict.
+
+    Parameters:
+        nested_dict (dict): The nested dictionary to flatten.
+        parent_key (str): The prefix to prepend to the keys of the flattened dict.
+        sep (str): The separator to use between the parent key and the key of the
+        flattened dictionary.
+
+    Yields:
+        (str, any): A key-value pair from the flattened dictionary.
+    """
+    for key, value in nested_dict.items():
+        new_key = key
+        if isinstance(value, dict):
+            yield from _flatten_dict(value, new_key, sep)
+        else:
+            yield new_key, value
+
+
+def flatten_dict(
+    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Dict[str, Any]:
+    """Flattens a nested dictionary into a flat dictionary.
+
+    Parameters:
+        nested_dict (dict): The nested dictionary to flatten.
+        parent_key (str): The prefix to prepend to the keys of the flattened dict.
+        sep (str): The separator to use between the parent key and the key of the
+        flattened dictionary.
+
+    Returns:
+        (dict): A flat dictionary.
+
+    """
+    flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
+    return flat_dict
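For reference, a tiny sketch of what flatten_dict produces; note that because _flatten_dict deliberately drops the parent key (new_key = key), leaves that share a name across branches overwrite one another:

    # Illustrative input/output for flatten_dict as defined above.
    nested = {"inputs": {"query": "heatwave risk"}, "outputs": {"answer": "...", "language": "en"}}
    print(flatten_dict(nested))
    # -> {'query': 'heatwave risk', 'answer': '...', 'language': 'en'}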
climateqa/engine/vectorstore.py
ADDED
@@ -0,0 +1,171 @@
+
+from google.cloud import storage
+import os
+
+with open("./cred.json","w") as fj:
+    fj.write(os.environ["CRED_JSON"])
+
+storage_client = storage.Client()
+
+bucket_name = "docs-axio-clara"
+
+from langchain_pinecone import PineconeVectorStore
+
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+from climateqa.engine.embeddings import get_embeddings_function
+embeddings_function = get_embeddings_function()
+
+
+index_name = "clara-index"
+namespace = "my-namespace"
+
+
+import pdfplumber
+
+
+def get_categories_files():
+
+    finale = {}
+    listCat = []
+
+    CAT_DIR = "config_categorie/"
+    FOLDER_PATH = "."
+
+    bucket = storage_client.get_bucket(bucket_name)
+
+    blob = bucket.blob(CAT_DIR+"categories.csv")
+    lines = blob.download_as_text().split("\n")
+
+    blob_label = bucket.blob(CAT_DIR+"libelle.csv")
+    lines_label = blob_label.download_as_text().split("\n")
+
+    labels = {}
+    # Collect the labels
+    first = True
+    for line in lines_label:
+        # Skip the header line
+        if first:
+            first = False
+            continue
+        lab = line.split(";")[-1].replace("\n","").replace("\r","").replace("\t","")
+        labels[line.split(";")[0]] = lab
+        print("label: "+lab)
+
+    # First pass: collect the existing categories
+    first = True
+    for line in lines:
+        # Skip the header line
+        if first:
+            first = False
+            continue
+        categories = line.split(";")[-1].split(" ")
+
+        for cat in categories:
+            categ = cat.replace(" ","").replace("\n","").replace("\r","").replace("\t","")
+
+            # If the category has no label, fall back to the technical field
+            try:
+                test = labels[categ]  # raises KeyError if the key does not exist
+            except KeyError:
+                labels[categ] = categ
+
+            # Add the category (its label) to the list if not already seen
+            if not labels[categ] in listCat:
+                print(" - ["+categ+"] > "+labels[categ])
+                listCat.append(labels[categ])
+
+    # Initialise the final structure
+    for cat in listCat:
+        finale[cat] = []
+    finale["AllCat"] = listCat
+
+    # Second pass: associate each file with its categories
+    first = True
+    for line in lines:
+        # Skip the header line
+        if first:
+            first = False
+            continue
+        fichier = line.split(";")[0]
+        categories = line.split(";")[-1].split(" ")
+        listCat = []
+
+        # Register the file under each of its categories
+        for cat in categories:
+            categ = cat.replace(" ","").replace("\n","").replace("\r","").replace("\t","")
+            print(fichier+" in "+labels[categ]+" ("+categ+")")
+            finale[labels[categ]].append(fichier)
+
+    return finale
+
+def get_PDF_Names_from_GCP():
+
+    listName = []
+    # List the files stored on GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+        listName.append(blob.name)
+    return listName
+
+def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+    # Fetch the files from GCP storage
+    #blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    #for blob in blobs:
+    #    print("\n"+blob.name+":")
+    #    print(" <- Downloading from GCP")
+    #    blob.download_to_filename(pdf_folder+"/"+blob.name)
+
+    # Extract the text from the PDF files
+    print(" >>> Extraction PDF")
+    for pdf_file in os.listdir(pdf_folder):
+        if pdf_file.startswith("."):
+            continue
+        print(" > "+pdf_folder+"/"+pdf_file)
+        pdf_total_pages = 0
+        with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+            pdf_total_pages = len(pdf.pages)
+
+        # Memory leak with large files:
+        # reopening the file every N pages seems to fix the problem
+        N_page = 300
+        page_number = 0
+        while page_number < pdf_total_pages:
+
+            print(" -- reopening the file for "+str(N_page)+" pages --")
+            with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+
+                npage = 0
+                while (npage < N_page and page_number < pdf_total_pages):
+
+                    print(" >>> "+str(page_number+1))
+                    f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
+                    for char_pdf in pdf.pages[page_number].chars:
+                        f.write(char_pdf["text"])
+                    f.close()
+
+                    npage = npage + 1
+                    page_number = page_number + 1
+
+        # NOTE: the original referenced blob.name here, but the download loop
+        # above is commented out, so blob is undefined; the current file name
+        # is the intended target.
+        print(" X removing: "+pdf_file)
+        os.remove(pdf_folder+"/"+pdf_file)
+
+
+def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
+
+    vectorstore = PineconeVectorStore(
+        index_name=index_name,
+        embedding=embeddings_function,
+        #namespace=namespace
+    )
+    print(" Vectorisation ...")
+    return vectorstore
+
+    # Unreachable leftover from the Annoy variant of this module
+    print("MISSING VECTORS")
+    exit(0)
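get_categories_files expects two semicolon-separated CSVs under config_categorie/ in the bucket, each with a header row. The exact layout is not shown in this commit; a plausible sketch, with made-up file names, codes, and labels:

    # categories.csv — one row per file: <file name>;<category codes separated by spaces>
    fichier;categories
    deu-2023.pdf;CLIM RISQ
    memo_risques_physiques_focus_batiment_2022.pdf;RISQ BATI

    # libelle.csv — one row per code: <category code>;<human-readable label>
    code;libelle
    CLIM;Climat
    RISQ;Risques physiques
    BATI;Bâtiment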
climateqa/engine/vectorstore_annoy.py
ADDED
@@ -0,0 +1,187 @@
+
+from google.cloud import storage
+#storage_client = storage.Client()
+storage_client = storage.Client.create_anonymous_client()
+bucket_name = "docs-axio-clara"
+
+
+from langchain_community.vectorstores import Annoy
+
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import CharacterTextSplitter
+from climateqa.engine.embeddings import get_embeddings_function
+embeddings_function = get_embeddings_function()
+
+
+import os
+import pdfplumber
+
+def get_PDF_Names_from_GCP():
+
+    listName = []
+    # List the files stored on GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+        listName.append(blob.name)
+    return listName
+
+def get_PDF_from_GCP(folder_path, pdf_folder="./PDF"):
+
+    # Fetch the files from GCP storage
+    blobs = storage_client.list_blobs(bucket_name, prefix='sources/')
+    for blob in blobs:
+
+        print("\n"+blob.name+":")
+        print(" <- Downloading from GCP")
+        blob.download_to_filename(pdf_folder+"/"+blob.name)
+
+    # Extract the text from the PDF files
+    print(" >>> Extraction PDF")
+    for pdf_file in os.listdir(pdf_folder):
+        if pdf_file.startswith("."):
+            continue
+        print(" > "+pdf_folder+"/"+pdf_file)
+        pdf_total_pages = 0
+        with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+            pdf_total_pages = len(pdf.pages)
+
+        # Memory leak with large files:
+        # reopening the file every N pages seems to fix the problem
+        N_page = 300
+        page_number = 0
+        while page_number < pdf_total_pages:
+
+            print(" -- reopening the file for "+str(N_page)+" pages --")
+            with pdfplumber.open(pdf_folder+"/"+pdf_file) as pdf:
+
+                npage = 0
+                while (npage < N_page and page_number < pdf_total_pages):
+
+                    print(" >>> "+str(page_number+1))
+                    f = open(folder_path+"/"+pdf_file+"..:page:.."+str(page_number+1), "w")
+                    for char_pdf in pdf.pages[page_number].chars:
+                        f.write(char_pdf["text"])
+                    f.close()
+
+                    npage = npage + 1
+                    page_number = page_number + 1
+
+        # Remove the downloaded PDF once its text has been extracted.
+        # (The original referenced blob.name here, which only points at the
+        # last blob of the download loop; pdf_file is the intended target.)
+        print(" X removing: "+pdf_file)
+        os.remove(pdf_folder+"/"+pdf_file)
+
+
+def build_vectores_stores(folder_path, pdf_folder="./PDF", vectors_path="./vectors"):
+
+    if os.path.isfile(vectors_path+"/index.annoy"):
+        return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+    try:
+        os.mkdir(vectors_path)
+    except FileExistsError:
+        pass
+
+    try:
+        # Fetch the prebuilt index files from GCP storage
+        blobs = storage_client.list_blobs(bucket_name, prefix='testvectors/')
+        for blob in blobs:
+
+            print("\n"+blob.name.split("/")[-1]+":")
+            print(" <- Downloading from GCP")
+            blob.download_to_filename(vectors_path+"/"+blob.name.split("/")[-1])
+    except Exception:
+        pass
+
+    # TODO A FUNCTION FOR THAT TO AVOID CODE DUPLICATION
+    if os.path.isfile(vectors_path+"/index.annoy"):
+        return Annoy.load_local(vectors_path, embeddings_function, allow_dangerous_deserialization=True)
+
+    print("MISSING VECTORS")
+    exit(0)
+
+    # get_PDF_from_GCP(folder_path, pdf_folder)
+
+    # print(" Vectorisation ...")
+
+    # docs = []
+    # vector_store_from_docs = ()  # Create a new Annoy object, or reuse the one already initialised in your existing code
+    # for filename in os.listdir(folder_path):
+    #     if filename.startswith("."):
+    #         continue
+    #     file_path = os.path.join(folder_path, filename)
+    #     if os.path.isfile(file_path):
+    #         loader = TextLoader(file_path)
+    #         documents = loader.load()
+    #
+    #         for doc in documents:
+    #             if (doc.metadata):
+    #                 doc.metadata["ax_page"] = doc.metadata['source'].split("..:page:..")[-1]
+    #                 doc.metadata["ax_name"] = doc.metadata['source'].split("..:page:..")[0].split("/")[-1]
+    #                 doc.metadata["ax_url"] = "https://storage.googleapis.com/docs-axio-clara/sources/"+doc.metadata["ax_name"]
+    #
+    #         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+    #         docs += text_splitter.split_documents(documents)
+    # vector_store_from_docs = Annoy.from_documents(docs, embeddings_function)
+    # vector_store_from_docs.save_local(vectors_path)
+    # return vector_store_from_docs
+
+
+
+
+# Pinecone
+# More info at https://docs.pinecone.io/docs/langchain
+# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
+#import os
+#from pinecone import Pinecone
+#from langchain_community.vectorstores import Pinecone as PineconeVectorstore
+
+# LOAD ENVIRONMENT VARIABLES
+#try:
+#    from dotenv import load_dotenv
+#    load_dotenv()
+#except:
+#    pass
+
+
+#def get_pinecone_vectorstore(embeddings, text_key="content"):
+
+#    # initialize pinecone
+#    pinecone.init(
+#        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
+#        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
+#    )
+
+#    index_name = os.getenv("PINECONE_API_INDEX")
+#    vectorstore = Pinecone.from_existing_index(index_name, embeddings, text_key=text_key)
+
+#    return vectorstore
+
+#    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+#    index = pc.Index(os.getenv("PINECONE_API_INDEX"))
+
+#    vectorstore = PineconeVectorstore(
+#        index, embeddings, text_key,
+#    )
+#    return vectorstore
+
+
+# def get_pinecone_retriever(vectorstore, k=10, namespace="vectors", sources=["IPBES","IPCC"]):
+
+#     assert isinstance(sources, list)
+
+#     # Check if all elements in the list are either IPCC or IPBES
+#     filter = {
+#         "source": { "$in": sources },
+#     }
+
+#     retriever = vectorstore.as_retriever(search_kwargs={
+#         "k": k,
+#         "namespace": "vectors",
+#         "filter": filter
+#     })
+
+#     return retriever
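How the Annoy variant might be wired into the app, as a sketch (folder names are the defaults used above; the query is illustrative):

    # Sketch: load (or download) the Annoy index, then wrap it in the retriever
    # defined in climateqa/engine/text_retriever.py.
    vectorstore = build_vectores_stores("./sources", pdf_folder="./PDF", vectors_path="./vectors")
    retriever = ClimateQARetriever(vectorstore=vectorstore, sources=["IPCC"], threshold=0.01)
    docs = retriever.invoke("coûts de l'adaptation à +4°C")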
logs/.gitignore
ADDED
@@ -0,0 +1,2 @@
+*
+!.gitignore
requirements.txt
ADDED
@@ -0,0 +1,15 @@
+google-cloud-storage==2.16.0
+gradio==4.19.1
+python-dotenv==1.0.0
+langchain==0.1.10
+langchain_openai==0.0.6
+pinecone-client==3.0.2
+sentence-transformers==2.6.0
+huggingface-hub
+msal
+pyalex==0.13
+networkx==3.2.1
+pyvis==0.3.2
+annoy==1.17.3
+langchain_pinecone
+pdfplumber==0.11.0
setup.py
ADDED
@@ -0,0 +1 @@
+print("yoooooo")
style.css
ADDED
@@ -0,0 +1,462 @@
+
+/* :root {
+    --user-image: url('https://ih1.redbubble.net/image.4776899543.6215/st,small,507x507-pad,600x600,f8f8f8.jpg');
+} */
+
+.fordataonly {
+    display: none !important;
+}
+
+
+label {
+    color: #000000 !important;
+}
+
+strong {
+    color: #888888 !important;
+}
+
+.logo-axio {
+    float: right;
+    position: absolute;
+    right: 0px;
+}
+
+
+/* text colour */
+p {
+    color: black !important;
+}
+li {
+    color: black !important;
+}
+
+button.selected {
+    border-radius: 20px !important;
+}
+button:hover {
+    color: #ffc000 !important;
+}
+
+
+/* panel/block background */
+.panel {
+    background-color: #eeeeee !important;
+    border: 0px;
+}
+.block {
+    background-color: #eeeeee !important;
+}
+
+/* bot background */
+.bot {
+    background-color: #eeeeee !important;
+}
+
+/* avatar at the start of a reply */
+.avatar-container {
+    align-self: baseline !important;
+    margin-top: 35px;
+}
+
+
+/* user background */
+.user {
+    background-color: #d2d2d2 !important;
+}
+textarea {
+    background-color: #d2d2d2 !important;
+    color: black !important;
+}
+
+
+/* app background */
+gradio-app {
+    background-color: #ffffff !important;
+}
+.gradio-container {
+    background-color: #ffffff !important;
+    max-width: 100% !important;
+    width: 100% !important;
+}
+
+
+.a-propos {
+    margin: 20px !important;
+}
+
+
+.telecharger {
+    border: 1px solid;
+    padding: 5px;
+    border-radius: 5px;
+    background-color: #ffc000;
+    color: #fff;
+    margin-left: 5px;
+}
+
+.warning-box {
+    background-color: #fff3cd;
+    border: 1px solid #ffeeba;
+    border-radius: 4px;
+    padding: 15px 20px;
+    font-size: 14px;
+    color: #856404;
+    display: inline-block;
+    margin-bottom: 15px;
+}
+
+
+.tip-box {
+    background-color: #f7dd8f;
+    border: 1px solid #FFC000;
+    border-radius: 4px;
+    margin-top: 20px;
+    padding: 15px 20px;
+    font-size: 14px;
+    display: inline-block;
+    margin-bottom: 15px;
+    width: auto;
+    color: black !important;
+}
+
+body.dark .warning-box * {
+    color: black !important;
+}
+
+
+body.dark .tip-box * {
+    color: rgb(216, 216, 216) !important;
+}
+
+
+.tip-box-title {
+    font-weight: bold;
+    font-size: 14px;
+    margin-bottom: 5px;
+}
+
+.light-bulb {
+    display: inline;
+    margin-right: 5px;
+}
+
+.gr-box { border-color: #d6c37c }
+
+#hidden-message {
+    display: none;
+}
+
+.message {
+    font-size: 14px !important;
+}
+
+
+a {
+    text-decoration: none;
+    color: inherit;
+}
+
+.card {
+    background-color: white;
+    border-radius: 10px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    overflow: hidden;
+    display: flex;
+    flex-direction: column;
+    margin: 20px;
+}
+
+.card-content {
+    padding: 20px;
+}
+
+.card-content h2 {
+    font-size: 14px !important;
+    font-weight: bold;
+    margin-bottom: 10px;
+    margin-top: 0px !important;
+    color: #FFC000 !important;
+}
+
+.card-content p {
+    font-size: 12px;
+    margin-bottom: 0;
+    color: black;
+}
+
+.card-footer {
+    background-color: #f4f4f4;
+    font-size: 10px;
+    padding: 10px;
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+
+.card-footer span {
+    flex-grow: 1;
+    text-align: left;
+    color: #999 !important;
+}
+
+.pdf-link {
+    display: inline-flex;
+    align-items: center;
+    margin-left: auto;
+    text-decoration: none !important;
+    font-size: 14px;
+}
+
+
+.message.user {
+    /* background-color: #7494b0 !important; */
+    border: none;
+    /* color: white !important; */
+}
+
+.message.bot {
+    /* background-color: #f2f2f7 !important; */
+    border: none;
+}
+
+/* .gallery-item > div:hover{
+    background-color:#7494b0 !important;
+    color:white!important;
+}
+
+.gallery-item:hover{
+    border:#7494b0 !important;
+}
+
+.gallery-item > div{
+    background-color:white !important;
+    color:#577b9b!important;
+}
+
+.label{
+    color:#577b9b!important;
+} */
+
+/* .paginate{
+    color:#577b9b!important;
+} */
+
+
+/* span[data-testid="block-info"]{
+    background:none !important;
+    color:#577b9b;
+} */
+
+/* Pseudo-element for the circularly cropped picture */
+/* .message.bot::before {
+    content: '';
+    position: absolute;
+    top: -10px;
+    left: -10px;
+    width: 30px;
+    height: 30px;
+    background-image: var(--user-image);
+    background-size: cover;
+    background-position: center;
+    border-radius: 50%;
+    z-index: 10;
+}
+*/
+
+label.selected {
+    background: none !important;
+}
+
+#submit-button {
+    padding: 0px !important;
+}
+
+
+@media screen and (min-width: 1024px) {
+    div#tab-examples {
+        height: calc(100vh - 190px) !important;
+        overflow-y: auto;
+    }
+
+    div#sources-textbox {
+        height: calc(100vh - 190px) !important;
+        overflow-y: auto !important;
+    }
+
+    div#tab-config {
+        height: calc(100vh - 190px) !important;
+        overflow-y: auto !important;
+    }
+
+    div#chatbot-row {
+        height: calc(100vh - 90px) !important;
+    }
+
+    div#chatbot {
+        height: calc(100vh - 170px) !important;
+    }
+
+    .max-height {
+        height: calc(100vh - 90px) !important;
+        overflow-y: auto;
+    }
+
+    /* .tabitem:nth-child(n+3) {
+        padding-top:30px;
+        padding-left:40px;
+        padding-right:40px;
+    } */
+}
+
+footer {
+    visibility: hidden;
+    display: none !important;
+}
+
+
+@media screen and (max-width: 767px) {
+    /* Your mobile-specific styles go here */
+
+    div#chatbot {
+        height: 500px !important;
+    }
+
+    #submit-button {
+        padding: 0px !important;
+        min-width: 80px;
+    }
+
+    /* This will hide all list items */
+    div.tab-nav button {
+        display: none !important;
+        color: #ffc000;
+    }
+
+    /* This will show only the first list item */
+    div.tab-nav button:first-child {
+        display: block !important;
+    }
+
+    /* This will show only the second list item */
+    div.tab-nav button:nth-child(2) {
+        display: block !important;
+    }
+
+    #right-panel button {
+        display: block !important;
+    }
+
+    /* ... add other mobile-specific styles ... */
+}
+
+
+body.dark .card {
+    background-color: #c7c7c7;
+}
+
+body.dark .card-content h2 {
+    color: #f4dbd3 !important;
+}
+
+body.dark .card-footer {
+    background-color: #404652;
+}
+
+body.dark .card-footer span {
+    color: white !important;
+}
+
+
+.doc-ref {
+    color: #ffc000 !important;
+    margin-right: 1px;
+}
+
+.tabitem {
+    border: none !important;
+}
+
+.other-tabs > div {
+    padding-left: 40px;
+    padding-right: 40px;
+    padding-top: 10px;
+}
+
+.gallery-item > div {
+    white-space: normal !important;        /* Allow the text to wrap */
+    word-break: break-word !important;     /* Break words to prevent overflow */
+    overflow-wrap: break-word !important;  /* Break long words if necessary */
+}
+
+span.chatbot > p > img {
+    margin-top: 40px !important;
+    max-height: none !important;
+    max-width: 80% !important;
+    border-radius: 0px !important;
+}
+
+
+.chatbot-caption {
+    font-size: 11px;
+    font-style: italic;
+    color: #ffc000;
+}
+
+.ai-generated {
+    font-size: 11px !important;
+    font-style: italic;
+    color: #ffc000 !important;
+}
+
+.card-image > .card-content {
+    background-color: #f1f7fa !important;
+}
+
+
+.tab-nav > button.selected {
+    color: #ffc000;
+    font-weight: bold;
+    border: none;
+}
+
+.tab-nav {
+    border: none !important;
+}
+
+#input-textbox > label > textarea {
+    border-radius: 40px;
+    padding-left: 30px;
+    resize: none;
+}
+
+#input-message > div {
+    border: none;
+}
+
+#dropdown-samples {
+    /*! border:none !important; */
+    /*! border-width:0px !important; */
+    background: none !important;
+}
+
+#dropdown-samples > .container > .wrap {
+    background-color: white;
+}
+
+
+#tab-examples > div > .form {
+    border: none;
+    background: none !important;
+}
+
+.a-doc-ref {
+    text-decoration: none !important;
+    color: #FFC000;
+}
test
ADDED
@@ -0,0 +1,32 @@
+FROM python:3.10
+
+WORKDIR /src
+
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=$HOME/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+
+# NOTE: Docker only honours the last CMD, so the setup.py command below is
+# overridden by the app.py command and never runs.
+CMD ["python","setup.py"]
+
+CMD ["python", "app.py"]
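Despite its name, this `test` file is a Dockerfile. A minimal shell sketch for building and running it locally (the image tag is made up; 7860 is Gradio's default port, and CRED_JSON is only needed if app.py imports the GCP/Pinecone module):

    docker build -f test -t clara-space .
    docker run -p 7860:7860 -e CRED_JSON="$(cat cred.json)" clara-space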
utils.py
ADDED
@@ -0,0 +1,12 @@
+import numpy as np
+import random
+import string
+import uuid
+
+
+def create_user_id():
+    """Create a user_id.
+
+    Returns:
+        str: String to id user
+    """
+    user_id = str(uuid.uuid4())
+    return user_id
vectors/.gitignore
ADDED
@@ -0,0 +1,2 @@
+*
+!.gitignore
vectors/index.annoy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b94e9d486dbe3a9e2397672bda1d1c17198cca42a53afaa16ef8ecfcebd22fc9
+size 2238984
vectors/index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4eb3d63539603642200f07f8fac2e290e94104fbbe4f4471dc663eff850263f6
+size 3223915