Spaces:

fmind
/

resume

Running

App Files Files

Médéric Hurier (Fmind) committed on Oct 17, 2023

Commit

ed67987

1 Parent(s): 5c68cc7

First release

Browse files

Files changed (3) hide show

app.py +70 -16
database.py +26 -8
lib.py +29 -7

app.py CHANGED Viewed

@@ -5,7 +5,6 @@
 import logging
 import gradio as gr
-import tiktoken
 import lib
@@ -18,34 +17,90 @@ logging.basicConfig(
 # %% CONFIGS
 THEME = "glass"
 TITLE = "Fmind Chatbot"
 CLIENT = lib.get_database_client(path=lib.DATABASE_PATH)
-ENCODING = tiktoken.get_encoding(encoding_name=lib.EMBEDDING_TOKENIZER)
-FUNCTION = lib.get_embedding_function()
 COLLECTION = CLIENT.get_collection(
     name=lib.DATABASE_COLLECTION,
-    embedding_function=FUNCTION,
 )
-EXAMPLES = [
-    "Who is Médéric Hurier (Fmind)?",
-    "Is Fmind open to new opportunities?",
-    "What is Médéric's most recent degree?",
-    "What is Médéric's latest work experience?",
-    "Is Médéric proficient in Python programming?",
-]
 # %% FUNCTIONS
 def answer(message: str, history: list[str]) -> str:
     """Answer questions about my resume."""
-    tokens = ENCODING.encode(message)
-    print("History:", len(history))
-    print("Tokens:", len(tokens))
-    return message
 # %% INTERFACES
@@ -60,7 +115,6 @@ interface = gr.ChatInterface(
     undo_btn=None,
 )
-# %% ENTRYPOINTS
 if __name__ == "__main__":
     interface.launch()

 import logging
 import gradio as gr
 import lib
 # %% CONFIGS
+# %% - Frontend
+# Look and texts of the chat interface.
+# NOTE(review): presumably passed to gr.ChatInterface, whose call is not
+# fully visible from here - confirm.
 THEME = "glass"
 TITLE = "Fmind Chatbot"
+EXAMPLES = [
+    "Who is Médéric Hurier (Fmind)?",
+    "Is Fmind open to new opportunities?",
+    "Can you share details about Médéric PhD?",
+    "Elaborate on Médéric current work position",
+    "Describe his proficiency with Python programming",
+    "What is the answer to life, the universe, and everything?",
+]
+# %% - Backend
+# Callable returned by lib.get_language_model(); answer() calls it with
+# `messages=...` to get the model reply.
+MODEL = lib.get_language_model()
 CLIENT = lib.get_database_client(path=lib.DATABASE_PATH)
+# Text encoder: answer() uses len(ENCODING(text)) to count tokens.
+ENCODING = lib.get_encoding_function()
+EMBEDDING = lib.get_embedding_function()
+# Collection named lib.DATABASE_COLLECTION, queried by answer().
 COLLECTION = CLIENT.get_collection(
     name=lib.DATABASE_COLLECTION,
+    embedding_function=EMBEDDING,
 )
+# %% - Answer
+# Content of the system message; answer() appends the retrieved documents
+# right after it.
+PROMPT_CONTEXT = """
+You are Fmind Chatbot, specialized in providing information regarding Médéric Hurier's (known as Fmind) professional background.
+Médéric is an MLOps engineer based in Luxembourg. He is currently working at Decathlon. His calendar is booked until the conclusion of 2024.
+Your responses should be succinct and maintain a professional tone. If inquiries deviate from Médéric's professional sphere, courteously decline to engage.
+You may find more information about Médéric below (markdown format):
+"""
+# Token budget checked by answer() before appending each document.
+# NOTE(review): this is the whole lib.MODEL_INPUT_LIMIT, so nothing is
+# reserved for the reply or for the per-message overhead of the chat
+# format - confirm whether a margin is needed.
+PROMPT_MAX_TOKENS = lib.MODEL_INPUT_LIMIT
+# answer() stops adding documents at the first one whose distance to the
+# query is greater than this value.
+QUERY_MAX_DISTANCE = 0.4
+# Number of results requested from COLLECTION.query().
+QUERY_N_RESULTS = 20
 # %% FUNCTIONS
 def answer(message: str, history: list[str]) -> str:
     """Answer questions about my resume."""
+    # counters
+    n_tokens = 0
+    # messages
+    messages = []
+    # - context
+    n_tokens += len(ENCODING(PROMPT_CONTEXT))
+    messages += [{"role": "system", "content": PROMPT_CONTEXT}]
+    # - history
+    for user_content, assistant_content in history:
+        n_tokens += len(ENCODING(user_content))
+        n_tokens += len(ENCODING(assistant_content))
+        messages += [{"role": "user", "content": user_content}]
+        messages += [{"role": "assistant", "content": assistant_content}]
+    # - message
+    n_tokens += len(ENCODING(message))
+    messages += [{"role": "user", "content": message}]
+    # database
+    results = COLLECTION.query(query_texts=message, n_results=QUERY_N_RESULTS)
+    logging.info("Results: %s", results)
+    distances = results["distances"][0]
+    documents = results["documents"][0]
+    for distance, document in zip(distances, documents):
+        # - distance
+        logging.debug("Doc distance: %f", distance)
+        if distance > QUERY_MAX_DISTANCE:
+            break
+        # - document
+        n_document_tokens = len(ENCODING(document))
+        logging.debug("Doc tokens: %f", n_document_tokens)
+        if (n_tokens + n_document_tokens) >= PROMPT_MAX_TOKENS:
+            break
+        n_tokens += n_document_tokens
+        messages[0]["content"] += document
+    # response
+    logging.info("Tokens: %d", n_tokens)
+    logging.info("Messages: %s", messages)
+    api_response = MODEL(messages=messages)
+    logging.info("Response: %s", api_response.to_dict_recursive())
+    # content
+    content = api_response["choices"][0]["message"]["content"]
+    # return
+    return content
 # %% INTERFACES
     undo_btn=None,
 )
 if __name__ == "__main__":
     interface.launch()

database.py CHANGED Viewed

@@ -35,9 +35,15 @@ def segment_text(text: str, pattern: str) -> T.Iterator[tuple[str, str]]:
     return pairs
-def import_file(file: T.TextIO, collection: lib.Collection) -> int:
     """Import a markdown file to a database collection."""
-    imported = 0
     text = file.read()
     filename = file.name
     segments_h1 = segment_text(text=text, pattern=r"^# (.+)")
@@ -45,14 +51,19 @@ def import_file(file: T.TextIO, collection: lib.Collection) -> int:
         logging.debug('\t- H1: "%s" (%d)', h1, len(h1_text))
         segments_h2 = segment_text(text=h1_text, pattern=r"^## (.+)")
         for h2, content in segments_h2:
-            logging.debug('\t\t- H2: "%s" (%d)', h2, len(content))
             id_ = f"{filename} # {h1} ## {h2}"  # unique doc id
             document = f"# {h1}\n\n## {h2}\n\n{content.strip()}"
             metadata = {"filename": filename, "h1": h1, "h2": h2}
-            assert len(content) < 8000, f"Content is too long: #{h1} ##{h2}"
             collection.add(ids=id_, documents=document, metadatas=metadata)
-            imported += len(document)
-    return imported
 def main(args: list[str] | None = None) -> int:
@@ -64,6 +75,9 @@ def main(args: list[str] | None = None) -> int:
     logging.info("Database path: %s", database_path)
     client = lib.get_database_client(path=database_path)
     logging.info("- Reseting database client: %s", client.reset())
     # embedding
     embedding_function = lib.get_embedding_function()
     logging.info("Embedding function: %s", embedding_function)
@@ -76,8 +90,12 @@ def main(args: list[str] | None = None) -> int:
     # files
     for i, file in enumerate(opts.files):
         logging.info("Importing file %d: %s", i, file.name)
-        imported = import_file(file=file, collection=collection)
-        logging.info("- Docs imported from file %s: %d chars", i, imported)
     # return
     return 0

     return pairs
+def import_file(
+    file: T.TextIO,
+    collection: lib.Collection,
+    encoding_function: T.Callable,
+    max_output_tokens: int = lib.ENCODING_OUTPUT_LIMIT,
+) -> tuple[int, int]:
     """Import a markdown file to a database collection."""
+    n_chars = 0
+    n_tokens = 0
     text = file.read()
     filename = file.name
     segments_h1 = segment_text(text=text, pattern=r"^# (.+)")
         logging.debug('\t- H1: "%s" (%d)', h1, len(h1_text))
         segments_h2 = segment_text(text=h1_text, pattern=r"^## (.+)")
         for h2, content in segments_h2:
+            content_chars = len(content)
+            content_tokens = len(encoding_function(content))
+            logging.debug('\t\t- H2: "%s" (%d)', h2, content_chars)
             id_ = f"{filename} # {h1} ## {h2}"  # unique doc id
             document = f"# {h1}\n\n## {h2}\n\n{content.strip()}"
             metadata = {"filename": filename, "h1": h1, "h2": h2}
+            assert (
+                content_tokens < max_output_tokens
+            ), f"Content is too long ({content_tokens}): #{h1} ##{h2}"
             collection.add(ids=id_, documents=document, metadatas=metadata)
+            n_tokens += content_tokens
+            n_chars += content_chars
+    return n_chars, n_tokens
 def main(args: list[str] | None = None) -> int:
     logging.info("Database path: %s", database_path)
     client = lib.get_database_client(path=database_path)
     logging.info("- Reseting database client: %s", client.reset())
+    # encoding
+    encoding_function = lib.get_encoding_function()
+    logging.info("Encoding function: %s", encoding_function)
     # embedding
     embedding_function = lib.get_embedding_function()
     logging.info("Embedding function: %s", embedding_function)
     # files
     for i, file in enumerate(opts.files):
         logging.info("Importing file %d: %s", i, file.name)
+        n_chars, n_tokens = import_file(
+            file=file, collection=collection, encoding_function=encoding_function
+        )
+        logging.info(
+            "- Docs imported from file %s: %d chars | %d tokens", i, n_chars, n_tokens
+        )
     # return
     return 0

lib.py CHANGED Viewed

@@ -5,6 +5,7 @@
 __import__("pysqlite3")
 import os
 import sys
@@ -12,6 +13,8 @@ import sys
 sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
 import chromadb
 from chromadb.utils import embedding_functions
 # %% CONFIGS
@@ -20,7 +23,13 @@ DATABASE_COLLECTION = "resume"
 DATABASE_PATH = "database"
 EMBEDDING_MODEL = "text-embedding-ada-002"
-EMBEDDING_TOKENIZER = "cl100k_base"
 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
@@ -31,20 +40,33 @@ Collection = chromadb.Collection
 # %% FUNCTIONS
 def get_database_client(path: str) -> chromadb.API:
     """Get a persistent client to the Chroma DB."""
-    settings = chromadb.Settings(
-        allow_reset=True,
-        anonymized_telemetry=False,
-    )
     return chromadb.PersistentClient(path=path, settings=settings)
 def get_embedding_function(
     model_name: str = EMBEDDING_MODEL, api_key: str = OPENAI_API_KEY
 ) -> embedding_functions.EmbeddingFunction:
     """Get the embedding function for Chroma DB collections."""
     return embedding_functions.OpenAIEmbeddingFunction(
-        model_name=model_name,
-        api_key=api_key,
     )

 __import__("pysqlite3")
+import functools
 import os
 import sys
+import typing as T
 sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
 import chromadb
+import openai
+import tiktoken
 from chromadb.utils import embedding_functions
 # %% CONFIGS
 DATABASE_PATH = "database"
 EMBEDDING_MODEL = "text-embedding-ada-002"
+ENCODING_NAME = "cl100k_base"
+ENCODING_OUTPUT_LIMIT = 8191
+MODEL_NAME = "gpt-3.5-turbo-16k"
+MODEL_INPUT_LIMIT = 16_385
+MODEL_TEMPERATURE = 0.9
 OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 # %% FUNCTIONS
+def get_language_model(
+    model: str = MODEL_NAME,
+    api_key: str = OPENAI_API_KEY,
+    temperature: float = MODEL_TEMPERATURE,
+) -> openai.ChatCompletion:
+    """Get an OpenAI ChatCompletion model."""
+    openai.api_key = api_key  # configure the API key globally
+    return functools.partial(
+        openai.ChatCompletion.create, model=model, temperature=temperature
+    )
 def get_database_client(path: str) -> chromadb.API:
     """Get a persistent client to the Chroma DB."""
+    settings = chromadb.Settings(allow_reset=True, anonymized_telemetry=False)
     return chromadb.PersistentClient(path=path, settings=settings)
+def get_encoding_function(encoding_name: str = ENCODING_NAME) -> tiktoken.Encoding:
+    """Get the encoding function for OpenAI models."""
+    return tiktoken.get_encoding(encoding_name=encoding_name).encode
 def get_embedding_function(
     model_name: str = EMBEDDING_MODEL, api_key: str = OPENAI_API_KEY
 ) -> embedding_functions.EmbeddingFunction:
     """Get the embedding function for Chroma DB collections."""
     return embedding_functions.OpenAIEmbeddingFunction(
+        model_name=model_name, api_key=api_key
     )