Geraldine committed on
Commit 4cf1107
1 Parent(s): 8b0098e

Upload 7 files
Home.py ADDED
@@ -0,0 +1,53 @@
+ import streamlit as st
+
+ st.set_page_config(page_title="QA Inference Streamlit App using Ollama, Nvidia and Groq", layout="wide")
+
+ st.write("# QA Inference with Ollama & Nvidia & Groq as LLM providers")
+ st.markdown(
+     """
+ This app is a demo showing how to interact with LLMs from three providers: Ollama, the Nvidia Cloud and Groq.
+
+ You can use one, two or all three LLM hosting solutions, depending on your environment:
+
+ - **[Ollama](https://ollama.com/)**: a local Ollama instance must be running on http://localhost:11434 (change the base_url in clients.py if needed)
+ - **[Nvidia Cloud](https://build.nvidia.com/explore/discover)**: to test the LLMs hosted on the Nvidia Cloud, and in particular low-latency QA on Nvidia GPUs, create a (free) account and generate an API key
+ - **[Groq Cloud](https://console.groq.com/playground)**: to test the LLMs hosted on Groq, and in particular the inference speed of the Groq LPU, create a (free) account and generate an API key
+
+ The app contains two pages implementing the same kind of chatbot; the only difference is how the LLM answer is obtained:
+
+ - 👉 **App API completion** page: illustrates how to query an LLM using the providers' OpenAI-like APIs or the OpenAI client
+ - 👉 **App Langchain completion** page: illustrates how to query an LLM using the corresponding Langchain components
+     """
+ )
+
+ footer = """<style>
+ a:link, a:visited {
+     color: blue;
+     background-color: transparent;
+     text-decoration: underline;
+ }
+
+ a:hover, a:active {
+     color: red;
+     background-color: transparent;
+     text-decoration: underline;
+ }
+
+ .footer {
+     position: fixed;
+     left: 0;
+     bottom: 0;
+     width: 100%;
+     background-color: white;
+     color: black;
+     text-align: center;
+ }
+ </style>
+ <div class="footer">
+     <p>Contact 🤙 <a style='display: block; text-align: center;' href="mailto:geraldine.geoffroy@epdl.ch" target="_blank">Géraldine Geoffroy</a></p>
+ </div>
+ """
+ st.markdown(footer, unsafe_allow_html=True)
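The Home page above distinguishes two ways of getting an answer from the same model: the providers' OpenAI-like chat-completion APIs and the corresponding Langchain components. Here is a minimal stand-alone sketch of the two paths, assuming only a local Ollama instance with a pulled `llama3` model (the model name and the prompt are illustrative):

```python
# Sketch only: contrasts the two answer paths described on the Home page.
from openai import OpenAI                    # OpenAI-like client path
from langchain_community.llms import Ollama  # Langchain path

prompt = "Why is the sky blue?"

# 1) OpenAI-like API against Ollama's OpenAI-compatible endpoint
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
api_answer = client.chat.completions.create(
    model="llama3",  # assumed local model
    messages=[{"role": "user", "content": prompt}],
).choices[0].message.content

# 2) Same question through the Langchain wrapper
langchain_answer = Ollama(model="llama3").invoke(prompt)

print(api_answer)
print(langchain_answer)
```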
README.md CHANGED
@@ -1,13 +1,30 @@
- ---
- title: Qa Inference With Ollama Nvidia Or Groq
- emoji: 🐠
- colorFrom: green
- colorTo: green
- sdk: streamlit
- sdk_version: 1.34.0
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Streamlit simple QA Inference App with Ollama, Nvidia Cloud and Groq
+
+ > Post :
+
+ > Deployed : no
+
+ Two different ways to develop the same chatbot application:
+ - app_api_completion.py: QA inference with LLMs through the native chat completion API endpoints provided by Ollama, Nvidia or Groq
+ - app_langchain_completion.py: QA inference with LLMs through the dedicated Langchain wrappers for Ollama, Nvidia or Groq
+
+ You can use one, two or all three LLM hosting solutions, depending on your environment:
+
+ - a running Ollama instance: the default base_url is http://localhost:11434, but if needed (for a remote or dockerized Ollama instance, for example) you can change it in the OllamaClient in clients.py, as sketched below
+ *and/or*
+ - a valid API key for the Nvidia Cloud: [https://build.nvidia.com/explore/discover](https://build.nvidia.com/explore/discover)
+ *and/or*
+ - a valid API key for Groq Cloud: [https://console.groq.com/playground](https://console.groq.com/playground)
+
+ ```
+ git clone
+ pip install -r requirements.txt
+ streamlit run Home.py
+ ```
+
+ Running on http://localhost:8501
+
+ ![screenshot](screenshot.png)
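The base_url change mentioned above can also be kept out of the code. Below is a small, hedged variant of the OllamaClient constructor from clients.py; the `OLLAMA_BASE_URL` environment variable is an assumption of this sketch (the committed file simply hard-codes http://localhost:11434):

```python
import os

class OllamaClient:
    def __init__(self, api_key=None, model=None):
        # OLLAMA_BASE_URL is a hypothetical env var, handy for a remote or dockerized instance;
        # the committed clients.py hard-codes http://localhost:11434 instead.
        self.base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
        self.headers = {"Content-Type": "application/json"}
        self.api_key = api_key
        self.model = model
```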
clients.py ADDED
@@ -0,0 +1,160 @@
+ import requests
+ import json
+ from openai import OpenAI
+ from groq import Groq
+
+
+ class OllamaClient:
+     def __init__(
+         self,
+         api_key=None,
+         model=None,
+     ):
+         self.base_url = "http://localhost:11434"
+         self.headers = {"Content-Type": "application/json"}
+         self.api_key = api_key
+         self.model = model
+
+     def list_models(self):
+         url = f"{self.base_url}/api/tags"
+         try:
+             response = requests.get(url)
+             response.raise_for_status()  # raise an exception for HTTP errors (status codes 4xx and 5xx)
+             return response.json()  # the response is in JSON format
+         except requests.exceptions.HTTPError as http_err:
+             print(f"HTTP error occurred: {http_err}")
+         except Exception as err:
+             print(f"Other error occurred: {err}")
+
+     def api_chat_completion(self, prompt, **options):
+         url = f"{self.base_url}/api/chat"
+         options = options or {"max_tokens": 1024, "top_p": 0.7, "temperature": 0.7}
+         payload = json.dumps(
+             {
+                 "model": self.model,
+                 "messages": [{"role": "user", "content": prompt}],
+                 "options": {
+                     "num_predict": options["max_tokens"],  # num_predict limits the generated tokens (num_ctx sets the context window)
+                     "top_p": options["top_p"],
+                     "temperature": options["temperature"],
+                     # stop_sequences=["<|prompter|>","<|assistant|>","</s>"]
+                 },
+                 "stream": False,
+             }
+         )
+         response = requests.request("POST", url, headers=self.headers, data=payload)
+         return response.json()["message"]["content"]
+
+     def client_chat_completion(self, prompt, **options):
+         options = options or {"max_tokens": 1024, "top_p": 0.7, "temperature": 0.7}
+         client = OpenAI(
+             base_url=f"{self.base_url}/v1",  # Ollama exposes its OpenAI-compatible API under /v1
+             api_key=self.api_key,
+         )
+         completion = client.chat.completions.create(
+             model=self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=options["temperature"],
+             top_p=options["top_p"],
+             max_tokens=options["max_tokens"],
+             stream=False,
+         )
+         return completion.choices[0].message.content
+
+
+ class NvidiaClient:
+     def __init__(self, api_key=None, model=None):
+         self.base_url = "https://integrate.api.nvidia.com/v1"
+         self.api_key = api_key
+         self.headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {self.api_key}",
+         }
+         self.model = model
+
+     def list_models(self):
+         url = f"{self.base_url}/models"
+         response = requests.request("GET", url)  # api_key is not needed to list the available models
+         return response.json()
+
+     def api_chat_completion(self, prompt, **options):
+         url = f"{self.base_url}/chat/completions"
+         options = options or {"max_tokens": 1024, "top_p": 0.7, "temperature": 0.7}
+         payload = json.dumps(
+             {
+                 "model": self.model,
+                 "messages": [{"role": "user", "content": prompt}],
+                 "temperature": options["temperature"],
+                 "top_p": options["top_p"],
+                 "max_tokens": options["max_tokens"],
+                 "stream": False,
+             }
+         )
+         response = requests.request("POST", url, headers=self.headers, data=payload)
+         return response.json()["choices"][0]["message"]["content"]
+
+     def client_chat_completion(self, prompt, **options):
+         options = options or {"max_tokens": 1024, "top_p": 0.7, "temperature": 0.7}
+         client = OpenAI(
+             base_url=self.base_url,
+             api_key=self.api_key,
+         )
+         completion = client.chat.completions.create(
+             model=self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=options["temperature"],
+             top_p=options["top_p"],
+             max_tokens=options["max_tokens"],
+             stream=False,
+         )
+         return completion.choices[0].message.content
+
+
+ class GroqClient:
+     def __init__(self, api_key=None, model=None):
+         self.base_url = "https://api.groq.com/openai/v1"
+         self.api_key = api_key
+         self.headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {self.api_key}",
+         }
+         self.model = model
+
+     def list_models(self):
+         url = f"{self.base_url}/models"
+         response = requests.request("GET", url, headers=self.headers)
+         return response.json()
+
+     def api_chat_completion(self, prompt, **options):
+         url = f"{self.base_url}/chat/completions"
+         options = options or {"max_tokens": 1024, "top_p": 0.7, "temperature": 0.7}
+         payload = json.dumps(
+             {
+                 "model": self.model,
+                 "messages": [{"role": "user", "content": prompt}],
+                 "temperature": options["temperature"],
+                 "top_p": options["top_p"],
+                 "max_tokens": options["max_tokens"],
+                 "stream": False,
+             }
+         )
+         response = requests.request("POST", url, headers=self.headers, data=payload)
+         return response.json()["choices"][0]["message"]["content"]
+
+     def client_chat_completion(self, prompt, **options):
+         options = options or {"max_tokens": 1024, "top_p": 0.7, "temperature": 0.7}
+         client = Groq(
+             api_key=self.api_key,
+         )
+         completion = client.chat.completions.create(
+             model=self.model,
+             messages=[
+                 {"role": "system", "content": "You are a helpful assistant."},
+                 {"role": "user", "content": prompt},
+             ],
+             temperature=options["temperature"],
+             top_p=options["top_p"],
+             max_tokens=options["max_tokens"],
+             stream=False,
+         )
+         return completion.choices[0].message.content
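A short usage sketch for the three clients defined above; the model names and the API-key placeholders are illustrative assumptions, and a running local Ollama instance is assumed:

```python
from clients import OllamaClient, NvidiaClient, GroqClient

opts = {"max_tokens": 256, "top_p": 0.7, "temperature": 0.5}

# Local Ollama: list the pulled models, then query one of them
ollama = OllamaClient(api_key="ollama", model="llama3")  # "llama3" is an assumed local model
print([m["name"] for m in ollama.list_models()["models"]])
print(ollama.api_chat_completion("Why is the sky blue?", **opts))

# Nvidia and Groq follow the same pattern but need a real API key (placeholders below):
# nvidia = NvidiaClient(api_key="nvapi-...", model="meta/llama3-70b-instruct")
# groq = GroqClient(api_key="gsk_...", model="llama3-70b-8192")
# print(groq.client_chat_completion("Why is the sky blue?", **opts))
```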
pages/app_api_completion.py ADDED
@@ -0,0 +1,128 @@
+ import requests
+ import json
+ import os
+ import streamlit as st
+ from clients import OllamaClient, NvidiaClient, GroqClient
+
+ st.set_page_config(
+     page_title="QA Inference Streamlit App using Ollama, Nvidia and Groq APIs"
+ )
+
+
+ # Cache the header of the app to prevent re-rendering on each load
+ @st.cache_resource
+ def display_app_header():
+     """Display the header of the Streamlit app."""
+     st.title("QA Inference with Ollama & Nvidia & Groq as LLM providers")
+     st.subheader("ChatBot based on the providers' OpenAI-like APIs and clients")
+
+
+ # Display the header of the app
+ display_app_header()
+
+ # UI sidebar ##########################################
+ st.sidebar.subheader("Models")
+
+ # LLM
+ llm_providers = {
+     "Local Ollama": "ollama",
+     "Cloud Nvidia": "nvidia",
+     "Cloud Groq": "groq",
+ }
+ llm_provider = st.sidebar.radio(
+     "Choose your LLM Provider", llm_providers.keys(), key="llm_provider"
+ )
+ if llm_provider == "Local Ollama":
+     ollama_list_models = OllamaClient().list_models()
+     if ollama_list_models:
+         ollama_models = [x["name"] for x in ollama_list_models["models"]]
+         ollama_llm = st.sidebar.radio(
+             "Select your Ollama model", ollama_models, key="ollama_llm"
+         )  # retrieve with st.session_state["ollama_llm"]
+     else:
+         st.sidebar.error("Ollama is not running")
+ elif llm_provider == "Cloud Nvidia":
+     if nvidia_api_token := st.sidebar.text_input("Enter your Nvidia API Key"):
+         st.sidebar.info("Nvidia authentication ok")
+         nvidia_list_models = NvidiaClient().list_models()  # api_key is not needed to list the available models
+         nvidia_models = [x["id"] for x in nvidia_list_models["data"]]
+         nvidia_llm = st.sidebar.radio(
+             "Select your Nvidia LLM", nvidia_models, key="nvidia_llm"
+         )
+     else:
+         st.sidebar.warning("You must enter your Nvidia API key")
+ elif llm_provider == "Cloud Groq":
+     if groq_api_token := st.sidebar.text_input("Enter your Groq API Key"):
+         st.sidebar.info("Groq authentication ok")
+         groq_list_models = GroqClient(api_key=groq_api_token).list_models()
+         groq_models = [x["id"] for x in groq_list_models["data"]]
+         groq_llm = st.sidebar.radio("Choose your Groq LLM", groq_models, key="groq_llm")
+     else:
+         st.sidebar.warning("You must enter your Groq API key")
+
+ # LLM parameters
+ st.sidebar.subheader("Parameters")
+ max_tokens = st.sidebar.number_input("Max tokens", value=1024, key="max_tokens")
+ temperature = st.sidebar.slider(
+     "Temperature", min_value=0.0, max_value=1.0, value=0.5, step=0.1, key="temperature"
+ )
+ top_p = st.sidebar.slider(
+     "Top P", min_value=0.0, max_value=1.0, value=0.7, step=0.1, key="top_p"
+ )
+
+
+ # LLM response function ########################################
+ def get_llm_response(provider, prompt):
+     options = dict(
+         max_tokens=st.session_state["max_tokens"],
+         top_p=st.session_state["top_p"],
+         temperature=st.session_state["temperature"],
+     )
+     if provider == "ollama":
+         return OllamaClient(
+             api_key="ollama",
+             model=st.session_state["ollama_llm"],
+         ).api_chat_completion(
+             prompt, **options
+         )  # or .client_chat_completion(prompt, **options)
+     elif provider == "nvidia":
+         return NvidiaClient(
+             api_key=nvidia_api_token,
+             model=st.session_state["nvidia_llm"],
+         ).api_chat_completion(
+             prompt, **options
+         )  # or .client_chat_completion(prompt, **options)
+     elif provider == "groq":
+         return GroqClient(
+             api_key=groq_api_token,
+             model=st.session_state["groq_llm"],
+         ).api_chat_completion(
+             prompt, **options
+         )  # or .client_chat_completion(prompt, **options)
+
+
+ # UI main #####################################################
+ # Initialize chat history
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Display chat messages from history on app rerun
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # React to user input
+ if prompt := st.chat_input("What is up?"):
+     # Display user message in chat message container
+     with st.chat_message("user"):
+         st.markdown(prompt)
+     # Add user message to chat history
+     st.session_state.messages.append({"role": "user", "content": prompt})
+
+     # Display assistant response in chat message container
+     with st.chat_message("assistant"):
+         response = get_llm_response(llm_providers[st.session_state["llm_provider"]], prompt)
+         st.markdown(response)
+     # Add assistant response to chat history
+     st.session_state.messages.append({"role": "assistant", "content": response})
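Each branch of `get_llm_response` above calls `api_chat_completion` and notes in a comment that `client_chat_completion` could be used instead. Here is a hedged variant of just the Ollama branch showing that swap; it is a sketch reusing the page's session-state keys, not part of the committed file:

```python
import streamlit as st
from clients import OllamaClient

# Hypothetical variant of the "ollama" branch in get_llm_response, using the
# provider's OpenAI-style client path instead of the raw /api/chat endpoint.
def get_ollama_response_via_client(prompt, **options):
    return OllamaClient(
        api_key="ollama",
        model=st.session_state["ollama_llm"],
    ).client_chat_completion(prompt, **options)
```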
pages/app_langchain_completion.py ADDED
@@ -0,0 +1,135 @@
+ import requests
+ import json
+ import os
+ import streamlit as st
+ from langchain_community.llms import Ollama
+ from langchain_nvidia_ai_endpoints import ChatNVIDIA
+ from langchain_groq import ChatGroq
+ from langchain.chains import ConversationChain
+ from langchain.memory import ConversationBufferMemory
+ from clients import OllamaClient, GroqClient
+
+ st.set_page_config(
+     page_title="QA Inference Streamlit App using Ollama, Nvidia and Groq with the Langchain framework"
+ )
+
+
+ # Cache the header of the app to prevent re-rendering on each load
+ @st.cache_resource
+ def display_app_header():
+     """Display the header of the Streamlit app."""
+     st.title("QA Inference with Ollama & Nvidia & Groq as LLM providers")
+     st.subheader("ChatBot based on the Langchain framework")
+
+
+ # Display the header of the app
+ display_app_header()
+
+ # UI sidebar ##########################################
+ st.sidebar.subheader("Models")
+ # LLM
+ llm_providers = {
+     "Local Ollama": "ollama",
+     "Cloud Nvidia": "nvidia",
+     "Cloud Groq": "groq",
+ }
+ # hard-coded because the models returned by NvidiaClient().list_models() are not well formed for the Langchain ChatNVIDIA class
+ llms_from_nvidia = [
+     "ai-llama3-70b",
+     "ai-mistral-large",
+     "ai-gemma-7b",
+     "ai-codellama-70b",
+ ]
+ llm_provider = st.sidebar.radio(
+     "Choose your LLM Provider", llm_providers.keys(), key="llm_provider"
+ )
+ if llm_provider == "Local Ollama":
+     ollama_list_models = OllamaClient().list_models()
+     ollama_models = [x["name"] for x in ollama_list_models["models"]]
+     ollama_llm = st.sidebar.radio(
+         "Select your Ollama model", ollama_models, key="ollama_llm"
+     )  # retrieve with st.session_state["ollama_llm"]
+ elif llm_provider == "Cloud Nvidia":
+     if nvidia_api_token := st.sidebar.text_input("Enter your Nvidia API Key"):
+         os.environ["NVIDIA_API_KEY"] = nvidia_api_token
+         st.sidebar.info("Nvidia authentication ok")
+         # nvidia_models = [model.model_name for model in list_nvidia_models() if (model.model_type == "chat") & (model.model_name is not None)]  # the returned list is not usable here
+         nvidia_models = llms_from_nvidia
+         nvidia_llm = st.sidebar.radio(
+             "Select your Nvidia LLM", nvidia_models, key="nvidia_llm"
+         )
+     else:
+         st.sidebar.warning("You must enter your Nvidia API key")
+ elif llm_provider == "Cloud Groq":
+     if groq_api_token := st.sidebar.text_input("Enter your Groq API Key"):
+         st.sidebar.info("Groq authentication ok")
+         groq_list_models = GroqClient(api_key=groq_api_token).list_models()
+         groq_models = [x["id"] for x in groq_list_models["data"]]
+         groq_llm = st.sidebar.radio("Choose your Groq LLM", groq_models, key="groq_llm")
+     else:
+         st.sidebar.warning("You must enter your Groq API key")
+
+ # LLM parameters
+ st.sidebar.subheader("Parameters")
+ max_tokens = st.sidebar.number_input("Max tokens", value=1024, key="max_tokens")
+ temperature = st.sidebar.slider(
+     "Temperature", min_value=0.0, max_value=1.0, value=0.5, step=0.1, key="temperature"
+ )
+ top_p = st.sidebar.slider(
+     "Top P", min_value=0.0, max_value=1.0, value=0.7, step=0.1, key="top_p"
+ )
+
+
+ # LLM client #########################################
+ class LlmProvider:
+     def __init__(self, provider):
+         if provider == "ollama":
+             self.llm = Ollama(
+                 model=st.session_state["ollama_llm"],
+                 temperature=st.session_state["temperature"],
+                 num_predict=st.session_state["max_tokens"],  # the Ollama wrapper uses num_predict for the max generated tokens
+                 top_p=st.session_state["top_p"],
+             )
+         elif provider == "nvidia":
+             self.llm = ChatNVIDIA(
+                 model=st.session_state["nvidia_llm"],
+                 temperature=st.session_state["temperature"],
+                 max_tokens=st.session_state["max_tokens"],
+                 top_p=st.session_state["top_p"],
+             )
+         elif provider == "groq":
+             self.llm = ChatGroq(
+                 groq_api_key=groq_api_token,
+                 model_name=st.session_state["groq_llm"],
+                 temperature=st.session_state["temperature"],
+                 max_tokens=st.session_state["max_tokens"],
+                 top_p=st.session_state["top_p"],
+             )
+
+
+ # Initialize chat history
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ # Display chat messages from history on app rerun
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # React to user input
+ if prompt := st.chat_input("What is up?"):
+     # Display user message in chat message container
+     with st.chat_message("user"):
+         st.markdown(prompt)
+     # Add user message to chat history
+     st.session_state.messages.append({"role": "user", "content": prompt})
+     # Note: the chain and its memory are recreated on every rerun, so earlier turns are not carried into the Langchain memory
+     conversation = ConversationChain(
+         llm=LlmProvider(llm_providers[st.session_state["llm_provider"]]).llm,
+         memory=ConversationBufferMemory(),
+     )
+     # Display assistant response in chat message container
+     with st.chat_message("assistant"):
+         response = conversation.invoke(prompt)["response"]
+         st.markdown(response)
+     # Add assistant response to chat history
+     st.session_state.messages.append({"role": "assistant", "content": response})
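The Langchain page above wires the selected wrapper into a `ConversationChain` with a `ConversationBufferMemory` and reads the answer from `invoke(prompt)["response"]`. The same pattern, stripped of the Streamlit UI, as a minimal sketch (the local `llama3` model is an assumption):

```python
from langchain_community.llms import Ollama
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

# Same chain/memory pattern as pages/app_langchain_completion.py, without the UI.
llm = Ollama(model="llama3", temperature=0.5, top_p=0.7)  # "llama3" is an assumed local model
conversation = ConversationChain(llm=llm, memory=ConversationBufferMemory())

print(conversation.invoke("Hello, who are you?")["response"])
# The buffer memory keeps the first exchange, so the follow-up question has context:
print(conversation.invoke("Summarize what I just asked you.")["response"])
```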
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ huggingface_hub
+ langchain
+ langchain-community
+ langchain-nvidia-ai-endpoints
+ numpy
+ pandas
+ requests
+ streamlit
+ ollama
+ groq
+ langchain-groq
+ openai  # used by clients.py (from openai import OpenAI)
screenshot.png ADDED