Islam YAHIAOUI committed
Commit 0308e6e
1 Parent(s): 5759fed

Correction
Helpers.py CHANGED
@@ -3,7 +3,7 @@ import json
  import spacy
  import string
 
- def generate_prompt(context, question, history):
+ def generate_prompt(context, question, history=None):
 
      # history_summary = ""
      # if history:
@@ -14,16 +14,15 @@ def generate_prompt(context, question, history):
      else:
          prompt_context = "No context provided."
      prompt = f"""
- <s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.<</SYS>>
+ <s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content, and dont mention that you used the provided context .<</SYS>>
 
- Context:
+ Context \n :
  {prompt_context}
 
  [INST] {question} [/INST]
-
- Response:
  """
 
+     # Response:
      return prompt
 
  # ==============================================================================================================================================
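
For context, a minimal sketch of how the updated generate_prompt could be called after this change. The document list and question are illustrative only, and the exact prompt layout depends on the unchanged parts of Helpers.py:

    # Hypothetical usage of Helpers.generate_prompt as changed in this commit;
    # history now defaults to None, so a first turn needs no third argument.
    from Helpers import generate_prompt

    docs = [
        "Paris is the capital and largest city of France.",
        "France is a country in Western Europe.",
    ]

    prompt = generate_prompt(docs, "What is the capital of France?")
    # The returned string wraps the context and question in the
    # <s>[INST] <<SYS>> ... <</SYS>> ... [INST] question [/INST] template shown above.
    print(prompt)
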
__pycache__/Helpers.cpython-312.pyc CHANGED
Binary files a/__pycache__/Helpers.cpython-312.pyc and b/__pycache__/Helpers.cpython-312.pyc differ
 
__pycache__/rag.cpython-312.pyc CHANGED
Binary files a/__pycache__/rag.cpython-312.pyc and b/__pycache__/rag.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,12 +1,14 @@
  import gradio as gr
  from huggingface_hub import InferenceClient
+ import os
  from rag import run_rag
  """
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
  """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
+ token = os.environ.get("token_HF", None)
+ client = InferenceClient("tiiuae/falcon-11B",token= token)
 
+ print(token)
  def respond(
      message,
      history: list[tuple[str, str]],
@@ -22,9 +24,9 @@ def respond(
              messages.append({"role": "user", "content": val[0]})
          if val[1]:
              messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": run_rag(message)})
 
+     messages.append({"role": "user", "content": run_rag(message)})
+
      response = ""
 
      for message in client.chat_completion(
@@ -35,19 +37,21 @@ def respond(
          top_p=top_p,
      ):
          token = message.choices[0].delta.content
-
-         response += token
-         yield response
+         response += str(token)
 
+         yield response
+
  """
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
  """
  demo = gr.ChatInterface(
      respond,
+     title="Retrieval Augmented Generation (RAG) Chatbot" ,
+     fill_height=True,
      additional_inputs=[
-         gr.Textbox(value="You are a useful and capable assistant .", label="System message"),
+         gr.Textbox(value="You are a friendly Chatbot.", label="System message" ),
          gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature" ),
          gr.Slider(
              minimum=0.1,
              maximum=1.0,
@@ -56,8 +60,14 @@ demo = gr.ChatInterface(
              label="Top-p (nucleus sampling)",
          ),
      ],
+     examples=[
+         [
+             "What is the capital of France?",
+             "What happend in 11 september 2001?",
+             "who is the president of the United States?"
+         ] ],
  )
 
 
  if __name__ == "__main__":
-     demo.launch(share=True)
+     demo.launch()
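
For reference, a standalone sketch of the streaming pattern respond now relies on, assuming the Space secret token_HF is set and the hosted tiiuae/falcon-11B endpoint accepts chat-completion requests:

    import os
    from huggingface_hub import InferenceClient

    # Same model and secret name as app.py uses after this commit.
    client = InferenceClient("tiiuae/falcon-11B", token=os.environ.get("token_HF", None))

    messages = [
        {"role": "system", "content": "You are a friendly Chatbot."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    response = ""
    for chunk in client.chat_completion(
        messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95
    ):
        delta = chunk.choices[0].delta.content
        response += str(delta)  # str() mirrors app.py's guard against a None delta on the last chunk
    print(response)
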
example.py ADDED
@@ -0,0 +1,102 @@
+ import gradio as gr
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     BitsAndBytesConfig,
+ )
+ import os
+ from threading import Thread
+ import spaces
+ import time
+
+ token = os.environ["HF_TOKEN"]
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "NousResearch/Hermes-2-Pro-Llama-3-8B", quantization_config=quantization_config, token=token
+ )
+ tok = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", token=token)
+ terminators = [
+     tok.eos_token_id,
+     tok.convert_tokens_to_ids("<|eot_id|>")
+ ]
+
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")
+
+ # model = model.to(device)
+ # Dispatch Errors
+
+
+ @spaces.GPU(duration=150)
+ def chat(message, history, temperature,do_sample, max_tokens):
+     chat = []
+     for item in history:
+         chat.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             chat.append({"role": "assistant", "content": item[1]})
+     chat.append({"role": "user", "content": message})
+     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+     model_inputs = tok([messages], return_tensors="pt").to(device)
+     streamer = TextIteratorStreamer(
+         tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )
+
+     if temperature == 0:
+         generate_kwargs['do_sample'] = False
+
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         yield partial_text
+
+     tokens = len(tok.tokenize(partial_text))
+     yield partial_text
+
+
+ demo = gr.ChatInterface(
+     fn=chat,
+     examples=[["Write me a poem about Machine Learning."]],
+     # multimodal=False,
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
+     additional_inputs=[
+         gr.Slider(
+             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+         ),
+         gr.Checkbox(label="Sampling",value=True),
+         gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=512,
+             label="Max new tokens",
+             render=False,
+         ),
+     ],
+     stop_btn="Stop Generation",
+     title="Chat With LLMs",
+     description="Now Running [NousResearch/Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B) in 4bit"
+ )
+ demo.launch()
rag.py CHANGED
@@ -25,6 +25,5 @@ def run_rag(query, history=None):
      indices = [result.index for result in rerank_docs.results]
      documents = get_docs_by_indices(docs, indices)
      prompt = generate_prompt(documents, query, history)
-     print("Prompt: ", prompt)
-     # response = llama(prompt)
-     return prompt
+
+     return query , prompt
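
With this change run_rag returns the original query together with the generated prompt instead of the prompt alone, so callers unpack a pair. A minimal sketch, assuming rag.py's retrieval pipeline is importable as-is:

    from rag import run_rag

    # run_rag now yields a (query, prompt) tuple rather than just the prompt string.
    query, prompt = run_rag("What is the capital of France?")

    # The prompt string is what would typically be sent to the model, e.g.
    # messages.append({"role": "user", "content": prompt}) before chat_completion.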