Islam YAHIAOUI committed
Commit 0308e6e
1 Parent(s): 5759fed

Correction
Helpers.py CHANGED
@@ -3,7 +3,7 @@ import json
  import spacy
  import string
 
- def generate_prompt(context, question, history):
+ def generate_prompt(context, question, history=None):
 
      # history_summary = ""
      # if history:
@@ -14,16 +14,15 @@ def generate_prompt(context, question, history):
      else:
          prompt_context = "No context provided."
      prompt = f"""
- <s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.<</SYS>>
+ <s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content, and dont mention that you used the provided context .<</SYS>>
 
- Context:
+ Context \n :
  {prompt_context}
 
  [INST] {question} [/INST]
-
- Response:
  """
 
+     # Response:
      return prompt
 
  # ==============================================================================================================================================
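
For context, a minimal sketch of how the updated generate_prompt could be called after this change. The document list and question are illustrative only, and the exact prompt layout depends on the unchanged parts of Helpers.py:

    # Hypothetical usage of Helpers.generate_prompt as changed in this commit;
    # history now defaults to None, so a first turn needs no third argument.
    from Helpers import generate_prompt

    docs = [
        "Paris is the capital and largest city of France.",
        "France is a country in Western Europe.",
    ]

    prompt = generate_prompt(docs, "What is the capital of France?")
    # The returned string wraps the context and question in the
    # <s>[INST] <<SYS>> ... <</SYS>> ... [INST] question [/INST] template shown above.
    print(prompt)
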
__pycache__/Helpers.cpython-312.pyc CHANGED
Binary files a/__pycache__/Helpers.cpython-312.pyc and b/__pycache__/Helpers.cpython-312.pyc differ
 
__pycache__/rag.cpython-312.pyc CHANGED
Binary files a/__pycache__/rag.cpython-312.pyc and b/__pycache__/rag.cpython-312.pyc differ
 
app.py CHANGED
@@ -1,12 +1,14 @@
  import gradio as gr
  from huggingface_hub import InferenceClient
+ import os
  from rag import run_rag
  """
  For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
  """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
+ token = os.environ.get("token_HF", None)
+ client = InferenceClient("tiiuae/falcon-11B",token= token)
 
+ print(token)
  def respond(
      message,
      history: list[tuple[str, str]],
@@ -22,9 +24,9 @@ def respond(
              messages.append({"role": "user", "content": val[0]})
          if val[1]:
              messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": run_rag(message)})
 
+     messages.append({"role": "user", "content": run_rag(message)})
+
      response = ""
 
      for message in client.chat_completion(
@@ -35,19 +37,21 @@ def respond(
          top_p=top_p,
      ):
          token = message.choices[0].delta.content
-
-         response += token
-         yield response
+         response += str(token)
 
+         yield response
+
  """
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
  """
  demo = gr.ChatInterface(
      respond,
+     title="Retrieval Augmented Generation (RAG) Chatbot" ,
+     fill_height=True,
      additional_inputs=[
-         gr.Textbox(value="You are a useful and capable assistant .", label="System message"),
+         gr.Textbox(value="You are a friendly Chatbot.", label="System message" ),
          gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature" ),
          gr.Slider(
              minimum=0.1,
              maximum=1.0,
@@ -56,8 +60,14 @@ demo = gr.ChatInterface(
              label="Top-p (nucleus sampling)",
          ),
      ],
+     examples=[
+         [
+             "What is the capital of France?",
+             "What happend in 11 september 2001?",
+             "who is the president of the United States?"
+         ] ],
  )
 
 
  if __name__ == "__main__":
-     demo.launch(share=True)
+     demo.launch()
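
For reference, a standalone sketch of the streaming pattern respond now relies on, assuming the Space secret token_HF is set and the hosted tiiuae/falcon-11B endpoint accepts chat-completion requests:

    import os
    from huggingface_hub import InferenceClient

    # Same model and secret name as app.py uses after this commit.
    client = InferenceClient("tiiuae/falcon-11B", token=os.environ.get("token_HF", None))

    messages = [
        {"role": "system", "content": "You are a friendly Chatbot."},
        {"role": "user", "content": "What is the capital of France?"},
    ]

    response = ""
    for chunk in client.chat_completion(
        messages, max_tokens=512, stream=True, temperature=0.7, top_p=0.95
    ):
        delta = chunk.choices[0].delta.content
        response += str(delta)  # str() mirrors app.py's guard against a None delta on the last chunk
    print(response)
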
example.py ADDED
@@ -0,0 +1,102 @@
+ import gradio as gr
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     BitsAndBytesConfig,
+ )
+ import os
+ from threading import Thread
+ import spaces
+ import time
+
+ token = os.environ["HF_TOKEN"]
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "NousResearch/Hermes-2-Pro-Llama-3-8B", quantization_config=quantization_config, token=token
+ )
+ tok = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", token=token)
+ terminators = [
+     tok.eos_token_id,
+     tok.convert_tokens_to_ids("<|eot_id|>")
+ ]
+
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")
+
+ # model = model.to(device)
+ # Dispatch Errors
+
+
+ @spaces.GPU(duration=150)
+ def chat(message, history, temperature,do_sample, max_tokens):
+     chat = []
+     for item in history:
+         chat.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             chat.append({"role": "assistant", "content": item[1]})
+     chat.append({"role": "user", "content": message})
+     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+     model_inputs = tok([messages], return_tensors="pt").to(device)
+     streamer = TextIteratorStreamer(
+         tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )
+
+     if temperature == 0:
+         generate_kwargs['do_sample'] = False
+
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         yield partial_text
+
+     tokens = len(tok.tokenize(partial_text))
+     yield partial_text
+
+
+ demo = gr.ChatInterface(
+     fn=chat,
+     examples=[["Write me a poem about Machine Learning."]],
+     # multimodal=False,
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
+     additional_inputs=[
+         gr.Slider(
+             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+         ),
+         gr.Checkbox(label="Sampling",value=True),
+         gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=512,
+             label="Max new tokens",
+             render=False,
+         ),
+     ],
+     stop_btn="Stop Generation",
+     title="Chat With LLMs",
+     description="Now Running [NousResearch/Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B) in 4bit"
+ )
+ demo.launch()
rag.py CHANGED
@@ -25,6 +25,5 @@ def run_rag(query, history=None):
      indices = [result.index for result in rerank_docs.results]
      documents = get_docs_by_indices(docs, indices)
      prompt = generate_prompt(documents, query, history)
-     print("Prompt: ", prompt)
-     # response = llama(prompt)
-     return prompt
+
+     return query , prompt
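
With this change run_rag returns the original query together with the generated prompt instead of the prompt alone, so callers unpack a pair. A minimal sketch, assuming rag.py's retrieval pipeline is importable as-is:

    from rag import run_rag

    # run_rag now yields a (query, prompt) tuple rather than just the prompt string.
    query, prompt = run_rag("What is the capital of France?")

    # The prompt string is what would typically be sent to the model, e.g.
    # messages.append({"role": "user", "content": prompt}) before chat_completion.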