Spaces:

aiwithankit
/

chatwithllama

Sleeping

File size: 2,072 Bytes

447f2e2
8a0052b
 
 
 
 
 
000dca3
 
8a0052b
 
 
 
 
 
 
 
 
000dca3
8a0052b
 
 
 
 
000dca3
8a0052b
000dca3
8a0052b
000dca3
8a0052b
 
 
 
 
 
 
 
000dca3
8a0052b
 
000dca3
8a0052b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
000dca3
4218b21
8a0052b
 
000dca3
 
 
 
 
d396c97
d0f00bb
 
000dca3
 
1671f40

import gradio as gr
# from huggingface_hub import InferenceClient

# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     messages.append({"role": "user", "content": message})

#     response = ""

#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content

#         response += token
#         yield response

from transformers import pipeline
import torch

# Hub checkpoint to load. You can switch to the 3B variant if needed.
model_id = "unsloth/Llama-3.2-1B-Instruct"

# The pipeline is created once, at module import, and reused by every request.
text_pipeline = pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# prompt= input("Please enter your query: ")
# outputs = text_pipeline(prompt, max_new_tokens=150)
# response = outputs[0]["generated_text"]
# print(response)

import gradio as gr

def _content_text(content):
    """Return the plain text carried by one chat message's content.

    Strings are returned unchanged. Any other kind of content yields "" so the
    caller can skip it, except for the list form described below.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): assumes list content is a sequence of
        # {"type": "text", "text": ...} parts, as newer Gradio releases
        # appear to send -- confirm against the installed Gradio version.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_messages(history):
    """Convert chat history into a list of {"role", "content"} message dicts.

    Accepts both history layouts: (user, assistant) pairs and role/content
    dicts. Entries that carry no usable text are dropped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            candidates = [(turn.get("role"), turn.get("content"))]
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            candidates = [("user", turn[0]), ("assistant", turn[1])]
        else:
            continue
        for role, content in candidates:
            text = _content_text(content)
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
    return messages


def generated_response(prompt, history):
    """Generate the assistant's reply to ``prompt`` given the prior ``history``.

    Args:
        prompt: The latest user message.
        history: Previous turns of the conversation as supplied by the chat
            UI; may be empty or None.

    Returns:
        The newly generated assistant text only (at most 150 new tokens).
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": prompt})
    # Passing role/content messages makes the pipeline apply the model's chat
    # template. return_full_text=False stops the reply from echoing the input,
    # which the default (prompt + continuation) would do.
    outputs = text_pipeline(
        messages,
        max_new_tokens=150,
        return_full_text=False,
    )
    return outputs[0]["generated_text"]

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""

# Chat UI that routes every user message through generated_response.
# The title string is shown as the page header.
demo = gr.ChatInterface(
    fn=generated_response,
    title="This model is running on cpu so it will effect reasoning and inference time will be slow",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)