import os

import streamlit as st
from huggingface_hub import InferenceClient
from modules.inference.instruct import infer, load_model  # used only by the local-inference path below

# Authenticate against the Hugging Face Inference API with a token from the environment.
token = os.environ["HF_TOKEN"]
client = InferenceClient(model="meta-llama/Llama-3.2-1B-Instruct", token=token)
st.write("## Ask your Local LLM") | |
text_input = st.text_input("Query", value="Why is the sky Blue") | |
submit = st.button("Submit") | |
# Optional local-inference path: load the model once and cache it across
# Streamlit reruns with @st.cache_resource.
# @st.cache_resource
# def load_model_cached():
#     return load_model()
# model = load_model_cached()
if submit:
    # Local-inference alternative (requires the cached model above):
    # response = infer(model, text_input)
    # st.write(response)
    output = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": text_input},
        ],
        stream=True,
        max_tokens=1024,
    )
    # st.write_stream (Streamlit >= 1.31) renders the streamed tokens as one
    # growing text element; calling st.write once per chunk would put every
    # token on its own line. delta.content can be None on the final chunk,
    # so substitute an empty string.
    st.write_stream(chunk.choices[0].delta.content or "" for chunk in output)
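
# A minimal sketch of the launch steps, assuming this file is saved as app.py
# and HF_TOKEN is a Hugging Face token with Inference API access:
#   export HF_TOKEN=hf_...
#   streamlit run app.py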