QueryYourDocs / inference_main.py
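"""Minimal Streamlit app that streams chat completions from the Hugging Face
Inference API (meta-llama/Llama-3.2-1B-Instruct), with a commented-out local
inference path.

Run (standard Streamlit invocation; HF_TOKEN must be set in the environment):
    HF_TOKEN=<your token> streamlit run inference_main.py
"""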
import os

import streamlit as st
from huggingface_hub import InferenceClient

# Local-inference helpers; only used by the commented-out local path below.
from modules.inference.instruct import infer, load_model

# Hugging Face API token, read from the environment (raises KeyError if unset).
token = os.environ["HF_TOKEN"]
client = InferenceClient(model="meta-llama/Llama-3.2-1B-Instruct", token=token)
st.write("## Ask your Local LLM")
text_input = st.text_input("Query", value="Why is the sky blue?")
submit = st.button("Submit")
# Cached local-model path (disabled). st.cache_resource keeps the model in
# memory across Streamlit reruns instead of reloading it on every interaction.
# @st.cache_resource
# def load_model_cached():
#     return load_model()
#
# model = load_model_cached()
if submit:
    # Local-inference path (disabled): would query the cached model directly.
    # response = infer(model, text_input)
    # response

    # Stream a chat completion from the Hugging Face Inference API.
    output = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": text_input},
        ],
        stream=True,
        max_tokens=1024,
    )
    # Render tokens as they arrive; delta.content can be None on the final chunk.
    for chunk in output:
        st.write(chunk.choices[0].delta.content or "")
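# A minimal alternative rendering sketch (assumes Streamlit >= 1.31, which
# introduced st.write_stream): wrap the chunks in a generator and let
# st.write_stream render the response as one growing message instead of a
# separate element per chunk. If enabled, this belongs inside the `if submit:`
# block in place of the loop above.
#
# def token_stream():
#     for chunk in output:
#         yield chunk.choices[0].delta.content or ""
#
# st.write_stream(token_stream())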