"""Streamlit test bench that sends a prompt to a text-generation endpoint.

The page shows a text box pre-filled with an example prompt and a button;
pressing the button streams the generated text into the page token by token.
"""
import os

import requests
import streamlit as st
from huggingface_hub import InferenceClient

# NOTE(review): the endpoint URL is empty in this file; it presumably has to be
# filled in with the Inference Endpoint address before deployment -- confirm.
API_URL = ''
API_KEY = os.getenv('API_KEY')

# Headers used by query() for plain (non-streaming) HTTP requests.
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}

# Prompt Set of Examples:
prompt = "Write instructions to teach anyone to write a discharge plan. List the entities, features and relationships to CCDA and FHIR objects in boldface."


def StreamLLMChatResponse(prompt):
    """Stream a completion for ``prompt`` and render it live in the page.

    A single Streamlit placeholder is created and overwritten with the text
    accumulated so far (in italics) every time a non-empty token arrives.

    Args:
        prompt: Text sent to the endpoint as the generation prompt.

    Returns:
        None. The output is written to the Streamlit page.
    """
    client = InferenceClient(API_URL, token=API_KEY)
    gen_kwargs = dict(
        max_new_tokens=512,
        top_k=30,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.02,
        # No empty-string entry here: an empty stop string is not a meaningful
        # marker and would make the loop below stop at the first token whose
        # text happens to be empty.
        stop_sequences=["\nUser:", "<|endoftext|>"],
    )
    stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)

    report = []
    res_box = st.empty()
    for r in stream:
        if r.token.special:
            # Special tokens are never shown to the user.
            continue
        text = r.token.text
        if text in gen_kwargs["stop_sequences"]:
            break
        if not text:
            # Nothing new to display for an empty token.
            continue
        report.append(text)
        result = "".join(report).strip()
        try:
            res_box.markdown(f'*{result}*')
        except Exception:
            # Rendering is best-effort: a failed update must not abort the
            # stream. Only Exception is caught so that interpreter-level
            # signals derived from BaseException still propagate.
            st.write(' ')


def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    The decoded JSON body is also written to the Streamlit page.

    Args:
        payload: JSON-serialisable request body.

    Returns:
        The response body decoded from JSON.
    """
    response = requests.post(API_URL, headers=headers, json=payload)
    result = response.json()  # decode once; reused for display and return
    st.markdown(result)
    return result


def get_output(prompt):
    """Send ``prompt`` through query() as ``{"inputs": prompt}`` and return the reply."""
    return query({"inputs": prompt})


def main():
    """Build the page: title, prompt input box, and the button that runs the prompt."""
    st.title("Medical Llama Test Bench with Inference Endpoints Llama 7B")
    # The input box is pre-filled with the module-level example prompt.
    example_input = st.text_input("Enter your example text:", value=prompt)
    if st.button("Run Prompt With Dr Llama"):
        StreamLLMChatResponse(example_input)


if __name__ == "__main__":
    main()