import os

import streamlit as st
import torch
from optimum.nvidia.pipelines import pipeline

# Retrieve the Hugging Face token from environment variables.
# huggingface_hub picks up HF_TOKEN automatically, so this is only a sanity check.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    st.error("Hugging Face token not found. Please add your HF_TOKEN to the Space secrets.")
    st.stop()


@st.cache_resource
def load_pipeline(model_name):
    """Load a text-generation pipeline once and cache it across Streamlit reruns."""
    with st.spinner(f"Loading {model_name}... This may take several minutes."):
        try:
            pipe = pipeline("text-generation", model=model_name, use_fp8=True)
        except Exception as e:
            st.error(f"An error occurred: {e}")
            st.stop()
    return pipe


pipe8 = load_pipeline("unsloth/Meta-Llama-3.1-8B-bnb-4bit")
pipe8instruct = load_pipeline("SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF")


def generate_text(model, tokenizer, prompt, max_length=100):
    """Generate text directly from a model/tokenizer pair.

    Not used by the pipeline-based UI below; kept as an alternative generation path.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


st.title("LLaMA-3.1-8B vs LLaMA-3.1-8B-Instruct Comparison")

prompt = st.text_area("Enter your prompt:", height=100)
max_length = st.slider("Max output length:", min_value=50, max_value=500, value=100)

if st.button("Generate"):
    if prompt:
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("LLaMA-3.1-8B Output")
            output_8b = pipe8(prompt, max_length=max_length)
            st.write(output_8b[0]['generated_text'])
        with col2:
            st.subheader("LLaMA-3.1-8B-Instruct Output")
            output_8b_instruct = pipe8instruct(prompt, max_length=max_length)
            st.write(output_8b_instruct[0]['generated_text'])
    else:
        st.warning("Please enter a prompt.")