########################## FRUSTRATION PHASE ###########################
import os

import streamlit as st
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from huggingface_hub import snapshot_download

# Define pretrained and quantized model directories
pretrained_model_dir = "FPHam/Jackson_The_Formalizer_V2_13b_GPTQ"
cwd = os.getcwd()
quantized_model_dir = cwd + "/Jackson2-4bit-128g-GPTQ"

# Create the cache directory if it doesn't exist and download the model files into it
os.makedirs(quantized_model_dir, exist_ok=True)
snapshot_download(repo_id=pretrained_model_dir, local_dir=quantized_model_dir, local_dir_use_symlinks=False)

# Quantization configuration
# quantize_config = BaseQuantizeConfig(bits=4, group_size=128, damp_percent=0.01, desc_act=False)

# Load the already-quantized model using from_quantized
model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    model_basename="Jackson2-4bit-128g-GPTQ",
    use_safetensors=True,
    strict=False,
    device="cuda:0",
    # trust_remote_code=True,
    use_triton=False,
    # quantize_config=quantize_config
)
# model.save_quantized(quantized_model_dir)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)

# Streamlit app
st.title("AutoGPTQ Streamlit App")

user_input = st.text_input("Input a phrase")

# Generate output when the "Generate" button is pressed
if st.button("Generate"):
    # Move the tokenized input onto the same device as the model
    inputs = tokenizer(user_input, return_tensors="pt").to("cuda:0")
    outputs = model.generate(
        **inputs,
        do_sample=True,  # needed for temperature/top_p sampling to take effect
        max_length=512 + inputs["input_ids"].size(-1),
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    st.text(generated_text)
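To try this out, the script can be saved as, say, app.py (the file name here is just an assumption) and launched with "streamlit run app.py". Because the model is loaded onto cuda:0, a CUDA-capable GPU with enough memory for the 4-bit 13B weights is required.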