Spaces: Running on Zero
File size: 2,863 Bytes
import os
import logging
from logging.handlers import RotatingFileHandler
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10*1024*1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.debug("Application started")
model_id = "google/gemma-2-9b-it"
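# google/gemma-2-9b-it is a gated checkpoint on the Hub; access requires an authenticated token (HF_TOKEN below).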
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load model with GPU availability check
if torch.cuda.is_available():
    logger.debug("GPU is available. Proceeding with GPU setup.")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
else:
    logger.warning("GPU is not available. Proceeding with CPU setup.")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        use_auth_token=os.getenv('HF_TOKEN'),
    )
model.eval()
# Create Hugging Face pipeline
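# Sampling settings: temperature/top_k/top_p control randomness, repetition_penalty discourages loops.
# max_length is the total token budget (prompt plus completion), so long histories leave less room for the reply.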
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
)
# Initialize HuggingFacePipeline model for LangChain
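# HuggingFacePipeline wraps the transformers pipeline so LangChain chains can call it like any other LLM.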
chat_model = HuggingFacePipeline(pipeline=pipe)
logger.debug("Model and tokenizer loaded successfully")
# Define the conversation template for LangChain
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant"""
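# Note: <|im_start|>/<|im_end|> are ChatML-style markers; Gemma's own chat template uses <start_of_turn>/<end_of_turn> delimiters.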
# Create LangChain prompt and chain
prompt = PromptTemplate(
    template=template, input_variables=["system_prompt", "history", "human_input"]
)
chain = LLMChain(llm=chat_model, prompt=prompt)
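# LLMChain renders the prompt template with the supplied variables and sends the result to the pipeline.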
# Prediction function using LangChain and model
def predict(message, history=None):
    """Format the chat history, run the LangChain chain, and return the model's reply."""
    history = history or []
    formatted_history = "\n".join(
        [f"<|im_start|>{entry['role']}\n{entry['content']}<|im_end|>" for entry in history]
    )
    system_prompt = "You are a helpful coding assistant."
    try:
        result = chain.run({
            "system_prompt": system_prompt,
            "history": formatted_history,
            "human_input": message,
        })
        return result
    except Exception as e:
        logger.exception(f"Error during prediction: {e}")
        return "An error occurred."
# Gradio UI
interface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="User input"),
    outputs="text",
    allow_flagging='never',
    live=True,
)
logger.debug("Chat interface initialized; launching")
interface.launch()