from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import httpx
import asyncio
import gradio as gr
import os
from dotenv import load_dotenv
import spaces
import requests
import random
from faker import Faker
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
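# Global registry: loaded model instances plus placeholder identifiers for common
# generation/config options (only the 'models' entry is actually used below).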
global_data = {
'models': {},
'tokens': {
'eos': 'eos_token',
'pad': 'pad_token',
'padding': 'padding_token',
'unk': 'unk_token',
'bos': 'bos_token',
'sep': 'sep_token',
'cls': 'cls_token',
'mask': 'mask_token',
'n_ctx': 'n_ctx_token',
'vocab_size': 'vocab_size_token',
'n_embd': 'n_embd_token',
'n_head': 'n_head_token',
'n_layer': 'n_layer_token',
'n_vocab': 'n_vocab_token',
'block_size': 'block_size_token',
'n_rot': 'n_rot_token',
'rope_dim': 'rope_dim_token',
'rope_scaling': 'rope_scaling_token',
'n_positions': 'n_positions_token',
'use_cache': 'use_cache_token',
'use_parallel_inference': 'use_parallel_inference_token',
'parallel_inference_count': 'parallel_inference_count_token',
'use_mlock': 'use_mlock_token',
'use_mmap': 'use_mmap_token',
'use_cpu': 'use_cpu_token',
'f16_kv': 'f16_kv_token',
'f16_quant': 'f16_quant_token',
'f16_output': 'f16_output_token',
'use_flash_attn': 'use_flash_attn_token',
'max_seq_len': 'max_seq_len_token',
'do_sample': 'do_sample_token',
'top_k': 'top_k_token',
'top_p': 'top_p_token',
'temperature': 'temperature_token',
'num_return_sequences': 'num_return_sequences_token',
'use_repetition_penalty': 'use_repetition_penalty_token',
'repetition_penalty': 'repetition_penalty_token',
'no_repeat_ngram_size': 'no_repeat_ngram_size_token',
'bad_words_ids': 'bad_words_ids_token',
'use_token_logging': 'use_token_logging_token',
'use_tensor_parallel': 'use_tensor_parallel_token',
'tensor_parallel_size': 'tensor_parallel_size_token',
'use_gpu_memory_growth': 'use_gpu_memory_growth_token',
'use_multi_gpu_inference': 'use_multi_gpu_inference_token',
'multi_gpu_inference_count': 'multi_gpu_inference_count_token'
}
}
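# GGUF checkpoints to pull from the Hugging Face Hub: (repo_id, filename, display name).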
model_configs = [
{"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
{"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
{"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
{"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
{"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
{"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
{"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
{"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
{"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
{"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
{"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
{"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
{"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
{"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
{"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
{"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
{"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
{"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
{"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
]
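# Downloads each configured GGUF file from the Hugging Face Hub and loads it with
# llama-cpp-python; models are loaded concurrently in a thread pool.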
class ModelManager:
def __init__(self):
self.models = {}
def load_model(self, model_config):
if model_config['name'] not in self.models:
try:
print(f"Loading model {model_config['name']}...")
self.models[model_config['name']] = Llama.from_pretrained(
repo_id=model_config['repo_id'],
filename=model_config['filename'],
use_auth_token=HUGGINGFACE_TOKEN
)
print(f"Model {model_config['name']} loaded successfully.")
except Exception as e:
print(f"Error loading model {model_config['name']}: {e}")
def load_all_models(self):
with ThreadPoolExecutor() as executor:
for config in model_configs:
executor.submit(self.load_model, config)
return self.models
model_manager = ModelManager()
global_data['models'] = model_manager.load_all_models()
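# Request schema for a chat message (not used by the Gradio interface below).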
class ChatRequest(BaseModel):
message: str
def normalize_input(input_text):
return input_text.strip()
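# Strip repeated prompt-template echoes (e.g. duplicated "[/INST]" fragments) and
# drop duplicate lines from a model completion.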
def remove_duplicates(text):
text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
text = text.replace('[/INST]', '')
lines = text.split('\n')
unique_lines = []
seen_lines = set()
for line in lines:
if line not in seen_lines:
unique_lines.append(line)
seen_lines.add(line)
return '\n'.join(unique_lines)
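# Helper (currently not wired into the request flow): fetch a random proxy from an
# external list hosted on another Space.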
PROXY_URL = "https://uhhy-fsfsfs.hf.space/valid"
def get_random_proxy():
    try:
        # Use a timeout so a slow proxy endpoint cannot hang the request indefinitely.
        response = requests.get(PROXY_URL, timeout=10)
        proxies = [line for line in response.text.splitlines() if line.strip()]
        return random.choice(proxies) if proxies else None
    except Exception as e:
        print(f"Error fetching proxy: {e}")
        return None
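# Faker instance used to fabricate IPv4 addresses (also not used in the request flow).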
fake = Faker()
def generate_fake_ip():
return fake.ipv4()
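# Request a GPU for each inference call on ZeroGPU Spaces; `duration` caps how many
# seconds a single call may hold the GPU (the value used below is an assumption).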
@spaces.GPU(duration=120)
def generate_model_response(model, inputs):
    try:
        print(f"Generating response with model: {model.model_path}")
        response = model(inputs)
        print(f"Response from {model.model_path}: {response}")
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating model response: {e}")
        return "Error generating response. Please try again later."
def remove_repetitive_responses(responses):
    # Keep only the first model that produced each distinct response text.
    unique_responses = {}
    seen_texts = set()
    for model_name, response in responses.items():
        if response not in seen_texts:
            unique_responses[model_name] = response
            seen_texts.add(response)
    return unique_responses
async def process_message(message):
    inputs = normalize_input(message)
    responses = {}
    with ThreadPoolExecutor() as executor:
        # Map each future back to its model name so answers are labelled correctly.
        futures = {
            executor.submit(generate_model_response, model, inputs): name
            for name, model in global_data['models'].items()
        }
        for future in as_completed(futures):
            model_name = futures[future]
            try:
                responses[model_name] = future.result()
            except Exception as e:
                print(f"Error with model {model_name}: {e}")
                responses[model_name] = "Error generating response. Please try again later."
    unique_responses = remove_repetitive_responses(responses)
    formatted_response = ""
    for model_name, response in unique_responses.items():
        formatted_response += f"**{model_name}:**\n{response}\n\n"
    # Example request shown to the user; the /generate endpoint is illustrative and
    # not actually exposed by this Gradio app.
    curl_command = f"""
curl -X POST -H "Content-Type: application/json" \\
     -d '{{"message": "{message}"}}' \\
     http://localhost:7860/generate
"""
    return formatted_response, curl_command
iface = gr.Interface(
fn=process_message,
inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
outputs=[gr.Markdown(), gr.Textbox(label="cURL command")],
title="Multi-Model LLM API",
description="Enter a message and get responses from multiple LLMs.",
)
if __name__ == "__main__":
port = int(os.environ.get("PORT", 7860))
iface.launch(server_port=port)
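# Example (hypothetical sketch): querying the running app programmatically with
# gradio_client rather than the curl command shown in the UI. The URL and the
# default "/predict" api_name are assumptions based on how gr.Interface exposes
# its endpoint; adjust them for your deployment.
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860")
#   markdown_output, curl_cmd = client.predict("Hello there", api_name="/predict")
#   print(markdown_output)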