import gradio as gr
import requests
import json
import subprocess
import time
import os
import stat
import shutil
import hashlib
from functools import lru_cache

# Cache directory for persisted responses
CACHE_DIR = "response_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def get_cache_key(prompt):
    """Generate a unique cache key for a prompt."""
    return hashlib.md5(prompt.encode()).hexdigest()


def save_to_cache(prompt, response):
    """Save a response to a cache file on disk."""
    cache_key = get_cache_key(prompt)
    cache_file = os.path.join(CACHE_DIR, f"{cache_key}.json")
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump({'prompt': prompt, 'response': response}, f)


def load_from_cache(prompt):
    """Load a response from the disk cache if it exists."""
    cache_key = get_cache_key(prompt)
    cache_file = os.path.join(CACHE_DIR, f"{cache_key}.json")
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            if data['prompt'] == prompt:  # Double-check the prompt to guard against hash collisions
                return data['response']
    return None


@lru_cache(maxsize=100)  # In-memory cache for frequently used prompts
def get_cached_response(prompt):
    return load_from_cache(prompt)


def setup_ollama():
    try:
        print("Starting Ollama setup process...")

        # Download the Ollama tarball
        tarball_url = "https://ollama.com/download/ollama-linux-amd64.tgz"
        print(f"Downloading Ollama from {tarball_url}")
        subprocess.run(f"curl -L {tarball_url} -o ollama.tgz", shell=True, check=True)

        if not os.path.exists("ollama.tgz"):
            raise Exception("Failed to download ollama.tgz")

        # Create a clean directory and extract the tarball into it
        print("Creating directory and extracting tarball...")
        if os.path.exists("ollama_dir"):
            shutil.rmtree("ollama_dir")
        os.makedirs("ollama_dir", exist_ok=True)

        # List current directory contents for debugging
        print("Current directory contents:")
        subprocess.run("ls -la", shell=True)

        # Extract with verbose output
        result = subprocess.run(
            "tar -xvzf ollama.tgz -C ollama_dir",
            shell=True,
            capture_output=True,
            text=True
        )
        print("Tar command output:", result.stdout)
        if result.stderr:
            print("Tar command errors:", result.stderr)

        # Check the extraction result
        print("Extracted directory contents:")
        subprocess.run("ls -la ollama_dir", shell=True)

        # Make sure the ollama binary exists and is executable
        ollama_path = "./ollama_dir/bin/ollama"
        if not os.path.exists(ollama_path):
            raise Exception(f"Ollama binary not found at {ollama_path}")
        os.chmod(ollama_path, stat.S_IRWXU)

        # Set LD_LIBRARY_PATH to include the bundled lib directory
        lib_path = os.path.abspath("./ollama_dir/lib/ollama")
        os.environ["LD_LIBRARY_PATH"] = f"{lib_path}:{os.environ.get('LD_LIBRARY_PATH', '')}"

        # Start the Ollama server
        print("Starting Ollama server...")
        ollama_process = subprocess.Popen(
            [ollama_path, "serve"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=os.environ
        )

        # Wait for the Ollama server to come up
        max_attempts = 30
        for attempt in range(max_attempts):
            try:
                requests.get("http://localhost:11434/api/tags", timeout=2)
                print(f"Ollama server is running (attempt {attempt + 1})")
                break
            except Exception:
                print(f"Waiting for Ollama server... (attempt {attempt + 1}/{max_attempts})")
                # Check if the server process is still running
                if ollama_process.poll() is not None:
                    stdout, stderr = ollama_process.communicate()
                    print("Ollama process terminated unexpectedly")
                    print("stdout:", stdout.decode() if stdout else "None")
                    print("stderr:", stderr.decode() if stderr else "None")
                    raise Exception("Ollama process terminated unexpectedly")
                time.sleep(1)
        else:
            raise Exception(f"Ollama server did not respond after {max_attempts} attempts")

        # Pull the model
        print("Pulling Qwen 2.5 model...")
        subprocess.run([ollama_path, "pull", "qwen2.5:1.5b"], check=True)

        return ollama_process

    except Exception as e:
        print(f"Error setting up Ollama: {str(e)}")
        # Print the full stack trace for debugging
        import traceback
        traceback.print_exc()
        return None


def query_ollama(prompt):
    try:
        # Check the cache first
        cached_response = get_cached_response(prompt)
        if cached_response is not None:
            print("Using cached response")
            yield cached_response
            return

        # Accumulated response text plus a small buffer to batch streamed chunks
        response_text = ""
        buffer = ""
        chunk_size = 0

        # Make a streaming request to Ollama
        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': 'qwen2.5:1.5b',
                'prompt': prompt,
                'stream': True,
                'options': {
                    'num_ctx': 2048,      # context window size
                    'num_predict': 1000,  # maximum number of tokens to generate
                },
            },
            stream=True
        )

        # Process the stream
        for line in response.iter_lines():
            if line:
                json_response = json.loads(line)
                if 'response' in json_response:
                    chunk = json_response['response']
                    buffer += chunk
                    chunk_size += len(chunk)

                    # Only yield when the buffer reaches a certain size or on completion
                    if chunk_size >= 20 or json_response.get('done', False):
                        response_text += buffer
                        yield response_text
                        buffer = ""
                        chunk_size = 0

                if json_response.get('done', False):
                    # Yield any remaining buffer
                    if buffer:
                        response_text += buffer
                        yield response_text
                    # Save the complete response to the disk cache and reset the
                    # in-memory cache so it no longer holds a stale miss for this prompt
                    save_to_cache(prompt, response_text)
                    get_cached_response.cache_clear()
                    break

    except Exception as e:
        yield f"Error: {str(e)}"


def create_interface():
    iface = gr.Interface(
        fn=query_ollama,
        inputs=gr.Textbox(
            label="Prompt",
            placeholder="Enter your prompt here...",
            lines=3
        ),
        outputs=gr.Textbox(
            label="Response",
            lines=5,
            show_copy_button=True
        ),
        title="Qwen 2.5 Chat Interface",
        description="Chat with the Qwen 2.5 model using an Ollama backend",
        examples=[
            ["Tell me a short story about space exploration"],
            ["Explain how photosynthesis works"],
            ["Write a haiku about artificial intelligence"]
        ],
        cache_examples=False,  # Disable Gradio's example caching
        examples_per_page=10
    )
    return iface


if __name__ == "__main__":
    print("Starting Ollama setup...")

    # Set up Ollama
    ollama_process = setup_ollama()

    if ollama_process:
        try:
            print("Starting Gradio interface...")
            # Launch the Gradio interface
            iface = create_interface()
            iface.queue()
            iface.launch(
                server_name="0.0.0.0",
                server_port=7860,
                share=False,
                root_path="",
                show_error=True  # Show detailed error messages
            )
        finally:
            print("Cleaning up...")
            # Clean up the Ollama process and downloaded files
            ollama_process.terminate()
            ollama_process.wait()
            subprocess.run("rm -rf ollama_dir ollama.tgz", shell=True)
            # Keep the cache directory so cached responses persist
    else:
        print("Failed to start Ollama server")