#!/usr/bin/env python
# coding: utf-8
from os import listdir
from os.path import isdir
from fastapi import FastAPI, HTTPException, Request, responses, Body
from fastapi.middleware.cors import CORSMiddleware
from llama_cpp import Llama

from pydantic import BaseModel
from enum import Enum
from typing import Optional

# MODEL LOADING, FUNCTIONS, AND TESTING

# Load the GGUF model once, at import time. ask_llm and the start-up smoke
# test further down both call this single module-level instance.
# NOTE(review): use_mmap=False / use_mlock=True are forwarded to llama_cpp.Llama
# unchanged; presumably they read the weights fully into RAM and pin them
# there -- confirm against the llama-cpp-python documentation.
print("Loading model...")
wphys = Llama(model_path="/models/Wllama-phys-8b.gguf", use_mmap=False, use_mlock=True)

# The FastAPI application object that the middleware and routes below are
# registered on. title/description/version are passed as application metadata.
app = FastAPI(
    title="Kong",
    description="Kodthae",
    version="1.0.0",
)

# Reference only: the bare string below is an expression statement with no
# runtime effect. It sketches, as a series of list appends, a chat layout of
# <|system|> / <|user|> / <|assistant|> turns, each terminated by
# <|end_of_text|>; the prompt built in ask_llm uses the same markers.
"""
  chat = []
  chat.append("<|system|>")
  chat.append(f"system  You are helpful assistant. please answer the question.<|end_of_text|>")
  chat.append("<|user|>")
  chat.append(f"{q}<|end_of_text|>")
  chat.append("<|assistant|>")
  chat.append(f"{a}<|end_of_text|>")
"""

def extract_restext(response):
  """Return the text of the first choice in a completion response, stripped of surrounding whitespace."""
  first_choice = response['choices'][0]
  raw_text = first_choice['text']
  return raw_text.strip()

def ask_llm(question, max_new_tokens=200, temperature=0.5):
  """Ask the module-level model a single question and return its cleaned answer.

  The question is wrapped in a fixed system/user/assistant prompt, passed to
  wphys, and any chat marker left in the completion text is removed.

  Args:
    question: Text inserted into the user turn of the prompt.
    max_new_tokens: Passed to the model as max_tokens.
    temperature: Passed to the model as temperature.

  Returns:
    The stripped completion text with all chat markers removed.
  """
  chat_markers = ("<|end_of_text|>", "<|user|>", "<|assistant|>")

  # The prompt starts with an empty line and every following line carries a
  # two-space indent.
  # NOTE(review): the system line is wrapped in literal double quotes -- confirm
  # that this is intended and not a leftover from an earlier string literal.
  prompt_lines = [
    "",
    "  <|system|>",
    '  "system  You are helpful assistant. please answer the question.<|end_of_text|>"',
    "  <|user|>",
    f"  {question}<|end_of_text|>",
    "  <|assistant|>",
  ]
  prompt = "\n".join(prompt_lines)

  completion = wphys(
    prompt,
    max_tokens=max_new_tokens,
    temperature=temperature,
    stop=list(chat_markers),
    echo=False,
  )

  # Remove any marker that still appears in the extracted text.
  answer = extract_restext(completion)
  for marker in chat_markers:
    answer = answer.replace(marker, "")
  return answer


# TESTING THE MODEL
print("Testing model...")
# Smoke test: run one short completion at start-up to check that the model can
# run at all. An explicit check is used instead of `assert` because assert
# statements are removed under `python -O`, which would silently skip the
# model call altogether.
if not wphys("Hello!, How are you today?", max_tokens=10):
  raise RuntimeError("Model smoke test returned an empty result.")
print("Ready.")


# Cross-origin configuration: every origin, HTTP method and request header is
# allowed, and credentials are enabled.
# NOTE(review): the FastAPI CORS documentation says credentials require
# explicitly listed origins rather than a wildcard -- confirm whether
# credentialed cross-origin requests are actually needed, and if so replace
# "*" with the real origin list.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"]
)

# API DATA CLASSES

class QuestionResponse(BaseModel):
  """Payload describing the outcome of a question sent to the model."""
  # Status code repeated in the body; 200 unless set otherwise.
  code: int = 200
  # The question that was asked, when one is available.
  question: Optional[str] = None
  # The model's answer, or an error message when the request failed.
  # Declared Optional so the annotation agrees with the None default.
  answer: Optional[str] = None
  # Generation settings used for the request (e.g. temperature, max_new_tokens).
  config: Optional[dict] = None

# API ROUTES
@app.get('/')
def docs():
  """Redirects the user from the main page to the docs."""
  # Relative target path, passed to the redirect response as its URL.
  docs_location = './docs'
  return responses.RedirectResponse(url=docs_location)

@app.post('/questions/open-ended')
async def ask_gemmaWild(
    prompt: str = Body(..., embed=True, example="Why is ice cream so delicious?"),
    temperature: float = Body(0.5, embed=True),
    max_new_tokens: int = Body(200, embed=True)
) -> QuestionResponse:
  """Answer an open-ended question with the Wllama-phys-8b model.

  Args:
    prompt: The question to ask; must be a non-empty string.
    temperature: Sampling temperature forwarded to ask_llm.
    max_new_tokens: Token limit forwarded to ask_llm.

  Returns:
    A QuestionResponse carrying the answer, the original question and the
    generation settings that were used.

  Raises:
    HTTPException: status 400 when prompt is empty, status 500 when the
      model call fails. In both cases the detail is a JSON-encoded
      QuestionResponse.
  """
  # Imported locally so this handler does not depend on a new file-level import.
  from fastapi.encoders import jsonable_encoder

  # Guard clause: the body field is required, but an empty string still passes
  # validation, so it is rejected here. The exception is raised (not returned)
  # so FastAPI turns it into a real 400 response, and the model is encoded to
  # plain JSON types because the error response cannot serialise a pydantic
  # object.
  if not prompt:
    raise HTTPException(
      status_code=400,
      detail=jsonable_encoder(
        QuestionResponse(code=400, answer="Request argument 'prompt' not provided.")
      ),
    )

  try:
    print(f'Asking Wllama-phys-8b with the question "{prompt}"')
    # NOTE: ask_llm is a plain synchronous call inside an async handler, so the
    # event loop is occupied until generation finishes.
    result = ask_llm(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
    print(f"Result: {result}")
  except Exception as e:
    # Report any generation failure as a 500 whose detail keeps the
    # QuestionResponse shape; chain the cause for server-side tracebacks.
    raise HTTPException(
      status_code=500,
      detail=jsonable_encoder(
        QuestionResponse(code=500, answer=str(e), question=prompt)
      ),
    ) from e

  return QuestionResponse(
    answer=result,
    question=prompt,
    config={"temperature": temperature, "max_new_tokens": max_new_tokens},
  )