from typing import Any, List, Mapping, Optional

import chatglm_cpp
from langchain import LLMChain, PromptTemplate
from langchain.callbacks.manager import CallbackManager, CallbackManagerForLLMRun
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms.base import LLM

DEFAULT_MODEL_PATH = "chatglm2-6b-ggml.q8_0.bin"

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Load the quantized ChatGLM2 model once at module level and share it across calls.
# Note: this uses the pre-0.3 chatglm-cpp API, where chat()/stream_chat() take a
# list of strings as history; newer releases expect ChatMessage objects instead.
pipeline = chatglm_cpp.Pipeline(DEFAULT_MODEL_PATH)


class ChatGLM(LLM):
    """Custom LangChain LLM wrapper around a local chatglm-cpp pipeline."""

    temperature: float = 0.7
    base_model: str = DEFAULT_MODEL_PATH
    max_length: int = 2048
    verbose: bool = False
    streaming: bool = False
    top_p: float = 0.9
    top_k: int = 0
    max_context_length: int = 512
    threads: int = 0  # 0 lets chatglm-cpp pick a sensible default

    @property
    def _llm_type(self) -> str:
        return "chatglm"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        print("Prompt:", prompt)
        # Single-turn call: the history is just the current prompt.
        history = [prompt]
        if self.streaming:
            # stream_chat yields the response piece by piece; concatenate the
            # pieces so that _call still returns the full string.
            response = ""
            for piece in pipeline.stream_chat(
                history,
                max_length=self.max_length,
                max_context_length=self.max_context_length,
                do_sample=self.temperature > 0,
                top_k=self.top_k,
                top_p=self.top_p,
                temperature=self.temperature,
                num_threads=self.threads,
            ):
                response += piece
            return response
        return pipeline.chat(
            history,
            max_length=self.max_length,
            max_context_length=self.max_context_length,
            do_sample=self.temperature > 0,
            top_k=self.top_k,
            top_p=self.top_p,
            temperature=self.temperature,
            num_threads=self.threads,
        )

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "temperature": self.temperature,
            "base_model": self.base_model,
            "max_length": self.max_length,
            "verbose": self.verbose,
            "streaming": self.streaming,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "max_context_length": self.max_context_length,
            "threads": self.threads,
        }


# "Xiaoming's mother has two children; one is called Daming. {question}"
template = "小明的妈妈有两个孩子,一个叫大明 {question}"
prompt = PromptTemplate(template=template, input_variables=["question"])
# "What is the other one called?"
question = "另外一个叫什么?"

llm = ChatGLM(streaming=False, callback_manager=callback_manager)
llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run(question))
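
# --- Streaming to stdout: a minimal sketch, not part of the original script ---
# As written, StreamingStdOutCallbackHandler never prints anything even with
# streaming=True, because _call only returns the final concatenated string and
# never reports individual tokens to LangChain. Assuming the same pre-0.3
# stream_chat API, one way to surface tokens as they arrive is to forward each
# piece through run_manager.on_llm_new_token inside the streaming branch:
#
#     if self.streaming:
#         response = ""
#         for piece in pipeline.stream_chat(history, ...):  # same kwargs as above
#             if run_manager is not None:
#                 run_manager.on_llm_new_token(piece)  # drives the stdout handler
#             response += piece
#         return response
#
# With that change, ChatGLM(streaming=True, callbacks=[StreamingStdOutCallbackHandler()])
# would print the response incrementally while _call still returns the full text.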