#!/usr/bin/env python3
"""Chat helpers around Gemini and Together-hosted Llama models.

Configuration is read from environment variables at import time:

* ``thinker``      -- ``"gemini"``, ``"70b"`` or ``"405b"`` (default ``"405b"``);
  selects what ``thinker_chat`` talks to.
* ``temperature``  -- sampling temperature (default ``0.1``).
* ``cache``        -- ``"1"`` (default) makes the helper functions pass
  ``use_cache=True`` to the chat functions.

Importing this module has side effects: it configures the Gemini client,
(for the Llama thinkers) loads the on-disk response cache and creates the
Together client, and prints the selected host via ``who_are_you()``.
"""
# NOTE: the star import intentionally stays ahead of the stdlib imports, as in
# the original file, so that a name exported by ``utils`` can never shadow a
# stdlib module imported below. ``utils`` is expected to provide
# ``location__`` and the colour constants (RED, GREEN, ...) used here.
import utils
from utils import *

import os
import sys
import lzma
import json
import pprint
import time
import subprocess

thinker = os.getenv("thinker", "405b")
# 0.0 is conservative (good for coding and correct syntax)
TEMPERATURE = float(os.getenv("temperature", 0.1))

LLM_HOST = "gemini"
TKNZ_RATIO = 1

GEMINI_MODEL = 'gemini-1.5-pro-002'
FLASH_MODEL = 'gemini-1.5-flash-002'
MAX_OUTPUT_TOKENS = 1024*8

# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Prompting.ipynb
# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Streaming.ipynb
import google.generativeai as genai  # pip install -U -q google-generativeai

llm_log_filename = f"{location__}/.cache/llm.log"

# SECURITY(review): an API key is committed in source. It is kept only as a
# fallback so existing setups keep working; set GOOGLE_API_KEY to override it,
# then rotate the committed key and delete the literal.
genai.configure(
    api_key=os.getenv("GOOGLE_API_KEY", "AIzaSyAUeHVWLkYioIGk6PMbCTqk73PowHCIyPM")
)

GEMINI_CLIENT = genai.GenerativeModel(
    GEMINI_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=MAX_OUTPUT_TOKENS,
        temperature=TEMPERATURE,
    ),
)


def _append_log(path, text):
    """Append *text* to the log file at *path*."""
    with open(path, "at") as f:
        f.write(text)


def _load_thinker_cache(path):
    """Load the xz-compressed response cache stored at *path*.

    The file holds pairs of lines: a JSON-encoded message list (used verbatim
    as the cache key) followed by the JSON-encoded response content.

    Returns:
        dict mapping key line (without its trailing newline) to the decoded
        content; empty when the file does not exist.

    Raises:
        ValueError: if the file has an odd number of lines, i.e. a key
            without a value.
    """
    if not os.path.exists(path):
        return {}
    with lzma.open(path, "rt") as f:  # closed deterministically
        lines = f.readlines()
    if len(lines) % 2 != 0:
        raise ValueError(
            f"Corrupt cache {path}: expected key/value line pairs, "
            f"got {len(lines)} lines"
        )
    # Every line ends with "\n"; strip it from the key.
    return {
        key.rstrip("\n"): json.loads(value)
        for key, value in zip(lines[0::2], lines[1::2])
    }


def _acquire_lock(path, retries=5, delay=0.2):
    """Create the lock file at *path*, waiting briefly if it already exists.

    The file is created with ``O_CREAT | O_EXCL`` so that checking for the
    lock and taking it is a single atomic step (the previous
    exists-then-touch sequence let two writers both believe they held it).

    Raises:
        AssertionError: if the lock is still held after ``retries`` sleeps of
            ``delay`` seconds (1 second with the defaults). The exception type
            matches the original ``assert`` but is not stripped by ``-O``.
    """
    while True:
        try:
            os.close(os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY))
            return
        except FileExistsError:
            # Someone else is writing; wait and retry.
            if retries <= 0:
                raise AssertionError(
                    f"Bị lock hơn 1 second, có thể xóa {path} nếu lỗi này lặp lại"
                ) from None
            retries -= 1
            time.sleep(delay)


def _release_lock(path):
    """Remove the lock file at *path*; a missing file is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def chat(prompt, history=None, use_cache=False, stream=False):
    """Send *prompt* to the Gemini model and return the conversation.

    Args:
        prompt: text sent to the model.
        history: optional list of earlier ``{"role", "content"}`` messages.
            It is only echoed back in the result ("fake history"); the model
            receives just *prompt*.
        use_cache: accepted for signature parity with ``thinker_chat``;
            not used by this function.
        stream: when true, return the streaming response object from
            ``generate_content`` directly, without logging.

    Returns:
        ``history`` plus the user message plus the assistant message, as a
        new list (or the raw streaming response when ``stream`` is true).

    Raises:
        Exception: whatever the Gemini call (or reading ``res.text``) raises;
            it is logged and printed, then re-raised.
    """
    if stream:
        return GEMINI_CLIENT.generate_content(prompt, stream=True)

    messages = list(history or []) + [{"role": "user", "content": prompt}]
    _append_log(
        llm_log_filename,
        f"\n- - - [ {GEMINI_MODEL} ] - - -\n\nPROMPT:\n{prompt}\n",
    )
    try:
        res = GEMINI_CLIENT.generate_content(
            prompt, request_options={"timeout": 6000}
        )
        # Two separate writes: the raw response is logged even if reading
        # ``res.text`` raises.
        _append_log(llm_log_filename, f"\nRESPONSE:\n{res}\n")
        _append_log(llm_log_filename, f"\nCONTENT:\n{res.text}\n")
        messages.append({"role": "assistant", "content": res.text})
        return messages
    except Exception as e:
        _append_log(llm_log_filename, f"\nEXCEPTION:\n{e}\n")
        print(f"\nEXCEPTION:\n{e}\n")
        raise  # bare raise keeps the original traceback


FLASH_CLIENT = genai.GenerativeModel(
    FLASH_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=MAX_OUTPUT_TOKENS,
        temperature=TEMPERATURE,
    ),
)

# A dedicated flash implementation is currently disabled; flash_chat simply
# aliases chat (so FLASH_CLIENT is not used by it):
# def flash_chat(prompt, history=[], use_cache=False, stream=False):
#     res = FLASH_CLIENT.generate_content(prompt)
#     return [{"role": "assistant", "content": res.text}]
flash_chat = chat


def who_are_you():
    """Print the current ``LLM_HOST`` label twice, in red."""
    print(f"{RED}{LLM_HOST}{RESET} " * 2)


if thinker == "gemini":  # gemini pro
    # Gemini is not a constraint here: 128k or 1M context lengths are both OK.
    CTXLEN = 1024*64
    thinker_chat = chat

elif thinker in ("70b", "405b"):  # exact match, not a substring test
    cache_filename = f"{location__}/.cache/thinker.jsonl.xz"
    lock_filename = f"{location__}/.cache/thinker.lock"
    log_filename = f"{location__}/.cache/thinker.log"

    thinker_cache = _load_thinker_cache(cache_filename)

    # https://docs.together.ai/docs/chat-models#hosted-models
    # Each entry is "<model name> <context length in k>".
    _model_spec = {
        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 128k",  # $3.50 / 1m tokens(*)
        "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo 128k",  # $0.88 / 1m tokens(*)
    }[thinker]

    model, _ctx_k = _model_spec.strip().split()
    LLM_HOST = model

    # Cap the context at 64k, then reserve room for the generated tokens.
    CTXLEN = min(int(_ctx_k[:-1]), 64)
    CTXLEN = CTXLEN*1024 - MAX_OUTPUT_TOKENS

    from together import Together

    # SECURITY(review): an API key is committed in source. It is kept only as
    # a fallback so existing setups keep working; set TOGETHER_API_KEY to
    # override it, then rotate the committed key and delete the literal.
    together_client = Together(
        api_key=os.getenv(
            "TOGETHER_API_KEY",
            'adc0db56b77fe6508bdeadb4d8253771750a50639f8e87313153e49d4599f6ea',
        )
    )
    ###
    stops = ["<|eot_id|>","<|eom_id|>","",""]

    def thinker_chat(prompt, history=None, stream=False, use_cache=True, testing=False):
        """Send *prompt* to the Together-hosted model, with an on-disk cache.

        Args:
            prompt: text of the new user message.
            history: optional list of earlier ``{"role", "content"}``
                messages; sent to the model ahead of *prompt* (ignored when
                streaming, where only *prompt* is sent).
            stream: when true, log the prompt and return the streaming
                response object directly; no caching takes place.
            use_cache: when true, reuse a cached answer for the identical
                message list if one exists.
            testing: when true and no cached answer is used, return the
                fixed content ``"testing testing"`` without calling the API.

        Returns:
            ``history`` plus the user message plus the assistant message, as
            a new list (or the raw streaming response when ``stream`` is
            true).

        Raises:
            Exception: whatever the API call raises; it is logged and
                printed, then re-raised.
            AssertionError: if the cache lock file stays present for more
                than about one second.
        """
        prompt_log = f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n"

        if stream:
            _append_log(log_filename, prompt_log)
            return together_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_OUTPUT_TOKENS,
                temperature=TEMPERATURE,
                top_p=0.7,
                top_k=50,
                repetition_penalty=1.2,
                stop=stops,
                stream=True,
            )

        messages = list(history or []) + [{"role": "user", "content": prompt}]
        # The serialized message list is the cache key.
        messages_jsonl = json.dumps(messages, ensure_ascii=False)

        if use_cache and messages_jsonl in thinker_cache:
            print(f"{YELLOW}<<< cached content >>>{RESET}")
            content = thinker_cache[messages_jsonl]

        elif testing:
            print(f"{RED}<<< testing content >>>{RESET}")
            content = "testing testing"

        else:
            print(f"{GREEN}<<< fresh content >>>{RESET}")
            _append_log(log_filename, prompt_log)
            try:
                response = together_client.chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=MAX_OUTPUT_TOKENS,
                    temperature=TEMPERATURE,
                    top_p=0.7,
                    top_k=50,
                    repetition_penalty=1.2,
                    stop=stops,
                    logprobs=1,
                    stream=False,
                )
            except Exception as e:
                _append_log(log_filename, f"\nEXCEPTION:\n{e}\n")
                print(f"\nEXCEPTION:\n{e}\n")
                raise  # bare raise keeps the original traceback

            content = response.choices[0].message.content
            _append_log(
                log_filename,
                f"\nRESPONSE:\n{response}\n" f"\nCONTENT:\n{content}\n",
            )

            # Record the freshly generated content in memory and on disk.
            thinker_cache[messages_jsonl] = content

            _acquire_lock(lock_filename)
            try:
                with lzma.open(cache_filename, "at") as f:
                    f.write(f"{messages_jsonl}\n{json.dumps(content, ensure_ascii=False)}\n")
            finally:
                # Always unlock, even if the write failed; a stale lock file
                # would block every later writer.
                _release_lock(lock_filename)

        messages.append({"role": "assistant", "content": content})
        return messages

else:
    # Previously an unknown value surfaced later as an unrelated
    # KeyError/NameError; fail early with an actionable message instead.
    raise ValueError(
        f"Unsupported thinker {thinker!r}; expected 'gemini', '70b' or '405b'"
    )

LLM_HOST += f"__{round(CTXLEN/1024)}k_ctxlen"
who_are_you()

from prompts import summary_template, docchat_template
from prompts import contextual_template, clean_view_template

USE_CACHE = os.getenv("cache", "1") == "1"


def query_documents(documents, query):
    """Fill ``docchat_template`` with *documents* and *query*, ask ``chat``.

    The prompt and the answer are printed; the call is bracketed by
    ``utils.reset_timer`` / ``utils.measure_time`` under the ``"docchat"``
    timer name.

    Returns:
        The content of the last message returned by ``chat``.
    """
    prompt = docchat_template.format(documents=documents, question=query)
    print(f"{GREEN}{prompt}{RESET}")

    utils.reset_timer(timer="docchat")
    res = chat(prompt, use_cache=USE_CACHE)
    utils.measure_time("", timer="docchat")

    raw = res[-1]["content"]
    print(f"{MAGENTA}{raw}{RESET}")
    return raw


def extract_keyphrases_figures_summary(text):
    """Fill ``summary_template`` with *text* and ask ``chat``.

    Texts shorter than 80 characters are skipped. The input text (not the
    full prompt) and the answer are printed.

    Returns:
        ``""`` for texts shorter than 80 characters, otherwise the content of
        the last message returned by ``chat``.
    """
    if len(text) < 80:
        return ""

    prompt = summary_template.format(text=text)
    print(f"{GREEN}{text}{RESET}")

    utils.reset_timer(timer="extract_keyphrases_figures_summary")
    res = chat(prompt, use_cache=USE_CACHE)
    utils.measure_time("", timer="extract_keyphrases_figures_summary")

    raw = res[-1]["content"]
    print(f"{MAGENTA}{raw}{RESET}")
    return raw


def gen_contextual(document, chunk):
    """Fill ``contextual_template`` and ask ``thinker_chat``.

    Returns:
        The stripped content of the last message returned.
    """
    prompt = contextual_template.format(document=document, chunk=chunk)
    res = thinker_chat(prompt, use_cache=USE_CACHE)
    return res[-1]["content"].strip()


def gen_clean_view(document):
    """Fill ``clean_view_template`` and ask ``chat``.

    Returns:
        The stripped content of the last message returned.
    """
    prompt = clean_view_template.format(document=document)
    res = chat(prompt, use_cache=USE_CACHE)
    return res[-1]["content"].strip()


if __name__ == "__main__":
    # Optional first argument: path of a file whose content is the question.
    filename = sys.argv[1] if len(sys.argv) > 1 else None
    if filename:
        with open(filename, "rt") as f:
            q = f.read()
    else:
        q = "What's your name? Who created you?"

    utils.reset_timer()
    res = thinker_chat(q, use_cache=False)
    utils.measure_time(LLM_HOST + " ")
    print(f"{CYAN}{q}{RESET}", end="\n\n")
    print(res[-1]["content"])