|
|
|
import utils; from utils import * |
|
import os, sys, lzma, json, pprint, time, subprocess |
|
|
|
# Name of the backend used for "thinker" requests, taken from the `thinker`
# environment variable ("gemini" when unset).
thinker = os.getenv("thinker", "gemini")

# Sampling temperature handed to every client built in this module.
# The fallback is given as a string so float() always receives a str,
# whether or not the `temperature` environment variable is set.
TEMPERATURE = float(os.getenv("temperature", "0.1"))

# Label of the active backend; the non-gemini setups overwrite it later.
LLM_HOST = "gemini"

# NOTE(review): the name suggests a tokenizer ratio — confirm its meaning
# with the code that consumes it.
TKNZ_RATIO = 1

# Model identifiers for the two Gemini clients.
GEMINI_MODEL = 'gemini-1.5-pro-002'
FLASH_MODEL = 'gemini-1.5-flash-002'
|
|
|
|
|
|
|
import google.generativeai as genai |
|
# Log file that the Gemini and Ollama chat helpers append prompts and replies to.
llm_log_filename = f"{location__}/data/llm.log"

# The API key is read from the GEMINI_FLASH_API_KEY environment variable.
genai.configure(api_key=os.getenv("GEMINI_FLASH_API_KEY"))

# Client for GEMINI_MODEL, limited to 4096 output tokens per reply.
GEMINI_CLIENT = genai.GenerativeModel(
    GEMINI_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=1024*4,
        temperature=TEMPERATURE,
    ),
)
|
|
|
def chat(prompt, history=None, use_cache=False, stream=False):
    """Send `prompt` to GEMINI_CLIENT and return the conversation as a list.

    Args:
        prompt: Text passed to ``GEMINI_CLIENT.generate_content``.
        history: Optional list of earlier message dicts. It is copied into
            the returned list; only `prompt` is sent to the model.
        use_cache: Accepted but not used by this function.
        stream: When true, return the result of
            ``generate_content(prompt, stream=True)`` right away, without
            logging or building a message list.

    Returns:
        With ``stream=False``: the history followed by the user message and
        an ``{"role": "assistant", "content": ...}`` entry holding the reply
        text. With ``stream=True``: the streaming response object.

    Raises:
        Exception: any error from the request or from logging the reply is
            written to the log, printed, and re-raised unchanged.
    """
    if stream:
        return GEMINI_CLIENT.generate_content(prompt, stream=True)

    # Build a fresh list so the caller's history is never mutated.
    messages = list(history or []) + [{"role": "user", "content": prompt}]
    with open(llm_log_filename, "at") as f:
        f.write(f"\n- - - [ {GEMINI_MODEL} ] - - -\n\nPROMPT:\n{prompt}\n")

    try:
        res = GEMINI_CLIENT.generate_content(prompt, request_options={"timeout": 6000})
        with open(llm_log_filename, "at") as f:
            f.write(f"\nRESPONSE:\n{res}\n")
            f.write(f"\nCONTENT:\n{res.text}\n")
        messages.append({"role": "assistant", "content": res.text})
        return messages

    except Exception as e:
        with open(llm_log_filename, "at") as f:
            f.write(f"\nEXCEPTION:\n{e}\n")
        print(f"\nEXCEPTION:\n{e}\n")
        raise  # bare raise keeps the original traceback
|
|
|
|
|
# Client for FLASH_MODEL, limited to 8192 output tokens per reply.
FLASH_CLIENT = genai.GenerativeModel(
    FLASH_MODEL,
    generation_config=genai.GenerationConfig(
        max_output_tokens=1024*8,
        temperature=TEMPERATURE,
    ),
)

# NOTE(review): flash_chat is an alias of chat, which calls GEMINI_CLIENT;
# FLASH_CLIENT is not used by it. Confirm whether that is intended.
flash_chat = chat
|
|
|
def who_are_you():
    """Print the current LLM_HOST label twice on one line, wrapped in RED/RESET."""
    label = f"{RED}{LLM_HOST}{RESET} "
    print(label + label)
|
|
|
|
|
# Select the "thinker" backend. Each branch defines CTXLEN and thinker_chat.
# NOTE(review): the `in` tests below are substring checks against a string,
# so a partial name such as "gemma2" also selects a branch — confirm that
# this shorthand is intended before tightening it.
# NOTE(review): a value that matches no branch leaves CTXLEN and thinker_chat
# unassigned by this statement.
if thinker == "gemini":
    CTXLEN = 1024 * 64
    thinker_chat = chat

elif thinker in "70b|405b":
    cache_filename = f"{location__}/data/thinker.jsonl.xz"
    lock_filename = f"{location__}/data/thinker.lock"
    log_filename = f"{location__}/data/thinker.log"

    # The cache file holds pairs of lines: the JSON-encoded message list,
    # then the JSON-encoded reply. Read it with the handle closed afterwards.
    if os.path.exists(cache_filename):
        with lzma.open(cache_filename, "rt") as f:
            lines = f.readlines()
    else:
        lines = []
    if len(lines) % 2 != 0:
        # Raised explicitly so the check also runs under `python -O`.
        raise AssertionError(f"{cache_filename} must contain an even number of lines")
    thinker_cache = {
        key_line[:-1]: json.loads(value_line)  # [:-1] drops the trailing newline
        for key_line, value_line in zip(lines[0::2], lines[1::2])
    }
    lines = None  # release the raw lines

    # Each spec is four whitespace-separated fields, unpacked just below.
    model = {
        "405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo 8k 3k 1.2",
        "70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo 128k 4k 1.2",
    }[thinker]

    model, CTXLEN, MAX_TOKENS, TKNZ_RATIO = model.strip().split()
    LLM_HOST = model

    # "3k" -> 3 * 1024; the trailing "k" is stripped before conversion.
    MAX_TOKENS = int(MAX_TOKENS[:-1]) * 1024
    TKNZ_RATIO = float(TKNZ_RATIO)

    # The context is capped at 32k and the output budget is subtracted from it.
    CTXLEN = min(int(CTXLEN[:-1]), 32)
    CTXLEN = CTXLEN * 1024 - MAX_TOKENS

    from together import Together
    together_client = Together(api_key=os.environ.get('TOGETHER_API_KEY'))

    stops = ["<|eot_id|>", "<|eom_id|>", "</answer>", "</output>"]

    def _store_in_thinker_cache(messages_jsonl, content):
        """Record a reply in thinker_cache and append it to the cache file.

        Waits up to five 0.2-second intervals for an existing lock file to
        disappear, then creates the lock, appends the two cache lines, and
        always removes the lock again, even if the write fails.

        Raises:
            AssertionError: if the lock file is still present after waiting.
        """
        thinker_cache[messages_jsonl] = content

        waits = 5
        while waits > 0 and os.path.exists(lock_filename):
            waits -= 1
            time.sleep(0.2)

        # Re-check the lock itself, so a lock released during the final
        # sleep is not reported as a failure.
        if os.path.exists(lock_filename):
            raise AssertionError(f"Bị lock hơn 1 second, có thể xóa {lock_filename} nếu lỗi này lặp lại")

        open(lock_filename, "a").close()
        try:
            with lzma.open(cache_filename, "at") as f:
                f.write(f"{messages_jsonl}\n{json.dumps(content, ensure_ascii=False)}\n")
        finally:
            try:
                os.remove(lock_filename)
            except FileNotFoundError:
                pass

    def thinker_chat(prompt, history=None, stream=False, use_cache=True, testing=False):
        """Chat with the Together-hosted model, with an on-disk reply cache.

        Args:
            prompt: User text for this turn.
            history: Optional list of earlier message dicts. It is sent along
                with the prompt in the non-streaming path only.
            stream: When true, log the prompt and return the streaming
                completion for the prompt alone; no cache is consulted.
            use_cache: When true, a reply already cached for the exact same
                message list is reused.
            testing: When true and no cached reply is used, return the fixed
                text "testing testing" instead of calling the API.

        Returns:
            With ``stream=False``: the message list with an assistant entry
            appended. With ``stream=True``: the streaming completion object.

        Raises:
            Exception: API errors are logged, printed, and re-raised.
            AssertionError: if the cache lock file stays in place.
        """
        if stream:
            with open(log_filename, "at") as f:
                f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return together_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                top_p=0.7, top_k=50,
                repetition_penalty=1.2, stop=stops,
                stream=True,
            )

        # Build a fresh list so the caller's history is never mutated.
        messages = list(history or []) + [{"role": "user", "content": prompt}]
        # The serialized message list is the cache key.
        messages_jsonl = json.dumps(messages, ensure_ascii=False)
        cache_found = messages_jsonl in thinker_cache

        if use_cache and cache_found:
            print(f"{YELLOW}<<< cached content >>>{RESET}")
            content = thinker_cache[messages_jsonl]

        elif testing:
            print(f"{RED}<<< testing content >>>{RESET}")
            content = "testing testing"

        else:
            print(f"{GREEN}<<< fresh content >>>{RESET}")
            with open(log_filename, "at") as f:
                f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            try:
                response = Together(api_key=os.environ.get('TOGETHER_API_KEY')).chat.completions.create(
                    model=model,
                    messages=messages,
                    max_tokens=MAX_TOKENS,
                    temperature=TEMPERATURE,
                    top_p=0.7, top_k=50,
                    repetition_penalty=1.2, stop=stops,
                    logprobs=1, stream=False,
                )
            except Exception as e:
                with open(log_filename, "at") as f:
                    f.write(f"\nEXCEPTION:\n{e}\n")
                print(f"\nEXCEPTION:\n{e}\n")
                raise  # bare raise keeps the original traceback

            content = response.choices[0].message.content
            with open(log_filename, "at") as f:
                f.write(f"\nRESPONSE:\n{response}\n")
                f.write(f"\nCONTENT:\n{content}\n")

            # NOTE(review): only fresh replies are persisted here; the source
            # indentation was lost, so confirm this placement.
            _store_in_thinker_cache(messages_jsonl, content)

        messages.append({"role": "assistant", "content": content})
        return messages

elif thinker in "gemma2:27b|commandr:35b|llama3.1:70b":
    import subprocess
    import ollama

    # If the local Ollama endpoint does not answer, start the first tunnel
    # (local port 11434); the second tunnel (local port 9999) is always started.
    try:
        ollama.list()
    except Exception:
        subprocess.run('nohup ssh -N -L 11434:localhost:11434 -p 22021 dungnt@118.70.171.68 &', shell=True)
    subprocess.run('nohup ssh -N -L 9999:localhost:11434 -p 17340 symato@1.tcp.ap.ngrok.io &', shell=True)

    # First choice: the endpoint on port 11434.
    OLLAMA_CLIENT = ollama.Client(host='http://localhost:11434')
    machine = "RTX-4090-24G"

    if thinker in "gemma2:27b":
        OLLAMA_MODEL = "gemma2:27b-instruct-q5_K_M"
        CTXLEN = 512 * 14
    elif thinker in "commandr:35b":
        OLLAMA_MODEL = "command-r:35b-08-2024-q4_K_M"
        CTXLEN = 512 * 18
    else:
        OLLAMA_MODEL = "not found"

    # The first endpoint is used only if it lists the wanted model.
    try:
        connect_to_4090 = OLLAMA_MODEL in str(ollama.list())
    except Exception:
        connect_to_4090 = False

    if not connect_to_4090:
        # Fall back to the endpoint on port 9999 and its own model variants.
        OLLAMA_CLIENT = ollama.Client(host='http://localhost:9999')
        machine = "A100-PCIE-40GB"

        if thinker in "gemma2:27b":
            OLLAMA_MODEL = "gemma2:27b-instruct-q8_0"
            CTXLEN = 1024 * 24
        elif thinker in "commandr:35b":
            OLLAMA_MODEL = "command-r:35b-08-2024-q8_0"
            CTXLEN = 1024 * 32
        elif thinker in "llama3.1:70b":
            OLLAMA_MODEL = "llama3.1:70b-instruct-q3_K_M"
            CTXLEN = 1024 * 12

    LLM_HOST = f"{machine}__{OLLAMA_MODEL}"

    def thinker_chat(prompt, history=None, stream=False, use_cache=False):
        """Chat with the selected Ollama model.

        Args:
            prompt: User text for this turn.
            history: Optional list of earlier message dicts. It is sent along
                with the prompt in the non-streaming path only.
            stream: When true, log the prompt and return the streaming
                response for the prompt alone.
            use_cache: Accepted but not used by this function.

        Returns:
            With ``stream=False``: the message list with an assistant entry
            appended. With ``stream=True``: the streaming response object.
        """
        if stream:
            with open(llm_log_filename, "at") as f:
                f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
            return OLLAMA_CLIENT.chat(
                model=OLLAMA_MODEL,
                messages=[{"role": "user", "content": prompt}],
                stream=True,
                options={'num_ctx': CTXLEN, 'temperature': TEMPERATURE},
            )

        # Build a fresh list so the caller's history is never mutated.
        messages = list(history or []) + [{"role": "user", "content": prompt}]
        with open(llm_log_filename, "at") as f:
            f.write(f"\n- - - [ {LLM_HOST} ] - - -\n\nPROMPT:\n{prompt}\n")
        # NOTE(review): unlike the streaming call, no 'num_ctx' is passed
        # here — confirm whether that difference is intended.
        res = OLLAMA_CLIENT.chat(model=OLLAMA_MODEL, messages=messages, options={'temperature': TEMPERATURE})
        content = res["message"]["content"]
        with open(llm_log_filename, "at") as f:
            f.write(f"\nCONTENT:\n{content}\n")
        messages.append({"role": "assistant", "content": content})
        return messages
|
|
|
|
|
|
|
|
|
# Append the context length (in units of 1024, rounded) to the backend label,
# then print the label.
LLM_HOST = f"{LLM_HOST}__{round(CTXLEN/1024)}k_ctxlen"
who_are_you()
|
|
|
|
|
|
|
from prompts import summary_template |
|
from prompts import contextual_template, clean_view_template |
|
|
|
# True unless the `cache` environment variable is set to something other than "1".
USE_CACHE = os.environ.get("cache", "1") == "1"
|
|
|
|
|
def extract_keyphrases_figures_summary(text):
    """Fill `summary_template` with `text`, send it through `chat`, return the raw reply.

    Texts shorter than 80 characters are skipped and give "". Otherwise the
    input and the reply are echoed to stdout and the call is bracketed by
    the `utils` timer helpers.
    """
    timer_name = "extract_keyphrases_figures_summary"
    if len(text) < 80:
        return ""

    prompt = summary_template.format(text=text)
    print(f"{GREEN}{text}{RESET}")

    utils.reset_timer(timer=timer_name)
    conversation = chat(prompt, use_cache=USE_CACHE)
    utils.measure_time("", timer=timer_name)

    # The reply is the content of the last message in the conversation.
    raw = conversation[-1]["content"]
    print(f"{MAGENTA}{raw}{RESET}")
    return raw
|
|
|
|
|
def gen_contextual(document, chunk):
    """Return thinker_chat's stripped reply to `contextual_template` filled with `document` and `chunk`."""
    filled = contextual_template.format(document=document, chunk=chunk)
    last_message = thinker_chat(filled, use_cache=USE_CACHE)[-1]
    return last_message["content"].strip()
|
|
|
|
|
def gen_clean_view(document):
    """Return chat's stripped reply to `clean_view_template` filled with `document`."""
    filled = clean_view_template.format(document=document)
    last_message = chat(filled, use_cache=USE_CACHE)[-1]
    return last_message["content"].strip()
|
|
|
|
|
if __name__ == "__main__":
    # Optional first command-line argument: path of a file holding the question.
    filename = sys.argv[1] if len(sys.argv) > 1 else None

    if filename:
        # Read the question and close the file right away.
        with open(filename, "rt") as f:
            q = f.read()
    else:
        q = "What's your name? Who created you?"

    # Time a single uncached round trip to the selected thinker backend.
    utils.reset_timer()
    res = thinker_chat(q, use_cache=False)
    utils.measure_time(LLM_HOST + " ")

    print(f"{CYAN}{q}{RESET}", end="\n\n")
    print(res[-1]["content"])
|
|