from h2o_wave import main, app, Q, ui, data
from gradio_client import Client
import ast
async def init_ui(q: Q) -> None:
    q.page['meta'] = ui.meta_card(
        box='',
        layouts=[
            ui.layout(breakpoint='xs', min_height='100vh', zones=[
                ui.zone('main', size='1', direction=ui.ZoneDirection.ROW, zones=[
                    ui.zone('sidebar', size='250px'),
                    ui.zone('body', direction=ui.ZoneDirection.COLUMN, zones=[
                        ui.zone('title', size='55px'),
                        ui.zone('content', size='1'),
                        ui.zone('footer'),
                    ]),
                ])
            ])
        ],
        title='AchyuthGPT',
    )
    q.page['sidebar'] = ui.nav_card(
        box='sidebar', color='primary', title='AchyuthGPT', subtitle='Programmed by Achyuth',
        value=f'#{q.args["#"]}' if q.args['#'] else '#page1',
        image='https://huggingface.co/spaces/AchyuthGamer/AchyuthGPT-v1/resolve/main/achyuthailogo.png', items=[
            ui.nav_group('', items=[
                ui.nav_item(name='dwave-docs', label='Wave docs', path='https://AchyuthGPT.blogspot.com/'),
                ui.nav_item(name='Achyuth-GPT', label='Achyuth GPT', path='https://github.com/achyuth4/AchyuthGPT-llmstudio'),
                ui.nav_item(name='fine-tune', label='LLM Studio', path='https://github.com/achyuth4/AchyuthGPT-llmstudio'),
                ui.nav_item(name='more-models', label='More models', path='https://huggingface.co/achyuthgamer'),
            ]),
        ],
        secondary_items=[
            ui.toggle(name='dark_mode', label='Dark mode', trigger=True),
            ui.text('<center>Developer - N.Achyuth Reddy.</center>')
        ]
    )
    q.page['chatbot'] = ui.chatbot_card(
        box=ui.box('content'),
        data=data('content from_user', t='list'),
        name='chatbot'
    )
    q.page['title'] = ui.section_card(
        box='title',
        title='',
        subtitle='',
        items=[
            ui.dropdown(name='model', trigger=True, label='', value='gpt', choices=[
                ui.choice(name='gpt', label='GPT Model-1'),
                ui.choice(name='falcon', label='GPT Model-2'),
                ui.choice(name='mpt', label='GPT Model-3'),
            ]),
            ui.button(name='clear', label='Clear', icon='Delete'),
        ],
    )
""" | |
:param load_8bit: load model in 8-bit using bitsandbytes | |
:param load_4bit: load model in 4-bit using bitsandbytes | |
:param load_half: load model in float16 | |
:param infer_devices: whether to control devices with gpu_id. If False, then spread across GPUs | |
:param base_model: model HF-type name. If use --base_model to preload model, cannot unload in gradio in models tab | |
:param tokenizer_base_model: tokenizer HF-type name. Usually not required, inferred from base_model. | |
:param lora_weights: LORA weights path/HF link | |
:param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1 | |
:param compile_model Whether to compile the model | |
:param use_cache: Whether to use caching in model (some models fail when multiple threads use) | |
:param inference_server: Consume base_model as type of model at this address | |
Address can be text-generation-server hosting that base_model | |
e.g. python generate.py --inference_server="http://192.168.1.46:6112" --base_model=h2oai/h2ogpt-oasst1-512-12b | |
Or Address can be "openai_chat" or "openai" for OpenAI API | |
e.g. python generate.py --inference_server="openai_chat" --base_model=gpt-3.5-turbo | |
e.g. python generate.py --inference_server="openai" --base_model=text-davinci-003 | |
:param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
:param prompt_dict: If prompt_type=custom, then expects (some) items returned by get_prompt(..., return_dict=True)
:param model_lock: Lock models to specific combinations, for ease of use and extending to many models
    Only used if gradio = True
    List of dicts; each dict has base_model, tokenizer_base_model, lora_weights, inference_server, prompt_type, and prompt_dict (see the illustrative sketch just after this docstring)
    If all models have the same prompt_type and prompt_dict, can still specify that once in CLI outside model_lock as default for dict
    Can specify model_lock instead of those items on CLI
    As with CLI itself, base_model can infer prompt_type and prompt_dict if in prompter.py.
    Also, tokenizer_base_model and lora_weights are optional.
    Also, inference_server is optional if loading model from local system.
    All models provided will automatically appear in compare model mode
    Model loading-unloading and related choices will be disabled. Model/lora/server adding will be disabled
:param model_lock_columns: How many columns to show if locking models (and so showing all at once)
    If None, then defaults to up to 3
    If -1, then all goes into 1 row
    Maximum value is 4 due to non-dynamic gradio rendering elements
:param fail_if_cannot_connect: if doing model locking (e.g. with many models), fail if True. Otherwise ignore.
    Useful when many endpoints and want to just see what works, but still have to wait for timeout.
:param temperature: generation temperature
:param top_p: generation top_p
:param top_k: generation top_k
:param num_beams: generation number of beams
:param repetition_penalty: generation repetition penalty
:param num_return_sequences: generation number of sequences (1 forced for chat)
:param do_sample: generation sample
:param max_new_tokens: generation max new tokens
:param min_new_tokens: generation min tokens
:param early_stopping: generation early stopping
:param max_time: maximum time to allow for generation
:param memory_restriction_level: 0 = no restriction on tokens or model, 1 = some restrictions on tokens, 2 = HF-like restriction, 3 = very low memory case
:param debug: enable debug mode
:param save_dir: directory chat data is saved to
:param share: whether to share the gradio app with a sharable URL
:param local_files_only: whether to only use local files instead of going to HF for models
:param resume_download: whether to resume downloads from HF for models
:param use_auth_token: whether to use HF auth token (requires huggingface-cli login beforehand)
:param trust_remote_code: whether to trust any remote code needed for the HF model
:param offload_folder: path for spilling model onto disk
:param src_lang: source languages to include if doing translation (None = all)
:param tgt_lang: target languages to include if doing translation (None = all)
:param cli: whether to use CLI (non-gradio) interface.
:param cli_loop: whether to loop for CLI (False usually only for testing)
:param gradio: whether to enable gradio, or to enable benchmark mode
:param gradio_offline_level: > 0, then change fonts so full offline
    == 1 means backend won't need internet for fonts, but front-end UI might if font not cached
    == 2 means backend and frontend don't need internet to download any fonts.
    Note: Some things always disabled include HF telemetry, gradio telemetry, chromadb posthog that involve uploading.
    This option further disables google fonts for downloading, which is less intrusive than uploading,
    but still required in air-gapped case. The fonts don't look as nice as google fonts, but ensure full offline behavior.
    Also set --share=False to avoid sharing a gradio live link.
:param chat: whether to enable chat mode with chat history
:param chat_context: whether to use extra helpful context if human_bot
:param stream_output: whether to stream output
:param show_examples: whether to show clickable examples in gradio
:param verbose: whether to show verbose prints
:param h2ocolors: whether to use H2O.ai theme
:param height: height of chat window
:param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
:param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
:param block_gradio_exit: whether to block gradio exit (used for testing)
:param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
:param api_open: If False, don't let API calls skip gradio queue
:param allow_api: whether to allow API calls at all to gradio server
:param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
:param gradio_size: Overall size of text and spaces: "xsmall", "small", "medium", "large".
    Small useful for many chatbots in model_lock mode
:param auth: gradio auth for launcher in form [(user1, pass1), (user2, pass2), ...]
    e.g. --auth=[('jon','password')] with no spaces
:param max_max_time: Maximum max_time for gradio slider
:param max_max_new_tokens: Maximum max_new_tokens for gradio slider
:param sanitize_user_prompt: whether to remove profanity from user input (slows down input processing)
:param sanitize_bot_response: whether to remove profanity and repeat lines from bot output (about 2x slower generation for long streaming cases due to better_profanity being slow)
:param extra_model_options: extra models to show in list in gradio
:param extra_lora_options: extra LORA to show in list in gradio
:param extra_server_options: extra servers to show in list in gradio
:param score_model: which model to score responses (None means no scoring)
:param eval_filename: json file to use for evaluation; if None, uses ShareGPT
:param eval_prompts_only_num: for the no-gradio benchmark, number of eval_filename prompts to use for eval instead of examples
:param eval_prompts_only_seed: for the no-gradio benchmark, seed for eval_filename sampling
:param eval_as_output: for the no-gradio benchmark, whether to test eval_filename output itself
:param langchain_mode: Data source to include. Choose "UserData" to only consume files from make_db.py.
    WARNING: wiki_full requires extra data processing via read_wiki_full.py and requires a really good workstation to generate the db, unless already present.
:param langchain_action: Mode of langchain operations on documents.
    Query: Make query of document(s)
    Summarize or Summarize_map_reduce: Summarize document(s) via map_reduce
    Summarize_all: Summarize document(s) using entire document at once
    Summarize_refine: Summarize document(s) using entire document, and try to refine before returning summary
:param force_langchain_evaluate: Whether to force langchain LLM use even if not doing langchain, mostly for testing.
:param user_path: user path to glob from to generate db for vector search, for 'UserData' langchain mode.
    If already have db, any new/changed files are added automatically if path set; does not have to be same path used for prior db sources
:param detect_user_path_changes_every_query: whether to detect if any files changed or added every similarity search (by file hashes).
    Expensive for large number of files, so not done by default. By default only detect changes during db loading.
:param visible_langchain_modes: dbs to generate at launch to be ready for LLM
    Can be up to ['wiki', 'wiki_full', 'UserData', 'MyData', 'github h2oGPT', 'DriverlessAI docs']
    But wiki_full is expensive and requires preparation
    To allow scratch space only live in session, add 'MyData' to list
    Default: If only want to consume local files, e.g. prepared by make_db.py, only include ['UserData']
    FIXME: Avoid 'All' for now, not implemented
:param visible_langchain_actions: Which actions to allow
:param document_choice: Default document choice when taking subset of collection
:param load_db_if_exists: Whether to load chroma db if exists or re-generate db
:param keep_sources_in_context: Whether to keep url sources in context, not helpful usually
:param db_type: 'faiss' for in-memory or 'chroma' or 'weaviate' for persisted on disk
:param use_openai_embedding: Whether to use OpenAI embeddings for vector db
:param use_openai_model: Whether to use OpenAI model for use with vector db
:param hf_embedding_model: Which HF embedding model to use for vector db
    Default is instructor-large with 768 parameters per embedding if have GPUs, else all-MiniLM-L6-v1 if no GPUs
    Can also choose simpler model with 384 parameters per embedding: "sentence-transformers/all-MiniLM-L6-v2"
    Can also choose even better embedding with 1024 parameters: 'hkunlp/instructor-xl'
    We support automatically changing of embeddings for chroma, with a backup of db made if this is done
:param allow_upload_to_user_data: Whether to allow file uploads to update shared vector db
:param allow_upload_to_my_data: Whether to allow file uploads to update scratch vector db
:param enable_url_upload: Whether to allow upload from URL
:param enable_text_upload: Whether to allow upload of text
:param enable_sources_list: Whether to allow list (or download for non-shared db) of list of sources for chosen db
:param chunk: Whether to chunk data (True unless know data is already optimally chunked)
:param chunk_size: Size of chunks, with typically top-4 passed to LLM, so needs to fit in the context length
:param top_k_docs: number of chunks to give LLM
:param reverse_docs: whether to reverse docs order so most relevant is closest to question.
    Best choice for sufficiently smart model, and truncation occurs for oldest context, so best then too.
    But smaller 6_9 models fail to use newest context and can get stuck on old information.
:param auto_reduce_chunks: Whether to automatically reduce top_k_docs to fit context given prompt
:param max_chunks: If top_k_docs=-1, maximum number of chunks to allow
:param n_jobs: Number of processors to use when consuming documents (-1 = all, is default)
:param enable_captions: Whether to support captions using BLIP for image files as documents, then preloads that model
:param captions_model: Which model to use for captions.
    captions_model: str = "Salesforce/blip-image-captioning-base",  # continue capable
    captions_model: str = "Salesforce/blip2-flan-t5-xl",   # question/answer capable, 16GB state
    captions_model: str = "Salesforce/blip2-flan-t5-xxl",  # question/answer capable, 60GB state
    Note: opt-based blip2 are not permissive license due to opt and Meta license restrictions
:param pre_load_caption_model: Whether to preload caption model, or load after forking parallel doc loader
    parallel loading disabled if preload and have images, to prevent deadlocking on cuda context
    Recommended if using larger caption model
:param caption_gpu: If support caption, then use GPU if exists
:param enable_ocr: Whether to support OCR on images
:return:
"""
# Register the handler with Wave; the '/' route path is assumed here, since the decorator
# is required for `app` (imported above) to serve this handler.
@app('/')
async def serve(q: Q):
    if not q.client.initialized:
        await init_ui(q)
        q.client.model_client = Client('https://gpt.h2o.ai/')
        q.client.initialized = True
    # A new message arrived.
    if q.args.chatbot:
        # Append user message.
        q.page['chatbot'].data += [q.args.chatbot, True]
        # Append bot response.
        kwargs = dict(instruction_nochat=q.args.chatbot)
        try:
            res = q.client.model_client.predict(str(dict(kwargs)), api_name='/submit_nochat_api')
            bot_res = ast.literal_eval(res)['response']
            q.page['chatbot'].data += [bot_res, False]
        except Exception:
            q.page['meta'] = ui.meta_card(box='', notification_bar=ui.notification_bar(
                text='An error occurred during prediction. Please try later or a different model.',
                type='error',
            ))
    elif q.args.clear:
        # Recreate the card.
        q.page['chatbot'] = ui.chatbot_card(
            box=ui.box('content'),
            data=data('content from_user', t='list'),
            name='chatbot'
        )
    elif q.args.dark_mode is not None:
        q.page['meta'].theme = 'achyuthgpt-dark' if q.args.dark_mode else 'light'
        q.page['sidebar'].color = 'card' if q.args.dark_mode else 'primary'
    elif q.args.model:
        try:
            q.client.model_client = Client(f'https://{q.args.model}.h2o.ai/')
            q.page['meta'] = ui.meta_card(box='', notification_bar=ui.notification_bar(
                text='Model changed successfully.',
                type='success',
            ))
        except Exception:
            q.page['meta'] = ui.meta_card(box='', notification_bar=ui.notification_bar(
                text='An error occurred while changing the model. Please try a different one.',
                type='error',
            ))
    await q.page.save()
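# A minimal sketch (not called by the app) of how the generation parameters documented in the
# docstring above could be passed to the same nochat endpoint.  Whether '/submit_nochat_api'
# honors each of these keys is an assumption based on that parameter list, and the values are
# illustrative only.
def ask_with_params(client: Client, prompt: str) -> str:
    kwargs = dict(
        instruction_nochat=prompt,  # the user prompt, as in serve() above
        temperature=0.1,            # generation temperature
        top_p=0.75,                 # generation top_p
        max_new_tokens=256,         # generation max new tokens
    )
    # The endpoint expects the kwargs dict serialized as a string, mirroring serve() above.
    res = client.predict(str(kwargs), api_name='/submit_nochat_api')
    return ast.literal_eval(res)['response']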