def noop_load(*args, **kwargs):
    return None
def go_prepare_offline(*args, **kwargs):
    kwargs0 = kwargs['kwargs']
    # The gen.py steps should already have obtained:
    #  * the model and tokenizer from base_model, or from model_lock if required
    #  * tokenizers, including those for model_lock entries, even when using
    #    inference servers with no local LLM
    #  * the score_model or reward model
    #
    # The additional steps below relate to document Q/A.
    # For simplicity, use the gradio functions directly, not API calls, which
    # would require an actual gradio app to be up and could hit API issues.
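    # A minimal sketch of the kind of prefetch gen.py performs (hypothetical,
    # for illustration only; base_model stands for the configured model name):
    #   from transformers import AutoModelForCausalLM, AutoTokenizer
    #   AutoTokenizer.from_pretrained(base_model)         # caches tokenizer files
    #   AutoModelForCausalLM.from_pretrained(base_model)  # caches model weights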
    kwargs['max_quality'] = True
    embed = True
    h2ogpt_key = ''
    file_list = ['tests/driverslicense.jpeg', 'tests/CityofTshwaneWater.pdf', 'tests/example.xlsx']
    inputs2 = [kwargs['my_db_state0'],
               kwargs['selection_docs_state0'],
               kwargs['requests_state0'],
               kwargs0['langchain_mode'],
               kwargs0['chunk'],
               kwargs0['chunk_size'],
               embed,
               kwargs['image_loaders_options'],
               kwargs['pdf_loaders_options'],
               kwargs['url_loaders_options'],
               kwargs['jq_schema0'],
               h2ogpt_key,
               ]
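    # NOTE: inputs2 supplies the positional arguments of update_db_func that
    # follow the uploaded file; in particular, index 7 is image_loaders_options
    # (adjusted below when dropping BLIP2).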
    for fileup_output in file_list:
        inputs1 = [fileup_output]
        add_file_kwargs = dict(fn=kwargs['update_db_func'],
                               inputs=inputs1 + inputs2)
        add_file_kwargs['fn'](*tuple(add_file_kwargs['inputs']))
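    # Calling the callback as fn(*inputs) exercises the same ingestion path as
    # a UI upload, without requiring a running gradio server or API client.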
    # ensure normal BLIP (not BLIP2) is obtained
    blip2 = 'CaptionBlip2'
    if blip2 in kwargs['image_loaders_options']:
        image_loaders_options = kwargs['image_loaders_options'].copy()
        image_loaders_options.remove(blip2)
        # use the copy with BLIP2 removed; index 7 of inputs2 is image_loaders_options
        inputs2[7] = image_loaders_options
        add_file_kwargs = dict(fn=kwargs['update_db_func'],
                               inputs=inputs1 + inputs2)
        add_file_kwargs['fn'](*tuple(add_file_kwargs['inputs']))
    # FakeTokenizer etc. need tiktoken for general tasks; instantiating the
    # encodings here downloads and caches their BPE files for offline use
    import tiktoken
    encoding = tiktoken.get_encoding("cl100k_base")
    assert encoding
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    assert encoding
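    # quick sanity check that the cached encoding tokenizes (sample text is arbitrary)
    assert encoding.encode("hello world")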
    # summarization sometimes still needs the gpt2 tokenizer
    from transformers import AutoTokenizer
    model_name = 'gpt2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    assert tokenizer
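    # sanity check that the cached tokenizer works (sample text is arbitrary)
    assert tokenizer("hello world")['input_ids']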
    # then run h2ogpt as:
    # HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False ...
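    # or, equivalently, set offline mode from Python before any
    # transformers/datasets imports (standard HuggingFace environment variables):
    #   import os
    #   os.environ['HF_DATASETS_OFFLINE'] = '1'
    #   os.environ['TRANSFORMERS_OFFLINE'] = '1'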