File size: 2,914 Bytes
b585c7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def noop_load(*args, **kwargs):
    """Accept any positional/keyword arguments, do nothing, and return ``None``."""
    # Placeholder loader: every argument is deliberately ignored.
    return


def go_prepare_offline(*args, **kwargs):
    """
    Run the document-ingestion callback and tokenizer loads once so that
    whatever they fetch is already available before going offline.

    Positional ``args`` are accepted but unused.  Required ``kwargs`` keys:

    * ``kwargs``: nested dict providing ``langchain_mode``, ``chunk`` and ``chunk_size``
    * ``my_db_state0``, ``selection_docs_state0``, ``requests_state0``
    * ``image_audio_loaders_options``: sequence of loader names
    * ``pdf_loaders_options``, ``url_loaders_options``, ``jq_schema0``, ``extract_frames``
    * ``update_db_func``: callable invoked as ``fn(file, *inputs)`` for each test file

    For every test file ``update_db_func`` is called with the configured
    image/audio loaders.  If those include 'CaptionBlip2' and/or 'ASRLarge', it is
    called a second time with those entries removed, so that the normal
    (non-large) variants are exercised as well.

    Raises ``KeyError`` if a required key is missing.  Returns ``None``.
    """
    kwargs0 = kwargs['kwargs']
    # gen.py steps should have already obtained:
    #   model+tokenizers from base_model or model_lock if required
    #   tokenizers, including tokenizers for model_lock if using inference servers even if no LLM locally
    #   score_model or reward model
    #
    # Additional steps are related to document Q/A:
    # For simplicity use gradio functions,
    #  but not API calls that would require actual gradio app up and API usage that might have issues

    # NOTE(review): 'max_quality' is not read again in this function and ``kwargs`` is
    # this call's own dict, so this assignment is not visible to the caller -- confirm intent.
    kwargs['max_quality'] = True
    embed = True
    h2ogpt_key = ''
    file_list = ['tests/driverslicense.jpeg', 'tests/CityofTshwaneWater.pdf', 'tests/example.xlsx']

    update_db_func = kwargs['update_db_func']

    # Arguments that precede / follow the image-audio loader slot in the callback signature.
    inputs_before_loaders = [kwargs['my_db_state0'],
                             kwargs['selection_docs_state0'],
                             kwargs['requests_state0'],
                             kwargs0['langchain_mode'],
                             kwargs0['chunk'],
                             kwargs0['chunk_size'],
                             embed,
                             ]
    configured_loaders = kwargs['image_audio_loaders_options']
    inputs_after_loaders = [kwargs['pdf_loaders_options'],
                            kwargs['url_loaders_options'],
                            kwargs['jq_schema0'],
                            kwargs['extract_frames'],
                            h2ogpt_key,
                            ]

    # ensure normal blip (not 2) and normal asr (not asrlarge) obtained:
    # build a reduced loader list without mutating the caller's list
    large_variants = ('CaptionBlip2', 'ASRLarge')
    normal_loaders = [loader for loader in configured_loaders if loader not in large_variants]

    loader_passes = [configured_loaders]
    if len(normal_loaders) != len(configured_loaders):
        # Only worth a second pass when something was actually removed;
        # otherwise it would repeat the first call exactly.
        loader_passes.append(normal_loaders)

    for fileup_output in file_list:
        for loaders in loader_passes:
            # Fresh argument list per call so one pass never leaks into the next
            # and the loaders land in their own slot (not the pdf loaders slot).
            inputs = [fileup_output] + inputs_before_loaders + [loaders] + inputs_after_loaders
            update_db_func(*inputs)

    # FakeTokenizer etc. needs tiktoken for general tasks
    import tiktoken
    encoding = tiktoken.get_encoding("cl100k_base")
    assert encoding
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    assert encoding

    # sometimes summarization needs gpt2 still
    from transformers import AutoTokenizer
    model_name = 'gpt2'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    assert tokenizer

    # then run h2ogpt as:
    # HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False ...