Adityadn committed on
Commit: 617d388
Parent: 6f14d9a

Upload 524 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitignore +54 -0
  2. aiDescTerminal.py +90 -0
  3. aiDescUI.py +688 -0
  4. app.html +6 -0
  5. app.py +67 -0
  6. args_manager.py +55 -0
  7. auth-example.json +6 -0
  8. auth.json +6 -0
  9. build_launcher.py +26 -0
  10. config.txt +16 -0
  11. config_modification_tutorial.txt +123 -0
  12. css/style.css +220 -0
  13. entry_with_update.py +46 -0
  14. entrypoint.sh +33 -0
  15. environment.yaml +7 -0
  16. experiments_expansion.py +8 -0
  17. experiments_face.py +7 -0
  18. experiments_interrogate.py +205 -0
  19. extras/BLIP/configs/bert_config.json +21 -0
  20. extras/BLIP/configs/caption_coco.yaml +33 -0
  21. extras/BLIP/configs/med_config.json +21 -0
  22. extras/BLIP/configs/nlvr.yaml +21 -0
  23. extras/BLIP/configs/nocaps.yaml +15 -0
  24. extras/BLIP/configs/pretrain.yaml +27 -0
  25. extras/BLIP/configs/retrieval_coco.yaml +34 -0
  26. extras/BLIP/configs/retrieval_flickr.yaml +34 -0
  27. extras/BLIP/configs/retrieval_msrvtt.yaml +12 -0
  28. extras/BLIP/configs/vqa.yaml +25 -0
  29. extras/BLIP/models/__pycache__/blip.cpython-310.pyc +0 -0
  30. extras/BLIP/models/__pycache__/med.cpython-310.pyc +0 -0
  31. extras/BLIP/models/__pycache__/vit.cpython-310.pyc +0 -0
  32. extras/BLIP/models/bert_tokenizer/config.json +23 -0
  33. extras/BLIP/models/bert_tokenizer/tokenizer.json +0 -0
  34. extras/BLIP/models/bert_tokenizer/tokenizer_config.json +3 -0
  35. extras/BLIP/models/bert_tokenizer/vocab.txt +0 -0
  36. extras/BLIP/models/blip.py +239 -0
  37. extras/BLIP/models/blip_itm.py +76 -0
  38. extras/BLIP/models/blip_nlvr.py +105 -0
  39. extras/BLIP/models/blip_pretrain.py +339 -0
  40. extras/BLIP/models/blip_retrieval.py +319 -0
  41. extras/BLIP/models/blip_vqa.py +186 -0
  42. extras/BLIP/models/med.py +955 -0
  43. extras/BLIP/models/nlvr_encoder.py +843 -0
  44. extras/BLIP/models/vit.py +308 -0
  45. extras/__pycache__/expansion.cpython-310.pyc +0 -0
  46. extras/__pycache__/face_crop.cpython-310.pyc +0 -0
  47. extras/__pycache__/interrogate.cpython-310.pyc +0 -0
  48. extras/__pycache__/ip_adapter.cpython-310.pyc +0 -0
  49. extras/__pycache__/preprocessors.cpython-310.pyc +0 -0
  50. extras/__pycache__/resampler.cpython-310.pyc +0 -0
.gitignore ADDED
@@ -0,0 +1,54 @@
+ __pycache__
+ *.ckpt
+ *.safetensors
+ *.pth
+ *.pt
+ *.bin
+ *.patch
+ *.backup
+ *.corrupted
+ *.partial
+ *.onnx
+ sorted_styles.json
+ /input
+ /cache
+ /language/default.json
+ /test_imgs
+ config.txt
+ config_modification_tutorial.txt
+ user_path_config.txt
+ user_path_config-deprecated.txt
+ /modules/*.png
+ /repositories
+ /fooocus_env
+ /venv
+ /tmp
+ /ui-config.json
+ /outputs
+ /config.json
+ /log
+ /webui.settings.bat
+ /embeddings
+ /styles.csv
+ /params.txt
+ /styles.csv.bak
+ /webui-user.bat
+ /webui-user.sh
+ /interrogate
+ /user.css
+ /.idea
+ /notification.ogg
+ /notification.mp3
+ /SwinIR
+ /textual_inversion
+ .vscode
+ /extensions
+ /test/stdout.txt
+ /test/stderr.txt
+ /cache.json*
+ /config_states/
+ /node_modules
+ /package-lock.json
+ /.coverage*
+ /auth.json
+ .DS_Store
aiDescTerminal.py ADDED
@@ -0,0 +1,90 @@
+ import os
+ import sys
+ import numpy as np
+ from PIL import Image
+ import requests
+ from io import BytesIO
+
+ root = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(root)
+ os.chdir(root)
+
+ import modules.config
+ import modules.html
+ import modules.flags as flags
+ import modules.meta_parser
+
+ def download_image(url):
+     response = requests.get(url)
+     img = Image.open(BytesIO(response.content)).convert("RGB")
+     return img
+
+ def trigger_describe(mode, img_path):
+     print("Running")
+     print("Press Ctrl+C for Stop ")
+     if mode == flags.desc_type_photo:
+         from extras.interrogate import default_interrogator as default_interrogator_photo
+         if img_path.startswith('http'):
+             img = download_image(img_path)
+         else:
+             img = Image.open(img_path).convert("RGB")
+         return default_interrogator_photo(img), ["Fooocus V2", "Fooocus Enhance", "Fooocus Sharp"]
+     elif mode == flags.desc_type_anime:
+         from extras.wd14tagger import default_interrogator as default_interrogator_anime
+         if img_path.startswith('http'):
+             img = download_image(img_path)
+         elif isinstance(img_path, str):
+             # Load the image if the input is a path
+             img = Image.open(img_path).convert("RGB")
+         elif isinstance(img_path, np.ndarray):
+             # Use the provided NumPy array directly
+             img = Image.fromarray(img_path).convert("RGB")
+         else:
+             raise ValueError("Invalid image format. Please provide a valid path or NumPy array.")
+
+         # Convert the image to a NumPy array
+         img_array = np.array(img)
+
+         return default_interrogator_anime(img_array), ["Fooocus V2", "Fooocus Masterpiece"]
+     return mode, ["Fooocus V2"]
+
+ style_selections = modules.config.default_styles
+
+ def run_describe(image_path, content_type):
+     desc_input_image = image_path
+     desc_method = content_type
+
+     result, style_selections = None, None
+
+     if desc_method in ["Photograph", "1", ""]:
+         desc_method = "Photograph (1)"
+         result, style_selections = trigger_describe(flags.desc_type_photo, desc_input_image)
+     elif desc_method in ["Art/Anime", "2"]:
+         desc_method = "Art/Anime (2)"
+         result, style_selections = trigger_describe(flags.desc_type_anime, desc_input_image)
+     else:
+         print("ERROR!")
+
+     if result or style_selections != "":
+         style_selections = ""
+         print("Result:", result)
+         # print("Style Selections:", style_selections)
+         quit()
+
+ if __name__ == "__main__":
+     desc_input_image = input("Path to Image (local path or URL): ")
+
+     if desc_input_image == "":
+         desc_input_image = "./imgs/Gambar1.jpg"
+
+     print(f"You use: {desc_input_image}")
+
+     desc_method = input(
+         """
+         Select Content Type:
+         Photograph (1)
+         Art/Anime (2)
+         """
+     )
+
+     run_describe(desc_input_image, desc_method)
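
A minimal, illustrative usage sketch (not part of the committed files): because the interactive prompts sit under the __main__ guard, run_describe can also be called directly, assuming the Fooocus modules and model files referenced above are available from the repository root.

    # Hypothetical example; the image path is illustrative.
    from aiDescTerminal import run_describe

    # "1" selects the Photograph interrogator, "2" the Art/Anime (wd14) tagger.
    run_describe("./imgs/Gambar1.jpg", "1")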
aiDescUI.py ADDED
@@ -0,0 +1,688 @@
1
+ import gradio as gr
2
+ import random
3
+ import os
4
+ # import json
5
+ import time
6
+ import shared
7
+ import modules.config
8
+ # import fooocus_version
9
+ import modules.html
10
+ import modules.async_worker as worker
11
+ import modules.constants as constants
12
+ import modules.flags as flags
13
+ import modules.gradio_hijack as grh
14
+ import modules.style_sorter as style_sorter
15
+ import modules.meta_parser
16
+ import args_manager
17
+ import copy
18
+
19
+ from modules.sdxl_styles import legal_style_names
20
+ from modules.private_logger import get_current_html_path
21
+ from modules.ui_gradio_extensions import reload_javascript
22
+ from modules.auth import auth_enabled, check_auth
23
+ # from modules.util import is_json
24
+
25
+ # def get_task(*args):
26
+ # args = list(args)
27
+ # args.pop(0)
28
+
29
+ # return worker.AsyncTask(args=args)
30
+
31
+ # def generate_clicked(task):
32
+ # import ldm_patched.modules.model_management as model_management
33
+
34
+ # with model_management.interrupt_processing_mutex:
35
+ # model_management.interrupt_processing = False
36
+ # # outputs=[progress_html, progress_window, progress_gallery, gallery]
37
+ # execution_start_time = time.perf_counter()
38
+ # finished = False
39
+
40
+ # yield gr.update(visible=True, value=modules.html.make_progress_html(1, 'Waiting for task to start ...')), \
41
+ # gr.update(visible=True, value=None), \
42
+ # gr.update(visible=False, value=None), \
43
+ # gr.update(visible=False)
44
+
45
+ # worker.async_tasks.append(task)
46
+
47
+ # while not finished:
48
+ # time.sleep(0.01)
49
+ # if len(task.yields) > 0:
50
+ # flag, product = task.yields.pop(0)
51
+ # if flag == 'preview':
52
+
53
+ # # help bad internet connection by skipping duplicated preview
54
+ # if len(task.yields) > 0: # if we have the next item
55
+ # if task.yields[0][0] == 'preview': # if the next item is also a preview
56
+ # # print('Skipped one preview for better internet connection.')
57
+ # continue
58
+
59
+ # percentage, title, image = product
60
+ # yield gr.update(visible=True, value=modules.html.make_progress_html(percentage, title)), \
61
+ # gr.update(visible=True, value=image) if image is not None else gr.update(), \
62
+ # gr.update(), \
63
+ # gr.update(visible=False)
64
+ # if flag == 'results':
65
+ # yield gr.update(visible=True), \
66
+ # gr.update(visible=True), \
67
+ # gr.update(visible=True, value=product), \
68
+ # gr.update(visible=False)
69
+ # if flag == 'finish':
70
+ # yield gr.update(visible=False), \
71
+ # gr.update(visible=False), \
72
+ # gr.update(visible=False), \
73
+ # gr.update(visible=True, value=product)
74
+ # finished = True
75
+
76
+ # # delete Fooocus temp images, only keep gradio temp images
77
+ # if args_manager.args.disable_image_log:
78
+ # for filepath in product:
79
+ # if isinstance(filepath, str) and os.path.exists(filepath):
80
+ # os.remove(filepath)
81
+
82
+ # execution_time = time.perf_counter() - execution_start_time
83
+ # print(f'Total time: {execution_time:.2f} seconds')
84
+ # return
85
+
86
+
87
+ reload_javascript()
88
+
89
+ title = 'AI Describe Image'
90
+
91
+ if isinstance(args_manager.args.preset, str):
92
+ title += ' ' + args_manager.args.preset
93
+
94
+ shared.gradio_root = gr.Blocks(
95
+ title=title,
96
+ css=modules.html.css).queue()
97
+
98
+ with shared.gradio_root:
99
+ # currentTask = gr.State(worker.AsyncTask(args=[]))
100
+ with gr.Row():
101
+ with gr.Column(scale=2):
102
+ # with gr.Row():
103
+ # progress_window = grh.Image(label='Preview', show_label=True, visible=False, height=768,
104
+ # elem_classes=['main_view'])
105
+ # progress_gallery = gr.Gallery(label='Finished Images', show_label=True, object_fit='contain',
106
+ # height=768, visible=False, elem_classes=['main_view', 'image_gallery'])
107
+ # progress_html = gr.HTML(value=modules.html.make_progress_html(32, 'Progress 32%'), visible=False,
108
+ # elem_id='progress-bar', elem_classes='progress-bar')
109
+ # gallery = gr.Gallery(label='Gallery', show_label=False, object_fit='contain', visible=True, height=768,
110
+ # elem_classes=['resizable_area', 'main_view', 'final_gallery', 'image_gallery'],
111
+ # elem_id='final_gallery')
112
+ with gr.Row(visible=True) as image_input_panel:
113
+ with gr.Tabs():
114
+ # with gr.TabItem(label='Upscale or Variation') as uov_tab:
115
+ # with gr.Row():
116
+ # with gr.Column():
117
+ # uov_input_image = grh.Image(label='Drag above image to here', source='upload', type='numpy')
118
+ # with gr.Column():
119
+ # uov_method = gr.Radio(label='Upscale or Variation:', choices=flags.uov_list, value=flags.disabled)
120
+ # gr.HTML('<a href="https://github.com/lllyasviel/Fooocus/discussions/390" target="_blank">\U0001F4D4 Document</a>')
121
+ # with gr.TabItem(label='Image Prompt') as ip_tab:
122
+ # with gr.Row():
123
+ # ip_images = []
124
+ # ip_types = []
125
+ # ip_stops = []
126
+ # ip_weights = []
127
+ # ip_ctrls = []
128
+ # ip_ad_cols = []
129
+ # for _ in range(flags.controlnet_image_count):
130
+ # with gr.Column():
131
+ # ip_image = grh.Image(label='Image', source='upload', type='numpy', show_label=False, height=300)
132
+ # ip_images.append(ip_image)
133
+ # ip_ctrls.append(ip_image)
134
+ # with gr.Column(visible=False) as ad_col:
135
+ # with gr.Row():
136
+ # default_end, default_weight = flags.default_parameters[flags.default_ip]
137
+
138
+ # ip_stop = gr.Slider(label='Stop At', minimum=0.0, maximum=1.0, step=0.001, value=default_end)
139
+ # ip_stops.append(ip_stop)
140
+ # ip_ctrls.append(ip_stop)
141
+
142
+ # ip_weight = gr.Slider(label='Weight', minimum=0.0, maximum=2.0, step=0.001, value=default_weight)
143
+ # ip_weights.append(ip_weight)
144
+ # ip_ctrls.append(ip_weight)
145
+
146
+ # ip_type = gr.Radio(label='Type', choices=flags.ip_list, value=flags.default_ip, container=False)
147
+ # ip_types.append(ip_type)
148
+ # ip_ctrls.append(ip_type)
149
+
150
+ # ip_type.change(lambda x: flags.default_parameters[x], inputs=[ip_type], outputs=[ip_stop, ip_weight], queue=False, show_progress=False)
151
+ # ip_ad_cols.append(ad_col)
152
+ # ip_advanced = gr.Checkbox(label='Advanced', value=False, container=False)
153
+ # gr.HTML('* \"Image Prompt\" is powered by Fooocus Image Mixture Engine (v1.0.1). <a href="https://github.com/lllyasviel/Fooocus/discussions/557" target="_blank">\U0001F4D4 Document</a>')
154
+
155
+ # def ip_advance_checked(x):
156
+ # return [gr.update(visible=x)] * len(ip_ad_cols) + \
157
+ # [flags.default_ip] * len(ip_types) + \
158
+ # [flags.default_parameters[flags.default_ip][0]] * len(ip_stops) + \
159
+ # [flags.default_parameters[flags.default_ip][1]] * len(ip_weights)
160
+
161
+ # ip_advanced.change(ip_advance_checked, inputs=ip_advanced,
162
+ # outputs=ip_ad_cols + ip_types + ip_stops + ip_weights,
163
+ # queue=False, show_progress=False)
164
+ # with gr.TabItem(label='Inpaint or Outpaint') as inpaint_tab:
165
+ # with gr.Row():
166
+ # inpaint_input_image = grh.Image(label='Drag inpaint or outpaint image to here', source='upload', type='numpy', tool='sketch', height=500, brush_color="#FFFFFF", elem_id='inpaint_canvas')
167
+ # inpaint_mask_image = grh.Image(label='Mask Upload', source='upload', type='numpy', height=500, visible=False)
168
+
169
+ # with gr.Row():
170
+ # inpaint_additional_prompt = gr.Textbox(placeholder="Describe what you want to inpaint.", elem_id='inpaint_additional_prompt', label='Inpaint Additional Prompt', visible=False)
171
+ # outpaint_selections = gr.CheckboxGroup(choices=['Left', 'Right', 'Top', 'Bottom'], value=[], label='Outpaint Direction')
172
+ # inpaint_mode = gr.Dropdown(choices=modules.flags.inpaint_options, value=modules.flags.inpaint_option_default, label='Method')
173
+ # example_inpaint_prompts = gr.Dataset(samples=modules.config.example_inpaint_prompts, label='Additional Prompt Quick List', components=[inpaint_additional_prompt], visible=False)
174
+ # gr.HTML('* Powered by Fooocus Inpaint Engine <a href="https://github.com/lllyasviel/Fooocus/discussions/414" target="_blank">\U0001F4D4 Document</a>')
175
+ # example_inpaint_prompts.click(lambda x: x[0], inputs=example_inpaint_prompts, outputs=inpaint_additional_prompt, show_progress=False, queue=False)
176
+ with gr.TabItem(label='Describe') as desc_tab:
177
+ with gr.Row():
178
+ with gr.Column():
179
+ desc_input_image = grh.Image(label='Drag any image to here', source='upload', type='numpy')
180
+ with gr.Column():
181
+ # with gr.Row(elem_classes='type_row'):
182
+ with gr.Row():
183
+ prompt = gr.Textbox(label="Output", show_label=True, elem_id='positive_prompt', container=True, autofocus=True, show_copy_button=True, interactive=True)
184
+
185
+ default_prompt = modules.config.default_prompt
186
+ if isinstance(default_prompt, str) and default_prompt != '':
187
+ shared.gradio_root.load(lambda: default_prompt, outputs=prompt)
188
+
189
+ # with gr.Column(scale=3, min_width=0):
190
+ # generate_button = gr.Button(label="Generate", value="Generate", elem_classes='type_row', elem_id='generate_button', visible=True)
191
+ # load_parameter_button = gr.Button(label="Load Parameters", value="Load Parameters", elem_classes='type_row', elem_id='load_parameter_button', visible=False)
192
+ # skip_button = gr.Button(label="Skip", value="Skip", elem_classes='type_row_half', visible=False)
193
+ # stop_button = gr.Button(label="Stop", value="Stop", elem_classes='type_row_half', elem_id='stop_button', visible=False)
194
+
195
+ # def stop_clicked(currentTask):
196
+ # import ldm_patched.modules.model_management as model_management
197
+ # currentTask.last_stop = 'stop'
198
+ # if (currentTask.processing):
199
+ # model_management.interrupt_current_processing()
200
+ # return currentTask
201
+
202
+ # def skip_clicked(currentTask):
203
+ # import ldm_patched.modules.model_management as model_management
204
+ # currentTask.last_stop = 'skip'
205
+ # if (currentTask.processing):
206
+ # model_management.interrupt_current_processing()
207
+ # return currentTask
208
+
209
+ # stop_button.click(stop_clicked, inputs=currentTask, outputs=currentTask, queue=False, show_progress=False, _js='cancelGenerateForever')
210
+ # skip_button.click(skip_clicked, inputs=currentTask, outputs=currentTask, queue=False, show_progress=False)
211
+ # with gr.Row(elem_classes='advanced_check_row'):
212
+ # # input_image_checkbox = gr.Checkbox(label='Input Image', value=False, container=False, elem_classes='min_check')
213
+ # advanced_checkbox = gr.Checkbox(label='Advanced', value=modules.config.default_advanced_checkbox, container=False, elem_classes='min_check')
214
+ with gr.Row():
215
+ desc_method = gr.Radio(
216
+ label='Content Type',
217
+ choices=[flags.desc_type_photo, flags.desc_type_anime],
218
+ value=flags.desc_type_photo)
219
+ desc_btn = gr.Button(value='Describe this Image into Prompt')
220
+ # gr.HTML('<a href="https://github.com/lllyasviel/Fooocus/discussions/1363" target="_blank">\U0001F4D4 Document</a>')
221
+ # with gr.TabItem(label='Metadata') as load_tab:
222
+ # with gr.Column():
223
+ # metadata_input_image = grh.Image(label='Drag any image generated by Fooocus here', source='upload', type='filepath')
224
+ # metadata_json = gr.JSON(label='Metadata')
225
+ # metadata_import_button = gr.Button(value='Apply Metadata')
226
+
227
+ # def trigger_metadata_preview(filepath):
228
+ # parameters, metadata_scheme = modules.meta_parser.read_info_from_image(filepath)
229
+
230
+ # results = {}
231
+ # if parameters is not None:
232
+ # results['parameters'] = parameters
233
+
234
+ # if isinstance(metadata_scheme, flags.MetadataScheme):
235
+ # results['metadata_scheme'] = metadata_scheme.value
236
+
237
+ # return results
238
+
239
+ # metadata_input_image.upload(trigger_metadata_preview, inputs=metadata_input_image,
240
+ # outputs=metadata_json, queue=False, show_progress=True)
241
+
242
+ switch_js = "(x) => {if(x){viewer_to_bottom(100);viewer_to_bottom(500);}else{viewer_to_top();} return x;}"
243
+ down_js = "() => {viewer_to_bottom();}"
244
+
245
+ # input_image_checkbox.change(lambda x: gr.update(visible=x), inputs=input_image_checkbox,
246
+ # outputs=image_input_panel, queue=False, show_progress=False, _js=switch_js)
247
+ # ip_advanced.change(lambda: None, queue=False, show_progress=False, _js=down_js)
248
+
249
+ # current_tab = gr.Textbox(value='desc', visible=False)
250
+ # # uov_tab.select(lambda: 'uov', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
251
+ # # inpaint_tab.select(lambda: 'inpaint', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
252
+ # # ip_tab.select(lambda: 'ip', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
253
+ # desc_tab.select(lambda: 'desc', outputs=current_tab, queue=False, _js=down_js, show_progress=False)
254
+
255
+ # with gr.Column(scale=1, visible=modules.config.default_advanced_checkbox) as advanced_column:
256
+ # with gr.Tab(label='Setting'):
257
+ # performance_selection = gr.Radio(label='Performance',
258
+ # choices=modules.flags.performance_selections,
259
+ # value=modules.config.default_performance)
260
+ # aspect_ratios_selection = gr.Radio(label='Aspect Ratios', choices=modules.config.available_aspect_ratios,
261
+ # value=modules.config.default_aspect_ratio, info='width × height',
262
+ # elem_classes='aspect_ratios')
263
+ # image_number = gr.Slider(label='Image Number', minimum=1, maximum=modules.config.default_max_image_number, step=1, value=modules.config.default_image_number)
264
+
265
+ # output_format = gr.Radio(label='Output Format',
266
+ # choices=modules.flags.output_formats,
267
+ # value=modules.config.default_output_format)
268
+
269
+ # negative_prompt = gr.Textbox(label='Negative Prompt', show_label=True, placeholder="Type prompt here.",
270
+ # info='Describing what you do not want to see.', lines=2,
271
+ # elem_id='negative_prompt',
272
+ # value=modules.config.default_prompt_negative)
273
+ # seed_random = gr.Checkbox(label='Random', value=True)
274
+ # image_seed = gr.Textbox(label='Seed', value=0, max_lines=1, visible=False) # workaround for https://github.com/gradio-app/gradio/issues/5354
275
+
276
+ # def random_checked(r):
277
+ # return gr.update(visible=not r)
278
+
279
+ # def refresh_seed(r, seed_string):
280
+ # if r:
281
+ # return random.randint(constants.MIN_SEED, constants.MAX_SEED)
282
+ # else:
283
+ # try:
284
+ # seed_value = int(seed_string)
285
+ # if constants.MIN_SEED <= seed_value <= constants.MAX_SEED:
286
+ # return seed_value
287
+ # except ValueError:
288
+ # pass
289
+ # return random.randint(constants.MIN_SEED, constants.MAX_SEED)
290
+
291
+ # seed_random.change(random_checked, inputs=[seed_random], outputs=[image_seed],
292
+ # queue=False, show_progress=False)
293
+
294
+ # def update_history_link():
295
+ # if args_manager.args.disable_image_log:
296
+ # return gr.update(value='')
297
+
298
+ # return gr.update(value=f'<a href="file={get_current_html_path(output_format)}" target="_blank">\U0001F4DA History Log</a>')
299
+
300
+ # history_link = gr.HTML()
301
+ # shared.gradio_root.load(update_history_link, outputs=history_link, queue=False, show_progress=False)
302
+
303
+ # with gr.Tab(label='Style'):
304
+ # style_sorter.try_load_sorted_styles(
305
+ # style_names=legal_style_names,
306
+ # default_selected=modules.config.default_styles)
307
+
308
+ # style_search_bar = gr.Textbox(show_label=False, container=False,
309
+ # placeholder="\U0001F50E Type here to search styles ...",
310
+ # value="",
311
+ # label='Search Styles')
312
+ # style_selections = gr.CheckboxGroup(show_label=False, container=False,
313
+ # choices=copy.deepcopy(style_sorter.all_styles),
314
+ # value=copy.deepcopy(modules.config.default_styles),
315
+ # label='Selected Styles',
316
+ # elem_classes=['style_selections'])
317
+ # gradio_receiver_style_selections = gr.Textbox(elem_id='gradio_receiver_style_selections', visible=False)
318
+
319
+ # shared.gradio_root.load(lambda: gr.update(choices=copy.deepcopy(style_sorter.all_styles)),
320
+ # outputs=style_selections)
321
+
322
+ # style_search_bar.change(style_sorter.search_styles,
323
+ # inputs=[style_selections, style_search_bar],
324
+ # outputs=style_selections,
325
+ # queue=False,
326
+ # show_progress=False).then(
327
+ # lambda: None, _js='()=>{refresh_style_localization();}')
328
+
329
+ # gradio_receiver_style_selections.input(style_sorter.sort_styles,
330
+ # inputs=style_selections,
331
+ # outputs=style_selections,
332
+ # queue=False,
333
+ # show_progress=False).then(
334
+ # lambda: None, _js='()=>{refresh_style_localization();}')
335
+
336
+ # with gr.Tab(label='Model'):
337
+ # with gr.Group():
338
+ # with gr.Row():
339
+ # base_model = gr.Dropdown(label='Base Model (SDXL only)', choices=modules.config.model_filenames, value=modules.config.default_base_model_name, show_label=True)
340
+ # refiner_model = gr.Dropdown(label='Refiner (SDXL or SD 1.5)', choices=['None'] + modules.config.model_filenames, value=modules.config.default_refiner_model_name, show_label=True)
341
+
342
+ # refiner_switch = gr.Slider(label='Refiner Switch At', minimum=0.1, maximum=1.0, step=0.0001,
343
+ # info='Use 0.4 for SD1.5 realistic models; '
344
+ # 'or 0.667 for SD1.5 anime models; '
345
+ # 'or 0.8 for XL-refiners; '
346
+ # 'or any value for switching two SDXL models.',
347
+ # value=modules.config.default_refiner_switch,
348
+ # visible=modules.config.default_refiner_model_name != 'None')
349
+
350
+ # refiner_model.change(lambda x: gr.update(visible=x != 'None'),
351
+ # inputs=refiner_model, outputs=refiner_switch, show_progress=False, queue=False)
352
+
353
+ # with gr.Group():
354
+ # lora_ctrls = []
355
+
356
+ # for i, (n, v) in enumerate(modules.config.default_loras):
357
+ # with gr.Row():
358
+ # lora_enabled = gr.Checkbox(label='Enable', value=True,
359
+ # elem_classes=['lora_enable', 'min_check'], scale=1)
360
+ # lora_model = gr.Dropdown(label=f'LoRA {i + 1}',
361
+ # choices=['None'] + modules.config.lora_filenames, value=n,
362
+ # elem_classes='lora_model', scale=5)
363
+ # lora_weight = gr.Slider(label='Weight', minimum=modules.config.default_loras_min_weight,
364
+ # maximum=modules.config.default_loras_max_weight, step=0.01, value=v,
365
+ # elem_classes='lora_weight', scale=5)
366
+ # lora_ctrls += [lora_enabled, lora_model, lora_weight]
367
+
368
+ # with gr.Row():
369
+ # model_refresh = gr.Button(label='Refresh', value='\U0001f504 Refresh All Files', variant='secondary', elem_classes='refresh_button')
370
+ # with gr.Tab(label='Advanced'):
371
+ # guidance_scale = gr.Slider(label='Guidance Scale', minimum=1.0, maximum=30.0, step=0.01,
372
+ # value=modules.config.default_cfg_scale,
373
+ # info='Higher value means style is cleaner, vivider, and more artistic.')
374
+ # sharpness = gr.Slider(label='Image Sharpness', minimum=0.0, maximum=30.0, step=0.001,
375
+ # value=modules.config.default_sample_sharpness,
376
+ # info='Higher value means image and texture are sharper.')
377
+ # gr.HTML('<a href="https://github.com/lllyasviel/Fooocus/discussions/117" target="_blank">\U0001F4D4 Document</a>')
378
+ # dev_mode = gr.Checkbox(label='Developer Debug Mode', value=False, container=False)
379
+
380
+ # with gr.Column(visible=False) as dev_tools:
381
+ # with gr.Tab(label='Debug Tools'):
382
+ # adm_scaler_positive = gr.Slider(label='Positive ADM Guidance Scaler', minimum=0.1, maximum=3.0,
383
+ # step=0.001, value=1.5, info='The scaler multiplied to positive ADM (use 1.0 to disable). ')
384
+ # adm_scaler_negative = gr.Slider(label='Negative ADM Guidance Scaler', minimum=0.1, maximum=3.0,
385
+ # step=0.001, value=0.8, info='The scaler multiplied to negative ADM (use 1.0 to disable). ')
386
+ # adm_scaler_end = gr.Slider(label='ADM Guidance End At Step', minimum=0.0, maximum=1.0,
387
+ # step=0.001, value=0.3,
388
+ # info='When to end the guidance from positive/negative ADM. ')
389
+
390
+ # refiner_swap_method = gr.Dropdown(label='Refiner swap method', value=flags.refiner_swap_method,
391
+ # choices=['joint', 'separate', 'vae'])
392
+
393
+ # adaptive_cfg = gr.Slider(label='CFG Mimicking from TSNR', minimum=1.0, maximum=30.0, step=0.01,
394
+ # value=modules.config.default_cfg_tsnr,
395
+ # info='Enabling Fooocus\'s implementation of CFG mimicking for TSNR '
396
+ # '(effective when real CFG > mimicked CFG).')
397
+ # sampler_name = gr.Dropdown(label='Sampler', choices=flags.sampler_list,
398
+ # value=modules.config.default_sampler)
399
+ # scheduler_name = gr.Dropdown(label='Scheduler', choices=flags.scheduler_list,
400
+ # value=modules.config.default_scheduler)
401
+
402
+ # generate_image_grid = gr.Checkbox(label='Generate Image Grid for Each Batch',
403
+ # info='(Experimental) This may cause performance problems on some computers and certain internet conditions.',
404
+ # value=False)
405
+
406
+ # overwrite_step = gr.Slider(label='Forced Overwrite of Sampling Step',
407
+ # minimum=-1, maximum=200, step=1,
408
+ # value=modules.config.default_overwrite_step,
409
+ # info='Set as -1 to disable. For developer debugging.')
410
+ # overwrite_switch = gr.Slider(label='Forced Overwrite of Refiner Switch Step',
411
+ # minimum=-1, maximum=200, step=1,
412
+ # value=modules.config.default_overwrite_switch,
413
+ # info='Set as -1 to disable. For developer debugging.')
414
+ # overwrite_width = gr.Slider(label='Forced Overwrite of Generating Width',
415
+ # minimum=-1, maximum=2048, step=1, value=-1,
416
+ # info='Set as -1 to disable. For developer debugging. '
417
+ # 'Results will be worse for non-standard numbers that SDXL is not trained on.')
418
+ # overwrite_height = gr.Slider(label='Forced Overwrite of Generating Height',
419
+ # minimum=-1, maximum=2048, step=1, value=-1,
420
+ # info='Set as -1 to disable. For developer debugging. '
421
+ # 'Results will be worse for non-standard numbers that SDXL is not trained on.')
422
+ # overwrite_vary_strength = gr.Slider(label='Forced Overwrite of Denoising Strength of "Vary"',
423
+ # minimum=-1, maximum=1.0, step=0.001, value=-1,
424
+ # info='Set as negative number to disable. For developer debugging.')
425
+ # overwrite_upscale_strength = gr.Slider(label='Forced Overwrite of Denoising Strength of "Upscale"',
426
+ # minimum=-1, maximum=1.0, step=0.001, value=-1,
427
+ # info='Set as negative number to disable. For developer debugging.')
428
+ # disable_preview = gr.Checkbox(label='Disable Preview', value=False,
429
+ # info='Disable preview during generation.')
430
+ # disable_intermediate_results = gr.Checkbox(label='Disable Intermediate Results',
431
+ # value=modules.config.default_performance == 'Extreme Speed',
432
+ # interactive=modules.config.default_performance != 'Extreme Speed',
433
+ # info='Disable intermediate results during generation, only show final gallery.')
434
+ # disable_seed_increment = gr.Checkbox(label='Disable seed increment',
435
+ # info='Disable automatic seed increment when image number is > 1.',
436
+ # value=False)
437
+
438
+ # # if not args_manager.args.disable_metadata:
439
+ # # save_metadata_to_images = gr.Checkbox(label='Save Metadata to Images', value=modules.config.default_save_metadata_to_images,
440
+ # # info='Adds parameters to generated images allowing manual regeneration.')
441
+ # # metadata_scheme = gr.Radio(label='Metadata Scheme', choices=flags.metadata_scheme, value=modules.config.default_metadata_scheme,
442
+ # # info='Image Prompt parameters are not included. Use png and a1111 for compatibility with Civitai.',
443
+ # # visible=modules.config.default_save_metadata_to_images)
444
+
445
+ # # save_metadata_to_images.change(lambda x: gr.update(visible=x), inputs=[save_metadata_to_images], outputs=[metadata_scheme],
446
+ # # queue=False, show_progress=False)
447
+
448
+ # # with gr.Tab(label='Control'):
449
+ # # debugging_cn_preprocessor = gr.Checkbox(label='Debug Preprocessors', value=False,
450
+ # # info='See the results from preprocessors.')
451
+ # # skipping_cn_preprocessor = gr.Checkbox(label='Skip Preprocessors', value=False,
452
+ # # info='Do not preprocess images. (Inputs are already canny/depth/cropped-face/etc.)')
453
+
454
+ # # mixing_image_prompt_and_vary_upscale = gr.Checkbox(label='Mixing Image Prompt and Vary/Upscale',
455
+ # # value=False)
456
+ # # mixing_image_prompt_and_inpaint = gr.Checkbox(label='Mixing Image Prompt and Inpaint',
457
+ # # value=False)
458
+
459
+ # # controlnet_softness = gr.Slider(label='Softness of ControlNet', minimum=0.0, maximum=1.0,
460
+ # # step=0.001, value=0.25,
461
+ # # info='Similar to the Control Mode in A1111 (use 0.0 to disable). ')
462
+
463
+ # # with gr.Tab(label='Canny'):
464
+ # # canny_low_threshold = gr.Slider(label='Canny Low Threshold', minimum=1, maximum=255,
465
+ # # step=1, value=64)
466
+ # # canny_high_threshold = gr.Slider(label='Canny High Threshold', minimum=1, maximum=255,
467
+ # # step=1, value=128)
468
+
469
+ # # with gr.Tab(label='Inpaint'):
470
+ # # debugging_inpaint_preprocessor = gr.Checkbox(label='Debug Inpaint Preprocessing', value=False)
471
+ # # inpaint_disable_initial_latent = gr.Checkbox(label='Disable initial latent in inpaint', value=False)
472
+ # # inpaint_engine = gr.Dropdown(label='Inpaint Engine',
473
+ # # value=modules.config.default_inpaint_engine_version,
474
+ # # choices=flags.inpaint_engine_versions,
475
+ # # info='Version of Fooocus inpaint model')
476
+ # # inpaint_strength = gr.Slider(label='Inpaint Denoising Strength',
477
+ # # minimum=0.0, maximum=1.0, step=0.001, value=1.0,
478
+ # # info='Same as the denoising strength in A1111 inpaint. '
479
+ # # 'Only used in inpaint, not used in outpaint. '
480
+ # # '(Outpaint always use 1.0)')
481
+ # # inpaint_respective_field = gr.Slider(label='Inpaint Respective Field',
482
+ # # minimum=0.0, maximum=1.0, step=0.001, value=0.618,
483
+ # # info='The area to inpaint. '
484
+ # # 'Value 0 is same as "Only Masked" in A1111. '
485
+ # # 'Value 1 is same as "Whole Image" in A1111. '
486
+ # # 'Only used in inpaint, not used in outpaint. '
487
+ # # '(Outpaint always use 1.0)')
488
+ # # inpaint_erode_or_dilate = gr.Slider(label='Mask Erode or Dilate',
489
+ # # minimum=-64, maximum=64, step=1, value=0,
490
+ # # info='Positive value will make white area in the mask larger, '
491
+ # # 'negative value will make white area smaller.'
492
+ # # '(default is 0, always process before any mask invert)')
493
+ # # inpaint_mask_upload_checkbox = gr.Checkbox(label='Enable Mask Upload', value=False)
494
+ # # invert_mask_checkbox = gr.Checkbox(label='Invert Mask', value=False)
495
+
496
+ # # inpaint_ctrls = [debugging_inpaint_preprocessor, inpaint_disable_initial_latent, inpaint_engine,
497
+ # # inpaint_strength, inpaint_respective_field,
498
+ # # inpaint_mask_upload_checkbox, invert_mask_checkbox, inpaint_erode_or_dilate]
499
+
500
+ # # inpaint_mask_upload_checkbox.change(lambda x: gr.update(visible=x),
501
+ # # inputs=inpaint_mask_upload_checkbox,
502
+ # # outputs=inpaint_mask_image, queue=False, show_progress=False)
503
+
504
+ # with gr.Tab(label='FreeU'):
505
+ # freeu_enabled = gr.Checkbox(label='Enabled', value=False)
506
+ # freeu_b1 = gr.Slider(label='B1', minimum=0, maximum=2, step=0.01, value=1.01)
507
+ # freeu_b2 = gr.Slider(label='B2', minimum=0, maximum=2, step=0.01, value=1.02)
508
+ # freeu_s1 = gr.Slider(label='S1', minimum=0, maximum=4, step=0.01, value=0.99)
509
+ # freeu_s2 = gr.Slider(label='S2', minimum=0, maximum=4, step=0.01, value=0.95)
510
+ # freeu_ctrls = [freeu_enabled, freeu_b1, freeu_b2, freeu_s1, freeu_s2]
511
+
512
+ # def dev_mode_checked(r):
513
+ # return gr.update(visible=r)
514
+
515
+
516
+ # dev_mode.change(dev_mode_checked, inputs=[dev_mode], outputs=[dev_tools],
517
+ # queue=False, show_progress=False)
518
+
519
+ # def model_refresh_clicked():
520
+ # modules.config.update_all_model_names()
521
+ # results = [gr.update(choices=modules.config.model_filenames)]
522
+ # results += [gr.update(choices=['None'] + modules.config.model_filenames)]
523
+ # for i in range(modules.config.default_max_lora_number):
524
+ # results += [gr.update(interactive=True), gr.update(choices=['None'] + modules.config.lora_filenames), gr.update()]
525
+ # return results
526
+
527
+ # model_refresh.click(model_refresh_clicked, [], [base_model, refiner_model] + lora_ctrls,
528
+ # queue=False, show_progress=False)
529
+
530
+ # performance_selection.change(lambda x: [gr.update(interactive=x != 'Extreme Speed')] * 11 +
531
+ # [gr.update(visible=x != 'Extreme Speed')] * 1 +
532
+ # [gr.update(interactive=x != 'Extreme Speed', value=x == 'Extreme Speed', )] * 1,
533
+ # inputs=performance_selection,
534
+ # outputs=[
535
+ # guidance_scale, sharpness, adm_scaler_end, adm_scaler_positive,
536
+ # adm_scaler_negative, refiner_switch, refiner_model, sampler_name,
537
+ # scheduler_name, adaptive_cfg, refiner_swap_method, negative_prompt, disable_intermediate_results
538
+ # ], queue=False, show_progress=False)
539
+
540
+ # output_format.input(lambda x: gr.update(output_format=x), inputs=output_format)
541
+
542
+ # advanced_checkbox.change(lambda x: gr.update(visible=x), advanced_checkbox, advanced_column,
543
+ # queue=False, show_progress=False) \
544
+ # .then(fn=lambda: None, _js='refresh_grid_delayed', queue=False, show_progress=False)
545
+
546
+ # def inpaint_mode_change(mode):
547
+ # assert mode in modules.flags.inpaint_options
548
+
549
+ # # inpaint_additional_prompt, outpaint_selections, example_inpaint_prompts,
550
+ # # inpaint_disable_initial_latent, inpaint_engine,
551
+ # # inpaint_strength, inpaint_respective_field
552
+
553
+ # if mode == modules.flags.inpaint_option_detail:
554
+ # return [
555
+ # gr.update(visible=True), gr.update(visible=False, value=[]),
556
+ # gr.Dataset.update(visible=True, samples=modules.config.example_inpaint_prompts),
557
+ # False, 'None', 0.5, 0.0
558
+ # ]
559
+
560
+ # if mode == modules.flags.inpaint_option_modify:
561
+ # return [
562
+ # gr.update(visible=True), gr.update(visible=False, value=[]),
563
+ # gr.Dataset.update(visible=False, samples=modules.config.example_inpaint_prompts),
564
+ # True, modules.config.default_inpaint_engine_version, 1.0, 0.0
565
+ # ]
566
+
567
+ # return [
568
+ # gr.update(visible=False, value=''), gr.update(visible=True),
569
+ # gr.Dataset.update(visible=False, samples=modules.config.example_inpaint_prompts),
570
+ # False, modules.config.default_inpaint_engine_version, 1.0, 0.618
571
+ # ]
572
+
573
+ # inpaint_mode.input(inpaint_mode_change, inputs=inpaint_mode, outputs=[
574
+ # inpaint_additional_prompt, outpaint_selections, example_inpaint_prompts,
575
+ # inpaint_disable_initial_latent, inpaint_engine,
576
+ # inpaint_strength, inpaint_respective_field
577
+ # ], show_progress=False, queue=False)
578
+
579
+ # ctrls = [currentTask, generate_image_grid]
580
+ # ctrls += [
581
+ # prompt, negative_prompt, style_selections,
582
+ # performance_selection, aspect_ratios_selection, image_number, output_format, image_seed, sharpness, guidance_scale
583
+ # ]
584
+
585
+ # ctrls += [base_model, refiner_model, refiner_switch] + lora_ctrls
586
+ # # ctrls += [input_image_checkbox, current_tab]
587
+ # # ctrls += [uov_method, uov_input_image]
588
+ # # ctrls += [outpaint_selections, inpaint_input_image, inpaint_additional_prompt, inpaint_mask_image]
589
+ # ctrls += [disable_preview, disable_intermediate_results, disable_seed_increment]
590
+ # ctrls += [adm_scaler_positive, adm_scaler_negative, adm_scaler_end, adaptive_cfg]
591
+ # ctrls += [sampler_name, scheduler_name]
592
+ # ctrls += [overwrite_step, overwrite_switch, overwrite_width, overwrite_height, overwrite_vary_strength]
593
+ # ctrls += [overwrite_upscale_strength, mixing_image_prompt_and_vary_upscale, mixing_image_prompt_and_inpaint]
594
+ # ctrls += [debugging_cn_preprocessor, skipping_cn_preprocessor, canny_low_threshold, canny_high_threshold]
595
+ # ctrls += [refiner_swap_method, controlnet_softness]
596
+ # ctrls += freeu_ctrls
597
+ # ctrls += inpaint_ctrls
598
+
599
+ # if not args_manager.args.disable_metadata:
600
+ # ctrls += [save_metadata_to_images, metadata_scheme]
601
+
602
+ # ctrls += ip_ctrls
603
+
604
+ # state_is_generating = gr.State(False)
605
+
606
+ # def parse_meta(raw_prompt_txt, is_generating):
607
+ # loaded_json = None
608
+ # if is_json(raw_prompt_txt):
609
+ # loaded_json = json.loads(raw_prompt_txt)
610
+
611
+ # if loaded_json is None:
612
+ # if is_generating:
613
+ # return gr.update(), gr.update(), gr.update()
614
+ # else:
615
+ # return gr.update(), gr.update(visible=True), gr.update(visible=False)
616
+
617
+ # return json.dumps(loaded_json), gr.update(visible=False), gr.update(visible=True)
618
+
619
+ # prompt.input(parse_meta, inputs=[prompt, state_is_generating], outputs=[prompt, generate_button, load_parameter_button], queue=False, show_progress=False)
620
+
621
+ # load_data_outputs = [advanced_checkbox, image_number, prompt, negative_prompt, style_selections,
622
+ # performance_selection, overwrite_step, overwrite_switch, aspect_ratios_selection,
623
+ # overwrite_width, overwrite_height, guidance_scale, sharpness, adm_scaler_positive,
624
+ # adm_scaler_negative, adm_scaler_end, refiner_swap_method, adaptive_cfg, base_model,
625
+ # refiner_model, refiner_switch, sampler_name, scheduler_name, seed_random, image_seed,
626
+ # generate_button, load_parameter_button] + freeu_ctrls + lora_ctrls
627
+
628
+ # load_parameter_button.click(modules.meta_parser.load_parameter_button_click, inputs=[prompt, state_is_generating], outputs=load_data_outputs, queue=False, show_progress=False)
629
+
630
+ # # def trigger_metadata_import(filepath, state_is_generating):
631
+ # # parameters, metadata_scheme = modules.meta_parser.read_info_from_image(filepath)
632
+ # # if parameters is None:
633
+ # # print('Could not find metadata in the image!')
634
+ # # parsed_parameters = {}
635
+ # # else:
636
+ # # metadata_parser = modules.meta_parser.get_metadata_parser(metadata_scheme)
637
+ # # parsed_parameters = metadata_parser.parse_json(parameters)
638
+
639
+ # # return modules.meta_parser.load_parameter_button_click(parsed_parameters, state_is_generating)
640
+
641
+
642
+ # # metadata_import_button.click(trigger_metadata_import, inputs=[metadata_input_image, state_is_generating], outputs=load_data_outputs, queue=False, show_progress=True) \
643
+ # # .then(style_sorter.sort_styles, inputs=style_selections, outputs=style_selections, queue=False, show_progress=False)
644
+
645
+ # generate_button.click(lambda: (gr.update(visible=True, interactive=True), gr.update(visible=True, interactive=True), gr.update(visible=False, interactive=False), [], True),
646
+ # outputs=[stop_button, skip_button, generate_button, gallery, state_is_generating]) \
647
+ # .then(fn=refresh_seed, inputs=[seed_random, image_seed], outputs=image_seed) \
648
+ # .then(fn=get_task, inputs=ctrls, outputs=currentTask) \
649
+ # .then(fn=generate_clicked, inputs=currentTask, outputs=[progress_html, progress_window, progress_gallery, gallery]) \
650
+ # .then(lambda: (gr.update(visible=True, interactive=True), gr.update(visible=False, interactive=False), gr.update(visible=False, interactive=False), False),
651
+ # outputs=[generate_button, stop_button, skip_button, state_is_generating]) \
652
+ # .then(fn=update_history_link, outputs=history_link) \
653
+ # .then(fn=lambda: None, _js='playNotification').then(fn=lambda: None, _js='refresh_grid_delayed')
654
+
655
+ for notification_file in ['notification.ogg', 'notification.mp3']:
656
+ if os.path.exists(notification_file):
657
+ gr.Audio(interactive=False, value=notification_file, elem_id='audio_notification', visible=False)
658
+ break
659
+
660
+ def trigger_describe(mode, img):
661
+ if mode == flags.desc_type_photo:
662
+ from extras.interrogate import default_interrogator as default_interrogator_photo
663
+ return default_interrogator_photo(img), ["Fooocus V2", "Fooocus Enhance", "Fooocus Sharp"]
664
+ if mode == flags.desc_type_anime:
665
+ from extras.wd14tagger import default_interrogator as default_interrogator_anime
666
+ return default_interrogator_anime(img), ["Fooocus V2", "Fooocus Masterpiece"]
667
+ return mode, ["Fooocus V2"]
668
+
669
+ desc_btn.click(trigger_describe, inputs=[desc_method, desc_input_image],
670
+ outputs=prompt, show_progress=True, queue=True)
671
+
672
+
673
+ def dump_default_english_config():
674
+ from modules.localization import dump_english_config
675
+ dump_english_config(grh.all_components)
676
+
677
+
678
+ # dump_default_english_config()
679
+
680
+ shared.gradio_root.launch(
681
+ inbrowser=args_manager.args.in_browser,
682
+ server_name=args_manager.args.listen,
683
+ server_port=args_manager.args.port,
684
+ share=args_manager.args.share,
685
+ auth=check_auth if (args_manager.args.share or args_manager.args.listen) and auth_enabled else None,
686
+ allowed_paths=[modules.config.path_outputs],
687
+ blocked_paths=[constants.AUTH_FILENAME]
688
+ )
app.html ADDED
@@ -0,0 +1,6 @@
+ <script
+     type="module"
+     src="https://gradio.s3-us-west-2.amazonaws.com/4.21.0/gradio.js"
+ ></script>
+
+ <gradio-app src="https://Adityadn-test.hf.space"></gradio-app>
app.py ADDED
@@ -0,0 +1,67 @@
+ print("Wait..")
+
+ def test():
+     import gradio as gr
+
+     def analyze_text(text):
+         # Do the text analysis or processing here
+         result = f"You entered the text: {text}"
+         return result
+
+     iface = gr.Interface(
+         fn=analyze_text,
+         inputs=gr.Textbox(),  # Use a textbox input
+         outputs="text"  # Set the output type to text
+     )
+
+     iface.launch()
+
+ def process():
+     import subprocess
+
+     def uninstall_and_install_gradio(version):
+         # Uninstall current Gradio
+         uninstall_command = ["pip", "uninstall", "gradio", "-y"]
+         subprocess.run(uninstall_command)
+
+         # Install specific version of Gradio
+         install_command = ["pip", "install", f"gradio=={version}"]
+         subprocess.run(install_command)
+
+     # Replace "3.41.2" with the desired Gradio version
+     desired_version = "3.41.2"
+
+     # Check the installed Gradio version
+     current_version_command = ["pip", "show", "gradio"]
+     result = subprocess.run(current_version_command, capture_output=True, text=True)
+     current_version = None
+
+     if "Version" in result.stdout:
+         current_version = result.stdout.split("Version:")[1].strip()
+
+     # Uninstall and reinstall if the installed version does not match
+     if current_version != desired_version:
+         uninstall_and_install_gradio(desired_version)
+         print(f"Gradio has been updated to version {desired_version}")
+     else:
+         print(f"Gradio is already at version {desired_version}")
+
+     python_script = "entry_with_update.py"
+
+     # Arguments you want to add
+     # additional_arguments = ["--in-browser", "--all-in-fp32", "--directml", "--debug-mode", "--multi-user", "--always-cpu", "--is-windows-embedded-python"]
+     additional_arguments = ["--always-cpu"]
+
+     # Combine all arguments
+     PIP = ["pip", "install", "-r", "requirements.txt"]
+     command = ["python", python_script] + additional_arguments
+
+     # Run the scripts using subprocess
+     subprocess.run(PIP)
+     print("Installing..")
+
+     subprocess.run(command)  # Run the launch script
+     print("Running..")
+     # subprocess.run([batch_file_path], shell=True)
+
+ process()
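
An illustrative alternative (not part of this commit) to the pip-parsing version check in process(): the standard library's importlib.metadata reads the installed Gradio version directly, which avoids spawning pip and parsing its text output.

    # Sketch only; assumes Python 3.8+, where importlib.metadata is available.
    from importlib.metadata import version, PackageNotFoundError

    try:
        current_version = version("gradio")
    except PackageNotFoundError:
        current_version = None  # Gradio is not installed

    if current_version != "3.41.2":
        print(f"Installed Gradio is {current_version}, expected 3.41.2")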
args_manager.py ADDED
@@ -0,0 +1,55 @@
+ import ldm_patched.modules.args_parser as args_parser
+ import os
+
+ from tempfile import gettempdir
+
+ args_parser.parser.add_argument("--share", action='store_true', help="Set whether to share on Gradio.")
+ args_parser.parser.add_argument("--preset", type=str, default=None, help="Apply specified UI preset.")
+
+ args_parser.parser.add_argument("--language", type=str, default='default',
+                                 help="Translate UI using json files in [language] folder. "
+                                      "For example, [--language example] will use [language/example.json] for translation.")
+
+ # For example, https://github.com/lllyasviel/Fooocus/issues/849
+ args_parser.parser.add_argument("--disable-offload-from-vram", action="store_true",
+                                 help="Force loading models to vram when the unload can be avoided. "
+                                      "Some Mac users may need this.")
+
+ args_parser.parser.add_argument("--theme", type=str, help="launches the UI with light or dark theme", default=None)
+ args_parser.parser.add_argument("--disable-image-log", action='store_true',
+                                 help="Prevent writing images and logs to hard drive.")
+
+ args_parser.parser.add_argument("--disable-analytics", action='store_true',
+                                 help="Disables analytics for Gradio.")
+
+ args_parser.parser.add_argument("--disable-metadata", action='store_true',
+                                 help="Disables saving metadata to images.")
+
+ args_parser.parser.add_argument("--disable-preset-download", action='store_true',
+                                 help="Disables downloading models for presets", default=False)
+
+ args_parser.parser.add_argument("--always-download-new-model", action='store_true',
+                                 help="Always download newer models ", default=False)
+
+ args_parser.parser.set_defaults(
+     disable_cuda_malloc=True,
+     in_browser=True,
+     port=None
+ )
+
+ args_parser.args = args_parser.parser.parse_args()
+
+ # (Disable by default because of issues like https://github.com/lllyasviel/Fooocus/issues/724)
+ args_parser.args.always_offload_from_vram = not args_parser.args.disable_offload_from_vram
+
+ if args_parser.args.disable_analytics:
+     import os
+     os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
+
+ if args_parser.args.disable_in_browser:
+     args_parser.args.in_browser = False
+
+ if args_parser.args.temp_path is None:
+     args_parser.args.temp_path = os.path.join(gettempdir(), 'Fooocus')
+
+ args = args_parser.args
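
The flags registered above are consumed elsewhere through args_manager.args; a minimal illustrative check (a sketch, not part of the committed files; aiDescUI.py reads these same attributes):

    import args_manager

    if args_manager.args.disable_image_log:
        print("Image logging is disabled")
    print("Share on Gradio:", args_manager.args.share)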
auth-example.json ADDED
@@ -0,0 +1,6 @@
+ [
+     {
+         "user": "sitting-duck-1",
+         "pass": "very-bad-publicly-known-password-change-it"
+     }
+ ]
auth.json ADDED
@@ -0,0 +1,6 @@
+ [
+     {
+         "user": "user123",
+         "pass": "pass123"
+     }
+ ]
build_launcher.py ADDED
@@ -0,0 +1,26 @@
+ import os
+
+ win32_root = os.path.dirname(os.path.dirname(__file__))
+ python_embeded_path = os.path.join(win32_root, 'python_embeded')
+
+ is_win32_standalone_build = os.path.exists(python_embeded_path) and os.path.isdir(python_embeded_path)
+
+ win32_cmd = '''
+ .\python_embeded\python.exe -s Fooocus\entry_with_update.py {cmds} %*
+ pause
+ '''
+
+
+ def build_launcher():
+     if not is_win32_standalone_build:
+         return
+
+     presets = [None, 'anime', 'realistic']
+
+     for preset in presets:
+         win32_cmd_preset = win32_cmd.replace('{cmds}', '' if preset is None else f'--preset {preset}')
+         bat_path = os.path.join(win32_root, 'run.bat' if preset is None else f'run_{preset}.bat')
+         if not os.path.exists(bat_path):
+             with open(bat_path, "w", encoding="utf-8") as f:
+                 f.write(win32_cmd_preset)
+     return
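
For reference, substituting the 'anime' preset into the win32_cmd template above yields a run_anime.bat containing:

    .\python_embeded\python.exe -s Fooocus\entry_with_update.py --preset anime %*
    pause

while the None entry writes the same command without a --preset flag into run.bat.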
config.txt ADDED
@@ -0,0 +1,16 @@
+ {
+     "path_checkpoints": [
+         "models\\checkpoints"
+     ],
+     "path_loras": [
+         "models\\loras"
+     ],
+     "path_embeddings": "models\\embeddings",
+     "path_vae_approx": "models\\vae_approx",
+     "path_upscale_models": "models\\upscale_models",
+     "path_inpaint": "models\\inpaint",
+     "path_controlnet": "models\\controlnet",
+     "path_clip_vision": "models\\clip_vision",
+     "path_fooocus_expansion": "models\\prompt_expansion\\fooocus_expansion",
+     "path_outputs": "outputs"
+ }
config_modification_tutorial.txt ADDED
@@ -0,0 +1,123 @@
+ You can modify your "D:\\ADITYA FILE\\Developer\\MICROSOFT\\Microsoft Visual Studio Code\\Project\\Application Website\\Nyxel\\Flowly AI\\My Project\\AI Image\\config.txt" using the below keys, formats, and examples.
+ Do not modify this file. Modifications in this file will not take effect.
+ This file is a tutorial and example. Please edit "D:\\ADITYA FILE\\Developer\\MICROSOFT\\Microsoft Visual Studio Code\\Project\\Application Website\\Nyxel\\Flowly AI\\My Project\\AI Image\\config.txt" to really change any settings.
+ Remember to split the paths with "\\" rather than "\", and there is no "," before the last "}".
+
+
+ {
+     "path_checkpoints": [
+         "models\\checkpoints"
+     ],
+     "path_loras": [
+         "models\\loras"
+     ],
+     "path_embeddings": "models\\embeddings",
+     "path_vae_approx": "models\\vae_approx",
+     "path_upscale_models": "models\\upscale_models",
+     "path_inpaint": "models\\inpaint",
+     "path_controlnet": "models\\controlnet",
+     "path_clip_vision": "models\\clip_vision",
+     "path_fooocus_expansion": "models\\prompt_expansion\\fooocus_expansion",
+     "path_outputs": "outputs",
+     "default_model": "juggernautXL_v8Rundiffusion.safetensors",
+     "previous_default_models": [
+         "juggernautXL_version8Rundiffusion.safetensors",
+         "juggernautXL_version7Rundiffusion.safetensors",
+         "juggernautXL_v7Rundiffusion.safetensors",
+         "juggernautXL_version6Rundiffusion.safetensors",
+         "juggernautXL_v6Rundiffusion.safetensors"
+     ],
+     "default_refiner": "None",
+     "default_refiner_switch": 0.5,
+     "default_loras_min_weight": -2,
+     "default_loras_max_weight": 2,
+     "default_loras": [
+         [
+             "sd_xl_offset_example-lora_1.0.safetensors",
+             0.1
+         ],
+         [
+             "None",
+             1.0
+         ],
+         [
+             "None",
+             1.0
+         ],
+         [
+             "None",
+             1.0
+         ],
+         [
+             "None",
+             1.0
+         ]
+     ],
+     "default_max_lora_number": 5,
+     "default_cfg_scale": 4.0,
+     "default_sample_sharpness": 2.0,
+     "default_sampler": "dpmpp_2m_sde_gpu",
+     "default_scheduler": "karras",
+     "default_styles": [
+         "Fooocus V2",
+         "Fooocus Enhance",
+         "Fooocus Sharp"
+     ],
+     "default_prompt_negative": "",
+     "default_prompt": "",
+     "default_performance": "Speed",
+     "default_advanced_checkbox": false,
+     "default_max_image_number": 32,
+     "default_output_format": "png",
+     "default_image_number": 2,
+     "checkpoint_downloads": {
+         "juggernautXL_v8Rundiffusion.safetensors": "https://huggingface.co/lllyasviel/fav_models/resolve/main/fav/juggernautXL_v8Rundiffusion.safetensors"
+     },
+     "lora_downloads": {
+         "sd_xl_offset_example-lora_1.0.safetensors": "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_offset_example-lora_1.0.safetensors"
+     },
+     "embeddings_downloads": {},
+     "available_aspect_ratios": [
+         "704*1408",
+         "704*1344",
+         "768*1344",
+         "768*1280",
+         "832*1216",
+         "832*1152",
+         "896*1152",
+         "896*1088",
+         "960*1088",
+         "960*1024",
+         "1024*1024",
+         "1024*960",
+         "1088*960",
+         "1088*896",
+         "1152*896",
+         "1152*832",
+         "1216*832",
+         "1280*768",
+         "1344*768",
+         "1344*704",
+         "1408*704",
+         "1472*704",
+         "1536*640",
+         "1600*640",
+         "1664*576",
+         "1728*576"
+     ],
+     "default_aspect_ratio": "1152*896",
+     "default_inpaint_engine_version": "v2.6",
+     "default_cfg_tsnr": 7.0,
+     "default_overwrite_step": -1,
+     "default_overwrite_switch": -1,
+     "example_inpaint_prompts": [
+         "highly detailed face",
+         "detailed girl face",
+         "detailed man face",
+         "detailed hand",
+         "beautiful eyes"
+     ],
+     "default_save_metadata_to_images": true,
+     "default_metadata_scheme": "fooocus",
+     "metadata_created_by": ""
+ }
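
Since the committed config.txt above contains only path keys while the full key set lives in this tutorial, a partial config.txt appears to be sufficient; an illustrative override (hypothetical values, using keys shown above) might look like:

    {
        "default_image_number": 4,
        "default_cfg_scale": 7.0
    }

leaving every other setting at its default.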
css/style.css ADDED
@@ -0,0 +1,220 @@
+ /* based on https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/v1.6.0/style.css */
+
+ #context-menu{
+     z-index:9999;
+     position:absolute;
+     display:block;
+     padding:0px 0;
+     border:2px solid #a55000;
+     border-radius:8px;
+     box-shadow:1px 1px 2px #CE6400;
+     width: 200px;
+ }
+
+ .context-menu-items{
+     list-style: none;
+     margin: 0;
+     padding: 0;
+ }
+
+ .context-menu-items a{
+     display:block;
+     padding:5px;
+     cursor:pointer;
+ }
+
+ .context-menu-items a:hover{
+     background: #a55000;
+ }
+
+ .canvas-tooltip-info {
+     position: absolute;
+     top: 28px;
+     left: 2px;
+     cursor: help;
+     background-color: rgba(0, 0, 0, 0.3);
+     width: 20px;
+     height: 20px;
+     border-radius: 50%;
+     display: flex;
+     align-items: center;
+     justify-content: center;
+     flex-direction: column;
+
+     z-index: 100;
+ }
+
+ .canvas-tooltip-info::after {
+     content: '';
+     display: block;
+     width: 2px;
+     height: 7px;
+     background-color: white;
+     margin-top: 2px;
+ }
+
+ .canvas-tooltip-info::before {
+     content: '';
+     display: block;
+     width: 2px;
+     height: 2px;
+     background-color: white;
+ }
+
+ .canvas-tooltip-content {
+     display: none;
+     background-color: #f9f9f9;
+     color: #333;
+     border: 1px solid #ddd;
+     padding: 15px;
+     position: absolute;
+     top: 40px;
+     left: 10px;
+     width: 250px;
+     font-size: 16px;
+     opacity: 0;
+     border-radius: 8px;
+     box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+
+     z-index: 100;
+ }
+
+ .canvas-tooltip:hover .canvas-tooltip-content {
+     display: block;
+     animation: fadeIn 0.5s;
+     opacity: 1;
+ }
+
+ @keyframes fadeIn {
+     from {opacity: 0;}
+     to {opacity: 1;}
+ }
+
+ .styler {
+     overflow:inherit !important;
+ }
+
+ .gradio-container{
+     overflow: visible;
+ }
+
+ /* fullpage image viewer */
+
+ #lightboxModal{
+     display: none;
+     position: fixed;
+     z-index: 1001;
+     left: 0;
+     top: 0;
+     width: 100%;
+     height: 100%;
+     overflow: auto;
+     background-color: rgba(20, 20, 20, 0.95);
+     user-select: none;
+     -webkit-user-select: none;
+     flex-direction: column;
+ }
+
+ .modalControls {
+     display: flex;
+     position: absolute;
+     right: 0px;
+     left: 0px;
+     gap: 1em;
+     padding: 1em;
+     background-color:rgba(0,0,0,0);
+     z-index: 1;
+     transition: 0.2s ease background-color;
+ }
+ .modalControls:hover {
+     background-color:rgba(0,0,0,0.9);
+ }
+ .modalClose {
+     margin-left: auto;
+ }
+ .modalControls span{
+     color: white;
+     text-shadow: 0px 0px 0.25em black;
+     font-size: 35px;
+     font-weight: bold;
+     cursor: pointer;
+     width: 1em;
+ }
+
+ .modalControls span:hover, .modalControls span:focus{
+     color: #999;
+     text-decoration: none;
+ }
+
+ #lightboxModal > img {
+     display: block;
+     margin: auto;
+     width: auto;
+ }
+
+ #lightboxModal > img.modalImageFullscreen{
+     object-fit: contain;
+     height: 100%;
+     width: 100%;
+     min-height: 0;
+ }
+
+ .modalPrev,
+ .modalNext {
+     cursor: pointer;
+     position: absolute;
+     top: 50%;
+     width: auto;
+     padding: 16px;
+     margin-top: -50px;
+     color: white;
+     font-weight: bold;
+     font-size: 20px;
+     transition: 0.6s ease;
+     border-radius: 0 3px 3px 0;
+     user-select: none;
+     -webkit-user-select: none;
178
+
179
+ .modalNext {
180
+ right: 0;
181
+ border-radius: 3px 0 0 3px;
182
+ }
183
+
184
+ .modalPrev:hover,
185
+ .modalNext:hover {
186
+ background-color: rgba(0, 0, 0, 0.8);
187
+ }
188
+
189
+ #imageARPreview {
190
+ position: absolute;
191
+ top: 0px;
192
+ left: 0px;
193
+ border: 2px solid red;
194
+ background: rgba(255, 0, 0, 0.3);
195
+ z-index: 900;
196
+ pointer-events: none;
197
+ display: none;
198
+ }
199
+
200
+ #stylePreviewOverlay {
201
+ opacity: 0;
202
+ pointer-events: none;
203
+ width: 128px;
204
+ height: 128px;
205
+ position: fixed;
206
+ top: 0px;
207
+ left: 0px;
208
+ border: solid 1px lightgrey;
209
+ transform: translate(-140px, 20px);
210
+ background-size: cover;
211
+ background-position: center;
212
+ background-color: rgba(0, 0, 0, 0.3);
213
+ border-radius: 5px;
214
+ z-index: 100;
215
+ transition: transform 0.1s ease, opacity 0.3s ease;
216
+ }
217
+
218
+ #stylePreviewOverlay.lower-half {
219
+ transform: translate(-140px, -140px);
220
+ }
entry_with_update.py ADDED
@@ -0,0 +1,46 @@
1
+ import os
2
+ import sys
3
+
4
+
5
+ root = os.path.dirname(os.path.abspath(__file__))
6
+ sys.path.append(root)
7
+ os.chdir(root)
8
+
9
+
10
+ try:
11
+ import pygit2
12
+ pygit2.option(pygit2.GIT_OPT_SET_OWNER_VALIDATION, 0)
13
+
14
+ repo = pygit2.Repository(os.path.abspath(os.path.dirname(__file__)))
15
+
16
+ branch_name = repo.head.shorthand
17
+
18
+ remote_name = 'origin'
19
+ remote = repo.remotes[remote_name]
20
+
21
+ remote.fetch()
22
+
23
+ local_branch_ref = f'refs/heads/{branch_name}'
24
+ local_branch = repo.lookup_reference(local_branch_ref)
25
+
26
+ remote_reference = f'refs/remotes/{remote_name}/{branch_name}'
27
+ remote_commit = repo.revparse_single(remote_reference)
28
+
29
+ merge_result, _ = repo.merge_analysis(remote_commit.id)
30
+
31
+ if merge_result & pygit2.GIT_MERGE_ANALYSIS_UP_TO_DATE:
32
+ print("Already up-to-date")
33
+ elif merge_result & pygit2.GIT_MERGE_ANALYSIS_FASTFORWARD:
34
+ local_branch.set_target(remote_commit.id)
35
+ repo.head.set_target(remote_commit.id)
36
+ repo.checkout_tree(repo.get(remote_commit.id))
37
+ repo.reset(local_branch.target, pygit2.GIT_RESET_HARD)
38
+ print("Fast-forward merge")
39
+ elif merge_result & pygit2.GIT_MERGE_ANALYSIS_NORMAL:
40
+ print("Update failed - Did you modify any file?")
41
+ except Exception as e:
42
+ print('Update failed.')
43
+ print(str(e))
44
+
45
+ print('Update succeeded.')
46
+ from launch import *
entrypoint.sh ADDED
@@ -0,0 +1,33 @@
1
+ #!/bin/bash
2
+
3
+ ORIGINALDIR=/content/app
4
+ # Use predefined DATADIR if it is defined
5
+ [[ x"${DATADIR}" == "x" ]] && DATADIR=/content/data
6
+
7
+ # Make persistent dir from original dir
8
+ function mklink () {
9
+ mkdir -p $DATADIR/$1
10
+ ln -s $DATADIR/$1 $ORIGINALDIR
11
+ }
12
+
13
+ # Copy old files from import dir
14
+ function import () {
15
+ (test -d /import/$1 && cd /import/$1 && cp -Rpn . $DATADIR/$1/)
16
+ }
17
+
18
+ cd $ORIGINALDIR
19
+
20
+ # models
21
+ mklink models
22
+ # Copy original files
23
+ (cd $ORIGINALDIR/models.org && cp -Rpn . $ORIGINALDIR/models/)
24
+ # Import old files
25
+ import models
26
+
27
+ # outputs
28
+ mklink outputs
29
+ # Import old files
30
+ import outputs
31
+
32
+ # Start application
33
+ python launch.py $*
environment.yaml ADDED
@@ -0,0 +1,7 @@
1
+ name: fooocus
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - python=3.10
6
+ - pip=23.0
7
+ - packaging
experiments_expansion.py ADDED
@@ -0,0 +1,8 @@
1
+ from modules.expansion import FooocusExpansion
2
+
3
+ expansion = FooocusExpansion()
4
+
5
+ text = 'a handsome man'
6
+
7
+ for i in range(64):
8
+ print(expansion(text, seed=i))
experiments_face.py ADDED
@@ -0,0 +1,7 @@
1
+ import cv2
2
+ import extras.face_crop as cropper
3
+
4
+
5
+ img = cv2.imread('lena.png')
6
+ result = cropper.crop_image(img)
7
+ cv2.imwrite('lena_result.png', result)
experiments_interrogate.py ADDED
@@ -0,0 +1,205 @@
1
+ import os
2
+ import sys
3
+
4
+
5
+ root = os.path.dirname(os.path.abspath(__file__))
6
+ sys.path.append(root)
7
+ os.chdir(root)
8
+
9
+
10
+ try:
11
+ import pygit2
12
+ pygit2.option(pygit2.GIT_OPT_SET_OWNER_VALIDATION, 0)
13
+
14
+ repo = pygit2.Repository(os.path.abspath(os.path.dirname(__file__)))
15
+
16
+ branch_name = repo.head.shorthand
17
+
18
+ remote_name = 'origin'
19
+ remote = repo.remotes[remote_name]
20
+
21
+ remote.fetch()
22
+
23
+ local_branch_ref = f'refs/heads/{branch_name}'
24
+ local_branch = repo.lookup_reference(local_branch_ref)
25
+
26
+ remote_reference = f'refs/remotes/{remote_name}/{branch_name}'
27
+ remote_commit = repo.revparse_single(remote_reference)
28
+
29
+ merge_result, _ = repo.merge_analysis(remote_commit.id)
30
+
31
+ if merge_result & pygit2.GIT_MERGE_ANALYSIS_UP_TO_DATE:
32
+ print("Already up-to-date")
33
+ elif merge_result & pygit2.GIT_MERGE_ANALYSIS_FASTFORWARD:
34
+ local_branch.set_target(remote_commit.id)
35
+ repo.head.set_target(remote_commit.id)
36
+ repo.checkout_tree(repo.get(remote_commit.id))
37
+ repo.reset(local_branch.target, pygit2.GIT_RESET_HARD)
38
+ print("Fast-forward merge")
39
+ elif merge_result & pygit2.GIT_MERGE_ANALYSIS_NORMAL:
40
+ print("Update failed - Did you modify any file?")
41
+ except Exception as e:
42
+ print('Update failed.')
43
+ print(str(e))
44
+
45
+ import os
46
+ import sys
47
+ import ssl
48
+
49
+ print('[System ARGV] ' + str(sys.argv))
50
+
51
+ root = os.path.dirname(os.path.abspath(__file__))
52
+ sys.path.append(root)
53
+ os.chdir(root)
54
+
55
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
56
+ os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
57
+ if "GRADIO_SERVER_PORT" not in os.environ:
58
+ os.environ["GRADIO_SERVER_PORT"] = "7865"
59
+
60
+ ssl._create_default_https_context = ssl._create_unverified_context
61
+
62
+
63
+ import platform
64
+ import fooocus_version
65
+
66
+ from build_launcher import build_launcher
67
+ from modules.launch_util import is_installed, run, python, run_pip, requirements_met
68
+ from modules.model_loader import load_file_from_url
69
+
70
+
71
+ REINSTALL_ALL = False
72
+ TRY_INSTALL_XFORMERS = False
73
+
74
+
75
+ def prepare_environment():
76
+ torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu121")
77
+ torch_command = os.environ.get('TORCH_COMMAND',
78
+ f"pip install torch==2.1.0 torchvision==0.16.0 --extra-index-url {torch_index_url}")
79
+ requirements_file = os.environ.get('REQS_FILE', "requirements_versions.txt")
80
+
81
+ print(f"Python {sys.version}")
82
+ print(f"Fooocus version: {fooocus_version.version}")
83
+
84
+ if REINSTALL_ALL or not is_installed("torch") or not is_installed("torchvision"):
85
+ run(f'"{python}" -m {torch_command}', "Installing torch and torchvision", "Couldn't install torch", live=True)
86
+
87
+ if TRY_INSTALL_XFORMERS:
88
+ if REINSTALL_ALL or not is_installed("xformers"):
89
+ xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.23')
90
+ if platform.system() == "Windows":
91
+ if platform.python_version().startswith("3.10"):
92
+ run_pip(f"install -U -I --no-deps {xformers_package}", "xformers", live=True)
93
+ else:
94
+ print("Installation of xformers is not supported in this version of Python.")
95
+ print(
96
+ "You can also check this and build manually: https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Xformers#building-xformers-on-windows-by-duckness")
97
+ if not is_installed("xformers"):
98
+ exit(0)
99
+ elif platform.system() == "Linux":
100
+ run_pip(f"install -U -I --no-deps {xformers_package}", "xformers")
101
+
102
+ if REINSTALL_ALL or not requirements_met(requirements_file):
103
+ run_pip(f"install -r \"{requirements_file}\"", "requirements")
104
+
105
+ return
106
+
107
+
108
+ vae_approx_filenames = [
109
+ ('xlvaeapp.pth', 'https://huggingface.co/lllyasviel/misc/resolve/main/xlvaeapp.pth'),
110
+ ('vaeapp_sd15.pth', 'https://huggingface.co/lllyasviel/misc/resolve/main/vaeapp_sd15.pt'),
111
+ ('xl-to-v1_interposer-v3.1.safetensors',
112
+ 'https://huggingface.co/lllyasviel/misc/resolve/main/xl-to-v1_interposer-v3.1.safetensors')
113
+ ]
114
+
115
+ def ini_args():
116
+ from args_manager import args
117
+ return args
118
+
119
+
120
+ prepare_environment()
121
+ build_launcher()
122
+ args = ini_args()
123
+
124
+
125
+ if args.gpu_device_id is not None:
126
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_device_id)
127
+ print("Set device to:", args.gpu_device_id)
128
+
129
+
130
+ from modules import config
131
+
132
+ def download_models():
133
+ for file_name, url in vae_approx_filenames:
134
+ load_file_from_url(url=url, model_dir=config.path_vae_approx, file_name=file_name)
135
+
136
+ load_file_from_url(
137
+ url='https://huggingface.co/lllyasviel/misc/resolve/main/fooocus_expansion.bin',
138
+ model_dir=config.path_fooocus_expansion,
139
+ file_name='pytorch_model.bin'
140
+ )
141
+
142
+ if args.disable_preset_download:
143
+ print('Skipped model download.')
144
+ return
145
+
146
+ if not args.always_download_new_model:
147
+ if not os.path.exists(os.path.join(config.paths_checkpoints[0], config.default_base_model_name)):
148
+ for alternative_model_name in config.previous_default_models:
149
+ if os.path.exists(os.path.join(config.paths_checkpoints[0], alternative_model_name)):
150
+ print(f'You do not have [{config.default_base_model_name}] but you have [{alternative_model_name}].')
151
+ print(f'Fooocus will use [{alternative_model_name}] to avoid downloading new models, '
152
+ f'but you are not using the latest models.')
153
+ print('Use --always-download-new-model to avoid fallback and always get new models.')
154
+ config.checkpoint_downloads = {}
155
+ config.default_base_model_name = alternative_model_name
156
+ break
157
+
158
+ for file_name, url in config.checkpoint_downloads.items():
159
+ load_file_from_url(url=url, model_dir=config.paths_checkpoints[0], file_name=file_name)
160
+ for file_name, url in config.embeddings_downloads.items():
161
+ load_file_from_url(url=url, model_dir=config.path_embeddings, file_name=file_name)
162
+ for file_name, url in config.lora_downloads.items():
163
+ load_file_from_url(url=url, model_dir=config.paths_loras[0], file_name=file_name)
164
+
165
+ return
166
+
167
+
168
+ download_models()
169
+
170
+ import gradio as gr
171
+ import modules.gradio_hijack as grh
172
+ from extras.interrogate import default_interrogator as default_interrogator_photo
173
+ from extras.wd14tagger import default_interrogator as default_interrogator_anime
174
+ import modules.flags as flags
175
+
176
+ def interrogatorFunction(img, value):
177
+ if value == flags.desc_type_photo:  # use the '==' comparison operator to check for equality
178
+ output = default_interrogator_photo(img)
179
+ print(output)
180
+ else:
181
+ output = default_interrogator_anime(img)
182
+ print(output)
183
+ return output
184
+
185
+ describe = gr.Blocks(title="AI Describe Image", css="#component-3, #component-5 {display: grid; align-content: center;}")
186
+
187
+ with describe:
188
+ describe_tab = gr.TabItem(label='Describe')
189
+ with describe_tab:
190
+ input_column = gr.Row()
191
+ with input_column:
192
+ with gr.Column():
193
+ input_image = grh.Image(label='Input', source='upload', type='numpy')
194
+ with gr.Column():
195
+ content_type = gr.Radio(
196
+ label='Content Type',
197
+ choices=[flags.desc_type_photo, flags.desc_type_anime],
198
+ value=flags.desc_type_photo
199
+ )
200
+ desc_btn = gr.Button(value='Describe this Image into Prompt')
201
+ outputs=gr.Textbox(type="text", label="Output", show_copy_button=True)
202
+
203
+ desc_btn.click(interrogatorFunction, inputs=[input_image, content_type], outputs=[outputs])
204
+
205
+ describe.launch()
extras/BLIP/configs/bert_config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 30522,
19
+ "encoder_width": 768,
20
+ "add_cross_attention": true
21
+ }
extras/BLIP/configs/caption_coco.yaml ADDED
@@ -0,0 +1,33 @@
1
+ image_root: '/export/share/datasets/vision/coco/images/'
2
+ ann_root: 'annotation'
3
+ coco_gt_root: 'annotation/coco_gt'
4
+
5
+ # set pretrained as a file path or an url
6
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
7
+
8
+ # size of vit model; base or large
9
+ vit: 'base'
10
+ vit_grad_ckpt: False
11
+ vit_ckpt_layer: 0
12
+ batch_size: 32
13
+ init_lr: 1e-5
14
+
15
+ # vit: 'large'
16
+ # vit_grad_ckpt: True
17
+ # vit_ckpt_layer: 5
18
+ # batch_size: 16
19
+ # init_lr: 2e-6
20
+
21
+ image_size: 384
22
+
23
+ # generation configs
24
+ max_length: 20
25
+ min_length: 5
26
+ num_beams: 3
27
+ prompt: 'a picture of '
28
+
29
+ # optimizer
30
+ weight_decay: 0.05
31
+ min_lr: 0
32
+ max_epoch: 5
33
+
extras/BLIP/configs/med_config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 30524,
19
+ "encoder_width": 768,
20
+ "add_cross_attention": true
21
+ }
extras/BLIP/configs/nlvr.yaml ADDED
@@ -0,0 +1,21 @@
1
+ image_root: '/export/share/datasets/vision/NLVR2/'
2
+ ann_root: 'annotation'
3
+
4
+ # set pretrained as a file path or an url
5
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth'
6
+
7
+ #size of vit model; base or large
8
+ vit: 'base'
9
+ batch_size_train: 16
10
+ batch_size_test: 64
11
+ vit_grad_ckpt: False
12
+ vit_ckpt_layer: 0
13
+ max_epoch: 15
14
+
15
+ image_size: 384
16
+
17
+ # optimizer
18
+ weight_decay: 0.05
19
+ init_lr: 3e-5
20
+ min_lr: 0
21
+
extras/BLIP/configs/nocaps.yaml ADDED
@@ -0,0 +1,15 @@
1
+ image_root: '/export/share/datasets/vision/nocaps/'
2
+ ann_root: 'annotation'
3
+
4
+ # set pretrained as a file path or an url
5
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
6
+
7
+ vit: 'base'
8
+ batch_size: 32
9
+
10
+ image_size: 384
11
+
12
+ max_length: 20
13
+ min_length: 5
14
+ num_beams: 3
15
+ prompt: 'a picture of '
extras/BLIP/configs/pretrain.yaml ADDED
@@ -0,0 +1,27 @@
1
+ train_file: ['/export/share/junnan-li/VL_pretrain/annotation/coco_karpathy_train.json',
2
+ '/export/share/junnan-li/VL_pretrain/annotation/vg_caption.json',
3
+ ]
4
+ laion_path: ''
5
+
6
+ # size of vit model; base or large
7
+ vit: 'base'
8
+ vit_grad_ckpt: False
9
+ vit_ckpt_layer: 0
10
+
11
+ image_size: 224
12
+ batch_size: 75
13
+
14
+ queue_size: 57600
15
+ alpha: 0.4
16
+
17
+ # optimizer
18
+ weight_decay: 0.05
19
+ init_lr: 3e-4
20
+ min_lr: 1e-6
21
+ warmup_lr: 1e-6
22
+ lr_decay_rate: 0.9
23
+ max_epoch: 20
24
+ warmup_steps: 3000
25
+
26
+
27
+
extras/BLIP/configs/retrieval_coco.yaml ADDED
@@ -0,0 +1,34 @@
1
+ image_root: '/export/share/datasets/vision/coco/images/'
2
+ ann_root: 'annotation'
3
+ dataset: 'coco'
4
+
5
+ # set pretrained as a file path or an url
6
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'
7
+
8
+ # size of vit model; base or large
9
+
10
+ vit: 'base'
11
+ batch_size_train: 32
12
+ batch_size_test: 64
13
+ vit_grad_ckpt: True
14
+ vit_ckpt_layer: 4
15
+ init_lr: 1e-5
16
+
17
+ # vit: 'large'
18
+ # batch_size_train: 16
19
+ # batch_size_test: 32
20
+ # vit_grad_ckpt: True
21
+ # vit_ckpt_layer: 12
22
+ # init_lr: 5e-6
23
+
24
+ image_size: 384
25
+ queue_size: 57600
26
+ alpha: 0.4
27
+ k_test: 256
28
+ negative_all_rank: True
29
+
30
+ # optimizer
31
+ weight_decay: 0.05
32
+ min_lr: 0
33
+ max_epoch: 6
34
+
extras/BLIP/configs/retrieval_flickr.yaml ADDED
@@ -0,0 +1,34 @@
1
+ image_root: '/export/share/datasets/vision/flickr30k/'
2
+ ann_root: 'annotation'
3
+ dataset: 'flickr'
4
+
5
+ # set pretrained as a file path or an url
6
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_flickr.pth'
7
+
8
+ # size of vit model; base or large
9
+
10
+ vit: 'base'
11
+ batch_size_train: 32
12
+ batch_size_test: 64
13
+ vit_grad_ckpt: True
14
+ vit_ckpt_layer: 4
15
+ init_lr: 1e-5
16
+
17
+ # vit: 'large'
18
+ # batch_size_train: 16
19
+ # batch_size_test: 32
20
+ # vit_grad_ckpt: True
21
+ # vit_ckpt_layer: 10
22
+ # init_lr: 5e-6
23
+
24
+ image_size: 384
25
+ queue_size: 57600
26
+ alpha: 0.4
27
+ k_test: 128
28
+ negative_all_rank: False
29
+
30
+ # optimizer
31
+ weight_decay: 0.05
32
+ min_lr: 0
33
+ max_epoch: 6
34
+
extras/BLIP/configs/retrieval_msrvtt.yaml ADDED
@@ -0,0 +1,12 @@
1
+ video_root: '/export/share/dongxuli/data/msrvtt_retrieval/videos'
2
+ ann_root: 'annotation'
3
+
4
+ # set pretrained as a file path or an url
5
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth'
6
+
7
+ # size of vit model; base or large
8
+ vit: 'base'
9
+ batch_size: 64
10
+ k_test: 128
11
+ image_size: 384
12
+ num_frm_test: 8
extras/BLIP/configs/vqa.yaml ADDED
@@ -0,0 +1,25 @@
1
+ vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #followed by train2014/
2
+ vg_root: '/export/share/datasets/vision/visual-genome/' #followed by image/
3
+ train_files: ['vqa_train','vqa_val','vg_qa']
4
+ ann_root: 'annotation'
5
+
6
+ # set pretrained as a file path or an url
7
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
8
+
9
+ # size of vit model; base or large
10
+ vit: 'base'
11
+ batch_size_train: 16
12
+ batch_size_test: 32
13
+ vit_grad_ckpt: False
14
+ vit_ckpt_layer: 0
15
+ init_lr: 2e-5
16
+
17
+ image_size: 480
18
+
19
+ k_test: 128
20
+ inference: 'rank'
21
+
22
+ # optimizer
23
+ weight_decay: 0.05
24
+ min_lr: 0
25
+ max_epoch: 10
extras/BLIP/models/__pycache__/blip.cpython-310.pyc ADDED
Binary file (7.1 kB). View file
 
extras/BLIP/models/__pycache__/med.cpython-310.pyc ADDED
Binary file (28 kB). View file
 
extras/BLIP/models/__pycache__/vit.cpython-310.pyc ADDED
Binary file (12.5 kB). View file
 
extras/BLIP/models/bert_tokenizer/config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "gradient_checkpointing": false,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 0,
18
+ "position_embedding_type": "absolute",
19
+ "transformers_version": "4.6.0.dev0",
20
+ "type_vocab_size": 2,
21
+ "use_cache": true,
22
+ "vocab_size": 30522
23
+ }
extras/BLIP/models/bert_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
extras/BLIP/models/bert_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "do_lower_case": true
3
+ }
extras/BLIP/models/bert_tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
extras/BLIP/models/blip.py ADDED
@@ -0,0 +1,239 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ '''
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+
11
+ from extras.BLIP.models.vit import VisionTransformer, interpolate_pos_embed
12
+ from extras.BLIP.models.med import BertConfig, BertModel, BertLMHeadModel
13
+ from transformers import BertTokenizer
14
+
15
+ import torch
16
+ from torch import nn
17
+ import torch.nn.functional as F
18
+
19
+ import os
20
+ from urllib.parse import urlparse
21
+ from timm.models.hub import download_cached_file
22
+
23
+ class BLIP_Base(nn.Module):
24
+ def __init__(self,
25
+ med_config = 'configs/med_config.json',
26
+ image_size = 224,
27
+ vit = 'base',
28
+ vit_grad_ckpt = False,
29
+ vit_ckpt_layer = 0,
30
+ ):
31
+ """
32
+ Args:
33
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
34
+ image_size (int): input image size
35
+ vit (str): model size of vision transformer
36
+ """
37
+ super().__init__()
38
+
39
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
40
+ self.tokenizer = init_tokenizer()
41
+ med_config = BertConfig.from_json_file(med_config)
42
+ med_config.encoder_width = vision_width
43
+ self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
44
+
45
+
46
+ def forward(self, image, caption, mode):
47
+
48
+ assert mode in ['image', 'text', 'multimodal'], "mode parameter must be image, text, or multimodal"
49
+ text = self.tokenizer(caption, return_tensors="pt").to(image.device)
50
+
51
+ if mode=='image':
52
+ # return image features
53
+ image_embeds = self.visual_encoder(image)
54
+ return image_embeds
55
+
56
+ elif mode=='text':
57
+ # return text features
58
+ text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
59
+ return_dict = True, mode = 'text')
60
+ return text_output.last_hidden_state
61
+
62
+ elif mode=='multimodal':
63
+ # return multimodal features
64
+ image_embeds = self.visual_encoder(image)
65
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
66
+
67
+ text.input_ids[:,0] = self.tokenizer.enc_token_id
68
+ output = self.text_encoder(text.input_ids,
69
+ attention_mask = text.attention_mask,
70
+ encoder_hidden_states = image_embeds,
71
+ encoder_attention_mask = image_atts,
72
+ return_dict = True,
73
+ )
74
+ return output.last_hidden_state
75
+
76
+
77
+
78
+ class BLIP_Decoder(nn.Module):
79
+ def __init__(self,
80
+ med_config = 'configs/med_config.json',
81
+ image_size = 384,
82
+ vit = 'base',
83
+ vit_grad_ckpt = False,
84
+ vit_ckpt_layer = 0,
85
+ prompt = 'a picture of ',
86
+ ):
87
+ """
88
+ Args:
89
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
90
+ image_size (int): input image size
91
+ vit (str): model size of vision transformer
92
+ """
93
+ super().__init__()
94
+
95
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
96
+ self.tokenizer = init_tokenizer()
97
+ med_config = BertConfig.from_json_file(med_config)
98
+ med_config.encoder_width = vision_width
99
+ self.text_decoder = BertLMHeadModel(config=med_config)
100
+
101
+ self.prompt = prompt
102
+ self.prompt_length = len(self.tokenizer(self.prompt).input_ids)-1
103
+
104
+
105
+ def forward(self, image, caption):
106
+
107
+ image_embeds = self.visual_encoder(image)
108
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
109
+
110
+ text = self.tokenizer(caption, padding='longest', truncation=True, max_length=40, return_tensors="pt").to(image.device)
111
+
112
+ text.input_ids[:,0] = self.tokenizer.bos_token_id
113
+
114
+ decoder_targets = text.input_ids.masked_fill(text.input_ids == self.tokenizer.pad_token_id, -100)
115
+ decoder_targets[:,:self.prompt_length] = -100
116
+
117
+ decoder_output = self.text_decoder(text.input_ids,
118
+ attention_mask = text.attention_mask,
119
+ encoder_hidden_states = image_embeds,
120
+ encoder_attention_mask = image_atts,
121
+ labels = decoder_targets,
122
+ return_dict = True,
123
+ )
124
+ loss_lm = decoder_output.loss
125
+
126
+ return loss_lm
127
+
128
+ def generate(self, image, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0):
129
+ image_embeds = self.visual_encoder(image)
130
+
131
+ if not sample:
132
+ image_embeds = image_embeds.repeat_interleave(num_beams,dim=0)
133
+
134
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
135
+ model_kwargs = {"encoder_hidden_states": image_embeds, "encoder_attention_mask":image_atts}
136
+
137
+ prompt = [self.prompt] * image.size(0)
138
+ input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(image.device)
139
+ input_ids[:,0] = self.tokenizer.bos_token_id
140
+ input_ids = input_ids[:, :-1]
141
+
142
+ if sample:
143
+ #nucleus sampling
144
+ outputs = self.text_decoder.generate(input_ids=input_ids,
145
+ max_length=max_length,
146
+ min_length=min_length,
147
+ do_sample=True,
148
+ top_p=top_p,
149
+ num_return_sequences=1,
150
+ eos_token_id=self.tokenizer.sep_token_id,
151
+ pad_token_id=self.tokenizer.pad_token_id,
152
+ repetition_penalty=1.1,
153
+ **model_kwargs)
154
+ else:
155
+ #beam search
156
+ outputs = self.text_decoder.generate(input_ids=input_ids,
157
+ max_length=max_length,
158
+ min_length=min_length,
159
+ num_beams=num_beams,
160
+ eos_token_id=self.tokenizer.sep_token_id,
161
+ pad_token_id=self.tokenizer.pad_token_id,
162
+ repetition_penalty=repetition_penalty,
163
+ **model_kwargs)
164
+
165
+ captions = []
166
+ for output in outputs:
167
+ caption = self.tokenizer.decode(output, skip_special_tokens=True)
168
+ captions.append(caption[len(self.prompt):])
169
+ return captions
170
+
171
+
172
+ def blip_decoder(pretrained='',**kwargs):
173
+ model = BLIP_Decoder(**kwargs)
174
+ if pretrained:
175
+ model,msg = load_checkpoint(model,pretrained)
176
+ assert(len(msg.missing_keys)==0)
177
+ return model
178
+
179
+ def blip_feature_extractor(pretrained='',**kwargs):
180
+ model = BLIP_Base(**kwargs)
181
+ if pretrained:
182
+ model,msg = load_checkpoint(model,pretrained)
183
+ assert(len(msg.missing_keys)==0)
184
+ return model
185
+
186
+ def init_tokenizer():
187
+ tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "bert_tokenizer")
188
+ tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
189
+ tokenizer.add_special_tokens({'bos_token':'[DEC]'})
190
+ tokenizer.add_special_tokens({'additional_special_tokens':['[ENC]']})
191
+ tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
192
+ return tokenizer
193
+
194
+
195
+ def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
196
+
197
+ assert vit in ['base', 'large'], "vit parameter must be base or large"
198
+ if vit=='base':
199
+ vision_width = 768
200
+ visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=12,
201
+ num_heads=12, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
202
+ drop_path_rate=0 or drop_path_rate
203
+ )
204
+ elif vit=='large':
205
+ vision_width = 1024
206
+ visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=24,
207
+ num_heads=16, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
208
+ drop_path_rate=0.1 or drop_path_rate
209
+ )
210
+ return visual_encoder, vision_width
211
+
212
+ def is_url(url_or_filename):
213
+ parsed = urlparse(url_or_filename)
214
+ return parsed.scheme in ("http", "https")
215
+
216
+ def load_checkpoint(model,url_or_filename):
217
+ if is_url(url_or_filename):
218
+ cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
219
+ checkpoint = torch.load(cached_file, map_location='cpu')
220
+ elif os.path.isfile(url_or_filename):
221
+ checkpoint = torch.load(url_or_filename, map_location='cpu')
222
+ else:
223
+ raise RuntimeError('checkpoint url or path is invalid')
224
+
225
+ state_dict = checkpoint['model']
226
+
227
+ state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
228
+ if 'visual_encoder_m.pos_embed' in model.state_dict().keys():
229
+ state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],
230
+ model.visual_encoder_m)
231
+ for key in model.state_dict().keys():
232
+ if key in state_dict.keys():
233
+ if state_dict[key].shape!=model.state_dict()[key].shape:
234
+ del state_dict[key]
235
+
236
+ msg = model.load_state_dict(state_dict,strict=False)
237
+ print('load checkpoint from %s'%url_or_filename)
238
+ return model,msg
239
+
extras/BLIP/models/blip_itm.py ADDED
@@ -0,0 +1,76 @@
1
+ from extras.BLIP.models.med import BertConfig, BertModel
2
+ from transformers import BertTokenizer
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+
8
+ from extras.BLIP.models.blip import create_vit, init_tokenizer, load_checkpoint
9
+
10
+ class BLIP_ITM(nn.Module):
11
+ def __init__(self,
12
+ med_config = 'configs/med_config.json',
13
+ image_size = 384,
14
+ vit = 'base',
15
+ vit_grad_ckpt = False,
16
+ vit_ckpt_layer = 0,
17
+ embed_dim = 256,
18
+ ):
19
+ """
20
+ Args:
21
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
22
+ image_size (int): input image size
23
+ vit (str): model size of vision transformer
24
+ """
25
+ super().__init__()
26
+
27
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
28
+ self.tokenizer = init_tokenizer()
29
+ med_config = BertConfig.from_json_file(med_config)
30
+ med_config.encoder_width = vision_width
31
+ self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
32
+
33
+ text_width = self.text_encoder.config.hidden_size
34
+
35
+ self.vision_proj = nn.Linear(vision_width, embed_dim)
36
+ self.text_proj = nn.Linear(text_width, embed_dim)
37
+
38
+ self.itm_head = nn.Linear(text_width, 2)
39
+
40
+
41
+ def forward(self, image, caption, match_head='itm'):
42
+
43
+ image_embeds = self.visual_encoder(image)
44
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
45
+
46
+ text = self.tokenizer(caption, padding='max_length', truncation=True, max_length=35,
47
+ return_tensors="pt").to(image.device)
48
+
49
+
50
+ if match_head=='itm':
51
+ output = self.text_encoder(text.input_ids,
52
+ attention_mask = text.attention_mask,
53
+ encoder_hidden_states = image_embeds,
54
+ encoder_attention_mask = image_atts,
55
+ return_dict = True,
56
+ )
57
+ itm_output = self.itm_head(output.last_hidden_state[:,0,:])
58
+ return itm_output
59
+
60
+ elif match_head=='itc':
61
+ text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
62
+ return_dict = True, mode = 'text')
63
+ image_feat = F.normalize(self.vision_proj(image_embeds[:,0,:]),dim=-1)
64
+ text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:,0,:]),dim=-1)
65
+
66
+ sim = image_feat @ text_feat.t()
67
+ return sim
68
+
69
+
70
+ def blip_itm(pretrained='',**kwargs):
71
+ model = BLIP_ITM(**kwargs)
72
+ if pretrained:
73
+ model,msg = load_checkpoint(model,pretrained)
74
+ assert(len(msg.missing_keys)==0)
75
+ return model
76
+
extras/BLIP/models/blip_nlvr.py ADDED
@@ -0,0 +1,105 @@
1
+ from extras.BLIP.models.med import BertConfig
2
+ from extras.BLIP.models.nlvr_encoder import BertModel
3
+ from extras.BLIP.models.vit import interpolate_pos_embed
4
+ from extras.BLIP.models.blip import create_vit, init_tokenizer, is_url
5
+
6
+ from timm.models.hub import download_cached_file
7
+
8
+ import torch
9
+ from torch import nn
10
+ import torch.nn.functional as F
11
+ from transformers import BertTokenizer
12
+ import numpy as np
13
+ import os
14
+
15
+
16
+ class BLIP_NLVR(nn.Module):
17
+ def __init__(self,
18
+ med_config = 'configs/med_config.json',
19
+ image_size = 480,
20
+ vit = 'base',
21
+ vit_grad_ckpt = False,
22
+ vit_ckpt_layer = 0,
23
+ ):
24
+ """
25
+ Args:
26
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
27
+ image_size (int): input image size
28
+ vit (str): model size of vision transformer
29
+ """
30
+ super().__init__()
31
+
32
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer, drop_path_rate=0.1)
33
+ self.tokenizer = init_tokenizer()
34
+ med_config = BertConfig.from_json_file(med_config)
35
+ med_config.encoder_width = vision_width
36
+ self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
37
+
38
+ self.cls_head = nn.Sequential(
39
+ nn.Linear(self.text_encoder.config.hidden_size, self.text_encoder.config.hidden_size),
40
+ nn.ReLU(),
41
+ nn.Linear(self.text_encoder.config.hidden_size, 2)
42
+ )
43
+
44
+ def forward(self, image, text, targets, train=True):
45
+
46
+ image_embeds = self.visual_encoder(image)
47
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
48
+ image0_embeds, image1_embeds = torch.split(image_embeds,targets.size(0))
49
+
50
+ text = self.tokenizer(text, padding='longest', return_tensors="pt").to(image.device)
51
+ text.input_ids[:,0] = self.tokenizer.enc_token_id
52
+
53
+ output = self.text_encoder(text.input_ids,
54
+ attention_mask = text.attention_mask,
55
+ encoder_hidden_states = [image0_embeds,image1_embeds],
56
+ encoder_attention_mask = [image_atts[:image0_embeds.size(0)],
57
+ image_atts[image0_embeds.size(0):]],
58
+ return_dict = True,
59
+ )
60
+ hidden_state = output.last_hidden_state[:,0,:]
61
+ prediction = self.cls_head(hidden_state)
62
+
63
+ if train:
64
+ loss = F.cross_entropy(prediction, targets)
65
+ return loss
66
+ else:
67
+ return prediction
68
+
69
+ def blip_nlvr(pretrained='',**kwargs):
70
+ model = BLIP_NLVR(**kwargs)
71
+ if pretrained:
72
+ model,msg = load_checkpoint(model,pretrained)
73
+ print("missing keys:")
74
+ print(msg.missing_keys)
75
+ return model
76
+
77
+
78
+ def load_checkpoint(model,url_or_filename):
79
+ if is_url(url_or_filename):
80
+ cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
81
+ checkpoint = torch.load(cached_file, map_location='cpu')
82
+ elif os.path.isfile(url_or_filename):
83
+ checkpoint = torch.load(url_or_filename, map_location='cpu')
84
+ else:
85
+ raise RuntimeError('checkpoint url or path is invalid')
86
+ state_dict = checkpoint['model']
87
+
88
+ state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
89
+
90
+ for key in list(state_dict.keys()):
91
+ if 'crossattention.self.' in key:
92
+ new_key0 = key.replace('self','self0')
93
+ new_key1 = key.replace('self','self1')
94
+ state_dict[new_key0] = state_dict[key]
95
+ state_dict[new_key1] = state_dict[key]
96
+ elif 'crossattention.output.dense.' in key:
97
+ new_key0 = key.replace('dense','dense0')
98
+ new_key1 = key.replace('dense','dense1')
99
+ state_dict[new_key0] = state_dict[key]
100
+ state_dict[new_key1] = state_dict[key]
101
+
102
+ msg = model.load_state_dict(state_dict,strict=False)
103
+ print('load checkpoint from %s'%url_or_filename)
104
+ return model,msg
105
+
extras/BLIP/models/blip_pretrain.py ADDED
@@ -0,0 +1,339 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ '''
8
+ from extras.BLIP.models.med import BertConfig, BertModel, BertLMHeadModel
9
+ from transformers import BertTokenizer
10
+ import transformers
11
+ transformers.logging.set_verbosity_error()
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+
17
+ from extras.BLIP.models.blip import create_vit, init_tokenizer, load_checkpoint
18
+
19
+ class BLIP_Pretrain(nn.Module):
20
+ def __init__(self,
21
+ med_config = 'configs/bert_config.json',
22
+ image_size = 224,
23
+ vit = 'base',
24
+ vit_grad_ckpt = False,
25
+ vit_ckpt_layer = 0,
26
+ embed_dim = 256,
27
+ queue_size = 57600,
28
+ momentum = 0.995,
29
+ ):
30
+ """
31
+ Args:
32
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
33
+ image_size (int): input image size
34
+ vit (str): model size of vision transformer
35
+ """
36
+ super().__init__()
37
+
38
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer, 0)
39
+
40
+ if vit=='base':
41
+ checkpoint = torch.hub.load_state_dict_from_url(
42
+ url="https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth",
43
+ map_location="cpu", check_hash=True)
44
+ state_dict = checkpoint["model"]
45
+ msg = self.visual_encoder.load_state_dict(state_dict,strict=False)
46
+ elif vit=='large':
47
+ from timm.models.helpers import load_custom_pretrained
48
+ from timm.models.vision_transformer import default_cfgs
49
+ load_custom_pretrained(self.visual_encoder,default_cfgs['vit_large_patch16_224_in21k'])
50
+
51
+ self.tokenizer = init_tokenizer()
52
+ encoder_config = BertConfig.from_json_file(med_config)
53
+ encoder_config.encoder_width = vision_width
54
+ self.text_encoder = BertModel.from_pretrained('bert-base-uncased',config=encoder_config, add_pooling_layer=False)
55
+ self.text_encoder.resize_token_embeddings(len(self.tokenizer))
56
+
57
+ text_width = self.text_encoder.config.hidden_size
58
+
59
+ self.vision_proj = nn.Linear(vision_width, embed_dim)
60
+ self.text_proj = nn.Linear(text_width, embed_dim)
61
+
62
+ self.itm_head = nn.Linear(text_width, 2)
63
+
64
+ # create momentum encoders
65
+ self.visual_encoder_m, vision_width = create_vit(vit,image_size)
66
+ self.vision_proj_m = nn.Linear(vision_width, embed_dim)
67
+ self.text_encoder_m = BertModel(config=encoder_config, add_pooling_layer=False)
68
+ self.text_proj_m = nn.Linear(text_width, embed_dim)
69
+
70
+ self.model_pairs = [[self.visual_encoder,self.visual_encoder_m],
71
+ [self.vision_proj,self.vision_proj_m],
72
+ [self.text_encoder,self.text_encoder_m],
73
+ [self.text_proj,self.text_proj_m],
74
+ ]
75
+ self.copy_params()
76
+
77
+ # create the queue
78
+ self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
79
+ self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
80
+ self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))
81
+
82
+ self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
83
+ self.text_queue = nn.functional.normalize(self.text_queue, dim=0)
84
+
85
+ self.queue_size = queue_size
86
+ self.momentum = momentum
87
+ self.temp = nn.Parameter(0.07*torch.ones([]))
88
+
89
+ # create the decoder
90
+ decoder_config = BertConfig.from_json_file(med_config)
91
+ decoder_config.encoder_width = vision_width
92
+ self.text_decoder = BertLMHeadModel.from_pretrained('bert-base-uncased',config=decoder_config)
93
+ self.text_decoder.resize_token_embeddings(len(self.tokenizer))
94
+ tie_encoder_decoder_weights(self.text_encoder,self.text_decoder.bert,'','/attention')
95
+
96
+
97
+ def forward(self, image, caption, alpha):
98
+ with torch.no_grad():
99
+ self.temp.clamp_(0.001,0.5)
100
+
101
+ image_embeds = self.visual_encoder(image)
102
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
103
+ image_feat = F.normalize(self.vision_proj(image_embeds[:,0,:]),dim=-1)
104
+
105
+ text = self.tokenizer(caption, padding='max_length', truncation=True, max_length=30,
106
+ return_tensors="pt").to(image.device)
107
+ text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
108
+ return_dict = True, mode = 'text')
109
+ text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:,0,:]),dim=-1)
110
+
111
+ # get momentum features
112
+ with torch.no_grad():
113
+ self._momentum_update()
114
+ image_embeds_m = self.visual_encoder_m(image)
115
+ image_feat_m = F.normalize(self.vision_proj_m(image_embeds_m[:,0,:]),dim=-1)
116
+ image_feat_all = torch.cat([image_feat_m.t(),self.image_queue.clone().detach()],dim=1)
117
+
118
+ text_output_m = self.text_encoder_m(text.input_ids, attention_mask = text.attention_mask,
119
+ return_dict = True, mode = 'text')
120
+ text_feat_m = F.normalize(self.text_proj_m(text_output_m.last_hidden_state[:,0,:]),dim=-1)
121
+ text_feat_all = torch.cat([text_feat_m.t(),self.text_queue.clone().detach()],dim=1)
122
+
123
+ sim_i2t_m = image_feat_m @ text_feat_all / self.temp
124
+ sim_t2i_m = text_feat_m @ image_feat_all / self.temp
125
+
126
+ sim_targets = torch.zeros(sim_i2t_m.size()).to(image.device)
127
+ sim_targets.fill_diagonal_(1)
128
+
129
+ sim_i2t_targets = alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
130
+ sim_t2i_targets = alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
131
+
132
+ sim_i2t = image_feat @ text_feat_all / self.temp
133
+ sim_t2i = text_feat @ image_feat_all / self.temp
134
+
135
+ loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1)*sim_i2t_targets,dim=1).mean()
136
+ loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1)*sim_t2i_targets,dim=1).mean()
137
+
138
+ loss_ita = (loss_i2t+loss_t2i)/2
139
+
140
+ self._dequeue_and_enqueue(image_feat_m, text_feat_m)
141
+
142
+ ###============== Image-text Matching ===================###
143
+ encoder_input_ids = text.input_ids.clone()
144
+ encoder_input_ids[:,0] = self.tokenizer.enc_token_id
145
+
146
+ # forward the positve image-text pair
147
+ bs = image.size(0)
148
+ output_pos = self.text_encoder(encoder_input_ids,
149
+ attention_mask = text.attention_mask,
150
+ encoder_hidden_states = image_embeds,
151
+ encoder_attention_mask = image_atts,
152
+ return_dict = True,
153
+ )
154
+ with torch.no_grad():
155
+ weights_t2i = F.softmax(sim_t2i[:,:bs],dim=1)+1e-4
156
+ weights_t2i.fill_diagonal_(0)
157
+ weights_i2t = F.softmax(sim_i2t[:,:bs],dim=1)+1e-4
158
+ weights_i2t.fill_diagonal_(0)
159
+
160
+ # select a negative image for each text
161
+ image_embeds_neg = []
162
+ for b in range(bs):
163
+ neg_idx = torch.multinomial(weights_t2i[b], 1).item()
164
+ image_embeds_neg.append(image_embeds[neg_idx])
165
+ image_embeds_neg = torch.stack(image_embeds_neg,dim=0)
166
+
167
+ # select a negative text for each image
168
+ text_ids_neg = []
169
+ text_atts_neg = []
170
+ for b in range(bs):
171
+ neg_idx = torch.multinomial(weights_i2t[b], 1).item()
172
+ text_ids_neg.append(encoder_input_ids[neg_idx])
173
+ text_atts_neg.append(text.attention_mask[neg_idx])
174
+
175
+ text_ids_neg = torch.stack(text_ids_neg,dim=0)
176
+ text_atts_neg = torch.stack(text_atts_neg,dim=0)
177
+
178
+ text_ids_all = torch.cat([encoder_input_ids, text_ids_neg],dim=0)
179
+ text_atts_all = torch.cat([text.attention_mask, text_atts_neg],dim=0)
180
+
181
+ image_embeds_all = torch.cat([image_embeds_neg,image_embeds],dim=0)
182
+ image_atts_all = torch.cat([image_atts,image_atts],dim=0)
183
+
184
+ output_neg = self.text_encoder(text_ids_all,
185
+ attention_mask = text_atts_all,
186
+ encoder_hidden_states = image_embeds_all,
187
+ encoder_attention_mask = image_atts_all,
188
+ return_dict = True,
189
+ )
190
+
191
+ vl_embeddings = torch.cat([output_pos.last_hidden_state[:,0,:], output_neg.last_hidden_state[:,0,:]],dim=0)
192
+ vl_output = self.itm_head(vl_embeddings)
193
+
194
+ itm_labels = torch.cat([torch.ones(bs,dtype=torch.long),torch.zeros(2*bs,dtype=torch.long)],
195
+ dim=0).to(image.device)
196
+ loss_itm = F.cross_entropy(vl_output, itm_labels)
197
+
198
+ ##================= LM ========================##
199
+ decoder_input_ids = text.input_ids.clone()
200
+ decoder_input_ids[:,0] = self.tokenizer.bos_token_id
201
+ decoder_targets = decoder_input_ids.masked_fill(decoder_input_ids == self.tokenizer.pad_token_id, -100)
202
+
203
+ decoder_output = self.text_decoder(decoder_input_ids,
204
+ attention_mask = text.attention_mask,
205
+ encoder_hidden_states = image_embeds,
206
+ encoder_attention_mask = image_atts,
207
+ labels = decoder_targets,
208
+ return_dict = True,
209
+ )
210
+
211
+ loss_lm = decoder_output.loss
212
+ return loss_ita, loss_itm, loss_lm
213
+
214
+
215
+
216
+ @torch.no_grad()
217
+ def copy_params(self):
218
+ for model_pair in self.model_pairs:
219
+ for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
220
+ param_m.data.copy_(param.data) # initialize
221
+ param_m.requires_grad = False # not update by gradient
222
+
223
+
224
+ @torch.no_grad()
225
+ def _momentum_update(self):
226
+ for model_pair in self.model_pairs:
227
+ for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
228
+ param_m.data = param_m.data * self.momentum + param.data * (1. - self.momentum)
229
+
230
+
231
+ @torch.no_grad()
232
+ def _dequeue_and_enqueue(self, image_feat, text_feat):
233
+ # gather keys before updating queue
234
+ image_feats = concat_all_gather(image_feat)
235
+ text_feats = concat_all_gather(text_feat)
236
+
237
+ batch_size = image_feats.shape[0]
238
+
239
+ ptr = int(self.queue_ptr)
240
+ assert self.queue_size % batch_size == 0 # for simplicity
241
+
242
+ # replace the keys at ptr (dequeue and enqueue)
243
+ self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
244
+ self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
245
+ ptr = (ptr + batch_size) % self.queue_size # move pointer
246
+
247
+ self.queue_ptr[0] = ptr
248
+
249
+
250
+ def blip_pretrain(**kwargs):
251
+ model = BLIP_Pretrain(**kwargs)
252
+ return model
253
+
254
+
255
+ @torch.no_grad()
256
+ def concat_all_gather(tensor):
257
+ """
258
+ Performs all_gather operation on the provided tensors.
259
+ *** Warning ***: torch.distributed.all_gather has no gradient.
260
+ """
261
+ tensors_gather = [torch.ones_like(tensor)
262
+ for _ in range(torch.distributed.get_world_size())]
263
+ torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
264
+
265
+ output = torch.cat(tensors_gather, dim=0)
266
+ return output
267
+
268
+
269
+ from typing import List
270
+ def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key:str):
271
+ uninitialized_encoder_weights: List[str] = []
272
+ if decoder.__class__ != encoder.__class__:
273
+ print(
274
+ f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
275
+ )
276
+
277
+ def tie_encoder_to_decoder_recursively(
278
+ decoder_pointer: nn.Module,
279
+ encoder_pointer: nn.Module,
280
+ module_name: str,
281
+ uninitialized_encoder_weights: List[str],
282
+ skip_key: str,
283
+ depth=0,
284
+ ):
285
+ assert isinstance(decoder_pointer, nn.Module) and isinstance(
286
+ encoder_pointer, nn.Module
287
+ ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
288
+ if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
289
+ assert hasattr(encoder_pointer, "weight")
290
+ encoder_pointer.weight = decoder_pointer.weight
291
+ if hasattr(decoder_pointer, "bias"):
292
+ assert hasattr(encoder_pointer, "bias")
293
+ encoder_pointer.bias = decoder_pointer.bias
294
+ print(module_name+' is tied')
295
+ return
296
+
297
+ encoder_modules = encoder_pointer._modules
298
+ decoder_modules = decoder_pointer._modules
299
+ if len(decoder_modules) > 0:
300
+ assert (
301
+ len(encoder_modules) > 0
302
+ ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
303
+
304
+ all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()])
305
+ encoder_layer_pos = 0
306
+ for name, module in decoder_modules.items():
307
+ if name.isdigit():
308
+ encoder_name = str(int(name) + encoder_layer_pos)
309
+ decoder_name = name
310
+ if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len(
311
+ encoder_modules
312
+ ) != len(decoder_modules):
313
+ # this can happen if the name corresponds to the position in a list module list of layers
314
+ # in this case the decoder has added a cross-attention that the encoder does not have
315
+ # thus skip this step and subtract one layer pos from encoder
316
+ encoder_layer_pos -= 1
317
+ continue
318
+ elif name not in encoder_modules:
319
+ continue
320
+ elif depth > 500:
321
+ raise ValueError(
322
+ "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
323
+ )
324
+ else:
325
+ decoder_name = encoder_name = name
326
+ tie_encoder_to_decoder_recursively(
327
+ decoder_modules[decoder_name],
328
+ encoder_modules[encoder_name],
329
+ module_name + "/" + name,
330
+ uninitialized_encoder_weights,
331
+ skip_key,
332
+ depth=depth + 1,
333
+ )
334
+ all_encoder_weights.remove(module_name + "/" + encoder_name)
335
+
336
+ uninitialized_encoder_weights += list(all_encoder_weights)
337
+
338
+ # tie weights recursively
339
+ tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key)
extras/BLIP/models/blip_retrieval.py ADDED
@@ -0,0 +1,319 @@
1
+ from extras.BLIP.models.med import BertConfig, BertModel
2
+ from transformers import BertTokenizer
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+
8
+ from extras.BLIP.models.blip import create_vit, init_tokenizer, load_checkpoint
9
+
10
+ class BLIP_Retrieval(nn.Module):
11
+ def __init__(self,
12
+ med_config = 'configs/med_config.json',
13
+ image_size = 384,
14
+ vit = 'base',
15
+ vit_grad_ckpt = False,
16
+ vit_ckpt_layer = 0,
17
+ embed_dim = 256,
18
+ queue_size = 57600,
19
+ momentum = 0.995,
20
+ negative_all_rank = False,
21
+ ):
22
+ """
23
+ Args:
24
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
25
+ image_size (int): input image size
26
+ vit (str): model size of vision transformer
27
+ """
28
+ super().__init__()
29
+
30
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
31
+ self.tokenizer = init_tokenizer()
32
+ med_config = BertConfig.from_json_file(med_config)
33
+ med_config.encoder_width = vision_width
34
+ self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
35
+
36
+ text_width = self.text_encoder.config.hidden_size
37
+
38
+ self.vision_proj = nn.Linear(vision_width, embed_dim)
39
+ self.text_proj = nn.Linear(text_width, embed_dim)
40
+
41
+ self.itm_head = nn.Linear(text_width, 2)
42
+
43
+ # create momentum encoders
44
+ self.visual_encoder_m, vision_width = create_vit(vit,image_size)
45
+ self.vision_proj_m = nn.Linear(vision_width, embed_dim)
46
+ self.text_encoder_m = BertModel(config=med_config, add_pooling_layer=False)
47
+ self.text_proj_m = nn.Linear(text_width, embed_dim)
48
+
49
+ self.model_pairs = [[self.visual_encoder,self.visual_encoder_m],
50
+ [self.vision_proj,self.vision_proj_m],
51
+ [self.text_encoder,self.text_encoder_m],
52
+ [self.text_proj,self.text_proj_m],
53
+ ]
54
+ self.copy_params()
55
+
56
+ # create the queue
57
+ self.register_buffer("image_queue", torch.randn(embed_dim, queue_size))
58
+ self.register_buffer("text_queue", torch.randn(embed_dim, queue_size))
59
+ self.register_buffer("idx_queue", torch.full((1,queue_size),-100))
60
+ self.register_buffer("ptr_queue", torch.zeros(1, dtype=torch.long))
61
+
62
+ self.image_queue = nn.functional.normalize(self.image_queue, dim=0)
63
+ self.text_queue = nn.functional.normalize(self.text_queue, dim=0)
64
+
65
+ self.queue_size = queue_size
66
+ self.momentum = momentum
67
+ self.temp = nn.Parameter(0.07*torch.ones([]))
68
+
69
+ self.negative_all_rank = negative_all_rank
70
+
71
+
72
+ def forward(self, image, caption, alpha, idx):
73
+ with torch.no_grad():
74
+ self.temp.clamp_(0.001,0.5)
75
+
76
+ image_embeds = self.visual_encoder(image)
77
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
78
+ image_feat = F.normalize(self.vision_proj(image_embeds[:,0,:]),dim=-1)
79
+
80
+ text = self.tokenizer(caption, padding='max_length', truncation=True, max_length=35,
81
+ return_tensors="pt").to(image.device)
82
+
83
+ text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
84
+ return_dict = True, mode = 'text')
85
+ text_feat = F.normalize(self.text_proj(text_output.last_hidden_state[:,0,:]),dim=-1)
86
+
87
+ ###============== Image-text Contrastive Learning ===================###
88
+ idx = idx.view(-1,1)
89
+ idx_all = torch.cat([idx.t(), self.idx_queue.clone().detach()],dim=1)
90
+ pos_idx = torch.eq(idx, idx_all).float()
91
+ sim_targets = pos_idx / pos_idx.sum(1,keepdim=True)
92
+
93
+ # get momentum features
94
+ with torch.no_grad():
95
+ self._momentum_update()
96
+ image_embeds_m = self.visual_encoder_m(image)
97
+ image_feat_m = F.normalize(self.vision_proj_m(image_embeds_m[:,0,:]),dim=-1)
98
+ image_feat_m_all = torch.cat([image_feat_m.t(),self.image_queue.clone().detach()],dim=1)
99
+
100
+ text_output_m = self.text_encoder_m(text.input_ids, attention_mask = text.attention_mask,
101
+ return_dict = True, mode = 'text')
102
+ text_feat_m = F.normalize(self.text_proj_m(text_output_m.last_hidden_state[:,0,:]),dim=-1)
103
+ text_feat_m_all = torch.cat([text_feat_m.t(),self.text_queue.clone().detach()],dim=1)
104
+
105
+ sim_i2t_m = image_feat_m @ text_feat_m_all / self.temp
106
+ sim_t2i_m = text_feat_m @ image_feat_m_all / self.temp
107
+
108
+ sim_i2t_targets = alpha * F.softmax(sim_i2t_m, dim=1) + (1 - alpha) * sim_targets
109
+ sim_t2i_targets = alpha * F.softmax(sim_t2i_m, dim=1) + (1 - alpha) * sim_targets
110
+
111
+ sim_i2t = image_feat @ text_feat_m_all / self.temp
112
+ sim_t2i = text_feat @ image_feat_m_all / self.temp
113
+
114
+ loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1)*sim_i2t_targets,dim=1).mean()
115
+ loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1)*sim_t2i_targets,dim=1).mean()
116
+
117
+ loss_ita = (loss_i2t+loss_t2i)/2
118
+
119
+ idxs = concat_all_gather(idx)
120
+ self._dequeue_and_enqueue(image_feat_m, text_feat_m, idxs)
121
+
122
+ ###============== Image-text Matching ===================###
123
+ encoder_input_ids = text.input_ids.clone()
124
+ encoder_input_ids[:,0] = self.tokenizer.enc_token_id
125
+
126
+ # forward the positive image-text pair
127
+ bs = image.size(0)
128
+ output_pos = self.text_encoder(encoder_input_ids,
129
+ attention_mask = text.attention_mask,
130
+ encoder_hidden_states = image_embeds,
131
+ encoder_attention_mask = image_atts,
132
+ return_dict = True,
133
+ )
134
+
135
+
136
+ if self.negative_all_rank:
137
+ # compute sample similarity
138
+ with torch.no_grad():
139
+ mask = torch.eq(idx, idxs.t())
140
+
141
+ image_feat_world = concat_all_gather(image_feat)
142
+ text_feat_world = concat_all_gather(text_feat)
143
+
144
+ sim_i2t = image_feat @ text_feat_world.t() / self.temp
145
+ sim_t2i = text_feat @ image_feat_world.t() / self.temp
146
+
147
+ weights_i2t = F.softmax(sim_i2t,dim=1)
148
+ weights_i2t.masked_fill_(mask, 0)
149
+
150
+ weights_t2i = F.softmax(sim_t2i,dim=1)
151
+ weights_t2i.masked_fill_(mask, 0)
152
+
153
+ image_embeds_world = all_gather_with_grad(image_embeds)
154
+
155
+ # select a negative image (from all ranks) for each text
156
+ image_embeds_neg = []
157
+ for b in range(bs):
158
+ neg_idx = torch.multinomial(weights_t2i[b], 1).item()
159
+ image_embeds_neg.append(image_embeds_world[neg_idx])
160
+ image_embeds_neg = torch.stack(image_embeds_neg,dim=0)
161
+
162
+ # select a negative text (from all ranks) for each image
163
+ input_ids_world = concat_all_gather(encoder_input_ids)
164
+ att_mask_world = concat_all_gather(text.attention_mask)
165
+
166
+ text_ids_neg = []
167
+ text_atts_neg = []
168
+ for b in range(bs):
169
+ neg_idx = torch.multinomial(weights_i2t[b], 1).item()
170
+ text_ids_neg.append(input_ids_world[neg_idx])
171
+ text_atts_neg.append(att_mask_world[neg_idx])
172
+
173
+ else:
174
+ with torch.no_grad():
175
+ mask = torch.eq(idx, idx.t())
176
+
177
+ sim_i2t = image_feat @ text_feat.t() / self.temp
178
+ sim_t2i = text_feat @ image_feat.t() / self.temp
179
+
180
+ weights_i2t = F.softmax(sim_i2t,dim=1)
181
+ weights_i2t.masked_fill_(mask, 0)
182
+
183
+ weights_t2i = F.softmax(sim_t2i,dim=1)
184
+ weights_t2i.masked_fill_(mask, 0)
185
+
186
+ # select a negative image (from same rank) for each text
187
+ image_embeds_neg = []
188
+ for b in range(bs):
189
+ neg_idx = torch.multinomial(weights_t2i[b], 1).item()
190
+ image_embeds_neg.append(image_embeds[neg_idx])
191
+ image_embeds_neg = torch.stack(image_embeds_neg,dim=0)
192
+
193
+ # select a negative text (from same rank) for each image
194
+ text_ids_neg = []
195
+ text_atts_neg = []
196
+ for b in range(bs):
197
+ neg_idx = torch.multinomial(weights_i2t[b], 1).item()
198
+ text_ids_neg.append(encoder_input_ids[neg_idx])
199
+ text_atts_neg.append(text.attention_mask[neg_idx])
200
+
201
+ text_ids_neg = torch.stack(text_ids_neg,dim=0)
202
+ text_atts_neg = torch.stack(text_atts_neg,dim=0)
203
+
204
+ text_ids_all = torch.cat([encoder_input_ids, text_ids_neg],dim=0)
205
+ text_atts_all = torch.cat([text.attention_mask, text_atts_neg],dim=0)
206
+
207
+ image_embeds_all = torch.cat([image_embeds_neg,image_embeds],dim=0)
208
+ image_atts_all = torch.cat([image_atts,image_atts],dim=0)
209
+
210
+ output_neg = self.text_encoder(text_ids_all,
211
+ attention_mask = text_atts_all,
212
+ encoder_hidden_states = image_embeds_all,
213
+ encoder_attention_mask = image_atts_all,
214
+ return_dict = True,
215
+ )
216
+
217
+
218
+ vl_embeddings = torch.cat([output_pos.last_hidden_state[:,0,:], output_neg.last_hidden_state[:,0,:]],dim=0)
219
+ vl_output = self.itm_head(vl_embeddings)
220
+
221
+ itm_labels = torch.cat([torch.ones(bs,dtype=torch.long),torch.zeros(2*bs,dtype=torch.long)],
222
+ dim=0).to(image.device)
223
+ loss_itm = F.cross_entropy(vl_output, itm_labels)
224
+
225
+ return loss_ita, loss_itm
226
+
227
+
228
+ @torch.no_grad()
229
+ def copy_params(self):
230
+ for model_pair in self.model_pairs:
231
+ for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
232
+ param_m.data.copy_(param.data) # initialize
233
+ param_m.requires_grad = False # not update by gradient
234
+
235
+
236
+ @torch.no_grad()
237
+ def _momentum_update(self):
238
+ for model_pair in self.model_pairs:
239
+ for param, param_m in zip(model_pair[0].parameters(), model_pair[1].parameters()):
240
+ param_m.data = param_m.data * self.momentum + param.data * (1. - self.momentum)
241
+
242
+
243
+ @torch.no_grad()
244
+ def _dequeue_and_enqueue(self, image_feat, text_feat, idxs):
245
+ # gather keys before updating queue
246
+ image_feats = concat_all_gather(image_feat)
247
+ text_feats = concat_all_gather(text_feat)
248
+
249
+
250
+ batch_size = image_feats.shape[0]
251
+
252
+ ptr = int(self.ptr_queue)
253
+ assert self.queue_size % batch_size == 0 # for simplicity
254
+
255
+ # replace the keys at ptr (dequeue and enqueue)
256
+ self.image_queue[:, ptr:ptr + batch_size] = image_feats.T
257
+ self.text_queue[:, ptr:ptr + batch_size] = text_feats.T
258
+ self.idx_queue[:, ptr:ptr + batch_size] = idxs.T
259
+ ptr = (ptr + batch_size) % self.queue_size # move pointer
260
+
261
+ self.ptr_queue[0] = ptr
262
+
263
+
264
+ def blip_retrieval(pretrained='',**kwargs):
265
+ model = BLIP_Retrieval(**kwargs)
266
+ if pretrained:
267
+ model,msg = load_checkpoint(model,pretrained)
268
+ print("missing keys:")
269
+ print(msg.missing_keys)
270
+ return model
271
+
272
+
273
+ @torch.no_grad()
274
+ def concat_all_gather(tensor):
275
+ """
276
+ Performs all_gather operation on the provided tensors.
277
+ *** Warning ***: torch.distributed.all_gather has no gradient.
278
+ """
279
+ tensors_gather = [torch.ones_like(tensor)
280
+ for _ in range(torch.distributed.get_world_size())]
281
+ torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
282
+
283
+ output = torch.cat(tensors_gather, dim=0)
284
+ return output
285
+
286
+
287
+ class GatherLayer(torch.autograd.Function):
288
+ """
289
+ Gather tensors from all workers with support for backward propagation:
290
+ This implementation does not cut the gradients as torch.distributed.all_gather does.
291
+ """
292
+
293
+ @staticmethod
294
+ def forward(ctx, x):
295
+ output = [torch.zeros_like(x) for _ in range(torch.distributed.get_world_size())]
296
+ torch.distributed.all_gather(output, x)
297
+ return tuple(output)
298
+
299
+ @staticmethod
300
+ def backward(ctx, *grads):
301
+ all_gradients = torch.stack(grads)
302
+ torch.distributed.all_reduce(all_gradients)
303
+ return all_gradients[torch.distributed.get_rank()]
304
+
305
+
306
+ def all_gather_with_grad(tensors):
307
+ """
308
+ Performs all_gather operation on the provided tensors.
309
+ Graph remains connected for backward grad computation.
310
+ """
311
+ # Queue the gathered tensors
312
+ world_size = torch.distributed.get_world_size()
313
+ # There is no need for reduction in the single-proc case
314
+ if world_size == 1:
315
+ return tensors
316
+
317
+ tensor_all = GatherLayer.apply(tensors)
318
+
319
+ return torch.cat(tensor_all, dim=0)
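A hedged usage sketch for the retrieval model defined above: it builds the model through the blip_retrieval factory (the constructor also loads a BERT tokenizer via init_tokenizer()) and extracts normalized image features with the visual encoder and projection head. The empty checkpoint path and random batch are assumptions for shape-checking only, and the full forward() additionally needs an initialized torch.distributed process group because of concat_all_gather.

import torch
import torch.nn.functional as F
from extras.BLIP.models.blip_retrieval import blip_retrieval

# med_config path follows this repo's layout; adjust if your checkout differs.
model = blip_retrieval(pretrained='',
                       med_config='extras/BLIP/configs/med_config.json',
                       image_size=384, vit='base')
model.eval()

images = torch.randn(2, 3, 384, 384)          # dummy preprocessed batch
with torch.no_grad():
    image_embeds = model.visual_encoder(images)                               # (2, tokens, width)
    image_feat = F.normalize(model.vision_proj(image_embeds[:, 0, :]), dim=-1)  # (2, 256) unit-norm features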
extras/BLIP/models/blip_vqa.py ADDED
@@ -0,0 +1,186 @@
1
+ from extras.BLIP.models.med import BertConfig, BertModel, BertLMHeadModel
2
+ from extras.BLIP.models.blip import create_vit, init_tokenizer, load_checkpoint
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ from transformers import BertTokenizer
8
+ import numpy as np
9
+
10
+ class BLIP_VQA(nn.Module):
11
+ def __init__(self,
12
+ med_config = 'configs/med_config.json',
13
+ image_size = 480,
14
+ vit = 'base',
15
+ vit_grad_ckpt = False,
16
+ vit_ckpt_layer = 0,
17
+ ):
18
+ """
19
+ Args:
20
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
21
+ image_size (int): input image size
22
+ vit (str): model size of vision transformer
23
+ """
24
+ super().__init__()
25
+
26
+ self.visual_encoder, vision_width = create_vit(vit, image_size, vit_grad_ckpt, vit_ckpt_layer, drop_path_rate=0.1)
27
+ self.tokenizer = init_tokenizer()
28
+
29
+ encoder_config = BertConfig.from_json_file(med_config)
30
+ encoder_config.encoder_width = vision_width
31
+ self.text_encoder = BertModel(config=encoder_config, add_pooling_layer=False)
32
+
33
+ decoder_config = BertConfig.from_json_file(med_config)
34
+ self.text_decoder = BertLMHeadModel(config=decoder_config)
35
+
36
+
37
+ def forward(self, image, question, answer=None, n=None, weights=None, train=True, inference='rank', k_test=128):
38
+
39
+ image_embeds = self.visual_encoder(image)
40
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
41
+
42
+ question = self.tokenizer(question, padding='longest', truncation=True, max_length=35,
43
+ return_tensors="pt").to(image.device)
44
+ question.input_ids[:,0] = self.tokenizer.enc_token_id
45
+
46
+ if train:
47
+ '''
48
+ n: number of answers for each question
49
+ weights: weight for each answer
50
+ '''
51
+ answer = self.tokenizer(answer, padding='longest', return_tensors="pt").to(image.device)
52
+ answer.input_ids[:,0] = self.tokenizer.bos_token_id
53
+ answer_targets = answer.input_ids.masked_fill(answer.input_ids == self.tokenizer.pad_token_id, -100)
54
+
55
+ question_output = self.text_encoder(question.input_ids,
56
+ attention_mask = question.attention_mask,
57
+ encoder_hidden_states = image_embeds,
58
+ encoder_attention_mask = image_atts,
59
+ return_dict = True)
60
+
61
+ question_states = []
62
+ question_atts = []
63
+ for b, n in enumerate(n):
64
+ question_states += [question_output.last_hidden_state[b]]*n
65
+ question_atts += [question.attention_mask[b]]*n
66
+ question_states = torch.stack(question_states,0)
67
+ question_atts = torch.stack(question_atts,0)
68
+
69
+ answer_output = self.text_decoder(answer.input_ids,
70
+ attention_mask = answer.attention_mask,
71
+ encoder_hidden_states = question_states,
72
+ encoder_attention_mask = question_atts,
73
+ labels = answer_targets,
74
+ return_dict = True,
75
+ reduction = 'none',
76
+ )
77
+
78
+ loss = weights * answer_output.loss
79
+ loss = loss.sum()/image.size(0)
80
+
81
+ return loss
82
+
83
+
84
+ else:
85
+ question_output = self.text_encoder(question.input_ids,
86
+ attention_mask = question.attention_mask,
87
+ encoder_hidden_states = image_embeds,
88
+ encoder_attention_mask = image_atts,
89
+ return_dict = True)
90
+
91
+ if inference=='generate':
92
+ num_beams = 3
93
+ question_states = question_output.last_hidden_state.repeat_interleave(num_beams,dim=0)
94
+ question_atts = torch.ones(question_states.size()[:-1],dtype=torch.long).to(question_states.device)
95
+ model_kwargs = {"encoder_hidden_states": question_states, "encoder_attention_mask":question_atts}
96
+
97
+ bos_ids = torch.full((image.size(0),1),fill_value=self.tokenizer.bos_token_id,device=image.device)
98
+
99
+ outputs = self.text_decoder.generate(input_ids=bos_ids,
100
+ max_length=10,
101
+ min_length=1,
102
+ num_beams=num_beams,
103
+ eos_token_id=self.tokenizer.sep_token_id,
104
+ pad_token_id=self.tokenizer.pad_token_id,
105
+ **model_kwargs)
106
+
107
+ answers = []
108
+ for output in outputs:
109
+ answer = self.tokenizer.decode(output, skip_special_tokens=True)
110
+ answers.append(answer)
111
+ return answers
112
+
113
+ elif inference=='rank':
114
+ max_ids = self.rank_answer(question_output.last_hidden_state, question.attention_mask,
115
+ answer.input_ids, answer.attention_mask, k_test)
116
+ return max_ids
117
+
118
+
119
+
120
+ def rank_answer(self, question_states, question_atts, answer_ids, answer_atts, k):
121
+
122
+ num_ques = question_states.size(0)
123
+ start_ids = answer_ids[0,0].repeat(num_ques,1) # bos token
124
+
125
+ start_output = self.text_decoder(start_ids,
126
+ encoder_hidden_states = question_states,
127
+ encoder_attention_mask = question_atts,
128
+ return_dict = True,
129
+ reduction = 'none')
130
+ logits = start_output.logits[:,0,:] # first token's logit
131
+
132
+ # topk_probs: top-k probability
133
+ # topk_ids: [num_question, k]
134
+ answer_first_token = answer_ids[:,1]
135
+ prob_first_token = F.softmax(logits,dim=1).index_select(dim=1, index=answer_first_token)
136
+ topk_probs, topk_ids = prob_first_token.topk(k,dim=1)
137
+
138
+ # answer input: [num_question*k, answer_len]
139
+ input_ids = []
140
+ input_atts = []
141
+ for b, topk_id in enumerate(topk_ids):
142
+ input_ids.append(answer_ids.index_select(dim=0, index=topk_id))
143
+ input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
144
+ input_ids = torch.cat(input_ids,dim=0)
145
+ input_atts = torch.cat(input_atts,dim=0)
146
+
147
+ targets_ids = input_ids.masked_fill(input_ids == self.tokenizer.pad_token_id, -100)
148
+
149
+ # repeat encoder's output for top-k answers
150
+ question_states = tile(question_states, 0, k)
151
+ question_atts = tile(question_atts, 0, k)
152
+
153
+ output = self.text_decoder(input_ids,
154
+ attention_mask = input_atts,
155
+ encoder_hidden_states = question_states,
156
+ encoder_attention_mask = question_atts,
157
+ labels = targets_ids,
158
+ return_dict = True,
159
+ reduction = 'none')
160
+
161
+ log_probs_sum = -output.loss
162
+ log_probs_sum = log_probs_sum.view(num_ques,k)
163
+
164
+ max_topk_ids = log_probs_sum.argmax(dim=1)
165
+ max_ids = topk_ids[max_topk_ids>=0,max_topk_ids]
166
+
167
+ return max_ids
168
+
169
+
170
+ def blip_vqa(pretrained='',**kwargs):
171
+ model = BLIP_VQA(**kwargs)
172
+ if pretrained:
173
+ model,msg = load_checkpoint(model,pretrained)
174
+ # assert(len(msg.missing_keys)==0)
175
+ return model
176
+
177
+
178
+ def tile(x, dim, n_tile):
179
+ init_dim = x.size(dim)
180
+ repeat_idx = [1] * x.dim()
181
+ repeat_idx[dim] = n_tile
182
+ x = x.repeat(*(repeat_idx))
183
+ order_index = torch.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
184
+ return torch.index_select(x, dim, order_index.to(x.device))
185
+
186
+
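A hedged inference sketch for the VQA model above, using the generate branch of forward(). The config path and image size are assumptions based on this repo's layout, and with an empty pretrained path the decoded answer is untrained output, so the sketch only verifies that the pieces fit together.

import torch
from extras.BLIP.models.blip_vqa import blip_vqa

model = blip_vqa(pretrained='',
                 med_config='extras/BLIP/configs/med_config.json',
                 image_size=480, vit='base')
model.eval()

image = torch.randn(1, 3, 480, 480)            # dummy preprocessed image
question = ['what is shown in the picture?']
with torch.no_grad():
    answers = model(image, question, train=False, inference='generate')
print(answers)                                  # list with one decoded answer string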
extras/BLIP/models/med.py ADDED
@@ -0,0 +1,955 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ * Based on huggingface code base
8
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
9
+ '''
10
+
11
+ import math
12
+ import os
13
+ import warnings
14
+ from dataclasses import dataclass
15
+ from typing import Optional, Tuple
16
+
17
+ import torch
18
+ from torch import Tensor, device, dtype, nn
19
+ import torch.utils.checkpoint
20
+ from torch import nn
21
+ from torch.nn import CrossEntropyLoss
22
+ import torch.nn.functional as F
23
+
24
+ from transformers.activations import ACT2FN
25
+ from transformers.file_utils import (
26
+ ModelOutput,
27
+ )
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutputWithPastAndCrossAttentions,
30
+ BaseModelOutputWithPoolingAndCrossAttentions,
31
+ CausalLMOutputWithCrossAttentions,
32
+ MaskedLMOutput,
33
+ MultipleChoiceModelOutput,
34
+ NextSentencePredictorOutput,
35
+ QuestionAnsweringModelOutput,
36
+ SequenceClassifierOutput,
37
+ TokenClassifierOutput,
38
+ )
39
+ from transformers.modeling_utils import (
40
+ PreTrainedModel,
41
+ apply_chunking_to_forward,
42
+ find_pruneable_heads_and_indices,
43
+ prune_linear_layer,
44
+ )
45
+ from transformers.utils import logging
46
+ from transformers.models.bert.configuration_bert import BertConfig
47
+
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+
52
+ class BertEmbeddings(nn.Module):
53
+ """Construct the embeddings from word and position embeddings."""
54
+
55
+ def __init__(self, config):
56
+ super().__init__()
57
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
58
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
59
+
60
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
61
+ # any TensorFlow checkpoint file
62
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
63
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
64
+
65
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
66
+ self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
67
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
68
+
69
+ self.config = config
70
+
71
+ def forward(
72
+ self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
73
+ ):
74
+ if input_ids is not None:
75
+ input_shape = input_ids.size()
76
+ else:
77
+ input_shape = inputs_embeds.size()[:-1]
78
+
79
+ seq_length = input_shape[1]
80
+
81
+ if position_ids is None:
82
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
83
+
84
+ if inputs_embeds is None:
85
+ inputs_embeds = self.word_embeddings(input_ids)
86
+
87
+ embeddings = inputs_embeds
88
+
89
+ if self.position_embedding_type == "absolute":
90
+ position_embeddings = self.position_embeddings(position_ids)
91
+ embeddings += position_embeddings
92
+ embeddings = self.LayerNorm(embeddings)
93
+ embeddings = self.dropout(embeddings)
94
+ return embeddings
95
+
96
+
97
+ class BertSelfAttention(nn.Module):
98
+ def __init__(self, config, is_cross_attention):
99
+ super().__init__()
100
+ self.config = config
101
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
102
+ raise ValueError(
103
+ "The hidden size (%d) is not a multiple of the number of attention "
104
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
105
+ )
106
+
107
+ self.num_attention_heads = config.num_attention_heads
108
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
109
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
110
+
111
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
112
+ if is_cross_attention:
113
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
114
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
115
+ else:
116
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
117
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
118
+
119
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
120
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
121
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
122
+ self.max_position_embeddings = config.max_position_embeddings
123
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
124
+ self.save_attention = False
125
+
126
+ def save_attn_gradients(self, attn_gradients):
127
+ self.attn_gradients = attn_gradients
128
+
129
+ def get_attn_gradients(self):
130
+ return self.attn_gradients
131
+
132
+ def save_attention_map(self, attention_map):
133
+ self.attention_map = attention_map
134
+
135
+ def get_attention_map(self):
136
+ return self.attention_map
137
+
138
+ def transpose_for_scores(self, x):
139
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
140
+ x = x.view(*new_x_shape)
141
+ return x.permute(0, 2, 1, 3)
142
+
143
+ def forward(
144
+ self,
145
+ hidden_states,
146
+ attention_mask=None,
147
+ head_mask=None,
148
+ encoder_hidden_states=None,
149
+ encoder_attention_mask=None,
150
+ past_key_value=None,
151
+ output_attentions=False,
152
+ ):
153
+ mixed_query_layer = self.query(hidden_states)
154
+
155
+ # If this is instantiated as a cross-attention module, the keys
156
+ # and values come from an encoder; the attention mask needs to be
157
+ # such that the encoder's padding tokens are not attended to.
158
+ is_cross_attention = encoder_hidden_states is not None
159
+
160
+ if is_cross_attention:
161
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
162
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
163
+ attention_mask = encoder_attention_mask
164
+ elif past_key_value is not None:
165
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
166
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
167
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
168
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
169
+ else:
170
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
171
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
172
+
173
+ query_layer = self.transpose_for_scores(mixed_query_layer)
174
+
175
+ past_key_value = (key_layer, value_layer)
176
+
177
+ # Take the dot product between "query" and "key" to get the raw attention scores.
178
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
179
+
180
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
181
+ seq_length = hidden_states.size()[1]
182
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
183
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
184
+ distance = position_ids_l - position_ids_r
185
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
186
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
187
+
188
+ if self.position_embedding_type == "relative_key":
189
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
190
+ attention_scores = attention_scores + relative_position_scores
191
+ elif self.position_embedding_type == "relative_key_query":
192
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
193
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
194
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
195
+
196
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
197
+ if attention_mask is not None:
198
+ # Apply the attention mask (precomputed for all layers in BertModel forward() function)
199
+ attention_scores = attention_scores + attention_mask
200
+
201
+ # Normalize the attention scores to probabilities.
202
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
203
+
204
+ if is_cross_attention and self.save_attention:
205
+ self.save_attention_map(attention_probs)
206
+ attention_probs.register_hook(self.save_attn_gradients)
207
+
208
+ # This is actually dropping out entire tokens to attend to, which might
209
+ # seem a bit unusual, but is taken from the original Transformer paper.
210
+ attention_probs_dropped = self.dropout(attention_probs)
211
+
212
+ # Mask heads if we want to
213
+ if head_mask is not None:
214
+ attention_probs_dropped = attention_probs_dropped * head_mask
215
+
216
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
217
+
218
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
219
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
220
+ context_layer = context_layer.view(*new_context_layer_shape)
221
+
222
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
223
+
224
+ outputs = outputs + (past_key_value,)
225
+ return outputs
226
+
227
+
228
+ class BertSelfOutput(nn.Module):
229
+ def __init__(self, config):
230
+ super().__init__()
231
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
232
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
233
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
234
+
235
+ def forward(self, hidden_states, input_tensor):
236
+ hidden_states = self.dense(hidden_states)
237
+ hidden_states = self.dropout(hidden_states)
238
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
239
+ return hidden_states
240
+
241
+
242
+ class BertAttention(nn.Module):
243
+ def __init__(self, config, is_cross_attention=False):
244
+ super().__init__()
245
+ self.self = BertSelfAttention(config, is_cross_attention)
246
+ self.output = BertSelfOutput(config)
247
+ self.pruned_heads = set()
248
+
249
+ def prune_heads(self, heads):
250
+ if len(heads) == 0:
251
+ return
252
+ heads, index = find_pruneable_heads_and_indices(
253
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
254
+ )
255
+
256
+ # Prune linear layers
257
+ self.self.query = prune_linear_layer(self.self.query, index)
258
+ self.self.key = prune_linear_layer(self.self.key, index)
259
+ self.self.value = prune_linear_layer(self.self.value, index)
260
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
261
+
262
+ # Update hyper params and store pruned heads
263
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
264
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
265
+ self.pruned_heads = self.pruned_heads.union(heads)
266
+
267
+ def forward(
268
+ self,
269
+ hidden_states,
270
+ attention_mask=None,
271
+ head_mask=None,
272
+ encoder_hidden_states=None,
273
+ encoder_attention_mask=None,
274
+ past_key_value=None,
275
+ output_attentions=False,
276
+ ):
277
+ self_outputs = self.self(
278
+ hidden_states,
279
+ attention_mask,
280
+ head_mask,
281
+ encoder_hidden_states,
282
+ encoder_attention_mask,
283
+ past_key_value,
284
+ output_attentions,
285
+ )
286
+ attention_output = self.output(self_outputs[0], hidden_states)
287
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
288
+ return outputs
289
+
290
+
291
+ class BertIntermediate(nn.Module):
292
+ def __init__(self, config):
293
+ super().__init__()
294
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
295
+ if isinstance(config.hidden_act, str):
296
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
297
+ else:
298
+ self.intermediate_act_fn = config.hidden_act
299
+
300
+ def forward(self, hidden_states):
301
+ hidden_states = self.dense(hidden_states)
302
+ hidden_states = self.intermediate_act_fn(hidden_states)
303
+ return hidden_states
304
+
305
+
306
+ class BertOutput(nn.Module):
307
+ def __init__(self, config):
308
+ super().__init__()
309
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
310
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
311
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
312
+
313
+ def forward(self, hidden_states, input_tensor):
314
+ hidden_states = self.dense(hidden_states)
315
+ hidden_states = self.dropout(hidden_states)
316
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
317
+ return hidden_states
318
+
319
+
320
+ class BertLayer(nn.Module):
321
+ def __init__(self, config, layer_num):
322
+ super().__init__()
323
+ self.config = config
324
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
325
+ self.seq_len_dim = 1
326
+ self.attention = BertAttention(config)
327
+ self.layer_num = layer_num
328
+ if self.config.add_cross_attention:
329
+ self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
330
+ self.intermediate = BertIntermediate(config)
331
+ self.output = BertOutput(config)
332
+
333
+ def forward(
334
+ self,
335
+ hidden_states,
336
+ attention_mask=None,
337
+ head_mask=None,
338
+ encoder_hidden_states=None,
339
+ encoder_attention_mask=None,
340
+ past_key_value=None,
341
+ output_attentions=False,
342
+ mode=None,
343
+ ):
344
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
345
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
346
+ self_attention_outputs = self.attention(
347
+ hidden_states,
348
+ attention_mask,
349
+ head_mask,
350
+ output_attentions=output_attentions,
351
+ past_key_value=self_attn_past_key_value,
352
+ )
353
+ attention_output = self_attention_outputs[0]
354
+
355
+ outputs = self_attention_outputs[1:-1]
356
+ present_key_value = self_attention_outputs[-1]
357
+
358
+ if mode=='multimodal':
359
+ assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
360
+
361
+ cross_attention_outputs = self.crossattention(
362
+ attention_output,
363
+ attention_mask,
364
+ head_mask,
365
+ encoder_hidden_states,
366
+ encoder_attention_mask,
367
+ output_attentions=output_attentions,
368
+ )
369
+ attention_output = cross_attention_outputs[0]
370
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
371
+ layer_output = apply_chunking_to_forward(
372
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
373
+ )
374
+ outputs = (layer_output,) + outputs
375
+
376
+ outputs = outputs + (present_key_value,)
377
+
378
+ return outputs
379
+
380
+ def feed_forward_chunk(self, attention_output):
381
+ intermediate_output = self.intermediate(attention_output)
382
+ layer_output = self.output(intermediate_output, attention_output)
383
+ return layer_output
384
+
385
+
386
+ class BertEncoder(nn.Module):
387
+ def __init__(self, config):
388
+ super().__init__()
389
+ self.config = config
390
+ self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)])
391
+ self.gradient_checkpointing = False
392
+
393
+ def forward(
394
+ self,
395
+ hidden_states,
396
+ attention_mask=None,
397
+ head_mask=None,
398
+ encoder_hidden_states=None,
399
+ encoder_attention_mask=None,
400
+ past_key_values=None,
401
+ use_cache=None,
402
+ output_attentions=False,
403
+ output_hidden_states=False,
404
+ return_dict=True,
405
+ mode='multimodal',
406
+ ):
407
+ all_hidden_states = () if output_hidden_states else None
408
+ all_self_attentions = () if output_attentions else None
409
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
410
+
411
+ next_decoder_cache = () if use_cache else None
412
+
413
+ for i in range(self.config.num_hidden_layers):
414
+ layer_module = self.layer[i]
415
+ if output_hidden_states:
416
+ all_hidden_states = all_hidden_states + (hidden_states,)
417
+
418
+ layer_head_mask = head_mask[i] if head_mask is not None else None
419
+ past_key_value = past_key_values[i] if past_key_values is not None else None
420
+
421
+ if self.gradient_checkpointing and self.training:
422
+
423
+ if use_cache:
424
+ logger.warn(
425
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
426
+ )
427
+ use_cache = False
428
+
429
+ def create_custom_forward(module):
430
+ def custom_forward(*inputs):
431
+ return module(*inputs, past_key_value, output_attentions)
432
+
433
+ return custom_forward
434
+
435
+ layer_outputs = torch.utils.checkpoint.checkpoint(
436
+ create_custom_forward(layer_module),
437
+ hidden_states,
438
+ attention_mask,
439
+ layer_head_mask,
440
+ encoder_hidden_states,
441
+ encoder_attention_mask,
442
+ mode=mode,
443
+ )
444
+ else:
445
+ layer_outputs = layer_module(
446
+ hidden_states,
447
+ attention_mask,
448
+ layer_head_mask,
449
+ encoder_hidden_states,
450
+ encoder_attention_mask,
451
+ past_key_value,
452
+ output_attentions,
453
+ mode=mode,
454
+ )
455
+
456
+ hidden_states = layer_outputs[0]
457
+ if use_cache:
458
+ next_decoder_cache += (layer_outputs[-1],)
459
+ if output_attentions:
460
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
461
+
462
+ if output_hidden_states:
463
+ all_hidden_states = all_hidden_states + (hidden_states,)
464
+
465
+ if not return_dict:
466
+ return tuple(
467
+ v
468
+ for v in [
469
+ hidden_states,
470
+ next_decoder_cache,
471
+ all_hidden_states,
472
+ all_self_attentions,
473
+ all_cross_attentions,
474
+ ]
475
+ if v is not None
476
+ )
477
+ return BaseModelOutputWithPastAndCrossAttentions(
478
+ last_hidden_state=hidden_states,
479
+ past_key_values=next_decoder_cache,
480
+ hidden_states=all_hidden_states,
481
+ attentions=all_self_attentions,
482
+ cross_attentions=all_cross_attentions,
483
+ )
484
+
485
+
486
+ class BertPooler(nn.Module):
487
+ def __init__(self, config):
488
+ super().__init__()
489
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
490
+ self.activation = nn.Tanh()
491
+
492
+ def forward(self, hidden_states):
493
+ # We "pool" the model by simply taking the hidden state corresponding
494
+ # to the first token.
495
+ first_token_tensor = hidden_states[:, 0]
496
+ pooled_output = self.dense(first_token_tensor)
497
+ pooled_output = self.activation(pooled_output)
498
+ return pooled_output
499
+
500
+
501
+ class BertPredictionHeadTransform(nn.Module):
502
+ def __init__(self, config):
503
+ super().__init__()
504
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
505
+ if isinstance(config.hidden_act, str):
506
+ self.transform_act_fn = ACT2FN[config.hidden_act]
507
+ else:
508
+ self.transform_act_fn = config.hidden_act
509
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
510
+
511
+ def forward(self, hidden_states):
512
+ hidden_states = self.dense(hidden_states)
513
+ hidden_states = self.transform_act_fn(hidden_states)
514
+ hidden_states = self.LayerNorm(hidden_states)
515
+ return hidden_states
516
+
517
+
518
+ class BertLMPredictionHead(nn.Module):
519
+ def __init__(self, config):
520
+ super().__init__()
521
+ self.transform = BertPredictionHeadTransform(config)
522
+
523
+ # The output weights are the same as the input embeddings, but there is
524
+ # an output-only bias for each token.
525
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
526
+
527
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
528
+
529
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
530
+ self.decoder.bias = self.bias
531
+
532
+ def forward(self, hidden_states):
533
+ hidden_states = self.transform(hidden_states)
534
+ hidden_states = self.decoder(hidden_states)
535
+ return hidden_states
536
+
537
+
538
+ class BertOnlyMLMHead(nn.Module):
539
+ def __init__(self, config):
540
+ super().__init__()
541
+ self.predictions = BertLMPredictionHead(config)
542
+
543
+ def forward(self, sequence_output):
544
+ prediction_scores = self.predictions(sequence_output)
545
+ return prediction_scores
546
+
547
+
548
+ class BertPreTrainedModel(PreTrainedModel):
549
+ """
550
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
551
+ models.
552
+ """
553
+
554
+ config_class = BertConfig
555
+ base_model_prefix = "bert"
556
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
557
+
558
+ def _init_weights(self, module):
559
+ """ Initialize the weights """
560
+ if isinstance(module, (nn.Linear, nn.Embedding)):
561
+ # Slightly different from the TF version which uses truncated_normal for initialization
562
+ # cf https://github.com/pytorch/pytorch/pull/5617
563
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
564
+ elif isinstance(module, nn.LayerNorm):
565
+ module.bias.data.zero_()
566
+ module.weight.data.fill_(1.0)
567
+ if isinstance(module, nn.Linear) and module.bias is not None:
568
+ module.bias.data.zero_()
569
+
570
+
571
+ class BertModel(BertPreTrainedModel):
572
+ """
573
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
574
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
575
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
576
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
577
+ To be used as a decoder, the model needs to be initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
578
+ input to the forward pass.
579
+ """
580
+
581
+ def __init__(self, config, add_pooling_layer=True):
582
+ super().__init__(config)
583
+ self.config = config
584
+
585
+ self.embeddings = BertEmbeddings(config)
586
+
587
+ self.encoder = BertEncoder(config)
588
+
589
+ self.pooler = BertPooler(config) if add_pooling_layer else None
590
+
591
+ self.init_weights()
592
+
593
+
594
+ def get_input_embeddings(self):
595
+ return self.embeddings.word_embeddings
596
+
597
+ def set_input_embeddings(self, value):
598
+ self.embeddings.word_embeddings = value
599
+
600
+ def _prune_heads(self, heads_to_prune):
601
+ """
602
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
603
+ class PreTrainedModel
604
+ """
605
+ for layer, heads in heads_to_prune.items():
606
+ self.encoder.layer[layer].attention.prune_heads(heads)
607
+
608
+
609
+ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
610
+ """
611
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
612
+
613
+ Arguments:
614
+ attention_mask (:obj:`torch.Tensor`):
615
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
616
+ input_shape (:obj:`Tuple[int]`):
617
+ The shape of the input to the model.
618
+ device: (:obj:`torch.device`):
619
+ The device of the input to the model.
620
+
621
+ Returns:
622
+ :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
623
+ """
624
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
625
+ # ourselves in which case we just need to make it broadcastable to all heads.
626
+ if attention_mask.dim() == 3:
627
+ extended_attention_mask = attention_mask[:, None, :, :]
628
+ elif attention_mask.dim() == 2:
629
+ # Provided a padding mask of dimensions [batch_size, seq_length]
630
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
631
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
632
+ if is_decoder:
633
+ batch_size, seq_length = input_shape
634
+
635
+ seq_ids = torch.arange(seq_length, device=device)
636
+ causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
637
+ # in case past_key_values are used we need to add a prefix ones mask to the causal mask
638
+ # causal and attention masks must have same type with pytorch version < 1.3
639
+ causal_mask = causal_mask.to(attention_mask.dtype)
640
+
641
+ if causal_mask.shape[1] < attention_mask.shape[1]:
642
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
643
+ causal_mask = torch.cat(
644
+ [
645
+ torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
646
+ causal_mask,
647
+ ],
648
+ axis=-1,
649
+ )
650
+
651
+ extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
652
+ else:
653
+ extended_attention_mask = attention_mask[:, None, None, :]
654
+ else:
655
+ raise ValueError(
656
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
657
+ input_shape, attention_mask.shape
658
+ )
659
+ )
660
+
661
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
662
+ # masked positions, this operation will create a tensor which is 0.0 for
663
+ # positions we want to attend and -10000.0 for masked positions.
664
+ # Since we are adding it to the raw scores before the softmax, this is
665
+ # effectively the same as removing these entirely.
666
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
667
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
668
+ return extended_attention_mask
669
+
670
+ def forward(
671
+ self,
672
+ input_ids=None,
673
+ attention_mask=None,
674
+ position_ids=None,
675
+ head_mask=None,
676
+ inputs_embeds=None,
677
+ encoder_embeds=None,
678
+ encoder_hidden_states=None,
679
+ encoder_attention_mask=None,
680
+ past_key_values=None,
681
+ use_cache=None,
682
+ output_attentions=None,
683
+ output_hidden_states=None,
684
+ return_dict=None,
685
+ is_decoder=False,
686
+ mode='multimodal',
687
+ ):
688
+ r"""
689
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
690
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
691
+ the model is configured as a decoder.
692
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
693
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
694
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
695
+ - 1 for tokens that are **not masked**,
696
+ - 0 for tokens that are **masked**.
697
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
698
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
699
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
700
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
701
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
702
+ use_cache (:obj:`bool`, `optional`):
703
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
704
+ decoding (see :obj:`past_key_values`).
705
+ """
706
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
707
+ output_hidden_states = (
708
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
709
+ )
710
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
711
+
712
+ if is_decoder:
713
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
714
+ else:
715
+ use_cache = False
716
+
717
+ if input_ids is not None and inputs_embeds is not None:
718
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
719
+ elif input_ids is not None:
720
+ input_shape = input_ids.size()
721
+ batch_size, seq_length = input_shape
722
+ device = input_ids.device
723
+ elif inputs_embeds is not None:
724
+ input_shape = inputs_embeds.size()[:-1]
725
+ batch_size, seq_length = input_shape
726
+ device = inputs_embeds.device
727
+ elif encoder_embeds is not None:
728
+ input_shape = encoder_embeds.size()[:-1]
729
+ batch_size, seq_length = input_shape
730
+ device = encoder_embeds.device
731
+ else:
732
+ raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
733
+
734
+ # past_key_values_length
735
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
736
+
737
+ if attention_mask is None:
738
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
739
+
740
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
741
+ # ourselves in which case we just need to make it broadcastable to all heads.
742
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
743
+ device, is_decoder)
744
+
745
+ # If a 2D or 3D attention mask is provided for the cross-attention
746
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
747
+ if encoder_hidden_states is not None:
748
+ if type(encoder_hidden_states) == list:
749
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
750
+ else:
751
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
752
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
753
+
754
+ if type(encoder_attention_mask) == list:
755
+ encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
756
+ elif encoder_attention_mask is None:
757
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
758
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
759
+ else:
760
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
761
+ else:
762
+ encoder_extended_attention_mask = None
763
+
764
+ # Prepare head mask if needed
765
+ # 1.0 in head_mask indicate we keep the head
766
+ # attention_probs has shape bsz x n_heads x N x N
767
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
768
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
769
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
770
+
771
+ if encoder_embeds is None:
772
+ embedding_output = self.embeddings(
773
+ input_ids=input_ids,
774
+ position_ids=position_ids,
775
+ inputs_embeds=inputs_embeds,
776
+ past_key_values_length=past_key_values_length,
777
+ )
778
+ else:
779
+ embedding_output = encoder_embeds
780
+
781
+ encoder_outputs = self.encoder(
782
+ embedding_output,
783
+ attention_mask=extended_attention_mask,
784
+ head_mask=head_mask,
785
+ encoder_hidden_states=encoder_hidden_states,
786
+ encoder_attention_mask=encoder_extended_attention_mask,
787
+ past_key_values=past_key_values,
788
+ use_cache=use_cache,
789
+ output_attentions=output_attentions,
790
+ output_hidden_states=output_hidden_states,
791
+ return_dict=return_dict,
792
+ mode=mode,
793
+ )
794
+ sequence_output = encoder_outputs[0]
795
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
796
+
797
+ if not return_dict:
798
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
799
+
800
+ return BaseModelOutputWithPoolingAndCrossAttentions(
801
+ last_hidden_state=sequence_output,
802
+ pooler_output=pooled_output,
803
+ past_key_values=encoder_outputs.past_key_values,
804
+ hidden_states=encoder_outputs.hidden_states,
805
+ attentions=encoder_outputs.attentions,
806
+ cross_attentions=encoder_outputs.cross_attentions,
807
+ )
808
+
809
+
810
+
811
+ class BertLMHeadModel(BertPreTrainedModel):
812
+
813
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
814
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
815
+
816
+ def __init__(self, config):
817
+ super().__init__(config)
818
+
819
+ self.bert = BertModel(config, add_pooling_layer=False)
820
+ self.cls = BertOnlyMLMHead(config)
821
+
822
+ self.init_weights()
823
+
824
+ def get_output_embeddings(self):
825
+ return self.cls.predictions.decoder
826
+
827
+ def set_output_embeddings(self, new_embeddings):
828
+ self.cls.predictions.decoder = new_embeddings
829
+
830
+ def forward(
831
+ self,
832
+ input_ids=None,
833
+ attention_mask=None,
834
+ position_ids=None,
835
+ head_mask=None,
836
+ inputs_embeds=None,
837
+ encoder_hidden_states=None,
838
+ encoder_attention_mask=None,
839
+ labels=None,
840
+ past_key_values=None,
841
+ use_cache=None,
842
+ output_attentions=None,
843
+ output_hidden_states=None,
844
+ return_dict=None,
845
+ return_logits=False,
846
+ is_decoder=True,
847
+ reduction='mean',
848
+ mode='multimodal',
849
+ ):
850
+ r"""
851
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
852
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
853
+ the model is configured as a decoder.
854
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
855
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
856
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
857
+ - 1 for tokens that are **not masked**,
858
+ - 0 for tokens that are **masked**.
859
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
860
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
861
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
862
+ ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
863
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
864
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
865
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
866
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
867
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
868
+ use_cache (:obj:`bool`, `optional`):
869
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
870
+ decoding (see :obj:`past_key_values`).
871
+ Returns:
872
+ Example::
873
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
874
+ >>> import torch
875
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
876
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
877
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
878
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
879
+ >>> outputs = model(**inputs)
880
+ >>> prediction_logits = outputs.logits
881
+ """
882
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
883
+ if labels is not None:
884
+ use_cache = False
885
+
886
+ outputs = self.bert(
887
+ input_ids,
888
+ attention_mask=attention_mask,
889
+ position_ids=position_ids,
890
+ head_mask=head_mask,
891
+ inputs_embeds=inputs_embeds,
892
+ encoder_hidden_states=encoder_hidden_states,
893
+ encoder_attention_mask=encoder_attention_mask,
894
+ past_key_values=past_key_values,
895
+ use_cache=use_cache,
896
+ output_attentions=output_attentions,
897
+ output_hidden_states=output_hidden_states,
898
+ return_dict=return_dict,
899
+ is_decoder=is_decoder,
900
+ mode=mode,
901
+ )
902
+
903
+ sequence_output = outputs[0]
904
+ prediction_scores = self.cls(sequence_output)
905
+
906
+ if return_logits:
907
+ return prediction_scores[:, :-1, :].contiguous()
908
+
909
+ lm_loss = None
910
+ if labels is not None:
911
+ # we are doing next-token prediction; shift prediction scores and input ids by one
912
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
913
+ labels = labels[:, 1:].contiguous()
914
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
915
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
916
+ if reduction=='none':
917
+ lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1)
918
+
919
+ if not return_dict:
920
+ output = (prediction_scores,) + outputs[2:]
921
+ return ((lm_loss,) + output) if lm_loss is not None else output
922
+
923
+ return CausalLMOutputWithCrossAttentions(
924
+ loss=lm_loss,
925
+ logits=prediction_scores,
926
+ past_key_values=outputs.past_key_values,
927
+ hidden_states=outputs.hidden_states,
928
+ attentions=outputs.attentions,
929
+ cross_attentions=outputs.cross_attentions,
930
+ )
931
+
932
+ def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
933
+ input_shape = input_ids.shape
934
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
935
+ if attention_mask is None:
936
+ attention_mask = input_ids.new_ones(input_shape)
937
+
938
+ # cut decoder_input_ids if past is used
939
+ if past is not None:
940
+ input_ids = input_ids[:, -1:]
941
+
942
+ return {
943
+ "input_ids": input_ids,
944
+ "attention_mask": attention_mask,
945
+ "past_key_values": past,
946
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
947
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
948
+ "is_decoder": True,
949
+ }
950
+
951
+ def _reorder_cache(self, past, beam_idx):
952
+ reordered_past = ()
953
+ for layer_past in past:
954
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
955
+ return reordered_past
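For orientation, a minimal usage sketch of the decoder defined above, in the spirit of how BLIP's captioner drives it. Assumptions (not taken from this diff): the med_config.json under extras/BLIP/configs defines encoder_width and add_cross_attention=True, the module is importable as extras.BLIP.models.med from the repo root, and the image features are stand-in tensors; real use loads a BLIP checkpoint rather than random weights.

import torch
from transformers import BertTokenizer
from transformers.models.bert.configuration_bert import BertConfig
from extras.BLIP.models.med import BertLMHeadModel

config = BertConfig.from_json_file('extras/BLIP/configs/med_config.json')  # assumed path
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
decoder = BertLMHeadModel(config).eval()  # randomly initialized here; a sketch only

# Stand-in for visual-encoder output: (batch, num_patches + 1, encoder_width)
image_embeds = torch.randn(1, 197, config.encoder_width)
image_atts = torch.ones(image_embeds.shape[:-1], dtype=torch.long)

prompt = tokenizer('a picture of', return_tensors='pt')
output_ids = decoder.generate(
    input_ids=prompt.input_ids,
    max_length=20,
    encoder_hidden_states=image_embeds,      # forwarded by prepare_inputs_for_generation above
    encoder_attention_mask=image_atts,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))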
extras/BLIP/models/nlvr_encoder.py ADDED
@@ -0,0 +1,843 @@
1
+ import math
2
+ import os
3
+ import warnings
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ from torch import Tensor, device, dtype, nn
9
+ import torch.utils.checkpoint
10
+ from torch import nn
11
+ from torch.nn import CrossEntropyLoss
12
+ import torch.nn.functional as F
13
+
14
+ from transformers.activations import ACT2FN
15
+ from transformers.file_utils import (
16
+ ModelOutput,
17
+ )
18
+ from transformers.modeling_outputs import (
19
+ BaseModelOutputWithPastAndCrossAttentions,
20
+ BaseModelOutputWithPoolingAndCrossAttentions,
21
+ CausalLMOutputWithCrossAttentions,
22
+ MaskedLMOutput,
23
+ MultipleChoiceModelOutput,
24
+ NextSentencePredictorOutput,
25
+ QuestionAnsweringModelOutput,
26
+ SequenceClassifierOutput,
27
+ TokenClassifierOutput,
28
+ )
29
+ from transformers.modeling_utils import (
30
+ PreTrainedModel,
31
+ apply_chunking_to_forward,
32
+ find_pruneable_heads_and_indices,
33
+ prune_linear_layer,
34
+ )
35
+ from transformers.utils import logging
36
+ from transformers.models.bert.configuration_bert import BertConfig
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ class BertEmbeddings(nn.Module):
43
+ """Construct the embeddings from word and position embeddings."""
44
+
45
+ def __init__(self, config):
46
+ super().__init__()
47
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
48
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
49
+
50
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
51
+ # any TensorFlow checkpoint file
52
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
53
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
54
+
55
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
56
+ self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
57
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
58
+
59
+ self.config = config
60
+
61
+ def forward(
62
+ self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
63
+ ):
64
+ if input_ids is not None:
65
+ input_shape = input_ids.size()
66
+ else:
67
+ input_shape = inputs_embeds.size()[:-1]
68
+
69
+ seq_length = input_shape[1]
70
+
71
+ if position_ids is None:
72
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
73
+
74
+ if inputs_embeds is None:
75
+ inputs_embeds = self.word_embeddings(input_ids)
76
+
77
+ embeddings = inputs_embeds
78
+
79
+ if self.position_embedding_type == "absolute":
80
+ position_embeddings = self.position_embeddings(position_ids)
81
+ embeddings += position_embeddings
82
+ embeddings = self.LayerNorm(embeddings)
83
+ embeddings = self.dropout(embeddings)
84
+ return embeddings
85
+
86
+
87
+ class BertSelfAttention(nn.Module):
88
+ def __init__(self, config, is_cross_attention):
89
+ super().__init__()
90
+ self.config = config
91
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
92
+ raise ValueError(
93
+ "The hidden size (%d) is not a multiple of the number of attention "
94
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
95
+ )
96
+
97
+ self.num_attention_heads = config.num_attention_heads
98
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
99
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
100
+
101
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
102
+ if is_cross_attention:
103
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
104
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
105
+ else:
106
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
107
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
108
+
109
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
110
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
111
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
112
+ self.max_position_embeddings = config.max_position_embeddings
113
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
114
+ self.save_attention = False
115
+
116
+ def save_attn_gradients(self, attn_gradients):
117
+ self.attn_gradients = attn_gradients
118
+
119
+ def get_attn_gradients(self):
120
+ return self.attn_gradients
121
+
122
+ def save_attention_map(self, attention_map):
123
+ self.attention_map = attention_map
124
+
125
+ def get_attention_map(self):
126
+ return self.attention_map
127
+
128
+ def transpose_for_scores(self, x):
129
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
130
+ x = x.view(*new_x_shape)
131
+ return x.permute(0, 2, 1, 3)
132
+
133
+ def forward(
134
+ self,
135
+ hidden_states,
136
+ attention_mask=None,
137
+ head_mask=None,
138
+ encoder_hidden_states=None,
139
+ encoder_attention_mask=None,
140
+ past_key_value=None,
141
+ output_attentions=False,
142
+ ):
143
+ mixed_query_layer = self.query(hidden_states)
144
+
145
+ # If this is instantiated as a cross-attention module, the keys
146
+ # and values come from an encoder; the attention mask needs to be
147
+ # such that the encoder's padding tokens are not attended to.
148
+ is_cross_attention = encoder_hidden_states is not None
149
+
150
+ if is_cross_attention:
151
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
152
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
153
+ attention_mask = encoder_attention_mask
154
+ elif past_key_value is not None:
155
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
156
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
157
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
158
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
159
+ else:
160
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
161
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
162
+
163
+ query_layer = self.transpose_for_scores(mixed_query_layer)
164
+
165
+ past_key_value = (key_layer, value_layer)
166
+
167
+ # Take the dot product between "query" and "key" to get the raw attention scores.
168
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
169
+
170
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
171
+ seq_length = hidden_states.size()[1]
172
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
173
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
174
+ distance = position_ids_l - position_ids_r
175
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
176
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
177
+
178
+ if self.position_embedding_type == "relative_key":
179
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
180
+ attention_scores = attention_scores + relative_position_scores
181
+ elif self.position_embedding_type == "relative_key_query":
182
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
183
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
184
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
185
+
186
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
187
+ if attention_mask is not None:
188
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
189
+ attention_scores = attention_scores + attention_mask
190
+
191
+ # Normalize the attention scores to probabilities.
192
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
193
+
194
+ if is_cross_attention and self.save_attention:
195
+ self.save_attention_map(attention_probs)
196
+ attention_probs.register_hook(self.save_attn_gradients)
197
+
198
+ # This is actually dropping out entire tokens to attend to, which might
199
+ # seem a bit unusual, but is taken from the original Transformer paper.
200
+ attention_probs_dropped = self.dropout(attention_probs)
201
+
202
+ # Mask heads if we want to
203
+ if head_mask is not None:
204
+ attention_probs_dropped = attention_probs_dropped * head_mask
205
+
206
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
207
+
208
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
209
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
210
+ context_layer = context_layer.view(*new_context_layer_shape)
211
+
212
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
213
+
214
+ outputs = outputs + (past_key_value,)
215
+ return outputs
216
+
217
+
218
+ class BertSelfOutput(nn.Module):
219
+ def __init__(self, config, twin=False, merge=False):
220
+ super().__init__()
221
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
222
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
223
+ if twin:
224
+ self.dense0 = nn.Linear(config.hidden_size, config.hidden_size)
225
+ self.dense1 = nn.Linear(config.hidden_size, config.hidden_size)
226
+ else:
227
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
228
+ if merge:
229
+ self.act = ACT2FN[config.hidden_act]
230
+ self.merge_layer = nn.Linear(config.hidden_size * 2, config.hidden_size)
231
+ self.merge = True
232
+ else:
233
+ self.merge = False
234
+
235
+ def forward(self, hidden_states, input_tensor):
236
+ if type(hidden_states) == list:
237
+ hidden_states0 = self.dense0(hidden_states[0])
238
+ hidden_states1 = self.dense1(hidden_states[1])
239
+ if self.merge:
240
+ #hidden_states = self.merge_layer(self.act(torch.cat([hidden_states0,hidden_states1],dim=-1)))
241
+ hidden_states = self.merge_layer(torch.cat([hidden_states0,hidden_states1],dim=-1))
242
+ else:
243
+ hidden_states = (hidden_states0+hidden_states1)/2
244
+ else:
245
+ hidden_states = self.dense(hidden_states)
246
+ hidden_states = self.dropout(hidden_states)
247
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
248
+ return hidden_states
249
+
250
+
251
+ class BertAttention(nn.Module):
252
+ def __init__(self, config, is_cross_attention=False, layer_num=-1):
253
+ super().__init__()
254
+ if is_cross_attention:
255
+ self.self0 = BertSelfAttention(config, is_cross_attention)
256
+ self.self1 = BertSelfAttention(config, is_cross_attention)
257
+ else:
258
+ self.self = BertSelfAttention(config, is_cross_attention)
259
+ self.output = BertSelfOutput(config, twin=is_cross_attention, merge=(is_cross_attention and layer_num>=6))
260
+ self.pruned_heads = set()
261
+
262
+ def prune_heads(self, heads):
263
+ if len(heads) == 0:
264
+ return
265
+ heads, index = find_pruneable_heads_and_indices(
266
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
267
+ )
268
+
269
+ # Prune linear layers
270
+ self.self.query = prune_linear_layer(self.self.query, index)
271
+ self.self.key = prune_linear_layer(self.self.key, index)
272
+ self.self.value = prune_linear_layer(self.self.value, index)
273
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
274
+
275
+ # Update hyper params and store pruned heads
276
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
277
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
278
+ self.pruned_heads = self.pruned_heads.union(heads)
279
+
280
+ def forward(
281
+ self,
282
+ hidden_states,
283
+ attention_mask=None,
284
+ head_mask=None,
285
+ encoder_hidden_states=None,
286
+ encoder_attention_mask=None,
287
+ past_key_value=None,
288
+ output_attentions=False,
289
+ ):
290
+ if type(encoder_hidden_states)==list:
291
+ self_outputs0 = self.self0(
292
+ hidden_states,
293
+ attention_mask,
294
+ head_mask,
295
+ encoder_hidden_states[0],
296
+ encoder_attention_mask[0],
297
+ past_key_value,
298
+ output_attentions,
299
+ )
300
+ self_outputs1 = self.self1(
301
+ hidden_states,
302
+ attention_mask,
303
+ head_mask,
304
+ encoder_hidden_states[1],
305
+ encoder_attention_mask[1],
306
+ past_key_value,
307
+ output_attentions,
308
+ )
309
+ attention_output = self.output([self_outputs0[0],self_outputs1[0]], hidden_states)
310
+
311
+ outputs = (attention_output,) + self_outputs0[1:] # add attentions if we output them
312
+ else:
313
+ self_outputs = self.self(
314
+ hidden_states,
315
+ attention_mask,
316
+ head_mask,
317
+ encoder_hidden_states,
318
+ encoder_attention_mask,
319
+ past_key_value,
320
+ output_attentions,
321
+ )
322
+ attention_output = self.output(self_outputs[0], hidden_states)
323
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
324
+ return outputs
325
+
326
+
327
+ class BertIntermediate(nn.Module):
328
+ def __init__(self, config):
329
+ super().__init__()
330
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
331
+ if isinstance(config.hidden_act, str):
332
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
333
+ else:
334
+ self.intermediate_act_fn = config.hidden_act
335
+
336
+ def forward(self, hidden_states):
337
+ hidden_states = self.dense(hidden_states)
338
+ hidden_states = self.intermediate_act_fn(hidden_states)
339
+ return hidden_states
340
+
341
+
342
+ class BertOutput(nn.Module):
343
+ def __init__(self, config):
344
+ super().__init__()
345
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
346
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
347
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
348
+
349
+ def forward(self, hidden_states, input_tensor):
350
+ hidden_states = self.dense(hidden_states)
351
+ hidden_states = self.dropout(hidden_states)
352
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
353
+ return hidden_states
354
+
355
+
356
+ class BertLayer(nn.Module):
357
+ def __init__(self, config, layer_num):
358
+ super().__init__()
359
+ self.config = config
360
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
361
+ self.seq_len_dim = 1
362
+ self.attention = BertAttention(config)
363
+ self.layer_num = layer_num
364
+ if self.config.add_cross_attention:
365
+ self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention, layer_num=layer_num)
366
+ self.intermediate = BertIntermediate(config)
367
+ self.output = BertOutput(config)
368
+
369
+ def forward(
370
+ self,
371
+ hidden_states,
372
+ attention_mask=None,
373
+ head_mask=None,
374
+ encoder_hidden_states=None,
375
+ encoder_attention_mask=None,
376
+ past_key_value=None,
377
+ output_attentions=False,
378
+ mode=None,
379
+ ):
380
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
381
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
382
+ self_attention_outputs = self.attention(
383
+ hidden_states,
384
+ attention_mask,
385
+ head_mask,
386
+ output_attentions=output_attentions,
387
+ past_key_value=self_attn_past_key_value,
388
+ )
389
+ attention_output = self_attention_outputs[0]
390
+
391
+ outputs = self_attention_outputs[1:-1]
392
+ present_key_value = self_attention_outputs[-1]
393
+
394
+ if mode=='multimodal':
395
+ assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
396
+ cross_attention_outputs = self.crossattention(
397
+ attention_output,
398
+ attention_mask,
399
+ head_mask,
400
+ encoder_hidden_states,
401
+ encoder_attention_mask,
402
+ output_attentions=output_attentions,
403
+ )
404
+ attention_output = cross_attention_outputs[0]
405
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
406
+ layer_output = apply_chunking_to_forward(
407
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
408
+ )
409
+ outputs = (layer_output,) + outputs
410
+
411
+ outputs = outputs + (present_key_value,)
412
+
413
+ return outputs
414
+
415
+ def feed_forward_chunk(self, attention_output):
416
+ intermediate_output = self.intermediate(attention_output)
417
+ layer_output = self.output(intermediate_output, attention_output)
418
+ return layer_output
419
+
420
+
421
+ class BertEncoder(nn.Module):
422
+ def __init__(self, config):
423
+ super().__init__()
424
+ self.config = config
425
+ self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)])
426
+ self.gradient_checkpointing = False
427
+
428
+ def forward(
429
+ self,
430
+ hidden_states,
431
+ attention_mask=None,
432
+ head_mask=None,
433
+ encoder_hidden_states=None,
434
+ encoder_attention_mask=None,
435
+ past_key_values=None,
436
+ use_cache=None,
437
+ output_attentions=False,
438
+ output_hidden_states=False,
439
+ return_dict=True,
440
+ mode='multimodal',
441
+ ):
442
+ all_hidden_states = () if output_hidden_states else None
443
+ all_self_attentions = () if output_attentions else None
444
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
445
+
446
+ next_decoder_cache = () if use_cache else None
447
+
448
+ for i in range(self.config.num_hidden_layers):
449
+ layer_module = self.layer[i]
450
+ if output_hidden_states:
451
+ all_hidden_states = all_hidden_states + (hidden_states,)
452
+
453
+ layer_head_mask = head_mask[i] if head_mask is not None else None
454
+ past_key_value = past_key_values[i] if past_key_values is not None else None
455
+
456
+ if self.gradient_checkpointing and self.training:
457
+
458
+ if use_cache:
459
+ logger.warn(
460
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
461
+ )
462
+ use_cache = False
463
+
464
+ def create_custom_forward(module):
465
+ def custom_forward(*inputs):
466
+ return module(*inputs, past_key_value, output_attentions)
467
+
468
+ return custom_forward
469
+
470
+ layer_outputs = torch.utils.checkpoint.checkpoint(
471
+ create_custom_forward(layer_module),
472
+ hidden_states,
473
+ attention_mask,
474
+ layer_head_mask,
475
+ encoder_hidden_states,
476
+ encoder_attention_mask,
477
+ mode=mode,
478
+ )
479
+ else:
480
+ layer_outputs = layer_module(
481
+ hidden_states,
482
+ attention_mask,
483
+ layer_head_mask,
484
+ encoder_hidden_states,
485
+ encoder_attention_mask,
486
+ past_key_value,
487
+ output_attentions,
488
+ mode=mode,
489
+ )
490
+
491
+ hidden_states = layer_outputs[0]
492
+ if use_cache:
493
+ next_decoder_cache += (layer_outputs[-1],)
494
+ if output_attentions:
495
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
496
+
497
+ if output_hidden_states:
498
+ all_hidden_states = all_hidden_states + (hidden_states,)
499
+
500
+ if not return_dict:
501
+ return tuple(
502
+ v
503
+ for v in [
504
+ hidden_states,
505
+ next_decoder_cache,
506
+ all_hidden_states,
507
+ all_self_attentions,
508
+ all_cross_attentions,
509
+ ]
510
+ if v is not None
511
+ )
512
+ return BaseModelOutputWithPastAndCrossAttentions(
513
+ last_hidden_state=hidden_states,
514
+ past_key_values=next_decoder_cache,
515
+ hidden_states=all_hidden_states,
516
+ attentions=all_self_attentions,
517
+ cross_attentions=all_cross_attentions,
518
+ )
519
+
520
+
521
+ class BertPooler(nn.Module):
522
+ def __init__(self, config):
523
+ super().__init__()
524
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
525
+ self.activation = nn.Tanh()
526
+
527
+ def forward(self, hidden_states):
528
+ # We "pool" the model by simply taking the hidden state corresponding
529
+ # to the first token.
530
+ first_token_tensor = hidden_states[:, 0]
531
+ pooled_output = self.dense(first_token_tensor)
532
+ pooled_output = self.activation(pooled_output)
533
+ return pooled_output
534
+
535
+
536
+ class BertPredictionHeadTransform(nn.Module):
537
+ def __init__(self, config):
538
+ super().__init__()
539
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
540
+ if isinstance(config.hidden_act, str):
541
+ self.transform_act_fn = ACT2FN[config.hidden_act]
542
+ else:
543
+ self.transform_act_fn = config.hidden_act
544
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
545
+
546
+ def forward(self, hidden_states):
547
+ hidden_states = self.dense(hidden_states)
548
+ hidden_states = self.transform_act_fn(hidden_states)
549
+ hidden_states = self.LayerNorm(hidden_states)
550
+ return hidden_states
551
+
552
+
553
+ class BertLMPredictionHead(nn.Module):
554
+ def __init__(self, config):
555
+ super().__init__()
556
+ self.transform = BertPredictionHeadTransform(config)
557
+
558
+ # The output weights are the same as the input embeddings, but there is
559
+ # an output-only bias for each token.
560
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
561
+
562
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
563
+
564
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
565
+ self.decoder.bias = self.bias
566
+
567
+ def forward(self, hidden_states):
568
+ hidden_states = self.transform(hidden_states)
569
+ hidden_states = self.decoder(hidden_states)
570
+ return hidden_states
571
+
572
+
573
+ class BertOnlyMLMHead(nn.Module):
574
+ def __init__(self, config):
575
+ super().__init__()
576
+ self.predictions = BertLMPredictionHead(config)
577
+
578
+ def forward(self, sequence_output):
579
+ prediction_scores = self.predictions(sequence_output)
580
+ return prediction_scores
581
+
582
+
583
+ class BertPreTrainedModel(PreTrainedModel):
584
+ """
585
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
586
+ models.
587
+ """
588
+
589
+ config_class = BertConfig
590
+ base_model_prefix = "bert"
591
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
592
+
593
+ def _init_weights(self, module):
594
+ """ Initialize the weights """
595
+ if isinstance(module, (nn.Linear, nn.Embedding)):
596
+ # Slightly different from the TF version which uses truncated_normal for initialization
597
+ # cf https://github.com/pytorch/pytorch/pull/5617
598
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
599
+ elif isinstance(module, nn.LayerNorm):
600
+ module.bias.data.zero_()
601
+ module.weight.data.fill_(1.0)
602
+ if isinstance(module, nn.Linear) and module.bias is not None:
603
+ module.bias.data.zero_()
604
+
605
+
606
+ class BertModel(BertPreTrainedModel):
607
+ """
608
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
609
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
610
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
611
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
612
+ To behave as a decoder, the model needs to be initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
613
+ input to the forward pass.
614
+ """
615
+
616
+ def __init__(self, config, add_pooling_layer=True):
617
+ super().__init__(config)
618
+ self.config = config
619
+
620
+ self.embeddings = BertEmbeddings(config)
621
+
622
+ self.encoder = BertEncoder(config)
623
+
624
+ self.pooler = BertPooler(config) if add_pooling_layer else None
625
+
626
+ self.init_weights()
627
+
628
+
629
+ def get_input_embeddings(self):
630
+ return self.embeddings.word_embeddings
631
+
632
+ def set_input_embeddings(self, value):
633
+ self.embeddings.word_embeddings = value
634
+
635
+ def _prune_heads(self, heads_to_prune):
636
+ """
637
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
638
+ class PreTrainedModel
639
+ """
640
+ for layer, heads in heads_to_prune.items():
641
+ self.encoder.layer[layer].attention.prune_heads(heads)
642
+
643
+
644
+ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
645
+ """
646
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
647
+
648
+ Arguments:
649
+ attention_mask (:obj:`torch.Tensor`):
650
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
651
+ input_shape (:obj:`Tuple[int]`):
652
+ The shape of the input to the model.
653
+ device: (:obj:`torch.device`):
654
+ The device of the input to the model.
655
+
656
+ Returns:
657
+ :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
658
+ """
659
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
660
+ # ourselves in which case we just need to make it broadcastable to all heads.
661
+ if attention_mask.dim() == 3:
662
+ extended_attention_mask = attention_mask[:, None, :, :]
663
+ elif attention_mask.dim() == 2:
664
+ # Provided a padding mask of dimensions [batch_size, seq_length]
665
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
666
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
667
+ if is_decoder:
668
+ batch_size, seq_length = input_shape
669
+
670
+ seq_ids = torch.arange(seq_length, device=device)
671
+ causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
672
+ # in case past_key_values are used we need to add a prefix ones mask to the causal mask
673
+ # causal and attention masks must have same type with pytorch version < 1.3
674
+ causal_mask = causal_mask.to(attention_mask.dtype)
675
+
676
+ if causal_mask.shape[1] < attention_mask.shape[1]:
677
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
678
+ causal_mask = torch.cat(
679
+ [
680
+ torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
681
+ causal_mask,
682
+ ],
683
+ axis=-1,
684
+ )
685
+
686
+ extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
687
+ else:
688
+ extended_attention_mask = attention_mask[:, None, None, :]
689
+ else:
690
+ raise ValueError(
691
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
692
+ input_shape, attention_mask.shape
693
+ )
694
+ )
695
+
696
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
697
+ # masked positions, this operation will create a tensor which is 0.0 for
698
+ # positions we want to attend and -10000.0 for masked positions.
699
+ # Since we are adding it to the raw scores before the softmax, this is
700
+ # effectively the same as removing these entirely.
701
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
702
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
703
+ return extended_attention_mask
704
+
705
+ def forward(
706
+ self,
707
+ input_ids=None,
708
+ attention_mask=None,
709
+ position_ids=None,
710
+ head_mask=None,
711
+ inputs_embeds=None,
712
+ encoder_embeds=None,
713
+ encoder_hidden_states=None,
714
+ encoder_attention_mask=None,
715
+ past_key_values=None,
716
+ use_cache=None,
717
+ output_attentions=None,
718
+ output_hidden_states=None,
719
+ return_dict=None,
720
+ is_decoder=False,
721
+ mode='multimodal',
722
+ ):
723
+ r"""
724
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
725
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
726
+ the model is configured as a decoder.
727
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
728
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
729
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
730
+ - 1 for tokens that are **not masked**,
731
+ - 0 for tokens that are **masked**.
732
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
733
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
734
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
735
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
736
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
737
+ use_cache (:obj:`bool`, `optional`):
738
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
739
+ decoding (see :obj:`past_key_values`).
740
+ """
741
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
742
+ output_hidden_states = (
743
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
744
+ )
745
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
746
+
747
+ if is_decoder:
748
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
749
+ else:
750
+ use_cache = False
751
+
752
+ if input_ids is not None and inputs_embeds is not None:
753
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
754
+ elif input_ids is not None:
755
+ input_shape = input_ids.size()
756
+ batch_size, seq_length = input_shape
757
+ device = input_ids.device
758
+ elif inputs_embeds is not None:
759
+ input_shape = inputs_embeds.size()[:-1]
760
+ batch_size, seq_length = input_shape
761
+ device = inputs_embeds.device
762
+ elif encoder_embeds is not None:
763
+ input_shape = encoder_embeds.size()[:-1]
764
+ batch_size, seq_length = input_shape
765
+ device = encoder_embeds.device
766
+ else:
767
+ raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
768
+
769
+ # past_key_values_length
770
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
771
+
772
+ if attention_mask is None:
773
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
774
+
775
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
776
+ # ourselves in which case we just need to make it broadcastable to all heads.
777
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
778
+ device, is_decoder)
779
+
780
+ # If a 2D or 3D attention mask is provided for the cross-attention
781
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
782
+ if encoder_hidden_states is not None:
783
+ if type(encoder_hidden_states) == list:
784
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
785
+ else:
786
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
787
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
788
+
789
+ if type(encoder_attention_mask) == list:
790
+ encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
791
+ elif encoder_attention_mask is None:
792
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
793
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
794
+ else:
795
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
796
+ else:
797
+ encoder_extended_attention_mask = None
798
+
799
+ # Prepare head mask if needed
800
+ # 1.0 in head_mask indicate we keep the head
801
+ # attention_probs has shape bsz x n_heads x N x N
802
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
803
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
804
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
805
+
806
+ if encoder_embeds is None:
807
+ embedding_output = self.embeddings(
808
+ input_ids=input_ids,
809
+ position_ids=position_ids,
810
+ inputs_embeds=inputs_embeds,
811
+ past_key_values_length=past_key_values_length,
812
+ )
813
+ else:
814
+ embedding_output = encoder_embeds
815
+
816
+ encoder_outputs = self.encoder(
817
+ embedding_output,
818
+ attention_mask=extended_attention_mask,
819
+ head_mask=head_mask,
820
+ encoder_hidden_states=encoder_hidden_states,
821
+ encoder_attention_mask=encoder_extended_attention_mask,
822
+ past_key_values=past_key_values,
823
+ use_cache=use_cache,
824
+ output_attentions=output_attentions,
825
+ output_hidden_states=output_hidden_states,
826
+ return_dict=return_dict,
827
+ mode=mode,
828
+ )
829
+ sequence_output = encoder_outputs[0]
830
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
831
+
832
+ if not return_dict:
833
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
834
+
835
+ return BaseModelOutputWithPoolingAndCrossAttentions(
836
+ last_hidden_state=sequence_output,
837
+ pooler_output=pooled_output,
838
+ past_key_values=encoder_outputs.past_key_values,
839
+ hidden_states=encoder_outputs.hidden_states,
840
+ attentions=encoder_outputs.attentions,
841
+ cross_attentions=encoder_outputs.cross_attentions,
842
+ )
843
+
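What sets this encoder apart from med.py is the twin cross-attention branch (self0/self1 and dense0/dense1 above): when encoder_hidden_states is a list of two feature tensors, every layer cross-attends to both images and merges the results, which is how BLIP handles NLVR2's image pairs. A minimal sketch of that call path; the config path, encoder_width, and tensor shapes are assumptions for illustration, and real use loads pretrained weights.

import torch
from transformers.models.bert.configuration_bert import BertConfig
from extras.BLIP.models.nlvr_encoder import BertModel

config = BertConfig.from_json_file('extras/BLIP/configs/med_config.json')  # assumed path
encoder = BertModel(config, add_pooling_layer=False).eval()

input_ids = torch.randint(0, config.vocab_size, (2, 12))   # a pair of tokenized statements
attention_mask = torch.ones_like(input_ids)

# Features of the two images per example: (batch, num_patches + 1, encoder_width)
image0 = torch.randn(2, 197, config.encoder_width)
image1 = torch.randn(2, 197, config.encoder_width)
image_atts = torch.ones(2, 197, dtype=torch.long)

out = encoder(
    input_ids,
    attention_mask=attention_mask,
    encoder_hidden_states=[image0, image1],           # a list triggers the twin branch
    encoder_attention_mask=[image_atts, image_atts],
    return_dict=True,
)
cls_state = out.last_hidden_state[:, 0, :]             # blip_nlvr.py feeds this to its classifier head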
extras/BLIP/models/vit.py ADDED
@@ -0,0 +1,308 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ * Based on timm code base
8
+ * https://github.com/rwightman/pytorch-image-models/tree/master/timm
9
+ '''
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from functools import partial
15
+
16
+ from timm.models.vision_transformer import _cfg, PatchEmbed, resize_pos_embed  # resize_pos_embed is needed by _load_weights below
17
+ from timm.models.registry import register_model
18
+ from timm.models.layers import trunc_normal_, DropPath
19
+ from timm.models.helpers import named_apply, adapt_input_conv
20
+
21
+
22
+ def checkpoint_wrapper(x):
23
+ return x
24
+
25
+
26
+ class Mlp(nn.Module):
27
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks
28
+ """
29
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
30
+ super().__init__()
31
+ out_features = out_features or in_features
32
+ hidden_features = hidden_features or in_features
33
+ self.fc1 = nn.Linear(in_features, hidden_features)
34
+ self.act = act_layer()
35
+ self.fc2 = nn.Linear(hidden_features, out_features)
36
+ self.drop = nn.Dropout(drop)
37
+
38
+ def forward(self, x):
39
+ x = self.fc1(x)
40
+ x = self.act(x)
41
+ x = self.drop(x)
42
+ x = self.fc2(x)
43
+ x = self.drop(x)
44
+ return x
45
+
46
+
47
+ class Attention(nn.Module):
48
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
49
+ super().__init__()
50
+ self.num_heads = num_heads
51
+ head_dim = dim // num_heads
52
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
53
+ self.scale = qk_scale or head_dim ** -0.5
54
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
55
+ self.attn_drop = nn.Dropout(attn_drop)
56
+ self.proj = nn.Linear(dim, dim)
57
+ self.proj_drop = nn.Dropout(proj_drop)
58
+ self.attn_gradients = None
59
+ self.attention_map = None
60
+
61
+ def save_attn_gradients(self, attn_gradients):
62
+ self.attn_gradients = attn_gradients
63
+
64
+ def get_attn_gradients(self):
65
+ return self.attn_gradients
66
+
67
+ def save_attention_map(self, attention_map):
68
+ self.attention_map = attention_map
69
+
70
+ def get_attention_map(self):
71
+ return self.attention_map
72
+
73
+ def forward(self, x, register_hook=False):
74
+ B, N, C = x.shape
75
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
76
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
77
+
78
+ attn = (q @ k.transpose(-2, -1)) * self.scale
79
+ attn = attn.softmax(dim=-1)
80
+ attn = self.attn_drop(attn)
81
+
82
+ if register_hook:
83
+ self.save_attention_map(attn)
84
+ attn.register_hook(self.save_attn_gradients)
85
+
86
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
87
+ x = self.proj(x)
88
+ x = self.proj_drop(x)
89
+ return x
90
+
91
+
92
+ class Block(nn.Module):
93
+
94
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
95
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False):
96
+ super().__init__()
97
+ self.norm1 = norm_layer(dim)
98
+ self.attn = Attention(
99
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
100
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
101
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
102
+ self.norm2 = norm_layer(dim)
103
+ mlp_hidden_dim = int(dim * mlp_ratio)
104
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
105
+
106
+ if use_grad_checkpointing:
107
+ self.attn = checkpoint_wrapper(self.attn)
108
+ self.mlp = checkpoint_wrapper(self.mlp)
109
+
110
+ def forward(self, x, register_hook=False):
111
+ x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
112
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
113
+ return x
114
+
115
+
116
+ class VisionTransformer(nn.Module):
117
+ """ Vision Transformer
118
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
119
+ https://arxiv.org/abs/2010.11929
120
+ """
121
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
122
+ num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
123
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
124
+ use_grad_checkpointing=False, ckpt_layer=0):
125
+ """
126
+ Args:
127
+ img_size (int, tuple): input image size
128
+ patch_size (int, tuple): patch size
129
+ in_chans (int): number of input channels
130
+ num_classes (int): number of classes for classification head
131
+ embed_dim (int): embedding dimension
132
+ depth (int): depth of transformer
133
+ num_heads (int): number of attention heads
134
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
135
+ qkv_bias (bool): enable bias for qkv if True
136
+ qk_scale (float): override default qk scale of head_dim ** -0.5 if set
137
+ representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
138
+ drop_rate (float): dropout rate
139
+ attn_drop_rate (float): attention dropout rate
140
+ drop_path_rate (float): stochastic depth rate
141
+ norm_layer: (nn.Module): normalization layer
142
+ """
143
+ super().__init__()
144
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
145
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
146
+
147
+ self.patch_embed = PatchEmbed(
148
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
149
+
150
+ num_patches = self.patch_embed.num_patches
151
+
152
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
153
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
154
+ self.pos_drop = nn.Dropout(p=drop_rate)
155
+
156
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
157
+ self.blocks = nn.ModuleList([
158
+ Block(
159
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
160
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
161
+ use_grad_checkpointing=(use_grad_checkpointing and i>=depth-ckpt_layer)
162
+ )
163
+ for i in range(depth)])
164
+ self.norm = norm_layer(embed_dim)
165
+
166
+ trunc_normal_(self.pos_embed, std=.02)
167
+ trunc_normal_(self.cls_token, std=.02)
168
+ self.apply(self._init_weights)
169
+
170
+ def _init_weights(self, m):
171
+ if isinstance(m, nn.Linear):
172
+ trunc_normal_(m.weight, std=.02)
173
+ if isinstance(m, nn.Linear) and m.bias is not None:
174
+ nn.init.constant_(m.bias, 0)
175
+ elif isinstance(m, nn.LayerNorm):
176
+ nn.init.constant_(m.bias, 0)
177
+ nn.init.constant_(m.weight, 1.0)
178
+
179
+ @torch.jit.ignore
180
+ def no_weight_decay(self):
181
+ return {'pos_embed', 'cls_token'}
182
+
183
+ def forward(self, x, register_blk=-1):
184
+ B = x.shape[0]
185
+ x = self.patch_embed(x)
186
+
187
+ cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
188
+ x = torch.cat((cls_tokens, x), dim=1)
189
+
190
+ x = x + self.pos_embed[:,:x.size(1),:]
191
+ x = self.pos_drop(x)
192
+
193
+ for i,blk in enumerate(self.blocks):
194
+ x = blk(x, register_blk==i)
195
+ x = self.norm(x)
196
+
197
+ return x
198
+
199
+ @torch.jit.ignore()
200
+ def load_pretrained(self, checkpoint_path, prefix=''):
201
+ _load_weights(self, checkpoint_path, prefix)
202
+
203
+
204
+ @torch.no_grad()
205
+ def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
206
+ """ Load weights from .npz checkpoints for official Google Brain Flax implementation
207
+ """
208
+ import numpy as np
209
+
210
+ def _n2p(w, t=True):
211
+ if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
212
+ w = w.flatten()
213
+ if t:
214
+ if w.ndim == 4:
215
+ w = w.transpose([3, 2, 0, 1])
216
+ elif w.ndim == 3:
217
+ w = w.transpose([2, 0, 1])
218
+ elif w.ndim == 2:
219
+ w = w.transpose([1, 0])
220
+ return torch.from_numpy(w)
221
+
222
+ w = np.load(checkpoint_path)
223
+ if not prefix and 'opt/target/embedding/kernel' in w:
224
+ prefix = 'opt/target/'
225
+
226
+ if hasattr(model.patch_embed, 'backbone'):
227
+ # hybrid
228
+ backbone = model.patch_embed.backbone
229
+ stem_only = not hasattr(backbone, 'stem')
230
+ stem = backbone if stem_only else backbone.stem
231
+ stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
232
+ stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
233
+ stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
234
+ if not stem_only:
235
+ for i, stage in enumerate(backbone.stages):
236
+ for j, block in enumerate(stage.blocks):
237
+ bp = f'{prefix}block{i + 1}/unit{j + 1}/'
238
+ for r in range(3):
239
+ getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
240
+ getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
241
+ getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
242
+ if block.downsample is not None:
243
+ block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
244
+ block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
245
+ block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
246
+ embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
247
+ else:
248
+ embed_conv_w = adapt_input_conv(
249
+ model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
250
+ model.patch_embed.proj.weight.copy_(embed_conv_w)
251
+ model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
252
+ model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
253
+ pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
254
+ if pos_embed_w.shape != model.pos_embed.shape:
255
+ pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights
256
+ pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
257
+ model.pos_embed.copy_(pos_embed_w)
258
+ model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
259
+ model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
260
+ # if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
261
+ # model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
262
+ # model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
263
+ # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
264
+ # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
265
+ # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
266
+ for i, block in enumerate(model.blocks.children()):
267
+ block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
268
+ mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
269
+ block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
270
+ block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
271
+ block.attn.qkv.weight.copy_(torch.cat([
272
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
273
+ block.attn.qkv.bias.copy_(torch.cat([
274
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
275
+ block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
276
+ block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
277
+ for r in range(2):
278
+ getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
279
+ getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
280
+ block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
281
+ block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
282
+
283
+
284
+ def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
285
+ # interpolate position embedding
286
+ embedding_size = pos_embed_checkpoint.shape[-1]
287
+ num_patches = visual_encoder.patch_embed.num_patches
288
+ num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
289
+ # height (== width) for the checkpoint position embedding
290
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
291
+ # height (== width) for the new position embedding
292
+ new_size = int(num_patches ** 0.5)
293
+
294
+ if orig_size!=new_size:
295
+ # class_token and dist_token are kept unchanged
296
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
297
+ # only the position tokens are interpolated
298
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
299
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
300
+ pos_tokens = torch.nn.functional.interpolate(
301
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
302
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
303
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
304
+ print('reshape position embedding from %d to %d'%(orig_size ** 2,new_size ** 2))
305
+
306
+ return new_pos_embed
307
+ else:
308
+ return pos_embed_checkpoint
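interpolate_pos_embed above is what allows a checkpoint trained at one resolution (say 224x224) to be loaded into a ViT built for another (say 384x384): it reshapes the patch position tokens into a grid and bicubically resizes them. A minimal sketch, where the checkpoint tensor is a hypothetical stand-in for the 'pos_embed' entry of a real state dict:

import torch
from extras.BLIP.models.vit import VisionTransformer, interpolate_pos_embed

# Model built for 384x384 inputs: 24x24 = 576 patches plus one cls token
vit = VisionTransformer(img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12)

# Hypothetical checkpoint trained at 224x224: 14x14 = 196 patches plus one cls token
state_dict = {'pos_embed': torch.zeros(1, 197, 768)}   # stand-in for torch.load(...)['model']

# Resize the positional grid from 14x14 to 24x24, then load non-strictly
state_dict['pos_embed'] = interpolate_pos_embed(state_dict['pos_embed'], vit)
missing, unexpected = vit.load_state_dict(state_dict, strict=False)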
extras/__pycache__/expansion.cpython-310.pyc ADDED
Binary file (3.77 kB).
 
extras/__pycache__/face_crop.cpython-310.pyc ADDED
Binary file (1.49 kB).
 
extras/__pycache__/interrogate.cpython-310.pyc ADDED
Binary file (2.47 kB).
 
extras/__pycache__/ip_adapter.cpython-310.pyc ADDED
Binary file (8.67 kB).
 
extras/__pycache__/preprocessors.cpython-310.pyc ADDED
Binary file (2.66 kB).
 
extras/__pycache__/resampler.cpython-310.pyc ADDED
Binary file (3.28 kB).