anas-awadalla committed
Commit f6a286e
1 Parent(s): 84256d5
app.py CHANGED
@@ -52,12 +52,12 @@ with open("bad_words.txt", "r") as f:
 model, image_processor, tokenizer = create_model_and_transforms(
     clip_vision_encoder_pretrained="openai",
     clip_vision_encoder_path="ViT-L-14",
-    lang_encoder_path="togethercomputer/RedPajama-INCITE-Base-3B-v1",
-    tokenizer_path="togethercomputer/RedPajama-INCITE-Base-3B-v1",
+    lang_encoder_path="anas-awadalla/mpt-7b",
+    tokenizer_path="anas-awadalla/mpt-7b",
     cross_attn_every_n_layers=2,
 )
 
-checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-4B-vitl-rpj3b", "checkpoint.pt")
+checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
 model.load_state_dict(torch.load(checkpoint_path), strict=False)
 
 model.eval()
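
For reference, the new configuration corresponds to the standalone loading snippet below. This is a minimal sketch: the `torch`, `huggingface_hub`, and `open_flamingo` imports are assumed from the project's GitHub repository and are not visible in this hunk; only the keyword arguments are confirmed by the diff itself.

```python
# Minimal sketch of the updated loading path (assumed imports; the keyword
# arguments below are the ones shown in the hunk above).
import torch
from huggingface_hub import hf_hub_download
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-7b",
    tokenizer_path="anas-awadalla/mpt-7b",
    cross_attn_every_n_layers=2,  # the 9B release adds cross-attention every 2 layers
)

# Download the OpenFlamingo-9B checkpoint from the Hub and load it on top of
# the assembled model; strict=False tolerates keys missing from the checkpoint.
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")
model.load_state_dict(torch.load(checkpoint_path), strict=False)
model.eval()
```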
@@ -112,9 +112,7 @@ def generate(
         demo_plus_text += (
             "<image>Output:" if idx != 2 else f"<image>Question: {text.strip()} Answer:"
         )
-        # demo_plus_image = [example_one_image, example_two_image, image]
 
-        # print(demo_plus_image)
     print(demo_plus_text)
 
     lang_x = tokenizer(demo_plus_text, return_tensors="pt")
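
To make the prompt format referenced in this hunk concrete, here is a hypothetical illustration of how two in-context demos and a VQA-style query could be batched for the model, assuming the `image_processor` and `tokenizer` from the loading snippet above. The image paths, captions, and question are placeholders, and the tensor layout follows the usage example in the open_flamingo README rather than this exact file.

```python
from PIL import Image
import torch

# Placeholder inputs; in app.py these come from the Gradio UI.
demo_image_one = Image.open("images/4645808729_2dfc59b6a5_z.jpg")
demo_image_two = Image.open("images/5944609705_4664531909_z.jpg")
query_image = Image.open("query.jpg")  # hypothetical user upload

# Few-shot prompt: two captioned demos, then a question about the query image.
demo_plus_text = (
    "<image>Output: a placeholder caption for the first demo.<|endofchunk|>"
    "<image>Output: a placeholder caption for the second demo.<|endofchunk|>"
    "<image>Question: What is in this image? Answer:"
)

# Stack the preprocessed images into (batch, num_media, num_frames, C, H, W).
vision_x = torch.cat(
    [image_processor(im).unsqueeze(0) for im in (demo_image_one, demo_image_two, query_image)],
    dim=0,
).unsqueeze(1).unsqueeze(0)

lang_x = tokenizer(demo_plus_text, return_tensors="pt")
```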
@@ -132,11 +130,10 @@ def generate(
         lang_x=input_ids,
         attention_mask=attention_mask,
         max_new_tokens=50,
-        num_beams=3,
+        num_beams=5,
         do_sample=True,
-        top_p=0.9,
+        top_p=0.95,
         top_k=0,
-        no_repeat_ngram_size=3,
     )
 
     gen_text = tokenizer.decode(
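
As a usage note on the changed values: in the Hugging Face `generate` API, `num_beams=5` together with `do_sample=True` selects beam-sample search, `top_p=0.95` applies nucleus filtering, `top_k=0` disables top-k filtering, and dropping `no_repeat_ngram_size=3` removes the explicit 3-gram repetition ban. A minimal sketch of the call with the new settings, assuming `vision_x` and `lang_x` prepared as in the illustration above:

```python
# Generate with the updated decoding settings; model.generate is the
# open_flamingo wrapper that forwards these kwargs to the language model.
generated = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=50,
    num_beams=5,      # beam-sample search, since do_sample=True
    do_sample=True,
    top_p=0.95,       # nucleus sampling threshold
    top_k=0,          # 0 disables top-k filtering
)
gen_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(gen_text)
```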
@@ -178,7 +175,7 @@ with gr.Blocks() as demo:
     Blog posts: #1 [An open-source framework for training vision-language models with in-context learning (like GPT-4!)](https://laion.ai/blog/open-flamingo/) // #2 [OpenFlamingo v2: New Models and Enhanced Training Setup]()\n
     GitHub: [open_flamingo](https://github.com/mlfoundations/open_flamingo)
 
-    In this demo we implement an interactive interface that showcases the in-context learning capabilities of the OpenFlamingo-9B model, a large multimodal model trained on top of LLaMA-7B.
+    In this demo we implement an interactive interface that showcases the in-context learning capabilities of the OpenFlamingo-9B model, a large multimodal model trained on top of MPT-7B.
     The model is trained on an interleaved mixture of text and images and is able to generate text conditioned on sequences of images/text. To safeguard against harmful generations, we detect toxic text in the model output and reject it. However, we understand that this is not a perfect solution and we encourage you to use this demo responsibly. If you find that the model is generating harmful text, please report it using this [form](https://forms.gle/StbcPvyyW2p3Pc7z6).
     """
     )
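
The safeguard described in this hunk relies on the bad_words.txt list opened at the top of app.py (visible in the first hunk's context line). The matching logic itself is not part of this diff; the following is only a hypothetical sketch of what a word-list rejection check of that kind can look like, with the function name, matching rule, and fallback message invented for illustration.

```python
def contains_bad_words(text: str, bad_words: set) -> bool:
    """Hypothetical filter: flag the output if any listed term appears as a word."""
    tokens = {tok.strip(".,!?;:\"'").lower() for tok in text.split()}
    return any(word.lower() in tokens for word in bad_words)

# Hypothetical usage mirroring the "detect and reject" behavior described above.
with open("bad_words.txt") as f:
    bad_words = {line.strip() for line in f if line.strip()}

if contains_bad_words(gen_text, bad_words):
    gen_text = "[output withheld by the toxicity filter]"  # placeholder rejection message
```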
@@ -186,7 +183,7 @@ with gr.Blocks() as demo:
     with gr.Accordion("See terms and conditions"):
         gr.Markdown("""**Please read the following information carefully before proceeding.**
         [OpenFlamingo-9B](https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b) is a **research prototype** that aims to enable users to interact with AI through both language and images. AI agents equipped with both language and visual understanding can be useful on a larger variety of tasks compared to models that communicate solely via language. By releasing an open-source research prototype, we hope to help the research community better understand the risks and limitations of modern visual-language AI models and accelerate the development of safer and more reliable methods.
-        **Limitations.** OpenFlamingo-9B is built on top of the [MPT-7B](https://huggingface.co/mosaicml/mpt-7b) large language model developed by Meta AI. Large language models are trained on mostly unfiltered internet data, and have been shown to be able to produce toxic, unethical, inaccurate, and harmful content. On top of this, OpenFlamingo’s ability to support visual inputs creates additional risks, since it can be used in a wider variety of applications; image+text models may carry additional risks specific to multimodality. Please use discretion when assessing the accuracy or appropriateness of the model’s outputs, and be mindful before sharing its results.
+        **Limitations.** OpenFlamingo-9B is built on top of the [MPT-7B](https://huggingface.co/mosaicml/mpt-7b) large language model developed by MosaicML. Large language models are trained on mostly unfiltered internet data, and have been shown to be able to produce toxic, unethical, inaccurate, and harmful content. On top of this, OpenFlamingo’s ability to support visual inputs creates additional risks, since it can be used in a wider variety of applications; image+text models may carry additional risks specific to multimodality. Please use discretion when assessing the accuracy or appropriateness of the model’s outputs, and be mindful before sharing its results.
         **Privacy and data collection.** This demo does NOT store any personal information on its users, and it does NOT store user queries.""")
         read_tc = gr.Checkbox(
             label="I have read and agree to the terms and conditions")
 
images/4645808729_2dfc59b6a5_z.jpg CHANGED
images/5944609705_4664531909_z.jpg CHANGED