ManishThota committed on
Commit
fa7747b
1 Parent(s): 3295429

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -31
app.py CHANGED
@@ -58,59 +58,95 @@ def extract_frames(frame):
58
 
59
  return image_bgr
60
 
61
- def predict_answer(image, video, question, max_tokens=100):
62
 
63
  text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
64
  input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
65
 
66
-
67
- if image:
68
- # Process as an image
69
- image = image.convert("RGB")
70
  image_tensor = model.image_preprocess(image)
71
-
72
- #Generate the answer
73
- output_ids = model.generate(
74
- input_ids,
75
- max_new_tokens=max_tokens,
76
- images=image_tensor,
77
- use_cache=True)[0]
78
-
79
- return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
80
-
81
- elif video:
82
- # Process as a video
83
- frames = video_to_frames(video)
84
- answers = []
85
- for frame in frames:
86
- image = extract_frames(frame)
87
- image_tensor = model.image_preprocess(image)
88
 
89
- # Generate the answer
90
- output_ids = model.generate(
91
  input_ids,
92
  max_new_tokens=max_tokens,
93
  images=image_tensor,
94
  use_cache=True)[0]
95
 
96
- answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
97
- answers.append(answer)
98
  return answers
 
 
 
 
 
 
 
99
 
100
- else:
101
- return "Unsupported file type. Please upload an image or video."
 
 
 
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
 
105
 
106
- def gradio_predict(image, video, question, max_tokens):
107
- answer = predict_answer(image, video, question, max_tokens)
108
  return answer
109
 
110
  iface = gr.Interface(
111
  fn=gradio_predict,
112
  inputs=[
113
- gr.Image(type="pil", label="Upload or Drag an Image"),
114
  gr.Video(label="Upload your video here"),
115
  gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
116
  gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
 
58
 
59
  return image_bgr
60
 
61
+ def predict_answer(video, question, max_tokens=100):
62
 
63
  text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
64
  input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
65
 
66
+ frames = video_to_frames(video)
67
+ answers = []
68
+ for i in range(len(frames)):
69
+ image = extract_frames(frames[i])
70
  image_tensor = model.image_preprocess(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ # Generate the answer
73
+ output_ids = model.generate(
74
  input_ids,
75
  max_new_tokens=max_tokens,
76
  images=image_tensor,
77
  use_cache=True)[0]
78
 
79
+ answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
80
+ answers.append(answer)
81
  return answers
82
+
83
+
84
+
85
+ # if image:
86
+ # # Process as an image
87
+ # image = image.convert("RGB")
88
+ # image_tensor = model.image_preprocess(image)
89
 
90
+ # #Generate the answer
91
+ # output_ids = model.generate(
92
+ # input_ids,
93
+ # max_new_tokens=max_tokens,
94
+ # images=image_tensor,
95
+ # use_cache=True)[0]
96
 
97
+ # return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
98
+
99
+ # elif video:
100
+ # # Process as a video
101
+ # frames = video_to_frames(video)
102
+ # answers = []
103
+ # for frame in frames:
104
+ # image = extract_frames(frame)
105
+ # image_tensor = model.image_preprocess(image)
106
+
107
+ # # Generate the answer
108
+ # output_ids = model.generate(
109
+ # input_ids,
110
+ # max_new_tokens=max_tokens,
111
+ # images=image_tensor,
112
+ # use_cache=True)[0]
113
+
114
+ # answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
115
+ # answers.append(answer)
116
+ # return answers
117
+
118
+ # else:
119
+ # return "Unsupported file type. Please upload an image or video."
120
+
121
+
122
+
123
+
124
+ # def gradio_predict(image, video, question, max_tokens):
125
+ # answer = predict_answer(image, video, question, max_tokens)
126
+ # return answer
127
 
128
+ # iface = gr.Interface(
129
+ # fn=gradio_predict,
130
+ # inputs=[
131
+ # gr.Image(type="pil", label="Upload or Drag an Image"),
132
+ # gr.Video(label="Upload your video here"),
133
+ # gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
134
+ # gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
135
+ # outputs=gr.TextArea(label="Answer"),
136
+ # # outputs=gr.Image(label="Output"),
137
+ # title="Video/Image Viewer",
138
+ # description="Upload an image or video to view it or extract frames from the video.",
139
+ # )
140
 
141
+ # iface.launch(debug=True)
142
 
143
+ def gradio_predict(video, question, max_tokens):
144
+ answer = predict_answer(video, question, max_tokens)
145
  return answer
146
 
147
  iface = gr.Interface(
148
  fn=gradio_predict,
149
  inputs=[
 
150
  gr.Video(label="Upload your video here"),
151
  gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
152
  gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],