ManishThota committed
Commit 0a6288f
1 Parent(s): ae8423a

Update app.py

Files changed (1)
  1. app.py +48 -66
app.py CHANGED
@@ -58,95 +58,77 @@ def extract_frames(frame):

    return image_bgr

-def predict_answer(video, question, max_tokens=100):
+def predict_answer(image, video, question, max_tokens=100):

    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

-    frames = video_to_frames(video)
-    answers = []
-    for i in range(len(frames)):
-        image = extract_frames(frames[i])
-        image_tensor = model.image_preprocess([image])
+    # frames = video_to_frames(video)
+    # answers = []
+    # for i in range(len(frames)):
+    #     image = extract_frames(frames[i])
+    #     image_tensor = model.image_preprocess([image])

-        # Generate the answer
-        output_ids = model.generate(
-            input_ids,
-            max_new_tokens=max_tokens,
-            images=image_tensor,
-            use_cache=True)[0]
+    #     # Generate the answer
+    #     output_ids = model.generate(
+    #         input_ids,
+    #         max_new_tokens=max_tokens,
+    #         images=image_tensor,
+    #         use_cache=True)[0]

-        answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-        answers.append(answer)
-    return answers
+    #     answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+    #     answers.append(answer)
+    # return answers



-    # if image:
-    #     # Process as an image
-    #     image = image.convert("RGB")
-    #     image_tensor = model.image_preprocess(image)
+    if image:
+        # Process as an image
+        image = image.convert("RGB")
+        image_tensor = model.image_preprocess(image)

-    #     #Generate the answer
-    #     output_ids = model.generate(
-    #         input_ids,
-    #         max_new_tokens=max_tokens,
-    #         images=image_tensor,
-    #         use_cache=True)[0]
+        #Generate the answer
+        output_ids = model.generate(
+            input_ids,
+            max_new_tokens=max_tokens,
+            images=image_tensor,
+            use_cache=True)[0]

-    #     return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

-    # elif video:
-    #     # Process as a video
-    #     frames = video_to_frames(video)
-    #     answers = []
-    #     for frame in frames:
-    #         image = extract_frames(frame)
-    #         image_tensor = model.image_preprocess(image)
+    elif video:
+        # Process as a video
+        frames = video_to_frames(video)
+        answers = []
+        for frame in frames:
+            image = extract_frames(frame)
+            image_tensor = model.image_preprocess([image])

-    #         # Generate the answer
-    #         output_ids = model.generate(
-    #             input_ids,
-    #             max_new_tokens=max_tokens,
-    #             images=image_tensor,
-    #             use_cache=True)[0]
+            # Generate the answer
+            output_ids = model.generate(
+                input_ids,
+                max_new_tokens=max_tokens,
+                images=image_tensor,
+                use_cache=True)[0]

-    #         answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    #         answers.append(answer)
-    #     return answers
+            answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+            answers.append(answer)
+        return answers

-    # else:
-    #     return "Unsupported file type. Please upload an image or video."
+    else:
+        return "Unsupported file type. Please upload an image or video."




-# def gradio_predict(image, video, question, max_tokens):
-#     answer = predict_answer(image, video, question, max_tokens)
-#     return answer
-
-# iface = gr.Interface(
-#     fn=gradio_predict,
-#     inputs=[
-#         gr.Image(type="pil", label="Upload or Drag an Image"),
-#         gr.Video(label="Upload your video here"),
-#         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-#         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
-#     outputs=gr.TextArea(label="Answer"),
-#     # outputs=gr.Image(label="Output"),
-#     title="Video/Image Viewer",
-#     description="Upload an image or video to view it or extract frames from the video.",
-# )
-
-# iface.launch(debug=True)
-
-def gradio_predict(video, question, max_tokens):
-    answer = predict_answer(video, question, max_tokens)
+def gradio_predict(image, video, question, max_tokens):
+    answer = predict_answer(image, video, question, max_tokens)
    return answer

iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
+        gr.Image(type="pil", label="Upload or Drag an Image"),
        gr.Video(label="Upload your video here"),
        gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
        gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
@@ -156,4 +138,4 @@ iface = gr.Interface(
    description="Upload an image or video to view it or extract frames from the video.",
)

-iface.launch(debug=True)
+iface.launch(debug=True)
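
Note: the changed code relies on two helpers defined outside this hunk, video_to_frames(video) and extract_frames(frame); only the tail of extract_frames (return image_bgr) appears as context above. As a rough illustration only, a minimal OpenCV-based video_to_frames could look like the sketch below. The sampling interval every_n is an invented parameter and the real implementation in app.py may differ; extract_frames would then turn each sampled frame into whatever model.image_preprocess expects.

# Hypothetical sketch, not part of the commit: sample frames from the uploaded video.
import cv2

def video_to_frames(video_path, every_n=30):
    """Return every every_n-th frame of the video as a BGR numpy array."""
    cap = cv2.VideoCapture(video_path)  # gr.Video hands the callback a file path
    frames = []
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % every_n == 0:
            frames.append(frame)
        idx += 1
    cap.release()
    return frames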
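
Because predict_answer now branches on whichever input is provided, the interface can be driven with only an image or only a video. A hypothetical local smoke test (placeholder file names, with the model and tokenizer already loaded as in app.py):

# Hypothetical usage, not part of the commit; file names are placeholders.
from PIL import Image

print(gradio_predict(Image.open("slide.png"), None, "Can you explain the slide?", 100))  # image branch
print(gradio_predict(None, "demo.mp4", "What is happening in this video?", 100))         # video branch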