ToletiSri committed
Commit 575a023
1 Parent(s): ae90516

Update app.py

Files changed (1)
  1. app.py +35 -12
app.py CHANGED
@@ -2,6 +2,9 @@ import torch
 import torch.nn as nn
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from torchvision import transforms
+from transformers import CLIPProcessor, CLIPModel
+from PIL import Image
 
 
 class _MLPVectorProjector(nn.Module):
@@ -29,17 +32,36 @@ tokenizer_text = AutoTokenizer.from_pretrained(model_name, trust_remote_code=Tru
 
 ## Audio model
 model_name_audio = "openai/whisper-small"
-#processor_audio = WhisperProcessor.from_pretrained("openai/whisper-small")
-#model_audio = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-#model_audio.config.forced_decoder_ids = None
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=model_name_audio,
-    chunk_length_s=30,
-    device="cpu",
-)
+pipe = pipeline(task="automatic-speech-recognition", model=model_name_audio,
+                chunk_length_s=30, device="cpu",)
 
 ## image model
+#Clip model
+model_id_clip = "openai/clip-vit-base-patch16"
+model_clip = CLIPModel.from_pretrained(model_id_clip).to("cpu")
+processor_clip = CLIPProcessor.from_pretrained(model_id_clip)
+
+# Preprocess the image for clip
+def preprocess_image(image_path):
+    image = Image.open(image_path).convert("RGB")
+    image = transforms.Resize((224, 224))(image)
+    image = transforms.ToTensor()(image)
+    return image.unsqueeze(0)
+
+# Get clip encoding
+def encode_image(image_path):
+    image = preprocess_image(image_path).to("cpu")
+    # Dummy input_ids for text
+    dummy_text = ""
+    inputs = processor_clip(text=dummy_text, images=image, return_tensors="pt", padding=True)
+    outputs = model_clip(**inputs)
+    img_embedding = outputs.image_embeds
+    return img_embedding
+
+#Get the projection model
+
+#Get the fine-tuned phi-2 model
+
 
 def example_inference(input_text, count): #, image, img_qn, audio):
     pred_text = textMode(input_text, count)
@@ -54,9 +76,9 @@ def textMode(text, count):
     phi2_text.generate(
         **inputs,
         max_new_tokens=count,
-        bos_token_id=tokenizer.bos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.pad_token_id
+        bos_token_id=tokenizer_text.bos_token_id,
+        eos_token_id=tokenizer_text.eos_token_id,
+        pad_token_id=tokenizer_text.pad_token_id
         )
     )
     return prediction[0].rstrip('<|endoftext|>').rstrip("\n")
@@ -64,6 +86,7 @@ def textMode(text, count):
 
 
 def imageMode(image, question):
+    image_embedding = encode_image(image)
     return "In progress"
 
 def audioMode(audio):
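Note: this commit only collapses the Whisper pipeline construction into two lines; audioMode itself is still a stub further down in app.py. A minimal sketch of how the new pipe could back it, assuming the Gradio audio input is configured as gr.Audio(type="filepath") so the handler receives a path (the fallback message is illustrative only):

def audioMode(audio):
    # Sketch only: transcribe the uploaded/recorded audio with the Whisper pipeline.
    # Assumes `audio` is a filepath string from a gr.Audio(type="filepath") input.
    if audio is None:
        return "No audio provided"
    result = pipe(audio)  # ASR pipelines return a dict with a "text" field
    return result["text"].strip()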
 
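The image branch also remains unfinished: encode_image produces the CLIP embedding, but the "#Get the projection model" and "#Get the fine-tuned phi-2 model" placeholders are empty and imageMode still returns "In progress". A rough sketch of the intended next step, with the dimensions as assumptions (512 for clip-vit-base-patch16's image_embeds, 2560 for phi-2's hidden size) and a generic MLP standing in for the repo's _MLPVectorProjector, whose real constructor is defined at the top of app.py and may differ:

# Sketch only: map the CLIP image embedding into the language model's space.
# clip_dim and phi2_dim are assumptions; swap in the actual _MLPVectorProjector.
clip_dim, phi2_dim = 512, 2560
projector = nn.Sequential(
    nn.Linear(clip_dim, phi2_dim),
    nn.GELU(),
    nn.Linear(phi2_dim, phi2_dim),
).to("cpu")

def imageMode(image, question):
    image_embedding = encode_image(image)    # shape (1, clip_dim)
    projected = projector(image_embedding)   # shape (1, phi2_dim)
    # Next: prepend `projected` to the question's token embeddings and run
    # the fine-tuned phi-2 model; not implemented in this commit.
    return "In progress"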