Anshu13 commited on
Commit
4acb49a
·
verified ·
1 Parent(s): 8da5be1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -55
app.py CHANGED
@@ -7,6 +7,11 @@ from transformers import AutoProcessor, AutoModelForImageTextToText
7
 
8
  processor = AutoProcessor.from_pretrained("deepseek-community/Janus-Pro-1B", trust_remote_code=True)
9
  model = AutoModelForImageTextToText.from_pretrained("deepseek-community/Janus-Pro-1B", trust_remote_code=True)
 
 
 
 
 
10
  whisper_model = whisper.load_model("base")
11
 
12
  def build_instruction(user_text):
@@ -14,81 +19,63 @@ def build_instruction(user_text):
14
 
15
  def text_to_prompt(user_text):
16
  instruction = build_instruction(user_text)
17
- inputs = processor(text=instruction, return_tensors="pt")
18
 
19
  input_len = inputs.input_ids.shape[1]
20
 
21
- output = model.generate(**inputs, max_new_tokens=150)
22
-
23
- return processor.decode(output[0][input_len:], skip_special_tokens=True)
24
 
25
  def image_text_to_prompt(image_path, user_text):
26
- image = Image.open(image_path)
 
 
 
27
  instruction = build_instruction(user_text)
28
- inputs = processor(images=image, text=instruction, return_tensors="pt")
29
 
 
30
  input_len = inputs.input_ids.shape[1]
31
 
32
- output = model.generate(**inputs, max_new_tokens=150)
33
-
34
- return processor.decode(output[0][input_len:], skip_special_tokens=True)
35
 
36
  def audio_to_prompt(audio_path):
37
  result = whisper_model.transcribe(audio_path)
38
- text = result["text"]
39
- return text_to_prompt(text)
40
 
41
  def generate_prompt_ui(input_type, text, image, audio):
42
-
43
- if input_type == "Text":
44
- return text_to_prompt(text)
45
-
46
- elif input_type == "Image + Text":
47
- if image is None:
48
- return "Please upload an image"
49
- return image_text_to_prompt(image, text)
50
-
51
- elif input_type == "Audio":
52
- if audio is None:
53
- return "Please upload audio"
54
- return audio_to_prompt(audio)
55
-
56
- return "Invalid input"
57
-
58
  with gr.Blocks() as app:
59
-
60
- gr.Markdown("# 🧠 AI Prompt Generator")
61
-
62
- input_type = gr.Radio(
63
- ["Text", "Image + Text", "Audio"],
64
- label="Select Input Type"
65
- )
66
-
67
- text_input = gr.Textbox(label="Enter your idea/prompt")
68
-
69
- image_input = gr.Image(type="filepath", label="Upload Image")
70
-
71
- audio_input = gr.Audio(type="filepath", label="Upload Audio")
72
-
73
  output = gr.Textbox(label="Generated Prompt")
 
74
 
75
- generate_btn = gr.Button("Generate Prompt 🚀")
76
-
77
- def update_inputs(choice):
78
  return (
79
- gr.update(visible=(choice == "Text" or choice == "Image + Text")),
80
  gr.update(visible=(choice == "Image + Text")),
81
  gr.update(visible=(choice == "Audio"))
82
  )
83
- input_type.change(
84
- fn=update_inputs,
85
- inputs=input_type,
86
- outputs=[text_input, image_input, audio_input]
87
- )
88
- generate_btn.click(
89
- fn=generate_prompt_ui,
90
- inputs=[input_type, text_input, image_input, audio_input],
91
- outputs=output
92
- )
93
 
94
  app.launch()
 
# --- Model setup -------------------------------------------------------------
# Janus-Pro handles text and image+text prompting; Whisper transcribes audio.
MODEL_ID = "deepseek-community/Janus-Pro-1B"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, trust_remote_code=True)

# NOTE(review): this relies on `torch` being imported in the file's import
# block (not visible in this chunk) — confirm `import torch` is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

whisper_model = whisper.load_model("base")
16
 
17
  def build_instruction(user_text):
 
19
 
20
def text_to_prompt(user_text):
    """Generate a refined prompt string from plain text input.

    Wraps the user's text in the shared instruction template, runs the
    Janus-Pro model, and returns only the newly generated text.
    """
    instruction = build_instruction(user_text)
    batch = processor(text=instruction, return_tensors="pt").to(device)
    prompt_len = batch.input_ids.shape[1]
    generated = model.generate(**batch, max_new_tokens=200)
    # Slice off the echoed prompt tokens so only the model's answer is decoded.
    return processor.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()
 
28
 
29
def image_text_to_prompt(image_path, user_text):
    """Generate a prompt conditioned on an image plus optional user text.

    When *user_text* is empty/None, falls back to a generic description
    request so the model always receives an instruction.
    """
    user_text = user_text or "Describe this image in detail."

    # Normalize to RGB: the processor expects 3-channel images (palettized
    # or RGBA files would otherwise fail downstream).
    picture = Image.open(image_path).convert("RGB")
    instruction = build_instruction(user_text)

    batch = processor(images=[picture], text=instruction, return_tensors="pt").to(device)
    prompt_len = batch.input_ids.shape[1]
    generated = model.generate(**batch, max_new_tokens=200)
    # Decode only tokens produced after the input prompt.
    return processor.decode(generated[0][prompt_len:], skip_special_tokens=True).strip()
42
 
43
def audio_to_prompt(audio_path):
    """Transcribe speech with Whisper, then reuse the text pipeline."""
    transcription = whisper_model.transcribe(audio_path)
    return text_to_prompt(transcription["text"])
 
46
 
47
def generate_prompt_ui(input_type, text, image, audio):
    """Dispatch UI inputs to the matching generation pipeline.

    Parameters
    ----------
    input_type : str
        One of "Text", "Image + Text", "Audio" (from the Radio widget).
    text, image, audio :
        Raw widget values; image/audio are filepaths or None when unset.

    Returns
    -------
    str
        The generated prompt, or a human-readable message when input is
        missing/invalid or generation fails.
    """
    try:
        if input_type == "Text":
            return text_to_prompt(text)
        elif input_type == "Image + Text":
            # Guard before opening: Image.open(None) would raise a
            # confusing low-level error instead of a helpful message.
            if image is None:
                return "Please upload an image"
            return image_text_to_prompt(image, text)
        elif input_type == "Audio":
            if audio is None:
                return "Please upload audio"
            return audio_to_prompt(audio)
        # Unknown input_type: return an explicit message instead of None
        # (None renders as an empty textbox with no explanation).
        return "Invalid input"
    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        return f"Error: {str(e)}"
57
+
58
# Gradio UI setup
with gr.Blocks() as app:
    gr.Markdown("# 🧠 Janus-Pro Prompt Generator")

    input_type = gr.Radio(
        ["Text", "Image + Text", "Audio"],
        label="Select Input Type",
        value="Text",
    )

    # Only the text box starts visible; image/audio toggle in below.
    text_input = gr.Textbox(label="Enter your idea")
    image_input = gr.Image(type="filepath", label="Upload Image", visible=False)
    audio_input = gr.Audio(type="filepath", label="Upload Audio", visible=False)

    output = gr.Textbox(label="Generated Prompt")
    btn = gr.Button("Generate 🚀")

    def toggle(choice):
        # Text box shows for both text-bearing modes; image and audio
        # widgets appear only for their own mode.
        return (
            gr.update(visible=(choice != "Audio")),
            gr.update(visible=(choice == "Image + Text")),
            gr.update(visible=(choice == "Audio")),
        )

    input_type.change(toggle, input_type, [text_input, image_input, audio_input])
    btn.click(generate_prompt_ui, [input_type, text_input, image_input, audio_input], output)

app.launch()