# Hugging Face Space: ake178178/pix2contenthuggingface (status: Paused)
# File size: 1,372 bytes
# Revisions touching this file: b231190, 88cb1c9, ded753e, f815f1f

import streamlit as st
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from gtts import gTTS
import tempfile
import os

# Streamlit re-executes this whole script on every widget interaction, so the
# model load goes through a cached loader. ``st.cache_resource`` is used when
# the installed Streamlit provides it; otherwise the loader runs uncached,
# which matches the previous behaviour of loading on every run.
_cache_resource = getattr(st, "cache_resource", lambda func: func)

BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"


@_cache_resource
def _load_blip():
    """Load the BLIP processor and the conditional-generation model.

    Returns:
        A ``(processor, model)`` tuple, both loaded from ``BLIP_CHECKPOINT``.
    """
    return (
        BlipProcessor.from_pretrained(BLIP_CHECKPOINT),
        BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT),
    )


# Module-level names are kept so the rest of the script is unchanged.
processor, model = _load_blip()

st.title("图像描述生成器")
st.write("使用摄像头拍照并生成图像的描述。")

# Ask the user for a camera snapshot; the rest of the page only renders once
# a picture is available.
image_data = st.camera_input("请使用摄像头拍照")

if image_data is not None:
    # Decode the captured picture into a PIL image and echo it back.
    image = Image.open(image_data)
    st.image(image, caption="拍摄的图像", use_column_width=True)

    # Generate the caption with BLIP.
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    st.write(f"图像描述: {caption}")

    # gTTS refuses empty text, so only synthesise speech for a real caption.
    if caption.strip():
        tts = gTTS(text=caption, lang='en')

        # Reserve a unique .mp3 path and close the handle before gTTS writes
        # to it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            audio_file = fp.name

        try:
            tts.save(audio_file)
            # Play the synthesised audio in the page.
            st.audio(audio_file)
        finally:
            # Always clean up the temp file, even if synthesis or playback
            # raised, so failed runs do not leave .mp3 files behind.
            os.remove(audio_file)