File size: 2,027 Bytes
c7ce7f7
e97d4b7
 
 
 
1037518
1742431
 
1037518
fd8a558
5b61863
e60395e
 
e97d4b7
 
 
 
 
 
 
78693ca
e97d4b7
c7ed215
e97d4b7
 
1e3f5f7
 
 
 
e97d4b7
194e9f5
 
08ac218
 
 
1e3f5f7
 
1037518
 
 
 
c7ed215
53d65e5
 
 
 
 
 
 
9ddb0f5
53d65e5
 
 
 
 
 
e97d4b7
 
da5f2d4
1e3f5f7
e97d4b7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

import gradio as gr
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import requests
import base64
import tempfile



# Hugging Face checkpoint shared by the processor and the captioning model.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"

# Turns input images into model tensors and decodes generated token ids.
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)

# Caption generator, explicitly placed on the CPU.
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
model = model.to("cpu")


class Aspecto:
    """Empty placeholder type: it declares no attributes and no methods."""


# Module-level instance of the placeholder; nothing in the visible code reads it.
screen = Aspecto()
# Build the UI: an image input, a caption box filled by "Describir", and an
# audio player filled by "Leer". The handlers for those two primary buttons
# are attached further down, inside this same Blocks context.
with gr.Blocks(theme=gr.themes.Ocean(primary_hue="pink", neutral_hue="indigo", font=[gr.themes.GoogleFont("Montserrat"), "Playwrite England SemiJoine", "Quicksand"])) as demo:
    image = gr.Image(label="Imagen", sources=["upload", "clipboard"])
    with gr.Row():
        button = gr.Button("Describir", variant="primary")
        clear = gr.Button("Borrar")
    output = gr.Textbox(label="Resumen")
    with gr.Row():
        button2 = gr.Button("Leer", variant="primary")
        # Distinct name so this button does not overwrite the first "Borrar".
        clear2 = gr.Button("Borrar")
    output2 = gr.Audio(label="Audio")

    # Each "Borrar" button resets the components of its own section:
    # the first empties the image and the caption, the second the audio.
    # Returning None (or "" for the text box) clears a component's value.
    clear.click(lambda: (None, ""), None, [image, output])
    clear2.click(lambda: None, None, output2)

    def describir(image):
        """Generate a caption for ``image`` with the BLIP model.

        Args:
            image: The value delivered by the ``gr.Image`` input, or ``None``
                when the button was pressed without providing an image.

        Returns:
            The decoded caption, or an empty string when no image was given.
        """
        # Without an image there is nothing to caption; return an empty
        # caption instead of passing None on to the processor.
        if image is None:
            return ""

        inputs = processor(image, return_tensors="pt").to("cpu")
        generated = model.generate(**inputs)
        # Only one image is submitted per call, so decode the first sequence.
        return processor.decode(generated[0], skip_special_tokens=True)

    def leer(texto):
        """Convert ``texto`` to speech through the remote text-to-speech Space.

        Args:
            texto: Text to read aloud.

        Returns:
            Path of a temporary ``.flac`` file holding the audio, or ``None``
            when ``texto`` is empty or blank.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the service answers with an HTTP error status.
            ValueError: If the response payload is not a base64 data URI.
        """
        # Nothing to synthesize: skip the network round trip entirely.
        if not texto or not texto.strip():
            return None

        response = requests.post(
            "https://charly-text-to-speech.hf.space/run/predict",
            json={"data": [texto]},
            timeout=60,  # never let a stalled service hang the UI forever
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        data = response.json()["data"][0]

        # Drop the data-URI prefix (e.g. 'data:audio/flac;base64,') and keep
        # only the base64 payload that follows the first comma.
        _, separator, audio_base64 = data.partition(",")
        if not separator:
            raise ValueError("Text-to-speech response is not a base64 data URI")

        # Decode the base64 payload into raw audio bytes.
        audio_data = base64.b64decode(audio_base64)

        # delete=False keeps the file on disk after the handle is closed, so
        # the returned path is still readable by whoever receives it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".flac") as temp_audio_file:
            temp_audio_file.write(audio_data)

        return temp_audio_file.name


    # "Describir": caption the current image into the "Resumen" text box.
    button.click(fn=describir, inputs=[image], outputs=output)
    # "Leer": turn the caption text into speech for the "Audio" player.
    button2.click(fn=leer, inputs=[output], outputs=output2)

# Start the Gradio app held in `demo`, passing debug=True to the launcher.
demo.launch(debug=True)