histlearn committed on
Commit
e73a5e2
1 Parent(s): d6ba35f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -198
app.py CHANGED
@@ -1,195 +1,11 @@
1
  import gradio as gr
2
- from transformers import AutoProcessor, AutoModelForCausalLM
3
  from PIL import Image
4
  import torch
5
  from gtts import gTTS
6
- import spacy
7
- import requests
8
- import nltk.tree
9
- import re
10
  import os
11
 
12
# Load the Portuguese spaCy pipeline used to tokenise sentences.
nlp = spacy.load("pt_core_news_sm")

# Access key for the LX-Parser web service.
# SECURITY: this credential is committed in plain text. Supply it through the
# LX_PARSER_KEY environment variable instead; the literal is kept only as a
# backward-compatible fallback and should be rotated and removed.
key = os.environ.get("LX_PARSER_KEY", "eb159d39469d84f0ff47167a4d89cada")
17
-
18
- # Funções de manipulação gramatical
19
def invert_adj_n(doc, tags):
    """Return the token texts of *doc* with each ``"A"``/``"N"`` pair swapped.

    Args:
        doc: Indexable sequence of tokens exposing ``.text`` and ``.tag_``.
        tags: Parser tags matched to *doc* by position. ``"A"`` and ``"N"``
            presumably mark adjectives and nouns (going by the function name).

    Returns:
        A list of strings: the token texts in order, except that a token
        tagged ``"A"`` immediately followed by one tagged ``"N"`` is emitted
        after it. Tokens whose ``tag_`` is ``"PUNCT"`` are never swapped.
    """
    frase = []
    total = len(doc)
    i = 0
    while i < total:
        token = doc[i]
        # `tags` and `doc` come from different tokenisers and may differ in
        # length; a position without a tag (or without a following token) is
        # simply copied through instead of raising IndexError.
        swap = (
            token.tag_ != "PUNCT"
            and i + 1 < total
            and i + 1 < len(tags)
            and tags[i] == "A"
            and tags[i + 1] == "N"
        )
        if swap:
            frase.append(doc[i + 1].text)
            frase.append(token.text)
            i += 2  # the following token has already been emitted
        else:
            frase.append(token.text)
            i += 1
    return frase
39
-
40
def adjust_adj(doc, tags):
    """Return the token texts of *doc* with ``"e"`` between adjacent ``"A"`` tags.

    Args:
        doc: Iterable of tokens exposing ``.text``.
        tags: Parser tags matched to *doc* by position.

    Returns:
        A list of strings: every token text in order, with the word ``"e"``
        inserted after a token tagged ``"A"`` whose successor is also ``"A"``.
    """
    frase = []
    for i, token in enumerate(doc):
        frase.append(token.text)
        # Checking the bound first keeps a tag list shorter than `doc` from
        # raising IndexError.
        if i + 1 < len(tags) and tags[i] == "A" and tags[i + 1] == "A":
            frase.append("e")
    return frase
48
-
49
def adjust_art(doc, tags):
    """Return the token texts of *doc* with the article ``"a"`` made to agree.

    A token tagged ``"ART"`` whose text is ``"a"`` (case-insensitive) is
    replaced according to the ``Gender`` and ``Number`` morphology of the
    token that follows it: ``"um"`` (Masc/Sing), ``"uma"`` (Fem/Sing),
    ``"os"`` (Masc, not Sing) or ``"as"`` (anything else). When there is no
    following token, or it lacks either feature, the text is kept unchanged.

    Args:
        doc: Indexable sequence of tokens exposing ``.text`` and
            ``.morph.get(name)``.
        tags: Parser tags matched to *doc* by position.

    Returns:
        A list of strings with one entry per token of *doc*.
    """
    frase = []
    total = len(doc)
    for i, token in enumerate(doc):
        text = token.text
        # A missing tag (tag list shorter than doc) means "not an article".
        is_article = i < len(tags) and tags[i] == "ART" and text.lower() == "a"
        if not (is_article and i + 1 < total):
            frase.append(text)
            continue

        following = doc[i + 1]
        gender = following.morph.get("Gender")
        number = following.morph.get("Number")
        if not (gender and number):
            frase.append(text)
            continue

        singular = number[0] == "Sing"
        if gender[0] == "Masc":
            frase.append("um" if singular else "os")
        elif gender[0] == "Fem" and singular:
            frase.append("uma")
        else:
            frase.append("as")
    return frase
77
-
78
def create_sentence(doc, tags, frase):
    """Capitalise sentence starts and attach punctuation to the preceding word.

    Args:
        doc: Indexable sequence of tokens exposing ``.text``, ``.tag_`` and
            ``.is_sent_start``, matched to *frase* by position.
        tags: Unused; kept so existing callers keep working.
        frase: List of word strings to post-process. It is not modified.

    Returns:
        A new list of strings. A word whose token starts a sentence is
        capitalised. A token tagged ``"PUNCT"`` is appended to the previous
        output word and produces no entry of its own, so joining the result
        with spaces no longer yields a duplicated, detached punctuation mark.
    """
    resultado = []
    for i, palavra in enumerate(frase):
        if i >= len(doc):
            # No token information for this word: copy it through unchanged.
            resultado.append(palavra)
            continue
        token = doc[i]
        if token.tag_ == "PUNCT" and resultado:
            resultado[-1] += token.text
            continue
        # Leading punctuation has nothing to attach to, so it stays as its
        # own entry rather than wrapping around to the last word.
        resultado.append(palavra.capitalize() if token.is_sent_start else palavra)
    return resultado
87
-
88
def get_productions(texto):
    """Parse *texto* with the LX-Parser web service and return its lexical rules.

    The service is called through JSON-RPC (method ``parse``, output format
    ``parentheses``) using the module-level ``key``. The bracketed parse it
    returns is loaded into an ``nltk`` tree, and only the productions whose
    string form contains a quoted part are kept.

    Args:
        texto: Sentence to parse.

    Returns:
        A list of production strings. The list is empty when the request
        fails, the reply is not valid JSON, or the service reports an error;
        in each case the problem is printed, matching the existing error path.
    """
    url = "https://portulanclarin.net/workbench/lx-parser/api/"
    request_data = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': texto,
            'format': 'parentheses',
            'key': key,
        },
    }
    try:
        # Without a timeout a stalled service would hang the request forever.
        response = requests.post(url, json=request_data, timeout=30)
        response.raise_for_status()
        response_data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print("Error:", exc)
        return []

    if "error" in response_data:
        print("Error:", response_data["error"])
        return []

    tree = nltk.tree.Tree.fromstring(response_data["result"])
    return [
        str(production)
        for production in tree.productions()
        if re.search(r"'.*'", str(production))
    ]
114
-
115
def get_tags(productions):
    """Extract the left-hand-side tag of each production.

    Args:
        productions: Iterable of production strings shaped like
            ``"TAG -> 'word'"``. Items that are not strings are used as-is.

    Returns:
        A list with the text before ``" ->"`` for every production, skipping
        any entry that contains a single quote. A string without ``" ->"`` is
        kept whole.
    """
    tags = []
    for item in productions:
        tag = item.partition(' ->')[0] if isinstance(item, str) else item
        # Filter while building: removing from the list during iteration
        # skipped the element after each removal.
        if "'" in tag:
            continue
        tags.append(tag)
    return tags
126
-
127
def _analisar(sentenca):
    """Return ``(productions, tags, doc)`` for *sentenca* from both analysers."""
    productions = get_productions(sentenca)
    return productions, get_tags(productions), nlp(sentenca)


def _aplicar_ate_estabilizar(transformar, sentenca, tags, doc):
    """Apply *transformar* and re-parse until the sentence stops changing.

    Returns the stable sentence together with its tags and spaCy doc.
    """
    while True:
        nova_sentenca = ' '.join(transformar(doc, tags))
        _, tags, doc = _analisar(nova_sentenca)
        if nova_sentenca == sentenca:
            return nova_sentenca, tags, doc
        sentenca = nova_sentenca


def reordenar_sentenca(sentenca):
    """Rewrite a caption by applying a fixed series of grammatical adjustments.

    Steps, each followed by a fresh parse of the rewritten text:

    1. Lower-case the input and prefix ``"A "`` unless the first tag is
       ``"ART"``.
    2. If the second and third tags are both ``"N"``, swap those two words
       and put ``"de"`` between them.
    3. Apply ``invert_adj_n`` until the sentence is stable.
    4. Apply ``adjust_adj`` until the sentence is stable.
    5. Apply ``adjust_art`` once, then ``create_sentence``.

    Args:
        sentenca: Caption text.

    Returns:
        The rewritten sentence. Blank input is returned unchanged. If the
        parser yields no productions, the sentence as prepared so far
        (lower-cased, possibly prefixed) is returned stripped.
    """
    if not sentenca.strip():
        return sentenca

    sentenca = sentenca.lower()
    sentence, tags, doc = _analisar(sentenca)

    # `tags` is empty when the parser call failed; test that before indexing
    # so a service error no longer raises IndexError here.
    if not tags or tags[0] != "ART":
        sentenca = "A " + sentenca.strip()
        sentence, tags, doc = _analisar(sentenca)
    if not sentence:
        return sentenca.strip()

    if len(tags) > 2 and tags[1] == "N" and tags[2] == "N":
        palavras = sentenca.split()
        # The tags come from the parser, the words from a whitespace split;
        # only reorder when the words are really there.
        if len(palavras) > 2:
            palavras[1:3] = [palavras[2], "de", palavras[1]]
            sentenca = " ".join(palavras)
            sentence, tags, doc = _analisar(sentenca)

    sentenca, tags, doc = _aplicar_ate_estabilizar(invert_adj_n, sentenca, tags, doc)
    sentenca, tags, doc = _aplicar_ate_estabilizar(adjust_adj, sentenca, tags, doc)

    frase = adjust_art(doc, tags)
    sentenca = ' '.join(frase)
    _, tags, doc = _analisar(sentenca)
    frase = create_sentence(doc, tags, frase)
    return " ".join(frase).strip()
192
-
193
  def prepare_image(image_path):
194
  image = Image.open(image_path).convert("RGB")
195
  inputs = processor(images=image, return_tensors="pt").to(device)
@@ -207,40 +23,48 @@ def generate_caption(pixel_values):
207
  )
208
  return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
209
 
 
 
 
 
 
210
def text_to_speech_gtts(text, lang='pt'):
    """Synthesise *text* with gTTS and return the path of the saved MP3.

    Args:
        text: Text to speak.
        lang: Language code handed to gTTS (defaults to ``'pt'``).

    Returns:
        Path of a newly created ``.mp3`` file. Each call gets its own file,
        so concurrent requests no longer overwrite a shared ``output.mp3``.
        The file is not deleted here; the caller owns its lifetime.
    """
    import tempfile

    tts = gTTS(text=text, lang=lang)
    # Reserve a unique name, then close the handle before gTTS writes to it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        audio_path = handle.name
    tts.save(audio_path)
    return audio_path
214
 
215
# Select the device (GPU when available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the captioning model, placing the model on the device
processor = AutoProcessor.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
model = AutoModelForCausalLM.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
model.to(device)
 
222
 
223
  # Função principal para processar a imagem e gerar a voz
224
def process_image(image):
    """Caption an image, normalise the caption, and synthesise it as speech.

    Args:
        image: Value forwarded to ``prepare_image``.

    Returns:
        A tuple ``(sentence, productions, audio_path)``: the normalised
        caption, the result of ``get_productions`` for it, and the value
        returned by ``text_to_speech_gtts``.
    """
    _inputs, pixel_values = prepare_image(image)
    legenda = generate_caption(pixel_values)
    sentenca_normalizada = reordenar_sentenca(legenda)
    caminho_audio = text_to_speech_gtts(sentenca_normalizada)
    regras = get_productions(sentenca_normalizada)
    return sentenca_normalizada, regras, caminho_audio
231
 
232
- # Caminhos para as imagens de exemplo
233
  example_image_paths = [
234
- "example1.jpeg",
235
- "example2.jpeg",
236
- "example3.jpeg"
237
  ]
238
 
239
  # Interface Gradio
240
  iface = gr.Interface(
241
  fn=process_image,
242
  inputs=gr.Image(type="filepath"),
243
- outputs=[gr.Textbox(label="Sentença Normalizada"), gr.Textbox(label="Classes Gramaticais"), gr.Audio(type="filepath", label="Áudio")],
244
  examples=example_image_paths,
245
  title="Image to Voice",
246
  description="Gera uma descrição em português e a converte em voz a partir de uma imagem."
 
1
  import gradio as gr
2
+ from transformers import AutoProcessor, AutoModelForCausalLM, MarianMTModel, MarianTokenizer
3
  from PIL import Image
4
  import torch
5
  from gtts import gTTS
 
 
 
 
6
  import os
7
 
8
+ # Funções auxiliares
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def prepare_image(image_path):
10
  image = Image.open(image_path).convert("RGB")
11
  inputs = processor(images=image, return_tensors="pt").to(device)
 
23
  )
24
  return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
25
 
26
def translate_to_portuguese(text):
    """Translate *text* with the module-level Marian tokenizer and model.

    Args:
        text: Source text (presumably the English caption; see the caller).

    Returns:
        The first decoded sequence with special tokens removed, or ``""``
        when *text* is empty or whitespace, in which case the model is not
        called.
    """
    if not text or not text.strip():
        return ""
    inputs = translation_tokenizer(text, return_tensors="pt", truncation=True).to(device)
    # Unpack the whole encoding so the attention mask reaches generate()
    # together with the input ids.
    translated_ids = translation_model.generate(
        **inputs, max_length=50, num_beams=4, early_stopping=True
    )
    return translation_tokenizer.batch_decode(translated_ids, skip_special_tokens=True)[0]
30
+
31
def text_to_speech_gtts(text, lang='pt'):
    """Synthesise *text* with gTTS and return the path of the saved MP3.

    Args:
        text: Text to speak.
        lang: Language code handed to gTTS (defaults to ``'pt'``).

    Returns:
        Path of a newly created ``.mp3`` file. Each call gets its own file,
        so concurrent requests no longer overwrite a shared ``output.mp3``.
        The file is not deleted here; the caller owns its lifetime.
    """
    import tempfile

    tts = gTTS(text=text, lang=lang)
    # Reserve a unique name, then close the handle before gTTS writes to it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        audio_path = handle.name
    tts.save(audio_path)
    return audio_path
35
 
36
# Select the device (GPU when available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the captioning processor and model
processor = AutoProcessor.from_pretrained("microsoft/git-base")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

# Load the translation tokenizer and model
translation_model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name)

# Place both models on the selected device
model.to(device)
translation_model.to(device)
47
 
48
  # Função principal para processar a imagem e gerar a voz
49
def process_image(image):
    """Caption an image, translate the caption, and synthesise it as speech.

    Args:
        image: Value forwarded to ``prepare_image``.

    Returns:
        A tuple ``(caption_pt, audio_path)``: the output of
        ``translate_to_portuguese`` for the generated caption and the value
        returned by ``text_to_speech_gtts`` for that text.
    """
    _inputs, pixel_values = prepare_image(image)
    caption_pt = translate_to_portuguese(generate_caption(pixel_values))
    return caption_pt, text_to_speech_gtts(caption_pt)
 
55
 
56
+ # Caminhos para as imagens de exemplo (supondo que estejam no mesmo diretório que o script)
57
  example_image_paths = [
58
+ "example1.png",
59
+ "example2.png",
60
+ "example3.png"
61
  ]
62
 
63
  # Interface Gradio
64
  iface = gr.Interface(
65
  fn=process_image,
66
  inputs=gr.Image(type="filepath"),
67
+ outputs=[gr.Textbox(), gr.Audio(type="filepath")],
68
  examples=example_image_paths,
69
  title="Image to Voice",
70
  description="Gera uma descrição em português e a converte em voz a partir de uma imagem."