histlearn committed on
Commit
d6ba35f
·
verified ·
1 Parent(s): 196a842

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -22
app.py CHANGED
@@ -1,11 +1,195 @@
1
  import gradio as gr
2
- from transformers import AutoProcessor, AutoModelForCausalLM, MarianMTModel, MarianTokenizer
3
  from PIL import Image
4
  import torch
5
  from gtts import gTTS
 
 
 
 
6
  import os
7
 
8
- # Funções auxiliares
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def prepare_image(image_path):
10
  image = Image.open(image_path).convert("RGB")
11
  inputs = processor(images=image, return_tensors="pt").to(device)
@@ -23,48 +207,40 @@ def generate_caption(pixel_values):
23
  )
24
  return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
25
 
26
def translate_to_portuguese(text):
    """Translate ``text`` with the module-level Marian model and tokenizer.

    The text is tokenized (with truncation), moved to ``device``, decoded
    with 4-beam search capped at 50 tokens, and the first decoded string is
    returned with special tokens stripped.
    """
    encoded = translation_tokenizer(text, return_tensors="pt", truncation=True)
    encoded = encoded.to(device)
    output_ids = translation_model.generate(
        encoded["input_ids"],
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )
    decoded = translation_tokenizer.batch_decode(
        output_ids, skip_special_tokens=True
    )
    return decoded[0]
30
-
31
  def text_to_speech_gtts(text, lang='pt'):
32
  tts = gTTS(text=text, lang=lang)
33
  tts.save("output.mp3")
34
  return "output.mp3"
35
 
36
  # Carregar os modelos
37
- processor = AutoProcessor.from_pretrained("microsoft/git-base")
38
- model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
39
- translation_model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
40
- translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
41
- translation_model = MarianMTModel.from_pretrained(translation_model_name)
42
 
43
  # Configurar o dispositivo (GPU ou CPU)
44
  device = "cuda" if torch.cuda.is_available() else "cpu"
45
  model.to(device)
46
- translation_model.to(device)
47
 
48
  # Função principal para processar a imagem e gerar a voz
49
  def process_image(image):
50
  _, pixel_values = prepare_image(image)
51
- caption_en = generate_caption(pixel_values)
52
- caption_pt = translate_to_portuguese(caption_en)
53
- audio_file = text_to_speech_gtts(caption_pt)
54
- return caption_pt, audio_file
 
55
 
56
# Paths to the example images (assumed to sit in the same directory as this script).
example_image_paths = ["example1.png", "example2.png", "example3.png"]
62
 
63
  # Interface Gradio
64
  iface = gr.Interface(
65
  fn=process_image,
66
  inputs=gr.Image(type="filepath"),
67
- outputs=[gr.Textbox(), gr.Audio(type="filepath")],
68
  examples=example_image_paths,
69
  title="Image to Voice",
70
  description="Gera uma descrição em português e a converte em voz a partir de uma imagem."
 
1
  import gradio as gr
2
+ from transformers import AutoProcessor, AutoModelForCausalLM
3
  from PIL import Image
4
  import torch
5
  from gtts import gTTS
6
+ import spacy
7
+ import requests
8
+ import nltk.tree
9
+ import re
10
  import os
11
 
12
# Load spaCy's Portuguese pipeline.
nlp = spacy.load("pt_core_news_sm")

# LX-Parser API key. Prefer the LX_PARSER_KEY environment variable so the
# credential can be rotated without editing the source; the literal is kept
# only as a backward-compatible fallback.
# NOTE(review): a credential committed to a public repository should be
# treated as leaked; consider rotating it and dropping the fallback.
key = os.environ.get("LX_PARSER_KEY", "eb159d39469d84f0ff47167a4d89cada")
17
+
18
+ # Funções de manipulação gramatical
19
def invert_adj_n(doc, tags):
    """Swap every adjective that directly precedes a noun.

    Args:
        doc: Indexable sequence of tokens exposing ``text`` and ``tag_``.
        tags: Parser tags, positionally aligned with ``doc`` ("A" marks an
            adjective, "N" a noun).

    Returns:
        A list of token texts in which each adjacent "A", "N" pair is emitted
        as noun followed by adjective. Tokens whose ``tag_`` is "PUNCT" never
        start a swap.

    The tag list and the token list come from different tools, so they may
    differ in length; positions that lack a tag (or a token) are copied
    through unchanged instead of raising IndexError.
    """
    frase = []
    # Only positions present in BOTH sequences can take part in a swap.
    limite = min(len(doc), len(tags))
    i = 0
    while i < len(doc):
        token = doc[i]
        troca = (
            token.tag_ != "PUNCT"
            and i + 1 < limite
            and tags[i] == "A"
            and tags[i + 1] == "N"
        )
        if troca:
            frase.append(doc[i + 1].text)
            frase.append(token.text)
            i += 2  # the noun was already emitted, skip it
        else:
            frase.append(token.text)
            i += 1
    return frase
39
+
40
def adjust_adj(doc, tags):
    """Insert the conjunction "e" between two consecutive adjectives.

    Args:
        doc: Sequence of tokens exposing ``text``.
        tags: Parser tags, positionally aligned with ``doc`` ("A" marks an
            adjective).

    Returns:
        A list with every token text, plus an extra "e" after each token
        tagged "A" whose successor is also tagged "A".

    Only positions that exist in both ``doc`` and ``tags`` are examined, so a
    length mismatch between the two neither raises IndexError nor appends a
    dangling "e" after the last token.
    """
    frase = []
    limite = min(len(doc), len(tags))
    for i, token in enumerate(doc):
        frase.append(token.text)
        if i + 1 < limite and tags[i] == "A" and tags[i + 1] == "A":
            frase.append("e")
    return frase
48
+
49
def adjust_art(doc, tags):
    """Replace the article "a" so that it agrees with the following token.

    Args:
        doc: Sequence of tokens exposing ``text`` and ``morph`` (where
            ``morph.get(name)`` returns a possibly empty list of values).
        tags: Parser tags, positionally aligned with ``doc`` ("ART" marks an
            article).

    Returns:
        A list with one text per token. A token tagged "ART" whose text is
        "a" (case-insensitive) is replaced according to the Gender/Number of
        the next token: Masc+Sing -> "um", Fem+Sing -> "uma",
        Masc+non-Sing -> "os", anything else -> "as". The article is kept
        as-is when there is no next token or the morphology is missing.

    Positions that have no tag are copied through unchanged instead of
    raising IndexError.
    """
    # (gender, is_singular) -> replacement; every other combination is "as".
    artigos = {
        ("Masc", True): "um",
        ("Fem", True): "uma",
        ("Masc", False): "os",
    }
    frase = []
    for i, token in enumerate(doc):
        text = token.text
        eh_artigo = i < len(tags) and tags[i] == "ART" and text.lower() == "a"
        if eh_artigo and i + 1 < len(doc):
            morph = doc[i + 1].morph
            gender = morph.get("Gender")
            number = morph.get("Number")
            if gender and number:
                text = artigos.get((gender[0], number[0] == "Sing"), "as")
        frase.append(text)
    return frase
77
+
78
def create_sentence(doc, tags, frase):
    """Capitalize sentence starts and glue punctuation to the previous word.

    Args:
        doc: Sequence of tokens exposing ``text``, ``tag_`` and
            ``is_sent_start``, positionally aligned with ``frase``.
        tags: Unused; kept so existing callers keep working.
        frase: List of word strings to finalize. It is NOT modified.

    Returns:
        A new list of strings. The word at each sentence-start position is
        capitalized, and the text of every token whose ``tag_`` is "PUNCT"
        is appended to the preceding word instead of being kept as a
        separate item (so the joined sentence does not contain the
        punctuation mark twice).

    Leading punctuation (no preceding word) is kept as its own item, words
    beyond the end of ``doc`` are passed through unchanged, and tokens
    beyond the end of ``frase`` are ignored.
    """
    resultado = []
    for i, token in enumerate(doc):
        if i >= len(frase):
            break  # no word left to pair with the remaining tokens
        palavra = frase[i]
        if token.is_sent_start:
            palavra = palavra.capitalize()
        if token.tag_ == "PUNCT" and resultado:
            resultado[-1] += token.text
            continue
        resultado.append(palavra)
    # Words without a matching token are kept untouched.
    resultado.extend(frase[len(doc):])
    return resultado
87
+
88
def get_productions(texto):
    """Parse ``texto`` with the LX-Parser web service and list its lexical rules.

    A JSON-RPC ``parse`` request (format ``parentheses``, authenticated with
    the module-level ``key``) is posted to the service. The returned
    bracketed tree is loaded with NLTK, and every production whose string
    form contains a quoted terminal is returned as a string.

    Args:
        texto: The sentence to parse.

    Returns:
        A list of production strings, or an empty list when the request
        fails, the response is not valid JSON, or the service reports an
        error (the problem is printed in each case).
    """
    url = "https://portulanclarin.net/workbench/lx-parser/api/"
    request_data = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': texto,
            'format': 'parentheses',
            'key': key,
        },
    }
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.post(url, json=request_data, timeout=30)
        response_data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or non-JSON body: report it the same way as a
        # service-side error so callers only ever see a list.
        print("Error:", exc)
        return []
    if "error" in response_data:
        print("Error:", response_data["error"])
        return []
    tree = nltk.tree.Tree.fromstring(response_data["result"])
    # Keep only productions that mention a quoted terminal (a word).
    return [
        str(production)
        for production in tree.productions()
        if re.search(r"'.*'", str(production))
    ]
114
+
115
def get_tags(productions):
    """Extract the left-hand-side tag of each production.

    Args:
        productions: Iterable of production strings such as ``"N -> 'gato'"``.
            Non-string items are passed through unchanged.

    Returns:
        A list with the text before ``" ->"`` for every string item. A string
        without ``" ->"`` is kept whole. String tags that still contain a
        quote character are dropped.

    The result is built in a single pass; filtering by removing items from
    the list while iterating over it would skip the element that follows
    each removal.
    """
    tags = []
    for item in productions:
        if isinstance(item, str):
            # partition keeps the whole string when the arrow is missing,
            # unlike slicing with find(), which returns -1 in that case.
            tag = item.partition(' ->')[0]
            if "'" in tag:
                continue
        else:
            tag = item
        tags.append(tag)
    return tags
126
+
127
def _analisar_sentenca(texto):
    """Return ``(productions, tags, doc)`` for ``texto`` (parser tags + spaCy doc)."""
    productions = get_productions(texto)
    return productions, get_tags(productions), nlp(texto)


def _repetir_ate_estabilizar(transformar, sentenca, tags, doc, max_passes):
    """Apply ``transformar(doc, tags)`` until the joined sentence stops changing."""
    nova_sentenca = ' '.join(transformar(doc, tags))
    _, tags, doc = _analisar_sentenca(nova_sentenca)
    passes = 0
    while nova_sentenca != sentenca and passes < max_passes:
        sentenca = nova_sentenca
        nova_sentenca = ' '.join(transformar(doc, tags))
        _, tags, doc = _analisar_sentenca(nova_sentenca)
        passes += 1
    return nova_sentenca, tags, doc


def reordenar_sentenca(sentenca, max_passes=10):
    """Rewrite a caption into a normalized Portuguese word order.

    Steps, each followed by a fresh parse of the rewritten sentence:
      1. lowercase the text and prefix "A " when the first tag is not "ART";
      2. turn a leading "ART N N" into "ART N de N" (the two nouns swapped);
      3. move adjectives after the nouns they precede (``invert_adj_n``),
         repeated until the sentence stops changing;
      4. join consecutive adjectives with "e" (``adjust_adj``), repeated
         until the sentence stops changing;
      5. make the article agree with the next word (``adjust_art``);
      6. capitalize and attach punctuation (``create_sentence``).

    Args:
        sentenca: The caption to normalize.
        max_passes: Upper bound on the extra passes of steps 3 and 4. Every
            pass performs a remote parse, so the loops are capped in case
            the parser keeps re-tagging the swapped words the same way.

    Returns:
        The normalized sentence. Blank input is returned unchanged; when the
        parser yields nothing, the (lowercased, possibly "A "-prefixed)
        sentence is returned stripped instead of raising IndexError.
    """
    if not sentenca.strip():
        return sentenca
    sentenca = sentenca.lower()
    productions, tags, doc = _analisar_sentenca(sentenca)
    if not tags:
        # Parser unavailable or returned no lexical rule: nothing to reorder.
        return sentenca.strip()
    if tags[0] != "ART":
        sentenca = "A " + sentenca.strip()
        productions, tags, doc = _analisar_sentenca(sentenca)
    if not productions:
        return sentenca.strip()

    if len(tags) > 2 and tags[1] == "N" and tags[2] == "N":
        palavras = sentenca.split()
        if len(palavras) > 2:
            # [art, n1, n2, ...] -> [art, n2, "de", n1, ...]
            palavras[1:3] = [palavras[2], "de", palavras[1]]
            sentenca = " ".join(palavras)
            productions, tags, doc = _analisar_sentenca(sentenca)

    sentenca, tags, doc = _repetir_ate_estabilizar(
        invert_adj_n, sentenca, tags, doc, max_passes
    )
    sentenca, tags, doc = _repetir_ate_estabilizar(
        adjust_adj, sentenca, tags, doc, max_passes
    )

    frase = adjust_art(doc, tags)
    sentenca = ' '.join(frase)
    productions, tags, doc = _analisar_sentenca(sentenca)
    frase = create_sentence(doc, tags, frase)
    return ' '.join(frase).strip()
192
+
193
  def prepare_image(image_path):
194
  image = Image.open(image_path).convert("RGB")
195
  inputs = processor(images=image, return_tensors="pt").to(device)
 
207
  )
208
  return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
209
 
 
 
 
 
 
210
  def text_to_speech_gtts(text, lang='pt'):
211
  tts = gTTS(text=text, lang=lang)
212
  tts.save("output.mp3")
213
  return "output.mp3"
214
 
215
  # Carregar os modelos
216
+ processor = AutoProcessor.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
217
+ model = AutoModelForCausalLM.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
 
 
 
218
 
219
  # Configurar o dispositivo (GPU ou CPU)
220
  device = "cuda" if torch.cuda.is_available() else "cpu"
221
  model.to(device)
 
222
 
223
  # Função principal para processar a imagem e gerar a voz
224
  def process_image(image):
225
  _, pixel_values = prepare_image(image)
226
+ caption_pt = generate_caption(pixel_values)
227
+ sentenca_normalizada = reordenar_sentenca(caption_pt)
228
+ audio_file = text_to_speech_gtts(sentenca_normalizada)
229
+ productions = get_productions(sentenca_normalizada)
230
+ return sentenca_normalizada, productions, audio_file
231
 
232
# Paths to the example images.
example_image_paths = ["example1.jpeg", "example2.jpeg", "example3.jpeg"]
238
 
239
  # Interface Gradio
240
  iface = gr.Interface(
241
  fn=process_image,
242
  inputs=gr.Image(type="filepath"),
243
+ outputs=[gr.Textbox(label="Sentença Normalizada"), gr.Textbox(label="Classes Gramaticais"), gr.Audio(type="filepath", label="Áudio")],
244
  examples=example_image_paths,
245
  title="Image to Voice",
246
  description="Gera uma descrição em português e a converte em voz a partir de uma imagem."