Spaces:
Runtime error
Runtime error
commit
#15
by
Balachandar
- opened
- .env +1 -1
- __pycache__/lang_list.cpython-311.pyc +0 -0
- app.py +135 -261
- ayatonic.env +2 -0
- audio_files/audio_3505178120260920029.wav → languages.json +0 -0
- requirements.txt +4 -5
- script.py +0 -10
.env
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
CO_API_KEY=KQBPf0H0ENZESIC5nuUJ4i4jjg34xMPAkYK7s31W
|
2 |
-
SEAMLESSM4T=facebook
|
|
|
1 |
CO_API_KEY=KQBPf0H0ENZESIC5nuUJ4i4jjg34xMPAkYK7s31W
|
2 |
+
SEAMLESSM4T=https://facebook-seamless-m4t-v2-large.hf.space/--replicas/v4gsf/
|
__pycache__/lang_list.cpython-311.pyc
DELETED
Binary file (5.61 kB)
|
|
app.py
CHANGED
@@ -5,20 +5,15 @@ from surya.ocr import run_ocr
|
|
5 |
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
|
6 |
from surya.model.recognition.model import load_model as load_rec_model
|
7 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
8 |
-
from lang_list import
|
9 |
from gradio_client import Client
|
10 |
from dotenv import load_dotenv
|
11 |
import requests
|
12 |
-
from io import BytesIO
|
13 |
import cohere
|
14 |
import os
|
15 |
import re
|
16 |
import pandas as pd
|
17 |
-
import pydub
|
18 |
-
from pydub import AudioSegment
|
19 |
-
from pydub.utils import make_chunks
|
20 |
-
from pathlib import Path
|
21 |
-
import hashlib
|
22 |
|
23 |
|
24 |
title = "# Welcome to AyaTonic"
|
@@ -27,12 +22,14 @@ description = "Learn a New Language With Aya"
|
|
27 |
load_dotenv()
|
28 |
COHERE_API_KEY = os.getenv('CO_API_KEY')
|
29 |
SEAMLESSM4T = os.getenv('SEAMLESSM4T')
|
|
|
30 |
df = pd.read_csv("lang_list.csv")
|
31 |
-
|
32 |
inputlanguage = ""
|
33 |
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
|
34 |
-
formatinputstring = "
|
35 |
-
|
|
|
36 |
patterns = {
|
37 |
"red": r'<span style="color: red;">(.*?)</span>',
|
38 |
"blue": r'<span style="color: blue;">(.*?)</span>',
|
@@ -45,70 +42,10 @@ matches = {
|
|
45 |
"blue": [],
|
46 |
"green": [],
|
47 |
}
|
48 |
-
|
49 |
-
co = cohere.Client(COHERE_API_KEY)
|
50 |
-
audio_client = Client(SEAMLESSM4T)
|
51 |
-
|
52 |
-
def get_language_code(language_name):
|
53 |
-
"""
|
54 |
-
Extracts the first two letters of the language code based on the language name.
|
55 |
-
"""
|
56 |
-
try:
|
57 |
-
code = df.loc[df['name'].str.lower() == language_name.lower(), 'code'].values[0]
|
58 |
-
return code
|
59 |
-
except IndexError:
|
60 |
-
print(f"Language name '{language_name}' not found.")
|
61 |
-
return None
|
62 |
-
|
63 |
-
def translate_text(text, inputlanguage, target_language):
|
64 |
-
"""
|
65 |
-
Translates text.
|
66 |
-
"""
|
67 |
-
# Ensure you format the instruction string within the function body
|
68 |
-
instructions = translatetextinst.format(inputlanguage=inputlanguage)
|
69 |
-
producetext_formatted = producetext.format(target_language=target_language)
|
70 |
-
prompt = f"{text}{producetext_formatted}\n{instructions}"
|
71 |
-
response = co.generate(
|
72 |
-
model='c4ai-aya',
|
73 |
-
prompt=prompt,
|
74 |
-
max_tokens=2986,
|
75 |
-
temperature=0.6,
|
76 |
-
k=0,
|
77 |
-
stop_sequences=[],
|
78 |
-
return_likelihoods='NONE'
|
79 |
-
)
|
80 |
-
return response.generations[0].text
|
81 |
-
|
82 |
-
class LongAudioProcessor:
|
83 |
-
def __init__(self, audio_client, api_key=None):
|
84 |
-
self.client = audio_client
|
85 |
-
self.process_audio_to_text = process_audio_to_text
|
86 |
-
self.api_key = api_key
|
87 |
-
|
88 |
-
def process_long_audio(self, audio_path, inputlanguage, outputlanguage, chunk_length_ms=20000):
|
89 |
-
"""
|
90 |
-
Process audio files longer than 29 seconds by chunking them into smaller segments.
|
91 |
-
"""
|
92 |
-
audio = AudioSegment.from_file(audio_path)
|
93 |
-
chunks = make_chunks(audio, chunk_length_ms)
|
94 |
-
full_text = ""
|
95 |
-
for i, chunk in enumerate(chunks):
|
96 |
-
chunk_name = f"chunk{i}.wav"
|
97 |
-
with open(chunk_name, 'wb') as file:
|
98 |
-
chunk.export(file, format="wav")
|
99 |
-
try:
|
100 |
-
result = self.process_audio_to_text(chunk_name, inputlanguage=inputlanguage, outputlanguage=outputlanguage)
|
101 |
-
full_text += " " + result.strip()
|
102 |
-
except Exception as e:
|
103 |
-
print(f"Error processing {chunk_name}: {e}")
|
104 |
-
finally:
|
105 |
-
if os.path.exists(chunk_name):
|
106 |
-
os.remove(chunk_name)
|
107 |
-
return full_text.strip()
|
108 |
class TaggedPhraseExtractor:
|
109 |
def __init__(self, text=''):
|
110 |
self.text = text
|
111 |
-
self.patterns =
|
112 |
|
113 |
def set_text(self, text):
|
114 |
"""Set the text to search within."""
|
@@ -119,142 +56,73 @@ class TaggedPhraseExtractor:
|
|
119 |
self.patterns[color] = pattern
|
120 |
|
121 |
def extract_phrases(self):
|
122 |
-
"""Extract phrases for all colors and patterns added
|
123 |
-
matches = {}
|
124 |
-
for color, pattern in self.patterns.items():
|
125 |
-
found_phrases = re.findall(pattern, self.text)
|
126 |
-
sorted_phrases = sorted(found_phrases, key=len, reverse=True)
|
127 |
-
matches[color] = sorted_phrases[:3]
|
128 |
return matches
|
129 |
|
130 |
def print_phrases(self):
|
131 |
-
"""Extract phrases and print them
|
132 |
matches = self.extract_phrases()
|
133 |
-
for color,
|
134 |
print(f"Phrases with color {color}:")
|
135 |
-
for phrase in
|
136 |
-
print(f"- {phrase}")
|
137 |
-
print(f"\nThree longest phrases for color {color}:")
|
138 |
-
for phrase in data['top_three_longest']:
|
139 |
print(f"- {phrase}")
|
140 |
-
print()
|
141 |
|
142 |
-
|
|
|
|
|
|
|
143 |
"""
|
144 |
Convert audio input to text using the Gradio client.
|
145 |
"""
|
146 |
-
audio_client = Client(SEAMLESSM4T)
|
147 |
result = audio_client.predict(
|
148 |
audio_path,
|
149 |
inputlanguage,
|
150 |
-
|
151 |
api_name="/s2tt"
|
152 |
)
|
153 |
print("Audio Result: ", result)
|
154 |
-
return result[
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
def process_text_to_audio(text, translatefrom="English", translateto="English"):
|
159 |
-
"""
|
160 |
-
Convert text input to audio using the Gradio client and return a URL to the generated audio.
|
161 |
-
"""
|
162 |
-
try:
|
163 |
-
# Assuming audio_client.predict is correctly set up and returns a tuple (local_file_path, translated_text)
|
164 |
-
result = audio_client.predict(
|
165 |
-
text,
|
166 |
-
translatefrom,
|
167 |
-
translateto,
|
168 |
-
api_name="/t2st"
|
169 |
-
)
|
170 |
-
|
171 |
-
if not isinstance(result, tuple) or len(result) < 2:
|
172 |
-
raise ValueError("Unexpected result format from audio_client.predict")
|
173 |
-
|
174 |
-
|
175 |
-
# Print or log the raw API response for inspection
|
176 |
-
print("Raw API Response:", result)
|
177 |
-
|
178 |
-
# Initialize variables
|
179 |
-
audio_file_path = ""
|
180 |
|
181 |
-
|
182 |
-
if result:
|
183 |
-
for item in result:
|
184 |
-
if isinstance(item, str):
|
185 |
-
# Check if the item is a URL pointing to an audio file or a base64 encoded string
|
186 |
-
if any(ext in item.lower() for ext in ['.mp3', '.wav', '.ogg']) or is_base64(item):
|
187 |
-
audio_file_path = item
|
188 |
-
break
|
189 |
-
|
190 |
-
if not audio_file_path:
|
191 |
-
raise ValueError("No audio file path found in the response")
|
192 |
-
|
193 |
-
# If the response is a direct file path or a base64 string, handle accordingly
|
194 |
-
# For simplicity, we're returning the URL or base64 string directly
|
195 |
-
return audio_file_path
|
196 |
-
|
197 |
-
except Exception as e:
|
198 |
-
print(f"Error processing text to audio: {e}")
|
199 |
-
return ""
|
200 |
-
|
201 |
-
|
202 |
-
def save_audio_data_to_file(audio_data, directory="audio_files", filename="output_audio.wav"):
|
203 |
-
"""
|
204 |
-
Save audio data to a file and return the file path.
|
205 |
-
"""
|
206 |
-
os.makedirs(directory, exist_ok=True)
|
207 |
-
file_path = os.path.join(directory, filename)
|
208 |
-
with open(file_path, 'wb') as file:
|
209 |
-
file.write(audio_data)
|
210 |
-
return file_path
|
211 |
-
|
212 |
-
# Ensure the function that reads the audio file checks if the path is a file
|
213 |
-
def read_audio_file(file_path):
|
214 |
-
"""
|
215 |
-
Read and return the audio file content if the path is a file.
|
216 |
"""
|
217 |
-
|
218 |
-
with open(file_path, 'rb') as file:
|
219 |
-
return file.read()
|
220 |
-
else:
|
221 |
-
raise ValueError(f"Expected a file path, got a directory: {file_path}")
|
222 |
-
|
223 |
-
|
224 |
-
def initialize_ocr_models():
|
225 |
"""
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
231 |
|
232 |
class OCRProcessor:
|
233 |
-
def __init__(self,
|
234 |
-
self.
|
235 |
-
self.det_processor, self.det_model
|
|
|
236 |
|
237 |
def process_image(self, image):
|
238 |
"""
|
239 |
Process a PIL image and return the OCR text.
|
240 |
"""
|
241 |
-
predictions = run_ocr([image], [self.
|
242 |
-
return predictions[0]
|
243 |
|
244 |
def process_pdf(self, pdf_path):
|
245 |
"""
|
246 |
Process a PDF file and return the OCR text.
|
247 |
"""
|
248 |
-
predictions = run_ocr([pdf_path], [self.
|
249 |
-
return predictions[0]
|
250 |
|
251 |
-
def process_input(image=None, file=None, audio=None, text=""
|
252 |
-
|
253 |
-
ocr_processor = OCRProcessor(lang_code)
|
254 |
final_text = text
|
255 |
-
print("Image :", image)
|
256 |
if image is not None:
|
257 |
ocr_prediction = ocr_processor.process_image(image)
|
|
|
258 |
for idx in range(len((list(ocr_prediction)[0][1]))):
|
259 |
final_text += " "
|
260 |
final_text += list((list(ocr_prediction)[0][1])[idx])[1][1]
|
@@ -262,11 +130,13 @@ def process_input(image=None, file=None, audio=None, text="", translateto = "Eng
|
|
262 |
if file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
|
263 |
pil_image = Image.open(file)
|
264 |
ocr_prediction = ocr_processor.process_image(pil_image)
|
|
|
265 |
for idx in range(len((list(ocr_prediction)[0][1]))):
|
266 |
final_text += " "
|
267 |
final_text += list((list(ocr_prediction)[0][1])[idx])[1][1]
|
268 |
elif file.name.lower().endswith('.pdf'):
|
269 |
ocr_prediction = ocr_processor.process_pdf(file.name)
|
|
|
270 |
for idx in range(len((list(ocr_prediction)[0][1]))):
|
271 |
final_text += " "
|
272 |
final_text += list((list(ocr_prediction)[0][1])[idx])[1][1]
|
@@ -274,11 +144,10 @@ def process_input(image=None, file=None, audio=None, text="", translateto = "Eng
|
|
274 |
final_text += "\nUnsupported file type."
|
275 |
print("OCR Text: ", final_text)
|
276 |
if audio is not None:
|
277 |
-
|
278 |
-
audio_text = long_audio_processor.process_long_audio(audio, inputlanguage=translatefrom, outputlanguage=translateto)
|
279 |
final_text += "\n" + audio_text
|
280 |
|
281 |
-
final_text_with_producetext = final_text + producetext
|
282 |
|
283 |
response = co.generate(
|
284 |
model='c4ai-aya',
|
@@ -298,91 +167,96 @@ def process_input(image=None, file=None, audio=None, text="", translateto = "Eng
|
|
298 |
)
|
299 |
processed_text = response.generations[0].text
|
300 |
|
301 |
-
audio_output = process_text_to_audio(processed_text
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
return final_text, audio_output, top_phrases, translations, audio_outputs
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
inputs = [
|
330 |
-
|
331 |
-
gr.Dropdown(choices=choices, label="Your Native Language"),
|
332 |
-
gr.Dropdown(choices=choices, label="Language To Learn"),
|
333 |
-
gr.Audio(sources="microphone", type="filepath", label="Mic Input"),
|
334 |
-
gr.Image(type="pil", label="Camera Input"),
|
335 |
-
gr.Textbox(lines=2, label="Text Input"),
|
336 |
-
gr.File(label="File Upload")
|
337 |
-
]
|
338 |
-
|
339 |
-
outputs = [
|
340 |
-
RichTextbox(label="Processed Text"),
|
341 |
-
gr.Audio(label="Audio"),
|
342 |
-
gr.Textbox(label="Focus 1"),
|
343 |
-
gr.Textbox(label="Translated Phrases 1"),
|
344 |
-
gr.Audio(label="Audio Output (Native Language) 1"),
|
345 |
-
gr.Audio(label="Audio Output (Target Language) 1"),
|
346 |
-
gr.Textbox(label="Focus 2"),
|
347 |
-
gr.Textbox(label="Translated Phrases 2"),
|
348 |
-
gr.Audio(label="Audio Output (Native Language) 2"),
|
349 |
-
gr.Audio(label="Audio Output (Target Language) 2"),
|
350 |
-
gr.Textbox(label="Focus 3"),
|
351 |
-
gr.Textbox(label="Translated Phrases 3"),
|
352 |
-
gr.Audio(label="Audio Output (Native Language) 3"),
|
353 |
-
gr.Audio(label="Audio Output (Target Language) 3")
|
354 |
-
]
|
355 |
-
|
356 |
-
|
357 |
-
def update_outputs(inputlanguage, target_language, audio, image, text, file):
|
358 |
-
processed_text, audio_output_path, top_phrases, translations, audio_outputs = process_input(
|
359 |
-
image=image, file=file, audio=audio, text=text,
|
360 |
-
translateto=target_language, translatefrom=inputlanguage
|
361 |
-
)
|
362 |
-
|
363 |
-
output_tuple = (
|
364 |
-
processed_text, # RichTextbox content
|
365 |
-
audio_output_path, # Main audio output
|
366 |
-
top_phrases[0] if len(top_phrases) > 0 else "", # Focus 1
|
367 |
-
translations[0] if len(translations) > 0 else "", # Translated Phrases 1
|
368 |
-
audio_outputs[0][0] if len(audio_outputs) > 0 else "", # Audio Output (Native Language) 1
|
369 |
-
audio_outputs[0][1] if len(audio_outputs) > 0 else "", # Audio Output (Target Language) 1
|
370 |
-
top_phrases[1] if len(top_phrases) > 1 else "", # Focus 2
|
371 |
-
translations[1] if len(translations) > 1 else "", # Translated Phrases 2
|
372 |
-
audio_outputs[1][0] if len(audio_outputs) > 1 else "", # Audio Output (Native Language) 2
|
373 |
-
audio_outputs[1][1] if len(audio_outputs) > 1 else "", # Audio Output (Target Language) 2
|
374 |
-
top_phrases[2] if len(top_phrases) > 2 else "", # Focus 3
|
375 |
-
translations[2] if len(translations) > 2 else "", # Translated Phrases 3
|
376 |
-
audio_outputs[2][0] if len(audio_outputs) > 2 else "", # Audio Output (Native Language) 3
|
377 |
-
audio_outputs[2][1] if len(audio_outputs) > 2 else "" # Audio Output (Target Language) 3
|
378 |
-
)
|
379 |
-
|
380 |
-
return output_tuple
|
381 |
-
|
382 |
-
def interface_func(inputlanguage, target_language, audio, image, text, file):
|
383 |
-
return update_outputs(inputlanguage, target_language, audio, image, text, file)
|
384 |
-
|
385 |
-
iface = gr.Interface(fn=interface_func, inputs=inputs, outputs=outputs, title=title, description=description)
|
386 |
|
387 |
if __name__ == "__main__":
|
388 |
-
iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
|
6 |
from surya.model.recognition.model import load_model as load_rec_model
|
7 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
8 |
+
from lang_list import LANGUAGE_NAME_TO_CODE, TEXT_SOURCE_LANGUAGE_NAMES, S2ST_TARGET_LANGUAGE_NAMES
|
9 |
from gradio_client import Client
|
10 |
from dotenv import load_dotenv
|
11 |
import requests
|
12 |
+
from io import BytesIO
|
13 |
import cohere
|
14 |
import os
|
15 |
import re
|
16 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
title = "# Welcome to AyaTonic"
|
|
|
22 |
load_dotenv()
|
23 |
COHERE_API_KEY = os.getenv('CO_API_KEY')
|
24 |
SEAMLESSM4T = os.getenv('SEAMLESSM4T')
|
25 |
+
|
26 |
df = pd.read_csv("lang_list.csv")
|
27 |
+
|
28 |
inputlanguage = ""
|
29 |
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
|
30 |
+
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
|
31 |
+
|
32 |
+
# Regular expression patterns for each color
|
33 |
patterns = {
|
34 |
"red": r'<span style="color: red;">(.*?)</span>',
|
35 |
"blue": r'<span style="color: blue;">(.*?)</span>',
|
|
|
42 |
"blue": [],
|
43 |
"green": [],
|
44 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
class TaggedPhraseExtractor:
|
46 |
def __init__(self, text=''):
|
47 |
self.text = text
|
48 |
+
self.patterns = {}
|
49 |
|
50 |
def set_text(self, text):
|
51 |
"""Set the text to search within."""
|
|
|
56 |
self.patterns[color] = pattern
|
57 |
|
58 |
def extract_phrases(self):
|
59 |
+
"""Extract phrases for all colors and patterns added."""
|
60 |
+
matches = {color: re.findall(pattern, self.text) for color, pattern in self.patterns.items()}
|
|
|
|
|
|
|
|
|
61 |
return matches
|
62 |
|
63 |
def print_phrases(self):
|
64 |
+
"""Extract phrases and print them."""
|
65 |
matches = self.extract_phrases()
|
66 |
+
for color, phrases in matches.items():
|
67 |
print(f"Phrases with color {color}:")
|
68 |
+
for phrase in phrases:
|
|
|
|
|
|
|
69 |
print(f"- {phrase}")
|
70 |
+
print()
|
71 |
|
72 |
+
co = cohere.Client(COHERE_API_KEY)
|
73 |
+
audio_client = Client(SEAMLESSM4T)
|
74 |
+
|
75 |
+
def process_audio_to_text(audio_path, inputlanguage="English"):
|
76 |
"""
|
77 |
Convert audio input to text using the Gradio client.
|
78 |
"""
|
|
|
79 |
result = audio_client.predict(
|
80 |
audio_path,
|
81 |
inputlanguage,
|
82 |
+
inputlanguage,
|
83 |
api_name="/s2tt"
|
84 |
)
|
85 |
print("Audio Result: ", result)
|
86 |
+
return result['text'] # Adjust based on the actual response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
+
def process_text_to_audio(text, target_language="English"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
"""
|
90 |
+
Convert text input to audio using the Gradio client.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
"""
|
92 |
+
result = audio_client.predict(
|
93 |
+
text,
|
94 |
+
target_language,
|
95 |
+
target_language, # could be varied for learning content
|
96 |
+
api_name="/t2st"
|
97 |
+
)
|
98 |
+
return result['audio'] # Adjust based on the actual response
|
99 |
|
100 |
class OCRProcessor:
|
101 |
+
def __init__(self, langs=["en"]):
|
102 |
+
self.langs = langs
|
103 |
+
self.det_processor, self.det_model = load_det_processor(), load_det_model()
|
104 |
+
self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()
|
105 |
|
106 |
def process_image(self, image):
|
107 |
"""
|
108 |
Process a PIL image and return the OCR text.
|
109 |
"""
|
110 |
+
predictions = run_ocr([image], [self.langs], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
|
111 |
+
return predictions[0] # Assuming the first item in predictions contains the desired text
|
112 |
|
113 |
def process_pdf(self, pdf_path):
|
114 |
"""
|
115 |
Process a PDF file and return the OCR text.
|
116 |
"""
|
117 |
+
predictions = run_ocr([pdf_path], [self.langs], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
|
118 |
+
return predictions[0] # Assuming the first item in predictions contains the desired text
|
119 |
|
120 |
+
def process_input(image=None, file=None, audio=None, text=""):
|
121 |
+
ocr_processor = OCRProcessor()
|
|
|
122 |
final_text = text
|
|
|
123 |
if image is not None:
|
124 |
ocr_prediction = ocr_processor.process_image(image)
|
125 |
+
# getting text from the OCR object
|
126 |
for idx in range(len((list(ocr_prediction)[0][1]))):
|
127 |
final_text += " "
|
128 |
final_text += list((list(ocr_prediction)[0][1])[idx])[1][1]
|
|
|
130 |
if file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
|
131 |
pil_image = Image.open(file)
|
132 |
ocr_prediction = ocr_processor.process_image(pil_image)
|
133 |
+
# getting text from the OCR object
|
134 |
for idx in range(len((list(ocr_prediction)[0][1]))):
|
135 |
final_text += " "
|
136 |
final_text += list((list(ocr_prediction)[0][1])[idx])[1][1]
|
137 |
elif file.name.lower().endswith('.pdf'):
|
138 |
ocr_prediction = ocr_processor.process_pdf(file.name)
|
139 |
+
# getting text from the OCR object
|
140 |
for idx in range(len((list(ocr_prediction)[0][1]))):
|
141 |
final_text += " "
|
142 |
final_text += list((list(ocr_prediction)[0][1])[idx])[1][1]
|
|
|
144 |
final_text += "\nUnsupported file type."
|
145 |
print("OCR Text: ", final_text)
|
146 |
if audio is not None:
|
147 |
+
audio_text = process_audio_to_text(audio)
|
|
|
148 |
final_text += "\n" + audio_text
|
149 |
|
150 |
+
final_text_with_producetext = final_text + producetext
|
151 |
|
152 |
response = co.generate(
|
153 |
model='c4ai-aya',
|
|
|
167 |
)
|
168 |
processed_text = response.generations[0].text
|
169 |
|
170 |
+
audio_output = process_text_to_audio(processed_text)
|
171 |
+
|
172 |
+
return processed_text, audio_output
|
173 |
+
# Define Gradio interface
|
174 |
+
iface = gr.Interface(
|
175 |
+
fn=process_input,
|
176 |
+
inputs=[
|
177 |
+
gr.Image(type="pil", label="Camera Input"),
|
178 |
+
gr.File(label="File Upload"),
|
179 |
+
gr.Audio(sources="microphone", type="filepath", label="Mic Input"),
|
180 |
+
gr.Textbox(lines=2, label="Text Input"),
|
181 |
+
# gr.Dropdown(choices=TEXT_SOURCE_LANGUAGE_NAMES, label="Input Language"),
|
182 |
+
# gr.Dropdown(choices=TEXT_SOURCE_LANGUAGE_NAMES, label="Target Language")
|
183 |
+
gr.Dropdown(choices=df["name"].to_list(), label="Input Language"),
|
184 |
+
gr.Dropdown(choices=df["name"].to_list(), label="Target Language")
|
185 |
+
],
|
186 |
+
outputs=[
|
187 |
+
RichTextbox(label="Processed Text"),
|
188 |
+
gr.Audio(label="Audio Output")
|
189 |
+
],
|
190 |
+
title=title,
|
191 |
+
description=description
|
192 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
if __name__ == "__main__":
|
195 |
+
iface.launch()
|
196 |
+
|
197 |
+
|
198 |
+
# co = cohere.Client('yhA228YGeZSl1ctten8LQxw2dky2nngHetXFjV2Q') # This is your trial API key
|
199 |
+
# response = co.generate(
|
200 |
+
# model='c4ai-aya',
|
201 |
+
# prompt='एक यांत्रिक घड़ी दिन के समय को प्रदान करने के लिए एक गैर-इलेक्ट्रॉनिक तंत्र का उपयोग करती है। एक मुख्य स्प्रिंग का उपयोग यांत्रिक तंत्र को ऊर्जा संग्रहीत करने के लिए किया जाता है। एक यांत्रिक घड़ी में दांतों का एक कुंडल होता है जो धीरे-धीरे मुख्य स्प्रिंग से संचालित होता है। दांतों के कुंडल को एक यांत्रिक तंत्र में स्थानांतरित करने के लिए पहियों की एक श्रृंखला का उपयोग किया जाता है जो हाथों को घड़ी के चेहरे पर दाईं ओर ले जाता है। घड़ी के तंत्र को स्थिर करने और यह सुनिश्चित करने के लिए कि हाथ सही दिशा में घूमते हैं, एक कंपन का उपयोग किया जाता है। ',
|
202 |
+
# max_tokens=3674,
|
203 |
+
# temperature=0.9,
|
204 |
+
# k=0,
|
205 |
+
# stop_sequences=[],
|
206 |
+
# return_likelihoods='NONE')
|
207 |
+
# print('Prediction: {}'.format(response.generations[0].text))
|
208 |
+
|
209 |
+
# client = Client("https://facebook-seamless-m4t-v2-large.hf.space/--replicas/nq5nn/")
|
210 |
+
# result = client.predict(
|
211 |
+
# https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav, # filepath in 'Input speech' Audio component
|
212 |
+
# Afrikaans, # Literal[Afrikaans, Amharic, Armenian, Assamese, Basque, Belarusian, Bengali, Bosnian, Bulgarian, Burmese, Cantonese, Catalan, Cebuano, Central Kurdish, Croatian, Czech, Danish, Dutch, Egyptian Arabic, English, Estonian, Finnish, French, Galician, Ganda, Georgian, German, Greek, Gujarati, Halh Mongolian, Hebrew, Hindi, Hungarian, Icelandic, Igbo, Indonesian, Irish, Italian, Japanese, Javanese, Kannada, Kazakh, Khmer, Korean, Kyrgyz, Lao, Lithuanian, Luo, Macedonian, Maithili, Malayalam, Maltese, Mandarin Chinese, Marathi, Meitei, Modern Standard Arabic, Moroccan Arabic, Nepali, North Azerbaijani, Northern Uzbek, Norwegian Bokmål, Norwegian Nynorsk, Nyanja, Odia, Polish, Portuguese, Punjabi, Romanian, Russian, Serbian, Shona, Sindhi, Slovak, Slovenian, Somali, Southern Pashto, Spanish, Standard Latvian, Standard Malay, Swahili, Swedish, Tagalog, Tajik, Tamil, Telugu, Thai, Turkish, Ukrainian, Urdu, Vietnamese, Welsh, West Central Oromo, Western Persian, Yoruba, Zulu] in 'Source language' Dropdown component
|
213 |
+
# Bengali, # Literal[Bengali, Catalan, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Hindi, Indonesian, Italian, Japanese, Korean, Maltese, Mandarin Chinese, Modern Standard Arabic, Northern Uzbek, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swahili, Swedish, Tagalog, Telugu, Thai, Turkish, Ukrainian, Urdu, Vietnamese, Welsh, Western Persian] in 'Target language' Dropdown component
|
214 |
+
# api_name="/s2st"
|
215 |
+
# )
|
216 |
+
# print(result)
|
217 |
+
|
218 |
+
# co = cohere.Client('yhA228YGeZSl1ctten8LQxw2dky2nngHetXFjV2Q')
|
219 |
+
# response = co.generate(
|
220 |
+
# model='command-nightly',
|
221 |
+
# prompt='Les mécanismes de montres mécaniques\n\nLes mécanismes de montres mécaniques sont des mécanismes qui indiquent la journée, mais pas l\'électronique. Elles utilisent un ressort principal pour stocker l\'énergie nécessaire au fonctionnement des mécanismes. Un train d\'engrenages est utilisé pour transférer l\'énergie du ressort principal à un ensemble de roues qui font tourner les aiguilles dans le sens horaire sur le cadran de la montre.\n\nLes mécanismes de montres mécaniques sontdakshineswar omkarnathji, qui sont des lieux de culte qui sont construits dans le temple. Les engrenages sont des roues qui sont utilisées pour transférer l\'énergie du ressort principal à un ensemble de roues qui font tourner les aiguilles dans le sens horaire sur le cadran de la montre.\n\nLe ressort principal est un ressort qui est utilisé pour stocker l\'énergie nécessaire au fonctionnement des mécanismes de la montre. Le ressort principal est un ressort qui est utilisé pour stocker l\'énergie nécessaire au fonctionnement des mécanismes de la montre, et il est utilisé pour transférer l\'énergie aux engrenages, qui sont des roues qui sont utilisées pour faire tourner les aiguilles dans le sens horaire sur le cadran de la montre.\n\nLes engrenages sont des roues qui sont utilisées pour faire tourner les aiguilles dans le sens horaire sur le cadran de la montre, et elles sont utilisées pour transférer l\'énergie du ressort principal aux roues qui font tourner les aiguilles dans le sens horaire sur le cadran de la montre.\n\nLes mécanismes de montres mécaniques sont des mécanismes qui indiquent la journée, et elles sont utilisées pour transférer l\'énergie du ressort principal à un ensemble de roues qui font tourner les aiguilles dans le sens horaire sur le cadran de la montre.\n\nLes mécanismes de montres mécaniques sont des mécanismes qui indiquent la journée, et elles sont utilisées pour transférer l\'énergie du ressort principal à un ensemble de roues qui font tourner les 
aiguilles dans le sens horaire sur le cadran de la montre, et elles sont utilisées pour stabiliser le mécanisme de la montre, et pour s\'assurer que les aiguilles tournent dans le bon sens.\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:',
|
222 |
+
# max_tokens=7294,
|
223 |
+
# temperature=0.6,
|
224 |
+
# k=0,
|
225 |
+
# stop_sequences=[],
|
226 |
+
# return_likelihoods='NONE')
|
227 |
+
# print('Prediction: {}'.format(response.generations[0].text))
|
228 |
+
# example = RichTextbox().example_inputs()
|
229 |
+
|
230 |
+
|
231 |
+
|
232 |
+
# iface = gr.Interface(
|
233 |
+
# fn=process_input,
|
234 |
+
# inputs=[
|
235 |
+
# gr.Image(type="pil", label="Camera Input"),
|
236 |
+
# gr.File(label="File Upload"),
|
237 |
+
# gr.Audio(sources="microphone", type="filepath", label="Mic Input"),
|
238 |
+
# gr.Textbox(lines=2, label="Text Input"),
|
239 |
+
# gr.Dropdown(choices=TEXT_SOURCE_LANGUAGE_NAMES, label="Input Language"),
|
240 |
+
# gr.Dropdown(choices=TEXT_SOURCE_LANGUAGE_NAMES, label="Target Language")
|
241 |
+
# ],
|
242 |
+
# outputs=[
|
243 |
+
# gr.RichTextbox(label="Processed Text"),
|
244 |
+
# gr.Audio(label="Audio Output")
|
245 |
+
# ],
|
246 |
+
# title="OCR and Speech Processing App",
|
247 |
+
# description="This app processes images, PDFs, and audio inputs to generate text and audio outputs."
|
248 |
+
# )
|
249 |
+
|
250 |
+
# if __name__ == "__main__":
|
251 |
+
# # iface.launch()
|
252 |
+
|
253 |
+
# demo = gr.Interface(
|
254 |
+
# lambda x:x,
|
255 |
+
# RichTextbox(), # interactive version of your component
|
256 |
+
# RichTextbox(), # static version of your component
|
257 |
+
# examples=[[example]], # uncomment this line to view the "example version" of your component
|
258 |
+
# )
|
259 |
+
|
260 |
+
|
261 |
+
# if __name__ == "__main__":
|
262 |
+
# demo.launch()
|
ayatonic.env
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
CO_API_KEY=KQBPf0H0ENZESIC5nuUJ4i4jjg34xMPAkYK7s31W
|
2 |
+
SEAMLESSM4T=https://facebook-seamless-m4t-v2-large.hf.space/--replicas/v4gsf/
|
audio_files/audio_3505178120260920029.wav → languages.json
RENAMED
File without changes
|
requirements.txt
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
gradio
|
2 |
gradio_rich_textbox
|
3 |
gradio-client
|
|
|
|
|
|
|
4 |
torchvision
|
5 |
torch
|
6 |
python-dotenv
|
7 |
-
pandas
|
8 |
-
pydub
|
9 |
-
cohere
|
10 |
-
surya-ocr
|
11 |
-
pillow
|
|
|
1 |
gradio
|
2 |
gradio_rich_textbox
|
3 |
gradio-client
|
4 |
+
cohere
|
5 |
+
surya-ocr
|
6 |
+
pillow
|
7 |
torchvision
|
8 |
torch
|
9 |
python-dotenv
|
10 |
+
pandas
|
|
|
|
|
|
|
|
script.py
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
from gradio_client import Client
|
2 |
-
|
3 |
-
client = Client("https://facebook-seamless-m4t-v2-large.hf.space/--replicas/v4gsf/")
|
4 |
-
result = client.predict(
|
5 |
-
"Hello my name is tonic!", # str in 'Input text' Textbox component
|
6 |
-
"English", # Literal[Afrikaans, Amharic, Armenian, Assamese, Basque, Belarusian, Bengali, Bosnian, Bulgarian, Burmese, Cantonese, Catalan, Cebuano, Central Kurdish, Croatian, Czech, Danish, Dutch, Egyptian Arabic, English, Estonian, Finnish, French, Galician, Ganda, Georgian, German, Greek, Gujarati, Halh Mongolian, Hebrew, Hindi, Hungarian, Icelandic, Igbo, Indonesian, Irish, Italian, Japanese, Javanese, Kannada, Kazakh, Khmer, Korean, Kyrgyz, Lao, Lithuanian, Luo, Macedonian, Maithili, Malayalam, Maltese, Mandarin Chinese, Marathi, Meitei, Modern Standard Arabic, Moroccan Arabic, Nepali, North Azerbaijani, Northern Uzbek, Norwegian Bokmål, Norwegian Nynorsk, Nyanja, Odia, Polish, Portuguese, Punjabi, Romanian, Russian, Serbian, Shona, Sindhi, Slovak, Slovenian, Somali, Southern Pashto, Spanish, Standard Latvian, Standard Malay, Swahili, Swedish, Tagalog, Tajik, Tamil, Telugu, Thai, Turkish, Ukrainian, Urdu, Vietnamese, Welsh, West Central Oromo, Western Persian, Yoruba, Zulu] in 'Source language' Dropdown component
|
7 |
-
"French", # Literal[Bengali, Catalan, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Hindi, Indonesian, Italian, Japanese, Korean, Maltese, Mandarin Chinese, Modern Standard Arabic, Northern Uzbek, Polish, Portuguese, Romanian, Russian, Slovak, Spanish, Swahili, Swedish, Tagalog, Telugu, Thai, Turkish, Ukrainian, Urdu, Vietnamese, Welsh, Western Persian] in 'Target language' Dropdown component
|
8 |
-
api_name="/t2st"
|
9 |
-
)
|
10 |
-
print(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|