Spaces:
Runtime error
Runtime error
tonic
committed on
Commit
•
5701b30
1
Parent(s):
e115e9b
Improve the interface, add parsing for the longest phrases, apply a quick language-code fix for Surya, and add translation with Aya
Browse files- app.py +93 -26
- languages.json +0 -0
app.py
CHANGED
@@ -5,7 +5,7 @@ from surya.ocr import run_ocr
|
|
5 |
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
|
6 |
from surya.model.recognition.model import load_model as load_rec_model
|
7 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
8 |
-
|
9 |
from gradio_client import Client
|
10 |
from dotenv import load_dotenv
|
11 |
import requests
|
@@ -27,7 +27,7 @@ choices = df["name"].to_list()
|
|
27 |
inputlanguage = ""
|
28 |
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
|
29 |
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
|
30 |
-
|
31 |
# Regular expression patterns for each color
|
32 |
patterns = {
|
33 |
"red": r'<span style="color: red;">(.*?)</span>',
|
@@ -41,6 +41,35 @@ matches = {
|
|
41 |
"blue": [],
|
42 |
"green": [],
|
43 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
class TaggedPhraseExtractor:
|
45 |
def __init__(self, text=''):
|
46 |
self.text = text
|
@@ -55,24 +84,32 @@ class TaggedPhraseExtractor:
|
|
55 |
self.patterns[color] = pattern
|
56 |
|
57 |
def extract_phrases(self):
|
58 |
-
"""Extract phrases for all colors and patterns added."""
|
59 |
-
matches = {
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
def print_phrases(self):
|
63 |
-
"""Extract phrases and print them."""
|
64 |
matches = self.extract_phrases()
|
65 |
-
for color,
|
66 |
print(f"Phrases with color {color}:")
|
67 |
-
for phrase in
|
|
|
|
|
|
|
68 |
print(f"- {phrase}")
|
69 |
-
print()
|
70 |
|
71 |
-
|
72 |
-
audio_client = Client(SEAMLESSM4T)
|
73 |
-
client = Client(SEAMLESSM4T)
|
74 |
-
|
75 |
-
def process_audio_to_text(audio_path, inputlanguage="English"):
|
76 |
"""
|
77 |
Convert audio input to text using the Gradio client.
|
78 |
"""
|
@@ -80,7 +117,7 @@ def process_audio_to_text(audio_path, inputlanguage="English"):
|
|
80 |
result = audio_client.predict(
|
81 |
audio_path,
|
82 |
inputlanguage,
|
83 |
-
|
84 |
api_name="/s2tt"
|
85 |
)
|
86 |
print("Audio Result: ", result)
|
@@ -100,8 +137,8 @@ def process_text_to_audio(text, translatefrom="English", translateto="English"):
|
|
100 |
return result[0]
|
101 |
|
102 |
class OCRProcessor:
|
103 |
-
def __init__(self,
|
104 |
-
self.
|
105 |
self.det_processor, self.det_model = load_det_processor(), load_det_model()
|
106 |
self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()
|
107 |
|
@@ -109,18 +146,19 @@ class OCRProcessor:
|
|
109 |
"""
|
110 |
Process a PIL image and return the OCR text.
|
111 |
"""
|
112 |
-
predictions = run_ocr([image], [self.
|
113 |
-
return predictions[0]
|
114 |
|
115 |
def process_pdf(self, pdf_path):
|
116 |
"""
|
117 |
Process a PDF file and return the OCR text.
|
118 |
"""
|
119 |
-
predictions = run_ocr([pdf_path], [self.
|
120 |
-
return predictions[0]
|
121 |
|
122 |
def process_input(image=None, file=None, audio=None, text="", translateto = "English", translatefrom = "English" ):
|
123 |
-
|
|
|
124 |
final_text = text
|
125 |
if image is not None:
|
126 |
ocr_prediction = ocr_processor.process_image(image)
|
@@ -171,7 +209,20 @@ def process_input(image=None, file=None, audio=None, text="", translateto = "Eng
|
|
171 |
|
172 |
audio_output = process_text_to_audio(processed_text, translateto, translateto)
|
173 |
|
174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
def main():
|
177 |
with gr.Blocks() as demo:
|
@@ -193,12 +244,28 @@ def main():
|
|
193 |
process_button = gr.Button("🌟AyaTonic")
|
194 |
|
195 |
processed_text_output = RichTextbox(label="Processed Text")
|
196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
process_button.click(
|
199 |
-
fn=
|
200 |
inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
|
201 |
-
outputs=[processed_text_output,
|
202 |
)
|
203 |
|
204 |
if __name__ == "__main__":
|
|
|
5 |
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
|
6 |
from surya.model.recognition.model import load_model as load_rec_model
|
7 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
8 |
+
from lang_list import TEXT_SOURCE_LANGUAGE_NAMES , LANGUAGE_NAME_TO_CODE , text_source_language_codes
|
9 |
from gradio_client import Client
|
10 |
from dotenv import load_dotenv
|
11 |
import requests
|
|
|
27 |
inputlanguage = ""
|
28 |
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
|
29 |
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
|
30 |
+
translatetextinst = "\n\nthe above text is a learning aid. you must use markdown format to translate the above into {inputlanguage} :'"
|
31 |
# Regular expression patterns for each color
|
32 |
patterns = {
|
33 |
"red": r'<span style="color: red;">(.*?)</span>',
|
|
|
41 |
"blue": [],
|
42 |
"green": [],
|
43 |
}
|
44 |
+
|
45 |
+
co = cohere.Client(COHERE_API_KEY)
|
46 |
+
audio_client = Client(SEAMLESSM4T)
|
47 |
+
|
48 |
+
def get_language_code(language_name):
    """
    Return the first two characters of the code stored for a language name.

    The lookup reads the module-level ``df`` table: rows whose ``name``
    column equals ``language_name`` are selected and the ``code`` value of
    the first match is truncated to two characters.

    Args:
        language_name: Value to match against the ``name`` column of ``df``.

    Returns:
        The first two characters of the matching ``code`` value.

    Raises:
        IndexError: If no row of ``df`` has this name. The same exception
            type as the bare ``.values[0]`` lookup, but with a message that
            names the missing language.
    """
    codes = df.loc[df['name'] == language_name, 'code'].values
    if len(codes) == 0:
        raise IndexError(f"No language code found for {language_name!r}")
    return codes[0][:2]
|
54 |
+
|
55 |
+
def translate_text(text, instructions=translatetextinst, *, target_language=None):
    """
    Send ``text`` followed by translation instructions to the ``c4ai-aya`` model.

    Args:
        text (str): The initial text to translate.
        instructions (str): Prompt suffix appended directly after ``text``.
        target_language (str | None): When given, every ``{inputlanguage}``
            placeholder in ``instructions`` is replaced by this value before
            the prompt is built. When omitted the instructions are sent
            verbatim, as before.

    Returns:
        str: The text of the first generation in the model response.
    """
    if target_language is not None:
        # str.replace rather than str.format: the instructions may contain
        # other braces that must reach the model untouched.
        instructions = instructions.replace("{inputlanguage}", target_language)
    prompt = f"{text}{instructions}"
    response = co.generate(
        model='c4ai-aya',
        prompt=prompt,
        max_tokens=2986,
        temperature=0.6,
        k=0,
        stop_sequences=[],
        return_likelihoods='NONE'
    )
    return response.generations[0].text
|
72 |
+
|
73 |
class TaggedPhraseExtractor:
|
74 |
def __init__(self, text=''):
|
75 |
self.text = text
|
|
|
84 |
self.patterns[color] = pattern
|
85 |
|
86 |
def extract_phrases(self):
    """
    Extract phrases for every registered color pattern.

    Each pattern in ``self.patterns`` is applied to ``self.text`` with
    ``re.findall``.

    Returns:
        tuple: ``(matches, three_matches)`` where

        - ``matches`` maps each color to a dict with ``'all_phrases'`` (the
          matches in order of appearance) and ``'top_three_longest'`` (up
          to three matches of that color, longest first);
        - ``three_matches`` is a list of up to three of the longest matches
          across *all* colors, longest first. It is an empty list when
          nothing matched or no patterns are registered.
    """
    matches = {}
    every_phrase = []
    for color, pattern in self.patterns.items():
        found_phrases = re.findall(pattern, self.text)
        matches[color] = {
            'all_phrases': found_phrases,
            'top_three_longest': sorted(found_phrases, key=len, reverse=True)[:3],
        }
        # Collect across colors so the overall ranking is not limited to
        # whichever color happens to be iterated last.
        every_phrase.extend(found_phrases)
    three_matches = sorted(every_phrase, key=len, reverse=True)[:3]
    return matches, three_matches
|
99 |
|
100 |
def print_phrases(self):
    """
    Extract phrases and print them, including the three longest per color.

    For each color this prints a header, every matched phrase, a second
    header, and that color's ``'top_three_longest'`` phrases, followed by a
    blank line.
    """
    extracted = self.extract_phrases()
    # extract_phrases returns (matches, three_matches); only the per-color
    # mapping is printed here. A bare mapping is accepted as well.
    matches = extracted[0] if isinstance(extracted, tuple) else extracted
    for color, data in matches.items():
        print(f"Phrases with color {color}:")
        for phrase in data['all_phrases']:
            print(f"- {phrase}")
        print(f"\nThree longest phrases for color {color}:")
        for phrase in data['top_three_longest']:
            print(f"- {phrase}")
        print()
|
111 |
|
112 |
+
def process_audio_to_text(audio_path, inputlanguage="English", outputlanguage="English"):
|
|
|
|
|
|
|
|
|
113 |
"""
|
114 |
Convert audio input to text using the Gradio client.
|
115 |
"""
|
|
|
117 |
result = audio_client.predict(
|
118 |
audio_path,
|
119 |
inputlanguage,
|
120 |
+
outputlanguage,
|
121 |
api_name="/s2tt"
|
122 |
)
|
123 |
print("Audio Result: ", result)
|
|
|
137 |
return result[0]
|
138 |
|
139 |
class OCRProcessor:
|
140 |
+
def __init__(self, lang_code=["en"]):
    """
    Load the detection and recognition models and remember the OCR languages.

    Args:
        lang_code: Either a single language code string (e.g. ``"en"``) or
            an iterable of codes. It is stored as a fresh list in
            ``self.lang_code`` so that ``[self.lang_code]`` is one list of
            codes per image when handed to ``run_ocr``.
    """
    # A bare string would otherwise be wrapped as ["en"] instead of
    # [["en"]] at the run_ocr call sites; copying also keeps the shared
    # default list from being aliased by instances.
    if isinstance(lang_code, str):
        self.lang_code = [lang_code]
    else:
        self.lang_code = list(lang_code)
    self.det_processor, self.det_model = load_det_processor(), load_det_model()
    self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()
|
144 |
|
|
|
146 |
"""
|
147 |
Process a PIL image and return the OCR text.
|
148 |
"""
|
149 |
+
predictions = run_ocr([image], [self.lang_code], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
|
150 |
+
return predictions[0]
|
151 |
|
152 |
def process_pdf(self, pdf_path):
    """
    Run OCR on a PDF path and return the first prediction.

    ``pdf_path`` is passed to ``run_ocr`` as a one-element batch together
    with ``self.lang_code`` and the loaded detection/recognition models.
    """
    # NOTE(review): the path itself is handed to run_ocr; confirm that it
    # accepts file paths rather than already-loaded page images.
    ocr_results = run_ocr(
        [pdf_path],
        [self.lang_code],
        self.det_model,
        self.det_processor,
        self.rec_model,
        self.rec_processor,
    )
    return ocr_results[0]
|
158 |
|
159 |
def process_input(image=None, file=None, audio=None, text="", translateto = "English", translatefrom = "English" ):
|
160 |
+
lang_code = get_language_code(translatefrom)
|
161 |
+
ocr_processor = OCRProcessor(lang_code)
|
162 |
final_text = text
|
163 |
if image is not None:
|
164 |
ocr_prediction = ocr_processor.process_image(image)
|
|
|
209 |
|
210 |
audio_output = process_text_to_audio(processed_text, translateto, translateto)
|
211 |
|
212 |
+
extractor = TaggedPhraseExtractor(processed_text)
|
213 |
+
longest_phrases = extractor.get_longest_phrases()
|
214 |
+
|
215 |
+
# Translate the longest phrases back into the native language
|
216 |
+
translated_phrases = [translate_text(phrase, translateto, translatefrom) for phrase in longest_phrases]
|
217 |
+
|
218 |
+
# Convert the original and translated phrases to audio
|
219 |
+
audio_samples = {
|
220 |
+
"target_language": [text_to_audio(phrase, translateto) for phrase in longest_phrases],
|
221 |
+
"native_language": [text_to_audio(phrase, translatefrom) for phrase in translated_phrases]
|
222 |
+
}
|
223 |
+
|
224 |
+
return audio_output, processed_text, audio_samples, longest_phrases, translated_phrases
|
225 |
+
|
226 |
|
227 |
def main():
|
228 |
with gr.Blocks() as demo:
|
|
|
244 |
process_button = gr.Button("🌟AyaTonic")
|
245 |
|
246 |
processed_text_output = RichTextbox(label="Processed Text")
|
247 |
+
longest_phrases_1 = gr.Textbox(label="Focus")
|
248 |
+
translated_phrases_output_1 = gr.Textbox(label="Translated Phrases")
|
249 |
+
audio_output_native_phrase_1 = gr.Audio(label="Audio Output (Native Language)")
|
250 |
+
audio_output_target_phrase_1 = gr.Audio(label="Audio Output (Target Language)")
|
251 |
+
longest_phrases_2 = gr.Textbox(label="Focus")
|
252 |
+
translated_phrases_output_2 = gr.Textbox(label="Translated Phrases")
|
253 |
+
audio_output_native_phrase_2 = gr.Audio(label="Audio Output (Native Language)")
|
254 |
+
audio_output_target_phrase_2 = gr.Audio(label="Audio Output (Target Language)")
|
255 |
+
longest_phrases_3 = gr.Textbox(label="Focus")
|
256 |
+
translated_phrases_output_3 = gr.Textbox(label="Translated Phrases")
|
257 |
+
audio_output_native_phrase_3 = gr.Audio(label="Audio Output (Native Language)")
|
258 |
+
audio_output_target_phrase_3 = gr.Audio(label="Audio Output (Target Language)")
|
259 |
+
|
260 |
+
def update_outputs(image, file, audio, text, input_language, target_language):
    """
    Run ``process_input`` and spread its results over the 13 output widgets.

    Returns:
        tuple: ``processed_text`` followed by three groups of
        ``(native audio, target audio, longest phrase, translated phrase)``,
        in the same order as the ``outputs`` list of ``process_button.click``.
        Groups with no phrase available are filled with ``None`` for audio
        and ``""`` for text.
    """
    # process_input returns five values; the overall audio output has no
    # widget in the outputs list, so it is discarded here.
    # NOTE(review): arguments are forwarded positionally, so input_language
    # lands on process_input's `translateto` and target_language on
    # `translatefrom` - confirm this order is intended.
    _audio_output, processed_text, audio_samples, longest_phrases, translated_phrases = process_input(
        image, file, audio, text, input_language, target_language)

    slots = 3  # one slot per phrase group wired into process_button.click

    def pad(items, fill):
        # Truncate or right-pad to exactly `slots` entries.
        return (list(items) + [fill] * slots)[:slots]

    native_audio = pad(audio_samples['native_language'], None)
    target_audio = pad(audio_samples['target_language'], None)
    focus_phrases = pad(longest_phrases, "")
    translations = pad(translated_phrases, "")

    outputs = [processed_text]
    for group in zip(native_audio, target_audio, focus_phrases, translations):
        outputs.extend(group)
    return tuple(outputs)
|
264 |
|
265 |
process_button.click(
|
266 |
+
fn=update_outputs,
|
267 |
inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
|
268 |
+
outputs=[processed_text_output, audio_output_native_phrase_1, audio_output_target_phrase_1, longest_phrases_1, translated_phrases_output_1, audio_output_native_phrase_2, audio_output_target_phrase_2, longest_phrases_2, translated_phrases_output_2, audio_output_native_phrase_3, audio_output_target_phrase_3, longest_phrases_3, translated_phrases_output_3] #add education output
|
269 |
)
|
270 |
|
271 |
if __name__ == "__main__":
|
languages.json
DELETED
File without changes
|