# Source: Hugging Face Space "MultiTransformer / AyaTonic".
# Space status when captured: Runtime error.
# File size: 13,344 bytes.

import gradio as gr
from gradio_rich_textbox import RichTextbox
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from lang_list import TEXT_SOURCE_LANGUAGE_NAMES , LANGUAGE_NAME_TO_CODE , text_source_language_codes
from gradio_client import Client
from dotenv import load_dotenv
import requests
from io import BytesIO  
import cohere
import os
import re
import pandas as pd
import pydub
from pydub import AudioSegment
from pydub.utils import make_chunks

# Markdown heading and tagline rendered at the top of the Gradio UI (see main()).
title = "# Welcome to AyaTonic"
description = "Learn a New Language With Aya"
# Load environment variables before they are read below.
load_dotenv()
COHERE_API_KEY = os.getenv('CO_API_KEY')
SEAMLESSM4T = os.getenv('SEAMLESSM4T')
# Language table: the "name" column fills the language dropdowns, and
# get_language_code() reads the "code" column.
df = pd.read_csv("lang_list.csv")
choices = df["name"].to_list()
inputlanguage = ""
# Prompt suffixes appended to the text sent to the Cohere models.
# These are plain strings, not f-strings: the "{target_language}" and
# "{inputlanguage}" placeholders must be substituted by whoever builds the prompt.
producetext =  "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
# NOTE(review): the trailing apostrophe inside this prompt looks like a typo — confirm.
translatetextinst = "\n\nthe above text is a learning aid. you must use markdown format to translate the above into {inputlanguage} :'"
# One regular expression per highlight color; each captures the text wrapped
# in a <span> styled with that color.
patterns = {
    color: f'<span style="color: {color};">(.*?)</span>'
    for color in ("red", "blue", "green")
}

# Per-color result buckets, each starting out as its own empty list.
matches = {color: [] for color in patterns}

# Shared service clients, created once when the module is imported.
# Cohere client authenticated with COHERE_API_KEY.
co = cohere.Client(COHERE_API_KEY)
# Gradio client for the endpoint named by the SEAMLESSM4T environment variable.
audio_client = Client(SEAMLESSM4T)

def get_language_code(language_name):
    """
    Look up `language_name` in the language table and return the first two
    characters of the code stored for it.
    """
    name_matches = df['name'] == language_name
    matching_codes = df.loc[name_matches, 'code'].values
    full_code = matching_codes[0]
    return full_code[:2]

def translate_text(text, instructions=translatetextinst, translatefrom=None, translateto=None):
    """
    Translate `text` with the Cohere `c4ai-aya` model.

    Args:
        text: The text to translate.
        instructions: Prompt suffix appended directly after `text`. It may
            contain the literal placeholder "{inputlanguage}".
        translatefrom: Name of the learner's native language. When given, it
            replaces "{inputlanguage}" in `instructions`, i.e. it is the
            language the text is translated into.
        translateto: Name of the language being learned. Accepted so callers
            can pass both languages by keyword; it is not used in the prompt.

    Returns:
        The text of the first generation returned by the model.
    """
    if translatefrom:
        # str.replace (not str.format) so that any other braces in a
        # caller-supplied instruction string are left untouched.
        instructions = instructions.replace("{inputlanguage}", translatefrom)
    prompt = f"{text}{instructions}"
    response = co.generate(
        model='c4ai-aya',
        prompt=prompt,
        max_tokens=2986,
        temperature=0.6,
        k=0,
        stop_sequences=[],
        return_likelihoods='NONE'
    )
    return response.generations[0].text

class LongAudioProcessor:
    """Transcribe an audio file by sending fixed-length chunks to the "/s2tt" endpoint."""

    def __init__(self, audio_client, api_key=None):
        """
        Args:
            audio_client: Object exposing `predict(...)`; used for every chunk.
            api_key: Stored on the instance; not used by the methods below.
        """
        self.client = audio_client
        self.api_key = api_key

    def process_audio_to_text(self, audio_path, inputlanguage="English", outputlanguage="English"):
        """
        Send one audio file to the "/s2tt" endpoint of the stored client.

        Returns:
            The first element of the client's response.
        """
        result = self.client.predict(
            audio_path,
            inputlanguage,
            outputlanguage,
            api_name="/s2tt"
        )
        return result[0]

    def process_long_audio(self, audio_path, chunk_length_ms=20000,
                           inputlanguage="English", outputlanguage="English"):
        """
        Split the audio at `audio_path` into chunks and transcribe each one.

        Args:
            audio_path: Path of the audio file to transcribe.
            chunk_length_ms: Length of each chunk in milliseconds.
            inputlanguage: Language name forwarded to the endpoint as the
                source language.
            outputlanguage: Language name forwarded to the endpoint as the
                target language.

        Returns:
            The chunk transcripts joined with single spaces. A chunk whose
            transcription fails is reported on stdout and skipped
            (best-effort), so the result may be partial.
        """
        import tempfile

        audio = AudioSegment.from_file(audio_path)
        chunks = make_chunks(audio, chunk_length_ms)
        transcripts = []
        # A private temporary directory keeps concurrent requests from
        # overwriting each other's chunk files and guarantees cleanup.
        with tempfile.TemporaryDirectory() as chunk_dir:
            for i, chunk in enumerate(chunks):
                chunk_name = os.path.join(chunk_dir, f"chunk{i}.wav")
                chunk.export(chunk_name, format="wav")
                try:
                    result = self.process_audio_to_text(chunk_name, inputlanguage, outputlanguage)
                    transcripts.append(result.strip())
                except Exception as e:
                    print(f"Error processing {chunk_name}: {e}")
        return " ".join(transcripts).strip()
class TaggedPhraseExtractor:
    """Collect the phrases captured by one regular expression per color label."""

    def __init__(self, text=''):
        self.text = text
        self.patterns = {}

    def set_text(self, text):
        """Set the text to search within."""
        self.text = text

    def add_pattern(self, color, pattern):
        """Add a new color and its associated pattern."""
        self.patterns[color] = pattern

    def _find_phrases(self, pattern):
        """Return every match of `pattern` in the text, in order of appearance."""
        return re.findall(pattern, self.text)

    @staticmethod
    def _three_longest(phrases):
        """Return up to three phrases, longest first (ties keep text order)."""
        return sorted(phrases, key=len, reverse=True)[:3]

    def extract_phrases(self):
        """
        Return a dict mapping each registered color to its three longest
        matched phrases (fewer if the text has fewer matches).
        """
        return {
            color: self._three_longest(self._find_phrases(pattern))
            for color, pattern in self.patterns.items()
        }

    def print_phrases(self):
        """Print, for each color, every matched phrase and then the three longest."""
        for color, pattern in self.patterns.items():
            found_phrases = self._find_phrases(pattern)
            print(f"Phrases with color {color}:")
            for phrase in found_phrases:
                print(f"- {phrase}")
            print(f"\nThree longest phrases for color {color}:")
            for phrase in self._three_longest(found_phrases):
                print(f"- {phrase}")
            print()
            
def process_audio_to_text(audio_path, inputlanguage="English", outputlanguage="English"):
    """
    Send an audio file to the Gradio client's "/s2tt" endpoint, log the raw
    response, and return its first element.
    """
    s2tt_client = Client(SEAMLESSM4T)
    response = s2tt_client.predict(audio_path, inputlanguage, outputlanguage, api_name="/s2tt")
    print("Audio Result: ", response)
    transcript = response[0]
    return transcript

def process_text_to_audio(text, translatefrom="English", translateto="English"):
    """
    Send text to the Gradio client's "/t2st" endpoint and return the first
    element of its response.
    """
    t2st_client = Client(SEAMLESSM4T)
    response = t2st_client.predict(text, translatefrom, translateto, api_name="/t2st")
    return response[0]

class OCRProcessor:
    """Wrapper around surya's `run_ocr`; loads the detection and recognition models on construction."""

    def __init__(self, lang_code=None):
        """
        Args:
            lang_code: A language code string (e.g. "en") or a sequence of
                codes. Defaults to ["en"] when omitted.
        """
        self.lang_code = self._normalize_langs(lang_code)
        self.det_processor, self.det_model = load_det_processor(), load_det_model()
        self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()

    @staticmethod
    def _normalize_langs(lang_code):
        """Return the language codes as a fresh list: None -> ["en"], "xx" -> ["xx"]."""
        if lang_code is None:
            return ["en"]
        if isinstance(lang_code, str):
            # A bare string would otherwise be passed to run_ocr as if it were
            # a list of codes.
            return [lang_code]
        return list(lang_code)

    def process_image(self, image):
        """
        Run OCR on one PIL image and return the first prediction from `run_ocr`.
        """
        # run_ocr receives one list of language codes per input image.
        predictions = run_ocr([image], [self.lang_code], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]

    def process_pdf(self, pdf_path):
        """
        Pass `pdf_path` to `run_ocr` and return the first prediction.
        """
        # NOTE(review): the path string itself is handed to run_ocr in place of
        # an image — confirm that run_ocr accepts PDF paths; it may need the
        # pages rendered to images first.
        predictions = run_ocr([pdf_path], [self.lang_code], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]
    
def _ocr_prediction_to_text(ocr_prediction):
    """Concatenate the line texts of one OCR prediction, each preceded by a space."""
    # The prediction is unpacked positionally: the value of its first field is
    # the list of lines, and the value of each line's second field is its text.
    # NOTE(review): this relies on the field order of the surya result
    # objects — confirm against the installed surya version.
    text_lines = list(ocr_prediction)[0][1]
    return "".join(" " + list(line)[1][1] for line in text_lines)


def process_input(image=None, file=None, audio=None, text="", translateto="English", translatefrom="English"):
    """
    Turn the user's text, image, file, and audio inputs into a lesson.

    Steps: gather text from all inputs (OCR for images/PDFs, speech-to-text
    for audio), ask `c4ai-aya` for a blog post in the target language, ask
    `command-nightly` to color-tag it, synthesise audio for the tagged text,
    then extract the longest tagged phrases and produce a translation plus
    target- and native-language audio for each.

    Args:
        image: PIL image, or None.
        file: Uploaded file object with a `.name` path (.png/.jpg/.jpeg/.pdf), or None.
        audio: Path to a recorded audio file, or None.
        text: Free text typed by the user.
        translateto: Name of the language being learned.
        translatefrom: Name of the learner's native language.

    Returns:
        A 5-tuple `(final_text, audio_output, top_phrases, translations,
        audio_outputs)`, where `audio_outputs` is a list of
        `(target_audio, native_audio)` pairs aligned with `top_phrases`.
    """
    final_text = text or ""

    if image is not None or file is not None:
        # Loading the OCR models is expensive, so only do it when there is
        # something to read. The language code is wrapped in a list because
        # OCRProcessor forwards it to run_ocr as one list of codes per image.
        ocr_processor = OCRProcessor([get_language_code(translatefrom)])
    if image is not None:
        final_text += _ocr_prediction_to_text(ocr_processor.process_image(image))
    if file is not None:
        file_name = file.name.lower()
        if file_name.endswith(('.png', '.jpg', '.jpeg')):
            pil_image = Image.open(file)
            final_text += _ocr_prediction_to_text(ocr_processor.process_image(pil_image))
        elif file_name.endswith('.pdf'):
            final_text += _ocr_prediction_to_text(ocr_processor.process_pdf(file.name))
        else:
            final_text += "\nUnsupported file type."
    print("OCR Text: ", final_text)

    if audio is not None:
        # Transcribe through the module-level helper so the selected languages
        # are forwarded. NOTE(review): the recording is sent in one piece;
        # very long recordings may need chunking first.
        audio_text = process_audio_to_text(audio, inputlanguage=translatefrom, outputlanguage=translateto)
        final_text += "\n" + audio_text

    # The prompt templates are plain strings, so fill the placeholder here.
    final_text_with_producetext = final_text + producetext.replace("{target_language}", translateto)

    response = co.generate(
        model='c4ai-aya',
        prompt=final_text_with_producetext,
        max_tokens=1024,
        temperature=0.5
    )
    # add graceful handling for errors (overflow)
    generated_text = response.generations[0].text
    print("Generated Text: ", generated_text)
    generated_text_with_format = generated_text + "\n" + formatinputstring
    response = co.generate(
        model='command-nightly',
        prompt=generated_text_with_format,
        max_tokens=4000,
        temperature=0.5
    )
    processed_text = response.generations[0].text

    audio_output = process_text_to_audio(processed_text, translateto, translateto)

    # The color tags are only present in the model's formatted output, so the
    # phrases must be extracted from it rather than from the raw input text.
    extractor = TaggedPhraseExtractor(processed_text)
    for color, pattern in patterns.items():
        extractor.add_pattern(color, pattern)
    tagged_phrases = extractor.extract_phrases()
    top_phrases = [phrase for phrases in tagged_phrases.values() for phrase in phrases]

    # Phrases are in the target language; translate them into the native one.
    native_instructions = translatetextinst.replace("{inputlanguage}", translatefrom)
    audio_outputs = []
    translations = []
    for phrase in top_phrases:
        translated_phrase = translate_text(phrase, instructions=native_instructions)
        translations.append(translated_phrase)
        target_audio = process_text_to_audio(phrase, translatefrom=translateto, translateto=translateto)
        native_audio = process_text_to_audio(translated_phrase, translatefrom=translatefrom, translateto=translatefrom)
        audio_outputs.append((target_audio, native_audio))

    # NOTE(review): the first element is the gathered input text, not the
    # color-tagged `processed_text` — confirm which one the UI should display.
    return final_text, audio_output, top_phrases, translations, audio_outputs

def main():
    """Build the AyaTonic Gradio interface, wire up the button, and launch the app."""
    with gr.Blocks() as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        
        with gr.Row():
            input_language = gr.Dropdown(choices=choices, label="Your Native Language")
            target_language = gr.Dropdown(choices=choices, label="Language To Learn")
        
        with gr.Accordion("Talk To 🌟AyaTonic"):
            with gr.Tab("🤙🏻Audio & Text"):
                audio_input = gr.Audio(sources="microphone", type="filepath", label="Mic Input")
                text_input = gr.Textbox(lines=2, label="Text Input")
            with gr.Tab("📸Image & File"):
                image_input = gr.Image(type="pil", label="Camera Input")
                file_input = gr.File(label="File Upload")
        
        process_button = gr.Button("🌟AyaTonic")
        
        # RichTextbox comes from the gradio_rich_textbox package, not from `gr`.
        processed_text_output = RichTextbox(label="Processed Text")
        longest_phrases_1 = gr.Textbox(label="Focus")
        translated_phrases_output_1 = gr.Textbox(label="Translated Phrases")
        audio_output_native_phrase_1 = gr.Audio(label="Audio Output (Native Language)")
        audio_output_target_phrase_1 = gr.Audio(label="Audio Output (Target Language)")
        longest_phrases_2 = gr.Textbox(label="Focus")
        translated_phrases_output_2 = gr.Textbox(label="Translated Phrases")
        audio_output_native_phrase_2 = gr.Audio(label="Audio Output (Native Language)")
        audio_output_target_phrase_2 = gr.Audio(label="Audio Output (Target Language)")
        longest_phrases_3 = gr.Textbox(label="Focus")
        translated_phrases_output_3 = gr.Textbox(label="Translated Phrases")
        audio_output_native_phrase_3 = gr.Audio(label="Audio Output (Native Language)")
        audio_output_target_phrase_3 = gr.Audio(label="Audio Output (Target Language)")
        
        def update_outputs(image, file, audio, text, input_language, target_language):
            """
            Run process_input and arrange its results in the exact order of
            the `outputs` list passed to `process_button.click` below.
            """
            # process_input returns five values; the audio for the whole
            # processed text has no component in this layout and is dropped.
            final_text, _full_audio, top_phrases, translations, audio_outputs = process_input(
                image=image, file=file, audio=audio, text=text, 
                translatefrom=input_language, translateto=target_language
            )

            def first_three(items, filler):
                """Return exactly three entries, padding with `filler` when short."""
                head = list(items[:3])
                return head + [filler] * (3 - len(head))

            # The UI has exactly three phrase slots; one value must be
            # returned per output component even when fewer phrases exist.
            phrases = first_three(top_phrases, "")
            translated = first_three(translations, "")
            audio_pairs = first_three(audio_outputs, (None, None))

            results = [final_text]
            # Each pair is (target_audio, native_audio); the outputs list
            # expects native before target for every slot.
            for target_audio, native_audio in audio_pairs:
                results.extend((native_audio, target_audio))
            for phrase, translation in zip(phrases, translated):
                results.extend((phrase, translation))
            return tuple(results)
       
        process_button.click(
            fn=update_outputs,
            inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
            outputs=[
                processed_text_output, 
                audio_output_native_phrase_1, audio_output_target_phrase_1, 
                audio_output_native_phrase_2, audio_output_target_phrase_2, 
                audio_output_native_phrase_3, audio_output_target_phrase_3, 
                longest_phrases_1, translated_phrases_output_1, 
                longest_phrases_2, translated_phrases_output_2, 
                longest_phrases_3, translated_phrases_output_3
            ]
        )

    # Without this the interface is built but never served.
    demo.launch()

# Call main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()