import whisper
import os
import json
import torchaudio
import argparse
import torch
from config import config


import yaml

# Read the project-wide YAML configuration once, when the module is loaded.
with open('config.yml', mode="r", encoding="utf-8") as config_file:
    configyml = yaml.load(config_file, Loader=yaml.FullLoader)

# Speaker/model name: the configured dataset path with every "Data/"
# occurrence removed.
model_name = configyml["dataset_path"].replace("Data/", "")

# Default table mapping a Whisper language code to the tag that is written in
# front of the transcript. The __main__ section may replace it depending on
# the --languages option.
lang2token = {"zh": "ZH|", "ja": "JP|", "en": "EN|"}
def transcribe_one(audio_path):
    """Detect the spoken language of one audio file and transcribe it.

    The audio is padded or trimmed to Whisper's 30-second window, so longer
    recordings are only transcribed up to that length. The function relies on
    the module-level ``model`` (assigned in the ``__main__`` section) and
    prints both the detected language and the recognized text.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(lang, text)`` tuple: the most probable language key reported by
        ``model.detect_language`` and the decoded text.
    """
    # Load the file and fit it to the 30-second window in one step.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio_path))

    # The log-Mel spectrogram has to live on the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Language identification: keep the key with the highest probability.
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    lang = max(probs, key=probs.get)

    # Beam-search decoding of the spectrogram.
    decoded = whisper.decode(model, mel, whisper.DecodingOptions(beam_size=5))

    print(decoded.text)
    return lang, decoded.text
if __name__ == "__main__":
    # Script entry point: transcribe every audio file found directly inside
    # the raw-audio directory and write one annotation line per accepted file:
    #     ./Data/<model_name>/wavs/<file>|<model_name>|<LANG>|<text>
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJ")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()

    # Each accepted --languages value selects which detected languages are
    # kept and the tag written in front of their transcripts.
    language_sets = {
        "CJE": {"zh": "ZH|", "ja": "JP|", "en": "EN|"},
        "CJ": {"zh": "ZH|", "ja": "JP|"},
        "C": {"zh": "ZH|"},
    }
    if args.languages in language_sets:
        lang2token = language_sets[args.languages]
    else:
        # Any other value keeps the module-level default table, exactly as
        # before, but this is now reported instead of happening silently.
        print(f"Warning: unknown --languages value {args.languages!r}, "
              f"keeping default languages: {', '.join(lang2token)}")

    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not torch.cuda.is_available():
        raise RuntimeError("Please enable GPU in order to run Whisper!")

    # `model` must stay a module-level name: transcribe_one() reads it.
    model = whisper.load_model(args.whisper_size)

    parent_dir = config.resample_config.in_dir
    parent_dir = parent_dir.replace("/audios", "")
    print(parent_dir)
    speaker = model_name

    # Only files directly inside parent_dir are processed. Taking the first
    # os.walk() entry with a default avoids an IndexError when the directory
    # does not exist, and avoids walking the whole tree for nothing.
    _, _, wavfiles = next(os.walk(parent_dir), (parent_dir, [], []))
    # Progress is reported against the files that are actually iterated.
    total_files = len(wavfiles)

    speaker_annos = []
    processed_files = 0
    for wavfile in wavfiles:
        # NOTE(review): files are listed from parent_dir but read from the
        # hard-coded ./Data/<speaker>/raw/ path — confirm both always refer
        # to the same directory.
        try:
            lang, text = transcribe_one(f"./Data/{speaker}/raw/{wavfile}")
        except Exception as e:
            # Best effort: a file that cannot be loaded or decoded is
            # reported by name and skipped; the run continues.
            print(f"Failed to transcribe {wavfile}: {e}")
            continue

        if lang not in lang2token:
            print(f"{lang} not supported, ignoring\n")
            continue

        speaker_annos.append(
            f"./Data/{model_name}/wavs/{wavfile}|{model_name}|{lang2token[lang]}{text}\n"
        )
        processed_files += 1
        print(f"Processed: {processed_files}/{total_files}")

    if not speaker_annos:
        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")

    # The annotation file is always (re)written, even when it ends up empty.
    with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f:
        f.writelines(speaker_annos)