diff --git a/.gitattributes b/.gitattributes index c7d9f3332a950355d5a77d85000f05e6f45435ea..78def07007c1e3cf19794a94c22e5291f6e1c2e6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -32,3 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/demo_short.gif filter=lfs diff=lfs merge=lfs -text +assets/demo.gif filter=lfs diff=lfs merge=lfs -text +assets/figure.jpg filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..8643f44007fc1998f20faeb407716f1cf390e985 --- /dev/null +++ b/.gitignore @@ -0,0 +1,133 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +image/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +annotator +cldm +ldm diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md
index 154df8298fab5ecf322016157858e08cd1bccbe1..262be212abed4bed83f0887ed2cfdeebdbc93495 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,91 @@
----
-license: apache-2.0
---
+# visual-chatgpt-zh-vits
+A Windows version of Visual ChatGPT with Chinese-language support
+
+Integrates a VITS inference (text-to-speech) module
+
+
+Official paper: [Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models](https://arxiv.org/abs/2303.04671)
+
+Official repository: [visual-chatgpt](https://github.com/microsoft/visual-chatgpt)
+
+Forked from: [visual-chatgpt-zh](https://github.com/wxj630/visual-chatgpt-zh)
+
+
+## Demo
+
+
+## System Architecture
+

+![Logo](./assets/figure.jpg)

+
+
+## Quick Start
+
+```
+# 1. Clone the code
+git clone https://github.com/FrankZxShen/visual-chatgpt-zh-vits.git
+
+# 2. Enter the project directory
+cd visual-chatgpt-zh-vits
+
+# 3. Create and activate a Python environment
+conda create -n visgpt python=3.8
+activate visgpt
+
+# 4. Install the dependencies
+pip install -r requirement.txt
+
+# 5. Set your OpenAI API key
+export OPENAI_API_KEY={Your_Private_Openai_Key}
+# On Windows, use set instead of export
+set OPENAI_API_KEY={Your_Private_Openai_Key}
+
+# 6. Download the Hugging Face models into the chosen directory
+# The model repositories are listed in hf_models
+# For VITS inference, place G.pth and config.json under vits_models (currently Japanese only?)
+# Windows: download the Windows build of pyopenjtalk and put it under text
+
+# 7. Launch the system; this example loads the ImageCaptioning and Text2Image models
+python visual_chatgpt_zh_vits.py
+# Load additional models for whichever features you want to use
+python visual_chatgpt_zh_vits.py \
+--load ImageCaptioning_cuda:0,Text2Image_cuda:0 \
+--pretrained_model_dir {your_hf_models_path}
+
+# 8. You can also edit the key directly at line 38 of visual_chatgpt_zh_vits.py; to enable VITS, set line 39 to True
+```
+
+Note from the original author: as the official repo suggests, you can pass a different "--load" value depending on your GPU. If VRAM is tight, trade time for space by loading the less important models on the CPU; inference is slower, but at least it runs (see the note on the "--load" format below):
+```
+# Advice for CPU Users
+python visual_chatgpt.py --load ImageCaptioning_cpu,Text2Image_cpu
+
+# Advice for 1 Tesla T4 15GB (Google Colab)
+python visual_chatgpt.py --load "ImageCaptioning_cuda:0,Text2Image_cuda:0"
+
+# Advice for 4 Tesla V100 32GB
+python visual_chatgpt.py --load "ImageCaptioning_cuda:0,ImageEditing_cuda:0,
+    Text2Image_cuda:1,Image2Canny_cpu,CannyText2Image_cuda:1,
+    Image2Depth_cpu,DepthText2Image_cuda:1,VisualQuestionAnswering_cuda:2,
+    InstructPix2Pix_cuda:2,Image2Scribble_cpu,ScribbleText2Image_cuda:2,
+    Image2Seg_cpu,SegText2Image_cuda:2,Image2Pose_cpu,PoseText2Image_cuda:2,
+    Image2Hed_cpu,HedText2Image_cuda:3,Image2Normal_cpu,
+    NormalText2Image_cuda:3,Image2Line_cpu,LineText2Image_cuda:3"
+```
+
+Tested on Windows with an RTX 3070 (8 GB): if you only need the ImageCaptioning and Text2Image features, the VRAM requirement is very low; in principle any card that can run AI image generation (>4 GB) should work, although slowly.
+
+## Limitations
+
+Images may fail to display in the Gradio UI?
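+
+## Note on "--load"
+
+The value passed to "--load" is a comma-separated list of ToolName_device entries. As a rough, illustrative sketch (the function name here is made up, and the actual parsing inside visual_chatgpt_zh_vits.py may differ), such a string can be split into {tool: device} pairs like this:
+
+```python
+# Illustrative sketch only -- not the project's actual implementation.
+def parse_load_arg(load: str) -> dict:
+    """Split e.g. "ImageCaptioning_cuda:0,Text2Image_cpu" into {"ImageCaptioning": "cuda:0", "Text2Image": "cpu"}."""
+    pairs = {}
+    for item in load.split(","):
+        name, device = item.strip().rsplit("_", 1)  # tool class name, torch device string
+        pairs[name] = device
+    return pairs
+
+print(parse_load_arg("ImageCaptioning_cuda:0,Text2Image_cuda:0"))
+# {'ImageCaptioning': 'cuda:0', 'Text2Image': 'cuda:0'}
+```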
+ +## Acknowledgement + +We appreciate the open source of the following projects: + +- HuggingFace [[Project]](https://github.com/huggingface/transformers) + +- ControlNet [[Paper]](https://arxiv.org/abs/2302.05543) [[Project]](https://github.com/lllyasviel/ControlNet) + +- Stable Diffusion [[Paper]](https://arxiv.org/abs/2112.10752) [[Project]](https://github.com/CompVis/stable-diffusion) diff --git a/assets/demo.gif b/assets/demo.gif new file mode 100644 index 0000000000000000000000000000000000000000..6e97adf4516c6f5b36e98e3f10d7181e68266dd6 --- /dev/null +++ b/assets/demo.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef60fb82c9d46584c298346c9e5579cc51df0b6f347bccdb01f8476af9535e8 +size 2077920 diff --git a/assets/demo_short.gif b/assets/demo_short.gif new file mode 100644 index 0000000000000000000000000000000000000000..38bfdf60369a46dc39c40bc7547ab366fb58ef8d --- /dev/null +++ b/assets/demo_short.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4a388707ffe492d7c884a54e1ae84cd30ccd4e9b97f5319478cbb87dc87de3c +size 1413949 diff --git a/assets/figure.jpg b/assets/figure.jpg new file mode 100644 index 0000000000000000000000000000000000000000..88d5b7a2eda46aa441bc997204c63d83bbc27894 --- /dev/null +++ b/assets/figure.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f369af4e9bfec6d524395650ef4481c9cc13f12f3897fa923f859cf925338c0 +size 3634245 diff --git a/attentions.py b/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..89cb9c07c363c2f59aff4a27306024a033747b7c --- /dev/null +++ b/attentions.py @@ -0,0 +1,300 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +from vits_modules import LayerNorm + + +class Encoder(nn.Module): + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + 
self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, 
t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert t_s == t_t, "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert t_s == t_t, "Local attention is only available for self-attention." + block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) + + # Reshape and slice out the padded elements. 
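+    # Viewing the padded, flattened tensor as [b, h, l+1, 2*l-1] staggers each row by one
+    # position, so the final slice keeps exactly the absolute-position scores of shape [b, h, l, l].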
+ x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) + x_flat = x.view([batch, heads, length**2 + length*(length -1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/commons.py b/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..bda0a67534ac34bd02dc28b845619b2433a40df6 --- /dev/null +++ b/commons.py @@ -0,0 +1,96 @@ +import torch +from torch.nn import functional as F +import torch.jit + + +def script_method(fn, _rcb=None): + return fn + + +def script(obj, optimize=True, _frames_up=0, _rcb=None): + return obj + + +torch.jit.script_method = script_method +torch.jit.script = script + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, 
x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path diff --git a/hf_models/download.md b/hf_models/download.md new file mode 100644 index 0000000000000000000000000000000000000000..ea4a52a0760f7087217e4ab87494da5d458ce558 --- /dev/null +++ b/hf_models/download.md @@ -0,0 +1,33 @@ + git clone https://huggingface.co/Salesforce/blip-image-captioning-base + + git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 + + git clone https://huggingface.co/runwayml/stable-diffusion-inpainting + + git clone https://huggingface.co/CIDAS/clipseg-rd64-refined + + git clone https://huggingface.co/timbrooks/instruct-pix2pix + + git clone https://huggingface.co/Salesforce/blip-vqa-base + + git clone https://huggingface.co/lllyasviel/ControlNet + + git clone https://huggingface.co/lllyasviel/sd-controlnet-canny + + git clone https://huggingface.co/lllyasviel/sd-controlnet-seg + + git clone https://huggingface.co/lllyasviel/sd-controlnet-scribble + + git clone https://huggingface.co/lllyasviel/sd-controlnet-normal + + git clone https://huggingface.co/lllyasviel/sd-controlnet-mlsd + + git clone https://huggingface.co/lllyasviel/sd-controlnet-depth + + git clone https://huggingface.co/lllyasviel/sd-controlnet-hed + + git clone https://huggingface.co/lllyasviel/sd-controlnet-openpose + + git clone https://huggingface.co/openmmlab/upernet-convnext-small + + git clone https://huggingface.co/Intel/dpt-hybrid-midas \ No newline at end of file diff --git a/hubert_model.py b/hubert_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6c7f8716c268d0f371f5a9f7995f59bd4b9082d1 --- /dev/null +++ b/hubert_model.py @@ -0,0 +1,221 @@ +import copy +from typing import Optional, Tuple +import random + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present + +class Hubert(nn.Module): + def __init__(self, num_label_embeddings: int = 100, mask: bool = True): + super().__init__() + self._mask = mask + self.feature_extractor = 
FeatureExtractor() + self.feature_projection = FeatureProjection() + self.positional_embedding = PositionalConvEmbedding() + self.norm = nn.LayerNorm(768) + self.dropout = nn.Dropout(0.1) + self.encoder = TransformerEncoder( + nn.TransformerEncoderLayer( + 768, 12, 3072, activation="gelu", batch_first=True + ), + 12, + ) + self.proj = nn.Linear(768, 256) + + self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) + self.label_embedding = nn.Embedding(num_label_embeddings, 256) + + def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + mask = None + if self.training and self._mask: + mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) + x[mask] = self.masked_spec_embed.to(x.dtype) + return x, mask + + def encode( + self, x: torch.Tensor, layer: Optional[int] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.feature_extractor(x) + x = self.feature_projection(x.transpose(1, 2)) + x, mask = self.mask(x) + x = x + self.positional_embedding(x) + x = self.dropout(self.norm(x)) + x = self.encoder(x, output_layer=layer) + return x, mask + + def logits(self, x: torch.Tensor) -> torch.Tensor: + logits = torch.cosine_similarity( + x.unsqueeze(2), + self.label_embedding.weight.unsqueeze(0).unsqueeze(0), + dim=-1, + ) + return logits / 0.1 + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + x, mask = self.encode(x) + x = self.proj(x) + logits = self.logits(x) + return logits, mask + + +class HubertSoft(Hubert): + def __init__(self): + super().__init__() + + @torch.inference_mode() + def units(self, wav: torch.Tensor) -> torch.Tensor: + wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) + x, _ = self.encode(wav) + return self.proj(x) + + +class FeatureExtractor(nn.Module): + def __init__(self): + super().__init__() + self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) + self.norm0 = nn.GroupNorm(512, 512) + self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) + self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) + self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.norm0(self.conv0(x))) + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = F.gelu(self.conv3(x)) + x = F.gelu(self.conv4(x)) + x = F.gelu(self.conv5(x)) + x = F.gelu(self.conv6(x)) + return x + + +class FeatureProjection(nn.Module): + def __init__(self): + super().__init__() + self.norm = nn.LayerNorm(512) + self.projection = nn.Linear(512, 768) + self.dropout = nn.Dropout(0.1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.norm(x) + x = self.projection(x) + x = self.dropout(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + 768, + 768, + kernel_size=128, + padding=128 // 2, + groups=16, + ) + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x.transpose(1, 2)) + x = F.gelu(x[:, :, :-1]) + return x.transpose(1, 2) + + +class TransformerEncoder(nn.Module): + def __init__( + self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int + ) -> None: + super(TransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for _ in range(num_layers)] + ) + self.num_layers = num_layers + + def forward( + 
self, + src: torch.Tensor, + mask: torch.Tensor = None, + src_key_padding_mask: torch.Tensor = None, + output_layer: Optional[int] = None, + ) -> torch.Tensor: + output = src + for layer in self.layers[:output_layer]: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask + ) + return output + + +def _compute_mask( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + device: torch.device, + min_masks: int = 0, +) -> torch.Tensor: + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = torch.ones( + (batch_size, sequence_length - (mask_length - 1)), device=device + ) + + # get random indices to mask + mask_indices = torch.multinomial(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + mask_indices = ( + mask_indices.unsqueeze(dim=-1) + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + offsets = ( + torch.arange(mask_length, device=device)[None, None, :] + .expand((batch_size, num_masked_spans, mask_length)) + .reshape(batch_size, num_masked_spans * mask_length) + ) + mask_idxs = mask_indices + offsets + + # scatter indices to mask + mask = mask.scatter(1, mask_idxs, True) + + return mask + + +def hubert_soft( + path: str +) -> HubertSoft: + r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 
+ Args: + path (str): path of a pretrained model + """ + hubert = HubertSoft() + checkpoint = torch.load(path) + consume_prefix_in_state_dict_if_present(checkpoint, "module.") + hubert.load_state_dict(checkpoint) + hubert.eval() + return hubert diff --git a/image/01cc9083.png b/image/01cc9083.png new file mode 100644 index 0000000000000000000000000000000000000000..71197f5f0f26475b0ba6acbbfe024b2fd7b959db Binary files /dev/null and b/image/01cc9083.png differ diff --git a/image/1d988a81.png b/image/1d988a81.png new file mode 100644 index 0000000000000000000000000000000000000000..cce27ff04300ed74c1c72c3c29bb00fe77307fb3 Binary files /dev/null and b/image/1d988a81.png differ diff --git a/image/307ade76.png b/image/307ade76.png new file mode 100644 index 0000000000000000000000000000000000000000..7f9def50df67a02751f96520c163dc528b5910bb Binary files /dev/null and b/image/307ade76.png differ diff --git a/image/5ebacb6a.png b/image/5ebacb6a.png new file mode 100644 index 0000000000000000000000000000000000000000..ae26aab078d7022e5da976152cfa7a097b85a7f1 Binary files /dev/null and b/image/5ebacb6a.png differ diff --git a/image/cdb4d5e5.png b/image/cdb4d5e5.png new file mode 100644 index 0000000000000000000000000000000000000000..f922f42364d5e05651389a90f7add71663be6c42 Binary files /dev/null and b/image/cdb4d5e5.png differ diff --git a/mel_processing.py b/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..3e252e76320522a8a4195a60665168f22769aec2 --- /dev/null +++ b/mel_processing.py @@ -0,0 +1,101 @@ +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + output = dynamic_range_compression_torch(magnitudes) + return output + + +def spectral_de_normalize_torch(magnitudes): + output = dynamic_range_decompression_torch(magnitudes) + return output + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + '_' + str(spec.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[fmax_dtype_device] = 
torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + return spec + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + '_' + str(y.device) + fmax_dtype_device = str(fmax) + '_' + dtype_device + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = spectral_normalize_torch(spec) + + return spec diff --git a/models.py b/models.py new file mode 100644 index 0000000000000000000000000000000000000000..ecec07bed88394d5b6c1fc1f61348ecf601b1b1a --- /dev/null +++ b/models.py @@ -0,0 +1,404 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +import commons +import vits_modules as modules +import attentions + +from torch.nn import Conv1d, ConvTranspose1d +from torch.nn.utils import weight_norm +from commons import init_weights + + +class StochasticDurationPredictor(nn.Module): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0): + super().__init__() + filter_channels = in_channels # it needs to be removed from future version. 
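+    # The override above is carried over verbatim from the upstream VITS implementation,
+    # which forces filter_channels to equal in_channels in this module.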
+ self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.log_flow = modules.Log() + self.flows = nn.ModuleList() + self.flows.append(modules.ElementwiseAffine(2)) + for i in range(n_flows): + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.flows.append(modules.Flip()) + + self.post_pre = nn.Conv1d(1, filter_channels, 1) + self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.post_flows = nn.ModuleList() + self.post_flows.append(modules.ElementwiseAffine(2)) + for i in range(4): + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.post_flows.append(modules.Flip()) + + self.pre = nn.Conv1d(in_channels, filter_channels, 1) + self.proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, filter_channels, 1) + + def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0): + x = torch.detach(x) + x = self.pre(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.convs(x, x_mask) + x = self.proj(x) * x_mask + + if not reverse: + flows = self.flows + assert w is not None + + logdet_tot_q = 0 + h_w = self.post_pre(w) + h_w = self.post_convs(h_w, x_mask) + h_w = self.post_proj(h_w) * x_mask + e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask + z_q = e_q + for flow in self.post_flows: + z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w)) + logdet_tot_q += logdet_q + z_u, z1 = torch.split(z_q, [1, 1], 1) + u = torch.sigmoid(z_u) * x_mask + z0 = (w - u) * x_mask + logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2]) + logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q + + logdet_tot = 0 + z0, logdet = self.log_flow(z0, x_mask) + logdet_tot += logdet + z = torch.cat([z0, z1], 1) + for flow in flows: + z, logdet = flow(z, x_mask, g=x, reverse=reverse) + logdet_tot = logdet_tot + logdet + nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot + return nll + logq # [b] + else: + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale + for flow in flows: + z = flow(z, x_mask, g=x, reverse=reverse) + z0, z1 = torch.split(z, [1, 1], 1) + logw = z0 + return logw + + +class DurationPredictor(nn.Module): + def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) + self.norm_2 = modules.LayerNorm(filter_channels) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, 
in_channels, 1) + + def forward(self, x, x_mask, g=None): + x = torch.detach(x) + if g is not None: + g = torch.detach(g) + x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.drop(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + x = self.drop(x) + x = self.proj(x * x_mask) + return x * x_mask + + +class TextEncoder(nn.Module): + def __init__(self, + n_vocab, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + emotion_embedding): + super().__init__() + self.n_vocab = n_vocab + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emotion_embedding = emotion_embedding + + if self.n_vocab!=0: + self.emb = nn.Embedding(n_vocab, hidden_channels) + if emotion_embedding: + self.emo_proj = nn.Linear(1024, hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) + + self.encoder = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout) + self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, emotion_embedding=None): + if self.n_vocab!=0: + x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] + if emotion_embedding is not None: + x = x + self.emo_proj(emotion_embedding.unsqueeze(1)) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return x, m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class PosteriorEncoder(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) 
* x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + +class Generator(torch.nn.Module): + def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append(weight_norm( + ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), + k, u, padding=(k-u)//2))) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel//(2**(i+1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i*self.num_kernels+j](x) + else: + xs += self.resblocks[i*self.num_kernels+j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + +class SynthesizerTrn(nn.Module): + """ + Synthesizer for Training + """ + + def __init__(self, + n_vocab, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + n_speakers=0, + gin_channels=0, + use_sdp=True, + emotion_embedding=False, + **kwargs): + + super().__init__() + self.n_vocab = n_vocab + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.n_speakers = n_speakers + self.gin_channels = gin_channels + + self.use_sdp = use_sdp + + self.enc_p = TextEncoder(n_vocab, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + emotion_embedding) + self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) + self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, 
gin_channels=gin_channels) + self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels) + + if use_sdp: + self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels) + else: + self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels) + + if n_speakers > 1: + self.emb_g = nn.Embedding(n_speakers, gin_channels) + + def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None): + x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding) + if self.n_speakers > 0: + g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + else: + g = None + + if self.use_sdp: + logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) + else: + logw = self.dp(x, x_mask, g=g) + w = torch.exp(logw) * x_mask * length_scale + w_ceil = torch.ceil(w) + y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) + attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) + attn = commons.generate_path(w_ceil, attn_mask) + + m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] + logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] + + z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale + z = self.flow(z_p, y_mask, g=g, reverse=True) + o = self.dec((z * y_mask)[:,:,:max_len], g=g) + return o, attn, y_mask, (z, z_p, m_p, logs_p) + + def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): + assert self.n_speakers > 0, "n_speakers have to be larger than 0." + g_src = self.emb_g(sid_src).unsqueeze(-1) + g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) + z_p = self.flow(z, y_mask, g=g_src) + z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) + o_hat = self.dec(z_hat * y_mask, g=g_tgt) + return o_hat, y_mask, (z, z_p, z_hat) + diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/modules/controlnet_canny.py b/modules/controlnet_canny.py new file mode 100644 index 0000000000000000000000000000000000000000..65c8e499f6e52080d86942f2aa16952cdd5da3a6 --- /dev/null +++ b/modules/controlnet_canny.py @@ -0,0 +1,60 @@ +from modules.utils import * + +class Image2Canny: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Canny") + self.low_threshold = 100 + self.high_threshold = 200 + + @prompts(name="Edge Detection On Image", + description="useful when you want to detect the edge of the image. " + "like: detect the edges of this image, or canny detection on image, " + "or perform edge detection on this image, or detect the canny image of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + image = np.array(image) + canny = cv2.Canny(image, self.low_threshold, self.high_threshold) + canny = canny[:, :, None] + canny = np.concatenate([canny, canny, canny], axis=2) + canny = Image.fromarray(canny) + updated_image_path = get_new_image_name(inputs, func_name="edge") + canny.save(updated_image_path) + print(f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}") + return updated_image_path + +class CannyText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing CannyText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained(f"{pretrained_model_dir}/sd-controlnet-canny", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Canny Image", + description="useful when you want to generate a new real image from both the user desciption and a canny image." + " like: generate a real image of a object or something from this canny image," + " or generate a new real image of a object or something from this edge image. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description. ") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="canny2image") + image.save(updated_image_path) + print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, " + f"Output Text: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_depth.py b/modules/controlnet_depth.py new file mode 100644 index 0000000000000000000000000000000000000000..18e7b40879baa008d7b57babc5e821dbdf4d0b3e --- /dev/null +++ b/modules/controlnet_depth.py @@ -0,0 +1,59 @@ +from modules.utils import * + +class Image2Depth: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Depth") + self.depth_estimator = pipeline('depth-estimation') + + @prompts(name="Predict Depth On Image", + description="useful when you want to detect depth of the image. like: generate the depth from this image, " + "or detect the depth map on this image, or predict the depth for this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + depth = self.depth_estimator(image)['depth'] + depth = np.array(depth) + depth = depth[:, :, None] + depth = np.concatenate([depth, depth, depth], axis=2) + depth = Image.fromarray(depth) + updated_image_path = get_new_image_name(inputs, func_name="depth") + depth.save(updated_image_path) + print(f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}") + return updated_image_path + + +class DepthText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing DepthText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + f"{pretrained_model_dir}/sd-controlnet-depth", torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Depth", + description="useful when you want to generate a new real image from both the user desciption and depth image. " + "like: generate a real image of a object or something from this depth image, " + "or generate a new real image of a object or something from the depth map. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="depth2image") + image.save(updated_image_path) + print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_hed.py b/modules/controlnet_hed.py new file mode 100644 index 0000000000000000000000000000000000000000..dd56532ba4059b8698a4164aea1ff8a4501a4981 --- /dev/null +++ b/modules/controlnet_hed.py @@ -0,0 +1,58 @@ +from modules.utils import * + +class Image2Hed: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Hed") + self.detector = HEDdetector.from_pretrained(f'{pretrained_model_dir}/ControlNet') + + @prompts(name="Hed Detection On Image", + description="useful when you want to detect the soft hed boundary of the image. " + "like: detect the soft hed boundary of this image, or hed boundary detection on image, " + "or peform hed boundary detection on this image, or detect soft hed boundary image of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + hed = self.detector(image) + updated_image_path = get_new_image_name(inputs, func_name="hed-boundary") + hed.save(updated_image_path) + print(f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}") + return updated_image_path + + +class HedText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing HedText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained(f"{pretrained_model_dir}/sd-controlnet-hed", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Soft Hed Boundary Image", + description="useful when you want to generate a new real image from both the user desciption " + "and a soft hed boundary image. " + "like: generate a real image of a object or something from this soft hed boundary image, " + "or generate a new real image of a object or something from this hed boundary. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="hed2image") + image.save(updated_image_path) + print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_line.py b/modules/controlnet_line.py new file mode 100644 index 0000000000000000000000000000000000000000..6a45a9f92789c5bb552425091ea62dd00376712a --- /dev/null +++ b/modules/controlnet_line.py @@ -0,0 +1,58 @@ +from modules.utils import * + +class Image2Line: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Line") + self.detector = MLSDdetector.from_pretrained(f'{pretrained_model_dir}/ControlNet') + + @prompts(name="Line Detection On Image", + description="useful when you want to detect the straight line of the image. " + "like: detect the straight lines of this image, or straight line detection on image, " + "or peform straight line detection on this image, or detect the straight line image of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + mlsd = self.detector(image) + updated_image_path = get_new_image_name(inputs, func_name="line-of") + mlsd.save(updated_image_path) + print(f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}") + return updated_image_path + + +class LineText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing LineText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained(f"{pretrained_model_dir}/sd-controlnet-mlsd", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Line Image", + description="useful when you want to generate a new real image from both the user desciption " + "and a straight line image. " + "like: generate a real image of a object or something from this straight line image, " + "or generate a new real image of a object or something from this straight lines. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description. ") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="line2image") + image.save(updated_image_path) + print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, " + f"Output Text: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_normal.py b/modules/controlnet_normal.py new file mode 100644 index 0000000000000000000000000000000000000000..be0749e2121ccdbe16dc5424179d15a9f149d5d2 --- /dev/null +++ b/modules/controlnet_normal.py @@ -0,0 +1,71 @@ +from modules.utils import * + +class Image2Normal: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Normal") + self.depth_estimator = pipeline("depth-estimation", model=f"{pretrained_model_dir}/dpt-hybrid-midas") + self.bg_threhold = 0.4 + + @prompts(name="Predict Normal Map On Image", + description="useful when you want to detect norm map of the image. " + "like: generate normal map from this image, or predict normal map of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + original_size = image.size + image = self.depth_estimator(image)['predicted_depth'][0] + image = image.numpy() + image_depth = image.copy() + image_depth -= np.min(image_depth) + image_depth /= np.max(image_depth) + x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3) + x[image_depth < self.bg_threhold] = 0 + y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3) + y[image_depth < self.bg_threhold] = 0 + z = np.ones_like(x) * np.pi * 2.0 + image = np.stack([x, y, z], axis=2) + image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5 + image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8) + image = Image.fromarray(image) + image = image.resize(original_size) + updated_image_path = get_new_image_name(inputs, func_name="normal-map") + image.save(updated_image_path) + print(f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}") + return updated_image_path + + +class NormalText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing NormalText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + f"{pretrained_model_dir}/sd-controlnet-normal", torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Normal Map", + description="useful when you want to generate a new real image from both the user desciption and normal map. " + "like: generate a real image of a object or something from this normal map, " + "or generate a new real image of a object or something from the normal map. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="normal2image") + image.save(updated_image_path) + print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_pose.py b/modules/controlnet_pose.py new file mode 100644 index 0000000000000000000000000000000000000000..d3e85eb506ffba16b7d146b886a9134c00e2f02e --- /dev/null +++ b/modules/controlnet_pose.py @@ -0,0 +1,58 @@ +from modules.utils import * + +class Image2Pose: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Pose") + self.detector = OpenposeDetector.from_pretrained(f'{pretrained_model_dir}/ControlNet') + + @prompts(name="Pose Detection On Image", + description="useful when you want to detect the human pose of the image. " + "like: generate human poses of this image, or generate a pose image from this image. " + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + pose = self.detector(image) + updated_image_path = get_new_image_name(inputs, func_name="human-pose") + pose.save(updated_image_path) + print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}") + return updated_image_path + + +class PoseText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing PoseText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained(f"{pretrained_model_dir}/sd-controlnet-openpose", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.num_inference_steps = 20 + self.seed = -1 + self.unconditional_guidance_scale = 9.0 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Pose Image", + description="useful when you want to generate a new real image from both the user desciption " + "and a human pose image. " + "like: generate a real image of a human from this human pose image, " + "or generate a new real image of a human from this pose. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="pose2image") + image.save(updated_image_path) + print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_scibble.py b/modules/controlnet_scibble.py new file mode 100644 index 0000000000000000000000000000000000000000..ba6e1adcbfb99945837fc8ff90856c1817a7ce10 --- /dev/null +++ b/modules/controlnet_scibble.py @@ -0,0 +1,56 @@ +from modules.utils import * + +class Image2Scribble: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Scribble") + self.detector = HEDdetector.from_pretrained(f'{pretrained_model_dir}/ControlNet') + + @prompts(name="Sketch Detection On Image", + description="useful when you want to generate a scribble of the image. " + "like: generate a scribble of this image, or generate a sketch from this image, " + "detect the sketch from this image. " + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + scribble = self.detector(image, scribble=True) + updated_image_path = get_new_image_name(inputs, func_name="scribble") + scribble.save(updated_image_path) + print(f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}") + return updated_image_path + + +class ScribbleText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing ScribbleText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained(f"{pretrained_model_dir}/sd-controlnet-scribble", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Sketch Image", + description="useful when you want to generate a new real image from both the user desciption and " + "a scribble image or a sketch image. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="scribble2image") + image.save(updated_image_path) + print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/controlnet_seg.py b/modules/controlnet_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..fe8ff23d43eb747f767144a534171579d742bba8 --- /dev/null +++ b/modules/controlnet_seg.py @@ -0,0 +1,104 @@ +from modules.utils import * + +class Image2Seg: + def __init__(self, device, pretrained_model_dir): + print("Initializing Image2Seg") + self.image_processor = AutoImageProcessor.from_pretrained(f"{pretrained_model_dir}/upernet-convnext-small") + self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained(f"{pretrained_model_dir}/upernet-convnext-small") + self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], 
+ [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + + @prompts(name="Segmentation On Image", + description="useful when you want to detect segmentations of the image. " + "like: segment this image, or generate segmentations on this image, " + "or peform segmentation on this image. " + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + pixel_values = self.image_processor(image, return_tensors="pt").pixel_values + with torch.no_grad(): + outputs = self.image_segmentor(pixel_values) + seg = self.image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0] + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3 + palette = np.array(self.ade_palette) + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + color_seg = color_seg.astype(np.uint8) + segmentation = Image.fromarray(color_seg) + updated_image_path = get_new_image_name(inputs, func_name="segmentation") + segmentation.save(updated_image_path) + print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}") + return updated_image_path + + +class SegText2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing SegText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained(f"{pretrained_model_dir}/sd-controlnet-seg", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Segmentations", + description="useful when you want to generate a new real image from both the user desciption and segmentations. " + "like: generate a real image of a object or something from this segmentation image, " + "or generate a new real image of a object or something from these segmentations. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="segment2image") + image.save(updated_image_path) + print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/image_captioning.py b/modules/image_captioning.py new file mode 100644 index 0000000000000000000000000000000000000000..b5f6ae07a67db18cdce084d2ef65ed989f2a9b25 --- /dev/null +++ b/modules/image_captioning.py @@ -0,0 +1,21 @@ +import torch +from modules.utils import * + +class ImageCaptioning: + def __init__(self, device, pretrained_model_dir): + print("Initializing ImageCaptioning to %s" % device) + self.device = device + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.processor = BlipProcessor.from_pretrained(f"{pretrained_model_dir}/blip-image-captioning-base") + self.model = BlipForConditionalGeneration.from_pretrained( + f"{pretrained_model_dir}/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device) + + @prompts(name="Get Photo Description", + description="useful when you want to know what is inside the photo. receives image_path as input. " + "The input to this tool should be a string, representing the image_path. ") + def inference(self, image_path): + inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype) + out = self.model.generate(**inputs) + captions = self.processor.decode(out[0], skip_special_tokens=True) + print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}") + return captions \ No newline at end of file diff --git a/modules/image_editing.py b/modules/image_editing.py new file mode 100644 index 0000000000000000000000000000000000000000..1d858d6c51979e8a0c63b8ca1b7a070a3c6ef816 --- /dev/null +++ b/modules/image_editing.py @@ -0,0 +1,40 @@ +from modules.utils import * + +class ImageEditing: + def __init__(self, device, pretrained_model_dir): + print("Initializing ImageEditing to %s" % device) + self.device = device + self.mask_former = MaskFormer(device=self.device, pretrained_model_dir=pretrained_model_dir) + self.revision = 'fp16' if 'cuda' in device else None + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.inpaint = StableDiffusionInpaintPipeline.from_pretrained( + f"{pretrained_model_dir}/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device) + + @prompts(name="Remove Something From The Photo", + description="useful when you want to remove and object or something from the photo " + "from its description or location. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the object need to be removed. 
") + def inference_remove(self, inputs): + image_path, to_be_removed_txt = inputs.split(",") + return self.inference_replace(f"{image_path},{to_be_removed_txt},background") + + @prompts(name="Replace Something From The Photo", + description="useful when you want to replace an object from the object description or " + "location with another object from its description. " + "The input to this tool should be a comma seperated string of three, " + "representing the image_path, the object to be replaced, the object to be replaced with ") + def inference_replace(self, inputs): + image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",") + original_image = Image.open(image_path) + original_size = original_image.size + mask_image = self.mask_former.inference(image_path, to_be_replaced_txt) + updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)), + mask_image=mask_image.resize((512, 512))).images[0] + updated_image_path = get_new_image_name(image_path, func_name="replace-something") + updated_image = updated_image.resize(original_size) + updated_image.save(updated_image_path) + print( + f"\nProcessed ImageEditing, Input Image: {image_path}, Replace {to_be_replaced_txt} to {replace_with_txt}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/instruct_px2pix.py b/modules/instruct_px2pix.py new file mode 100644 index 0000000000000000000000000000000000000000..a30f90a4e1eefa0fd022b304411385f8c284866d --- /dev/null +++ b/modules/instruct_px2pix.py @@ -0,0 +1,28 @@ +from modules.utils import * + +class InstructPix2Pix: + def __init__(self, device, pretrained_model_dir): + print("Initializing InstructPix2Pix to %s" % device) + self.device = device + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(f"{pretrained_model_dir}/instruct-pix2pix", + safety_checker=None, + torch_dtype=self.torch_dtype).to(device) + self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config) + + @prompts(name="Instruct Image Using Text", + description="useful when you want to the style of the image to be like the text. " + "like: make it look like a painting. or make it like a robot. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the text. 
") + def inference(self, inputs): + """Change style of image.""" + print("===>Starting InstructPix2Pix Inference") + image_path, text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + original_image = Image.open(image_path) + image = self.pipe(text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2).images[0] + updated_image_path = get_new_image_name(image_path, func_name="pix2pix") + image.save(updated_image_path) + print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, " + f"Output Image: {updated_image_path}") + return updated_image_path \ No newline at end of file diff --git a/modules/mask_former.py b/modules/mask_former.py new file mode 100644 index 0000000000000000000000000000000000000000..3cd0381d69036b8ee97170c65ebc5f3a0195cab3 --- /dev/null +++ b/modules/mask_former.py @@ -0,0 +1,30 @@ +from modules.utils import * + +class MaskFormer: + def __init__(self, device, pretrained_model_dir): + print("Initializing MaskFormer to %s" % device) + self.device = device + self.processor = CLIPSegProcessor.from_pretrained(f"{pretrained_model_dir}/clipseg-rd64-refined") + self.model = CLIPSegForImageSegmentation.from_pretrained(f"{pretrained_model_dir}/clipseg-rd64-refined").to(device) + + def inference(self, image_path, text): + threshold = 0.5 + min_area = 0.02 + padding = 20 + original_image = Image.open(image_path) + image = original_image.resize((512, 512)) + inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt").to(self.device) + with torch.no_grad(): + outputs = self.model(**inputs) + mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold + area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1]) + if area_ratio < min_area: + return None + true_indices = np.argwhere(mask) + mask_array = np.zeros_like(mask, dtype=bool) + for idx in true_indices: + padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx) + mask_array[padded_slice] = True + visual_mask = (mask_array * 255).astype(np.uint8) + image_mask = Image.fromarray(visual_mask) + return image_mask.resize(original_image.size) \ No newline at end of file diff --git a/modules/text2img.py b/modules/text2img.py new file mode 100644 index 0000000000000000000000000000000000000000..77178bdb5b0a02f8f3d681cf85206e43332678b4 --- /dev/null +++ b/modules/text2img.py @@ -0,0 +1,26 @@ +from modules.utils import * + +class Text2Image: + def __init__(self, device, pretrained_model_dir): + print("Initializing Text2Image to %s" % device) + self.device = device + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.pipe = StableDiffusionPipeline.from_pretrained(f"{pretrained_model_dir}/stable-diffusion-v1-5", + torch_dtype=self.torch_dtype) + self.pipe.to(device) + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image From User Input Text", + description="useful when you want to generate an image from a user input text and save it to a file. " + "like: generate an image of an object or something, or generate an image that includes some objects. " + "The input to this tool should be a string, representing the text used to generate image. 
") + def inference(self, text): + image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") + prompt = text + ', ' + self.a_prompt + image = self.pipe(prompt, negative_prompt=self.n_prompt).images[0] + image.save(image_filename) + print( + f"\nProcessed Text2Image, Input Text: {text}, Output Image: {image_filename}") + return image_filename \ No newline at end of file diff --git a/modules/utils.py b/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..de632db16e805030998da87bf58331c9f7459215 --- /dev/null +++ b/modules/utils.py @@ -0,0 +1,75 @@ + +import os +import gradio as gr +import random +import torch +import cv2 +import re +import uuid +from PIL import Image +import numpy as np +import argparse + +from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation +from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering +from transformers import AutoImageProcessor, UperNetForSemanticSegmentation + +from diffusers import StableDiffusionPipeline, StableDiffusionInpaintPipeline, StableDiffusionInstructPix2PixPipeline +from diffusers import EulerAncestralDiscreteScheduler +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler +from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector + +from langchain.agents.initialize import initialize_agent +from langchain.agents.tools import Tool +from langchain.chains.conversation.memory import ConversationBufferMemory +from langchain.llms.openai import OpenAI + +# 装饰器 +def prompts(name, description): + def decorator(func): + func.name = name + func.description = description + return func + + return decorator + +# 设置种子 +def seed_everything(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + return seed + +# 对话历史截断 +def cut_dialogue_history(history_memory, keep_last_n_words=500): + tokens = history_memory.split() + n_tokens = len(tokens) + print(f"hitory_memory:{history_memory}, n_tokens: {n_tokens}") + if n_tokens < keep_last_n_words: + return history_memory + else: + paragraphs = history_memory.split('\n') + last_n_tokens = n_tokens + while last_n_tokens >= keep_last_n_words: + last_n_tokens = last_n_tokens - len(paragraphs[0].split(' ')) + paragraphs = paragraphs[1:] + return '\n' + '\n'.join(paragraphs) + +# 获取新图片 +def get_new_image_name(org_img_name, func_name="update"): + head_tail = os.path.split(org_img_name) + head = head_tail[0] + tail = head_tail[1] + name_split = tail.split('.')[0].split('_') + this_new_uuid = str(uuid.uuid4())[0:4] + if len(name_split) == 1: + most_org_file_name = name_split[0] + recent_prev_file_name = name_split[0] + new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) + else: + assert len(name_split) == 4 + most_org_file_name = name_split[3] + recent_prev_file_name = name_split[0] + new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) + return os.path.join(head, new_file_name) \ No newline at end of file diff --git a/modules/visual_question_answering.py b/modules/visual_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..db5b6f0cf80e45c1c9e000ddc5567d8660596b79 --- /dev/null +++ b/modules/visual_question_answering.py @@ -0,0 +1,24 @@ +from modules.utils import * + +class VisualQuestionAnswering: + def __init__(self, 
device, pretrained_model_dir): + print("Initializing VisualQuestionAnswering to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.device = device + self.processor = BlipProcessor.from_pretrained(f"{pretrained_model_dir}/blip-vqa-base") + self.model = BlipForQuestionAnswering.from_pretrained( + f"{pretrained_model_dir}/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device) + + @prompts(name="Answer Question About The Image", + description="useful when you need an answer for a question based on an image. " + "like: what is the background color of the last image, how many cats in this figure, what is in this figure. " + "The input to this tool should be a comma seperated string of two, representing the image_path and the question") + def inference(self, inputs): + image_path, question = inputs.split(",") + raw_image = Image.open(image_path).convert('RGB') + inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype) + out = self.model.generate(**inputs) + answer = self.processor.decode(out[0], skip_special_tokens=True) + print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, " + f"Output Answer: {answer}") + return answer \ No newline at end of file diff --git a/requirement.txt b/requirement.txt new file mode 100644 index 0000000000000000000000000000000000000000..76865bbe155e6a2d0096eaaee27e8c2551f7caed --- /dev/null +++ b/requirement.txt @@ -0,0 +1,48 @@ +langchain==0.0.101 +torch==1.12.1 +torchvision==0.13.1 +gradio==3.20.1 +accelerate +addict +albumentations +basicsr +controlnet-aux +diffusers +einops +imageio +imageio-ffmpeg +invisible-watermark +kornia +numpy +omegaconf +open_clip_torch +openai +opencv-python +prettytable +safetensors +streamlit +test-tube +timm +torchmetrics +transformers +webdataset +yapf +numba +librosa +scipy +unidecode +openjtalk>=0.3.0.dev2 +jamo +pypinyin +jieba +protobuf +pygtrans +cn2an +inflect +eng_to_ipa +ko_pron +indic_transliteration +num_thai +opencc +vosk +sounddevice diff --git a/text/__init__.py b/text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e69c354dd24e3243980236eca962cd5945a92fc --- /dev/null +++ b/text/__init__.py @@ -0,0 +1,32 @@ +""" from https://github.com/keithito/tacotron """ +from text import cleaners + + +def text_to_sequence(text, symbols, cleaner_names): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
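+      Symbols that are not present in the given symbols list are silently skipped.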
+ Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through + Returns: + List of integers corresponding to the symbols in the text + ''' + _symbol_to_id = {s: i for i, s in enumerate(symbols)} + + sequence = [] + + clean_text = _clean_text(text, cleaner_names) + for symbol in clean_text: + if symbol not in _symbol_to_id.keys(): + continue + symbol_id = _symbol_to_id[symbol] + sequence += [symbol_id] + return sequence + + +def _clean_text(text, cleaner_names): + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception('Unknown cleaner: %s' % name) + text = cleaner(text) + return text diff --git a/text/cantonese.py b/text/cantonese.py new file mode 100644 index 0000000000000000000000000000000000000000..b66d12138b81b70b86f18217d24a08fce76305c0 --- /dev/null +++ b/text/cantonese.py @@ -0,0 +1,59 @@ +import re +import cn2an +import opencc + + +converter = opencc.OpenCC('jyutjyu') + +# List of (Latin alphabet, ipa) pairs: +_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('A', 'ei˥'), + ('B', 'biː˥'), + ('C', 'siː˥'), + ('D', 'tiː˥'), + ('E', 'iː˥'), + ('F', 'e˥fuː˨˩'), + ('G', 'tsiː˥'), + ('H', 'ɪk̚˥tsʰyː˨˩'), + ('I', 'ɐi˥'), + ('J', 'tsei˥'), + ('K', 'kʰei˥'), + ('L', 'e˥llou˨˩'), + ('M', 'ɛːm˥'), + ('N', 'ɛːn˥'), + ('O', 'ou˥'), + ('P', 'pʰiː˥'), + ('Q', 'kʰiːu˥'), + ('R', 'aː˥lou˨˩'), + ('S', 'ɛː˥siː˨˩'), + ('T', 'tʰiː˥'), + ('U', 'juː˥'), + ('V', 'wiː˥'), + ('W', 'tʊk̚˥piː˥juː˥'), + ('X', 'ɪk̚˥siː˨˩'), + ('Y', 'waːi˥'), + ('Z', 'iː˨sɛːt̚˥') +]] + + +def number_to_cantonese(text): + return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text) + + +def latin_to_ipa(text): + for regex, replacement in _latin_to_ipa: + text = re.sub(regex, replacement, text) + return text + + +def cantonese_to_ipa(text): + text = number_to_cantonese(text.upper()) + text = converter.convert(text).replace('-','').replace('$',' ') + text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) + text = re.sub(r'[、;:]', ',', text) + text = re.sub(r'\s*,\s*', ', ', text) + text = re.sub(r'\s*。\s*', '. ', text) + text = re.sub(r'\s*?\s*', '? ', text) + text = re.sub(r'\s*!\s*', '! 
', text) + text = re.sub(r'\s*$', '', text) + return text diff --git a/text/cleaners.py b/text/cleaners.py new file mode 100644 index 0000000000000000000000000000000000000000..14874a1db0e4813d300d9e00131954f1bb8625bc --- /dev/null +++ b/text/cleaners.py @@ -0,0 +1,145 @@ +import re + + +def japanese_cleaners(text): + from text.japanese import japanese_to_romaji_with_accent + text = japanese_to_romaji_with_accent(text) + text = re.sub(r'([A-Za-z])$', r'\1.', text) + return text + + +def japanese_cleaners2(text): + return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') + + +def korean_cleaners(text): + '''Pipeline for Korean text''' + from text.korean import latin_to_hangul, number_to_hangul, divide_hangul + text = latin_to_hangul(text) + text = number_to_hangul(text) + text = divide_hangul(text) + text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) + return text + + +def chinese_cleaners(text): + '''Pipeline for Chinese text''' + from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo + text = number_to_chinese(text) + text = chinese_to_bopomofo(text) + text = latin_to_bopomofo(text) + text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text) + return text + + +def zh_ja_mixture_cleaners(text): + from text.mandarin import chinese_to_romaji + from text.japanese import japanese_to_romaji_with_accent + text = re.sub(r'\[ZH\](.*?)\[ZH\]', + lambda x: chinese_to_romaji(x.group(1))+' ', text) + text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent( + x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text) + text = re.sub(r'\s+$', '', text) + text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) + return text + + +def sanskrit_cleaners(text): + text = text.replace('॥', '।').replace('ॐ', 'ओम्') + text = re.sub(r'([^।])$', r'\1।', text) + return text + + +def cjks_cleaners(text): + from text.mandarin import chinese_to_lazy_ipa + from text.japanese import japanese_to_ipa + from text.korean import korean_to_lazy_ipa + from text.sanskrit import devanagari_to_ipa + from text.english import english_to_lazy_ipa + text = re.sub(r'\[ZH\](.*?)\[ZH\]', + lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text) + text = re.sub(r'\[JA\](.*?)\[JA\]', + lambda x: japanese_to_ipa(x.group(1))+' ', text) + text = re.sub(r'\[KO\](.*?)\[KO\]', + lambda x: korean_to_lazy_ipa(x.group(1))+' ', text) + text = re.sub(r'\[SA\](.*?)\[SA\]', + lambda x: devanagari_to_ipa(x.group(1))+' ', text) + text = re.sub(r'\[EN\](.*?)\[EN\]', + lambda x: english_to_lazy_ipa(x.group(1))+' ', text) + text = re.sub(r'\s+$', '', text) + text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) + return text + + +def cjke_cleaners(text): + from text.mandarin import chinese_to_lazy_ipa + from text.japanese import japanese_to_ipa + from text.korean import korean_to_ipa + from text.english import english_to_ipa2 + text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace( + 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text) + text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace( + 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text) + text = re.sub(r'\[KO\](.*?)\[KO\]', + lambda x: korean_to_ipa(x.group(1))+' ', text) + text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace( + 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text) + text = re.sub(r'\s+$', '', text) + text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) + return text + 
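+# Added note (not in the original file): cleaner functions such as cjke_cleaners
+# are selected by name at runtime -- text/__init__.py resolves each entry of
+# cleaner_names with getattr(cleaners, name) -- so a config string "cjke_cleaners"
+# maps directly to the function above. A minimal sketch, assuming a `symbols`
+# list for the target model is available:
+#   from text import text_to_sequence
+#   ids = text_to_sequence('[EN]Hello.[EN]', symbols, ['cjke_cleaners'])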
+ +def cjke_cleaners2(text): + from text.mandarin import chinese_to_ipa + from text.japanese import japanese_to_ipa2 + from text.korean import korean_to_ipa + from text.english import english_to_ipa2 + text = re.sub(r'\[ZH\](.*?)\[ZH\]', + lambda x: chinese_to_ipa(x.group(1))+' ', text) + text = re.sub(r'\[JA\](.*?)\[JA\]', + lambda x: japanese_to_ipa2(x.group(1))+' ', text) + text = re.sub(r'\[KO\](.*?)\[KO\]', + lambda x: korean_to_ipa(x.group(1))+' ', text) + text = re.sub(r'\[EN\](.*?)\[EN\]', + lambda x: english_to_ipa2(x.group(1))+' ', text) + text = re.sub(r'\s+$', '', text) + text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) + return text + + +def thai_cleaners(text): + from text.thai import num_to_thai, latin_to_thai + text = num_to_thai(text) + text = latin_to_thai(text) + return text + + +def shanghainese_cleaners(text): + from text.shanghainese import shanghainese_to_ipa + text = shanghainese_to_ipa(text) + text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) + return text + + +def chinese_dialect_cleaners(text): + from text.mandarin import chinese_to_ipa2 + from text.japanese import japanese_to_ipa3 + from text.shanghainese import shanghainese_to_ipa + from text.cantonese import cantonese_to_ipa + from text.english import english_to_lazy_ipa2 + from text.ngu_dialect import ngu_dialect_to_ipa + text = re.sub(r'\[ZH\](.*?)\[ZH\]', + lambda x: chinese_to_ipa2(x.group(1))+' ', text) + text = re.sub(r'\[JA\](.*?)\[JA\]', + lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text) + text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5', + '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text) + text = re.sub(r'\[GD\](.*?)\[GD\]', + lambda x: cantonese_to_ipa(x.group(1))+' ', text) + text = re.sub(r'\[EN\](.*?)\[EN\]', + lambda x: english_to_lazy_ipa2(x.group(1))+' ', text) + text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group( + 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text) + text = re.sub(r'\s+$', '', text) + text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text) + return text diff --git a/text/english.py b/text/english.py new file mode 100644 index 0000000000000000000000000000000000000000..6817392ba8a9eb830351de89fb7afc5ad72f5e42 --- /dev/null +++ b/text/english.py @@ -0,0 +1,188 @@ +""" from https://github.com/keithito/tacotron """ + +''' +Cleaners are transformations that run over the input text at both training and eval time. + +Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update + the symbols in symbols.py to match your data). 
+''' + + +# Regular expression matching whitespace: + + +import re +import inflect +from unidecode import unidecode +import eng_to_ipa as ipa +_inflect = inflect.engine() +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ + ('mrs', 'misess'), + ('mr', 'mister'), + ('dr', 'doctor'), + ('st', 'saint'), + ('co', 'company'), + ('jr', 'junior'), + ('maj', 'major'), + ('gen', 'general'), + ('drs', 'doctors'), + ('rev', 'reverend'), + ('lt', 'lieutenant'), + ('hon', 'honorable'), + ('sgt', 'sergeant'), + ('capt', 'captain'), + ('esq', 'esquire'), + ('ltd', 'limited'), + ('col', 'colonel'), + ('ft', 'fort'), +]] + + +# List of (ipa, lazy ipa) pairs: +_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('r', 'ɹ'), + ('æ', 'e'), + ('ɑ', 'a'), + ('ɔ', 'o'), + ('ð', 'z'), + ('θ', 's'), + ('ɛ', 'e'), + ('ɪ', 'i'), + ('ʊ', 'u'), + ('ʒ', 'ʥ'), + ('ʤ', 'ʥ'), + ('ˈ', '↓'), +]] + +# List of (ipa, lazy ipa2) pairs: +_lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('r', 'ɹ'), + ('ð', 'z'), + ('θ', 's'), + ('ʒ', 'ʑ'), + ('ʤ', 'dʑ'), + ('ˈ', '↓'), +]] + +# List of (ipa, ipa2) pairs +_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('r', 'ɹ'), + ('ʤ', 'dʒ'), + ('ʧ', 'tʃ') +]] + + +def expand_abbreviations(text): + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text + + +def collapse_whitespace(text): + return re.sub(r'\s+', ' ', text) + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text + 
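+# Added note (not in the original file): illustrative examples of what
+# normalize_numbers is expected to produce, assuming standard inflect behaviour:
+#   normalize_numbers('$1.50')  -> '1 dollar, 50 cents'
+#   normalize_numbers('2nd')    -> 'second'
+#   normalize_numbers('1,000')  -> 'one thousand'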
+ +def mark_dark_l(text): + return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text) + + +def english_to_ipa(text): + text = unidecode(text).lower() + text = expand_abbreviations(text) + text = normalize_numbers(text) + phonemes = ipa.convert(text) + phonemes = collapse_whitespace(phonemes) + return phonemes + + +def english_to_lazy_ipa(text): + text = english_to_ipa(text) + for regex, replacement in _lazy_ipa: + text = re.sub(regex, replacement, text) + return text + + +def english_to_ipa2(text): + text = english_to_ipa(text) + text = mark_dark_l(text) + for regex, replacement in _ipa_to_ipa2: + text = re.sub(regex, replacement, text) + return text.replace('...', '…') + + +def english_to_lazy_ipa2(text): + text = english_to_ipa(text) + for regex, replacement in _lazy_ipa2: + text = re.sub(regex, replacement, text) + return text diff --git a/text/japanese.py b/text/japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..12653e96e8c0af0719fd2601f914b900eba70ffd --- /dev/null +++ b/text/japanese.py @@ -0,0 +1,153 @@ +import re +from unidecode import unidecode +import text.pyopenjtalk as pyopenjtalk + + +# Regular expression matching Japanese without punctuation marks: +_japanese_characters = re.compile( + r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') + +# Regular expression matching non-Japanese characters or punctuation marks: +_japanese_marks = re.compile( + r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') + +# List of (symbol, Japanese) pairs for marks: +_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('%', 'パーセント') +]] + +# List of (romaji, ipa) pairs for marks: +_romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('ts', 'ʦ'), + ('u', 'ɯ'), + ('j', 'ʥ'), + ('y', 'j'), + ('ni', 'n^i'), + ('nj', 'n^'), + ('hi', 'çi'), + ('hj', 'ç'), + ('f', 'ɸ'), + ('I', 'i*'), + ('U', 'ɯ*'), + ('r', 'ɾ') +]] + +# List of (romaji, ipa2) pairs for marks: +_romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('u', 'ɯ'), + ('ʧ', 'tʃ'), + ('j', 'dʑ'), + ('y', 'j'), + ('ni', 'n^i'), + ('nj', 'n^'), + ('hi', 'çi'), + ('hj', 'ç'), + ('f', 'ɸ'), + ('I', 'i*'), + ('U', 'ɯ*'), + ('r', 'ɾ') +]] + +# List of (consonant, sokuon) pairs: +_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ + (r'Q([↑↓]*[kg])', r'k#\1'), + (r'Q([↑↓]*[tdjʧ])', r't#\1'), + (r'Q([↑↓]*[sʃ])', r's\1'), + (r'Q([↑↓]*[pb])', r'p#\1') +]] + +# List of (consonant, hatsuon) pairs: +_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ + (r'N([↑↓]*[pbm])', r'm\1'), + (r'N([↑↓]*[ʧʥj])', r'n^\1'), + (r'N([↑↓]*[tdn])', r'n\1'), + (r'N([↑↓]*[kg])', r'ŋ\1') +]] + + +def symbols_to_japanese(text): + for regex, replacement in _symbols_to_japanese: + text = re.sub(regex, replacement, text) + return text + + +def japanese_to_romaji_with_accent(text): + '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' + text = symbols_to_japanese(text) + sentences = re.split(_japanese_marks, text) + marks = re.findall(_japanese_marks, text) + text = '' + for i, sentence in enumerate(sentences): + if re.match(_japanese_characters, sentence): + if text != '': + text += ' ' + labels = pyopenjtalk.extract_fullcontext(sentence) + for n, label in enumerate(labels): + phoneme = re.search(r'\-([^\+]*)\+', label).group(1) + if phoneme not in ['sil', 'pau']: + text += phoneme.replace('ch', 'ʧ').replace('sh', + 'ʃ').replace('cl', 'Q') + else: + 
continue + # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) + a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) + a2 = int(re.search(r"\+(\d+)\+", label).group(1)) + a3 = int(re.search(r"\+(\d+)/", label).group(1)) + if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: + a2_next = -1 + else: + a2_next = int( + re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) + # Accent phrase boundary + if a3 == 1 and a2_next == 1: + text += ' ' + # Falling + elif a1 == 0 and a2_next == a2 + 1: + text += '↓' + # Rising + elif a2 == 1 and a2_next == 2: + text += '↑' + if i < len(marks): + text += unidecode(marks[i]).replace(' ', '') + return text + + +def get_real_sokuon(text): + for regex, replacement in _real_sokuon: + text = re.sub(regex, replacement, text) + return text + + +def get_real_hatsuon(text): + for regex, replacement in _real_hatsuon: + text = re.sub(regex, replacement, text) + return text + + +def japanese_to_ipa(text): + text = japanese_to_romaji_with_accent(text).replace('...', '…') + text = re.sub( + r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) + text = get_real_sokuon(text) + text = get_real_hatsuon(text) + for regex, replacement in _romaji_to_ipa: + text = re.sub(regex, replacement, text) + return text + + +def japanese_to_ipa2(text): + text = japanese_to_romaji_with_accent(text).replace('...', '…') + text = get_real_sokuon(text) + text = get_real_hatsuon(text) + for regex, replacement in _romaji_to_ipa2: + text = re.sub(regex, replacement, text) + return text + + +def japanese_to_ipa3(text): + text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( + 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') + text = re.sub( + r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) + text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) + return text diff --git a/text/korean.py b/text/korean.py new file mode 100644 index 0000000000000000000000000000000000000000..edee07429a450c55e3d8e246997faaa1e0b89cc9 --- /dev/null +++ b/text/korean.py @@ -0,0 +1,210 @@ +import re +from jamo import h2j, j2hcj +import ko_pron + + +# This is a list of Korean classifiers preceded by pure Korean numerals. 
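+# When a numeral is followed by one of these classifiers, number_to_hangul() spells it out with the
+# native Korean reading, e.g. '3마리' becomes '세마리' rather than the Sino-Korean '삼마리'.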
+_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' + +# List of (hangul, hangul divided) pairs: +_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('ㄳ', 'ㄱㅅ'), + ('ㄵ', 'ㄴㅈ'), + ('ㄶ', 'ㄴㅎ'), + ('ㄺ', 'ㄹㄱ'), + ('ㄻ', 'ㄹㅁ'), + ('ㄼ', 'ㄹㅂ'), + ('ㄽ', 'ㄹㅅ'), + ('ㄾ', 'ㄹㅌ'), + ('ㄿ', 'ㄹㅍ'), + ('ㅀ', 'ㄹㅎ'), + ('ㅄ', 'ㅂㅅ'), + ('ㅘ', 'ㅗㅏ'), + ('ㅙ', 'ㅗㅐ'), + ('ㅚ', 'ㅗㅣ'), + ('ㅝ', 'ㅜㅓ'), + ('ㅞ', 'ㅜㅔ'), + ('ㅟ', 'ㅜㅣ'), + ('ㅢ', 'ㅡㅣ'), + ('ㅑ', 'ㅣㅏ'), + ('ㅒ', 'ㅣㅐ'), + ('ㅕ', 'ㅣㅓ'), + ('ㅖ', 'ㅣㅔ'), + ('ㅛ', 'ㅣㅗ'), + ('ㅠ', 'ㅣㅜ') +]] + +# List of (Latin alphabet, hangul) pairs: +_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ + ('a', '에이'), + ('b', '비'), + ('c', '시'), + ('d', '디'), + ('e', '이'), + ('f', '에프'), + ('g', '지'), + ('h', '에이치'), + ('i', '아이'), + ('j', '제이'), + ('k', '케이'), + ('l', '엘'), + ('m', '엠'), + ('n', '엔'), + ('o', '오'), + ('p', '피'), + ('q', '큐'), + ('r', '아르'), + ('s', '에스'), + ('t', '티'), + ('u', '유'), + ('v', '브이'), + ('w', '더블유'), + ('x', '엑스'), + ('y', '와이'), + ('z', '제트') +]] + +# List of (ipa, lazy ipa) pairs: +_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ + ('t͡ɕ','ʧ'), + ('d͡ʑ','ʥ'), + ('ɲ','n^'), + ('ɕ','ʃ'), + ('ʷ','w'), + ('ɭ','l`'), + ('ʎ','ɾ'), + ('ɣ','ŋ'), + ('ɰ','ɯ'), + ('ʝ','j'), + ('ʌ','ə'), + ('ɡ','g'), + ('\u031a','#'), + ('\u0348','='), + ('\u031e',''), + ('\u0320',''), + ('\u0339','') +]] + + +def latin_to_hangul(text): + for regex, replacement in _latin_to_hangul: + text = re.sub(regex, replacement, text) + return text + + +def divide_hangul(text): + text = j2hcj(h2j(text)) + for regex, replacement in _hangul_divided: + text = re.sub(regex, replacement, text) + return text + + +def hangul_number(num, sino=True): + '''Reference https://github.com/Kyubyong/g2pK''' + num = re.sub(',', '', num) + + if num == '0': + return '영' + if not sino and num == '20': + return '스무' + + digits = '123456789' + names = '일이삼사오육칠팔구' + digit2name = {d: n for d, n in zip(digits, names)} + + modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' + decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' + digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} + digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} + + spelledout = [] + for i, digit in enumerate(num): + i = len(num) - i - 1 + if sino: + if i == 0: + name = digit2name.get(digit, '') + elif i == 1: + name = digit2name.get(digit, '') + '십' + name = name.replace('일십', '십') + else: + if i == 0: + name = digit2mod.get(digit, '') + elif i == 1: + name = digit2dec.get(digit, '') + if digit == '0': + if i % 4 == 0: + last_three = spelledout[-min(3, len(spelledout)):] + if ''.join(last_three) == '': + spelledout.append('') + continue + else: + spelledout.append('') + continue + if i == 2: + name = digit2name.get(digit, '') + '백' + name = name.replace('일백', '백') + elif i == 3: + name = digit2name.get(digit, '') + '천' + name = name.replace('일천', '천') + elif i == 4: + name = digit2name.get(digit, '') + '만' + name = name.replace('일만', '만') + elif i == 5: + name = digit2name.get(digit, '') + '십' + name = name.replace('일십', '십') + elif i == 6: + name = digit2name.get(digit, '') + '백' + name = name.replace('일백', '백') + elif i == 7: + name = digit2name.get(digit, '') + '천' + name = name.replace('일천', '천') + elif i == 8: + name = digit2name.get(digit, '') + '억' + elif i == 9: + name = digit2name.get(digit, '') + '십' + elif i == 10: + name = digit2name.get(digit, '') + '백' + elif i == 11: + name = digit2name.get(digit, '') + '천' + elif i == 12: + name 
= digit2name.get(digit, '') + '조' + elif i == 13: + name = digit2name.get(digit, '') + '십' + elif i == 14: + name = digit2name.get(digit, '') + '백' + elif i == 15: + name = digit2name.get(digit, '') + '천' + spelledout.append(name) + return ''.join(elem for elem in spelledout) + + +def number_to_hangul(text): + '''Reference https://github.com/Kyubyong/g2pK''' + tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) + for token in tokens: + num, classifier = token + if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: + spelledout = hangul_number(num, sino=False) + else: + spelledout = hangul_number(num, sino=True) + text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') + # digit by digit for remaining digits + digits = '0123456789' + names = '영일이삼사오육칠팔구' + for d, n in zip(digits, names): + text = text.replace(d, n) + return text + + +def korean_to_lazy_ipa(text): + text = latin_to_hangul(text) + text = number_to_hangul(text) + text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) + for regex, replacement in _ipa_to_lazy_ipa: + text = re.sub(regex, replacement, text) + return text + + +def korean_to_ipa(text): + text = korean_to_lazy_ipa(text) + return text.replace('ʧ','tʃ').replace('ʥ','dʑ') diff --git a/text/mandarin.py b/text/mandarin.py new file mode 100644 index 0000000000000000000000000000000000000000..f076ccd8eba2e019657763cbf0156b0bc51b75a4 --- /dev/null +++ b/text/mandarin.py @@ -0,0 +1,330 @@ +import os +import sys +import re +from pypinyin import lazy_pinyin, BOPOMOFO +import jieba +import cn2an +import logging + +logging.getLogger('jieba').setLevel(logging.WARNING) +jieba.set_dictionary(r'./jieba/dict.txt') +jieba.initialize() + + +# List of (Latin alphabet, bopomofo) pairs: +_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ + ('a', 'ㄟˉ'), + ('b', 'ㄅㄧˋ'), + ('c', 'ㄙㄧˉ'), + ('d', 'ㄉㄧˋ'), + ('e', 'ㄧˋ'), + ('f', 'ㄝˊㄈㄨˋ'), + ('g', 'ㄐㄧˋ'), + ('h', 'ㄝˇㄑㄩˋ'), + ('i', 'ㄞˋ'), + ('j', 'ㄐㄟˋ'), + ('k', 'ㄎㄟˋ'), + ('l', 'ㄝˊㄛˋ'), + ('m', 'ㄝˊㄇㄨˋ'), + ('n', 'ㄣˉ'), + ('o', 'ㄡˉ'), + ('p', 'ㄆㄧˉ'), + ('q', 'ㄎㄧㄡˉ'), + ('r', 'ㄚˋ'), + ('s', 'ㄝˊㄙˋ'), + ('t', 'ㄊㄧˋ'), + ('u', 'ㄧㄡˉ'), + ('v', 'ㄨㄧˉ'), + ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'), + ('x', 'ㄝˉㄎㄨˋㄙˋ'), + ('y', 'ㄨㄞˋ'), + ('z', 'ㄗㄟˋ') +]] + +# List of (bopomofo, romaji) pairs: +_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('ㄅㄛ', 'p⁼wo'), + ('ㄆㄛ', 'pʰwo'), + ('ㄇㄛ', 'mwo'), + ('ㄈㄛ', 'fwo'), + ('ㄅ', 'p⁼'), + ('ㄆ', 'pʰ'), + ('ㄇ', 'm'), + ('ㄈ', 'f'), + ('ㄉ', 't⁼'), + ('ㄊ', 'tʰ'), + ('ㄋ', 'n'), + ('ㄌ', 'l'), + ('ㄍ', 'k⁼'), + ('ㄎ', 'kʰ'), + ('ㄏ', 'h'), + ('ㄐ', 'ʧ⁼'), + ('ㄑ', 'ʧʰ'), + ('ㄒ', 'ʃ'), + ('ㄓ', 'ʦ`⁼'), + ('ㄔ', 'ʦ`ʰ'), + ('ㄕ', 's`'), + ('ㄖ', 'ɹ`'), + ('ㄗ', 'ʦ⁼'), + ('ㄘ', 'ʦʰ'), + ('ㄙ', 's'), + ('ㄚ', 'a'), + ('ㄛ', 'o'), + ('ㄜ', 'ə'), + ('ㄝ', 'e'), + ('ㄞ', 'ai'), + ('ㄟ', 'ei'), + ('ㄠ', 'au'), + ('ㄡ', 'ou'), + ('ㄧㄢ', 'yeNN'), + ('ㄢ', 'aNN'), + ('ㄧㄣ', 'iNN'), + ('ㄣ', 'əNN'), + ('ㄤ', 'aNg'), + ('ㄧㄥ', 'iNg'), + ('ㄨㄥ', 'uNg'), + ('ㄩㄥ', 'yuNg'), + ('ㄥ', 'əNg'), + ('ㄦ', 'əɻ'), + ('ㄧ', 'i'), + ('ㄨ', 'u'), + ('ㄩ', 'ɥ'), + ('ˉ', '→'), + ('ˊ', '↑'), + ('ˇ', '↓↑'), + ('ˋ', '↓'), + ('˙', ''), + (',', ','), + ('。', '.'), + ('!', '!'), + ('?', '?'), + ('—', '-') +]] + +# List of (romaji, ipa) pairs: +_romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ + ('ʃy', 'ʃ'), + ('ʧʰy', 'ʧʰ'), + ('ʧ⁼y', 'ʧ⁼'), + ('NN', 'n'), + ('Ng', 'ŋ'), + ('y', 'j'), + ('h', 'x') +]] + +# List of (bopomofo, ipa) pairs: 
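+# (tones are rendered with the arrow marks → ↑ ↓↑ ↓, matching the romaji table above;
+# the ipa2 table below uses Chao tone letters ˥ ˧˥ ˨˩˦ ˥˩ instead)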
+_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('ㄅㄛ', 'p⁼wo'), + ('ㄆㄛ', 'pʰwo'), + ('ㄇㄛ', 'mwo'), + ('ㄈㄛ', 'fwo'), + ('ㄅ', 'p⁼'), + ('ㄆ', 'pʰ'), + ('ㄇ', 'm'), + ('ㄈ', 'f'), + ('ㄉ', 't⁼'), + ('ㄊ', 'tʰ'), + ('ㄋ', 'n'), + ('ㄌ', 'l'), + ('ㄍ', 'k⁼'), + ('ㄎ', 'kʰ'), + ('ㄏ', 'x'), + ('ㄐ', 'tʃ⁼'), + ('ㄑ', 'tʃʰ'), + ('ㄒ', 'ʃ'), + ('ㄓ', 'ts`⁼'), + ('ㄔ', 'ts`ʰ'), + ('ㄕ', 's`'), + ('ㄖ', 'ɹ`'), + ('ㄗ', 'ts⁼'), + ('ㄘ', 'tsʰ'), + ('ㄙ', 's'), + ('ㄚ', 'a'), + ('ㄛ', 'o'), + ('ㄜ', 'ə'), + ('ㄝ', 'ɛ'), + ('ㄞ', 'aɪ'), + ('ㄟ', 'eɪ'), + ('ㄠ', 'ɑʊ'), + ('ㄡ', 'oʊ'), + ('ㄧㄢ', 'jɛn'), + ('ㄩㄢ', 'ɥæn'), + ('ㄢ', 'an'), + ('ㄧㄣ', 'in'), + ('ㄩㄣ', 'ɥn'), + ('ㄣ', 'ən'), + ('ㄤ', 'ɑŋ'), + ('ㄧㄥ', 'iŋ'), + ('ㄨㄥ', 'ʊŋ'), + ('ㄩㄥ', 'jʊŋ'), + ('ㄥ', 'əŋ'), + ('ㄦ', 'əɻ'), + ('ㄧ', 'i'), + ('ㄨ', 'u'), + ('ㄩ', 'ɥ'), + ('ˉ', '→'), + ('ˊ', '↑'), + ('ˇ', '↓↑'), + ('ˋ', '↓'), + ('˙', ''), + (',', ','), + ('。', '.'), + ('!', '!'), + ('?', '?'), + ('—', '-') +]] + +# List of (bopomofo, ipa2) pairs: +_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('ㄅㄛ', 'pwo'), + ('ㄆㄛ', 'pʰwo'), + ('ㄇㄛ', 'mwo'), + ('ㄈㄛ', 'fwo'), + ('ㄅ', 'p'), + ('ㄆ', 'pʰ'), + ('ㄇ', 'm'), + ('ㄈ', 'f'), + ('ㄉ', 't'), + ('ㄊ', 'tʰ'), + ('ㄋ', 'n'), + ('ㄌ', 'l'), + ('ㄍ', 'k'), + ('ㄎ', 'kʰ'), + ('ㄏ', 'h'), + ('ㄐ', 'tɕ'), + ('ㄑ', 'tɕʰ'), + ('ㄒ', 'ɕ'), + ('ㄓ', 'tʂ'), + ('ㄔ', 'tʂʰ'), + ('ㄕ', 'ʂ'), + ('ㄖ', 'ɻ'), + ('ㄗ', 'ts'), + ('ㄘ', 'tsʰ'), + ('ㄙ', 's'), + ('ㄚ', 'a'), + ('ㄛ', 'o'), + ('ㄜ', 'ɤ'), + ('ㄝ', 'ɛ'), + ('ㄞ', 'aɪ'), + ('ㄟ', 'eɪ'), + ('ㄠ', 'ɑʊ'), + ('ㄡ', 'oʊ'), + ('ㄧㄢ', 'jɛn'), + ('ㄩㄢ', 'yæn'), + ('ㄢ', 'an'), + ('ㄧㄣ', 'in'), + ('ㄩㄣ', 'yn'), + ('ㄣ', 'ən'), + ('ㄤ', 'ɑŋ'), + ('ㄧㄥ', 'iŋ'), + ('ㄨㄥ', 'ʊŋ'), + ('ㄩㄥ', 'jʊŋ'), + ('ㄥ', 'ɤŋ'), + ('ㄦ', 'əɻ'), + ('ㄧ', 'i'), + ('ㄨ', 'u'), + ('ㄩ', 'y'), + ('ˉ', '˥'), + ('ˊ', '˧˥'), + ('ˇ', '˨˩˦'), + ('ˋ', '˥˩'), + ('˙', ''), + (',', ','), + ('。', '.'), + ('!', '!'), + ('?', '?'), + ('—', '-') +]] + + +def number_to_chinese(text): + numbers = re.findall(r'\d+(?:\.?\d+)?', text) + for number in numbers: + text = text.replace(number, cn2an.an2cn(number), 1) + return text + + +def chinese_to_bopomofo(text): + text = text.replace('、', ',').replace(';', ',').replace(':', ',') + words = jieba.lcut(text, cut_all=False) + text = '' + for word in words: + bopomofos = lazy_pinyin(word, BOPOMOFO) + if not re.search('[\u4e00-\u9fff]', word): + text += word + continue + for i in range(len(bopomofos)): + bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i]) + if text != '': + text += ' ' + text += ''.join(bopomofos) + return text + + +def latin_to_bopomofo(text): + for regex, replacement in _latin_to_bopomofo: + text = re.sub(regex, replacement, text) + return text + + +def bopomofo_to_romaji(text): + for regex, replacement in _bopomofo_to_romaji: + text = re.sub(regex, replacement, text) + return text + + +def bopomofo_to_ipa(text): + for regex, replacement in _bopomofo_to_ipa: + text = re.sub(regex, replacement, text) + return text + + +def bopomofo_to_ipa2(text): + for regex, replacement in _bopomofo_to_ipa2: + text = re.sub(regex, replacement, text) + return text + + +def chinese_to_romaji(text): + text = number_to_chinese(text) + text = chinese_to_bopomofo(text) + text = latin_to_bopomofo(text) + text = bopomofo_to_romaji(text) + text = re.sub('i([aoe])', r'y\1', text) + text = re.sub('u([aoəe])', r'w\1', text) + text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', + r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') + text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) + return text + + +def chinese_to_lazy_ipa(text): + text 
= chinese_to_romaji(text) + for regex, replacement in _romaji_to_ipa: + text = re.sub(regex, replacement, text) + return text + + +def chinese_to_ipa(text): + text = number_to_chinese(text) + text = chinese_to_bopomofo(text) + text = latin_to_bopomofo(text) + text = bopomofo_to_ipa(text) + text = re.sub('i([aoe])', r'j\1', text) + text = re.sub('u([aoəe])', r'w\1', text) + text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', + r'\1ɹ`\2', text).replace('ɻ', 'ɹ`') + text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text) + return text + + +def chinese_to_ipa2(text): + text = number_to_chinese(text) + text = chinese_to_bopomofo(text) + text = latin_to_bopomofo(text) + text = bopomofo_to_ipa2(text) + text = re.sub(r'i([aoe])', r'j\1', text) + text = re.sub(r'u([aoəe])', r'w\1', text) + text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text) + text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text) + return text diff --git a/text/ngu_dialect.py b/text/ngu_dialect.py new file mode 100644 index 0000000000000000000000000000000000000000..ce3e12bbf0469426872eed5f681985d3e1be9b26 --- /dev/null +++ b/text/ngu_dialect.py @@ -0,0 +1,30 @@ +import re +import opencc + + +dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou', + 'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing', + 'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang', + 'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan', + 'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen', + 'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'} + +converters = {} + +for dialect in dialects.values(): + try: + converters[dialect] = opencc.OpenCC(dialect) + except: + pass + + +def ngu_dialect_to_ipa(text, dialect): + dialect = dialects[dialect] + text = converters[dialect].convert(text).replace('-','').replace('$',' ') + text = re.sub(r'[、;:]', ',', text) + text = re.sub(r'\s*,\s*', ', ', text) + text = re.sub(r'\s*。\s*', '. ', text) + text = re.sub(r'\s*?\s*', '? ', text) + text = re.sub(r'\s*!\s*', '! 
', text) + text = re.sub(r'\s*$', '', text) + return text diff --git a/text/sanskrit.py b/text/sanskrit.py new file mode 100644 index 0000000000000000000000000000000000000000..0223aaac384a2f850f5bc20651fc18eb964607d0 --- /dev/null +++ b/text/sanskrit.py @@ -0,0 +1,62 @@ +import re +from indic_transliteration import sanscript + + +# List of (iast, ipa) pairs: +_iast_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('a', 'ə'), + ('ā', 'aː'), + ('ī', 'iː'), + ('ū', 'uː'), + ('ṛ', 'ɹ`'), + ('ṝ', 'ɹ`ː'), + ('ḷ', 'l`'), + ('ḹ', 'l`ː'), + ('e', 'eː'), + ('o', 'oː'), + ('k', 'k⁼'), + ('k⁼h', 'kʰ'), + ('g', 'g⁼'), + ('g⁼h', 'gʰ'), + ('ṅ', 'ŋ'), + ('c', 'ʧ⁼'), + ('ʧ⁼h', 'ʧʰ'), + ('j', 'ʥ⁼'), + ('ʥ⁼h', 'ʥʰ'), + ('ñ', 'n^'), + ('ṭ', 't`⁼'), + ('t`⁼h', 't`ʰ'), + ('ḍ', 'd`⁼'), + ('d`⁼h', 'd`ʰ'), + ('ṇ', 'n`'), + ('t', 't⁼'), + ('t⁼h', 'tʰ'), + ('d', 'd⁼'), + ('d⁼h', 'dʰ'), + ('p', 'p⁼'), + ('p⁼h', 'pʰ'), + ('b', 'b⁼'), + ('b⁼h', 'bʰ'), + ('y', 'j'), + ('ś', 'ʃ'), + ('ṣ', 's`'), + ('r', 'ɾ'), + ('l̤', 'l`'), + ('h', 'ɦ'), + ("'", ''), + ('~', '^'), + ('ṃ', '^') +]] + + +def devanagari_to_ipa(text): + text = text.replace('ॐ', 'ओम्') + text = re.sub(r'\s*।\s*$', '.', text) + text = re.sub(r'\s*।\s*', ', ', text) + text = re.sub(r'\s*॥', '.', text) + text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST) + for regex, replacement in _iast_to_ipa: + text = re.sub(regex, replacement, text) + text = re.sub('(.)[`ː]*ḥ', lambda x: x.group(0) + [:-1]+'h'+x.group(1)+'*', text) + return text diff --git a/text/shanghainese.py b/text/shanghainese.py new file mode 100644 index 0000000000000000000000000000000000000000..cb29c24a08d2e406e8399cf7bc9fe5cb43cb9c61 --- /dev/null +++ b/text/shanghainese.py @@ -0,0 +1,64 @@ +import re +import cn2an +import opencc + + +converter = opencc.OpenCC('zaonhe') + +# List of (Latin alphabet, ipa) pairs: +_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ + ('A', 'ᴇ'), + ('B', 'bi'), + ('C', 'si'), + ('D', 'di'), + ('E', 'i'), + ('F', 'ᴇf'), + ('G', 'dʑi'), + ('H', 'ᴇtɕʰ'), + ('I', 'ᴀi'), + ('J', 'dʑᴇ'), + ('K', 'kʰᴇ'), + ('L', 'ᴇl'), + ('M', 'ᴇm'), + ('N', 'ᴇn'), + ('O', 'o'), + ('P', 'pʰi'), + ('Q', 'kʰiu'), + ('R', 'ᴀl'), + ('S', 'ᴇs'), + ('T', 'tʰi'), + ('U', 'ɦiu'), + ('V', 'vi'), + ('W', 'dᴀbɤliu'), + ('X', 'ᴇks'), + ('Y', 'uᴀi'), + ('Z', 'zᴇ') +]] + + +def _number_to_shanghainese(num): + num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两') + return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num) + + +def number_to_shanghainese(text): + return re.sub(r'\d+(?:\.?\d+)?', lambda x: _number_to_shanghainese(x.group()), text) + + +def latin_to_ipa(text): + for regex, replacement in _latin_to_ipa: + text = re.sub(regex, replacement, text) + return text + + +def shanghainese_to_ipa(text): + text = number_to_shanghainese(text.upper()) + text = converter.convert(text).replace('-','').replace('$',' ') + text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text) + text = re.sub(r'[、;:]', ',', text) + text = re.sub(r'\s*,\s*', ', ', text) + text = re.sub(r'\s*。\s*', '. ', text) + text = re.sub(r'\s*?\s*', '? ', text) + text = re.sub(r'\s*!\s*', '! 
', text) + text = re.sub(r'\s*$', '', text) + return text diff --git a/text/thai.py b/text/thai.py new file mode 100644 index 0000000000000000000000000000000000000000..998207c01a85c710a46db1ec8b62c39c2d94bc84 --- /dev/null +++ b/text/thai.py @@ -0,0 +1,44 @@ +import re +from num_thai.thainumbers import NumThai + + +num = NumThai() + +# List of (Latin alphabet, Thai) pairs: +_latin_to_thai = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ + ('a', 'เอ'), + ('b','บี'), + ('c','ซี'), + ('d','ดี'), + ('e','อี'), + ('f','เอฟ'), + ('g','จี'), + ('h','เอช'), + ('i','ไอ'), + ('j','เจ'), + ('k','เค'), + ('l','แอล'), + ('m','เอ็ม'), + ('n','เอ็น'), + ('o','โอ'), + ('p','พี'), + ('q','คิว'), + ('r','แอร์'), + ('s','เอส'), + ('t','ที'), + ('u','ยู'), + ('v','วี'), + ('w','ดับเบิลยู'), + ('x','เอ็กซ์'), + ('y','วาย'), + ('z','ซี') +]] + + +def num_to_thai(text): + return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', lambda x: ''.join(num.NumberToTextThai(float(x.group(0).replace(',', '')))), text) + +def latin_to_thai(text): + for regex, replacement in _latin_to_thai: + text = re.sub(regex, replacement, text) + return text diff --git a/transforms.py b/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..4793d67ca5a5630e0ffe0f9fb29445c949e64dae --- /dev/null +++ b/transforms.py @@ -0,0 +1,193 @@ +import torch +from torch.nn import functional as F + +import numpy as np + + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform(inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1., + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE): + + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = { + 'tails': tails, + 'tail_bound': tail_bound + } + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum( + inputs[..., None] >= bin_locations, + dim=-1 + ) - 1 + + +def unconstrained_rational_quadratic_spline(inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails='linear', + tail_bound=1., + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == 'linear': + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError('{} tails are not implemented.'.format(tails)) + + outputs[inside_interval_mask], logabsdet[inside_interval_mask] 
= rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative + ) + + return outputs, logabsdet + +def rational_quadratic_spline(inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0., right=1., bottom=0., top=1., + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError('Input to a transform is not within its domain') + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError('Minimal bin width too large for the number of bins') + if min_bin_height * num_bins > 1.0: + raise ValueError('Minimal bin height too large for the number of bins') + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (((inputs - input_cumheights) * (input_derivatives + + input_derivatives_plus_one + - 2 * input_delta) + + input_heights * (input_delta - input_derivatives))) + b = (input_heights * input_derivatives + - (inputs - input_cumheights) * (input_derivatives + + input_derivatives_plus_one + - 2 * input_delta)) + c = - input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta) + derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - 
root).pow(2)) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * (input_delta * theta.pow(2) + + input_derivatives * theta_one_minus_theta) + denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2)) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/utils_vits.py b/utils_vits.py new file mode 100644 index 0000000000000000000000000000000000000000..07839a71a8339f90fe7eeff4dc4a6bd284330049 --- /dev/null +++ b/utils_vits.py @@ -0,0 +1,75 @@ +import logging +from json import loads +from torch import load, FloatTensor +from numpy import float32 +import librosa + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + + +def load_checkpoint(checkpoint_path, model): + checkpoint_dict = load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + saved_state_dict = checkpoint_dict['model'] + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict= {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + logging.info("%s is not in the checkpoint" % k) + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + logging.info("Loaded checkpoint '{}' (iteration {})" .format( + checkpoint_path, iteration)) + return + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = loads(data) + + hparams = HParams(**config) + return hparams + + +def load_audio_to_torch(full_path, target_sampling_rate): + audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True) + return FloatTensor(audio.astype(float32)) diff --git a/visual_chatgpt.py b/visual_chatgpt.py new file mode 100644 index 0000000000000000000000000000000000000000..a6a1b7f1a6fe533b2b16a313f3085a53f4280ac2 --- /dev/null +++ b/visual_chatgpt.py @@ -0,0 +1,908 @@ +import os +import gradio as gr +import random +import torch +import cv2 +import re +import uuid +from PIL import Image +import numpy as np +import argparse + +from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation +from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering +from transformers import AutoImageProcessor, UperNetForSemanticSegmentation + +from diffusers import StableDiffusionPipeline, 
StableDiffusionInpaintPipeline, StableDiffusionInstructPix2PixPipeline
+from diffusers import EulerAncestralDiscreteScheduler
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector
+
+from langchain.agents.initialize import initialize_agent
+from langchain.agents.tools import Tool
+from langchain.chains.conversation.memory import ConversationBufferMemory
+from langchain.llms.openai import OpenAI
+
+VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to assist with a wide range of text and visual tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
+
+Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT cannot directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict about the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT also knows that the image may not match the user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation if a new image is generated.
+
+Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish the following tasks rather than imagining directly from the description.
+
+Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
+
+
+TOOLS:
+------
+
+Visual ChatGPT has access to the following tools:"""
+
+VISUAL_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
+
+```
+Thought: Do I need to use a tool? Yes
+Action: the action to take, should be one of [{tool_names}]
+Action Input: the input to the action
+Observation: the result of the action
+```
+
+When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format:
+
+```
+Thought: Do I need to use a tool? No
+{ai_prefix}: [your response here]
+```
+"""
+
+VISUAL_CHATGPT_SUFFIX = """You are very strict about filename correctness and will never fake a file name that does not exist.
+You will remember to provide the image file name loyally if it's provided in the last tool observation.
+
+Begin!
+
+Previous conversation history:
+{chat_history}
+
+New input: {input}
+Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to observe images rather than relying on imagination.
+The thoughts and observations are only visible to Visual ChatGPT; Visual ChatGPT should remember to repeat important information in the final response for Human.
+Thought: Do I need to use a tool? {agent_scratchpad}""" + +os.makedirs('image', exist_ok=True) + + +def seed_everything(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + return seed + + +def prompts(name, description): + def decorator(func): + func.name = name + func.description = description + return func + + return decorator + + +def cut_dialogue_history(history_memory, keep_last_n_words=500): + tokens = history_memory.split() + n_tokens = len(tokens) + print(f"hitory_memory:{history_memory}, n_tokens: {n_tokens}") + if n_tokens < keep_last_n_words: + return history_memory + else: + paragraphs = history_memory.split('\n') + last_n_tokens = n_tokens + while last_n_tokens >= keep_last_n_words: + last_n_tokens = last_n_tokens - len(paragraphs[0].split(' ')) + paragraphs = paragraphs[1:] + return '\n' + '\n'.join(paragraphs) + + +def get_new_image_name(org_img_name, func_name="update"): + head_tail = os.path.split(org_img_name) + head = head_tail[0] + tail = head_tail[1] + name_split = tail.split('.')[0].split('_') + this_new_uuid = str(uuid.uuid4())[0:4] + if len(name_split) == 1: + most_org_file_name = name_split[0] + recent_prev_file_name = name_split[0] + new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) + else: + assert len(name_split) == 4 + most_org_file_name = name_split[3] + recent_prev_file_name = name_split[0] + new_file_name = '{}_{}_{}_{}.png'.format(this_new_uuid, func_name, recent_prev_file_name, most_org_file_name) + return os.path.join(head, new_file_name) + + +class MaskFormer: + def __init__(self, device): + print("Initializing MaskFormer to %s" % device) + self.device = device + self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device) + + def inference(self, image_path, text): + threshold = 0.5 + min_area = 0.02 + padding = 20 + original_image = Image.open(image_path) + image = original_image.resize((512, 512)) + inputs = self.processor(text=text, images=image, padding="max_length", return_tensors="pt").to(self.device) + with torch.no_grad(): + outputs = self.model(**inputs) + mask = torch.sigmoid(outputs[0]).squeeze().cpu().numpy() > threshold + area_ratio = len(np.argwhere(mask)) / (mask.shape[0] * mask.shape[1]) + if area_ratio < min_area: + return None + true_indices = np.argwhere(mask) + mask_array = np.zeros_like(mask, dtype=bool) + for idx in true_indices: + padded_slice = tuple(slice(max(0, i - padding), i + padding + 1) for i in idx) + mask_array[padded_slice] = True + visual_mask = (mask_array * 255).astype(np.uint8) + image_mask = Image.fromarray(visual_mask) + return image_mask.resize(original_image.size) + + +class ImageEditing: + def __init__(self, device): + print("Initializing ImageEditing to %s" % device) + self.device = device + self.mask_former = MaskFormer(device=self.device) + self.revision = 'fp16' if 'cuda' in device else None + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.inpaint = StableDiffusionInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device) + + @prompts(name="Remove Something From The Photo", + description="useful when you want to remove and object or something from the photo " + "from its description or location. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the object need to be removed. ") + def inference_remove(self, inputs): + image_path, to_be_removed_txt = inputs.split(",") + return self.inference_replace(f"{image_path},{to_be_removed_txt},background") + + @prompts(name="Replace Something From The Photo", + description="useful when you want to replace an object from the object description or " + "location with another object from its description. " + "The input to this tool should be a comma seperated string of three, " + "representing the image_path, the object to be replaced, the object to be replaced with ") + def inference_replace(self, inputs): + image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",") + original_image = Image.open(image_path) + original_size = original_image.size + mask_image = self.mask_former.inference(image_path, to_be_replaced_txt) + updated_image = self.inpaint(prompt=replace_with_txt, image=original_image.resize((512, 512)), + mask_image=mask_image.resize((512, 512))).images[0] + updated_image_path = get_new_image_name(image_path, func_name="replace-something") + updated_image = updated_image.resize(original_size) + updated_image.save(updated_image_path) + print( + f"\nProcessed ImageEditing, Input Image: {image_path}, Replace {to_be_replaced_txt} to {replace_with_txt}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class InstructPix2Pix: + def __init__(self, device): + print("Initializing InstructPix2Pix to %s" % device) + self.device = device + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", + safety_checker=None, + torch_dtype=self.torch_dtype).to(device) + self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config) + + @prompts(name="Instruct Image Using Text", + description="useful when you want to the style of the image to be like the text. " + "like: make it look like a painting. or make it like a robot. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the text. 
") + def inference(self, inputs): + """Change style of image.""" + print("===>Starting InstructPix2Pix Inference") + image_path, text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + original_image = Image.open(image_path) + image = self.pipe(text, image=original_image, num_inference_steps=40, image_guidance_scale=1.2).images[0] + updated_image_path = get_new_image_name(image_path, func_name="pix2pix") + image.save(updated_image_path) + print(f"\nProcessed InstructPix2Pix, Input Image: {image_path}, Instruct Text: {text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class Text2Image: + def __init__(self, device): + print("Initializing Text2Image to %s" % device) + self.device = device + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", + torch_dtype=self.torch_dtype) + self.pipe.to(device) + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image From User Input Text", + description="useful when you want to generate an image from a user input text and save it to a file. " + "like: generate an image of an object or something, or generate an image that includes some objects. " + "The input to this tool should be a string, representing the text used to generate image. ") + def inference(self, text): + image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") + prompt = text + ', ' + self.a_prompt + image = self.pipe(prompt, negative_prompt=self.n_prompt).images[0] + image.save(image_filename) + print( + f"\nProcessed Text2Image, Input Text: {text}, Output Image: {image_filename}") + return image_filename + + +class ImageCaptioning: + def __init__(self, device): + print("Initializing ImageCaptioning to %s" % device) + self.device = device + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + self.model = BlipForConditionalGeneration.from_pretrained( + "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device) + + @prompts(name="Get Photo Description", + description="useful when you want to know what is inside the photo. receives image_path as input. " + "The input to this tool should be a string, representing the image_path. ") + def inference(self, image_path): + inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype) + out = self.model.generate(**inputs) + captions = self.processor.decode(out[0], skip_special_tokens=True) + print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}") + return captions + + +class Image2Canny: + def __init__(self, device): + print("Initializing Image2Canny") + self.low_threshold = 100 + self.high_threshold = 200 + + @prompts(name="Edge Detection On Image", + description="useful when you want to detect the edge of the image. " + "like: detect the edges of this image, or canny detection on image, " + "or perform edge detection on this image, or detect the canny image of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + image = np.array(image) + canny = cv2.Canny(image, self.low_threshold, self.high_threshold) + canny = canny[:, :, None] + canny = np.concatenate([canny, canny, canny], axis=2) + canny = Image.fromarray(canny) + updated_image_path = get_new_image_name(inputs, func_name="edge") + canny.save(updated_image_path) + print(f"\nProcessed Image2Canny, Input Image: {inputs}, Output Text: {updated_image_path}") + return updated_image_path + + +class CannyText2Image: + def __init__(self, device): + print("Initializing CannyText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Canny Image", + description="useful when you want to generate a new real image from both the user desciption and a canny image." + " like: generate a real image of a object or something from this canny image," + " or generate a new real image of a object or something from this edge image. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description. ") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="canny2image") + image.save(updated_image_path) + print(f"\nProcessed CannyText2Image, Input Canny: {image_path}, Input Text: {instruct_text}, " + f"Output Text: {updated_image_path}") + return updated_image_path + + +class Image2Line: + def __init__(self, device): + print("Initializing Image2Line") + self.detector = MLSDdetector.from_pretrained('lllyasviel/ControlNet') + + @prompts(name="Line Detection On Image", + description="useful when you want to detect the straight line of the image. " + "like: detect the straight lines of this image, or straight line detection on image, " + "or peform straight line detection on this image, or detect the straight line image of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + mlsd = self.detector(image) + updated_image_path = get_new_image_name(inputs, func_name="line-of") + mlsd.save(updated_image_path) + print(f"\nProcessed Image2Line, Input Image: {inputs}, Output Line: {updated_image_path}") + return updated_image_path + + +class LineText2Image: + def __init__(self, device): + print("Initializing LineText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Line Image", + description="useful when you want to generate a new real image from both the user desciption " + "and a straight line image. " + "like: generate a real image of a object or something from this straight line image, " + "or generate a new real image of a object or something from this straight lines. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description. ") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="line2image") + image.save(updated_image_path) + print(f"\nProcessed LineText2Image, Input Line: {image_path}, Input Text: {instruct_text}, " + f"Output Text: {updated_image_path}") + return updated_image_path + + +class Image2Hed: + def __init__(self, device): + print("Initializing Image2Hed") + self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet') + + @prompts(name="Hed Detection On Image", + description="useful when you want to detect the soft hed boundary of the image. " + "like: detect the soft hed boundary of this image, or hed boundary detection on image, " + "or peform hed boundary detection on this image, or detect soft hed boundary image of this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + hed = self.detector(image) + updated_image_path = get_new_image_name(inputs, func_name="hed-boundary") + hed.save(updated_image_path) + print(f"\nProcessed Image2Hed, Input Image: {inputs}, Output Hed: {updated_image_path}") + return updated_image_path + + +class HedText2Image: + def __init__(self, device): + print("Initializing HedText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Soft Hed Boundary Image", + description="useful when you want to generate a new real image from both the user desciption " + "and a soft hed boundary image. " + "like: generate a real image of a object or something from this soft hed boundary image, " + "or generate a new real image of a object or something from this hed boundary. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="hed2image") + image.save(updated_image_path) + print(f"\nProcessed HedText2Image, Input Hed: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class Image2Scribble: + def __init__(self, device): + print("Initializing Image2Scribble") + self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet') + + @prompts(name="Sketch Detection On Image", + description="useful when you want to generate a scribble of the image. " + "like: generate a scribble of this image, or generate a sketch from this image, " + "detect the sketch from this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + scribble = self.detector(image, scribble=True) + updated_image_path = get_new_image_name(inputs, func_name="scribble") + scribble.save(updated_image_path) + print(f"\nProcessed Image2Scribble, Input Image: {inputs}, Output Scribble: {updated_image_path}") + return updated_image_path + + +class ScribbleText2Image: + def __init__(self, device): + print("Initializing ScribbleText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype + ) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \ + 'fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Sketch Image", + description="useful when you want to generate a new real image from both the user desciption and " + "a scribble image or a sketch image. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="scribble2image") + image.save(updated_image_path) + print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class Image2Pose: + def __init__(self, device): + print("Initializing Image2Pose") + self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet') + + @prompts(name="Pose Detection On Image", + description="useful when you want to detect the human pose of the image. " + "like: generate human poses of this image, or generate a pose image from this image. 
" + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + pose = self.detector(image) + updated_image_path = get_new_image_name(inputs, func_name="human-pose") + pose.save(updated_image_path) + print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}") + return updated_image_path + + +class PoseText2Image: + def __init__(self, device): + print("Initializing PoseText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.num_inference_steps = 20 + self.seed = -1 + self.unconditional_guidance_scale = 9.0 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Pose Image", + description="useful when you want to generate a new real image from both the user desciption " + "and a human pose image. " + "like: generate a real image of a human from this human pose image, " + "or generate a new real image of a human from this pose. " + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="pose2image") + image.save(updated_image_path) + print(f"\nProcessed PoseText2Image, Input Pose: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class Image2Seg: + def __init__(self, device): + print("Initializing Image2Seg") + self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small") + self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small") + self.ade_palette = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], 
[255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + + @prompts(name="Segmentation On Image", + description="useful when you want to detect segmentations of the image. " + "like: segment this image, or generate segmentations on this image, " + "or peform segmentation on this image. " + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + pixel_values = self.image_processor(image, return_tensors="pt").pixel_values + with torch.no_grad(): + outputs = self.image_segmentor(pixel_values) + seg = self.image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0] + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3 + palette = np.array(self.ade_palette) + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + color_seg = color_seg.astype(np.uint8) + segmentation = Image.fromarray(color_seg) + updated_image_path = get_new_image_name(inputs, func_name="segmentation") + segmentation.save(updated_image_path) + print(f"\nProcessed Image2Pose, Input Image: {inputs}, Output Pose: {updated_image_path}") + return updated_image_path + + +class SegText2Image: + def __init__(self, device): + print("Initializing SegText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg", + torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Segmentations", + description="useful when 
you want to generate a new real image from both the user description and segmentations. " + "like: generate a real image of a object or something from this segmentation image, " + "or generate a new real image of a object or something from these segmentations. " + "The input to this tool should be a comma separated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="segment2image") + image.save(updated_image_path) + print(f"\nProcessed SegText2Image, Input Seg: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class Image2Depth: + def __init__(self, device): + print("Initializing Image2Depth") + self.depth_estimator = pipeline('depth-estimation') + + @prompts(name="Predict Depth On Image", + description="useful when you want to detect depth of the image. like: generate the depth from this image, " + "or detect the depth map on this image, or predict the depth for this image. " + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + depth = self.depth_estimator(image)['depth'] + depth = np.array(depth) + depth = depth[:, :, None] + depth = np.concatenate([depth, depth, depth], axis=2) + depth = Image.fromarray(depth) + updated_image_path = get_new_image_name(inputs, func_name="depth") + depth.save(updated_image_path) + print(f"\nProcessed Image2Depth, Input Image: {inputs}, Output Depth: {updated_image_path}") + return updated_image_path + + +class DepthText2Image: + def __init__(self, device): + print("Initializing DepthText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Depth", + description="useful when you want to generate a new real image from both the user description and depth image. " + "like: generate a real image of a object or something from this depth image, " + "or generate a new real image of a object or something from the depth map. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="depth2image") + image.save(updated_image_path) + print(f"\nProcessed DepthText2Image, Input Depth: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class Image2Normal: + def __init__(self, device): + print("Initializing Image2Normal") + self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas") + self.bg_threhold = 0.4 + + @prompts(name="Predict Normal Map On Image", + description="useful when you want to detect norm map of the image. " + "like: generate normal map from this image, or predict normal map of this image. " + "The input to this tool should be a string, representing the image_path") + def inference(self, inputs): + image = Image.open(inputs) + original_size = image.size + image = self.depth_estimator(image)['predicted_depth'][0] + image = image.numpy() + image_depth = image.copy() + image_depth -= np.min(image_depth) + image_depth /= np.max(image_depth) + x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3) + x[image_depth < self.bg_threhold] = 0 + y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3) + y[image_depth < self.bg_threhold] = 0 + z = np.ones_like(x) * np.pi * 2.0 + image = np.stack([x, y, z], axis=2) + image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5 + image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8) + image = Image.fromarray(image) + image = image.resize(original_size) + updated_image_path = get_new_image_name(inputs, func_name="normal-map") + image.save(updated_image_path) + print(f"\nProcessed Image2Normal, Input Image: {inputs}, Output Depth: {updated_image_path}") + return updated_image_path + + +class NormalText2Image: + def __init__(self, device): + print("Initializing NormalText2Image to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.controlnet = ControlNetModel.from_pretrained( + "fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=self.torch_dtype) + self.pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None, + torch_dtype=self.torch_dtype) + self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config) + self.pipe.to(device) + self.seed = -1 + self.a_prompt = 'best quality, extremely detailed' + self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \ + ' fewer digits, cropped, worst quality, low quality' + + @prompts(name="Generate Image Condition On Normal Map", + description="useful when you want to generate a new real image from both the user desciption and normal map. " + "like: generate a real image of a object or something from this normal map, " + "or generate a new real image of a object or something from the normal map. 
" + "The input to this tool should be a comma seperated string of two, " + "representing the image_path and the user description") + def inference(self, inputs): + image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:]) + image = Image.open(image_path) + self.seed = random.randint(0, 65535) + seed_everything(self.seed) + prompt = instruct_text + ', ' + self.a_prompt + image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt, + guidance_scale=9.0).images[0] + updated_image_path = get_new_image_name(image_path, func_name="normal2image") + image.save(updated_image_path) + print(f"\nProcessed NormalText2Image, Input Normal: {image_path}, Input Text: {instruct_text}, " + f"Output Image: {updated_image_path}") + return updated_image_path + + +class VisualQuestionAnswering: + def __init__(self, device): + print("Initializing VisualQuestionAnswering to %s" % device) + self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32 + self.device = device + self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") + self.model = BlipForQuestionAnswering.from_pretrained( + "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device) + + @prompts(name="Answer Question About The Image", + description="useful when you need an answer for a question based on an image. " + "like: what is the background color of the last image, how many cats in this figure, what is in this figure. " + "The input to this tool should be a comma seperated string of two, representing the image_path and the question") + def inference(self, inputs): + image_path, question = inputs.split(",") + raw_image = Image.open(image_path).convert('RGB') + inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype) + out = self.model.generate(**inputs) + answer = self.processor.decode(out[0], skip_special_tokens=True) + print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, " + f"Output Answer: {answer}") + return answer + + +class ConversationBot: + def __init__(self, load_dict): + # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...} + print(f"Initializing VisualChatGPT, load_dict={load_dict}") + if 'ImageCaptioning' not in load_dict: + raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT") + + self.llm = OpenAI(temperature=0) + self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output') + + self.models = dict() + for class_name, device in load_dict.items(): + self.models[class_name] = globals()[class_name](device=device) + + self.tools = [] + for class_name, instance in self.models.items(): + for e in dir(instance): + if e.startswith('inference'): + func = getattr(instance, e) + self.tools.append(Tool(name=func.name, description=func.description, func=func)) + + self.agent = initialize_agent( + self.tools, + self.llm, + agent="conversational-react-description", + verbose=True, + memory=self.memory, + return_intermediate_steps=True, + agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, + 'suffix': VISUAL_CHATGPT_SUFFIX}, ) + + def run_text(self, text, state): + self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500) + res = self.agent({"input": text}) + res['output'] = res['output'].replace("\\", "/") + response = re.sub('(image/\S*png)', lambda m: 
f'![](/file={m.group(0)})*{m.group(0)}*', res['output']) + state = state + [(text, response)] + print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n" + f"Current Memory: {self.agent.memory.buffer}") + return state, state + + def run_image(self, image, state, txt): + image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") + print("======>Auto Resize Image...") + img = Image.open(image.name) + width, height = img.size + ratio = min(512 / width, 512 / height) + width_new, height_new = (round(width * ratio), round(height * ratio)) + width_new = int(np.round(width_new / 64.0)) * 64 + height_new = int(np.round(height_new / 64.0)) * 64 + img = img.resize((width_new, height_new)) + img = img.convert('RGB') + img.save(image_filename, "PNG") + print(f"Resize image form {width}x{height} to {width_new}x{height_new}") + description = self.models['ImageCaptioning'].inference(image_filename) + Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. " \ + "This information helps you to understand this image, " \ + "but you should use tools to finish following tasks, " \ + "rather than directly imagine from my description. If you understand, say \"Received\". \n".format( + image_filename, description) + AI_prompt = "Received. " + self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt + state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)] + print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n" + f"Current Memory: {self.agent.memory.buffer}") + return state, state, txt + ' ' + image_filename + ' ' + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--load', type=str, default="ImageCaptioning_cuda:0,Text2Image_cuda:0") + args = parser.parse_args() + load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')} + bot = ConversationBot(load_dict=load_dict) + with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo: + chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT") + state = gr.State([]) + with gr.Row(): + with gr.Column(scale=0.7): + txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style( + container=False) + with gr.Column(scale=0.15, min_width=0): + clear = gr.Button("Clear") + with gr.Column(scale=0.15, min_width=0): + btn = gr.UploadButton("Upload", file_types=["image"]) + + txt.submit(bot.run_text, [txt, state], [chatbot, state]) + txt.submit(lambda: "", None, txt) + btn.upload(bot.run_image, [btn, state, txt], [chatbot, state, txt]) + clear.click(bot.memory.clear) + clear.click(lambda: [], None, chatbot) + clear.click(lambda: [], None, state) + demo.launch(server_name="0.0.0.0", server_port=7868) diff --git a/visual_chatgpt_zh.py b/visual_chatgpt_zh.py new file mode 100644 index 0000000000000000000000000000000000000000..4be1c7ee4e69aeed5db1f2a6fb91f629375dda68 --- /dev/null +++ b/visual_chatgpt_zh.py @@ -0,0 +1,171 @@ +import os +import gradio as gr +import random +import torch +import cv2 +import re +import uuid +from PIL import Image +import numpy as np +import argparse + +from langchain.agents.initialize import initialize_agent +from langchain.agents.tools import Tool +from langchain.chains.conversation.memory import ConversationBufferMemory +from langchain.llms.openai import OpenAI + +from modules.image_captioning import ImageCaptioning +from modules.image_editing import ImageEditing +from 
modules.instruct_px2pix import InstructPix2Pix +from modules.mask_former import MaskFormer +from modules.text2img import Text2Image +from modules.visual_question_answering import VisualQuestionAnswering +from modules.controlnet_canny import Image2Canny,CannyText2Image +from modules.controlnet_depth import Image2Depth,DepthText2Image +from modules.controlnet_hed import Image2Hed,HedText2Image +from modules.controlnet_line import Image2Line,LineText2Image +from modules.controlnet_normal import Image2Normal,NormalText2Image +from modules.controlnet_pose import Image2Pose,PoseText2Image +from modules.controlnet_scibble import Image2Scribble,ScribbleText2Image +from modules.controlnet_seg import Image2Seg,SegText2Image + +from modules.utils import * + +import argparse + +# ChatGPT prompt prefix +VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. +Visual ChatGPT is able to process and understand large amounts of text and image. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated. +Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish following tasks, rather than directly imagine from the description. +Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. +TOOLS: +------ +Visual ChatGPT has access to the following tools:""" + +# Format instructions for steering ChatGPT's tool use +VISUAL_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format: +``` +Thought: Do I need to use a tool? Yes +Action: the action to take, should be one of [{tool_names}] +Action Input: the input to the action +Observation: the result of the action +``` +When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format: +``` +Thought: Do I need to use a tool? No +{ai_prefix}: [your response here] +``` +""" + +# ChatGPT prompt suffix +VISUAL_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if not exists. +You will remember to provide the image file name loyally if it's provided in the last tool observation. +Begin! 
+Previous conversation history: +{chat_history} +New input: {input} +Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to observe images rather than imagination. +The thoughts and observations are only visible for Visual ChatGPT, Visual ChatGPT should remember to repeat important information in the final response for Human. +Thought: Do I need to use a tool? {agent_scratchpad}""" + +os.makedirs('image', exist_ok=True) + + +class ConversationBot: + def __init__(self, load_dict, pretrained_model_dir): + # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...} + print(f"Initializing VisualChatGPT, load_dict={load_dict}") + if 'ImageCaptioning' not in load_dict: + raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT") + + self.llm = OpenAI(temperature=0) + self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output') + + self.models = dict() + for class_name, device in load_dict.items(): + self.models[class_name] = globals()[class_name](device=device, pretrained_model_dir=pretrained_model_dir) + + self.tools = [] + for class_name, instance in self.models.items(): + for e in dir(instance): + if e.startswith('inference'): + func = getattr(instance, e) + self.tools.append(Tool(name=func.name, description=func.description, func=func)) + + self.agent = initialize_agent( + self.tools, + self.llm, + agent="conversational-react-description", + verbose=True, + memory=self.memory, + return_intermediate_steps=True, + agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, + 'suffix': VISUAL_CHATGPT_SUFFIX}, ) + + def run_text(self, text, state): + self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500) + res = self.agent({"input": text}) + res['output'] = res['output'].replace("\\", "/") + response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output']) + state = state + [(text, response)] + print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n" + f"Current Memory: {self.agent.memory.buffer}") + return state, state + + def run_image(self, image, state, txt): + image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") + print("======>Auto Resize Image...") + img = Image.open(image.name) + width, height = img.size + ratio = min(512 / width, 512 / height) + width_new, height_new = (round(width * ratio), round(height * ratio)) + width_new = int(np.round(width_new / 64.0)) * 64 + height_new = int(np.round(height_new / 64.0)) * 64 + img = img.resize((width_new, height_new)) + img = img.convert('RGB') + img.save(image_filename, "PNG") + print(f"Resize image form {width}x{height} to {width_new}x{height_new}") + description = self.models['ImageCaptioning'].inference(image_filename) + Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. " \ + "This information helps you to understand this image, " \ + "but you should use tools to finish following tasks, " \ + "rather than directly imagine from my description. If you understand, say \"Received\". \n".format( + image_filename, description) + AI_prompt = "Received. 
" + self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt + state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)] + print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n" + f"Current Memory: {self.agent.memory.buffer}") + return state, state, txt + ' ' + image_filename + ' ' + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--load', type=str, default="ImageCaptioning_cuda:0,Text2Image_cuda:0") + parser.add_argument("--pretrained_model_dir", default="./hf_models_path", + type=str, help="huggingface下载好的模型路径") + args = parser.parse_args() + + pretrained_model_dir = args.pretrained_model_dir + + load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')} + bot = ConversationBot(load_dict=load_dict, pretrained_model_dir=pretrained_model_dir) + with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo: + chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT") + state = gr.State([]) + with gr.Row(): + with gr.Column(scale=0.7): + txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style( + container=False) + with gr.Column(scale=0.15, min_width=0): + clear = gr.Button("Clear") + with gr.Column(scale=0.15, min_width=0): + btn = gr.UploadButton("Upload", file_types=["image"]) + + txt.submit(bot.run_text, [txt, state], [chatbot, state]) + txt.submit(lambda: "", None, txt) + btn.upload(bot.run_image, [btn, state, txt], [chatbot, state, txt]) + clear.click(bot.memory.clear) + clear.click(lambda: [], None, chatbot) + clear.click(lambda: [], None, state) + demo.launch(server_name="0.0.0.0", server_port=7868) \ No newline at end of file diff --git a/visual_chatgpt_zh_vits.py b/visual_chatgpt_zh_vits.py new file mode 100644 index 0000000000000000000000000000000000000000..217ae0a9fa3e3618df27e4c8cbbc56fd4bc1905a --- /dev/null +++ b/visual_chatgpt_zh_vits.py @@ -0,0 +1,195 @@ +import os +import gradio as gr +import random +import torch +import cv2 +import re +import uuid +from PIL import Image +import numpy as np +import argparse + +from langchain.agents.initialize import initialize_agent +from langchain.agents.tools import Tool +from langchain.chains.conversation.memory import ConversationBufferMemory +from langchain.llms.openai import OpenAI + +from modules.image_captioning import ImageCaptioning +from modules.image_editing import ImageEditing +from modules.instruct_px2pix import InstructPix2Pix +from modules.mask_former import MaskFormer +from modules.text2img import Text2Image +from modules.visual_question_answering import VisualQuestionAnswering +from modules.controlnet_canny import Image2Canny,CannyText2Image +from modules.controlnet_depth import Image2Depth,DepthText2Image +from modules.controlnet_hed import Image2Hed,HedText2Image +from modules.controlnet_line import Image2Line,LineText2Image +from modules.controlnet_normal import Image2Normal,NormalText2Image +from modules.controlnet_pose import Image2Pose,PoseText2Image +from modules.controlnet_scibble import Image2Scribble,ScribbleText2Image +from modules.controlnet_seg import Image2Seg,SegText2Image + +from modules.utils import * +from vits_infer import generateSound +from winsound import PlaySound + +import argparse + +# os.environ["OPENAI_API_KEY"] = "" +use_vits = False + +# chatgpt前缀 +VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, 
from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. + +Visual ChatGPT is able to process and understand large amounts of text and image. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Visual ChatGPT is also known that the image may not be the same as user's demand, and will use other visual question answering tools or description tools to observe the real image. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated. + +Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish following tasks, rather than directly imagine from the description. + +Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. + + +TOOLS: +------ + +Visual ChatGPT has access to the following tools:""" + +# 调教chatgpt的instruction +VISUAL_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format: + +``` +Thought: Do I need to use a tool? Yes +Action: the action to take, should be one of [{tool_names}] +Action Input: the input to the action +Observation: the result of the action +``` + +When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the format: + +``` +Thought: Do I need to use a tool? No +{ai_prefix}: [your response here] +``` +""" + +# chatgpt后缀 +VISUAL_CHATGPT_SUFFIX = """You are very strict to the filename correctness and will never fake a file name if not exists. +You will remember to provide the image file name loyally if it's provided in the last tool observation. + +Begin! + +Previous conversation history: +{chat_history} + +New input: {input} +Since Visual ChatGPT is a text language model, Visual ChatGPT must use tools to observe images rather than imagination. +The thoughts and observations are only visible for Visual ChatGPT, Visual ChatGPT should remember to repeat important information in the final response for Human. +Thought: Do I need to use a tool? 
{agent_scratchpad}""" + +os.makedirs('image', exist_ok=True) + + + +class ConversationBot: + def __init__(self, load_dict, pretrained_model_dir): + # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...} + print(f"Initializing VisualChatGPT, load_dict={load_dict}") + if 'ImageCaptioning' not in load_dict: + raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT") + + self.llm = OpenAI(temperature=0) + self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output') + + self.models = dict() + for class_name, device in load_dict.items(): + self.models[class_name] = globals()[class_name](device=device, pretrained_model_dir=pretrained_model_dir) + + self.tools = [] + for class_name, instance in self.models.items(): + for e in dir(instance): + if e.startswith('inference'): + func = getattr(instance, e) + self.tools.append(Tool(name=func.name, description=func.description, func=func)) + + self.agent = initialize_agent( + self.tools, + self.llm, + agent="conversational-react-description", + verbose=True, + memory=self.memory, + return_intermediate_steps=True, + agent_kwargs={'prefix': VISUAL_CHATGPT_PREFIX, 'format_instructions': VISUAL_CHATGPT_FORMAT_INSTRUCTIONS, + 'suffix': VISUAL_CHATGPT_SUFFIX}, ) + + def run_text(self, text, state): + self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500) + res = self.agent({"input": text}) + res['output'] = res['output'].replace("\\", "/") + if use_vits: + is_exists = generateSound(res['output']) + if is_exists: + PlaySound(r'.\output.wav', flags=1) + else: + pass + response = re.sub('(image/\S*png)', lambda m: f'![](/file={m.group(0)})*{m.group(0)}*', res['output']) + state = state + [(text, response)] + print(f"\nProcessed run_text, Input text: {text}\nCurrent state: {state}\n" + f"Current Memory: {self.agent.memory.buffer}") + return state, state + + def run_image(self, image, state, txt): + image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png") + print("======>Auto Resize Image...") + img = Image.open(image.name) + width, height = img.size + ratio = min(512 / width, 512 / height) + width_new, height_new = (round(width * ratio), round(height * ratio)) + width_new = int(np.round(width_new / 64.0)) * 64 + height_new = int(np.round(height_new / 64.0)) * 64 + img = img.resize((width_new, height_new)) + img = img.convert('RGB') + img.save(image_filename, "PNG") + print(f"Resize image form {width}x{height} to {width_new}x{height_new}") + description = self.models['ImageCaptioning'].inference(image_filename) + Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. " \ + "This information helps you to understand this image, " \ + "but you should use tools to finish following tasks, " \ + "rather than directly imagine from my description. If you understand, say \"Received\". \n".format( + image_filename, description) + AI_prompt = "Received. 
" + self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt + state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)] + print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n" + f"Current Memory: {self.agent.memory.buffer}") + return state, state, txt + ' ' + image_filename + ' ' + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--load', type=str, default="ImageCaptioning_cuda:0,Text2Image_cuda:0") + parser.add_argument("--pretrained_model_dir", default="./hf_models", + type=str, help="huggingface下载好的模型路径") + args = parser.parse_args() + + pretrained_model_dir = args.pretrained_model_dir + + load_dict = {e.split('_')[0].strip(): e.split('_')[1].strip() for e in args.load.split(',')} + bot = ConversationBot(load_dict=load_dict, pretrained_model_dir=pretrained_model_dir) + with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo: + chatbot = gr.Chatbot(elem_id="chatbot", label="Visual ChatGPT") + state = gr.State([]) + with gr.Row(): + with gr.Column(scale=0.7): + txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter, or upload an image").style( + container=False) + with gr.Column(scale=0.15, min_width=0): + clear = gr.Button("Clear") + with gr.Column(scale=0.15, min_width=0): + btn = gr.UploadButton("Upload", file_types=["image"]) + + txt.submit(bot.run_text, [txt, state], [chatbot, state]) + txt.submit(lambda: "", None, txt) + btn.upload(bot.run_image, [btn, state, txt], [chatbot, state, txt]) + clear.click(bot.memory.clear) + clear.click(lambda: [], None, chatbot) + clear.click(lambda: [], None, state) + demo.launch(debug=True, server_port=7868) diff --git a/vits_infer.py b/vits_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..eb726776a00202b7439a68945849325a5b5d388b --- /dev/null +++ b/vits_infer.py @@ -0,0 +1,180 @@ +import torch +import gradio as gr +import json +import openai + +from scipy.io.wavfile import write +from mel_processing import spectrogram_torch +from text import text_to_sequence, _clean_text +from models import SynthesizerTrn +import utils_vits +import commons +import sys +import re +from torch import no_grad, LongTensor +import logging +from winsound import PlaySound + +from pygtrans import Translate, Null +import time + + +############################################################################ +def get_text(text, hps, cleaned=False): + if cleaned: + text_norm = text_to_sequence(text, hps.symbols, []) + else: + text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners) + if hps.data.add_blank: + text_norm = commons.intersperse(text_norm, 0) + text_norm = LongTensor(text_norm) + return text_norm + + +def ex_print(text, escape=False): + if escape: + print(text.encode('unicode_escape').decode()) + else: + print(text) + +def print_speakers(speakers, escape=False): + if len(speakers) > 100: + return + print('ID\tSpeaker') + for id, name in enumerate(speakers): + ex_print(str(id) + '\t' + name, escape) + +def get_speaker_id(message): + speaker_id = input(message) + try: + speaker_id = int(speaker_id) + except: + print(str(speaker_id) + ' is not a valid ID!') + sys.exit(1) + return speaker_id + +def get_label_value(text, label, default, warning_name='value'): + value = re.search(rf'\[{label}=(.+?)\]', text) + if value: + try: + text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1) + value = float(value.group(1)) + except: + print(f'Invalid {warning_name}!') + sys.exit(1) + 
else: + value = default + return value, text + +def get_label(text, label): + if f'[{label}]' in text: + return True, text.replace(f'[{label}]', '') + else: + return False, text + +def generateSound(inputString): + idmessage = """ID Speaker +0 maho +""" + speakerID = 0 + model = r".\vits_models\G.pth" + config = r".\vits_models\config.json" + # skip speech synthesis when the reply references an image file path + if "image" in inputString and "png" in inputString: + return False + hps_ms = utils_vits.get_hparams_from_file(config) + n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0 + n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0 + speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0'] + use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False + emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False + + net_g_ms = SynthesizerTrn( + n_symbols, + hps_ms.data.filter_length // 2 + 1, + hps_ms.train.segment_size // hps_ms.data.hop_length, + n_speakers=n_speakers, + emotion_embedding=emotion_embedding, + **hps_ms.model) + _ = net_g_ms.eval() + utils_vits.load_checkpoint(model, net_g_ms) + + def voice_conversion(): + audio_path = input('Path of an audio file to convert:\n') + print_speakers(speakers) + audio = utils_vits.load_audio_to_torch( + audio_path, hps_ms.data.sampling_rate) + + original_id = get_speaker_id('Original speaker ID: ') + target_id = get_speaker_id('Target speaker ID: ') + out_path = input('Path to save: ') + + y = audio.unsqueeze(0) + + spec = spectrogram_torch(y, hps_ms.data.filter_length, + hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length, + center=False) + spec_lengths = LongTensor([spec.size(-1)]) + sid_src = LongTensor([original_id]) + + with no_grad(): + sid_tgt = LongTensor([target_id]) + audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[ + 0][0, 0].data.cpu().float().numpy() + return audio, out_path + + if n_symbols != 0: + if not emotion_embedding: + #while True: + if(1==1): + #choice = input('TTS or VC? (t/v):') + choice = 't' + if choice == 't': + #text = input('Text to read: ') + text = inputString + client = Translate() + while True: + text_t = client.translate(text, target='ja') + if isinstance(text_t, Null): + print("Translation failure!") + time.sleep(2) + else: + print("Translation Success!") + text = text_t.translatedText + break + if text == '[ADVANCED]': + #text = input('Raw text:') + text = "I can't speak!" 
+ #print('Cleaned text is:') + #ex_print(_clean_text( + # text, hps_ms.data.text_cleaners), escape) + #continue + + length_scale, text = get_label_value( + text, 'LENGTH', 1.1, 'length scale') + noise_scale, text = get_label_value( + text, 'NOISE', 0.2, 'noise scale') + noise_scale_w, text = get_label_value( + text, 'NOISEW', 0.8, 'deviation of noise') + cleaned, text = get_label(text, 'CLEANED') + + stn_tst = get_text(text, hps_ms, cleaned=cleaned) + + #print_speakers(speakers, escape) + #speaker_id = get_speaker_id('Speaker ID: ') + speaker_id = speakerID + #out_path = input('Path to save: ') + out_path = "output.wav" + + with no_grad(): + x_tst = stn_tst.unsqueeze(0) + x_tst_lengths = LongTensor([stn_tst.size(0)]) + sid = LongTensor([speaker_id]) + audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, + noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy() + + elif choice == 'v': + audio, out_path = voice_conversion() + + write(out_path, hps_ms.data.sampling_rate, audio) + print('Successfully saved!') + return True diff --git a/vits_models/G.pth b/vits_models/G.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce143cc6a777b9c9a30837610f354dcc9cf83fc5 --- /dev/null +++ b/vits_models/G.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b09a29d01934cccecd809526a771deb5c5574955662f47886ab9c10f87edac +size 436363606 diff --git a/vits_models/config.json b/vits_models/config.json new file mode 100644 index 0000000000000000000000000000000000000000..8abe9a5d40ab32833eb9e303513bd5099b989571 --- /dev/null +++ b/vits_models/config.json @@ -0,0 +1 @@ +{"data":{"add_blank":true,"cleaned_text":true,"filter_length":1024,"hop_length":256,"max_wav_value":32768.0,"mel_fmax":null,"mel_fmin":0.0,"n_mel_channels":80,"n_speakers":0,"sampling_rate":22050,"text_cleaners":["japanese_cleaners"],"training_files":"/home/frank/project2/rl_study/ts_project/ts/filelists/maho.txt.cleaned","validation_files":"/home/frank/project2/rl_study/ts_project/ts/filelists/maho.txt.cleaned","win_length":1024},"model":{"filter_channels":768,"hidden_channels":192,"inter_channels":192,"kernel_size":3,"n_heads":2,"n_layers":6,"n_layers_q":3,"p_dropout":0.1,"resblock":"1","resblock_dilation_sizes":[[1,3,5],[1,3,5],[1,3,5]],"resblock_kernel_sizes":[3,7,11],"upsample_initial_channel":512,"upsample_kernel_sizes":[16,16,4,4],"upsample_rates":[8,8,2,2],"use_spectral_norm":false},"speakers":["maho"],"symbols":["_",",",".","!","?","-","A","E","I","N","O","Q","U","a","b","d","e","f","g","h","i","j","k","m","n","o","p","r","s","t","u","v","w","y","z","\u0283","\u02a7","\u2193","\u2191"," "],"train":{"batch_size":4,"betas":[0.8,0.99],"c_kl":1.0,"c_mel":45,"epochs":10000,"eps":1e-09,"eval_interval":200,"fp16_run":false,"init_lr_ratio":1,"learning_rate":0.0002,"log_interval":200,"lr_decay":0.999875,"seed":1234,"segment_size":8192,"warmup_epochs":0}} \ No newline at end of file diff --git a/vits_models/put_vits_models_hear b/vits_models/put_vits_models_hear new file mode 100644 index 0000000000000000000000000000000000000000..12fae23f5d7540583715d79f44deb044bda2e21c --- /dev/null +++ b/vits_models/put_vits_models_hear @@ -0,0 +1 @@ +model diff --git a/vits_modules.py b/vits_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..8db7d957a564bce0fcd077e54ac57e0bdb7a7d3c --- /dev/null +++ b/vits_modules.py @@ -0,0 +1,387 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +from 
torch.nn import Conv1d +from torch.nn.utils import weight_norm, remove_weight_norm + +import commons +from commons import init_weights, get_padding +from transforms import piecewise_rational_quadratic_transform + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." + + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential( + nn.ReLU(), + nn.Dropout(p_dropout)) + for _ in range(n_layers-1): + self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dilated and Depth-Separable Convolution + """ + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size ** i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, + groups=channels, dilation=dilation, padding=padding + )) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + self.hidden_channels =hidden_channels + self.kernel_size = kernel_size, + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = 
nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply( + x_in, + g_l, + n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:,:self.hidden_channels,:] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:,self.hidden_channels:,:] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]))) + ]) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, + padding=get_padding(kernel_size, 1))) + ]) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, 
self).__init__() + self.convs = nn.ModuleList([ + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]))), + weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]))) + ]) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels,1)) + self.logs = nn.Parameter(torch.zeros(channels,1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1,2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__(self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels]*2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1,2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + +class ConvFlow(nn.Module): + def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, 
p_dropout=0.) + self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels]*2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] + + unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_derivatives = h[..., 2 * self.num_bins:] + + x1, logabsdet = piecewise_rational_quadratic_transform(x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails='linear', + tail_bound=self.tail_bound + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1,2]) + if not reverse: + return x, logdet + else: + return x
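As a quick sanity check of the flow layers added in vits_modules.py above, the sketch below round-trips a ResidualCouplingLayer forward and then in reverse on random tensors. This is an illustrative snippet only, not part of the diff: it assumes the repository's commons and transforms helpers (imported at the top of vits_modules.py) are on the import path, and the channel and frame sizes are arbitrary.

```python
# Minimal round-trip sketch (assumptions: vits_modules.py plus the repo's
# `commons` and `transforms` modules are importable; shapes are illustrative).
import torch
from vits_modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=192, hidden_channels=192,
                              kernel_size=5, dilation_rate=1, n_layers=4)
layer.eval()

x = torch.randn(2, 192, 50)    # [batch, channels, frames]
x_mask = torch.ones(2, 1, 50)  # all-ones mask, i.e. no padding

with torch.no_grad():
    y, logdet = layer(x, x_mask)            # forward: returns (output, log-determinant)
    x_rec = layer(y, x_mask, reverse=True)  # reverse: inverts the coupling transform

print(torch.allclose(x, x_rec, atol=1e-5))  # expected: True, the coupling layer is invertible
```

Note that the layer's post projection is zero-initialized, so each coupling layer starts out as the identity map, a common choice for keeping normalizing flows stable early in training.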