Spaces:

Mahiruoshi
/

BangDream-Bert-VITS2

Running

App Files Files Community

Mahiruoshi committed on Nov 18, 2023

Commit

d1f7ac3

1 Parent(s): 50ea4f2

Update tools/sentence.py

Browse files

Files changed (1) hide show

tools/sentence.py +48 -1

tools/sentence.py CHANGED Viewed

@@ -1,4 +1,9 @@
-import re
 def is_japanese(string):
         for ch in string:
@@ -59,6 +64,48 @@ def split_mixed_language(sentence):
     sub_sentences = re.split(r'(?<=[。！？\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
     return [s.strip() for s in sub_sentences if s.strip()]
 if __name__ == "__main__":
     text = "你好，这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好！今天我们要介绍VITS项目，其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
     print(extrac(text))

+import re, os
+from ebooklib import epub
+import PyPDF2
+from PyPDF2 import PdfReader
+from bs4 import BeautifulSoup
 def is_japanese(string):
         for ch in string:
     sub_sentences = re.split(r'(?<=[。！？\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
     return [s.strip() for s in sub_sentences if s.strip()]
def seconds_to_ass_time(seconds):
    """Convert a duration in seconds to an ASS subtitle timestamp.

    Args:
        seconds: Duration in seconds (int or float). Negative values are
            clamped to zero.

    Returns:
        A string formatted as ``H:MM:SS.cc``, where ``cc`` is centiseconds.
    """
    # Convert to whole centiseconds up front so the fractional part is not
    # lost when the value is split into fields, and so that rounding
    # (e.g. 59.999 s) carries cleanly into the seconds/minutes/hours fields.
    total_centis = max(0, int(round(seconds * 100)))
    total_secs, centis = divmod(total_centis, 100)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, mins, secs, centis)
def extract_text_from_epub(file_path):
    """Return the plain text of every HTML item in an EPUB file.

    Args:
        file_path: Path to the ``.epub`` file, passed to ``epub.read_epub``.

    Returns:
        The text of each ``epub.EpubHtml`` item, in the order the items
        appear in ``book.items``, joined with newlines.
    """
    document = epub.read_epub(file_path)
    # Only HTML items are parsed; every other item type in the book is skipped.
    html_entries = (
        entry for entry in document.items if isinstance(entry, epub.EpubHtml)
    )
    return '\n'.join(
        BeautifulSoup(entry.content, 'html.parser').get_text()
        for entry in html_entries
    )
def extract_text_from_pdf(file_path):
    """Return the extracted text of every page in a PDF file.

    Args:
        file_path: Path to the ``.pdf`` file; it is opened in binary mode.

    Returns:
        The result of ``extract_text()`` for each page, in page order,
        joined with newlines.
    """
    page_texts = []
    with open(file_path, 'rb') as pdf_stream:
        # Pages are read while the file handle is still open.
        for page in PdfReader(pdf_stream).pages:
            page_texts.append(page.extract_text())
    return '\n'.join(page_texts)
def remove_annotations(text):
    """Strip bracketed annotations from ``text``.

    Removes, non-greedily and within a single line, anything enclosed in
    square brackets ``[...]``, angle brackets ``<...>``, and Chinese square
    brackets ``【...】``.

    Args:
        text: The input string.

    Returns:
        ``text`` with every bracketed span (brackets included) removed.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Match any 【...】 span. The optional prefix/suffix also swallows the
    # "&#8203;``" / "``&#8203;" wrapper that surrounds pasted citation
    # markers, so those markers are removed whole rather than leaving
    # fragments behind.
    text = re.sub(r'(?:&#8203;``)?【.*?】(?:``&#8203;)?', '', text)
    return text
def extract_text_from_file(inputFile):
    """Read the text content of a file, choosing a reader by its extension.

    The extension is compared case-insensitively.

    Args:
        inputFile: Path to a ``.epub``, ``.pdf`` or ``.txt`` file.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    _, suffix = os.path.splitext(inputFile)
    suffix = suffix.lower()

    if suffix == ".txt":
        # Plain-text files are read as UTF-8.
        with open(inputFile, 'r', encoding='utf-8') as handle:
            return handle.read()
    if suffix == ".epub":
        return extract_text_from_epub(inputFile)
    if suffix == ".pdf":
        return extract_text_from_pdf(inputFile)

    raise ValueError(f"Unsupported file format: {suffix}")
 if __name__ == "__main__":
     text = "你好，这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好！今天我们要介绍VITS项目，其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
     print(extrac(text))