Spaces:
Running
Running
Mahiruoshi
committed on
Commit
·
d1f7ac3
1
Parent(s):
50ea4f2
Update tools/sentence.py
Browse files- tools/sentence.py +48 -1
tools/sentence.py
CHANGED
@@ -1,4 +1,9 @@
|
|
1 |
-
import re
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
def is_japanese(string):
|
4 |
for ch in string:
|
@@ -59,6 +64,48 @@ def split_mixed_language(sentence):
|
|
59 |
sub_sentences = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
|
60 |
return [s.strip() for s in sub_sentences if s.strip()]
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
if __name__ == "__main__":
    # Manual smoke test: feed a mixed Chinese / Japanese / English sample
    # through `extrac` (defined elsewhere in this file) and print the result.
    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    tagged = extrac(text)
    print(tagged)
|
|
|
1 |
+
import re, os
|
2 |
+
|
3 |
+
from ebooklib import epub
|
4 |
+
import PyPDF2
|
5 |
+
from PyPDF2 import PdfReader
|
6 |
+
from bs4 import BeautifulSoup
|
7 |
|
8 |
def is_japanese(string):
|
9 |
for ch in string:
|
|
|
64 |
sub_sentences = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
|
65 |
return [s.strip() for s in sub_sentences if s.strip()]
|
66 |
|
67 |
+
def seconds_to_ass_time(seconds):
    """Convert a duration in seconds to an ASS subtitle timestamp.

    The result has the form ``H:MM:SS.cc`` where ``cc`` is centiseconds.

    Args:
        seconds: Duration in seconds (int or float). Negative values are
            clamped to zero, since an ASS timestamp cannot be negative.

    Returns:
        The formatted timestamp string, e.g. ``"1:01:01.50"`` for 3661.5.
    """
    # Work from a single integer centisecond total. The previous version
    # overwrote `seconds` with its whole-second part before extracting the
    # fraction, so the fractional field was always "00". Rounding here (rather
    # than truncating) avoids float artifacts such as 0.29 * 100 == 28.99...,
    # and divmod makes a rounded-up fraction carry into seconds/minutes/hours.
    total_centis = max(0, round(seconds * 100))
    total_secs, centis = divmod(total_centis, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:01d}:{minutes:02d}:{secs:02d}.{centis:02d}"
|
74 |
+
|
75 |
+
def extract_text_from_epub(file_path):
    """Return the plain text of every HTML document in an EPUB file.

    The book is loaded with ``epub.read_epub``; each item that is an
    ``epub.EpubHtml`` instance is parsed with BeautifulSoup's ``html.parser``
    and reduced to its text. The per-document texts are joined with newlines.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        A single string containing the text of all HTML items, in book order.
    """
    book = epub.read_epub(file_path)
    html_docs = (entry for entry in book.items if isinstance(entry, epub.EpubHtml))
    texts = [
        BeautifulSoup(doc.content, 'html.parser').get_text()
        for doc in html_docs
    ]
    return '\n'.join(texts)
|
83 |
+
|
84 |
+
def extract_text_from_pdf(file_path):
    """Return the extracted text of every page in a PDF file.

    The file is opened in binary mode and read through ``PdfReader``; the
    text of each page is joined with newlines.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string with one newline-separated chunk per page.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf_pages = PdfReader(pdf_stream).pages
        page_texts = []
        for pdf_page in pdf_pages:
            page_texts.append(pdf_page.extract_text())
        return '\n'.join(page_texts)
|
89 |
+
|
90 |
+
def remove_annotations(text):
    """Strip bracketed annotations from *text*.

    Removes, in this order, anything enclosed in square brackets ``[...]``,
    angle brackets ``<...>``, and Chinese square brackets ``【...】``. Matching
    is non-greedy and does not span line breaks.

    Args:
        text: The input string.

    Returns:
        The string with all bracketed spans (brackets included) removed.
    """
    # The substitutions are applied sequentially on purpose: merging them into
    # one alternation would change results for overlapping bracket kinds.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>', '', text)
    # This pattern had been corrupted into a literal citation placeholder and
    # never matched real 【...】 annotations; restored to the intended regex.
    text = re.sub(r'【.*?】', '', text)
    return text
|
96 |
+
|
97 |
+
def extract_text_from_file(inputFile):
    """Read the text content of a file, choosing a reader by its extension.

    The extension is compared case-insensitively. ``.epub`` is delegated to
    ``extract_text_from_epub``, ``.pdf`` to ``extract_text_from_pdf``, and
    ``.txt`` is read directly as UTF-8.

    Args:
        inputFile: Path of the file to read.

    Returns:
        The file's text content as a string.

    Raises:
        ValueError: If the extension is not ``.epub``, ``.pdf`` or ``.txt``.
    """
    _, suffix = os.path.splitext(inputFile)
    suffix = suffix.lower()

    if suffix == ".txt":
        with open(inputFile, 'r', encoding='utf-8') as handle:
            return handle.read()
    if suffix == ".epub":
        return extract_text_from_epub(inputFile)
    if suffix == ".pdf":
        return extract_text_from_pdf(inputFile)

    raise ValueError(f"Unsupported file format: {suffix}")
|
108 |
+
|
109 |
if __name__ == "__main__":
    # Manual smoke test: feed a mixed Chinese / Japanese / English sample
    # through `extrac` (defined elsewhere in this file) and print the result.
    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    tagged = extrac(text)
    print(tagged)
|