Mahiruoshi committed on
Commit
d1f7ac3
·
1 Parent(s): 50ea4f2

Update tools/sentence.py

Browse files
Files changed (1) hide show
  1. tools/sentence.py +48 -1
tools/sentence.py CHANGED
@@ -1,4 +1,9 @@
1
- import re
 
 
 
 
 
2
 
3
  def is_japanese(string):
4
  for ch in string:
@@ -59,6 +64,48 @@ def split_mixed_language(sentence):
59
  sub_sentences = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
60
  return [s.strip() for s in sub_sentences if s.strip()]
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if __name__ == "__main__":
63
  text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
64
  print(extrac(text))
 
1
+ import re, os
2
+
3
+ from ebooklib import epub
4
+ import PyPDF2
5
+ from PyPDF2 import PdfReader
6
+ from bs4 import BeautifulSoup
7
 
8
  def is_japanese(string):
9
  for ch in string:
 
64
  sub_sentences = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
65
  return [s.strip() for s in sub_sentences if s.strip()]
66
 
67
def seconds_to_ass_time(seconds):
    """Convert a duration in seconds to an ASS timestamp (``H:MM:SS.cc``).

    Args:
        seconds: Non-negative duration in seconds; may be fractional.

    Returns:
        A string with at least one hour digit, two-digit minutes, two-digit
        seconds and two-digit centiseconds, e.g. ``"1:01:01.50"``.
    """
    # Do all arithmetic on whole centiseconds. The previous version replaced
    # `seconds` with its integer part before extracting the fraction, so the
    # fractional field was always "00". Rounding here also lets values such
    # as 59.999 carry cleanly into the next second/minute.
    total_centiseconds = int(round(seconds * 100))
    total_seconds, centiseconds = divmod(total_centiseconds, 100)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(
        hours, minutes, whole_seconds, centiseconds
    )
74
+
75
def extract_text_from_epub(file_path):
    """Return the text of every HTML document in an EPUB, joined by newlines.

    The book is loaded with ``epub.read_epub``; each entry of ``book.items``
    that is an ``epub.EpubHtml`` instance is parsed with BeautifulSoup's
    ``html.parser`` and reduced to its text via ``get_text()``.
    """
    book = epub.read_epub(file_path)
    # Other item types in the book are ignored; only EpubHtml entries are read.
    html_documents = (
        entry for entry in book.items if isinstance(entry, epub.EpubHtml)
    )
    return '\n'.join(
        BeautifulSoup(document.content, 'html.parser').get_text()
        for document in html_documents
    )
83
+
84
def extract_text_from_pdf(file_path):
    """Return the extracted text of every page in a PDF, joined by newlines.

    The file is opened in binary mode and read through ``PdfReader``; each
    page contributes the result of its ``extract_text()`` call.
    """
    page_texts = []
    with open(file_path, 'rb') as pdf_file:
        # Extract while the file is still open; the reader pulls from it.
        for page in PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text())
    return '\n'.join(page_texts)
89
+
90
def remove_annotations(text):
    """Strip bracketed annotations from ``text``.

    Removes, in order, anything enclosed in square brackets ``[...]``,
    angle brackets ``<...>`` and Chinese lenticular brackets ``【...】``,
    including the brackets themselves.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every bracketed span removed.
    """
    # Non-greedy matches keep each removal limited to one bracket pair.
    # `.` does not match newlines, so a span must open and close on one line.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>', '', text)
    # The previous pattern here was a pasted chat-export artifact that only
    # matched one exact literal; match any 【...】 span as intended.
    text = re.sub(r'【.*?】', '', text)
    return text
96
+
97
def extract_text_from_file(inputFile):
    """Read the text content of an ``.epub``, ``.pdf`` or ``.txt`` file.

    The format is chosen from the lower-cased file extension. EPUB and PDF
    files are delegated to ``extract_text_from_epub`` and
    ``extract_text_from_pdf``; ``.txt`` files are read as UTF-8.

    Raises:
        ValueError: If the extension is not one of the three supported ones.
    """
    _, raw_extension = os.path.splitext(inputFile)
    file_extension = raw_extension.lower()

    if file_extension == ".epub":
        return extract_text_from_epub(inputFile)
    if file_extension == ".pdf":
        return extract_text_from_pdf(inputFile)
    if file_extension != ".txt":
        raise ValueError(f"Unsupported file format: {file_extension}")

    with open(inputFile, 'r', encoding='utf-8') as handle:
        return handle.read()
108
+
109
  if __name__ == "__main__":
110
  text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
111
  print(extrac(text))