Update app.py
app.py CHANGED
@@ -87,17 +87,17 @@ def Wordchunker(word):
    return chunks

def BatchWordChunk(sentence):
+    words = sentence.split(" ")
+    FinalOutput = ""
+    Currentchunks = ""
    ChunksasString = ""
+    for word in words:
+        ChunksasString = ""
+        Currentchunks = Wordchunker(word)
+        for chunk in Currentchunks:
+            ChunksasString += chunk + " "
+        FinalOutput += "\n" + ChunksasString
+    return FinalOutput

# Translate from English to French

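Wordchunker itself is defined above line 87 and is not touched by this diff, so the exact chunking rule is not visible here. Assuming it returns a list of sub-strings for one word, the new BatchWordChunk can be exercised like this (the sample input and the shape of the printed output are illustrative only):

# Quick check of BatchWordChunk as added above; the chunk boundaries shown in
# the comment are a guess, since Wordchunker's rule is not part of this diff.
sample = "hello world"
print(BatchWordChunk(sample))
# Expected shape: one line per input word, that word's chunks joined by spaces, e.g.
# he llo
# wor ld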
@@ -106,22 +106,22 @@ langdest = gr.Dropdown(choices=["af", "de", "es", "ko", "ja", "zh-cn"], label="C
ChunkModeDrop = gr.Dropdown(choices=["Chunks", "Reverse", "Three Word Chunks", "Spelling Chunks"], label="Choose Chunk Type", value="Chunks")

def FrontRevSentChunk (Chunkmode, Translate, Text, langdest):
+    FinalOutput = ""
+    TransFinalOutput = ""
+    if Chunkmode=="Chunks":
+        FinalOutput += Sentencechunker(Text)
+    if Chunkmode=="Reverse":
+        FinalOutput += ReverseSentenceChunker(Text)
+    if Chunkmode=="Three Word Chunks":
+        FinalOutput += three_words_chunk(Text)
+    if Chunkmode=="Spelling Chunks":
+        FinalOutput += BatchWordChunk(Text)
+
+    if Translate:
+        TransFinalOutput = FinalOutput
+        translated = translator.translate(TransFinalOutput, dest=langdest)
+        FinalOutput += "\n" + translated.text
+    return FinalOutput

# Define a function to filter out non-verb, noun, or adjective words
def filter_words(words):
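The ChunkModeDrop and langdest dropdowns suggest FrontRevSentChunk is exposed as a Gradio interface elsewhere in app.py; that wiring is not part of this diff. A plausible sketch of the hookup, in which the Checkbox and Textbox components are assumptions rather than code from the file:

# Sketch only: the real interface definition lives elsewhere in app.py.
import gradio as gr

FrontRevSentChunkInterface = gr.Interface(
    fn=FrontRevSentChunk,
    inputs=[ChunkModeDrop,
            gr.Checkbox(label="Translate"),
            gr.Textbox(label="Text"),
            langdest],
    outputs="text",
    description="Chunk a sentence and optionally translate the result")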
@@ -137,13 +137,13 @@ def filter_words(words):
    return filtered_words

def SepHypandSynExpansion(text):
+    # Tokenize the text
+    tokens = nltk.word_tokenize(text)
+    NoHits = ""
+    FinalOutput = ""
+
+    # Find synonyms and hypernyms of each word in the text
+    for token in tokens:
        synonyms = []
        hypernyms = []
        for synset in wordnet.synsets(token):
@@ -153,15 +153,15 @@ def SepHypandSynExpansion(text):
            NoHits += f"{token} | "
        else:
            FinalOutput += "\n" f"{token}: hypernyms={hypernyms}, synonyms={synonyms} \n"
+    NoHits = set(NoHits.split(" | "))
+    NoHits = filter_words(NoHits)
+    NoHits = "Words to pay special attention to: \n" + str(NoHits)
+    return NoHits, FinalOutput


def WikiSearch(term):
    termtoks = term.split(" ")
+
    for item in termtoks:
        # Search for the term on Wikipedia and get the first result
        result = wikipedia.search(item, results=20)
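Lines 150-152 of SepHypandSynExpansion are unchanged and therefore hidden by the diff; that is where the synonyms and hypernyms lists get filled for each synset. A hedged sketch of what that middle section typically looks like with NLTK's WordNet API, using "bank" as a throwaway example (the exact statements in app.py may differ):

# Assumption about the hidden per-synset collection step, written against
# NLTK's WordNet API; also shows the downloads the function depends on.
import nltk
from nltk.corpus import wordnet

nltk.download("punkt")    # tokenizer data used by nltk.word_tokenize
nltk.download("wordnet")  # corpus data used by wordnet.synsets

for synset in wordnet.synsets("bank"):
    synonyms = [lemma.name() for lemma in synset.lemmas()]
    hypernyms = [hyper.name() for hyper in synset.hypernyms()]
    print(synset.name(), synonyms, hypernyms)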
@@ -180,13 +180,13 @@ def create_dictionary(word_list, word_dict = {}):
def merge_lines(roman_file, w4w_file, full_mean_file, macaronic_file):
    files = [roman_file, w4w_file, full_mean_file, macaronic_file]
    merged_lines = []
+
    with open(roman_file.name, "r") as f1, open(w4w_file.name, "r") as f2, \
        open(full_mean_file.name, "r") as f3, open(macaronic_file.name, "r") as f4:
        for lines in zip(f1, f2, f3, f4):
            merged_line = "\n".join(line.strip() for line in lines)
            merged_lines.append(merged_line)
+
    return "\n".join(merged_lines)

TTSLangOptions = gr.Dropdown(choices=["en", "ja", "ko", "zh-cn"], value="en", label="choose the language of the srt")
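merge_lines interleaves the four uploads line by line, and the .name access means it expects the file objects Gradio passes in rather than plain path strings. A small, self-contained illustration of that interleaving, using temporary files and a hypothetical stand-in for the upload object:

# Illustration only: Upload is a made-up stand-in for a Gradio file object.
import tempfile, os

paths = []
for text in ("a1\na2", "b1\nb2", "c1\nc2", "d1\nd2"):
    tf = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
    tf.write(text)
    tf.close()
    paths.append(tf.name)

class Upload:
    # minimal stand-in exposing the .name attribute merge_lines reads
    def __init__(self, name):
        self.name = name

print(merge_lines(*[Upload(p) for p in paths]))
# prints a1, b1, c1, d1, then a2, b2, c2, d2, one value per line
for p in paths:
    os.remove(p)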
@@ -340,26 +340,26 @@ def find_string_positions(s, string):
    return positions

def splittext(string):
+    string_no_formaterror = string.replace(" -- > ", " --> ")
+    split_positions = find_string_positions(string_no_formaterror, " --> ")
+    split_strings = []
+    prepos = 0
+    for pos in split_positions:
+        pos -= 12
+        split_strings.append((string[prepos:pos])) #, string[pos:]))
+        prepos = pos
+
+    FinalOutput = ""
+    stoutput = ""
+    linenumber = 1
+    #print(linenumber)
+    for item in split_strings[1:]:
+        stoutput = item[0:29] + "\n" + item[30:]
+        stspaces = find_string_positions(stoutput, " ")
+        FinalOutput += str(linenumber) + "\n" + stoutput[:stspaces[-2]] + "\n"
+        FinalOutput += "\n"
+        linenumber += 1
+    return FinalOutput[2:]

def VideotoSegment(video_file, subtitle_file):
    # Read the subtitle file and extract the timings for each subtitle
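splittext appears to rebuild SRT-style cue numbering: pos -= 12 steps back over the 12-character start timestamp so each split begins at a cue's timing line. It leans on find_string_positions, of which only the closing return positions line is visible in this hunk. A hedged sketch of what that helper presumably does, so the code above can be read standalone:

# Assumed implementation of find_string_positions (its body is not in this
# diff): collect every index at which `string` occurs in `s`.
def find_string_positions(s, string):
    positions = []
    start = s.find(string)
    while start != -1:
        positions.append(start)
        start = s.find(string, start + len(string))
    return positions

print(find_string_positions("00:00:01,000 --> 00:00:04,000", " --> "))  # [12]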