Spaces:

abnerh
/

video-to-subs

Paused

File size: 947 Bytes

0cc2cbd

import os, re, string
import subprocess
from textblob_de import TextBlobDE as TextBlob


def clean_english(text):
    """Tidy English text: collapse space runs and capitalise the pronoun "i".

    A lowercase "i" is upper-cased only when it stands as its own word,
    i.e. it starts at a word boundary and is followed by whitespace, an
    apostrophe (contractions such as "i'm"), sentence punctuation that
    itself ends the word, or the end of the string.  An "i" inside a
    word ("taxi's") or in an abbreviation such as "i.e." is left alone.

    Args:
        text: The English text to clean.

    Returns:
        The cleaned text.
    """
    # Collapse a run of any length; replacing fixed pairs would leave
    # two spaces behind for a run of three.
    clean_text = re.sub(r' {2,}', ' ', text)

    # The lookahead keeps whatever follows the pronoun intact, so line
    # breaks after "i" are preserved instead of being turned into spaces.
    clean_text = re.sub(
        r"\bi(?=\s|'|[,.;:!?](?:\s|$)|$)", 'I', clean_text
    )

    return clean_text

def clean_german(text):
    """Strip ASCII punctuation from German text and capitalise its nouns.

    The text is tagged with ``TextBlob`` (``textblob_de.TextBlobDE``);
    every token tagged ``'NN'`` that is longer than one character is
    replaced, wherever it occurs as a whole word, by its capitalised
    form.

    Args:
        text: The German text to clean.

    Returns:
        The text without ``string.punctuation`` characters and with the
        detected nouns capitalised.
    """
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Part-of-speech tag the text; ``tags`` yields (token, tag) pairs.
    blob = TextBlob(text)

    # Map each detected noun to its capitalised spelling.  Keys are
    # plain ``str`` so they compare equal to the regex match text.
    nouns = {
        str(token): str(token).capitalize()
        for token, tag in blob.tags
        if tag == 'NN' and len(token) > 1
    }
    if not nouns:
        return text

    # Escape the keys when building the pattern (not when looking them
    # up) and anchor on word boundaries so a noun is never rewritten in
    # the middle of a longer word.
    alternatives = '|'.join(re.escape(noun) for noun in nouns)
    pattern = re.compile(r'\b(?:' + alternatives + r')\b')

    return pattern.sub(lambda m: nouns[m.group(0)], text)


def clean_spanish(text):
    """Strip punctuation from Spanish text and collapse runs of spaces.

    Besides the ASCII characters in ``string.punctuation``, the Spanish
    inverted question and exclamation marks are removed, since
    ``string.punctuation`` does not include them.

    Args:
        text: The Spanish text to clean.

    Returns:
        The text without punctuation and without consecutive spaces.
    """
    # string.punctuation is ASCII-only; add the inverted marks explicitly.
    strip_table = str.maketrans('', '', string.punctuation + '¿¡')
    clean_text = text.translate(strip_table)

    # Removing punctuation can leave gaps of any width ("a - b"), so
    # collapse whole runs rather than fixed pairs of spaces.
    clean_text = re.sub(r' {2,}', ' ', clean_text)

    return clean_text