""" This module allows to extract texts from videos using OCR """ import easyocr import os import cv2 import shutil import difflib import re from tools.video_tools import generate_frames CONF_THRESH = 0.9 SIMILARITY_THRESH = 0.8 def process_text(text): result = re.sub(r"[\n\"\[\]~;]", "", text) lst = result.split() s = "" for item in lst: item = item.strip() if len(item)!=1 or item == "a" or item == "I" or item == "i" or item == "A": s += " "+item if len(s)<6: s = "" return s def get_formated_text(texts_arr): res = "" for row in texts_arr: k = process_text(row.lower()) if len(k) > 0: res += process_text(row.lower()) + ", " return res[:-2] def add_text(text_lst, text): for t in text_lst: similarity = difflib.SequenceMatcher(None, t, text).ratio() if similarity > SIMILARITY_THRESH: return text_lst.append(text) def retrieve_text(video_path, rate = 5, frames_path = "tmp_frames", show_print = True): texts_lst = [] generate_frames(video_path, frames_path, rate = rate, show_print = show_print) ocr = easyocr.Reader(['en']) for i in os.listdir(frames_path): text = ocr.readtext(frames_path + "/" + i) for txt in text: # Threshold for confidence if txt[2] > CONF_THRESH: # Filter similar texts add_text(texts_lst, txt[1]) # Delete temporary directory shutil.rmtree(frames_path) return texts_lst def retrieve_to_file(dest, video_path): text_lst = retrieve_text(video_path, rate = 2, show_print = False) file = open(dest, "w") file.writelines([line + "\n" for line in text_lst]) file.close() def retrieve_to_files(dest, video_path): for file_name in os.listdir(video_path): retrieve_to_file(dest + "/" + os.path.splitext(file_name)[0] + "_text.txt", video_path + "/" + file_name)