Spaces:

aiface
/

doc_moi_tieng_Viet

Running

App Files Files Community

aiface committed on Mar 4, 2023

Commit

327b68f

•

1 Parent(s): 907b7f3

Upload 12 files

Browse files

Files changed (13) hide show

.gitattributes +1 -0
preprocessing/20words_mean_face.npy +3 -0
preprocessing/30word - Copy.csv +30 -0
preprocessing/30word.csv +30 -0
preprocessing/README.md +23 -0
preprocessing/anhtrasn.json +30 -0
preprocessing/crop_mouth_from_video.py +283 -0
preprocessing/extract_audio_from_video.py +56 -0
preprocessing/shape_predictor_68_face_landmarks.dat +3 -0
preprocessing/transform.py +61 -0
preprocessing/utils.py +149 -0
preprocessing/vietnamese_detected_face_30_words.csv +0 -0
preprocessing/vietnamese_detected_face_30_words_have_snr.csv +0 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+preprocessing/shape_predictor_68_face_landmarks.dat filter=lfs diff=lfs merge=lfs -text

preprocessing/20words_mean_face.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbf68b2044171e1160716df7c53e8bbfaa0ee8c61fb41171d04cb6092bb81422
+size 1168

preprocessing/30word - Copy.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+thông,1916,,,,thông,tin,của,và,các,có,trong,là,ngày,đã,đầu,theo,công,tư,quý
+tin,1740,,,,1916,1740,1687,1640,1566,1513,1512,1344,1330,1284,1202,1197,1165,1148,1119
+của,1687,,,,những,thành,cho,vị,tế,về,phố,tháng,động,sản,với,được,chính,số,dõi
+và,1640,,,,1076,1065,1050,971,937,925,919,903,902,883,873,869,797,760,750
+các,1566,,,,,,,,,,,,,,,,,,
+có,1513,,,,,,,,,,,,,,,,,,
+trong,1512,,,,,,,,,,,,,,,,,,
+là,1344,,,,,,,,,,,,,,,,,,
+ngày,1330,,,,,,,,,,,,,,,,,,
+đã,1284,,,,,,,,,,,,,,,,,,
+đầu,1202,,,,,,,,,,,,,,,,,,
+theo,1197,,,,,,,,,,,,,,,,,,
+công,1165,,,,,,,,,,,,,,,,,,
+tư,1148,,,,,,,,,,,,,,,,,,
+quý,1119,,,,,,,,,,,,,,,,,,
+những,1076,,,,,,,,,,,,,,,,,,
+thành,1065,,,,,,,,,,,,,,,,,,
+cho,1050,,,,,,,,,,,,,,,,,,
+vị,971,,,,,,,,,,,,,,,,,,
+tế,937,,,,,,,,,,,,,,,,,,
+về,925,,,,,,,,,,,,,,,,,,
+phố,919,,,,,,,,,,,,,,,,,,
+tháng,903,,,,,,,,,,,,,,,,,,
+động,902,,,,,,,,,,,,,,,,,,
+sản,883,,,,,,,,,,,,,,,,,,
+với,873,,,,,,,,,,,,,,,,,,
+được,869,,,,,,,,,,,,,,,,,,
+chính,797,,,,,,,,,,,,,,,,,,
+số,760,,,,,,,,,,,,,,,,,,
+dõi,750,,,,,,,,,,,,,,,,,,

preprocessing/30word.csv ADDED Viewed

	@@ -0,0 +1,30 @@

+thông,1916
+tin,1740
+của,1687
+và,1640
+các,1566
+có,1513
+trong,1512
+là,1344
+ngày,1330
+đã,1284
+đầu,1202
+theo,1197
+công,1165
+tư,1148
+quý,1119
+những,1076
+thành,1065
+cho,1050
+vị,971
+tế,937
+về,925
+phố,919
+tháng,903
+động,902
+sản,883
+với,873
+được,869
+chính,797
+số,760
+dõi,750

preprocessing/README.md ADDED Viewed

	@@ -0,0 +1,23 @@

+### Pre-processing
+* To get mouth ROIs
+Run the mouth-cropping script to save grayscale mouth ROIs. We assume you save the cropped mouths to *`$TCN_LIPREADING_ROOT/datasets/visual_data/`*. Add `--testset-only` to process only the testing set.
+```Shell
+python crop_mouth_from_video.py --video-direc <LRW-DIREC> \
+                                --landmark-direc <LANDMARK-DIREC> \
+                                --save-direc <MOUTH-ROIS-DIRECTORY> \
+                                --convert-gray \
+                                --testset-only
+```
+* To get audio waveforms
+Run the format-conversion script to extract audio waveforms (.npz) from the raw videos. We assume you save the audio waveforms to *`$TCN_LIPREADING_ROOT/datasets/audio_data/`*. Add `--testset-only` to process only the testing set.
+```Shell
+python extract_audio_from_video.py --video-direc <LRW-DIREC> \
+                                   --save-direc <AUDIO-WAVEFORMS-DIRECTORY> \
+                                   --testset-only
+```

preprocessing/anhtrasn.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+    "transcript": "Xin kính chào quý vị Kính mời quý vị",
+    "words": [
+        {
+            "end_time": 7.6,
+            "start_time": 0.0,
+            "word": "Xin"
+        },
+        {
+            "end_time": 7.8,
+            "start_time": 7.6,
+            "word": "kính"
+        },
+        {
+            "end_time": 8.0,
+            "start_time": 7.8,
+            "word": "chào"
+        },
+        {
+            "end_time": 8.0,
+            "start_time": 8.0,
+            "word": "quý"
+        },
+        {
+            "end_time": 8.2,
+            "start_time": 8.0,
+            "word": "vị"
+        }
+    ]
+}

preprocessing/crop_mouth_from_video.py ADDED Viewed

	@@ -0,0 +1,283 @@

+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2020 Imperial College London (Pingchuan Ma)
+# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+""" Crop Mouth ROIs from videos for lipreading"""
+# from msilib.schema import File
+from ast import Pass
+import os
+import cv2  # OpenCV 라이브러리
+import glob  # 리눅스식 경로 표기법을 사용하여 원하는 폴더/파일 리스트 얻음
+import argparse  # 명령행 인자를 파싱해주는 모듈
+import numpy as np
+from collections import deque  # collections 모듈에 있는 데크 불러오기 # 데크: 스택과 큐를 합친 자료구조
+from utils import *  # utils.py 모듈에 있는 모든 함수 불러오기
+from transform import *  # transform.py 모듈에 있는 모든 함수 불러오기
+import dlib  # face landmark 찾는 라이브러리
+import face_alignment  # face landmark 찾는 라이브러리
+from PIL import Image
+# 인자값을 받아서 처리하는 함수
+def load_args(default_config=None):
+    # 인자값을 받아서 처리하는 함수
+    parser = argparse.ArgumentParser(description='Lipreading Pre-processing')
+    # 입력받을 인자값 등록
+    # -- utils
+    parser.add_argument('--video-direc', default=None, help='raw video directory')
+    parser.add_argument('--video-format', default='.mp4', help='raw video format')
+    parser.add_argument('--landmark-direc', default=None, help='landmark directory')
+    parser.add_argument('--filename-path', default='./vietnamese_detected_face_30.csv', help='list of detected video and its subject ID')
+    parser.add_argument('--save-direc', default=None, help='the directory of saving mouth ROIs')
+    # -- mean face utils
+    parser.add_argument('--mean-face', default='./20words_mean_face.npy', help='mean face pathname')
+    # -- mouthROIs utils
+    parser.add_argument('--crop-width', default=96, type=int, help='the width of mouth ROIs')
+    parser.add_argument('--crop-height', default=96, type=int, help='the height of mouth ROIs')
+    parser.add_argument('--start-idx', default=48, type=int, help='the start of landmark index')
+    parser.add_argument('--stop-idx', default=68, type=int, help='the end of landmark index')
+    parser.add_argument('--window-margin', default=12, type=int, help='window margin for smoothed_landmarks')
+    # -- convert to gray scale
+    parser.add_argument('--convert-gray', default=False, action='store_true', help='convert2grayscale')
+    # -- test set only
+    parser.add_argument('--testset-only', default=False, action='store_true', help='process testing set only')
+    # 입력받은 인자값을 args에 저장 (type: namespace)
+    args = parser.parse_args()
+    return args
+args = load_args()  # parse the command line once at import time; read as a global by the code below
+# -- mean face utils
+STD_SIZE = (256, 256)  # passed to warp_img/apply_transform as the output shape of the aligned frame
+mean_face_landmarks = np.load(args.mean_face)  # reference landmarks loaded from --mean-face (default ./20words_mean_face.npy)
+stablePntsIDs = [33, 36, 39, 42, 45]  # landmark indices used to estimate the alignment transform in crop_patch
+# 영상에서 랜드마크 받아서 입술 잘라내기
+def crop_patch( video_pathname, landmarks):
+    """Crop mouth patch
+    :param str video_pathname: pathname for the video_dieo  # 영상 위치
+    :param list landmarks: interpolated landmarks  # 보간된 랜드마크
+    """
+    frame_idx = 0  # 프레임 인덱스 번호 0 으로 초기화
+    frame_gen = read_video(video_pathname)  # 비디오 불러오기
+    # 무한 반복
+    while True:
+        try:
+            frame = frame_gen.__next__() ## -- BGR  # 이미지 프레임 하나씩 불러오기
+        except StopIteration:  # 더 이상 next 요소가 없으면 StopIterraion Exception 발생
+            break  # while 빠져나가기
+        if frame_idx == 0:  # 프레임 인덱스 번호가 0일 경우
+            q_frame, q_landmarks = deque(), deque()  # 데크 생성
+            sequence = []
+        q_landmarks.append(landmarks[frame_idx])  # 프레임 인덱스 번호에 맞는 랜드마크 정보 추가
+        q_frame.append(frame)  # 프레임 정보 추가
+        if len(q_frame) == args.window_margin:
+            smoothed_landmarks = np.mean(q_landmarks, axis=0)  # 각 그룹의 같은 원소끼리 평균
+            cur_landmarks = q_landmarks.popleft()  # 데크 제일 왼쪽 값 꺼내기
+            cur_frame = q_frame.popleft()  # 데크 제일 왼쪽 값 꺼내기
+            # -- affine transformation  # 아핀 변환
+            trans_frame, trans = warp_img( smoothed_landmarks[stablePntsIDs, :],
+                                           mean_face_landmarks[stablePntsIDs, :],
+                                           cur_frame,
+                                           STD_SIZE)
+            trans_landmarks = trans(cur_landmarks)
+            # -- crop mouth patch  # 입술 잘라내기
+            sequence.append( cut_patch( trans_frame,
+                                        trans_landmarks[args.start_idx:args.stop_idx],
+                                        args.crop_height//2,
+                                        args.crop_width//2,))
+        if frame_idx == len(landmarks)-1:
+            while q_frame:
+                cur_frame = q_frame.popleft()  # 데크 제일 왼쪽 값 꺼내기
+                # -- transform frame  # 프레임 변환
+                trans_frame = apply_transform( trans, cur_frame, STD_SIZE)
+                # -- transform landmarks  # 랜드마크 변환
+                trans_landmarks = trans(q_landmarks.popleft())
+                # -- crop mouth patch  # 입술 잘라내기
+                sequence.append( cut_patch( trans_frame,
+                                            trans_landmarks[args.start_idx:args.stop_idx],
+                                            args.crop_height//2,
+                                            args.crop_width//2,))
+            return np.array(sequence)  # 입술 numpy 반환
+        frame_idx += 1  # 프레임 인덱스 번호 증가
+    return None
+# 랜드마크 보간
+def landmarks_interpolate(landmarks):
+    """Interpolate landmarks
+    param list landmarks: landmarks detected in raw videos  # 원본 영상 데이터에서 검출한 랜드마크
+    """
+    valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]  # 랜드마크 번호 list 생성
+    # 랜드마크 번호 list 가 비어있다면
+    if not valid_frames_idx:
+        return None
+    # 1부터 (랜드마크 번호 list 개수-1)만큼 for 문 반복
+    for idx in range(1, len(valid_frames_idx)):
+        if valid_frames_idx[idx] - valid_frames_idx[idx-1] == 1:  # 현재 랜드마크 번호 - 이전 랜드마크 번호 == 1 일 경우
+            continue  # 코드 실행 건너뛰기
+        else:  # 아니라면
+            landmarks = linear_interpolate(landmarks, valid_frames_idx[idx-1], valid_frames_idx[idx])  # 랜드마크 업데이트(보간)
+    valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]  # 랜드마크 번호 list 생성
+    # -- Corner case: keep frames at the beginning or at the end failed to be detected.  # 시작 또는 끝 프레임을 보관하지 못함
+    if valid_frames_idx:
+        landmarks[:valid_frames_idx[0]] = [landmarks[valid_frames_idx[0]]] * valid_frames_idx[0]  # 랜드마크 첫번째 프레임 정보 저장
+        landmarks[valid_frames_idx[-1]:] = [landmarks[valid_frames_idx[-1]]] * (len(landmarks) - valid_frames_idx[-1])  # 랜드마크 마지막 프레임 정보 저장
+    valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]  # 랜드마크 번호 list 생성
+    # 랜드마크 번호 list 개수 == 보간한 랜드마크 개수 확인, 아니면 AssertionError 메시지를 띄움
+    assert len(valid_frames_idx) == len(landmarks), "not every frame has landmark"  # 원하는 조건의 변수값을 보증하기 위해 사용
+    return landmarks  # 랜드마크 반환
+def get_yield(output_video):
+    for frame in output_video:
+        yield frame
+lines = open(args.filename_path).read().splitlines()  # 문자열을 '\n' 기준으로 쪼갠 후 list 생성
+lines = list(filter(lambda x: 'test' == x.split('/')[-2], lines)) if args.testset_only else lines  # args.testset_only 값이 있다면 test 폴더 속 파일명만 불러와서 list 생성, 아니라면 원래 lines 그대로 값 유지
+# lines 개수만큼 반복문 실행
+for filename_idx, line in enumerate(lines):
+    # 파일명, 사람id
+    filename, person_id = line.split(',')
+    print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))  # 파일 인덱스번호, 파일명 출력
+    video_pathname = os.path.join(args.video_direc, filename+args.video_format)  # 영상디렉토리 + 파일명.비디오포맷/
+    landmarks_pathname = os.path.join(args.landmark_direc, filename+'.npz')  # 저장디렉토리 + 랜드마크 파일명.npz
+    dst_pathname = os.path.join( args.save_direc, filename+'.npz')  # 저장디렉토리 + 결과영상 파일명.npz
+    # 파일이 있는지 확인, 없으면 AssertionError 메시지를 띄움
+    assert os.path.isfile(video_pathname), "File does not exist. Path input: {}".format(video_pathname)  # 원하는 조건의 변수값을 보증하기 위해 사용
+    # video 에 대한 face landmark npz 파일이 없고 영상 확장자 avi 인 경우 dlib 으로 직접 npz 파일 생성
+    if not os.path.exists(landmarks_pathname) and video_pathname.split('.')[-1] == 'mp4':
+        # dlib 사용해서 face landmark 찾기
+        def get_face_landmark(img):
+            detector_hog = dlib.get_frontal_face_detector()
+            dlib_rects = detector_hog(img, 1)
+            model_path = os.path.dirname(os.path.abspath(__file__)) + '/shape_predictor_68_face_landmarks.dat'
+            landmark_predictor = dlib.shape_predictor(model_path)
+            # dlib 으로 face landmark 찾기
+            list_landmarks = []
+            for dlib_rect in dlib_rects:
+                points = landmark_predictor(img, dlib_rect)
+                list_points = list(map(lambda p: (p.x, p.y), points.parts()))
+                list_landmarks.append(list_points)
+            input_width, input_height = img.shape
+            output_width, output_height = (256, 256)
+            width_rate = input_width / output_width
+            height_rate = input_height / output_height
+            img_rate = [(width_rate, height_rate)]*68
+            face_rate = np.array(img_rate)
+            eye_rate = np.array(img_rate[36:48])
+            # face landmark list 가 비어있지 않은 경우
+            if list_landmarks:
+                for dlib_rect, landmark in zip(dlib_rects, list_landmarks):
+                    face_landmark = np.array(landmark)  # face landmark
+                    eye_landmark = np.array(landmark[36:48])  # eye landmark
+                    return face_landmark, eye_landmark
+            # face landmark list 가 비어있는 경우
+            else:
+                landmark = [(0.0, 0.0)] * 68
+                face_landmark = np.array(landmark)  # face landmark
+                eye_landmark = np.array(landmark[36:48])  # eye landmark
+                return face_landmark, eye_landmark
+        target_frames = 29  # 원하는 프레임 개수
+        video = videoToArray(video_pathname, is_gray=args.convert_gray)  # 영상 정보 앞에 영상 프레임 개수를 추가한 numpy
+        output_video = frameAdjust(video, target_frames)  # frame sampling (프레임 개수 맞추기)
+        multi_sub_landmarks = []
+        person_landmarks = []
+        frame_landmarks = []
+        for frame_idx, frame in enumerate(get_yield(output_video)):
+            print(f'\n ------------frame {frame_idx}------------ ')
+            facial_landmarks, eye_landmarks = get_face_landmark(frame)  # dlib 사용해서 face landmark 찾기
+            person_landmarks = {
+                'id': 0,
+                'most_recent_fitting_scores': np.array([2.0,2.0,2.0]),
+                'facial_landmarks': facial_landmarks,
+                'roll': 7,
+                'yaw': 3.5,
+                'eye_landmarks': eye_landmarks,
+                'fitting_scores_updated': True,
+                'pitch': -0.05
+            }
+            frame_landmarks.append(person_landmarks)
+            multi_sub_landmarks.append(np.array(frame_landmarks.copy(), dtype=object))
+        multi_sub_landmarks = np.array(multi_sub_landmarks)  # list to numpy
+        save2npz(landmarks_pathname, data=multi_sub_landmarks)  # face landmark npz 저장
+        print('\n ------------ save npz ------------ \n')
+    # video 에 대한 face landmark npz 파일이 있는 경우
+    else:
+        # 파일이 있는지 확인, 없으면 AssertionError 메시지를 띄움
+        assert os.path.isfile(landmarks_pathname), "File does not exist. Path input: {}".format(landmarks_pathname)  # 원하는 조건의 변수값을 보증하기 위해 사용
+        # 파일이 존재할 경우
+        if os.path.exists(dst_pathname):
+            continue  # 코드 실행 건너뛰기
+        multi_sub_landmarks = np.load( landmarks_pathname, allow_pickle=True)['data']  # numpy 파일 열기
+        landmarks = [None] * len( multi_sub_landmarks)  # 랜드마크 변수 초기화
+        for frame_idx in range(len(landmarks)):
+            try:
+                landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)]['facial_landmarks'].astype(np.float64)  # 프레임 인덱스 번호에서 사람id의 얼굴 랜드마크 정보 가져오기
+            except IndexError:  # 해당 인덱스 번호에 깂이 없으면 IndexError 발생
+                continue  # 코드 실행 건너뛰기
+        # face landmark 가 [(0,0)]*68 이 아니면 랜드마크 보간 후 npz 파일 생성
+        landmarks_empty_list = []
+        landmarks_empty = [(0, 0)]*68
+        landmarks_empty = np.array(landmarks_empty, dtype=object)
+        for i in range(len(landmarks_empty)):
+            landmarks_empty_list.append(landmarks_empty.copy())
+        condition = landmarks != landmarks_empty_list
+        if condition:
+            # -- pre-process landmarks: interpolate frames not being detected.
+            preprocessed_landmarks = landmarks_interpolate(landmarks)  # 랜드마크 보간
+            # 변수가 비어있지 않다면
+            if not preprocessed_landmarks:
+                continue  # 코드 실행 건너뛰기
+            # -- crop
+            sequence = crop_patch(video_pathname, preprocessed_landmarks)  # 영상에서 랜드마크 받아서 입술 잘라내기
+            # sequence가 비어있는지 확인, 비어있으면 AssertionError 메시지를 띄움
+            assert sequence is not None, "cannot crop from {}.".format(filename)  # 원하는 조건의 변수값을 보증하기 위해 사용
+            # -- save
+            data = convert_bgr2gray(sequence) if args.convert_gray else sequence[...,::-1]  # gray 변환
+            save2npz(dst_pathname, data=data)  # 데이터를 npz 형식으로 저장
+print('Done.')

preprocessing/extract_audio_from_video.py ADDED Viewed

	@@ -0,0 +1,56 @@

+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2020 Imperial College London (Pingchuan Ma)
+# Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""Transforms mp4 audio to npz. Code has strong assumptions on the dataset organization!"""
+import os
+import librosa  # 음원 데이터 분석 라이브러리
+import argparse  # 명령행 인자를 파싱해주는 모듈
+from utils import *  # utils.py 모듈에 있는 모든 함수(read_txt_lines(), save2npz(), read_video()) 불러오기
+# 인자값을 받아서 처리하는 함수
+def load_args(default_config=None):
+    # 인자값을 받을 수 있는 인스턴스 생성
+    parser = argparse.ArgumentParser(description='Extract Audio Waveforms')
+    # 입력받을 인자값 등록
+    # -- utils
+    parser.add_argument('--video-direc', default=None, help='raw video directory')
+    parser.add_argument('--filename-path', default='./vietnamese_detected_face_30.csv', help='list of detected video and its subject ID')
+    parser.add_argument('--save-direc', default=None, help='the directory of saving audio waveforms (.npz)')
+    # -- test set only
+    parser.add_argument('--testset-only', default=False, action='store_true', help='process testing set only')
+    # 입력받은 인자값을 args에 저장 (type: namespace)
+    args = parser.parse_args()
+    return args
+args = load_args()  # args 파싱 및 로드
+lines = open(args.filename_path).read().splitlines()  # 문자열을 '\m' 기준으로 쪼갠 후 list 생성
+lines = list(filter(lambda x: 'test' == x.split('/')[-2], lines)) if args.testset_only else lines   # args.testset_only 값이 있다면 test 폴더 속 파일명만 불러와서 list 생성, 아니라면 원래 lines 그대로 값 유지
+# lines 개수만큼 반복문 실행
+for filename_idx, line in enumerate(lines):
+    # 파일명, 사람id
+    filename, person_id = line.split(',')
+    print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))  # 파일 인덱스번호, 파일명 출력
+    video_pathname = os.path.join(args.video_direc, filename+'.mp4')  # 영상디렉토리 + 파일명.mp4
+    dst_pathname = os.path.join( args.save_direc, filename+'.npz')  # 저장디렉토리 + 파일명.npz
+    # 파일이 있는지 확인, 없으면 AssertionError 메시지를 띄움
+    assert os.path.isfile(video_pathname), "File does not exist. Path input: {}".format(video_pathname)  # 원하는 조건의 변수값을 보증하기 위해 사용
+    # wav 파일 읽는 라이브러리: librosa
+    # librosa 로 데이터를 읽으면 데이터 범위가 [-1,1]로 정규화됨
+    # librosa 입력에서 sr=None 으로 지정하지 않고 임의의 sample_rate를 설정하면 load할 때 resampling 수행함
+    data = librosa.load(video_pathname, sr=16000)[0][-19456:]
+    save2npz(dst_pathname, data=data)  # librosa 로 읽은 데이터를 npz 형식으로 저장

preprocessing/shape_predictor_68_face_landmarks.dat ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fbdc2cb80eb9aa7a758672cbfdda32ba6300efe9b6e6c7a299ff7e736b11b92f
+size 99693937

preprocessing/transform.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import cv2  # OpenCV 라이브러리
+import numpy as np
+from skimage import transform as tf  # 이미지 변환 모듈
+# -- Landmark interpolation:
+def linear_interpolate(landmarks, start_idx, stop_idx):
+    start_landmarks = landmarks[start_idx]  # 랜드마크 시작
+    stop_landmarks = landmarks[stop_idx]  # 랜드마크 끝
+    delta = stop_landmarks - start_landmarks  # 랜드마크 값 차이
+    for idx in range(1, stop_idx-start_idx):
+        landmarks[start_idx+idx] = start_landmarks + idx/float(stop_idx-start_idx) * delta  # 랜드마크 업데이트(보간)
+    return landmarks
+# -- Face Transformation
+# src: 입력 영상, dst: 출력/결과 영상
+def warp_img(src, dst, img, std_size):
+    tform = tf.estimate_transform('similarity', src, dst)  # find the transformation matrix  # 변환 행렬 구하기
+    warped = tf.warp(img, inverse_map=tform.inverse, output_shape=std_size)  # wrap the frame image  # 주어진 좌표 변환에 따라 프레임 이미지 왜곡
+    warped = warped * 255  # note output from wrap is double image (value range [0,1])
+    warped = warped.astype('uint8')  # numpy 데이터 타입 uint8 으로 변경
+    return warped, tform
+def apply_transform(transform, img, std_size):
+    warped = tf.warp(img, inverse_map=transform.inverse, output_shape=std_size)  # wrap the frame image  # 주어진 좌표 변환에 따라 프레임 이미지 왜곡
+    warped = warped * 255  # note output from wrap is double image (value range [0,1])
+    warped = warped.astype('uint8')  # numpy 데이터 타입 uint8 으로 변경
+    return warped
+# -- Crop
+def cut_patch(img, landmarks, height, width, threshold=5):
+    center_x, center_y = np.mean(landmarks, axis=0)  # 각 그룹의 같은 원소끼리 평균
+    # 좌표 처리
+    if center_y - height < 0:
+        center_y = height
+    if center_y - height < 0 - threshold:
+        raise Exception('too much bias in height')
+    if center_x - width < 0:
+        center_x = width
+    if center_x - width < 0 - threshold:
+        raise Exception('too much bias in width')
+    if center_y + height > img.shape[0]:
+        center_y = img.shape[0] - height
+    if center_y + height > img.shape[0] + threshold:
+        raise Exception('too much bias in height')
+    if center_x + width > img.shape[1]:
+        center_x = img.shape[1] - width
+    if center_x + width > img.shape[1] + threshold:
+        raise Exception('too much bias in width')
+    # 배열 복사
+    cutted_img = np.copy(img[ int(round(center_y) - round(height)): int(round(center_y) + round(height)),
+                         int(round(center_x) - round(width)): int(round(center_x) + round(width))])
+    return cutted_img
+# -- RGB to GRAY
+def convert_bgr2gray(data):
+    # np.stack(배열_1, 배열_2, axis=0): 지정한 axis를 완전히 새로운 axis로 생각
+    return np.stack([cv2.cvtColor(_, cv2.COLOR_BGR2GRAY) for _ in data], axis=0)  # gray 변환

preprocessing/utils.py ADDED Viewed

	@@ -0,0 +1,149 @@

+#coding=utf-8
+import os
+import cv2  # OpenCV 라이브러리
+import numpy as np
+from PIL import Image
+# -- IO utils
+# 텍스트 라인 불러오기
+def read_txt_lines(filepath):
+    # 파일이 있는지 확인, 없으면 AssertionError 메시지를 띄움
+    assert os.path.isfile( filepath ), "Error when trying to read txt file, path does not exist: {}".format(filepath)  # 원하는 조건의 변수값을 보증하기 위해 사용
+    # 파일 불러오기
+    with open( filepath ) as myfile:
+        content = myfile.read().splitlines()  # 문자열을 '\n' 기준으로 쪼갠 후 list 생성
+    return content
+# npz 저장
+def save2npz(filename, data=None):
+    # 데이터가 비어있는지 확인, 없으면 AssertionError 메시지를 띄움
+    assert data is not None, "data is {}".format(data)
+    # 파일 없을 경우
+    if not os.path.exists(os.path.dirname(filename)):
+        os.makedirs(os.path.dirname(filename))  # 디렉토리 생성
+    np.savez_compressed(filename, data=data)  # 압축되지 않은 .npz 파일 형식 으로 여러 배열 저장
+def save2npz(filename, data=None):
+    """save2npz.
+    :param filename: str, the fileanme where the data will be saved.
+    :param data: ndarray, arrays to save to the file.
+    """
+    assert data is not None, "data is {}".format(data)
+    if not os.path.exists(os.path.dirname(filename)):
+        os.makedirs(os.path.dirname(filename))
+    np.savez_compressed(filename, data=data)
+# 비디오 불러오기
+def read_video(filename):
+    cap = cv2.VideoCapture(filename)  # 영상 객체(파일) 가져오기
+    while(cap.isOpened()):  # 영상 파일(카메라)이 정상적으로 열렸는지(초기화되었는지) 여부
+        # ret: 정상적으로 읽어왔는가?
+        # frame: 한 장의 이미지(frame) 가져오기
+        ret, frame = cap.read() # BGR
+        if ret:  # 프레임 정보를 정상적으로 읽지 못하면
+            yield frame  # 프레임을 함수 바깥으로 전달하면서 코드 실행을 함수 바깥에 양보
+        else:  # 프레임 정보를 정상적으로 읽지 못하면
+            break  # while 빠져나가기
+    cap.release()  # 영상 파일(카메라) 사용 종료
+# Video 정보 가져오기
+def get_video_info(infilename, is_print=False):
+    cap = cv2.VideoCapture(infilename)
+    if not cap.isOpened():
+        print("could not open : ", infilename)
+        cap.release()
+        exit(0)
+    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    cap.release()
+    if is_print:
+        print('length : ', length)
+        print('width : ', width)
+        print('height : ', height)
+        print('fps : ', fps)
+    video_info = {
+        'length': length,
+        'width': width,
+        'height': height,
+        'fps': fps,
+    }
+    return video_info
+# Video -> Numpy
+# 참고 깃허브 코드: https://github.com/khazit/Lip2Word/blob/master/lipReader.py#L22
+def videoToArray(video_pathname, is_gray=True) :
+    cap = cv2.VideoCapture(video_pathname)  # 영상 객체(파일) 가져오기
+    # 영상 파일(카메라)이 정상적으로 열리지 않은 경우
+    if not cap.isOpened():
+        print("could not open : ", video_pathname)
+        cap.release()  # 영상 파일(카메라) 사용 종료
+        exit(0)  # 빠져나가기
+    n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # 영상 프레임 개수
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))  # 영상 너비
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # 영상 높이
+    fps = cap.get(cv2.CAP_PROP_FPS)  # 영상 FPS(Frames Per Second)
+    if is_gray:
+        video = np.zeros((n_frames, height, width))  # gray
+    else:
+        n_channels=3
+        video = np.zeros((n_frames, height, width, n_channels))  # color
+    video = video.astype(np.uint8)
+    i = 0
+    while True :
+        success, frame = cap.read()
+        if not success :
+            break
+        else :
+            # gray scale 적용
+            if is_gray:
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            video[i] = frame
+            i += 1
+    cap.release()  # 영상 파일(카메라) 사용 종료
+    return video  # 영상 정보 앞에 영상 프레임 개수를 추가한 numpy 반환
+# Frame Sampling (프레임 개수 맞추기)
+# 참고 깃허브 코드: https://github.com/khazit/Lip2Word/blob/master/lipReader.py#L62
+def frameAdjust(video, target_frames=29):
+    n_frames = video.shape[0]  # 영상 프레임 개수
+    if target_frames == n_frames :
+        return video  # 영상 그대로 반환
+    else :
+        # 영상 프레임 개수 > 원하는 프레임 개수
+        if n_frames > target_frames :
+            idx = np.linspace(0, n_frames-1, target_frames)  # 숫자 시퀀스 생성 # 구간 시작점, 구간 끝점, 구간 내 숫자 개수
+            idx = np.around(idx, 0).astype(np.int32)  # 반올림하고 dtype 을 정수로 변경
+            return video[idx]  # 원하는 프레임 개수로 sampling 한 영상
+        # 영상 프레임 개수 < 원하는 프레임 개수
+        else :
+            output_video = np.zeros((target_frames, *video.shape[1:])).astype(np.uint8)  # 원하는 프레임 개수에 맞춰서 0으로 초기화한 numpy 생성
+            output_video[:n_frames] = video  # 영상 프레임 개수까지 그대로 영상 정보 저장
+            # 원하는 프레임 개수만큼 마지막 프레임 복제
+            for i in range(target_frames-n_frames+1) :
+                output_video[i+n_frames-1] = output_video[n_frames-1]
+            return output_video  # 원하는 프레임 개수로 sampling 한 영상

preprocessing/vietnamese_detected_face_30_words.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessing/vietnamese_detected_face_30_words_have_snr.csv ADDED Viewed

The diff for this file is too large to render. See raw diff