HenRick69 committed on
Commit 1e0a8f3
1 Parent(s): c73265e

Upload main files

Files changed (4)
  1. AudioClassifier.py +54 -0
  2. FacePosition.py +82 -0
  3. app.py +39 -0
  4. cursor_movement_model.pkl +3 -0
AudioClassifier.py ADDED
@@ -0,0 +1,54 @@
+ """
+ AudioClassifier class
+
+ Author: HenryAreiza
+ Date: 08/09/2023
+ """
+
+ from scipy.io import wavfile
+ from scipy.signal import decimate
+ from transformers import pipeline
+
+ class AudioClassifier:
+     """
+     A class for classifying audio commands using a pre-trained model.
+
+     This class provides functionality for classifying audio commands based on
+     a pre-trained audio classification model.
+
+     Attributes:
+         vocab (list): Vocabulary of valid commands.
+         pipe: The Hugging Face Transformers pipeline for audio classification.
+     """
+
+     def __init__(self):
+         """
+         Initializes the AudioClassifier class.
+         """
+         self.vocab = ["left", "right", "up", "down", "go", "follow",
+                       "on", "off", "one", "two", "three", "stop"]
+
+         # Load the audio classification pipeline
+         self.pipe = pipeline("audio-classification", model="0xb1/wav2vec2-base-finetuned-speech_commands-v0.02")
+
+     def predict(self, audio_path):
+         """
+         Classify an audio file into a command label.
+
+         Args:
+             audio_path (str): Path to the input audio file.
+
+         Returns:
+             result (str): The classified command label, or 'unknown' if the
+                 prediction is not in the vocabulary.
+         """
+         # Read the recording and downsample it by a factor of 3
+         _, audio = wavfile.read(audio_path)
+         audio = decimate(audio, 3)
+
+         result = self.pipe(audio)[0]["label"]
+
+         if result not in self.vocab:
+             result = 'unknown'
+
+         return result
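A minimal usage sketch for the class above (an illustration under assumptions: `command.wav` is a hypothetical recording, and the decimate-by-3 step suggests input captured at three times the model's rate, e.g. 48 kHz down to the 16 kHz used by the Speech Commands dataset):

```python
# Hypothetical usage of AudioClassifier; "command.wav" is a placeholder path.
from AudioClassifier import AudioClassifier

classifier = AudioClassifier()

# predict() reads the file, downsamples it by a factor of 3,
# and returns a vocabulary label or 'unknown'.
print(classifier.predict("command.wav"))  # e.g. "left", "stop", or "unknown"
```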
FacePosition.py ADDED
@@ -0,0 +1,82 @@
+ """
+ FacePosition class
+
+ Author: HenryAreiza
+ Date: 08/09/2023
+ """
+
+ import os
+ import cv2
+ import pickle
+ import numpy as np
+ import mediapipe as mp
+
+ class FacePosition:
+     """
+     A class for controlling the cursor based on head movements.
+
+     This class provides functionality for detecting a face using
+     the MediaPipe library and controlling the cursor's movement accordingly.
+
+     Attributes:
+         movement (list): List of classes corresponding to the predicted movement.
+         images (list): List of images associated with each class.
+         cursor_model: The machine learning model for gesture prediction.
+         face_detection: The MediaPipe Face Detection component.
+     """
+
+     def __init__(self):
+         """
+         Initializes the FacePosition class.
+         """
+         self.movement = ['Center', 'Up', 'Right/Up', 'Right', 'Right/Down',
+                          'Down', 'Left/Down', 'Left', 'Left/Up']
+         self.images = [cv2.imread(os.path.join('media', str(i) + '.png')) for i in range(9)]
+
+         # Load the cursor movement model
+         with open('cursor_movement_model.pkl', 'rb') as f:
+             self.cursor_model = pickle.load(f)
+
+         # Initialize the MediaPipe Face Detection component
+         self.face_detection = mp.solutions.face_detection.FaceDetection(min_detection_confidence=0.5)
+
+     def predict(self, frame):
+         """
+         Predict the cursor movement based on head position.
+
+         Args:
+             frame (numpy.ndarray): Input video frame (RGB).
+
+         Returns:
+             result (list): The predicted class image and label.
+         """
+         # Perform face detection
+         results = self.face_detection.process(frame)
+
+         # Read the reference and landmarks from the detected face
+         if results.detections:
+             for detection in results.detections:
+                 reference = [[detection.location_data.relative_bounding_box.xmin,
+                               detection.location_data.relative_bounding_box.ymin],
+                              [detection.location_data.relative_bounding_box.width,
+                               detection.location_data.relative_bounding_box.height]]
+                 keypoints = []
+                 for key_point in detection.location_data.relative_keypoints:
+                     keypoints.append([key_point.x, key_point.y])
+                 break
+
+             # Transform the lists into numpy arrays
+             reference = np.array(reference)
+             keypoints = np.array(keypoints)
+
+             # Remove the offset from the keypoints
+             keypoints = (keypoints - reference[0]) / reference[1]
+
+             # Recognize the head position
+             prediction = self.cursor_model.predict(keypoints.reshape((1, -1)))[0]
+
+             return [self.images[prediction], self.movement[prediction]]
+
+         else:
+             return [self.images[0], self.movement[0]]
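The normalization step in `predict` maps each keypoint into bounding-box coordinates: subtracting the box origin and dividing by its size yields values that do not depend on where the face sits in the frame. A small numeric sketch with made-up detection values:

```python
import numpy as np

# Made-up detection values, for illustration only.
reference = np.array([[0.40, 0.30],   # (xmin, ymin) of the face box
                      [0.20, 0.25]])  # (width, height) of the face box
keypoints = np.array([[0.46, 0.40],   # e.g. right eye
                      [0.54, 0.41]])  # e.g. left eye

# Same operation as in predict(): offset relative to the box, scaled by its size.
print((keypoints - reference[0]) / reference[1])
# [[0.3  0.4 ]
#  [0.7  0.44]]
```

MediaPipe Face Detection returns six relative keypoints (eyes, nose tip, mouth center, ear tragions), so the flattened vector passed to `cursor_model.predict` has 12 features.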
app.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ from FacePosition import FacePosition
+ from AudioClassifier import AudioClassifier
+
+
+ # Create an instance of the FacePosition class
+ movement_controller = FacePosition()
+
+ cursor_movement = gr.Interface(
+     fn=movement_controller.predict,
+     inputs=gr.Image(source='webcam', streaming=True),
+     outputs=['image', 'text'],
+     live=True,
+     title='Cursor movement controller',
+     description="This space provides functionality for detecting a face using the MediaPipe library and controlling the cursor's movement accordingly."
+ )
+
+
+ # Create an instance of the AudioClassifier class
+ audio_classifier = AudioClassifier()
+
+ audio_commands = gr.Interface(
+     fn=audio_classifier.predict,
+     inputs=gr.Audio(source="microphone", type="filepath", streaming=True),
+     outputs="text",
+     live=True,
+     title='Speech commands recognition (mouse actions)',
+     description='This space provides functionality for classifying audio commands associated with mouse actions, based on a pre-trained audio classification model.'
+ )
+
+
+ demo = gr.TabbedInterface([cursor_movement, audio_commands],
+                           title='Hands-free Cursor Application',
+                           tab_names=['Cursor movement controller', 'Speech commands recognition'],
+                           theme=gr.themes.Soft())
+
+
+ if __name__ == "__main__":
+     demo.launch()
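Note that `gr.Image(source='webcam')` and `gr.Audio(source='microphone')` follow the Gradio 3.x API. A sketch of the equivalent inputs under the assumption of running on Gradio 4+, where the keyword was renamed to the plural `sources`:

```python
# Assumed Gradio 4.x equivalents (the `source=` keyword was replaced
# by `sources=[...]` in Gradio 4; this commit targets the 3.x API).
webcam_input = gr.Image(sources=["webcam"], streaming=True)
mic_input = gr.Audio(sources=["microphone"], type="filepath", streaming=True)
```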
cursor_movement_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3751a31fbe1163000ffc0ae0e230430475ad150412947a16aef3ebdfb6792d4d
+ size 1696