1. 依赖库介绍
OpenCV
OpenCV(Open Source Computer Vision Library)是一个开源的计算机视觉和机器学习软件库。它包含了数百个计算机视觉算法。
MediaPipe
MediaPipe是一个跨平台的机器学习解决方案库,可以用于实时人类姿势估计、手势识别等任务。
PyCaw
PyCaw是一个Python库,用于控制Windows上的音频设备。
Python版本
本来在Python 3.11环境中进行测试,结果一直报错,似乎是mediapipe
库的问题,换了Python 3.12环境后顺利解决
安装依赖
pip install mediapipe
pip install comtypes
pip install pycaw
pip install numpy
pip install opencv-python
2. 程序结构
程序主要分为以下几个部分:
- 初始化MediaPipe和音量控制接口。
- 从摄像头获取视频流。
- 处理视频帧以检测手部位置和姿态。
- 计算手指之间的距离,并将其映射到音量控制上。
- 显示处理后的图像,包括手部标志和音量指示。
3. 代码详解
3.1 初始化
首先,我们需要导入必要的库,并初始化MediaPipe和音量控制接口。
import cv2
import mediapipe as mp
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
import time
import math
import numpy as np
class HandControlVolume:
    """Hand-gesture volume controller for Windows, driven by MediaPipe
    hand tracking and the PyCaw COM audio interface."""

    def __init__(self):
        # Cache the MediaPipe sub-modules used during recognition.
        solutions = mp.solutions
        self.mp_drawing = solutions.drawing_utils
        self.mp_drawing_styles = solutions.drawing_styles
        self.mp_hands = solutions.hands
        # Acquire the default speaker endpoint through COM and make sure
        # it is not muted before we start adjusting the level.
        speakers = AudioUtilities.GetSpeakers()
        endpoint = speakers.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
        self.volume = cast(endpoint, POINTER(IAudioEndpointVolume))
        self.volume.SetMute(0, None)
        # recognize() reads index 0 (min) and 1 (max) of this tuple;
        # presumably these are dB levels — confirm against pycaw docs.
        self.volume_range = self.volume.GetVolumeRange()
3.2 主函数
recognize
函数是程序的核心,负责处理视频流并进行手势识别和音量控制。
def recognize(self):
    """Capture webcam frames, track the thumb and index fingertips, and
    map the distance between them onto the Windows master volume.

    Loops until ESC is pressed or the preview window is closed.
    Side effects: opens camera 0, shows an OpenCV window, and calls
    ``SetMasterVolumeLevel`` on the endpoint acquired in ``__init__``.
    """
    fps_time = time.time()
    cap = cv2.VideoCapture(0)
    resize_w = 640
    resize_h = 480
    # Kept across iterations so the overlay persists on frames where no
    # hand is detected.
    rect_height = 0
    rect_percent_text = 0
    with self.mp_hands.Hands(min_detection_confidence=0.7,
                             min_tracking_confidence=0.5,
                             max_num_hands=2) as hands:
        while cap.isOpened():
            success, image = cap.read()
            # FIX: validate the grab BEFORE using the frame. The original
            # resized first, which raises inside cv2.resize when the frame
            # is None, so the "空帧." branch was unreachable in practice.
            if not success:
                print("空帧.")
                continue
            image = cv2.resize(image, (resize_w, resize_h))

            # MediaPipe wants RGB; marking the buffer read-only lets it
            # avoid a defensive copy. Flip for a mirror/selfie view.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.flip(image, 1)
            results = hands.process(image)
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    self.mp_drawing.draw_landmarks(
                        image,
                        hand_landmarks,
                        self.mp_hands.HAND_CONNECTIONS,
                        self.mp_drawing_styles.get_default_hand_landmarks_style(),
                        self.mp_drawing_styles.get_default_hand_connections_style())

                    # [id, x, y, z] per landmark; coordinates are normalized.
                    landmark_list = [
                        [landmark_id, lm.x, lm.y, lm.z]
                        for landmark_id, lm in enumerate(hand_landmarks.landmark)
                    ]
                    if landmark_list:
                        # MediaPipe indices: 4 = thumb tip, 8 = index tip.
                        thumb_tip = landmark_list[4]
                        thumb_x = math.ceil(thumb_tip[1] * resize_w)
                        thumb_y = math.ceil(thumb_tip[2] * resize_h)
                        index_tip = landmark_list[8]
                        index_x = math.ceil(index_tip[1] * resize_w)
                        index_y = math.ceil(index_tip[2] * resize_h)
                        middle_point = ((thumb_x + index_x) // 2,
                                        (thumb_y + index_y) // 2)
                        thumb_point = (thumb_x, thumb_y)
                        index_point = (index_x, index_y)

                        image = cv2.circle(image, thumb_point, 10, (255, 0, 255), -1)
                        image = cv2.circle(image, index_point, 10, (255, 0, 255), -1)
                        image = cv2.circle(image, middle_point, 10, (255, 0, 255), -1)
                        image = cv2.line(image, thumb_point, index_point,
                                         (255, 0, 255), 5)

                        # Fingertip distance (pixels) drives the volume:
                        # 50-300 px is mapped onto the endpoint's level range.
                        line_len = math.hypot(index_x - thumb_x,
                                              index_y - thumb_y)
                        min_volume = self.volume_range[0]
                        max_volume = self.volume_range[1]
                        vol = np.interp(line_len, [50, 300],
                                        [min_volume, max_volume])
                        rect_height = np.interp(line_len, [50, 300], [0, 200])
                        rect_percent_text = np.interp(line_len, [50, 300], [0, 100])
                        self.volume.SetMasterVolumeLevel(vol, None)

            # Volume bar + percentage overlay.
            cv2.putText(image, str(math.ceil(rect_percent_text)) + "%", (10, 350),
                        cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 0), 3)
            image = cv2.rectangle(image, (30, 100), (70, 300), (255, 0, 0), 3)
            image = cv2.rectangle(image, (30, math.ceil(300 - rect_height)),
                                  (70, 300), (255, 0, 0), -1)

            # FPS counter based on wall-clock time between frames.
            c_time = time.time()
            fps_text = 1 / (c_time - fps_time)
            fps_time = c_time
            cv2.putText(image, "FPS: " + str(int(fps_text)), (10, 70),
                        cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 0), 3)
            cv2.imshow('MediaPipe Hands', image)
            # Exit on ESC or when the user closes the preview window.
            if (cv2.waitKey(5) & 0xFF == 27
                    or cv2.getWindowProperty('MediaPipe Hands',
                                             cv2.WND_PROP_VISIBLE) < 1):
                break
    cap.release()
    cv2.destroyAllWindows()  # FIX: also tear down the preview window
3.3 启动程序
最后,通过实例化HandControlVolume
类并调用recognize
方法来启动程序。
# Entry point: only start the camera loop when run as a script, so the
# module can be imported (e.g. for reuse or testing) without side effects.
if __name__ == "__main__":
    control = HandControlVolume()
    control.recognize()
3.4 测试效果
4. Mac版本程序
主要功能
- 使用MediaPipe检测手部姿态。
- 通过计算手指之间的距离来调整系统音量。
- 使用AppleScript来控制Mac系统的音量。
Mac版本所需依赖库
pip install mediapipe
pip install numpy
pip install opencv-python
pip install applescript
代码实现
import cv2
import mediapipe as mp
from ctypes import cast, POINTER
import applescript as al
import time
import math
import numpy as np
class HandControlVolume:
    """Hand-gesture volume controller for macOS; volume changes are
    applied via AppleScript rather than a native audio API."""

    def __init__(self):
        # Cache the MediaPipe sub-modules used by recognize().
        solutions = mp.solutions
        self.mp_drawing = solutions.drawing_utils
        self.mp_drawing_styles = solutions.drawing_styles
        self.mp_hands = solutions.hands
def recognize(self):
    """Capture webcam frames, track the thumb and index fingertips, and
    map the distance between them onto the macOS output volume (0-100)
    via AppleScript.

    Loops until ESC is pressed. Side effects: opens camera 0, shows an
    OpenCV window, and runs ``set volume output volume N`` scripts.
    """
    fps_time = time.time()
    cap = cv2.VideoCapture(0)
    resize_w = 640
    resize_h = 480
    # Kept across iterations so the overlay persists on frames where no
    # hand is detected.
    rect_height = 0
    rect_percent_text = 0
    with self.mp_hands.Hands(min_detection_confidence=0.7,
                             min_tracking_confidence=0.5,
                             max_num_hands=2) as hands:
        while cap.isOpened():
            success, image = cap.read()
            # FIX: validate the grab BEFORE using the frame. The original
            # resized first, which raises inside cv2.resize when the frame
            # is None, so the "空帧." branch was unreachable in practice.
            if not success:
                print("空帧.")
                continue
            image = cv2.resize(image, (resize_w, resize_h))

            # MediaPipe wants RGB; marking the buffer read-only lets it
            # avoid a defensive copy. Flip for a mirror/selfie view.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.flip(image, 1)
            results = hands.process(image)
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    self.mp_drawing.draw_landmarks(
                        image,
                        hand_landmarks,
                        self.mp_hands.HAND_CONNECTIONS,
                        self.mp_drawing_styles.get_default_hand_landmarks_style(),
                        self.mp_drawing_styles.get_default_hand_connections_style())

                    # [id, x, y, z] per landmark; coordinates are normalized.
                    landmark_list = [
                        [landmark_id, lm.x, lm.y, lm.z]
                        for landmark_id, lm in enumerate(hand_landmarks.landmark)
                    ]
                    if landmark_list:
                        # MediaPipe indices: 4 = thumb tip, 8 = index tip.
                        thumb_tip = landmark_list[4]
                        thumb_x = math.ceil(thumb_tip[1] * resize_w)
                        thumb_y = math.ceil(thumb_tip[2] * resize_h)
                        index_tip = landmark_list[8]
                        index_x = math.ceil(index_tip[1] * resize_w)
                        index_y = math.ceil(index_tip[2] * resize_h)
                        middle_point = ((thumb_x + index_x) // 2,
                                        (thumb_y + index_y) // 2)
                        thumb_point = (thumb_x, thumb_y)
                        index_point = (index_x, index_y)

                        image = cv2.circle(image, thumb_point, 10, (255, 0, 255), -1)
                        image = cv2.circle(image, index_point, 10, (255, 0, 255), -1)
                        image = cv2.circle(image, middle_point, 10, (255, 0, 255), -1)
                        image = cv2.line(image, thumb_point, index_point,
                                         (255, 0, 255), 5)

                        # Fingertip distance (pixels) drives the volume:
                        # 50-300 px maps to 0-100 on the macOS volume scale.
                        line_len = math.hypot(index_x - thumb_x,
                                              index_y - thumb_y)
                        # FIX: the original had "vol = np" and ".interp(...)"
                        # split across two statements, which is a SyntaxError.
                        vol = np.interp(line_len, [50, 300], [0, 100])
                        rect_height = np.interp(line_len, [50, 300], [0, 200])
                        rect_percent_text = np.interp(line_len, [50, 300], [0, 100])
                        al.run('set volume output volume ' + str(vol))

            # Volume bar + percentage overlay.
            cv2.putText(image, str(math.ceil(rect_percent_text)) + "%", (10, 350),
                        cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 0), 3)
            image = cv2.rectangle(image, (30, 100), (70, 300), (255, 0, 0), 3)
            image = cv2.rectangle(image, (30, math.ceil(300 - rect_height)),
                                  (70, 300), (255, 0, 0), -1)

            # FPS counter based on wall-clock time between frames.
            c_time = time.time()
            fps_text = 1 / (c_time - fps_time)
            fps_time = c_time
            cv2.putText(image, "FPS: " + str(int(fps_text)), (10, 70),
                        cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 0), 3)
            cv2.imshow('MediaPipe Hands', image)
            # Exit on ESC.
            if cv2.waitKey(5) & 0xFF == 27:
                break
    cap.release()
    cv2.destroyAllWindows()  # FIX: also tear down the preview window
区别分析
1. 音量控制方式:
   - Windows版本:使用PyCaw库通过COM接口控制音量。
   - Mac版本:使用AppleScript控制音量。
2. 依赖库:
   - Windows版本:依赖PyCaw和comtypes库。
   - Mac版本:依赖applescript库。
3. 代码调整:
   - Mac版本移除了与Windows音量控制相关的代码,并替换为AppleScript命令。
   - 音量计算部分的范围从Windows的音量范围映射变为0到100的映射。
4. 平台适配:
   - Windows程序利用PyCaw库与Windows系统进行交互,而Mac程序利用AppleScript与Mac系统进行交互。