From ab42cc6de546ec9149d5f4a372576ea2bf58ac57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cjiawei=E2=80=9D?= <“1073198597@qq.com”> Date: Mon, 29 Sep 2025 09:19:40 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E8=A1=A5=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ark_api_module.py | 101 +++++++++++++++ camera_module.py | 30 +++++ main_scheduler.py | 291 ++++++++++++++++++++++++++++++++++++++++++ motion_module.py | 59 +++++++++ tts_module.py | 173 +++++++++++++++++++++++++ voice_recog_module.py | 226 ++++++++++++++++++++++++++++++++ volume_module.py | 93 ++++++++++++++ 7 files changed, 973 insertions(+) diff --git a/ark_api_module.py b/ark_api_module.py index e69de29..4454981 100644 --- a/ark_api_module.py +++ b/ark_api_module.py @@ -0,0 +1,101 @@ +from openai import OpenAI +import time +import sys +import queue # 新增:用于缓存实时文本片段 +import threading # 新增:用于并行处理语音播放 +# 原代码7. 火山方舟API调用完整逻辑 +class ArkAPIController: + def __init__(self, ark_api_key, ark_model_id, tts_controller, feedback_text): + # 接收调度脚本传入的TTS实例和反馈文本,保持原逻辑 + self.ARK_API_KEY = ark_api_key + self.ARK_MODEL_ID = ark_model_id + self.tts_controller = tts_controller + self.FEEDBACK_TEXT = feedback_text + self.chat_context = [] # 聊天上下文由模块内部维护(与原逻辑一致) + self.MAX_CONTEXT_LEN = 10 + + # 新增:实时语音播放队列与线程 + self.speech_queue = queue.Queue() # 缓存待播放的文本片段 + self.speech_thread = threading.Thread(target=self._process_speech_queue, daemon=True) + self.speech_thread.start() # 启动语音播放线程 + + # 新增:处理语音队列的函数(循环从队列取片段并播放) + def _process_speech_queue(self): + """持续从队列中获取文本片段并调用TTS播放""" + while True: + text = self.speech_queue.get() # 阻塞等待队列消息 + if text is None: # 退出信号 + break + self.tts_controller.speak(text) # 播放片段 + self.speech_queue.task_done() # 标记任务完成 + def call_ark_api(self, content_type: str, content: dict): + # 播放操作反馈(同步执行) + self.tts_controller.speak(self.FEEDBACK_TEXT[content_type]) + + client = OpenAI( + base_url="https://ark.cn-beijing.volces.com/api/v3", + api_key=self.ARK_API_KEY + ) + + try: + messages = [] + if content_type == "chat": + messages.extend(self.chat_context[-self.MAX_CONTEXT_LEN*2:]) + messages.append({"role": "user", "content": [{"type": "text", "text": content["prompt"]}]}) + elif content_type == "image_recog": + messages.append({ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{content['image_base64']}"}}, + {"type": "text", "text": content["prompt"]} + ] + }) + + response = client.chat.completions.create( + model=self.ARK_MODEL_ID, + messages=messages, + max_tokens=300, + temperature=0.7 if content_type == "chat" else 0.3, + stream=True + ) + + full_response = "" + current_speech_chunk = "" # 缓存当前待播放的片段 + print("\n" + "="*50) + print("🤖 回应:", end="", flush=True) + + for chunk in response: + if chunk.choices and chunk.choices[0].delta.content: + char = chunk.choices[0].delta.content + full_response += char + current_speech_chunk += char # 累加片段 + print(char, end="", flush=True) + time.sleep(0.05) + + # 关键逻辑:当片段包含标点或达到一定长度时,推送到语音队列 + if any(punct in current_speech_chunk for punct in [".", "。", "!", "!", "?", "?", ",", ",", ";", ";"]): + self.speech_queue.put(current_speech_chunk) # 推送片段到队列 + current_speech_chunk = "" # 重置片段缓存 + + # 处理最后剩余的片段(如果有) + if current_speech_chunk: + self.speech_queue.put(current_speech_chunk) + + print("\n" + "="*50 + "\n") + + # 等待所有语音片段播放完成 + self.speech_queue.join() + + # 维护聊天上下文(原有逻辑) + if content_type == "chat" and full_response.strip(): + self.chat_context.append({"role": "user", "content": [{"type": "text", "text": content["prompt"]}]}) + self.chat_context.append({"role": "assistant", "content": [{"type": "text", "text": full_response}]}) + + return full_response + except Exception as e: + error_msg = f"❌ API调用失败:{str(e)}" + print(f"\n" + "="*50) + print(error_msg) + print("="*50 + "\n") + self.tts_controller.speak(self.FEEDBACK_TEXT["api_error"]) + return error_msg \ No newline at end of file diff --git a/camera_module.py b/camera_module.py index e69de29..0bc26d3 100644 --- a/camera_module.py +++ b/camera_module.py @@ -0,0 +1,30 @@ +import io +import base64 +from PIL import Image +from picamera2 import Picamera2 +import sys + +# 原代码6. 摄像头模块完整逻辑 +class CameraModule: + def __init__(self): + try: + self.camera = Picamera2() + cam_config = self.camera.create_still_configuration(main={"size": (320, 240)}) + self.camera.configure(cam_config) + self.camera.start() + print("📷 摄像头模块初始化成功") + except Exception as e: + print(f"❌ 摄像头失败:{str(e)}") + self.camera = None + + def capture_base64(self): + if not self.camera: + return None + try: + img_array = self.camera.capture_array() + img_byte = io.BytesIO() + Image.fromarray(img_array).save(img_byte, format="JPEG", quality=80) + return base64.b64encode(img_byte.getvalue()).decode("utf-8") + except Exception as e: + print(f"❌ 拍摄失败:{str(e)}") + return None \ No newline at end of file diff --git a/main_scheduler.py b/main_scheduler.py index e69de29..b22567f 100644 --- a/main_scheduler.py +++ b/main_scheduler.py @@ -0,0 +1,291 @@ +import signal +import sys +import time +import re +import subprocess +import queue +# 导入所有模块 +from tts_module import BaiduOnlineTTS +from volume_module import VolumeController, detect_audio_control +from motion_module import RobotMotionController +from camera_module import CameraModule +from ark_api_module import ArkAPIController +from voice_recog_module import VoiceRecogController + +# -------------------- 1. 基础配置(完全保留原代码1. 基础配置) -------------------- +# 1.1 项目路径与运动模型 +PROJECT_ROOT = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/Open_Duck_Mini_Runtime-2" +sys.path.append(PROJECT_ROOT) +ONNX_MODEL_PATH = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/Open_Duck_Mini-2/BEST_WALK_ONNX_2.onnx" + +# 1.2 火山方舟API配置 +ARK_API_KEY = "390d517c-129a-41c1-bf3d-458048007b69" +ARK_MODEL_ID = "doubao-seed-1-6-250615" + +# 1.3 语音识别与唤醒词配置 +APPID = "1ff50710" +ACCESS_KEY_ID = "a4f43e95ee0a9518d11befac8d31f1d4" +ACCESS_KEY_SECRET = "YzQ4NTRhZjc2ZTM4MDA1YjM2MmIyNDEy" +ACCESS_KEY = "e0EQQBoH0HIVU9KrXsmB7CMlVci+GAs2x0Ejtrdp8CTtZmf25rCLaQ==" +WAKEUP_WORD_PATH = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/resources/xiaohuangya_zh_raspberry-pi_v3_0_0.ppn" +MODEL_PATH = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/resources/porcupine_params_zh.pv" + +# 1.4 百度在线TTS配置 +BAIDU_TTS_API_KEY = "TnwYZPPvElNushOzfL6vBlUI" +BAIDU_TTS_SECRET_KEY = "55HeI8VNUMNlkW3t2QRwVtrjumpxjfxk" + +# 1.5 语音反馈文本配置 +FEEDBACK_TEXT = { + "wakeup": "你好呀,有什么吩咐", + "move_forward": "好的,我正在前进", + "move_backward": "好的,我正在后退", + "turn_left": "好的,我正在左转", + "turn_right": "好的,我正在右转", + "image_recog": "好的,我来识别一下", + "chat": "好的,我来想想", + "volume_increase": "音量已增大", + "volume_decrease": "音量已减小", + "volume_max": "已调至最大音量", + "volume_min": "已调至最小音量", + "unknown": "抱歉,没听懂,请再说一次", + "api_error": "抱歉,处理请求时出错了" +} + +# 1.6 音频参数 +VOLUME_STEP = 10 +MIN_VOLUME = 0 +MAX_VOLUME = 100 +CURRENT_VOLUME = 40 +AUDIO_CONTROL_NAME = None + +# 1.7 麦克风与扬声器参数(模块内部已定义,此处保留用于一致性) +SAMPLE_RATE = 16000 +CHANNELS = 1 +SAMPLE_FORMAT = "int16" +AUDIO_ENCODE = "pcm_s16le" +LANG = "autodialect" +INTERACTION_TIMEOUT = 30 + +# -------------------- 2. 全局状态变量(完全保留原代码2. 全局状态变量,用列表传引用) -------------------- +audio_q = queue.Queue() +last_audio_time = [time.time()] # 列表传引用,供模块修改 +current_text = [""] # 列表传引用,供模块修改 +final_result = [""] # 列表传引用,供模块修改 +is_processing = [False] # 列表传引用,供模块修改 +last_command_time = [time.time()]# 列表传引用,供模块修改 +feedback_playing = False # TTS模块使用的全局变量 + + +# -------------------- 8. 指令解析与执行(完全保留原代码8. 指令解析与执行) -------------------- +def parse_voice_command(command_text: str): + command_text = command_text.strip().lower() + if not command_text: + return ("unknown", {}) + + # 运动指令 + motion_rules = [ + {"keywords": ["前进", "往前走", "向前走"], "action": "move_forward"}, + {"keywords": ["后退", "往后走", "向后退"], "action": "move_backward"}, + {"keywords": ["左转", "向左转", "往左转"], "action": "turn_left"}, + {"keywords": ["右转", "向右转", "往右转"], "action": "turn_right"}, + ] + for rule in motion_rules: + if any(keyword in command_text for keyword in rule["keywords"]): + number_match = re.search(r"(\d{1,2})", command_text) + seconds = int(number_match.group(1)) if number_match else 2 + return ("motion", {"action": rule["action"], "seconds": seconds}) + + # 图像识别指令 + image_keywords = ["是什么", "这是什么", "识别", "看这个", "这东西", "这物体", "辨认"] + if any(keyword in command_text for keyword in image_keywords): + prompt = f"请简洁描述图片中的物体,1-2句话说明:{command_text}" + return ("image_recog", {"prompt": prompt}) + + # 闲聊指令 + chat_keywords = [ + "什么", "怎么", "为什么", "哪里", "多少", "如何", "吗", "呢", "吧", + "你好", "哈喽", "嗨", "今天", "天气", "时间", "故事", "笑话", "知识" + ] + exclude_keywords = ["前进", "后退", "左转", "右转", "识别", "音量", "增大", "减小"] + if len(command_text) >= 2 and any(k in command_text for k in chat_keywords) and not any(k in command_text for k in exclude_keywords): + return ("chat", {"prompt": command_text}) + + # 音量控制指令 + if any(keyword in command_text for keyword in ["增大音量", "声音大一点", "调大音量"]): + return ("volume", {"action": "increase"}) + elif any(keyword in command_text for keyword in ["减小音量", "声音小一点", "调小音量"]): + return ("volume", {"action": "decrease"}) + elif any(keyword in command_text for keyword in ["最大音量", "声音最大"]): + return ("volume", {"action": "max"}) + elif any(keyword in command_text for keyword in ["最小音量", "声音最小", "静音"]): + return ("volume", {"action": "min"}) + + # 未知指令 + return ("unknown", {}) + +def execute_command(command_type: str, params: dict, motion_controller, ark_api_controller, volume_controller): + global is_processing, feedback_playing + if is_processing[0]: + tts_controller.speak(FEEDBACK_TEXT["unknown"]) + print("⚠️ 已有指令处理中,请稍后再说") + return + is_processing[0] = True + + try: + if command_type == "motion": + motion_controller.execute_motion(params["action"], params["seconds"]) + + elif command_type == "image_recog": + print(f"\n🔍 触发图像识别,正在拍摄...") + image_base64 = camera_module.capture_base64() + if not image_base64: + tts_controller.speak(FEEDBACK_TEXT["unknown"]) + print("\n" + "="*50) + print("❌ 图像采集失败,无法识别") + print("="*50 + "\n") + return + ark_api_controller.call_ark_api("image_recog", {"image_base64": image_base64, "prompt": params["prompt"]}) + + elif command_type == "chat": + print(f"\n💬 触发闲聊,正在思考...") + ark_api_controller.call_ark_api("chat", {"prompt": params["prompt"]}) + + elif command_type == "volume": + volume_action = params["action"] + if volume_action == "increase": + success = volume_controller.adjust_volume(is_increase=True) + if success: + tts_controller.speak(FEEDBACK_TEXT["volume_increase"]) + elif volume_action == "decrease": + success = volume_controller.adjust_volume(is_increase=False) + if success: + tts_controller.speak(FEEDBACK_TEXT["volume_decrease"]) + elif volume_action == "max": + success = volume_controller.set_system_volume(MAX_VOLUME) + if success: + tts_controller.speak(FEEDBACK_TEXT["volume_max"]) + elif volume_action == "min": + success = volume_controller.set_system_volume(MIN_VOLUME) + if success: + tts_controller.speak(FEEDBACK_TEXT["volume_min"]) + + elif command_type == "unknown": + tts_controller.speak(FEEDBACK_TEXT["unknown"]) + print("\n" + "="*50) + print(f"❌ 未识别到有效指令,支持:") + print(f" - 运动:前进3秒、左转2秒 | - 图像识别:这是什么") + print(f" - 闲聊:今天天气怎么样 | - 音量:增大音量、减小音量") + print("="*50 + "\n") + + finally: + is_processing[0] = False + + +# -------------------- 11. 主循环(完全保留原代码11. 主循环逻辑) -------------------- +def main(): + global tts_controller, camera_module, AUDIO_CONTROL_NAME, feedback_playing + + # 初始化各模块(按原代码顺序) + # 1. 初始化TTS + try: + tts_controller = BaiduOnlineTTS(BAIDU_TTS_API_KEY, BAIDU_TTS_SECRET_KEY) + except Exception as e: + print(f"❌ TTS初始化失败: {str(e)}") + sys.exit(1) + + # 2. 初始化音量控制 + AUDIO_CONTROL_NAME = detect_audio_control() + volume_controller = VolumeController( + audio_control_name=AUDIO_CONTROL_NAME, + current_volume=CURRENT_VOLUME, + volume_step=VOLUME_STEP, + min_volume=MIN_VOLUME, + max_volume=MAX_VOLUME + ) + + # 3. 初始化运动控制 + motion_controller = RobotMotionController( + onnx_model_path=ONNX_MODEL_PATH, + tts_controller=tts_controller, + feedback_text=FEEDBACK_TEXT + ) + + # 4. 初始化摄像头 + camera_module = CameraModule() + + # 5. 初始化API控制器 + ark_api_controller = ArkAPIController( + ark_api_key=ARK_API_KEY, + ark_model_id=ARK_MODEL_ID, + tts_controller=tts_controller, + feedback_text=FEEDBACK_TEXT + ) + + # 6. 初始化语音识别 + voice_recog_controller = VoiceRecogController( + access_key=ACCESS_KEY, + wakeup_word_path=WAKEUP_WORD_PATH, + model_path=MODEL_PATH, + appid=APPID, + access_key_id=ACCESS_KEY_ID, + access_key_secret=ACCESS_KEY_SECRET, + tts_controller=tts_controller, + feedback_text=FEEDBACK_TEXT + ) + + # 中断处理(完全保留原逻辑) + def handle_interrupt(signum, frame): + print("\n🛑 收到退出信号,正在清理资源...") + # 停止机器人运动 + if 'motion_controller' in globals() and hasattr(motion_controller, 'rl_walk'): + motion_controller.rl_walk.last_commands = [0.0, 0.0, 0.0] + # 停止TTS播放 + global feedback_playing + feedback_playing = False + # 停止摄像头与麦克风 + if 'camera_module' in globals() and camera_module.camera: + camera_module.camera.stop() + if hasattr(voice_recog_controller, 'stream') and voice_recog_controller.stream and voice_recog_controller.stream.active: + voice_recog_controller.stream.stop() + # 关闭TTS资源 + tts_controller.close() + print("✅ 所有资源清理完成,程序退出") + sys.exit(0) + + signal.signal(signal.SIGINT, handle_interrupt) + + # 强制测试一次语音输出(原逻辑) + print("\n🔍 正在测试语音输出...") + tts_controller.speak("系统初始化完成,等待语音唤醒") + + # 主循环(原逻辑) + while True: + if voice_recog_controller.wakeup_listener(): + # 定义指令执行回调函数(关键修复) + def execute_callback(command_text): + command_type, params = parse_voice_command(command_text) + execute_command(command_type, params, motion_controller, ark_api_controller, volume_controller) + + # 启动WebSocket时传入回调函数 + voice_recog_controller.start_websocket( + current_text=current_text, + final_result=final_result, + last_audio_time=last_audio_time, + is_processing=is_processing, + last_command_time=last_command_time, + execute_callback=execute_callback # 传入回调 + ) + # 重置状态 + last_audio_time[0] = time.time() + last_command_time[0] = time.time() + + +if __name__ == "__main__": + # # 确保ffmpeg已安装(原逻辑) + # try: + # subprocess.run(["ffmpeg", "--version"], capture_output=True, check=True) + # except: + # print("⚠️ 未检测到ffmpeg,正在尝试安装...") + # subprocess.run(["sudo", "apt-get", "install", "-y", "ffmpeg"], check=True) + + main() \ No newline at end of file diff --git a/motion_module.py b/motion_module.py index e69de29..afccfe0 100644 --- a/motion_module.py +++ b/motion_module.py @@ -0,0 +1,59 @@ +import sys +import time +import threading + +# 原代码5. 运动控制模块完整逻辑(保留原路径导入) +PROJECT_ROOT = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/Open_Duck_Mini_Runtime-2" +sys.path.append(PROJECT_ROOT) +from v2_rl_walk_mujoco import RLWalk + +class RobotMotionController: + def __init__(self, onnx_model_path, tts_controller, feedback_text): + # 接收调度脚本传入的TTS实例和反馈文本,保持原逻辑调用 + self.tts_controller = tts_controller + self.FEEDBACK_TEXT = feedback_text + try: + self.rl_walk = RLWalk( + onnx_model_path=onnx_model_path, + cutoff_frequency=40, + pid=[30, 0, 0] + ) + self.walk_thread = threading.Thread(target=self.rl_walk.run, daemon=True) + self.walk_thread.start() + time.sleep(1) + print("🤖 运动控制模块初始化成功") + except Exception as e: + print(f"❌ 运动控制失败:{str(e)}") + sys.exit(1) + + def execute_motion(self, action_name: str, seconds: int): + # 全局变量由调度脚本传入,保留原逻辑 + global is_processing + is_processing = True + try: + # 播放反馈(同步执行确保声音输出) + self.tts_controller.speak(self.FEEDBACK_TEXT[action_name]) + + # 执行运动 + seconds = max(2, min(seconds, 5)) + if action_name == "move_forward": + print(f"\n🚶 前进{seconds}秒...") + self.rl_walk.last_commands[0] = 0.17 + elif action_name == "move_backward": + print(f"\n🚶 后退{seconds}秒...") + self.rl_walk.last_commands[0] = -0.17 + elif action_name == "turn_left": + print(f"\n🔄 左转{seconds}秒...") + self.rl_walk.last_commands[2] = 1.1 + elif action_name == "turn_right": + print(f"\n🔄 右转{seconds}秒...") + self.rl_walk.last_commands[2] = -1.1 + + time.sleep(seconds) + self.rl_walk.last_commands = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + print(f"✅ 运动完成") + except Exception as e: + print(f"❌ 运动执行失败:{str(e)}") + self.rl_walk.last_commands = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + finally: + is_processing = False \ No newline at end of file diff --git a/tts_module.py b/tts_module.py index e69de29..effc0d1 100644 --- a/tts_module.py +++ b/tts_module.py @@ -0,0 +1,173 @@ +import pyaudio +import wave +import tempfile +import os +import requests +import time +import sys + +# 原代码3. 百度在线TTS模块完整逻辑 +class BaiduOnlineTTS: + def __init__(self, api_key, secret_key): + """初始化百度在线TTS""" + self.api_key = api_key + self.secret_key = secret_key + self.access_token = None + self.token_expires = 0 + + # 初始化音频播放器 + self.audio_player = pyaudio.PyAudio() + + # TTS配置参数 + self.default_options = { + 'vol': 5, # 音量(0-15) + 'spd': 5, # 语速(0-9) + 'pit': 5, # 音调(0-9) + 'per': 0 # 发音人(0:女,1:男,3:情感女,4:情感男) + } + + # 获取初始访问令牌 + if not self._get_access_token(): + raise Exception("无法获取百度API访问令牌,请检查密钥是否正确") + + def _get_access_token(self): + """获取百度API访问令牌""" + # 检查令牌是否仍然有效 + if self.access_token and time.time() < self.token_expires - 300: + return True + + try: + url = f"https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={self.api_key}&client_secret={self.secret_key}" + response = requests.get(url) + result = response.json() + + if "access_token" in result: + self.access_token = result["access_token"] + self.token_expires = time.time() + result["expires_in"] + print("✅ 成功获取百度API访问令牌") + return True + else: + print(f"❌ 获取令牌失败: {result}") + return False + except Exception as e: + print(f"❌ 获取令牌时发生错误: {str(e)}") + return False + + def text_to_speech(self, text, options=None, save_path=None): + """将文本转换为语音""" + # 确保令牌有效 + if not self._get_access_token(): + return None + + # 合并配置参数 + params = self.default_options.copy() + if options: + params.update(options) + + try: + # 对文本进行URL编码 + encoded_text = requests.utils.quote(text) + url = f"https://tsn.baidu.com/text2audio?tex={encoded_text}&lan=zh&cuid=baidu-tts-python&ctp=1&tok={self.access_token}" + + # 添加合成参数 + for key, value in params.items(): + url += f"&{key}={value}" + + # 发送请求 + response = requests.get(url) + + # 检查响应是否为音频数据 + if response.headers.get("Content-Type", "").startswith("audio/"): + # 保存文件(如果需要) + if save_path: + with open(save_path, "wb") as f: + f.write(response.content) + print(f"✅ 音频已保存至: {save_path}") + + return response.content + else: + # 解析错误信息 + try: + error = response.json() + print(f"❌ 语音合成失败: {error.get('err_msg', '未知错误')}") + except: + print(f"❌ 语音合成失败,响应内容: {response.text}") + return None + + except Exception as e: + print(f"❌ 语音合成时发生错误: {str(e)}") + return None + + def speak(self, text, options=None): + """直接播放文本转换的语音""" + # 全局变量由调度脚本传入,此处保留原逻辑调用 + from main_scheduler import feedback_playing + if feedback_playing: + return False + + feedback_playing = True + + # 限制文本长度(百度API有长度限制) + if len(text) > 1024: + print("⚠️ 文本过长,将截断为1024字符") + text = text[:1024] + + # 获取音频数据 + audio_data = self.text_to_speech(text, options) + if not audio_data: + feedback_playing = False + return False + + try: + # 创建临时MP3文件 + with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: + temp_file.write(audio_data) + temp_filename = temp_file.name + + # 转换为WAV格式(适配pyaudio) + from pydub import AudioSegment + audio = AudioSegment.from_mp3(temp_filename) + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file: + audio.export(wav_file.name, format="wav") + wav_filename = wav_file.name + + # 播放WAV文件 + wf = wave.open(wav_filename, 'rb') + stream = self.audio_player.open( + format=self.audio_player.get_format_from_width(wf.getsampwidth()), + channels=wf.getnchannels(), + rate=wf.getframerate(), + output=True + ) + + # 播放音频 + chunk = 1024 + data = wf.readframes(chunk) + while data and feedback_playing: + stream.write(data) + data = wf.readframes(chunk) + + # 清理资源 + stream.stop_stream() + stream.close() + wf.close() + + print(f"✅ 语音播放完成: {text[:20]}...") + return True + + except Exception as e: + print(f"❌ 播放语音时发生错误: {str(e)}") + return False + + finally: + # 删除临时文件 + if 'temp_filename' in locals() and os.path.exists(temp_filename): + os.remove(temp_filename) + if 'wav_filename' in locals() and os.path.exists(wav_filename): + os.remove(wav_filename) + feedback_playing = False + + def close(self): + """释放资源""" + self.audio_player.terminate() + print("✅ TTS资源已释放") \ No newline at end of file diff --git a/voice_recog_module.py b/voice_recog_module.py index e69de29..9fe2e84 100644 --- a/voice_recog_module.py +++ b/voice_recog_module.py @@ -0,0 +1,226 @@ +import sounddevice as sd +import pvporcupine +import struct +import websocket +import threading +import hmac +import hashlib +import base64 +import json +import time +import urllib.parse +import uuid +import queue +import sys + +# 原代码9. 音频采集与WebSocket + 10. 唤醒词监听完整逻辑 +class VoiceRecogController: + def __init__(self, access_key, wakeup_word_path, model_path, appid, access_key_id, access_key_secret, tts_controller, feedback_text): + # 接收调度脚本传入的参数,保持原逻辑 + self.ACCESS_KEY = access_key + self.WAKEUP_WORD_PATH = wakeup_word_path + self.MODEL_PATH = model_path + self.APPID = appid + self.ACCESS_KEY_ID = access_key_id + self.ACCESS_KEY_SECRET = access_key_secret + self.tts_controller = tts_controller + self.FEEDBACK_TEXT = feedback_text + self.SAMPLE_RATE = 16000 + self.CHANNELS = 1 + self.SAMPLE_FORMAT = "int16" + self.INTERACTION_TIMEOUT = 30 + self.audio_q = queue.Queue() + self.stream = None # 麦克风流后续初始化 + + def wakeup_listener(self): + """原代码10. 唤醒词监听""" + try: + porcupine = pvporcupine.create( + access_key=self.ACCESS_KEY, + keyword_paths=[self.WAKEUP_WORD_PATH], + model_path=self.MODEL_PATH + ) + print(f"\n🎯 唤醒词引擎就绪(采样率:{porcupine.sample_rate})") + + wakeup_mic = sd.RawInputStream( + samplerate=porcupine.sample_rate, + blocksize=porcupine.frame_length, + dtype="int16", + channels=1 + ) + + print("📢 等待唤醒词「小黄鸭」(按Ctrl+C退出)") + with wakeup_mic: + while True: + pcm_data, _ = wakeup_mic.read(porcupine.frame_length) + pcm_unpacked = struct.unpack_from("h" * porcupine.frame_length, pcm_data) + if porcupine.process(pcm_unpacked) >= 0: + print("🚀 检测到唤醒词「小黄鸭」!") + # 播放唤醒反馈(同步执行) + self.tts_controller.speak(self.FEEDBACK_TEXT["wakeup"]) + porcupine.delete() + return True + except Exception as e: + print(f"\n❌ 唤醒词监听失败:{str(e)}") + print(" 排查:1. 唤醒词文件路径 2. 麦克风连接 3. PicoVoice Key有效性") + sys.exit(1) + + def _audio_callback(self, indata, frames, t, status): + """原代码9. 音频采集回调""" + if status: + print(f"⚠️ 音频异常:{status}") + self.audio_q.put(bytes(indata)) + + def _create_ws_url(self): + """原代码9. 创建WebSocket URL""" + try: + host = "office-api-ast-dx.iflyaisol.com" + path = "/ast/communicate/v1" + utc = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime()) + "+0000" + session_uuid = str(uuid.uuid4()) + + params = { + "accessKeyId": self.ACCESS_KEY_ID, + "appId": self.APPID, + "samplerate": self.SAMPLE_RATE, + "audio_encode": "pcm_s16le", + "lang": "autodialect", + "uuid": session_uuid, + "utc": utc, + } + + sorted_params = sorted(params.items(), key=lambda x: x[0]) + base_string = "&".join( + f"{urllib.parse.quote_plus(str(k))}={urllib.parse.quote_plus(str(v))}" + for k, v in sorted_params + ) + signature = hmac.new( + self.ACCESS_KEY_SECRET.encode("utf-8"), + base_string.encode("utf-8"), + hashlib.sha1 + ).digest() + signature = base64.b64encode(signature).decode("utf-8") + + query = base_string + "&signature=" + urllib.parse.quote_plus(signature) + return f"wss://{host}{path}?{query}", session_uuid + except Exception as e: + print(f"❌ WebSocket URL生成失败:{str(e)}") + return None, None + + def _on_message(self, ws, message, current_text, last_audio_time): + """原代码9. WebSocket消息处理(接收全局变量引用)""" + try: + data = json.loads(message) + if data.get("msg_type") == "result" and "cn" in data.get("data", {}): + words = [ + cw.get("w", "") + for rt in data["data"]["cn"].get("st", {}).get("rt", []) + for ws_item in rt.get("ws", []) + for cw in ws_item.get("cw", []) + ] + if words: + current_text[0] = "".join(words) # 用列表传引用,修改全局变量 + last_audio_time[0] = time.time() + print(f"🎧 识别中:{current_text[0]}", end="\r") + except Exception as e: + print(f"\n❌ 语音识别消息处理错误:{str(e)}") + + def _on_error(self, ws, error, is_processing): + """原代码9. WebSocket错误处理""" + if not is_processing[0]: + print(f"\n❌ WebSocket连接错误:{str(error)}") + + def _on_close(self, ws, close_status_code, close_msg, current_text, final_result, stream): + """原代码9. WebSocket关闭处理""" + print(f"\n🔌 WebSocket连接关闭 | 状态码:{close_status_code}") + if stream and stream.active: + stream.stop() + current_text[0] = "" + final_result[0] = "" + + def _on_open(self, ws, stream, current_text, final_result, last_audio_time, is_processing, last_command_time, execute_callback): + """新增 execute_callback 参数,用于接收指令执行函数""" + def send_audio_and_handle(): + print("\n🎤 指令已就绪!支持:") + print(" - 运动:前进3秒、左转2秒 | - 图像识别:这是什么") + print(" - 闲聊:今天天气怎么样 | - 音量:增大音量、减小音量\n") + stream.start() + current_text[0] = "" + final_result[0] = "" + last_command_time[0] = time.time() + + while True: + try: + # 1. 处理音频队列(避免堆积) + while self.audio_q.qsize() > 5: + self.audio_q.get_nowait() + # 2. 发送音频数据(若队列有数据) + audio_data = self.audio_q.get(timeout=0.5) + ws.send(audio_data, websocket.ABNF.OPCODE_BINARY) + + # 3. 指令识别与执行:有文本且2秒内无新音频时执行 + if current_text[0] and (time.time() - last_audio_time[0]) > 2: + final_result[0] = current_text[0].strip() + if len(final_result[0]) > 0: # 确保指令有效 + print(f"\n⏹ 最终指令:{final_result[0]}") + # 调用回调函数执行指令(关键修复:直接在这里执行) + execute_callback(final_result[0]) + last_command_time[0] = time.time() # 更新最后操作时间 + current_text[0] = "" # 执行后清空,避免重复识别 + final_result[0] = "" + time.sleep(1) # 等待指令执行完成 + + # 4. 超时检测:30秒无操作则关闭连接 + if time.time() - last_command_time[0] > self.INTERACTION_TIMEOUT: + print(f"\n⌛ {self.INTERACTION_TIMEOUT}秒无操作,关闭连接") + self.tts_controller.speak(self.FEEDBACK_TEXT.get("wakeup_timeout", "长时间没操作,我先休息啦")) + time.sleep(1) + ws.send("close", websocket.ABNF.OPCODE_TEXT) + break + + except queue.Empty: + # 队列为空时,检测超时 + if time.time() - last_command_time[0] > self.INTERACTION_TIMEOUT: + print(f"\n⌛ {self.INTERACTION_TIMEOUT}秒无操作,关闭连接") + self.tts_controller.speak(self.FEEDBACK_TEXT.get("wakeup_timeout", "长时间没操作,我先休息啦")) + time.sleep(1) + ws.send("close", websocket.ABNF.OPCODE_TEXT) + break + continue # 继续循环等待音频 + except Exception as e: + print(f"\n❌ 音频发送错误:{str(e)}") + break + + audio_thread = threading.Thread(target=send_audio_and_handle, daemon=True) + audio_thread.start() + + def start_websocket(self, current_text, final_result, last_audio_time, is_processing, last_command_time, execute_callback): + """新增 execute_callback 参数,用于传递指令执行函数""" + self.stream = sd.RawInputStream( + samplerate=self.SAMPLE_RATE, + channels=self.CHANNELS, + dtype=self.SAMPLE_FORMAT, + callback=self._audio_callback, + ) + + ws_url, session_id = self._create_ws_url() + if not ws_url: + print("⚠️ 无法生成语音识别连接,3秒后重新监听...") + time.sleep(3) + return + + try: + print(f"🔄 连接语音识别服务(会话ID:{session_id[:8]}...)") + # 绑定WebSocket回调时传入 execute_callback + ws = websocket.WebSocketApp( + ws_url, + on_open=lambda ws: self._on_open(ws, self.stream, current_text, final_result, last_audio_time, is_processing, last_command_time, execute_callback), + on_message=lambda ws, msg: self._on_message(ws, msg, current_text, last_audio_time), + on_error=lambda ws, err: self._on_error(ws, err, is_processing), + on_close=lambda ws, status, msg: self._on_close(ws, status, msg, current_text, final_result, self.stream) + ) + ws.run_forever(ping_interval=10, ping_timeout=5) + except Exception as e: + print(f"❌ 语音识别连接失败:{str(e)}") + print("⚠️ 3秒后重新监听唤醒词...") + time.sleep(3) \ No newline at end of file diff --git a/volume_module.py b/volume_module.py index e69de29..fdf1ab2 100644 --- a/volume_module.py +++ b/volume_module.py @@ -0,0 +1,93 @@ +import subprocess +import re +import sys + +# 原代码4. 音频控制项自动检测与音量控制完整逻辑 +def detect_audio_control(): + """自动检测可用的音频播放控制项""" + try: + result = subprocess.run(["amixer", "controls"], capture_output=True, text=True) + playback_controls = [] + + # 优先查找常见的音频控制项 + priority_names = ["Master", "Speaker", "Headphone", "Audio", "Sound"] + + for name in priority_names: + if name in result.stdout: + print(f"✅ 自动检测到音频控制项:{name}") + return name + + # 如果没有找到优先项,从所有控制项中提取 + for line in result.stdout.splitlines(): + if "Playback" in line: + match = re.search(r"'([^']+)'", line) + if match: + playback_controls.append(match.group(1)) + + if playback_controls: + print(f"✅ 找到音频控制项:{playback_controls[0]}") + return playback_controls[0] + else: + print("⚠️ 未检测到音频控制项,将尝试不指定控制项调节音量") + return None + except Exception as e: + print(f"❌ 音频控制检测失败:{str(e)}") + return None + +class VolumeController: + def __init__(self, audio_control_name, current_volume, volume_step, min_volume, max_volume): + # 接收调度脚本传入的全局参数,保持原逻辑 + self.available = True + self.AUDIO_CONTROL_NAME = audio_control_name + self.CURRENT_VOLUME = current_volume + self.VOLUME_STEP = volume_step + self.MIN_VOLUME = min_volume + self.MAX_VOLUME = max_volume + + try: + # 强制取消静音并设置初始音量 + self.set_system_volume(self.CURRENT_VOLUME) + current_volume = self.get_system_volume() + if current_volume is not None: + self.CURRENT_VOLUME = current_volume + print(f"🔊 音量控制初始化成功(当前音量:{self.CURRENT_VOLUME}%)") + else: + print(f"⚠️ 无法获取当前音量,但已尝试设置为{self.CURRENT_VOLUME}%") + except Exception as e: + self.available = False + print(f"❌ 音量控制失败:{str(e)}") + + def get_system_volume(self): + try: + # 根据检测到的控制项获取音量 + cmd = ["amixer", "get"] + if self.AUDIO_CONTROL_NAME: + cmd.append(self.AUDIO_CONTROL_NAME) + + result = subprocess.run(cmd, capture_output=True, text=True) + volume_match = re.search(r"(\d+)%", result.stdout) + return int(volume_match.group(1)) if volume_match else None + except Exception as e: + print(f"❌ 获取音量失败:{str(e)}") + return None + + def set_system_volume(self, target_volume: int): + target_volume = max(self.MIN_VOLUME, min(self.MAX_VOLUME, target_volume)) + try: + # 根据检测到的控制项设置音量 + cmd = ["amixer", "set"] + if self.AUDIO_CONTROL_NAME: + cmd.append(self.AUDIO_CONTROL_NAME) + cmd.extend([f"{target_volume}%", "unmute"]) # 强制取消静音 + + subprocess.run(cmd, capture_output=True) + self.CURRENT_VOLUME = target_volume + print(f"🔊 音量已调整至:{self.CURRENT_VOLUME}%") + return True + except Exception as e: + print(f"❌ 调整音量失败:{str(e)}") + return False + + def adjust_volume(self, is_increase: bool): + target_volume = self.CURRENT_VOLUME + self.VOLUME_STEP if is_increase else self.CURRENT_VOLUME - self.VOLUME_STEP + return self.set_system_volume(target_volume) \ No newline at end of file