From ab42cc6de546ec9149d5f4a372576ea2bf58ac57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9Cjiawei=E2=80=9D?= <“1073198597@qq.com”>
Date: Mon, 29 Sep 2025 09:19:40 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E8=A1=A5=E5=85=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ark_api_module.py     | 101 +++++++++++++++
 camera_module.py      |  30 +++++
 main_scheduler.py     | 291 ++++++++++++++++++++++++++++++++++++++++++
 motion_module.py      |  59 +++++++++
 tts_module.py         | 173 +++++++++++++++++++++++++
 voice_recog_module.py | 226 ++++++++++++++++++++++++++++++++
 volume_module.py      |  93 ++++++++++++++
 7 files changed, 973 insertions(+)

diff --git a/ark_api_module.py b/ark_api_module.py
index e69de29..4454981 100644
--- a/ark_api_module.py
+++ b/ark_api_module.py
@@ -0,0 +1,101 @@
+from openai import OpenAI
+import time
+import sys
+import queue  # 新增：用于缓存实时文本片段
+import threading  # 新增：用于并行处理语音播放
+# 原代码7. 火山方舟API调用完整逻辑
+class ArkAPIController:
+    """Streams Volcano Ark chat completions to the console and to TTS."""
+
+    def __init__(self, ark_api_key, ark_model_id, tts_controller, feedback_text):
+        """Store credentials and collaborators, then start the speech worker."""
+        self.ARK_API_KEY = ark_api_key
+        self.ARK_MODEL_ID = ark_model_id
+        self.tts_controller = tts_controller
+        self.FEEDBACK_TEXT = feedback_text
+        self.chat_context = []  # alternating user/assistant messages, oldest first
+        self.MAX_CONTEXT_LEN = 10  # conversation turns kept (two messages each)
+        # Reply fragments wait here until the daemon worker speaks them in order.
+        self.speech_queue = queue.Queue()
+        self.speech_thread = threading.Thread(target=self._process_speech_queue, daemon=True)
+        self.speech_thread.start()
+
+    def _process_speech_queue(self):
+        """Speak queued fragments until a None sentinel is received."""
+        while True:
+            text = self.speech_queue.get()  # blocks until an item arrives
+            try:
+                if text is None:
+                    break
+                self.tts_controller.speak(text)
+            except Exception as e:
+                # A dead worker would make every later join() hang, so carry on.
+                print(f"❌ 语音片段播放失败：{str(e)}")
+            finally:
+                # Always balance get(); otherwise speech_queue.join() never returns.
+                self.speech_queue.task_done()
+
+    def call_ark_api(self, content_type: str, content: dict):
+        """Send a "chat" or "image_recog" request and speak the streamed reply;
+        returns the full reply text, or an error-message string on failure."""
+        self.tts_controller.speak(self.FEEDBACK_TEXT[content_type])  # spoken acknowledgement
+        client = OpenAI(
+            base_url="https://ark.cn-beijing.volces.com/api/v3",
+            api_key=self.ARK_API_KEY
+        )
+
+        try:
+            messages = []
+            if content_type == "chat":
+                messages.extend(self.chat_context[-self.MAX_CONTEXT_LEN*2:])
+                messages.append({"role": "user", "content": [{"type": "text", "text": content["prompt"]}]})
+            elif content_type == "image_recog":
+                messages.append({
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{content['image_base64']}"}},
+                        {"type": "text", "text": content["prompt"]}
+                    ]
+                })
+
+            response = client.chat.completions.create(
+                model=self.ARK_MODEL_ID,
+                messages=messages,
+                max_tokens=300,
+                temperature=0.7 if content_type == "chat" else 0.3,
+                stream=True
+            )
+
+            full_response = ""
+            current_speech_chunk = ""  # text not yet handed to the speech worker
+            print("\n" + "="*50)
+            print("🤖 回应：", end="", flush=True)
+            for chunk in response:
+                if chunk.choices and chunk.choices[0].delta.content:
+                    char = chunk.choices[0].delta.content
+                    full_response += char
+                    current_speech_chunk += char
+                    print(char, end="", flush=True)
+                    time.sleep(0.05)
+                    # Queue the fragment as soon as it contains a punctuation mark.
+                    if any(punct in current_speech_chunk for punct in [".", "。", "!", "！", "?", "？", ",", "，", ";", "；"]):
+                        self.speech_queue.put(current_speech_chunk)
+                        current_speech_chunk = ""
+            # Flush whatever trailed after the last punctuation mark.
+            if current_speech_chunk:
+                self.speech_queue.put(current_speech_chunk)
+            print("\n" + "="*50 + "\n")
+            self.speech_queue.join()  # wait until every fragment has been spoken
+            if content_type == "chat" and full_response.strip():
+                self.chat_context.append({"role": "user", "content": [{"type": "text", "text": content["prompt"]}]})
+                self.chat_context.append({"role": "assistant", "content": [{"type": "text", "text": full_response}]})
+                # Only the newest MAX_CONTEXT_LEN turns are ever sent; drop the rest.
+                del self.chat_context[:-self.MAX_CONTEXT_LEN*2]
+            return full_response
+        except Exception as e:
+            error_msg = f"❌ API调用失败：{str(e)}"
+            print(f"\n" + "="*50)
+            print(error_msg)
+            print("="*50 + "\n")
+            self.tts_controller.speak(self.FEEDBACK_TEXT["api_error"])
+            return error_msg
\ No newline at end of file
diff --git a/camera_module.py b/camera_module.py
index e69de29..0bc26d3 100644
--- a/camera_module.py
+++ b/camera_module.py
@@ -0,0 +1,30 @@
+import io
+import base64
+from PIL import Image
+from picamera2 import Picamera2
+import sys
+
+# 原代码6. 摄像头模块完整逻辑
+class CameraModule:
+    """Picamera2 wrapper that returns 320x240 snapshots as base64 JPEG text."""
+    def __init__(self):
+        try:
+            self.camera = Picamera2()
+            self.camera.configure(self.camera.create_still_configuration(main={"size": (320, 240)}))
+            self.camera.start()
+            print("📷 摄像头模块初始化成功")
+        except Exception as err:
+            print(f"❌ 摄像头失败：{str(err)}")
+            self.camera = None  # marks the camera as unavailable for capture_base64()
+
+    def capture_base64(self):
+        """Return one frame as a base64 JPEG string, or None if it cannot be taken."""
+        if self.camera:
+            try:
+                frame = self.camera.capture_array()
+                jpeg_buffer = io.BytesIO()
+                Image.fromarray(frame).save(jpeg_buffer, format="JPEG", quality=80)
+                return base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
+            except Exception as err:
+                print(f"❌ 拍摄失败：{str(err)}")
+        return None
\ No newline at end of file
diff --git a/main_scheduler.py b/main_scheduler.py
index e69de29..b22567f 100644
--- a/main_scheduler.py
+++ b/main_scheduler.py
@@ -0,0 +1,291 @@
+import signal
+import sys
+import time
+import re
+import subprocess
+import queue 
+# 导入所有模块
+from tts_module import BaiduOnlineTTS
+from volume_module import VolumeController, detect_audio_control
+from motion_module import RobotMotionController
+from camera_module import CameraModule
+from ark_api_module import ArkAPIController
+from voice_recog_module import VoiceRecogController
+
+# -------------------- 1. Basic configuration (kept from the original script, section 1) --------------------
+# 1.1 Project path and motion model
+PROJECT_ROOT = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/Open_Duck_Mini_Runtime-2"
+sys.path.append(PROJECT_ROOT)
+ONNX_MODEL_PATH = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/Open_Duck_Mini-2/BEST_WALK_ONNX_2.onnx"
+
+# 1.2 Volcano Ark API configuration
+ARK_API_KEY = "390d517c-129a-41c1-bf3d-458048007b69"  # NOTE(review): credential committed in source - rotate it and load it from the environment
+ARK_MODEL_ID = "doubao-seed-1-6-250615"
+
+# 1.3 Speech recognition and wake-word configuration (NOTE(review): the keys below are committed credentials too)
+APPID = "1ff50710"
+ACCESS_KEY_ID = "a4f43e95ee0a9518d11befac8d31f1d4"
+ACCESS_KEY_SECRET = "YzQ4NTRhZjc2ZTM4MDA1YjM2MmIyNDEy"
+ACCESS_KEY = "e0EQQBoH0HIVU9KrXsmB7CMlVci+GAs2x0Ejtrdp8CTtZmf25rCLaQ=="
+WAKEUP_WORD_PATH = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/resources/xiaohuangya_zh_raspberry-pi_v3_0_0.ppn"
+MODEL_PATH = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/resources/porcupine_params_zh.pv"
+
+# 1.4 Baidu online TTS configuration (NOTE(review): committed credentials)
+BAIDU_TTS_API_KEY = "TnwYZPPvElNushOzfL6vBlUI"
+BAIDU_TTS_SECRET_KEY = "55HeI8VNUMNlkW3t2QRwVtrjumpxjfxk"
+
+# 1.5 Spoken feedback texts, keyed by the action or event they acknowledge
+FEEDBACK_TEXT = {
+    "wakeup": "你好呀，有什么吩咐",
+    "move_forward": "好的，我正在前进",
+    "move_backward": "好的，我正在后退",
+    "turn_left": "好的，我正在左转",
+    "turn_right": "好的，我正在右转",
+    "image_recog": "好的，我来识别一下",
+    "chat": "好的，我来想想",
+    "volume_increase": "音量已增大",
+    "volume_decrease": "音量已减小",
+    "volume_max": "已调至最大音量",
+    "volume_min": "已调至最小音量",
+    "unknown": "抱歉，没听懂，请再说一次",
+    "api_error": "抱歉，处理请求时出错了"
+}
+
+# 1.6 Audio parameters (passed to VolumeController in main())
+VOLUME_STEP = 10
+MIN_VOLUME = 0
+MAX_VOLUME = 100
+CURRENT_VOLUME = 40
+AUDIO_CONTROL_NAME = None  # filled in by detect_audio_control() in main()
+
+# 1.7 Microphone and speaker parameters (the modules define their own; kept here for consistency)
+SAMPLE_RATE = 16000
+CHANNELS = 1
+SAMPLE_FORMAT = "int16"
+AUDIO_ENCODE = "pcm_s16le"
+LANG = "autodialect"
+INTERACTION_TIMEOUT = 30
+
+# -------------------- 2. Global state (kept from the original script, section 2; one-element lists act as shared references) --------------------
+audio_q = queue.Queue()
+last_audio_time = [time.time()]  # one-element list so other modules can update it in place
+current_text = [""]               # one-element list so other modules can update it in place
+final_result = [""]              # one-element list so other modules can update it in place
+is_processing = [False]          # one-element list so other modules can update it in place
+last_command_time = [time.time()]# one-element list so other modules can update it in place
+feedback_playing = False         # playback flag; the TTS module imports this name
+
+
+# -------------------- 8. 指令解析与执行（完全保留原代码8. 指令解析与执行） --------------------
+def parse_voice_command(command_text: str):
+    """Map recognized speech to a (command_type, params) tuple.
+
+    Priority: motion, image recognition, volume, chat, else ("unknown", {}).
+    """
+    command_text = command_text.strip().lower()
+    if not command_text:
+        return ("unknown", {})
+    # Motion: the first 1-2 digit number in the utterance is the duration (default 2).
+    motion_rules = [
+        {"keywords": ["前进", "往前走", "向前走"], "action": "move_forward"},
+        {"keywords": ["后退", "往后走", "向后退"], "action": "move_backward"},
+        {"keywords": ["左转", "向左转", "往左转"], "action": "turn_left"},
+        {"keywords": ["右转", "向右转", "往右转"], "action": "turn_right"},
+    ]
+    for rule in motion_rules:
+        if any(keyword in command_text for keyword in rule["keywords"]):
+            number_match = re.search(r"(\d{1,2})", command_text)
+            seconds = int(number_match.group(1)) if number_match else 2
+            return ("motion", {"action": rule["action"], "seconds": seconds})
+
+    # Image recognition: the utterance is appended to a fixed prompt.
+    image_keywords = ["是什么", "这是什么", "识别", "看这个", "这东西", "这物体", "辨认"]
+    if any(keyword in command_text for keyword in image_keywords):
+        prompt = f"请简洁描述图片中的物体，1-2句话说明：{command_text}"
+        return ("image_recog", {"prompt": prompt})
+
+    # Volume: matched before chat so "声音大一点吧" is not taken for small talk.
+    volume_rules = [
+        ("increase", ["增大音量", "声音大一点", "调大音量"]),
+        ("decrease", ["减小音量", "声音小一点", "调小音量"]),
+        ("max", ["最大音量", "声音最大"]),
+        ("min", ["最小音量", "声音最小", "静音"]),
+    ]
+    for action, keywords in volume_rules:
+        if any(keyword in command_text for keyword in keywords):
+            return ("volume", {"action": action})
+
+    # Chat: needs a chat keyword and none of the command words below.
+    chat_keywords = ["什么", "怎么", "为什么", "哪里", "多少", "如何", "吗", "呢", "吧",
+                     "你好", "哈喽", "嗨", "今天", "天气", "时间", "故事", "笑话", "知识"]
+    exclude_keywords = ["前进", "后退", "左转", "右转", "识别", "音量", "增大", "减小"]
+    if len(command_text) >= 2 and any(k in command_text for k in chat_keywords) and not any(k in command_text for k in exclude_keywords):
+        return ("chat", {"prompt": command_text})
+    return ("unknown", {})
+
+def execute_command(command_type: str, params: dict, motion_controller, ark_api_controller, volume_controller):
+    """Run one parsed command; a command that arrives while another runs is refused.
+
+    Uses the module globals is_processing, tts_controller, camera_module,
+    FEEDBACK_TEXT, MAX_VOLUME and MIN_VOLUME.
+    """
+    if is_processing[0]:
+        tts_controller.speak(FEEDBACK_TEXT["unknown"])
+        print("⚠️  已有指令处理中，请稍后再说")
+        return
+    is_processing[0] = True
+
+    try:
+        if command_type == "motion":
+            motion_controller.execute_motion(params["action"], params["seconds"])
+            return
+
+        if command_type == "image_recog":
+            print(f"\n🔍 触发图像识别，正在拍摄...")
+            image_base64 = camera_module.capture_base64()
+            if image_base64:
+                ark_api_controller.call_ark_api("image_recog", {"image_base64": image_base64, "prompt": params["prompt"]})
+                return
+            tts_controller.speak(FEEDBACK_TEXT["unknown"])
+            print("\n" + "="*50)
+            print("❌ 图像采集失败，无法识别")
+            print("="*50 + "\n")
+            return
+
+        if command_type == "chat":
+            print(f"\n💬 触发闲聊，正在思考...")
+            ark_api_controller.call_ark_api("chat", {"prompt": params["prompt"]})
+            return
+
+        if command_type == "volume":
+            volume_actions = {  # action -> (change to apply, feedback key on success)
+                "increase": (lambda: volume_controller.adjust_volume(is_increase=True), "volume_increase"),
+                "decrease": (lambda: volume_controller.adjust_volume(is_increase=False), "volume_decrease"),
+                "max": (lambda: volume_controller.set_system_volume(MAX_VOLUME), "volume_max"),
+                "min": (lambda: volume_controller.set_system_volume(MIN_VOLUME), "volume_min"),
+            }
+            if params["action"] in volume_actions:
+                apply_change, feedback_key = volume_actions[params["action"]]
+                if apply_change():
+                    tts_controller.speak(FEEDBACK_TEXT[feedback_key])
+            return
+
+        if command_type == "unknown":
+            tts_controller.speak(FEEDBACK_TEXT["unknown"])
+            print("\n" + "="*50)
+            print(f"❌ 未识别到有效指令，支持：")
+            print(f"  - 运动：前进3秒、左转2秒 |  - 图像识别：这是什么")
+            print(f"  - 闲聊：今天天气怎么样 |  - 音量：增大音量、减小音量")
+            print("="*50 + "\n")
+    finally:
+        is_processing[0] = False  # released on every path, including the early returns
+
+
+# -------------------- 11. 主循环（完全保留原代码11. 主循环逻辑） --------------------
+def main():
+    """Initialise every module, install the Ctrl+C handler and serve wake-ups forever."""
+    global tts_controller, camera_module, AUDIO_CONTROL_NAME, feedback_playing
+
+    # 1. TTS comes first; start-up is aborted when it cannot be created.
+    try:
+        tts_controller = BaiduOnlineTTS(BAIDU_TTS_API_KEY, BAIDU_TTS_SECRET_KEY)
+    except Exception as e:
+        print(f"❌ TTS初始化失败: {str(e)}")
+        sys.exit(1)
+
+    # 2. Volume control
+    AUDIO_CONTROL_NAME = detect_audio_control()
+    volume_controller = VolumeController(
+        audio_control_name=AUDIO_CONTROL_NAME,
+        current_volume=CURRENT_VOLUME,
+        volume_step=VOLUME_STEP,
+        min_volume=MIN_VOLUME,
+        max_volume=MAX_VOLUME
+    )
+
+    # 3. Motion control
+    motion_controller = RobotMotionController(
+        onnx_model_path=ONNX_MODEL_PATH,
+        tts_controller=tts_controller,
+        feedback_text=FEEDBACK_TEXT
+    )
+
+    # 4. Camera
+    camera_module = CameraModule()
+
+    # 5. Ark API controller
+    ark_api_controller = ArkAPIController(
+        ark_api_key=ARK_API_KEY,
+        ark_model_id=ARK_MODEL_ID,
+        tts_controller=tts_controller,
+        feedback_text=FEEDBACK_TEXT
+    )
+
+    # 6. Speech recognition
+    voice_recog_controller = VoiceRecogController(
+        access_key=ACCESS_KEY,
+        wakeup_word_path=WAKEUP_WORD_PATH,
+        model_path=MODEL_PATH,
+        appid=APPID,
+        access_key_id=ACCESS_KEY_ID,
+        access_key_secret=ACCESS_KEY_SECRET,
+        tts_controller=tts_controller,
+        feedback_text=FEEDBACK_TEXT
+    )
+
+    def handle_interrupt(signum, frame):
+        """SIGINT handler: stop motion, playback, camera and microphone, then exit."""
+        print("\n🛑 收到退出信号，正在清理资源...")
+        # motion_controller is a local of main(), reached here through the closure.
+        # The former `in globals()` test was always False, so motion never stopped.
+        # The command list is rebuilt at its current length instead of a fixed 3.
+        if hasattr(motion_controller, 'rl_walk'):
+            commands = motion_controller.rl_walk.last_commands
+            motion_controller.rl_walk.last_commands = [0.0] * len(commands)
+        global feedback_playing
+        feedback_playing = False
+        # camera_module is a module global assigned above, so it always exists here.
+        if camera_module.camera:
+            camera_module.camera.stop()
+        if hasattr(voice_recog_controller, 'stream') and voice_recog_controller.stream and voice_recog_controller.stream.active:
+            voice_recog_controller.stream.stop()
+        tts_controller.close()
+        print("✅ 所有资源清理完成，程序退出")
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, handle_interrupt)
+
+    # Speak once at start-up so a silent speaker is noticed immediately.
+    print("\n🔍 正在测试语音输出...")
+    tts_controller.speak("系统初始化完成,等待语音唤醒")
+
+    def execute_callback(command_text):
+        """Parse one recognised utterance and run the resulting command."""
+        command_type, params = parse_voice_command(command_text)
+        execute_command(command_type, params, motion_controller, ark_api_controller, volume_controller)
+
+    while True:
+        if voice_recog_controller.wakeup_listener():
+            # One recognition session per wake-up; commands arrive via the callback.
+            voice_recog_controller.start_websocket(
+                current_text=current_text,
+                final_result=final_result,
+                last_audio_time=last_audio_time,
+                is_processing=is_processing,
+                last_command_time=last_command_time,
+                execute_callback=execute_callback
+            )
+            # Restart the idle timers for the next wake-up.
+            last_audio_time[0] = time.time()
+            last_command_time[0] = time.time()
+
+
+if __name__ == "__main__":
+    # # Disabled: ensure ffmpeg is installed (from the original script)
+    # try:
+    #     subprocess.run(["ffmpeg", "--version"], capture_output=True, check=True)
+    # except:
+    #     print("⚠️  未检测到ffmpeg，正在尝试安装...")
+    #     subprocess.run(["sudo", "apt-get", "install", "-y", "ffmpeg"], check=True)
+    
+    main()
\ No newline at end of file
diff --git a/motion_module.py b/motion_module.py
index e69de29..afccfe0 100644
--- a/motion_module.py
+++ b/motion_module.py
@@ -0,0 +1,59 @@
+import sys
+import time
+import threading
+
+# 原代码5. 运动控制模块完整逻辑（保留原路径导入）
+PROJECT_ROOT = "/home/duckpi/open_duck_mini_ws/OPEN_DUCK_MINI/Open_Duck_Mini_Runtime-2"
+sys.path.append(PROJECT_ROOT)
+from v2_rl_walk_mujoco import RLWalk
+
+class RobotMotionController:
+    """Controls RLWalk by writing values into its last_commands list."""
+    def __init__(self, onnx_model_path, tts_controller, feedback_text):
+        """Start RLWalk.run on a daemon thread; exits the process if that fails."""
+        self.tts_controller = tts_controller
+        self.FEEDBACK_TEXT = feedback_text
+        try:
+            self.rl_walk = RLWalk(
+                onnx_model_path=onnx_model_path,
+                cutoff_frequency=40,
+                pid=[30, 0, 0]
+            )
+            self.walk_thread = threading.Thread(target=self.rl_walk.run, daemon=True)
+            self.walk_thread.start()
+            time.sleep(1)
+            print("🤖 运动控制模块初始化成功")
+        except Exception as err:
+            print(f"❌ 运动控制失败：{str(err)}")
+            sys.exit(1)
+
+    def execute_motion(self, action_name: str, seconds: int):
+        """Announce the action, hold its command value for 2-5 seconds, then reset."""
+        # NOTE(review): this binds a bool named is_processing in this module only;
+        # it is not the list-based flag the scheduler passes around - confirm intent.
+        global is_processing
+        is_processing = True
+        try:
+            self.tts_controller.speak(self.FEEDBACK_TEXT[action_name])
+
+            seconds = max(2, min(seconds, 5))
+            # action -> (index into last_commands, value to write, console banner)
+            moves = {
+                "move_forward": (0, 0.17, f"\n🚶 前进{seconds}秒..."),
+                "move_backward": (0, -0.17, f"\n🚶 后退{seconds}秒..."),
+                "turn_left": (2, 1.1, f"\n🔄 左转{seconds}秒..."),
+                "turn_right": (2, -1.1, f"\n🔄 右转{seconds}秒..."),
+            }
+            if action_name in moves:
+                slot, value, banner = moves[action_name]
+                print(banner)
+                self.rl_walk.last_commands[slot] = value
+
+            time.sleep(seconds)
+            self.rl_walk.last_commands = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+            print(f"✅ 运动完成")
+        except Exception as err:
+            print(f"❌ 运动执行失败：{str(err)}")
+            self.rl_walk.last_commands = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        finally:
+            is_processing = False
\ No newline at end of file
diff --git a/tts_module.py b/tts_module.py
index e69de29..effc0d1 100644
--- a/tts_module.py
+++ b/tts_module.py
@@ -0,0 +1,173 @@
+import pyaudio
+import wave
+import tempfile
+import os
+import requests
+import time
+import sys
+
+# 原代码3. 百度在线TTS模块完整逻辑
+class BaiduOnlineTTS:
+    """Baidu online text-to-speech client that plays its output through PyAudio."""
+
+    def __init__(self, api_key, secret_key):
+        """Create the audio player and fetch the first access token.
+
+        Raises Exception when no token can be obtained.
+        """
+        self.api_key = api_key
+        self.secret_key = secret_key
+        self.access_token = None
+        self.token_expires = 0
+        # True while speak() is busy; clearing it ends playback at the next chunk.
+        self.feedback_playing = False
+
+        self.audio_player = pyaudio.PyAudio()
+
+        # Default synthesis parameters
+        self.default_options = {
+            'vol': 5,    # volume (0-15)
+            'spd': 5,    # speed (0-9)
+            'pit': 5,    # pitch (0-9)
+            'per': 0     # voice (0: female, 1: male, 3: emotional female, 4: emotional male)
+        }
+
+        if not self._get_access_token():
+            raise Exception("无法获取百度API访问令牌，请检查密钥是否正确")
+
+    def _get_access_token(self):
+        """Make sure a token valid for five more minutes is held; True on success."""
+        if self.access_token and time.time() < self.token_expires - 300:
+            return True
+
+        try:
+            url = f"https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={self.api_key}&client_secret={self.secret_key}"
+            # A timeout keeps an unreachable server from blocking the caller forever.
+            response = requests.get(url, timeout=10)
+            result = response.json()
+
+            if "access_token" in result:
+                self.access_token = result["access_token"]
+                self.token_expires = time.time() + result["expires_in"]
+                print("✅ 成功获取百度API访问令牌")
+                return True
+            else:
+                print(f"❌ 获取令牌失败: {result}")
+                return False
+        except Exception as e:
+            print(f"❌ 获取令牌时发生错误: {str(e)}")
+            return False
+
+    def text_to_speech(self, text, options=None, save_path=None):
+        """Synthesize text; return the audio bytes, or None on any failure.
+
+        options overrides entries of default_options; when save_path is given
+        the audio bytes are also written to that file.
+        """
+        if not self._get_access_token():
+            return None
+
+        params = self.default_options.copy()
+        if options:
+            params.update(options)
+
+        try:
+            encoded_text = requests.utils.quote(text)
+            url = f"https://tsn.baidu.com/text2audio?tex={encoded_text}&lan=zh&cuid=baidu-tts-python&ctp=1&tok={self.access_token}"
+            for key, value in params.items():
+                url += f"&{key}={value}"
+
+            response = requests.get(url, timeout=10)
+
+            # The reply is classified by Content-Type: audio/* is a success.
+            if response.headers.get("Content-Type", "").startswith("audio/"):
+                if save_path:
+                    with open(save_path, "wb") as f:
+                        f.write(response.content)
+                    print(f"✅ 音频已保存至: {save_path}")
+                return response.content
+
+            # Anything else is reported as an error, as JSON when it parses.
+            try:
+                error = response.json()
+                print(f"❌ 语音合成失败: {error.get('err_msg', '未知错误')}")
+            except ValueError:
+                print(f"❌ 语音合成失败，响应内容: {response.text}")
+            return None
+
+        except Exception as e:
+            print(f"❌ 语音合成时发生错误: {str(e)}")
+            return None
+
+    def speak(self, text, options=None):
+        """Synthesize text and play it, blocking until playback ends.
+
+        Returns True after playback, False when another speak() is running,
+        synthesis produced no audio, or playback raised.
+        """
+        # The busy flag lives on the instance. It used to be imported from
+        # main_scheduler inside this method, which only bound a local copy: the
+        # guard below never fired and no other code could interrupt playback.
+        if getattr(self, "feedback_playing", False):
+            return False
+
+        # Baidu limits the text length per request.
+        if len(text) > 1024:
+            print("⚠️ 文本过长，将截断为1024字符")
+            text = text[:1024]
+
+        self.feedback_playing = True
+        temp_filename = wav_filename = stream = wf = None
+        try:
+            audio_data = self.text_to_speech(text, options)
+            if not audio_data:
+                return False
+
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
+                temp_filename = temp_file.name  # recorded first so cleanup always sees it
+                temp_file.write(audio_data)
+
+            # Playback below reads WAV frames, so convert the MP3 first.
+            from pydub import AudioSegment
+            audio = AudioSegment.from_mp3(temp_filename)
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
+                wav_filename = wav_file.name
+                audio.export(wav_filename, format="wav")
+
+            wf = wave.open(wav_filename, 'rb')
+            stream = self.audio_player.open(
+                format=self.audio_player.get_format_from_width(wf.getsampwidth()),
+                channels=wf.getnchannels(),
+                rate=wf.getframerate(),
+                output=True
+            )
+
+            chunk = 1024
+            data = wf.readframes(chunk)
+            while data and self.feedback_playing:
+                stream.write(data)
+                data = wf.readframes(chunk)
+
+            print(f"✅ 语音播放完成: {text[:20]}...")
+            return True
+
+        except Exception as e:
+            print(f"❌ 播放语音时发生错误: {str(e)}")
+            return False
+        finally:
+            # Runs on every exit path so no stream, file handle or temp file leaks.
+            self.feedback_playing = False
+            if stream is not None:
+                stream.stop_stream()
+                stream.close()
+            if wf is not None:
+                wf.close()
+            for path in (temp_filename, wav_filename):
+                if path and os.path.exists(path):
+                    os.remove(path)
+
+    def close(self):
+        """Stop any playback in progress and terminate PyAudio."""
+        self.feedback_playing = False
+        self.audio_player.terminate()
+        print("✅ TTS资源已释放")
\ No newline at end of file
diff --git a/voice_recog_module.py b/voice_recog_module.py
index e69de29..9fe2e84 100644
--- a/voice_recog_module.py
+++ b/voice_recog_module.py
@@ -0,0 +1,226 @@
+import sounddevice as sd
+import pvporcupine
+import struct
+import websocket
+import threading
+import hmac
+import hashlib
+import base64
+import json
+import time
+import urllib.parse
+import uuid
+import queue
+import sys
+
+# 原代码9. 音频采集与WebSocket + 10. 唤醒词监听完整逻辑
+class VoiceRecogController:  # wake-word detection plus streaming speech recognition over WebSocket
+    def __init__(self, access_key, wakeup_word_path, model_path, appid, access_key_id, access_key_secret, tts_controller, feedback_text):
+        # Store the settings and collaborators handed in by the scheduler.
+        self.ACCESS_KEY = access_key
+        self.WAKEUP_WORD_PATH = wakeup_word_path
+        self.MODEL_PATH = model_path
+        self.APPID = appid
+        self.ACCESS_KEY_ID = access_key_id
+        self.ACCESS_KEY_SECRET = access_key_secret
+        self.tts_controller = tts_controller
+        self.FEEDBACK_TEXT = feedback_text
+        self.SAMPLE_RATE = 16000
+        self.CHANNELS = 1
+        self.SAMPLE_FORMAT = "int16"
+        self.INTERACTION_TIMEOUT = 30  # seconds; compared against time.time() differences
+        self.audio_q = queue.Queue()
+        self.stream = None  # microphone stream, created in start_websocket()
+
+    def wakeup_listener(self):
+        """Listen until the wake word is heard, speak the feedback and return True; exits the process on error."""
+        try:
+            porcupine = pvporcupine.create(
+                access_key=self.ACCESS_KEY,
+                keyword_paths=[self.WAKEUP_WORD_PATH],
+                model_path=self.MODEL_PATH
+            )
+            print(f"\n🎯 唤醒词引擎就绪（采样率：{porcupine.sample_rate}）")
+
+            wakeup_mic = sd.RawInputStream(
+                samplerate=porcupine.sample_rate,
+                blocksize=porcupine.frame_length,
+                dtype="int16",
+                channels=1
+            )
+
+            print("📢 等待唤醒词「小黄鸭」（按Ctrl+C退出）")
+            with wakeup_mic:
+                while True:
+                    pcm_data, _ = wakeup_mic.read(porcupine.frame_length)
+                    pcm_unpacked = struct.unpack_from("h" * porcupine.frame_length, pcm_data)
+                    if porcupine.process(pcm_unpacked) >= 0:
+                        print("🚀 检测到唤醒词「小黄鸭」！")
+                        # Speak the wake-up feedback before returning.
+                        self.tts_controller.speak(self.FEEDBACK_TEXT["wakeup"])
+                        porcupine.delete()
+                        return True
+        except Exception as e:
+            print(f"\n❌ 唤醒词监听失败：{str(e)}")
+            print("   排查：1. 唤醒词文件路径 2. 麦克风连接 3. PicoVoice Key有效性")
+            sys.exit(1)
+
+    def _audio_callback(self, indata, frames, t, status):
+        """Stream callback: print any status flag and queue the captured bytes."""
+        if status:
+            print(f"⚠️  音频异常：{status}")
+        self.audio_q.put(bytes(indata))
+
+    def _create_ws_url(self):
+        """Build the signed WebSocket URL; returns (url, session_uuid), or (None, None) on error."""
+        try:
+            host = "office-api-ast-dx.iflyaisol.com"
+            path = "/ast/communicate/v1"
+            utc = time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime()) + "+0000"
+            session_uuid = str(uuid.uuid4())
+
+            params = {
+                "accessKeyId": self.ACCESS_KEY_ID,
+                "appId": self.APPID,
+                "samplerate": self.SAMPLE_RATE,
+                "audio_encode": "pcm_s16le",
+                "lang": "autodialect",
+                "uuid": session_uuid,
+                "utc": utc,
+            }
+
+            sorted_params = sorted(params.items(), key=lambda x: x[0])
+            base_string = "&".join(
+                f"{urllib.parse.quote_plus(str(k))}={urllib.parse.quote_plus(str(v))}"
+                for k, v in sorted_params
+            )
+            signature = hmac.new(
+                self.ACCESS_KEY_SECRET.encode("utf-8"),
+                base_string.encode("utf-8"),
+                hashlib.sha1
+            ).digest()
+            signature = base64.b64encode(signature).decode("utf-8")
+
+            query = base_string + "&signature=" + urllib.parse.quote_plus(signature)
+            return f"wss://{host}{path}?{query}", session_uuid
+        except Exception as e:
+            print(f"❌ WebSocket URL生成失败：{str(e)}")
+            return None, None
+
+    def _on_message(self, ws, message, current_text, last_audio_time):
+        """Store the words of a "result" message in current_text[0] and refresh last_audio_time[0]."""
+        try:
+            data = json.loads(message)
+            if data.get("msg_type") == "result" and "cn" in data.get("data", {}):
+                words = [
+                    cw.get("w", "") 
+                    for rt in data["data"]["cn"].get("st", {}).get("rt", [])
+                    for ws_item in rt.get("ws", [])
+                    for cw in ws_item.get("cw", [])
+                ]
+                if words:
+                    current_text[0] = "".join(words)  # written through the caller's one-element list
+                    last_audio_time[0] = time.time()
+                    print(f"🎧 识别中：{current_text[0]}", end="\r")
+        except Exception as e:
+            print(f"\n❌ 语音识别消息处理错误：{str(e)}")
+
+    def _on_error(self, ws, error, is_processing):
+        """Print a WebSocket error unless a command is currently being processed."""
+        if not is_processing[0]:
+            print(f"\n❌ WebSocket连接错误：{str(error)}")
+
+    def _on_close(self, ws, close_status_code, close_msg, current_text, final_result, stream):
+        """Report the close, stop the stream if it is active and clear both text holders."""
+        print(f"\n🔌 WebSocket连接关闭 | 状态码：{close_status_code}")
+        if stream and stream.active:
+            stream.stop()
+        current_text[0] = ""
+        final_result[0] = ""
+
+    def _on_open(self, ws, stream, current_text, final_result, last_audio_time, is_processing, last_command_time, execute_callback):
+        """Start a daemon thread that streams microphone audio and runs recognised commands via execute_callback."""
+        def send_audio_and_handle():
+            print("\n🎤 指令已就绪！支持：")
+            print("  - 运动：前进3秒、左转2秒 |  - 图像识别：这是什么")
+            print("  - 闲聊：今天天气怎么样 |  - 音量：增大音量、减小音量\n")
+            stream.start()
+            current_text[0] = ""
+            final_result[0] = ""
+            last_command_time[0] = time.time()
+
+            while True:
+                try:
+                    # 1. Discard the oldest chunks while more than 5 are queued.
+                    while self.audio_q.qsize() > 5:
+                        self.audio_q.get_nowait()
+                    # 2. Send the next chunk (waits up to 0.5 s for one).
+                    audio_data = self.audio_q.get(timeout=0.5)
+                    ws.send(audio_data, websocket.ABNF.OPCODE_BINARY)
+
+                    # 3. Run the command once text exists and no new result arrived for 2 s.
+                    if current_text[0] and (time.time() - last_audio_time[0]) > 2:
+                        final_result[0] = current_text[0].strip()
+                        if len(final_result[0]) > 0:  # skip whitespace-only text
+                            print(f"\n⏹ 最终指令：{final_result[0]}")
+                            # Execute right here through the caller-supplied callback.
+                            execute_callback(final_result[0])
+                            last_command_time[0] = time.time()  # restart the idle timer
+                        current_text[0] = ""  # cleared so the same text is not executed twice
+                        final_result[0] = ""
+                        time.sleep(1)  # short pause before streaming resumes
+
+                    # 4. Idle timeout: end the session after INTERACTION_TIMEOUT seconds without a command.
+                    if time.time() - last_command_time[0] > self.INTERACTION_TIMEOUT:
+                        print(f"\n⌛ {self.INTERACTION_TIMEOUT}秒无操作，关闭连接")
+                        self.tts_controller.speak(self.FEEDBACK_TEXT.get("wakeup_timeout", "长时间没操作，我先休息啦"))
+                        time.sleep(1)
+                        ws.send("close", websocket.ABNF.OPCODE_TEXT)
+                        break
+
+                except queue.Empty:
+                    # No audio arrived within 0.5 s: still enforce the idle timeout.
+                    if time.time() - last_command_time[0] > self.INTERACTION_TIMEOUT:
+                        print(f"\n⌛ {self.INTERACTION_TIMEOUT}秒无操作，关闭连接")
+                        self.tts_controller.speak(self.FEEDBACK_TEXT.get("wakeup_timeout", "长时间没操作，我先休息啦"))
+                        time.sleep(1)
+                        ws.send("close", websocket.ABNF.OPCODE_TEXT)
+                        break
+                    continue  # keep waiting for audio
+                except Exception as e:
+                    print(f"\n❌ 音频发送错误：{str(e)}")
+                    break
+
+        audio_thread = threading.Thread(target=send_audio_and_handle, daemon=True)
+        audio_thread.start()
+
+    def start_websocket(self, current_text, final_result, last_audio_time, is_processing, last_command_time, execute_callback):
+        """Create the microphone stream and run one WebSocket recognition session via run_forever()."""
+        self.stream = sd.RawInputStream(
+            samplerate=self.SAMPLE_RATE,
+            channels=self.CHANNELS,
+            dtype=self.SAMPLE_FORMAT,
+            callback=self._audio_callback,
+        )
+
+        ws_url, session_id = self._create_ws_url()
+        if not ws_url:
+            print("⚠️  无法生成语音识别连接，3秒后重新监听...")
+            time.sleep(3)
+            return
+
+        try:
+            print(f"🔄 连接语音识别服务（会话ID：{session_id[:8]}...）")
+            # The lambdas bind the shared state lists and execute_callback into each handler.
+            ws = websocket.WebSocketApp(
+                ws_url,
+                on_open=lambda ws: self._on_open(ws, self.stream, current_text, final_result, last_audio_time, is_processing, last_command_time, execute_callback),
+                on_message=lambda ws, msg: self._on_message(ws, msg, current_text, last_audio_time),
+                on_error=lambda ws, err: self._on_error(ws, err, is_processing),
+                on_close=lambda ws, status, msg: self._on_close(ws, status, msg, current_text, final_result, self.stream)
+            )
+            ws.run_forever(ping_interval=10, ping_timeout=5)
+        except Exception as e:
+            print(f"❌ 语音识别连接失败：{str(e)}")
+            print("⚠️  3秒后重新监听唤醒词...")
+            time.sleep(3)
\ No newline at end of file
diff --git a/volume_module.py b/volume_module.py
index e69de29..fdf1ab2 100644
--- a/volume_module.py
+++ b/volume_module.py
@@ -0,0 +1,93 @@
+import subprocess
+import re
+import sys
+
+# 原代码4. 音频控制项自动检测与音量控制完整逻辑
+def detect_audio_control():
+    """Return the amixer simple-control name to use for playback volume, or None if none is found or amixer fails."""
+    try:
+        result = subprocess.run(["amixer", "controls"], capture_output=True, text=True)
+        # Strip element suffixes to get the simple-control name that `amixer get/set` take,
+        # e.g. 'PCM Playback Volume' -> 'PCM'. NOTE(review): suffix list follows alsa-lib's simple mixer; confirm.
+        suffix = re.compile(r"(?: (?:Playback|Capture))? (?:Enum|Switch|Route|Volume)$")
+        simple_names, playback_controls = [], []
+        for line in result.stdout.splitlines():
+            match = re.search(r"'([^']+)'", line)
+            if not match:
+                continue
+            simple = suffix.sub("", match.group(1))
+            simple_names.append(simple)
+            if "Playback" in line:
+                playback_controls.append(simple)
+
+        # Prefer well-known controls, matched exactly so 'Front Speaker' is not reported as 'Speaker'.
+        for name in ["Master", "Speaker", "Headphone", "Audio", "Sound"]:
+            if name in simple_names:
+                print(f"✅ 自动检测到音频控制项：{name}")
+                return name
+        if playback_controls:
+            print(f"✅ 找到音频控制项：{playback_controls[0]}")
+            return playback_controls[0]
+        print("⚠️  未检测到音频控制项，将尝试不指定控制项调节音量")
+        return None
+    except Exception as e:
+        print(f"❌ 音频控制检测失败：{str(e)}")
+        return None
+
+class VolumeController:
+    """Read and change the system playback volume through `amixer`, clamped to [MIN_VOLUME, MAX_VOLUME]."""
+    def __init__(self, audio_control_name, current_volume, volume_step, min_volume, max_volume):
+        """Store the settings, apply `current_volume` (unmuting), and clear `available` if that fails."""
+        self.available = True
+        self.AUDIO_CONTROL_NAME = audio_control_name
+        self.CURRENT_VOLUME = current_volume
+        self.VOLUME_STEP = volume_step
+        self.MIN_VOLUME = min_volume
+        self.MAX_VOLUME = max_volume
+        try:
+            if not self.set_system_volume(self.CURRENT_VOLUME):
+                self.available = False  # set_system_volume has already printed the reason
+                return
+            reported = self.get_system_volume()
+            if reported is not None:
+                self.CURRENT_VOLUME = reported
+                print(f"🔊 音量控制初始化成功（当前音量：{self.CURRENT_VOLUME}%）")
+            else:
+                print(f"⚠️  无法获取当前音量，但已尝试设置为{self.CURRENT_VOLUME}%")
+        except Exception as e:
+            self.available = False
+            print(f"❌ 音量控制失败：{str(e)}")
+
+    def get_system_volume(self):
+        """Return the first percentage found in `amixer get` output as an int, or None if absent or on error."""
+        try:
+            cmd = ["amixer", "get"]
+            if self.AUDIO_CONTROL_NAME:
+                cmd.append(self.AUDIO_CONTROL_NAME)
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            volume_match = re.search(r"(\d+)%", result.stdout)
+            return int(volume_match.group(1)) if volume_match else None
+        except Exception as e:
+            print(f"❌ 获取音量失败：{str(e)}")
+            return None
+
+    def set_system_volume(self, target_volume: int):
+        """Clamp `target_volume`, apply it with `amixer set ... unmute`; return True on success, False on failure."""
+        target_volume = max(self.MIN_VOLUME, min(self.MAX_VOLUME, target_volume))
+        try:
+            cmd = ["amixer", "set"]
+            if self.AUDIO_CONTROL_NAME:
+                cmd.append(self.AUDIO_CONTROL_NAME)
+            cmd.extend([f"{target_volume}%", "unmute"])  # always unmute together with the level change
+            subprocess.run(cmd, capture_output=True, check=True)  # check=True: non-zero amixer exit raises
+            self.CURRENT_VOLUME = target_volume  # only recorded once amixer has accepted the change
+            print(f"🔊 音量已调整至：{self.CURRENT_VOLUME}%")
+            return True
+        except Exception as e:
+            print(f"❌ 调整音量失败：{str(e)}")
+            return False
+
+    def adjust_volume(self, is_increase: bool):
+        """Step the volume up or down by VOLUME_STEP and return set_system_volume's result."""
+        target_volume = self.CURRENT_VOLUME + self.VOLUME_STEP if is_increase else self.CURRENT_VOLUME - self.VOLUME_STEP
+        return self.set_system_volume(target_volume)
\ No newline at end of file