""" CosyVoice API 服务类 - 核心语音克隆功能 通过HTTP调用独立的CosyVoice API服务器 """ import os import logging import requests import tempfile from typing import Optional, Dict, Any, Tuple logger = logging.getLogger(__name__) class CosyVoiceService: """CosyVoice API服务类""" def __init__(self, api_url: str = "http://127.0.0.1:8081"): self.api_url = api_url self._service_status = "未连接" def connect(self) -> bool: """检查连接状态""" try: response = requests.get(f"{self.api_url}/health", timeout=5) if response.status_code == 200: result = response.json() self._service_status = "连接正常" return True else: self._service_status = f"连接失败: HTTP {response.status_code}" return False except Exception as e: self._service_status = f"连接失败: {str(e)}" return False def get_available_voices(self) -> list: """获取可用的音色列表""" try: response = requests.get(f"{self.api_url}/voices", timeout=5) if response.status_code == 200: result = response.json() return result.get('voices', []) else: logger.error(f"获取音色列表失败: HTTP {response.status_code}") return [] except Exception as e: logger.error(f"获取音色列表失败: {str(e)}") return [] def get_reference_audios(self) -> list: """获取参考音频列表(用于兼容)""" return ["麦克阿瑟.mp3", "年轻人,不讲武德.mp3"] def recognize_audio(self, audio_file_path: str) -> str: """语音识别:将音频转换为文本 - 核心功能""" try: logger.info(f"开始语音识别: {audio_file_path}") response = requests.post( f"{self.api_url}/recognize", json={"audio_path": audio_file_path}, timeout=30 ) if response.status_code == 200: result = response.json() if result.get('success'): text = result.get('text', '') logger.info(f"语音识别成功: {text}") return text else: logger.error(f"语音识别失败: {result.get('message')}") return "" else: logger.error(f"语音识别API请求失败: HTTP {response.status_code}") return "" except Exception as e: logger.error(f"语音识别失败: {str(e)}") return "" def generate_speech_with_preset_voice( self, text: str, voice: str = "中文女", seed: int = 42, speed: float = 1.0, stream: bool = False ) -> Tuple[Optional[str], Optional[str]]: """使用预训练音色生成语音""" try: logger.info(f"开始生成语音: 文本='{text[:20]}...', 音色={voice}, 种子={seed}") # 发送生成请求 response = requests.post( f"{self.api_url}/generate/preset", json={ "text": text, "voice": voice, "seed": seed, "speed": speed, "stream": stream }, timeout=30 ) if response.status_code == 200: result = response.json() if result.get('success'): # 直接从响应中获取音频数据 audio_data = result.get('audio_data') if audio_data: local_path = self._save_audio_data(audio_data, 'preset') if local_path: logger.info(f"预训练语音生成成功: {local_path}") return local_path, local_path else: return None, None else: logger.error("响应中没有音频数据") return None, None else: logger.error(f"语音生成失败: {result.get('message')}") return None, None else: logger.error(f"API请求失败: HTTP {response.status_code}") return None, None except Exception as e: logger.error(f"预训练音色语音生成失败: {str(e)}") return None, None def generate_speech_with_voice_cloning( self, text: str, reference_audio_path: str, reference_text: str = "", seed: int = 42 ) -> Tuple[Optional[str], Optional[str]]: """使用语音克隆生成语音 - 核心功能""" try: logger.info(f"开始语音克隆: 文本='{text[:20]}...', 参考音频={reference_audio_path}") # 发送克隆请求 response = requests.post( f"{self.api_url}/generate/clone", json={ "text": text, "reference_audio": reference_audio_path, "reference_text": reference_text, "seed": seed }, timeout=60 # 克隆需要更长时间 ) if response.status_code == 200: result = response.json() if result.get('success'): # 直接从响应中获取音频数据 audio_data = result.get('audio_data') if audio_data: local_path = self._save_audio_data(audio_data, 'clone') if local_path: logger.info(f"语音克隆成功: {local_path}") return local_path, local_path else: return None, None else: logger.error("响应中没有音频数据") return None, None else: logger.error(f"语音克隆失败: {result.get('message')}") return None, None else: logger.error(f"API请求失败: HTTP {response.status_code}") return None, None except Exception as e: logger.error(f"语音克隆失败: {str(e)}") return None, None def generate_speech_with_natural_control( self, text: str, instruction: str = "请用温柔甜美的女声朗读", seed: int = 42 ) -> Tuple[Optional[str], Optional[str]]: """使用自然语言控制生成语音 - 核心功能""" try: logger.info(f"开始自然语言控制生成: 文本='{text[:20]}...', 指令='{instruction}'") # 发送生成请求 response = requests.post( f"{self.api_url}/generate/natural", json={ "text": text, "instruction": instruction, "seed": seed }, timeout=30 ) if response.status_code == 200: result = response.json() if result.get('success'): # 直接从响应中获取音频数据 audio_data = result.get('audio_data') if audio_data: local_path = self._save_audio_data(audio_data, 'natural') if local_path: logger.info(f"自然语言控制生成成功: {local_path}") return local_path, local_path else: return None, None else: logger.error("响应中没有音频数据") return None, None else: logger.error(f"自然语言控制生成失败: {result.get('message')}") return None, None else: logger.error(f"API请求失败: HTTP {response.status_code}") return None, None except Exception as e: logger.error(f"自然语言控制生成失败: {str(e)}") return None, None def _save_audio_data(self, audio_data_hex: str, audio_type: str) -> Optional[str]: """保存音频数据到本地临时文件""" try: # 解码十六进制音频数据 audio_bytes = bytes.fromhex(audio_data_hex) # 创建本地临时文件 temp_fd, local_path = tempfile.mkstemp(suffix='.wav', prefix=f'cosyvoice_{audio_type}_') os.close(temp_fd) # 写入音频数据 with open(local_path, 'wb') as f: f.write(audio_bytes) logger.info(f"音频保存成功: {local_path}, 大小: {len(audio_bytes)} 字节") return local_path except Exception as e: logger.error(f"音频保存失败: {str(e)}") return None def generate_random_seed(self) -> int: """生成随机种子""" try: response = requests.get(f"{self.api_url}/random-seed", timeout=5) if response.status_code == 200: result = response.json() return result.get('seed', 42) else: import random return random.randint(1, 999999) except Exception as e: logger.error(f"生成随机种子失败: {str(e)}") import random return random.randint(1, 999999) def test_connection(self) -> Dict[str, Any]: """测试与CosyVoice服务的连接""" try: if not self.connect(): return { "success": False, "message": "无法连接到CosyVoice API服务器", "api_url": self.api_url, "service_status": self._service_status } # 获取音色列表 voices = self.get_available_voices() # 测试语音生成功能(预训练音色) generation_status = "未测试" try: stream_audio, full_audio = self.generate_speech_with_preset_voice( text="测试", voice=voices[0] if voices else "中文女", seed=42 ) if stream_audio or full_audio: generation_status = "正常" self._service_status = "服务正常" else: generation_status = "生成失败" self._service_status = "语音生成功能异常" except Exception as e: generation_status = f"测试失败: {str(e)}" self._service_status = f"语音生成测试失败: {str(e)}" return { "success": True, "message": "CosyVoice API服务连接成功", "api_url": self.api_url, "available_voices": voices, "reference_audios": self.get_reference_audios(), "generation_status": generation_status, "service_status": self._service_status, "core_features": { "voice_cloning": True, "speech_recognition": True, "natural_control": True } } except Exception as e: self._service_status = f"连接测试失败: {str(e)}" return { "success": False, "message": f"连接测试失败: {str(e)}", "api_url": self.api_url, "service_status": self._service_status } # 全局服务实例 cosyvoice_service = CosyVoiceService()