Happy_language/app/services/cosyvoice_service.py
superlishunqin d227e3d15c debug1
2025-09-22 06:37:33 +08:00

328 lines
13 KiB
Python

"""
CosyVoice API 服务类 - 核心语音克隆功能
通过HTTP调用独立的CosyVoice API服务器
"""
import os
import logging
import requests
import tempfile
from typing import Optional, Dict, Any, Tuple
# Module-level logger named after this module, so its output can be filtered per module.
logger = logging.getLogger(__name__)
class CosyVoiceService:
    """Client for a standalone CosyVoice API server, accessed over HTTP.

    Public methods never raise: every failure (network error, non-200 status,
    malformed response) is logged and reported through the return value
    (``False``, ``""``, ``[]`` or ``(None, None)``).

    The generation endpoints are expected to answer with a JSON object that
    carries ``success``, ``message`` and ``audio_data`` keys, where
    ``audio_data`` is the audio file encoded as a hexadecimal string.
    """

    def __init__(self, api_url: str = "http://127.0.0.1:8081"):
        """Remember the server base URL; no network I/O happens here.

        Args:
            api_url: Base URL of the CosyVoice API server, with or without a
                trailing slash.
        """
        self.api_url = api_url
        self._service_status = "未连接"

    def _endpoint(self, path: str) -> str:
        """Return the full URL for ``path``, joined with exactly one slash.

        Guards against ``//path`` URLs when ``api_url`` ends with a slash.
        """
        return f"{str(self.api_url).rstrip('/')}/{path.lstrip('/')}"

    @staticmethod
    def _discard_file(path: Optional[str]) -> None:
        """Best-effort removal of a temporary file; failures are only logged."""
        if not path:
            return
        try:
            os.remove(path)
        except OSError as e:
            logger.warning(f"临时文件清理失败: {path} ({e})")

    def connect(self) -> bool:
        """Probe ``/health`` and record the outcome in ``_service_status``.

        Returns:
            True when the server answers HTTP 200 with a JSON body,
            False otherwise.
        """
        try:
            response = requests.get(self._endpoint("/health"), timeout=5)
            if response.status_code != 200:
                self._service_status = f"连接失败: HTTP {response.status_code}"
                return False
            # The payload itself is unused, but a non-JSON body still counts
            # as a failed health check (it raises into the handler below).
            response.json()
            self._service_status = "连接正常"
            return True
        except Exception as e:
            self._service_status = f"连接失败: {e}"
            return False

    def get_available_voices(self) -> list:
        """Return the ``voices`` list reported by ``/voices``.

        Returns:
            The list of voices, or an empty list on any failure or when the
            server reports no voices.
        """
        try:
            response = requests.get(self._endpoint("/voices"), timeout=5)
            if response.status_code != 200:
                logger.error(f"获取音色列表失败: HTTP {response.status_code}")
                return []
            # ``or []`` also covers a JSON ``null`` so callers always get a list.
            return response.json().get('voices') or []
        except Exception as e:
            logger.error(f"获取音色列表失败: {e}")
            return []

    def get_reference_audios(self) -> list:
        """Return the fixed list of reference audio file names (kept for compatibility)."""
        return ["麦克阿瑟.mp3", "年轻人,不讲武德.mp3"]

    def recognize_audio(self, audio_file_path: str) -> str:
        """Ask the server to transcribe an audio file.

        Only the path is sent (as ``audio_path``), not the file contents, so
        the path presumably has to be readable by the API server itself --
        TODO confirm against the server implementation.

        Args:
            audio_file_path: Path of the audio file to transcribe.

        Returns:
            The recognized text, or ``""`` on any failure.
        """
        try:
            logger.info(f"开始语音识别: {audio_file_path}")
            response = requests.post(
                self._endpoint("/recognize"),
                json={"audio_path": audio_file_path},
                timeout=30,
            )
            if response.status_code != 200:
                logger.error(f"语音识别API请求失败: HTTP {response.status_code}")
                return ""
            result = response.json()
            if not result.get('success'):
                logger.error(f"语音识别失败: {result.get('message')}")
                return ""
            # ``or ""`` keeps the declared ``str`` return type on a JSON ``null``.
            text = result.get('text') or ""
            logger.info(f"语音识别成功: {text}")
            return text
        except Exception as e:
            logger.error(f"语音识别失败: {e}")
            return ""

    def _audio_paths_from_result(
        self,
        result: Dict[str, Any],
        audio_type: str,
        success_label: str,
        failure_label: str,
    ) -> Tuple[Optional[str], Optional[str]]:
        """Turn a decoded generation response into a saved local audio file.

        Shared by all ``generate_speech_*`` methods.

        Args:
            result: Decoded JSON body of the generation response.
            audio_type: Tag embedded in the temporary file name.
            success_label: Log prefix used when the audio was saved.
            failure_label: Log prefix used when the server reports failure.

        Returns:
            ``(path, path)`` -- the same local file twice, matching the
            (stream, full) shape callers expect -- or ``(None, None)``.
        """
        if not result.get('success'):
            logger.error(f"{failure_label}: {result.get('message')}")
            return None, None
        audio_data = result.get('audio_data')
        if not audio_data:
            logger.error("响应中没有音频数据")
            return None, None
        local_path = self._save_audio_data(audio_data, audio_type)
        if not local_path:
            # _save_audio_data has already logged the reason.
            return None, None
        logger.info(f"{success_label}: {local_path}")
        return local_path, local_path

    def generate_speech_with_preset_voice(
        self,
        text: str,
        voice: str = "中文女",
        seed: int = 42,
        speed: float = 1.0,
        stream: bool = False
    ) -> Tuple[Optional[str], Optional[str]]:
        """Synthesize ``text`` with one of the server's preset voices.

        Args:
            text: Text to speak.
            voice: Name of the preset voice.
            seed: Generation seed forwarded to the server.
            speed: Speed value forwarded to the server.
            stream: Forwarded to the server as-is; the response is still read
                as a single JSON document.

        Returns:
            ``(path, path)`` of the saved WAV file, or ``(None, None)`` on failure.
        """
        try:
            logger.info(f"开始生成语音: 文本='{text[:20]}...', 音色={voice}, 种子={seed}")
            response = requests.post(
                self._endpoint("/generate/preset"),
                json={
                    "text": text,
                    "voice": voice,
                    "seed": seed,
                    "speed": speed,
                    "stream": stream,
                },
                timeout=30,
            )
            if response.status_code != 200:
                logger.error(f"API请求失败: HTTP {response.status_code}")
                return None, None
            return self._audio_paths_from_result(
                response.json(), 'preset', "预训练语音生成成功", "语音生成失败"
            )
        except Exception as e:
            logger.error(f"预训练音色语音生成失败: {e}")
            return None, None

    def generate_speech_with_voice_cloning(
        self,
        text: str,
        reference_audio_path: str,
        reference_text: str = "",
        seed: int = 42
    ) -> Tuple[Optional[str], Optional[str]]:
        """Synthesize ``text`` in the voice of a reference recording.

        Args:
            text: Text to speak.
            reference_audio_path: Path of the reference audio, sent to the
                server as ``reference_audio``.
            reference_text: Transcript of the reference audio, if known.
            seed: Generation seed forwarded to the server.

        Returns:
            ``(path, path)`` of the saved WAV file, or ``(None, None)`` on failure.
        """
        try:
            logger.info(f"开始语音克隆: 文本='{text[:20]}...', 参考音频={reference_audio_path}")
            request_data = {
                "text": text,
                "reference_audio": reference_audio_path,
                "reference_text": reference_text,
                "seed": seed,
            }
            clone_url = self._endpoint("/generate/clone")
            logger.info("=== 发送给CosyVoice API的数据 ===")
            logger.info(f"API URL: {clone_url}")
            logger.info(f"Request Data: {request_data}")
            # Cloning takes longer than the other endpoints, hence the larger timeout.
            response = requests.post(clone_url, json=request_data, timeout=60)
            logger.info("=== CosyVoice API响应 ===")
            logger.info(f"Status Code: {response.status_code}")
            logger.info(f"Response Headers: {dict(response.headers)}")
            if response.status_code != 200:
                logger.error(f"Response Text: {response.text}")
                return None, None
            return self._audio_paths_from_result(
                response.json(), 'clone', "语音克隆成功", "语音克隆失败"
            )
        except Exception as e:
            logger.error(f"语音克隆失败: {e}")
            return None, None

    def generate_speech_with_natural_control(
        self,
        text: str,
        instruction: str = "请用温柔甜美的女声朗读",
        seed: int = 42
    ) -> Tuple[Optional[str], Optional[str]]:
        """Synthesize ``text`` following a natural-language style instruction.

        Args:
            text: Text to speak.
            instruction: Free-form description of the desired voice or style.
            seed: Generation seed forwarded to the server.

        Returns:
            ``(path, path)`` of the saved WAV file, or ``(None, None)`` on failure.
        """
        try:
            logger.info(f"开始自然语言控制生成: 文本='{text[:20]}...', 指令='{instruction}'")
            response = requests.post(
                self._endpoint("/generate/natural"),
                json={
                    "text": text,
                    "instruction": instruction,
                    "seed": seed,
                },
                timeout=30,
            )
            if response.status_code != 200:
                logger.error(f"API请求失败: HTTP {response.status_code}")
                return None, None
            return self._audio_paths_from_result(
                response.json(), 'natural', "自然语言控制生成成功", "自然语言控制生成失败"
            )
        except Exception as e:
            logger.error(f"自然语言控制生成失败: {e}")
            return None, None

    def _save_audio_data(self, audio_data_hex: str, audio_type: str) -> Optional[str]:
        """Decode hex-encoded audio and write it to a new temporary ``.wav`` file.

        Args:
            audio_data_hex: Audio bytes encoded as a hexadecimal string.
            audio_type: Tag embedded in the file name (``cosyvoice_<type>_*.wav``).

        Returns:
            Path of the written file, or ``None`` on failure. The caller owns
            the file and is responsible for deleting it.
        """
        local_path: Optional[str] = None
        try:
            audio_bytes = bytes.fromhex(audio_data_hex)
            temp_fd, local_path = tempfile.mkstemp(
                suffix='.wav', prefix=f'cosyvoice_{audio_type}_'
            )
            # Write through the descriptor mkstemp already opened instead of
            # closing it and reopening the path.
            with os.fdopen(temp_fd, 'wb') as f:
                f.write(audio_bytes)
            logger.info(f"音频保存成功: {local_path}, 大小: {len(audio_bytes)} 字节")
            return local_path
        except Exception as e:
            logger.error(f"音频保存失败: {e}")
            # Do not leave a partial or empty temp file behind on failure.
            self._discard_file(local_path)
            return None

    def generate_random_seed(self) -> int:
        """Return a seed from ``/random-seed``, or a locally generated one.

        Returns:
            The server's ``seed`` value (42 if the key is missing) on HTTP 200;
            otherwise a local random integer in ``[1, 999999]``.
        """
        import random  # local import: only the fallback path needs it

        try:
            response = requests.get(self._endpoint("/random-seed"), timeout=5)
            if response.status_code == 200:
                return response.json().get('seed', 42)
        except Exception as e:
            logger.error(f"生成随机种子失败: {e}")
        return random.randint(1, 999999)

    def test_connection(self) -> Dict[str, Any]:
        """Run a connectivity check plus a short preset-voice generation probe.

        Returns:
            A status dict. ``success`` reflects whether the server is
            reachable; the outcome of the generation probe is reported
            separately in ``generation_status`` and ``service_status``.
        """
        try:
            if not self.connect():
                return {
                    "success": False,
                    "message": "无法连接到CosyVoice API服务器",
                    "api_url": self.api_url,
                    "service_status": self._service_status
                }

            voices = self.get_available_voices()

            generation_status = "未测试"
            try:
                stream_audio, full_audio = self.generate_speech_with_preset_voice(
                    text="测试",
                    voice=voices[0] if voices else "中文女",
                    seed=42
                )
            except Exception as e:
                generation_status = f"测试失败: {e}"
                self._service_status = f"语音生成测试失败: {e}"
            else:
                if stream_audio or full_audio:
                    generation_status = "正常"
                    self._service_status = "服务正常"
                else:
                    generation_status = "生成失败"
                    self._service_status = "语音生成功能异常"
                # The probe audio is never handed to the caller, so remove it
                # instead of leaking one temp file per connection test.
                self._discard_file(stream_audio)
                if full_audio != stream_audio:
                    self._discard_file(full_audio)

            return {
                "success": True,
                "message": "CosyVoice API服务连接成功",
                "api_url": self.api_url,
                "available_voices": voices,
                "reference_audios": self.get_reference_audios(),
                "generation_status": generation_status,
                "service_status": self._service_status,
                "core_features": {
                    "voice_cloning": True,
                    "speech_recognition": True,
                    "natural_control": True
                }
            }
        except Exception as e:
            self._service_status = f"连接测试失败: {e}"
            return {
                "success": False,
                "message": f"连接测试失败: {e}",
                "api_url": self.api_url,
                "service_status": self._service_status
            }
# Shared module-level service instance, created at import time with the default API URL.
cosyvoice_service = CosyVoiceService()