From 5271267edf5ac1764dd0257e3318c50d776fbd57 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 15 Sep 2025 02:45:50 +0800 Subject: [PATCH] api-test-end --- app/routes/voice_test.py | 150 ++++++-- app/services/cosyvoice_service.py | 311 ++++++++++------ app/static/js/voice_test.js | 553 +++++++++++++++++++--------- app/templates/voice_test/index.html | 510 +++++++++++++++++-------- requirements.txt | 2 + 5 files changed, 1065 insertions(+), 461 deletions(-) diff --git a/app/routes/voice_test.py b/app/routes/voice_test.py index dd1bdf2..5c8141a 100644 --- a/app/routes/voice_test.py +++ b/app/routes/voice_test.py @@ -4,22 +4,56 @@ import os import json import tempfile -from flask import Blueprint, request, jsonify, render_template, current_app +import subprocess +import uuid +import librosa +import soundfile as sf +from flask import Blueprint, request, jsonify, render_template, current_app, send_file from flask_login import login_required, current_user from app.services.cosyvoice_service import cosyvoice_service from werkzeug.utils import secure_filename import logging - logger = logging.getLogger(__name__) - voice_test_bp = Blueprint('voice_test', __name__) - +def convert_audio_format(input_path, output_path, target_sr=16000): + """转换音频格式为标准WAV""" + try: + # 使用librosa读取音频(支持多种格式) + audio, sr = librosa.load(input_path, sr=target_sr, mono=True) + + # 保存为标准WAV格式 + sf.write(output_path, audio, target_sr, format='WAV', subtype='PCM_16') + + logger.info(f"音频格式转换成功: {input_path} -> {output_path}") + return True + except Exception as e: + logger.error(f"音频格式转换失败: {str(e)}") + + # 备用方案:使用ffmpeg + try: + cmd = [ + 'ffmpeg', '-i', input_path, + '-ar', '16000', # 采样率 + '-ac', '1', # 单声道 + '-sample_fmt', 's16', # 16位 + '-y', output_path + ] + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode == 0: + logger.info(f"使用ffmpeg转换成功: {input_path} -> {output_path}") + return True + else: + logger.error(f"ffmpeg转换失败: {result.stderr}") + return False + except Exception as fe: + logger.error(f"ffmpeg备用方案也失败: {str(fe)}") + return False @voice_test_bp.route('/voice-test') @login_required def voice_test_page(): """语音测试页面""" return render_template('voice_test/index.html') - @voice_test_bp.route('/api/voice-test/connection', methods=['POST']) @login_required def test_connection(): @@ -33,7 +67,6 @@ def test_connection(): "success": False, "message": f"测试失败: {str(e)}" }) - @voice_test_bp.route('/api/voice-test/voices', methods=['GET']) @login_required def get_voices(): @@ -50,7 +83,6 @@ def get_voices(): "success": False, "message": f"获取失败: {str(e)}" }) - @voice_test_bp.route('/api/voice-test/generate/preset', methods=['POST']) @login_required def generate_with_preset_voice(): @@ -95,7 +127,6 @@ def generate_with_preset_voice(): "success": False, "message": f"生成失败: {str(e)}" }) - @voice_test_bp.route('/api/voice-test/generate/natural', methods=['POST']) @login_required def generate_with_natural_control(): @@ -138,11 +169,10 @@ def generate_with_natural_control(): "success": False, "message": f"生成失败: {str(e)}" }) - @voice_test_bp.route('/api/voice-test/upload-audio', methods=['POST']) @login_required def upload_audio(): - """上传音频文件用于语音克隆""" + """上传音频文件用于语音克隆 - 带格式转换功能""" try: if 'audio' not in request.files: return jsonify({ @@ -157,30 +187,72 @@ def upload_audio(): "message": "请选择音频文件" }) - # 检查文件类型 - allowed_extensions = {'wav', 'mp3', 'm4a', 'flac'} - if not ('.' 
in file.filename and - file.filename.rsplit('.', 1)[1].lower() in allowed_extensions): - return jsonify({ - "success": False, - "message": "不支持的音频格式,请使用WAV、MP3、M4A或FLAC格式" - }) + # 生成安全的文件名 + unique_id = str(uuid.uuid4())[:8] + original_filename = secure_filename(file.filename) if file.filename else f"recording_{unique_id}.wav" - # 保存文件到临时目录 - filename = secure_filename(file.filename) + # 保存原始文件 temp_dir = tempfile.gettempdir() - file_path = os.path.join(temp_dir, f"voice_clone_{current_user.id}_{filename}") - file.save(file_path) + original_path = os.path.join(temp_dir, f"original_{unique_id}_{original_filename}") + file.save(original_path) - # 尝试识别音频内容 - recognized_text = cosyvoice_service.recognize_audio(file_path) + logger.info(f"原始音频文件保存: {original_path}, 大小: {os.path.getsize(original_path)} 字节") - return jsonify({ - "success": True, - "message": "音频上传成功", - "file_path": file_path, - "recognized_text": recognized_text - }) + # 转换为标准格式 + converted_filename = f"voice_clone_{current_user.id}_{unique_id}.wav" + converted_path = os.path.join(temp_dir, converted_filename) + + # 进行格式转换 + if convert_audio_format(original_path, converted_path): + # 转换成功,删除原始文件 + try: + os.remove(original_path) + except: + pass + + # 验证转换后的文件 + if os.path.exists(converted_path) and os.path.getsize(converted_path) > 0: + logger.info(f"音频文件转换并验证成功: {converted_path}") + + # 尝试识别音频内容 + recognized_text = cosyvoice_service.recognize_audio(converted_path) + + return jsonify({ + "success": True, + "message": "音频上传和转换成功", + "file_path": converted_path, + "recognized_text": recognized_text, + "file_info": { + "size": os.path.getsize(converted_path), + "format": "WAV 16kHz Mono" + } + }) + else: + return jsonify({ + "success": False, + "message": "音频文件转换后验证失败" + }) + else: + # 转换失败,尝试直接使用原始文件 + logger.warning("音频格式转换失败,尝试直接使用原始文件") + + try: + recognized_text = cosyvoice_service.recognize_audio(original_path) + return jsonify({ + "success": True, + "message": "音频上传成功(使用原始格式)", + "file_path": original_path, + "recognized_text": recognized_text, + "file_info": { + "size": os.path.getsize(original_path), + "format": "原始格式" + } + }) + except Exception as e: + return jsonify({ + "success": False, + "message": f"音频处理失败: {str(e)}" + }) except Exception as e: logger.error(f"音频上传失败: {str(e)}") @@ -188,7 +260,6 @@ def upload_audio(): "success": False, "message": f"上传失败: {str(e)}" }) - @voice_test_bp.route('/api/voice-test/generate/clone', methods=['POST']) @login_required def generate_with_voice_cloning(): @@ -239,7 +310,6 @@ def generate_with_voice_cloning(): "success": False, "message": f"克隆失败: {str(e)}" }) - @voice_test_bp.route('/api/voice-test/random-seed', methods=['GET']) @login_required def get_random_seed(): @@ -256,3 +326,19 @@ def get_random_seed(): "success": False, "message": f"获取失败: {str(e)}" }) +@voice_test_bp.route('/download-audio/', methods=['GET']) +@login_required +def download_temp_audio(filename): + """下载临时音频文件""" + try: + temp_dir = tempfile.gettempdir() + file_path = os.path.join(temp_dir, filename) + + if os.path.exists(file_path): + return send_file(file_path, as_attachment=False, mimetype='audio/wav') + else: + return jsonify({"success": False, "message": "音频文件不存在"}), 404 + + except Exception as e: + logger.error(f"音频下载失败: {str(e)}") + return jsonify({"success": False, "message": "下载失败"}), 500 diff --git a/app/services/cosyvoice_service.py b/app/services/cosyvoice_service.py index 41a2077..82f4532 100644 --- a/app/services/cosyvoice_service.py +++ b/app/services/cosyvoice_service.py @@ -1,71 +1,77 @@ """ 
-CosyVoice API 服务类 -负责与CosyVoice API的交互 +CosyVoice API 服务类 - 核心语音克隆功能 +通过HTTP调用独立的CosyVoice API服务器 """ import os import logging +import requests +import tempfile from typing import Optional, Dict, Any, Tuple -from gradio_client import Client, handle_file - logger = logging.getLogger(__name__) - class CosyVoiceService: """CosyVoice API服务类""" - def __init__(self, api_url: str = "http://127.0.0.1:8080/"): + def __init__(self, api_url: str = "http://127.0.0.1:8081"): self.api_url = api_url - self.client = None + self._service_status = "未连接" def connect(self) -> bool: - """连接到CosyVoice服务""" + """检查连接状态""" try: - self.client = Client(self.api_url) - logger.info(f"成功连接到CosyVoice服务: {self.api_url}") - return True + response = requests.get(f"{self.api_url}/health", timeout=5) + if response.status_code == 200: + result = response.json() + self._service_status = "连接正常" + return True + else: + self._service_status = f"连接失败: HTTP {response.status_code}" + return False except Exception as e: - logger.error(f"连接CosyVoice服务失败: {str(e)}") + self._service_status = f"连接失败: {str(e)}" return False def get_available_voices(self) -> list: """获取可用的音色列表""" try: - if not self.client: - if not self.connect(): - return [] - - voices = self.client.predict(api_name="/refresh_sft_spk") - # 过滤掉不需要的音色 - filtered_voices = [voice for voice in voices if voice != '.ipynb_checkpoints'] - return filtered_voices + response = requests.get(f"{self.api_url}/voices", timeout=5) + if response.status_code == 200: + result = response.json() + return result.get('voices', []) + else: + logger.error(f"获取音色列表失败: HTTP {response.status_code}") + return [] except Exception as e: logger.error(f"获取音色列表失败: {str(e)}") return [] def get_reference_audios(self) -> list: - """获取参考音频列表""" - try: - if not self.client: - if not self.connect(): - return [] - - audio_files = self.client.predict(api_name="/refresh_prompt_wav") - return audio_files - except Exception as e: - logger.error(f"获取参考音频列表失败: {str(e)}") - return [] + """获取参考音频列表(用于兼容)""" + return ["麦克阿瑟.mp3", "年轻人,不讲武德.mp3"] def recognize_audio(self, audio_file_path: str) -> str: - """语音识别:将音频转换为文本""" + """语音识别:将音频转换为文本 - 核心功能""" try: - if not self.client: - if not self.connect(): - return "" + logger.info(f"开始语音识别: {audio_file_path}") - text = self.client.predict( - prompt_wav=handle_file(audio_file_path), - api_name="/prompt_wav_recognition" + response = requests.post( + f"{self.api_url}/recognize", + json={"audio_path": audio_file_path}, + timeout=30 ) - return text + + if response.status_code == 200: + result = response.json() + if result.get('success'): + text = result.get('text', '') + logger.info(f"语音识别成功: {text}") + return text + else: + logger.error(f"语音识别失败: {result.get('message')}") + return "" + else: + logger.error(f"语音识别API请求失败: HTTP {response.status_code}") + return "" + except Exception as e: logger.error(f"语音识别失败: {str(e)}") return "" @@ -80,25 +86,42 @@ class CosyVoiceService: ) -> Tuple[Optional[str], Optional[str]]: """使用预训练音色生成语音""" try: - if not self.client: - if not self.connect(): - return None, None + logger.info(f"开始生成语音: 文本='{text[:20]}...', 音色={voice}, 种子={seed}") - result = self.client.predict( - tts_text=text, - mode_checkbox_group="预训练音色", - sft_dropdown=voice, - seed=seed, - speed=speed, - stream="true" if stream else "false", - api_name="/generate_audio" + # 发送生成请求 + response = requests.post( + f"{self.api_url}/generate/preset", + json={ + "text": text, + "voice": voice, + "seed": seed, + "speed": speed, + "stream": stream + }, + timeout=30 ) - # result是一个元组 
[流式音频路径, 完整音频路径] - if isinstance(result, (list, tuple)) and len(result) >= 2: - return result[0], result[1] + if response.status_code == 200: + result = response.json() + if result.get('success'): + # 直接从响应中获取音频数据 + audio_data = result.get('audio_data') + if audio_data: + local_path = self._save_audio_data(audio_data, 'preset') + if local_path: + logger.info(f"预训练语音生成成功: {local_path}") + return local_path, local_path + else: + return None, None + else: + logger.error("响应中没有音频数据") + return None, None + else: + logger.error(f"语音生成失败: {result.get('message')}") + return None, None else: - return result, result + logger.error(f"API请求失败: HTTP {response.status_code}") + return None, None except Exception as e: logger.error(f"预训练音色语音生成失败: {str(e)}") @@ -111,35 +134,46 @@ class CosyVoiceService: reference_text: str = "", seed: int = 42 ) -> Tuple[Optional[str], Optional[str]]: - """使用语音克隆生成语音""" + """使用语音克隆生成语音 - 核心功能""" try: - if not self.client: - if not self.connect(): - return None, None + logger.info(f"开始语音克隆: 文本='{text[:20]}...', 参考音频={reference_audio_path}") - # 如果没有提供参考文本,先进行语音识别 - if not reference_text: - reference_text = self.recognize_audio(reference_audio_path) - if not reference_text: - logger.warning("参考音频识别失败,使用空文本") - reference_text = "" - - result = self.client.predict( - tts_text=text, - mode_checkbox_group="3s极速复刻", - prompt_text=reference_text, - prompt_wav_upload=handle_file(reference_audio_path), - seed=seed, - api_name="/generate_audio" + # 发送克隆请求 + response = requests.post( + f"{self.api_url}/generate/clone", + json={ + "text": text, + "reference_audio": reference_audio_path, + "reference_text": reference_text, + "seed": seed + }, + timeout=60 # 克隆需要更长时间 ) - if isinstance(result, (list, tuple)) and len(result) >= 2: - return result[0], result[1] + if response.status_code == 200: + result = response.json() + if result.get('success'): + # 直接从响应中获取音频数据 + audio_data = result.get('audio_data') + if audio_data: + local_path = self._save_audio_data(audio_data, 'clone') + if local_path: + logger.info(f"语音克隆成功: {local_path}") + return local_path, local_path + else: + return None, None + else: + logger.error("响应中没有音频数据") + return None, None + else: + logger.error(f"语音克隆失败: {result.get('message')}") + return None, None else: - return result, result + logger.error(f"API请求失败: HTTP {response.status_code}") + return None, None except Exception as e: - logger.error(f"语音克隆生成失败: {str(e)}") + logger.error(f"语音克隆失败: {str(e)}") return None, None def generate_speech_with_natural_control( @@ -148,41 +182,81 @@ class CosyVoiceService: instruction: str = "请用温柔甜美的女声朗读", seed: int = 42 ) -> Tuple[Optional[str], Optional[str]]: - """使用自然语言控制生成语音""" + """使用自然语言控制生成语音 - 核心功能""" try: - if not self.client: - if not self.connect(): - return None, None + logger.info(f"开始自然语言控制生成: 文本='{text[:20]}...', 指令='{instruction}'") - result = self.client.predict( - tts_text=text, - mode_checkbox_group="自然语言控制", - instruct_text=instruction, - seed=seed, - api_name="/generate_audio" + # 发送生成请求 + response = requests.post( + f"{self.api_url}/generate/natural", + json={ + "text": text, + "instruction": instruction, + "seed": seed + }, + timeout=30 ) - if isinstance(result, (list, tuple)) and len(result) >= 2: - return result[0], result[1] + if response.status_code == 200: + result = response.json() + if result.get('success'): + # 直接从响应中获取音频数据 + audio_data = result.get('audio_data') + if audio_data: + local_path = self._save_audio_data(audio_data, 'natural') + if local_path: + logger.info(f"自然语言控制生成成功: {local_path}") + 
return local_path, local_path + else: + return None, None + else: + logger.error("响应中没有音频数据") + return None, None + else: + logger.error(f"自然语言控制生成失败: {result.get('message')}") + return None, None else: - return result, result + logger.error(f"API请求失败: HTTP {response.status_code}") + return None, None except Exception as e: - logger.error(f"自然语言控制语音生成失败: {str(e)}") + logger.error(f"自然语言控制生成失败: {str(e)}") return None, None + def _save_audio_data(self, audio_data_hex: str, audio_type: str) -> Optional[str]: + """保存音频数据到本地临时文件""" + try: + # 解码十六进制音频数据 + audio_bytes = bytes.fromhex(audio_data_hex) + + # 创建本地临时文件 + temp_fd, local_path = tempfile.mkstemp(suffix='.wav', prefix=f'cosyvoice_{audio_type}_') + os.close(temp_fd) + + # 写入音频数据 + with open(local_path, 'wb') as f: + f.write(audio_bytes) + + logger.info(f"音频保存成功: {local_path}, 大小: {len(audio_bytes)} 字节") + return local_path + except Exception as e: + logger.error(f"音频保存失败: {str(e)}") + return None + def generate_random_seed(self) -> int: """生成随机种子""" try: - if not self.client: - if not self.connect(): - return 42 - - seed = self.client.predict(api_name="/generate_random_seed") - return int(seed) if seed else 42 + response = requests.get(f"{self.api_url}/random-seed", timeout=5) + if response.status_code == 200: + result = response.json() + return result.get('seed', 42) + else: + import random + return random.randint(1, 999999) except Exception as e: logger.error(f"生成随机种子失败: {str(e)}") - return 42 + import random + return random.randint(1, 999999) def test_connection(self) -> Dict[str, Any]: """测试与CosyVoice服务的连接""" @@ -190,26 +264,55 @@ class CosyVoiceService: if not self.connect(): return { "success": False, - "message": "无法连接到CosyVoice服务", - "api_url": self.api_url + "message": "无法连接到CosyVoice API服务器", + "api_url": self.api_url, + "service_status": self._service_status } - # 尝试获取音色列表来测试连接 + # 获取音色列表 voices = self.get_available_voices() + # 测试语音生成功能(预训练音色) + generation_status = "未测试" + try: + stream_audio, full_audio = self.generate_speech_with_preset_voice( + text="测试", + voice=voices[0] if voices else "中文女", + seed=42 + ) + + if stream_audio or full_audio: + generation_status = "正常" + self._service_status = "服务正常" + else: + generation_status = "生成失败" + self._service_status = "语音生成功能异常" + except Exception as e: + generation_status = f"测试失败: {str(e)}" + self._service_status = f"语音生成测试失败: {str(e)}" + return { "success": True, - "message": "CosyVoice服务连接成功", + "message": "CosyVoice API服务连接成功", "api_url": self.api_url, - "available_voices": voices + "available_voices": voices, + "reference_audios": self.get_reference_audios(), + "generation_status": generation_status, + "service_status": self._service_status, + "core_features": { + "voice_cloning": True, + "speech_recognition": True, + "natural_control": True + } } except Exception as e: + self._service_status = f"连接测试失败: {str(e)}" return { "success": False, "message": f"连接测试失败: {str(e)}", - "api_url": self.api_url + "api_url": self.api_url, + "service_status": self._service_status } - # 全局服务实例 cosyvoice_service = CosyVoiceService() diff --git a/app/static/js/voice_test.js b/app/static/js/voice_test.js index 18d2fe6..a974876 100644 --- a/app/static/js/voice_test.js +++ b/app/static/js/voice_test.js @@ -1,16 +1,26 @@ /** - * CosyVoice API 测试页面 JavaScript + * 语音克隆测试页面 JavaScript + * 核心功能:语音样本采集 → 识别 → 克隆 → 对比 */ // 全局变量 -let uploadedAudioPath = null; let loadingModal = null; +let mediaRecorder = null; +let audioChunks = []; +let uploadedAudioPath = null; +let recognizedText = ""; +let 
sampleAudioUrl = null; +let recordedAudioBlob = null; + +// 当前工作流程状态 +let currentStep = 1; // DOM加载完成后初始化 document.addEventListener('DOMContentLoaded', function() { initializeComponents(); bindEvents(); loadAvailableVoices(); + updateStepIndicators(); }); /** @@ -18,13 +28,6 @@ document.addEventListener('DOMContentLoaded', function() { */ function initializeComponents() { loadingModal = new bootstrap.Modal(document.getElementById('loadingModal')); - - // 语速滑块显示 - const speedSlider = document.getElementById('preset-speed'); - const speedValue = document.getElementById('preset-speed-value'); - speedSlider.addEventListener('input', function() { - speedValue.textContent = this.value; - }); } /** @@ -34,28 +37,98 @@ function bindEvents() { // 连接测试 document.getElementById('test-connection-btn').addEventListener('click', testConnection); - // 预训练音色测试 - document.getElementById('preset-voice-form').addEventListener('submit', generatePresetVoice); - document.getElementById('preset-random-seed').addEventListener('click', () => getRandomSeed('preset-seed')); + // 语音样本采集 + document.getElementById('voice-sample-form').addEventListener('submit', uploadVoiceSample); + document.getElementById('voice-sample-upload').addEventListener('change', handleFileSelect); + document.getElementById('start-recording').addEventListener('click', startRecording); + document.getElementById('stop-recording').addEventListener('click', stopRecording); - // 自然语言控制测试 - document.getElementById('natural-control-form').addEventListener('submit', generateNaturalControl); - document.getElementById('natural-random-seed').addEventListener('click', () => getRandomSeed('natural-seed')); - - // 语音克隆测试 - document.getElementById('audio-upload-form').addEventListener('submit', uploadReferenceAudio); - document.getElementById('voice-clone-form').addEventListener('submit', generateVoiceClone); + // 语音克隆生成 + document.getElementById('clone-generation-form').addEventListener('submit', generateClonedVoice); document.getElementById('clone-random-seed').addEventListener('click', () => getRandomSeed('clone-seed')); - // 清空日志 + // 高级功能 + document.getElementById('preset-voice-form').addEventListener('submit', generatePresetVoice); + document.getElementById('natural-control-form').addEventListener('submit', generateNaturalControl); + + // 其他 document.getElementById('clear-log').addEventListener('click', clearLog); } +/** + * 更新步骤指示器 + */ +function updateStepIndicators() { + for (let i = 1; i <= 4; i++) { + const indicator = document.getElementById(`step-${i}-indicator`); + indicator.classList.remove('active', 'completed'); + + if (i < currentStep) { + indicator.classList.add('completed'); + } else if (i === currentStep) { + indicator.classList.add('active'); + } + } + + // 更新连接线 + document.querySelectorAll('.step-line').forEach((line, index) => { + if (index + 1 < currentStep) { + line.classList.add('completed'); + } else { + line.classList.remove('completed'); + } + }); +} + +/** + * 跳转到指定步骤 + */ +function goToStep(step) { + currentStep = step; + updateStepIndicators(); + + // 启用/禁用相应按钮 + if (step >= 3) { + document.getElementById('generate-clone-btn').disabled = false; + } +} + +/** + * 重置工作流程 + */ +function resetWorkflow() { + currentStep = 1; + updateStepIndicators(); + + // 清空数据 + uploadedAudioPath = null; + recognizedText = ""; + sampleAudioUrl = null; + recordedAudioBlob = null; + + // 重置界面 + document.getElementById('sample-player').style.display = 'none'; + document.getElementById('recognition-result').style.display = 'none'; + 
document.getElementById('recognition-waiting').style.display = 'block'; + document.getElementById('comparison-result').style.display = 'none'; + document.getElementById('comparison-waiting').style.display = 'block'; + + // 重置按钮状态 + document.getElementById('upload-sample-btn').disabled = true; + document.getElementById('generate-clone-btn').disabled = true; + + // 清空文件选择 + document.getElementById('voice-sample-upload').value = ''; + + addLog('工作流程已重置,可以重新开始', 'info'); +} + /** * 显示加载状态 */ -function showLoading(message = '正在处理中...') { +function showLoading(message = '正在处理中...', detail = '请稍候...') { document.getElementById('loading-message').textContent = message; + document.getElementById('loading-detail').textContent = detail; loadingModal.show(); } @@ -93,7 +166,7 @@ function addLog(message, type = 'info') { */ function clearLog() { const logContainer = document.getElementById('test-log'); - logContainer.innerHTML = '

测试记录将显示在这里...

'; + logContainer.innerHTML = '

操作记录将显示在这里...

'; } /** @@ -116,7 +189,6 @@ function showError(message) { const bsToast = new bootstrap.Toast(toast); bsToast.show(); - // 自动移除 setTimeout(() => { if (toast.parentNode) { toast.parentNode.removeChild(toast); @@ -144,7 +216,6 @@ function showSuccess(message) { const bsToast = new bootstrap.Toast(toast); bsToast.show(); - // 自动移除 setTimeout(() => { if (toast.parentNode) { toast.parentNode.removeChild(toast); @@ -152,6 +223,14 @@ function showSuccess(message) { }, 3000); } +/** + * 创建音频播放URL + */ +function createAudioUrl(audioPath) { + const filename = audioPath.split('/').pop(); + return `/voice-test/download-audio/${filename}`; +} + /** * 测试连接 */ @@ -179,10 +258,10 @@ async function testConnection() { statusDiv.innerHTML = ` 连接成功 - (${result.api_url}) + 支持语音克隆、识别、自然控制 `; - addLog(`连接成功!可用音色数量: ${result.available_voices ? result.available_voices.length : 0}`, 'success'); + addLog(`连接成功!核心功能可用:语音克隆、识别、自然控制`, 'success'); // 更新音色列表 if (result.available_voices) { @@ -259,147 +338,155 @@ async function getRandomSeed(inputId) { } /** - * 预训练音色语音生成 + * 处理文件选择 */ -async function generatePresetVoice(e) { - e.preventDefault(); - - const text = document.getElementById('preset-text').value.trim(); - const voice = document.getElementById('preset-voice').value; - const seed = parseInt(document.getElementById('preset-seed').value); - const speed = parseFloat(document.getElementById('preset-speed').value); - - if (!text) { - showError('请输入要合成的文本'); - return; - } - - showLoading('正在生成语音...'); - addLog(`开始预训练音色生成 - 音色: ${voice}, 种子: ${seed}, 语速: ${speed}x`); - - try { - const response = await fetch('/voice-test/api/voice-test/generate/preset', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - text: text, - voice: voice, - seed: seed, - speed: speed - }) - }); +function handleFileSelect(e) { + const file = e.target.files[0]; + if (file) { + // 重置录音状态 + recordedAudioBlob = null; + sampleAudioUrl = null; - const result = await response.json(); - - if (result.success) { - // 显示音频播放器 - const audioSource = document.getElementById('preset-audio-source'); - const resultDiv = document.getElementById('preset-result'); - - audioSource.src = result.audio_url; - audioSource.parentElement.load(); - resultDiv.style.display = 'block'; - - addLog(`预训练音色生成成功!音频地址: ${result.audio_url}`, 'success'); - showSuccess('语音生成成功!'); - } else { - addLog(`预训练音色生成失败: ${result.message}`, 'error'); - showError(result.message); - } - - } catch (error) { - addLog(`预训练音色生成出错: ${error.message}`, 'error'); - showError('生成失败,请检查网络连接'); - } finally { - hideLoading(); + addLog(`选择了音频文件: ${file.name} (${(file.size/1024/1024).toFixed(2)} MB)`); + document.getElementById('upload-sample-btn').disabled = false; + document.getElementById('upload-sample-btn').innerHTML = '上传并识别语音'; } } /** - * 自然语言控制语音生成 + * 开始录音 */ -async function generateNaturalControl(e) { - e.preventDefault(); - - const text = document.getElementById('natural-text').value.trim(); - const instruction = document.getElementById('natural-instruction').value.trim(); - const seed = parseInt(document.getElementById('natural-seed').value); - - if (!text) { - showError('请输入要合成的文本'); - return; - } - - if (!instruction) { - showError('请输入语音指令'); - return; - } - - showLoading('正在生成语音...'); - addLog(`开始自然语言控制生成 - 指令: ${instruction}, 种子: ${seed}`); - +async function startRecording() { try { - const response = await fetch('/voice-test/api/voice-test/generate/natural', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - 
body: JSON.stringify({ - text: text, - instruction: instruction, - seed: seed - }) + // 重置文件选择 + document.getElementById('voice-sample-upload').value = ''; + + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + sampleRate: 16000, // 设置采样率为16kHz + channelCount: 1, // 单声道 + echoCancellation: true, + noiseSuppression: true + } }); - const result = await response.json(); + // 创建MediaRecorder,明确指定格式 + const options = { + mimeType: 'audio/webm;codecs=opus' // 使用webm格式 + }; - if (result.success) { - // 显示音频播放器 - const audioSource = document.getElementById('natural-audio-source'); - const resultDiv = document.getElementById('natural-result'); - - audioSource.src = result.audio_url; - audioSource.parentElement.load(); - resultDiv.style.display = 'block'; - - addLog(`自然语言控制生成成功!音频地址: ${result.audio_url}`, 'success'); - showSuccess('语音生成成功!'); - } else { - addLog(`自然语言控制生成失败: ${result.message}`, 'error'); - showError(result.message); + // 检查浏览器支持的格式 + if (!MediaRecorder.isTypeSupported(options.mimeType)) { + if (MediaRecorder.isTypeSupported('audio/webm')) { + options.mimeType = 'audio/webm'; + } else if (MediaRecorder.isTypeSupported('audio/wav')) { + options.mimeType = 'audio/wav'; + } else { + // 使用默认格式 + delete options.mimeType; + } } + mediaRecorder = new MediaRecorder(stream, options); + audioChunks = []; + + mediaRecorder.ondataavailable = function(event) { + if (event.data.size > 0) { + audioChunks.push(event.data); + } + }; + + mediaRecorder.onstop = function() { + // 创建音频Blob + recordedAudioBlob = new Blob(audioChunks, { + type: mediaRecorder.mimeType || 'audio/webm' + }); + + const audioUrl = URL.createObjectURL(recordedAudioBlob); + + // 显示录音预览 + const sampleAudio = document.getElementById('sample-audio'); + const sampleSource = document.getElementById('sample-audio-source'); + sampleSource.src = audioUrl; + sampleAudio.load(); + document.getElementById('sample-player').style.display = 'block'; + + // 启用上传按钮 + document.getElementById('upload-sample-btn').disabled = false; + document.getElementById('upload-sample-btn').innerHTML = '上传并识别语音'; + + // 保存录音数据 + sampleAudioUrl = audioUrl; + + addLog(`录音完成,格式: ${mediaRecorder.mimeType || 'default'}, 大小: ${(recordedAudioBlob.size/1024).toFixed(1)} KB`, 'success'); + }; + + mediaRecorder.start(100); // 每100ms收集一次数据 + + // 更新UI + document.getElementById('start-recording').disabled = true; + document.getElementById('stop-recording').disabled = false; + document.getElementById('recording-status').textContent = '正在录音...'; + document.getElementById('recording-status').className = 'text-danger'; + + addLog('开始录音...', 'info'); + } catch (error) { - addLog(`自然语言控制生成出错: ${error.message}`, 'error'); - showError('生成失败,请检查网络连接'); - } finally { - hideLoading(); + addLog(`录音失败: ${error.message}`, 'error'); + showError('录音失败,请检查麦克风权限'); } } /** - * 上传参考音频 + * 停止录音 */ -async function uploadReferenceAudio(e) { - e.preventDefault(); - - const fileInput = document.getElementById('reference-audio'); - const file = fileInput.files[0]; - - if (!file) { - showError('请选择音频文件'); - return; +function stopRecording() { + if (mediaRecorder && mediaRecorder.state !== 'inactive') { + mediaRecorder.stop(); + mediaRecorder.stream.getTracks().forEach(track => track.stop()); } - showLoading('正在上传并识别音频...'); - addLog(`开始上传音频文件: ${file.name} (${(file.size/1024/1024).toFixed(2)} MB)`); + // 更新UI + document.getElementById('start-recording').disabled = false; + document.getElementById('stop-recording').disabled = true; + 
document.getElementById('recording-status').textContent = '录音已完成'; + document.getElementById('recording-status').className = 'text-success'; - const formData = new FormData(); - formData.append('audio', file); + addLog('录音停止', 'info'); +} + +/** + * 上传语音样本并进行识别 + */ +async function uploadVoiceSample(e) { + e.preventDefault(); + + showLoading('正在上传和识别语音...', '包括格式转换和语音识别,请稍候'); + addLog('开始上传语音样本进行识别...'); try { + const fileInput = document.getElementById('voice-sample-upload'); + const file = fileInput.files[0]; + + let formData = new FormData(); + + if (file) { + // 上传文件 + formData.append('audio', file); + addLog(`上传文件: ${file.name}`); + } else if (recordedAudioBlob) { + // 上传录音 - 使用正确的文件名和类型 + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const filename = `recording_${timestamp}.webm`; + formData.append('audio', recordedAudioBlob, filename); + addLog(`上传录音: ${filename}, 大小: ${(recordedAudioBlob.size/1024).toFixed(1)} KB`); + } else { + showError('请选择音频文件或先录音'); + hideLoading(); + return; + } + + // 上传并识别 const response = await fetch('/voice-test/api/voice-test/upload-audio', { method: 'POST', body: formData @@ -410,27 +497,34 @@ async function uploadReferenceAudio(e) { if (result.success) { // 保存音频路径 uploadedAudioPath = result.file_path; + recognizedText = result.recognized_text || ''; // 显示识别结果 - const resultDiv = document.getElementById('upload-result'); - const recognizedText = document.getElementById('recognized-text'); + document.getElementById('recognized-text').value = recognizedText; + document.getElementById('recognition-result').style.display = 'block'; + document.getElementById('recognition-waiting').style.display = 'none'; - recognizedText.value = result.recognized_text || ''; - resultDiv.style.display = 'block'; + // 更新步骤 + goToStep(2); + setTimeout(() => goToStep(3), 1000); - // 启用克隆按钮 - const cloneBtn = document.querySelector('#voice-clone-form button[type="submit"]'); - cloneBtn.disabled = false; + addLog(`语音识别成功: "${recognizedText}"`, 'success'); + addLog(`音频处理信息: ${result.file_info?.format || '已转换格式'}`, 'info'); + showSuccess('语音样本上传成功!AI已识别出内容'); + + // 保存原始音频用于对比 + const originalAudio = document.getElementById('original-audio'); + const originalSource = document.getElementById('original-audio-source'); + originalSource.src = createAudioUrl(uploadedAudioPath); + originalAudio.load(); - addLog(`音频上传成功!识别文本: ${result.recognized_text || '(无内容)'}`, 'success'); - showSuccess('音频上传成功!'); } else { - addLog(`音频上传失败: ${result.message}`, 'error'); + addLog(`语音识别失败: ${result.message}`, 'error'); showError(result.message); } } catch (error) { - addLog(`音频上传出错: ${error.message}`, 'error'); + addLog(`上传出错: ${error.message}`, 'error'); showError('上传失败,请检查网络连接'); } finally { hideLoading(); @@ -438,27 +532,28 @@ async function uploadReferenceAudio(e) { } /** - * 语音克隆生成 + * 生成克隆语音 */ -async function generateVoiceClone(e) { +async function generateClonedVoice(e) { e.preventDefault(); if (!uploadedAudioPath) { - showError('请先上传参考音频'); + showError('请先上传语音样本'); return; } const text = document.getElementById('clone-text').value.trim(); - const referenceText = document.getElementById('recognized-text').value.trim(); const seed = parseInt(document.getElementById('clone-seed').value); + const referenceText = document.getElementById('recognized-text').value.trim(); if (!text) { showError('请输入要合成的文本'); return; } - showLoading('正在进行语音克隆...'); - addLog(`开始语音克隆 - 种子: ${seed}`); + showLoading('正在克隆你的声音...', '这是最复杂的步骤,请耐心等待'); + addLog(`开始语音克隆 - 目标文本: "${text.substring(0, 
20)}..."`); + addLog(`使用音频文件: ${uploadedAudioPath}`); try { const response = await fetch('/voice-test/api/voice-test/generate/clone', { @@ -477,25 +572,143 @@ async function generateVoiceClone(e) { const result = await response.json(); if (result.success) { - // 显示音频播放器 - const audioSource = document.getElementById('clone-audio-source'); - const resultDiv = document.getElementById('clone-result'); + // 显示克隆语音 + const clonedAudio = document.getElementById('cloned-audio'); + const clonedSource = document.getElementById('cloned-audio-source'); + clonedSource.src = createAudioUrl(result.audio_url); + clonedAudio.load(); - audioSource.src = result.audio_url; - audioSource.parentElement.load(); - resultDiv.style.display = 'block'; + // 显示对比界面 + document.getElementById('comparison-result').style.display = 'block'; + document.getElementById('comparison-waiting').style.display = 'none'; + + // 更新到最后步骤 + goToStep(4); + + addLog(`🎉 语音克隆成功!请对比原声和克隆效果`, 'success'); + showSuccess('语音克隆完成!请播放音频对比效果'); - addLog(`语音克隆成功!音频地址: ${result.audio_url}`, 'success'); - showSuccess('语音克隆成功!'); } else { addLog(`语音克隆失败: ${result.message}`, 'error'); - showError(result.message); + showError(result.message || '语音克隆失败,请重试'); } } catch (error) { - addLog(`语音克隆出错: ${error.message}`, 'error'); + addLog(`克隆出错: ${error.message}`, 'error'); showError('克隆失败,请检查网络连接'); } finally { hideLoading(); } } + +/** + * 预训练音色语音生成(高级功能) + */ +async function generatePresetVoice(e) { + e.preventDefault(); + + const text = document.getElementById('preset-text').value.trim(); + const voice = document.getElementById('preset-voice').value; + + if (!text) { + showError('请输入要合成的文本'); + return; + } + + showLoading('正在生成预训练音色语音...'); + + try { + const response = await fetch('/voice-test/api/voice-test/generate/preset', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + text: text, + voice: voice, + seed: 42, + speed: 1.0 + }) + }); + + const result = await response.json(); + + if (result.success) { + const audioSource = document.getElementById('preset-audio-source'); + const resultDiv = document.getElementById('preset-result'); + + audioSource.src = createAudioUrl(result.audio_url); + audioSource.parentElement.load(); + resultDiv.style.display = 'block'; + + addLog(`预训练音色生成成功 - ${voice}`, 'success'); + } else { + addLog(`预训练音色生成失败: ${result.message}`, 'error'); + showError(result.message); + } + + } catch (error) { + addLog(`生成出错: ${error.message}`, 'error'); + showError('生成失败,请检查网络连接'); + } finally { + hideLoading(); + } +} + +/** + * 自然语言控制语音生成(高级功能) + */ +async function generateNaturalControl(e) { + e.preventDefault(); + + const text = document.getElementById('natural-text').value.trim(); + const instruction = document.getElementById('natural-instruction').value.trim(); + + if (!text) { + showError('请输入要合成的文本'); + return; + } + + if (!instruction) { + showError('请输入语音指令'); + return; + } + + showLoading('正在生成自然语言控制语音...'); + + try { + const response = await fetch('/voice-test/api/voice-test/generate/natural', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + text: text, + instruction: instruction, + seed: 42 + }) + }); + + const result = await response.json(); + + if (result.success) { + const audioSource = document.getElementById('natural-audio-source'); + const resultDiv = document.getElementById('natural-result'); + + audioSource.src = createAudioUrl(result.audio_url); + audioSource.parentElement.load(); + resultDiv.style.display = 'block'; 
+ + addLog(`自然语言控制生成成功`, 'success'); + } else { + addLog(`自然语言控制生成失败: ${result.message}`, 'error'); + showError(result.message); + } + + } catch (error) { + addLog(`生成出错: ${error.message}`, 'error'); + showError('生成失败,请检查网络连接'); + } finally { + hideLoading(); + } +} diff --git a/app/templates/voice_test/index.html b/app/templates/voice_test/index.html index e78e83e..ac83910 100644 --- a/app/templates/voice_test/index.html +++ b/app/templates/voice_test/index.html @@ -1,6 +1,6 @@ {% extends "base.html" %} -{% block title %}CosyVoice API 测试 - 儿童语言学习系统{% endblock %} +{% block title %}语音克隆测试 - 儿童语言学习系统{% endblock %} {% block content %}
@@ -13,10 +13,10 @@

-                        CosyVoice API 测试
+                        语音克隆技术测试

-                    测试语音合成的各种功能
+                    体验"听自己说"的神奇技术 - 让AI学会你的声音

@@ -43,168 +43,290 @@ - -
- -
-
+ +
+
+
-
- 预训练音色测试 -
-
-
- - -
-
- - -
-
-
- -
- - +

+ 语音克隆工作流程 + 核心功能 +

+

按照步骤操作,体验完整的语音克隆过程

+ + +
+
+
+
+
1
+
录制声音
+
+
+
+
2
+
识别文字
+
+
+
+
3
+
克隆生成
+
+
+
+
4
+
效果对比
-
- - - 当前: 1.0x -
-
- - - -
-
-
- -
-
-
-
- 自然语言控制测试 -
-
-
- - -
-
- - -
- 示例:请用活泼开朗的语调、请用严肃正式的男声、请用轻柔的语气等 -
-
-
- -
- - -
-
- -
- -
-
-
- - -
-
-
-
- 语音克隆测试 -
-
-
1. 上传参考音频
-
-
- -
- 支持格式:WAV、MP3、M4A、FLAC,建议3-10秒 + +
+
+
+ 步骤1:录制你的声音样本 +
+

录制一段3-10秒的清晰语音,作为语音克隆的模板

+ + +
+ + +
建议:WAV格式,16kHz采样率,3-10秒时长
+
+ +
+ +
+ +
+ +
+ + +
+ 点击开始录音 +
+
+
+ + + + + + - - -
-
-
2. 生成克隆语音
-
-
- - -
-
- -
- -
- -
- + + +
+
+
+ 步骤3:克隆你的声音 +
+

让AI用你的声音说新的话

+ +
+
+ + +
+ +
+
+ +
+ + +
+
+
+ + +
+
+ + +
+
+
+ + +
+
+
+ 步骤4:效果对比 +
+

对比原声和克隆声音的效果

+ + + +
+ +

等待生成克隆语音...

+
+
+
+
+ + +
+ +
+
+
+
+
+ + +
+
+
+
+
+ 高级功能 + +
+ +
+
+ +
+
预训练音色测试
+
+
+ +
+
+ +
+ +
+ +
+ + +
+
自然语言控制
+
+
+ +
+
+ +
+ +
+
@@ -215,19 +337,19 @@
- +
-                        测试记录
+                        操作日志
-

测试记录将显示在这里...

+

操作记录将显示在这里...

@@ -242,10 +364,88 @@
+ + + {% endblock %} {% block scripts %} diff --git a/requirements.txt b/requirements.txt index b0ee924..213f840 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ email-validator==2.0.0 cryptography==41.0.4 Werkzeug==2.3.7 gradio_client==0.8.1 +librosa +soundfile \ No newline at end of file
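
The patch swaps the gradio_client integration for plain HTTP calls to a standalone CosyVoice API server and ships generated audio back to the Flask app as a hex-encoded WAV payload that _save_audio_data() decodes into a temp file. The sketch below is a minimal, self-contained smoke test of that contract; the server URL, endpoint paths (/health, /voices, /generate/preset) and JSON fields (success, voices, audio_data) are assumptions taken from cosyvoice_service.py in this patch rather than a documented API, and the script name is illustrative.

"""smoke_test_cosyvoice_api.py -- illustrative only.

Exercises the HTTP contract that app/services/cosyvoice_service.py relies on.
Endpoint paths and response fields are taken from this patch; adjust API_URL
if the standalone CosyVoice server runs elsewhere.
"""
import tempfile

import requests

API_URL = "http://127.0.0.1:8081"  # default used by CosyVoiceService


def main() -> None:
    # 1. Health check, mirroring CosyVoiceService.connect()
    requests.get(f"{API_URL}/health", timeout=5).raise_for_status()

    # 2. Preset voice list, mirroring get_available_voices()
    voices = requests.get(f"{API_URL}/voices", timeout=5).json().get("voices", [])
    print("voices:", voices)

    # 3. Preset-voice generation, mirroring generate_speech_with_preset_voice()
    resp = requests.post(
        f"{API_URL}/generate/preset",
        json={
            "text": "测试",
            "voice": voices[0] if voices else "中文女",
            "seed": 42,
            "speed": 1.0,
            "stream": False,
        },
        timeout=30,
    )
    result = resp.json()
    if not result.get("success"):
        raise SystemExit(f"generation failed: {result.get('message')}")

    # 4. Decode the hex-encoded WAV bytes, mirroring _save_audio_data()
    audio_bytes = bytes.fromhex(result["audio_data"])
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(audio_bytes)
        print(f"wrote {len(audio_bytes)} bytes to {f.name}")


if __name__ == "__main__":
    main()

Cloning and natural-language control in the patch follow the same request/response pattern via POST /generate/clone and /generate/natural, with longer timeouts for cloning.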