From cfa15f2c5c7b5ad1bc2aa9bb39dc652fb9527857 Mon Sep 17 00:00:00 2001
From: superlishunqin <852326703@qq.com>
Date: Mon, 15 Sep 2025 00:51:19 +0800
Subject: [PATCH] test-api-fix3: harden choices parsing and temp audio cleanup in CosyVoiceService

---
 app/services/cosyvoice_service.py | 112 ++++++++++++++++++------------
 1 file changed, 66 insertions(+), 46 deletions(-)

diff --git a/app/services/cosyvoice_service.py b/app/services/cosyvoice_service.py
index b7e3603..5a09434 100644
--- a/app/services/cosyvoice_service.py
+++ b/app/services/cosyvoice_service.py
@@ -34,11 +34,17 @@ class CosyVoiceService:
                     return []
             
             result = self.client.predict(api_name="/refresh_sft_spk")
+            logger.info(f"音色列表原始返回: {result}")
             
             # 处理返回的字典格式
             if isinstance(result, dict) and 'choices' in result:
-                # 从choices中提取音色名称
-                voices = [choice[0] for choice in result['choices'] if choice[0] != '.ipynb_checkpoints']
+                # 从choices中提取音色名称，格式是 [['name', 'name'], ...]
+                voices = []
+                for choice in result['choices']:
+                    if isinstance(choice, list) and len(choice) > 0:
+                        voice_name = choice[0]
+                        if voice_name != '.ipynb_checkpoints':
+                            voices.append(voice_name)
                 return voices
             elif isinstance(result, list):
                 # 直接是列表格式
@@ -62,7 +68,10 @@ class CosyVoiceService:
             
             # 处理返回的字典格式
             if isinstance(result, dict) and 'choices' in result:
-                audios = [choice[0] for choice in result['choices']]
+                audios = []
+                for choice in result['choices']:
+                    if isinstance(choice, list) and len(choice) > 0:
+                        audios.append(choice[0])
                 return audios
             elif isinstance(result, list):
                 return result
@@ -89,6 +98,35 @@ class CosyVoiceService:
             logger.error(f"语音识别失败: {str(e)}")
             return ""
     
+    def _create_empty_audio_file(self) -> str:
+        """Create a temporary WAV file holding 0.01 s of silence and return its path.
+
+        The file is not deleted here; the caller must remove the returned path.
+        """
+        import tempfile
+        import wave
+
+        # mkstemp returns an open descriptor; close it so `wave` can reopen the path.
+        temp_fd, temp_path = tempfile.mkstemp(suffix='.wav')
+        os.close(temp_fd)
+
+        try:
+            with wave.open(temp_path, 'w') as wav_file:
+                wav_file.setnchannels(1)  # mono
+                wav_file.setsampwidth(2)  # 16-bit samples
+                wav_file.setframerate(16000)  # 16 kHz sample rate
+                # 160 zeroed 2-byte frames = 0.01 s of silence at 16 kHz
+                wav_file.writeframes(bytes(160 * 2))
+
+            return temp_path
+        except Exception:
+            # Creation failed: remove the partial file, then re-raise unchanged.
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
+            raise
+    
     def generate_speech_with_preset_voice(
         self, 
         text: str, 
@@ -98,33 +136,22 @@ class CosyVoiceService:
         stream: bool = False
     ) -> Tuple[Optional[str], Optional[str]]:
         """使用预训练音色生成语音"""
+        temp_audio_path = None
         try:
             if not self.client:
                 if not self.connect():
                     return None, None
             
-            # 创建临时空音频文件用于占位
-            import tempfile
-            import wave
-            import numpy as np
-            
-            # 创建一个短的静音音频作为占位符
-            temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
-            with wave.open(temp_audio.name, 'w') as wav_file:
-                wav_file.setnchannels(1)  # 单声道
-                wav_file.setsampwidth(2)  # 16位
-                wav_file.setframerate(16000)  # 16kHz采样率
-                # 写入很短的静音（0.1秒）
-                silence = np.zeros(1600, dtype=np.int16)  
-                wav_file.writeframes(silence.tobytes())
+            # 创建临时空音频文件
+            temp_audio_path = self._create_empty_audio_file()
             
             result = self.client.predict(
                 tts_text=text,
                 mode_checkbox_group="预训练音色",
                 sft_dropdown=voice,
                 prompt_text="",
-                prompt_wav_upload=handle_file(temp_audio.name),
-                prompt_wav_record=handle_file(temp_audio.name),
+                prompt_wav_upload=handle_file(temp_audio_path),
+                prompt_wav_record=handle_file(temp_audio_path),
                 instruct_text="",
                 seed=float(seed),
                 stream="True" if stream else "False",
@@ -132,11 +159,7 @@ class CosyVoiceService:
                 api_name="/generate_audio"
             )
             
-            # 清理临时文件
-            try:
-                os.unlink(temp_audio.name)
-            except:
-                pass
+            logger.info(f"预训练音色生成结果: {result}")
             
             # result是一个元组 [流式音频路径, 完整音频路径]
             if isinstance(result, (list, tuple)) and len(result) >= 2:
@@ -147,6 +170,13 @@ class CosyVoiceService:
         except Exception as e:
             logger.error(f"预训练音色语音生成失败: {str(e)}")
             return None, None
+        finally:
+            # 清理临时文件
+            if temp_audio_path and os.path.exists(temp_audio_path):
+                try:
+                    os.unlink(temp_audio_path)
+                except:
+                    pass
     
     def generate_speech_with_voice_cloning(
         self, 
@@ -198,33 +228,22 @@ class CosyVoiceService:
         seed: int = 42
     ) -> Tuple[Optional[str], Optional[str]]:
         """使用自然语言控制生成语音"""
+        temp_audio_path = None
         try:
             if not self.client:
                 if not self.connect():
                     return None, None
             
-            # 创建临时空音频文件用于占位
-            import tempfile
-            import wave
-            import numpy as np
-            
-            # 创建一个短的静音音频作为占位符
-            temp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
-            with wave.open(temp_audio.name, 'w') as wav_file:
-                wav_file.setnchannels(1)  # 单声道
-                wav_file.setsampwidth(2)  # 16位
-                wav_file.setframerate(16000)  # 16kHz采样率
-                # 写入很短的静音（0.1秒）
-                silence = np.zeros(1600, dtype=np.int16)  
-                wav_file.writeframes(silence.tobytes())
+            # 创建临时空音频文件
+            temp_audio_path = self._create_empty_audio_file()
             
             result = self.client.predict(
                 tts_text=text,
                 mode_checkbox_group="自然语言控制",
                 sft_dropdown="中文女",
                 prompt_text="",
-                prompt_wav_upload=handle_file(temp_audio.name),
-                prompt_wav_record=handle_file(temp_audio.name),
+                prompt_wav_upload=handle_file(temp_audio_path),
+                prompt_wav_record=handle_file(temp_audio_path),
                 instruct_text=instruction,
                 seed=float(seed),
                 stream="False",
@@ -232,12 +251,6 @@ class CosyVoiceService:
                 api_name="/generate_audio"
             )
             
-            # 清理临时文件
-            try:
-                os.unlink(temp_audio.name)
-            except:
-                pass
-            
             if isinstance(result, (list, tuple)) and len(result) >= 2:
                 return result[0], result[1]
             else:
@@ -246,6 +259,13 @@ class CosyVoiceService:
         except Exception as e:
             logger.error(f"自然语言控制语音生成失败: {str(e)}")
             return None, None
+        finally:
+            # 清理临时文件
+            if temp_audio_path and os.path.exists(temp_audio_path):
+                try:
+                    os.unlink(temp_audio_path)
+                except:
+                    pass
     
     def generate_random_seed(self) -> int:
         """生成随机种子"""