"""DashScope (Qwen) TTS wrapper plus a thread-based streaming-session manager.

Supports both the classic `dashscope.audio.tts` API and the CosyVoice
`tts_v2` streaming API.  `StreamSessionManager` multiplexes many concurrent
text->audio sessions, each with its own worker thread and audio buffer.

NOTE(review): this module was recovered from a whitespace-mangled source and
reformatted; logic is preserved except where bugs are flagged inline.
"""
import asyncio
import array
import logging
import queue
import threading
import time
import uuid
from collections import deque
from concurrent.futures import ThreadPoolExecutor

# SECURITY(review): hard-coded API key committed to source.  Move this to an
# environment variable / secrets store and rotate the key.
ALI_KEY = "sk-a47a3fb5f4a94f66bbaf713779101c75"

from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import (
    ResultCallback as TTSResultCallback,
    SpeechSynthesizer as TTSSpeechSynthesizer,
    SpeechSynthesisResult as TTSSpeechSynthesisResult,
)
# cyx 2025-01-19: CosyVoice testing uses the tts_v2 API.
from dashscope.audio.tts_v2 import (
    ResultCallback as CosyResultCallback,
    SpeechSynthesizer as CosySpeechSynthesizer,
    AudioFormat,
)


class QwenTTS:
    """Text-to-speech wrapper over DashScope.

    Model names of the form ``"cosyvoice-v1/<voice>"`` select the CosyVoice
    (tts_v2) backend; any other name uses the classic tts backend.
    """

    def __init__(self, key, format="mp3", sample_rate=44100,
                 model_name="cosyvoice-v1/longyuan"):
        import dashscope
        import ssl
        logging.info(f"---QwenTTS Construtor-- {format} {sample_rate} {model_name}")  # cyx
        self.model_name = model_name
        dashscope.api_key = key
        # SECURITY(review): disables HTTPS certificate verification for the
        # ENTIRE process, not just this client.  Confirm this is intentional.
        ssl._create_default_https_context = ssl._create_unverified_context
        self.synthesizer = None
        self.callback = None
        self.is_cosyvoice = False
        self.voice = ""
        self.format = format
        self.sample_rate = sample_rate
        if '/' in model_name:
            # Split into (model, voice), e.g. "cosyvoice-v1/longyuan".
            parts = model_name.split('/', 1)
            if parts[0] == 'cosyvoice-v1':
                self.is_cosyvoice = True
                self.voice = parts[1]

    class Callback(TTSResultCallback):
        """Classic-TTS callback: buffers audio frames; ``None`` marks EOF."""

        def __init__(self) -> None:
            self.dque = deque()

        def _run(self):
            # Generator that drains the deque until the None sentinel.
            # NOTE(review): time.sleep(0) is a busy-wait (yields the GIL only);
            # consider a queue.Queue with a blocking get if CPU use matters.
            while True:
                if not self.dque:
                    time.sleep(0)
                    continue
                val = self.dque.popleft()
                if val:
                    yield val
                else:
                    break

        def on_open(self):
            pass

        def on_complete(self):
            # EOF sentinel for _run().
            self.dque.append(None)

        def on_error(self, response: SpeechSynthesisResponse):
            print("Qwen tts error", str(response))
            raise RuntimeError(str(response))

        def on_close(self):
            pass

        def on_event(self, result: TTSSpeechSynthesisResult):
            if result.get_audio_frame() is not None:
                self.dque.append(result.get_audio_frame())

    class Callback_Cosy(CosyResultCallback):
        """CosyVoice (tts_v2) callback.

        If ``on_audio_data`` is given, every audio chunk is forwarded to it
        (streaming mode); otherwise chunks are buffered for ``_run()``.
        """

        def __init__(self, on_audio_data) -> None:
            self.dque = deque()
            self.on_audio_data = on_audio_data

        def _run(self):
            # Same busy-wait drain as Callback._run (see note there).
            while True:
                if not self.dque:
                    time.sleep(0)
                    continue
                val = self.dque.popleft()
                if val:
                    yield val
                else:
                    break

        def on_open(self):
            logging.info("---Qwen tts on_open---")

        def on_complete(self):
            self.dque.append(None)

        def on_error(self, response: SpeechSynthesisResponse):
            print("Qwen tts error", str(response))
            raise RuntimeError(str(response))

        def on_close(self):
            logging.info("---Qwen tts on_close---")

        def on_event(self, message):
            # tts_v2 delivers audio via on_data, not on_event.
            pass

        def on_data(self, data: bytes) -> None:
            # CosyVoice audio path: forward to the streaming handler when set,
            # otherwise buffer for the pull-based generator.
            if len(data) > 0:
                if self.on_audio_data:
                    self.on_audio_data(data)
                else:
                    self.dque.append(data)

    def tts(self, text):
        """Synthesize *text* and yield raw audio chunks (generator).

        Dispatches to the classic backend or CosyVoice depending on the
        model name given at construction.
        """
        print(f"--QwenTTS--tts_stream begin-- {text} {self.is_cosyvoice} {self.voice}")  # cyx
        try:
            if self.is_cosyvoice is False:
                self.callback = self.Callback()
                TTSSpeechSynthesizer.call(model=self.model_name,
                                          text=text,
                                          callback=self.callback,
                                          format="wav")
            else:
                self.callback = self.Callback_Cosy(None)
                format = self.get_audio_format(self.format, self.sample_rate)
                self.synthesizer = CosySpeechSynthesizer(
                    model='cosyvoice-v1',
                    voice=self.voice,
                    callback=self.callback,
                    format=format,
                )
                self.synthesizer.call(text)
        except Exception as e:
            print(f"---dale---20 error {e}")  # cyx
        # -----------------------------------
        try:
            for data in self.callback._run():
                yield data
        except Exception as e:
            raise RuntimeError(f"**ERROR**: {e}")

    def init_streaming_call(self, on_data):
        """Prepare a CosyVoice streaming synthesizer; audio goes to *on_data*."""
        try:
            self.callback = self.Callback_Cosy(on_data)
            format = self.get_audio_format(self.format, self.sample_rate)
            self.synthesizer = CosySpeechSynthesizer(
                model='cosyvoice-v1',
                voice=self.voice,
                callback=self.callback,
                format=format,
            )
        except Exception as e:
            print(f"---dale---30 error {e}")  # cyx

    def streaming_call(self, text):
        """Feed a text fragment into the open streaming synthesis."""
        if self.synthesizer:
            self.synthesizer.streaming_call(text)

    def end_streaming_call(self):
        """Signal end-of-text; flushes any remaining audio."""
        if self.synthesizer:
            self.synthesizer.streaming_complete()

    def get_audio_format(self, format: str, sample_rate: int):
        """Map (sample_rate, format) to a tts_v2 AudioFormat enum member.

        Falls back to MP3 16 kHz mono 128 kbps for unknown combinations.
        """
        format_map = {
            (8000, 'mp3'): AudioFormat.MP3_8000HZ_MONO_128KBPS,
            (8000, 'pcm'): AudioFormat.PCM_8000HZ_MONO_16BIT,
            (8000, 'wav'): AudioFormat.WAV_8000HZ_MONO_16BIT,
            (16000, 'pcm'): AudioFormat.PCM_16000HZ_MONO_16BIT,
            (22050, 'mp3'): AudioFormat.MP3_22050HZ_MONO_256KBPS,
            (22050, 'pcm'): AudioFormat.PCM_22050HZ_MONO_16BIT,
            (22050, 'wav'): AudioFormat.WAV_22050HZ_MONO_16BIT,
            (44100, 'mp3'): AudioFormat.MP3_44100HZ_MONO_256KBPS,
            (44100, 'pcm'): AudioFormat.PCM_44100HZ_MONO_16BIT,
            (44100, 'wav'): AudioFormat.WAV_44100HZ_MONO_16BIT,
            (48000, 'mp3'): AudioFormat.MP3_48000HZ_MONO_256KBPS,
            (48000, 'pcm'): AudioFormat.PCM_48000HZ_MONO_16BIT,
            (48000, 'wav'): AudioFormat.WAV_48000HZ_MONO_16BIT,
        }
        return format_map.get((sample_rate, format),
                              AudioFormat.MP3_16000HZ_MONO_128KBPS)


class StreamSessionManager:
    """Manages concurrent TTS streaming sessions keyed by a UUID.

    Each session owns a QwenTTS model, a thread-safe audio buffer, a text
    task queue, and a daemon worker thread (`_process_tasks`).
    """

    def __init__(self):
        # {session_id: {'tts_model': obj, 'buffer': Queue, 'task_queue': Queue, ...}}
        self.sessions = {}
        self.lock = threading.Lock()
        self.executor = ThreadPoolExecutor(max_workers=30)  # fixed-size pool
        self.gc_interval = 300  # idle-session GC: 5 minutes
        # Seconds of silence before the TTS stream is finalized.  The LLM may
        # take a while to start emitting text; raised from 3s to 10s (2025-05-24).
        self.gc_tts = 10

    def create_session(self, tts_model, sample_rate=8000, stream_format='mp3',
                       voice='cosyvoice-v1/longxiaochun'):
        """Create a session and start its worker thread; returns the session id."""
        session_id = str(uuid.uuid4())

        def on_audio_data(chunk):
            session = self.sessions.get(session_id)
            # BUGFIX(review): the TTS callback can fire after the session has
            # been garbage-collected; the original dereferenced None here.
            if session is None:
                return
            first_chunk = not session['tts_chunk_data_valid']
            if session['stream_format'] == 'wav' and first_chunk:
                # Fade in the first WAV chunk to avoid an audible click.
                chunk_len = len(chunk)
                fade_len = 1024 if chunk_len > 2048 else chunk_len
                session['buffer'].put(audio_fade_in(chunk, fade_len))
            else:
                session['buffer'].put(chunk)
            session['last_active'] = time.time()
            session['audio_chunk_count'] = session['audio_chunk_count'] + 1
            if session['tts_chunk_data_valid'] is False:
                # 2025-05-10: signals that the TTS backend has responded and
                # the frontend can be notified.
                session['tts_chunk_data_valid'] = True

        with self.lock:
            ali_tts_model = QwenTTS(ALI_KEY, stream_format, sample_rate,
                                    voice.split('@')[0])
            self.sessions[session_id] = {
                'tts_model': ali_tts_model,
                'buffer': queue.Queue(maxsize=300),  # thread-safe audio queue
                'task_queue': queue.Queue(),
                'active': True,
                'last_active': time.time(),
                'audio_chunk_count': 0,
                'finished': threading.Event(),
                'sample_rate': sample_rate,
                'stream_format': stream_format,
                "tts_chunk_data_valid": False,
                'voice': voice,
            }
        self.sessions[session_id]['tts_model'].init_streaming_call(on_audio_data)
        # Start the per-session worker thread.
        threading.Thread(target=self._process_tasks, args=(session_id,),
                         daemon=True).start()
        return session_id

    def append_text(self, session_id, text):
        """Queue a text fragment for synthesis (non-blocking)."""
        with self.lock:
            session = self.sessions.get(session_id)
            if not session:
                return
            try:
                session['task_queue'].put(text, block=False)
            except queue.Full:
                logging.warning(f"Session {session_id} task queue full")

    def _process_tasks(self, session_id):
        """Worker loop (one thread per session).

        Batches up to 5 queued text fragments, feeds them to the streaming
        synthesizer, and enforces the idle timeouts (gc_tts finalizes the
        stream, gc_interval destroys the session).
        """
        while True:
            session = self.sessions.get(session_id)
            if not session or not session['active']:
                break
            try:
                # Merge up to 5 text chunks (each get waits at most 100 ms).
                texts = []
                while len(texts) < 5:
                    try:
                        text = session['task_queue'].get(timeout=0.1)
                        texts.append(text)
                    except queue.Empty:
                        break
                if texts:
                    session['last_active'] = time.time()
                    session['tts_model'].streaming_call(''.join(texts))
                    session['last_active'] = time.time()
                # Session idle timeout: destroy everything.
                if time.time() - session['last_active'] > self.gc_interval:
                    self.close_session(session_id)
                    break
                # TTS idle timeout: finalize the audio stream.
                if time.time() - session['last_active'] > self.gc_tts:
                    session['tts_model'].end_streaming_call()
                    session['finished'].set()
                    break
            except Exception as e:
                logging.error(f"Task processing error: {str(e)}")

    def _generate_audio(self, session_id, text):
        """Generate audio for *text* into the session buffer (pool worker).

        NOTE(review): currently unreferenced (the submit call in
        _process_tasks is commented out); kept for the non-streaming path.
        """
        session = self.sessions.get(session_id)
        if not session:
            return
        first_chunk = True
        logging.info(f"转换开始!!! {text}")
        try:
            # BUGFIX(review): original passed (text, sample_rate, stream_format)
            # but QwenTTS.tts() only accepts text — that call always raised
            # TypeError.  The model was already constructed with the session's
            # sample rate and format.
            for chunk in session['tts_model'].tts(text):
                if session['stream_format'] == 'wav':
                    if first_chunk:
                        # Fade in the first WAV chunk to avoid a click.
                        chunk_len = len(chunk)
                        if chunk_len > 2048:
                            session['buffer'].put(audio_fade_in(chunk, 1024))
                        else:
                            session['buffer'].put(audio_fade_in(chunk, chunk_len))
                        first_chunk = False
                    else:
                        session['buffer'].put(chunk)
                else:
                    session['buffer'].put(chunk)
                session['last_active'] = time.time()
                session['audio_chunk_count'] = session['audio_chunk_count'] + 1
                if session['tts_chunk_data_valid'] is False:
                    # 2025-05-10: TTS backend responded; frontend can be notified.
                    session['tts_chunk_data_valid'] = True
            logging.info(f"转换结束!!! {session['audio_chunk_count'] }")
        except Exception as e:
            session['buffer'].put(f"ERROR:{str(e)}")
            logging.info(f"--_generate_audio--error {str(e)}")

    def close_session(self, session_id):
        """Mark the session inactive and schedule resource cleanup."""
        with self.lock:
            if session_id in self.sessions:
                logging.info(f"--Session {session_id} close_session")
                self.sessions[session_id]['active'] = False
                # Clean up after a 1-second grace period so in-flight
                # callbacks can finish.
                threading.Timer(1, self._clean_session, args=[session_id]).start()

    def _clean_session(self, session_id):
        with self.lock:
            if session_id in self.sessions:
                del self.sessions[session_id]

    def get_session(self, session_id):
        return self.sessions.get(session_id)


stream_manager_w_stream = StreamSessionManager()


def audio_fade_in(audio_data, fade_length):
    """Apply a linear fade-in over the first *fade_length* samples.

    Assumes *audio_data* is 16-bit mono PCM (little-endian); raises
    ValueError if the byte length is odd.  Returns new bytes.
    """
    samples = array.array('h', audio_data)
    for i in range(fade_length):
        fade_factor = i / fade_length
        samples[i] = int(samples[i] * fade_factor)
    return samples.tobytes()