Main changes are in asr-monitor-test: reworked the mini-program phone-number login, the mini-program's TTS generation (查一查, AI), and added payment-related features
367  api/db/services/ali_tts_service.py  Normal file
@@ -0,0 +1,367 @@
import asyncio, logging
from collections import deque
import threading, time, queue, uuid, array
from concurrent.futures import ThreadPoolExecutor

ALI_KEY = "sk-a47a3fb5f4a94f66bbaf713779101c75"
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import (
    ResultCallback as TTSResultCallback,
    SpeechSynthesizer as TTSSpeechSynthesizer,
    SpeechSynthesisResult as TTSSpeechSynthesisResult,
)
# cyx 2025-01-19: test CosyVoice through the tts_v2 API
from dashscope.audio.tts_v2 import (
    ResultCallback as CosyResultCallback,
    SpeechSynthesizer as CosySpeechSynthesizer,
    AudioFormat,
)

class QwenTTS:
    def __init__(self, key, format="mp3", sample_rate=44100, model_name="cosyvoice-v1/longyuan"):
        import dashscope
        import ssl
        logging.info(f"---QwenTTS Constructor-- {format} {sample_rate} {model_name}")  # cyx
        self.model_name = model_name
        dashscope.api_key = key
        ssl._create_default_https_context = ssl._create_unverified_context  # disable SSL certificate verification
        self.synthesizer = None
        self.callback = None
        self.is_cosyvoice = False
        self.voice = ""
        self.format = format
        self.sample_rate = sample_rate
        if '/' in model_name:
            parts = model_name.split('/', 1)
            # the split yields two strings: parts[0] (model) and parts[1] (voice)
            if parts[0] == 'cosyvoice-v1':
                self.is_cosyvoice = True
                self.voice = parts[1]

    class Callback(TTSResultCallback):
        def __init__(self) -> None:
            self.dque = deque()

        def _run(self):
            while True:
                if not self.dque:
                    time.sleep(0)
                    continue
                val = self.dque.popleft()
                if val:
                    yield val
                else:
                    break

        def on_open(self):
            pass

        def on_complete(self):
            self.dque.append(None)

        def on_error(self, response: SpeechSynthesisResponse):
            print("Qwen tts error", str(response))
            raise RuntimeError(str(response))

        def on_close(self):
            pass

        def on_event(self, result: TTSSpeechSynthesisResult):
            if result.get_audio_frame() is not None:
                self.dque.append(result.get_audio_frame())

    # --------------------------

    class Callback_Cosy(CosyResultCallback):
        def __init__(self, on_audio_data) -> None:
            self.dque = deque()
            self.on_audio_data = on_audio_data

        def _run(self):
            while True:
                if not self.dque:
                    time.sleep(0)
                    continue
                val = self.dque.popleft()
                if val:
                    yield val
                else:
                    break

        def on_open(self):
            logging.info("---Qwen tts on_open---")

        def on_complete(self):
            self.dque.append(None)

        def on_error(self, response: SpeechSynthesisResponse):
            print("Qwen tts error", str(response))
            raise RuntimeError(str(response))

        def on_close(self):
            # print("---Qwen call back close") # cyx
            logging.info("---Qwen tts on_close---")

        """ disabled while testing the CosyVoice speech LLM
        def on_event(self, result: SpeechSynthesisResult):
            if result.get_audio_frame() is not None:
                self.dque.append(result.get_audio_frame())
        """

        def on_event(self, message):
            # print(f"recv speech synthesis message {message}")
            pass

        # the following applies to the CosyVoice speech LLM
        def on_data(self, data: bytes) -> None:
            if len(data) > 0:
                if self.on_audio_data:
                    self.on_audio_data(data)
                else:
                    self.dque.append(data)

    # --------------------------

    def tts(self, text):
        print(f"--QwenTTS--tts_stream begin-- {text} {self.is_cosyvoice} {self.voice}")  # cyx
        # text = self.normalize_text(text)
        try:
            # if self.model_name != 'cosyvoice-v1':
            if self.is_cosyvoice is False:
                self.callback = self.Callback()
                TTSSpeechSynthesizer.call(model=self.model_name,
                                          text=text,
                                          callback=self.callback,
                                          format="wav")  # format="mp3")
            else:
                self.callback = self.Callback_Cosy(None)
                format = self.get_audio_format(self.format, self.sample_rate)
                self.synthesizer = CosySpeechSynthesizer(
                    model='cosyvoice-v1',
                    # voice="longyuan", #"longfei",
                    voice=self.voice,
                    callback=self.callback,
                    format=format
                )
                self.synthesizer.call(text)
        except Exception as e:
            print(f"---dale---20 error {e}")  # cyx
        # -----------------------------------
        try:
            for data in self.callback._run():
                # logging.info(f"dashscope return data {len(data)}")
                yield data
                # print(f"---Qwen return data {num_tokens_from_string(text)}")
                # yield num_tokens_from_string(text)

        except Exception as e:
            raise RuntimeError(f"**ERROR**: {e}")

    def init_streaming_call(self, on_data):
        try:
            self.callback = self.Callback_Cosy(on_data)
            format = self.get_audio_format(self.format, self.sample_rate)
            self.synthesizer = CosySpeechSynthesizer(
                model='cosyvoice-v1',
                # voice="longyuan", #"longfei",
                voice=self.voice,
                callback=self.callback,
                format=format
            )
        except Exception as e:
            print(f"---dale---30 error {e}")  # cyx
        # -----------------------------------

    def streaming_call(self, text):
        if self.synthesizer:
            self.synthesizer.streaming_call(text)

    def end_streaming_call(self):
        if self.synthesizer:
            self.synthesizer.streaming_complete()

    def get_audio_format(self, format: str, sample_rate: int):
        """Map (sample_rate, format) to the matching dashscope AudioFormat."""
        from dashscope.audio.tts_v2 import AudioFormat
        format_map = {
            (8000, 'mp3'): AudioFormat.MP3_8000HZ_MONO_128KBPS,
            (8000, 'pcm'): AudioFormat.PCM_8000HZ_MONO_16BIT,
            (8000, 'wav'): AudioFormat.WAV_8000HZ_MONO_16BIT,
            (16000, 'pcm'): AudioFormat.PCM_16000HZ_MONO_16BIT,
            (22050, 'mp3'): AudioFormat.MP3_22050HZ_MONO_256KBPS,
            (22050, 'pcm'): AudioFormat.PCM_22050HZ_MONO_16BIT,
            (22050, 'wav'): AudioFormat.WAV_22050HZ_MONO_16BIT,
            (44100, 'mp3'): AudioFormat.MP3_44100HZ_MONO_256KBPS,
            (44100, 'pcm'): AudioFormat.PCM_44100HZ_MONO_16BIT,
            (44100, 'wav'): AudioFormat.WAV_44100HZ_MONO_16BIT,
            (48000, 'mp3'): AudioFormat.MP3_48000HZ_MONO_256KBPS,
            (48000, 'pcm'): AudioFormat.PCM_48000HZ_MONO_16BIT,
            (48000, 'wav'): AudioFormat.WAV_48000HZ_MONO_16BIT,
        }
        return format_map.get((sample_rate, format), AudioFormat.MP3_16000HZ_MONO_128KBPS)

class StreamSessionManager:
    def __init__(self):
        self.sessions = {}  # {session_id: {'tts_model': obj, 'buffer': queue, 'task_queue': Queue}}
        self.lock = threading.Lock()
        self.executor = ThreadPoolExecutor(max_workers=30)  # fixed-size thread pool
        self.gc_interval = 300  # clean up every 5 minutes (5 x 60 = 300 s)
        self.gc_tts = 10  # 10 s; the LLM may take a while before it starts emitting text (raised from 3 s to 10 s on 2025-05-24)

    def create_session(self, tts_model, sample_rate=8000, stream_format='mp3', voice='cosyvoice-v1/longxiaochun'):
        session_id = str(uuid.uuid4())

        def on_audio_data(chunk):
            session = self.sessions.get(session_id)
            first_chunk = not session['tts_chunk_data_valid']
            if session['stream_format'] == 'wav':
                if first_chunk:
                    chunk_len = len(chunk)
                    if chunk_len > 2048:
                        session['buffer'].put(audio_fade_in(chunk, 1024))
                    else:
                        session['buffer'].put(audio_fade_in(chunk, chunk_len))
                else:
                    session['buffer'].put(chunk)
            else:
                session['buffer'].put(chunk)
            session['last_active'] = time.time()
            session['audio_chunk_count'] = session['audio_chunk_count'] + 1
            if session['tts_chunk_data_valid'] is False:
                session['tts_chunk_data_valid'] = True  # added 2025-05-10: the TTS backend has started returning data, so the frontend can be notified

        with self.lock:
            ali_tts_model = QwenTTS(ALI_KEY, stream_format, sample_rate, voice.split('@')[0])
            self.sessions[session_id] = {
                'tts_model': ali_tts_model,  # tts_model,
                'buffer': queue.Queue(maxsize=300),  # thread-safe queue
                'task_queue': queue.Queue(),
                'active': True,
                'last_active': time.time(),
                'audio_chunk_count': 0,
                'finished': threading.Event(),  # completion event
                'sample_rate': sample_rate,
                'stream_format': stream_format,
                'tts_chunk_data_valid': False,
                'voice': voice,
            }
            self.sessions[session_id]['tts_model'].init_streaming_call(on_audio_data)
            # start the per-session task-processing thread
            threading.Thread(target=self._process_tasks, args=(session_id,), daemon=True).start()
        return session_id

    def append_text(self, session_id, text):
        with self.lock:
            session = self.sessions.get(session_id)
            if not session:
                return
            # put the text into the task queue (non-blocking)
            # logging.info(f"StreamSessionManager append_text {text}")
            try:
                session['task_queue'].put(text, block=False)
            except queue.Full:
                logging.warning(f"Session {session_id} task queue full")

    def _process_tasks(self, session_id):
        """Task-processing loop (one thread per session)."""
        while True:
            session = self.sessions.get(session_id)
            if not session or not session['active']:
                break
            try:
                # logging.info(f"StreamSessionManager _process_tasks {session['task_queue'].qsize()}")
                # merge several text chunks (each get waits up to 100 ms)
                texts = []
                while len(texts) < 5:  # merge at most 5 text chunks
                    try:
                        text = session['task_queue'].get(timeout=0.1)
                        # logging.info(f"StreamSessionManager _process_tasks --0 {len(texts)}")
                        texts.append(text)
                    except queue.Empty:
                        break

                if texts:
                    session['last_active'] = time.time()  # text was processed, so reset the last-active time
                    # submit to the thread pool
                    # future = self.executor.submit(
                    #     self._generate_audio,
                    #     session_id,
                    #     ' '.join(texts)  # merge texts to reduce the number of requests
                    # )
                    # future.result()  # wait for the conversion task to finish
                    session['tts_model'].streaming_call(''.join(texts))
                    session['last_active'] = time.time()
                # session timeout checks
                if time.time() - session['last_active'] > self.gc_interval:
                    self.close_session(session_id)
                    break
                if time.time() - session['last_active'] > self.gc_tts:
                    session['tts_model'].end_streaming_call()
                    session['finished'].set()
                    break

            except Exception as e:
                logging.error(f"Task processing error: {str(e)}")

    def _generate_audio(self, session_id, text):
        """Actually generate audio (meant to run in the thread pool; currently unused, see the commented-out executor.submit above)."""
        session = self.sessions.get(session_id)
        if not session:
            return
        # logging.info(f"_generate_audio:{text}")
        first_chunk = True
        logging.info(f"Conversion started!!! {text}")
        try:
            # QwenTTS.tts() takes only the text; format and sample rate are set in the constructor
            for chunk in session['tts_model'].tts(text):
                if session['stream_format'] == 'wav':
                    if first_chunk:
                        chunk_len = len(chunk)
                        if chunk_len > 2048:
                            session['buffer'].put(audio_fade_in(chunk, 1024))
                        else:
                            session['buffer'].put(audio_fade_in(chunk, chunk_len))
                        first_chunk = False
                    else:
                        session['buffer'].put(chunk)
                else:
                    session['buffer'].put(chunk)
                session['last_active'] = time.time()
                session['audio_chunk_count'] = session['audio_chunk_count'] + 1
                if session['tts_chunk_data_valid'] is False:
                    session['tts_chunk_data_valid'] = True  # added 2025-05-10: the TTS backend has started returning data, so the frontend can be notified
            logging.info(f"Conversion finished!!! {session['audio_chunk_count']}")
        except Exception as e:
            session['buffer'].put(f"ERROR:{str(e)}")
            logging.info(f"--_generate_audio--error {str(e)}")

    def close_session(self, session_id):
        with self.lock:
            if session_id in self.sessions:
                logging.info(f"--Session {session_id} close_session")
                # mark the session as inactive
                self.sessions[session_id]['active'] = False
                # clean up resources after a 1-second delay
                threading.Timer(1, self._clean_session, args=[session_id]).start()

    def _clean_session(self, session_id):
        with self.lock:
            if session_id in self.sessions:
                del self.sessions[session_id]

    def get_session(self, session_id):
        return self.sessions.get(session_id)


stream_manager_w_stream = StreamSessionManager()

def audio_fade_in(audio_data, fade_length):
    # assume the audio data is 16-bit mono PCM
    # convert the raw bytes to an array of samples
    samples = array.array('h', audio_data)

    # fade in over the first fade_length samples (clamped to the sample count,
    # since callers pass byte lengths while samples are 2 bytes each)
    fade_length = min(fade_length, len(samples))
    for i in range(fade_length):
        fade_factor = i / fade_length
        samples[i] = int(samples[i] * fade_factor)

    # convert the sample array back to bytes
    return samples.tobytes()
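
A minimal usage sketch of the streaming pipeline added above, assuming the module is importable as api.db.services.ali_tts_service: create a session, feed it text as the LLM streams it out, drain the audio buffer, then close. The consume_audio helper, output path, and sample text are illustrative assumptions, not part of this commit.

# illustrative sketch only; consume_audio and llm_text_chunks are hypothetical
import queue
from api.db.services.ali_tts_service import stream_manager_w_stream

def consume_audio(session_id, out_path="reply.mp3"):
    session = stream_manager_w_stream.get_session(session_id)
    with open(out_path, "wb") as f:
        while True:
            try:
                chunk = session['buffer'].get(timeout=1)
            except queue.Empty:
                # the worker sets 'finished' after gc_tts (~10 s) of idle text input
                if session['finished'].is_set():
                    break
                continue
            if isinstance(chunk, str) and chunk.startswith("ERROR:"):
                raise RuntimeError(chunk)
            f.write(chunk)

# text pieces as they stream out of the LLM (hypothetical data)
llm_text_chunks = ["你好,", "这里是小程序的语音回复。"]
session_id = stream_manager_w_stream.create_session(
    None, sample_rate=16000, stream_format='mp3', voice='cosyvoice-v1/longxiaochun')
for piece in llm_text_chunks:
    stream_manager_w_stream.append_text(session_id, piece)
consume_audio(session_id)
stream_manager_w_stream.close_session(session_id)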