Main changes are in asr-monitor-test: reworked the mini-program phone-number login, the mini-program's TTS generation (查一查, AI), and added payment-related features
367  api/db/services/ali_tts_service.py  Normal file
@@ -0,0 +1,367 @@
import asyncio, logging
from collections import deque
import threading, time, queue, uuid, array
from concurrent.futures import ThreadPoolExecutor

ALI_KEY = "sk-a47a3fb5f4a94f66bbaf713779101c75"
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import (
    ResultCallback as TTSResultCallback,
    SpeechSynthesizer as TTSSpeechSynthesizer,
    SpeechSynthesisResult as TTSSpeechSynthesisResult,
)
# cyx 2025-01-19: test CosyVoice through the tts_v2 API
from dashscope.audio.tts_v2 import (
    ResultCallback as CosyResultCallback,
    SpeechSynthesizer as CosySpeechSynthesizer,
    AudioFormat,
)

class QwenTTS:
    def __init__(self, key, format="mp3", sample_rate=44100, model_name="cosyvoice-v1/longyuan"):
        import dashscope
        import ssl
        logging.info(f"---QwenTTS Constructor-- {format} {sample_rate} {model_name}")  # cyx
        self.model_name = model_name
        dashscope.api_key = key
        ssl._create_default_https_context = ssl._create_unverified_context  # disable SSL certificate verification
        self.synthesizer = None
        self.callback = None
        self.is_cosyvoice = False
        self.voice = ""
        self.format = format
        self.sample_rate = sample_rate
        if '/' in model_name:
            parts = model_name.split('/', 1)
            # the split yields two strings: parts[0] (model) and parts[1] (voice)
            if parts[0] == 'cosyvoice-v1':
                self.is_cosyvoice = True
                self.voice = parts[1]

    class Callback(TTSResultCallback):
        def __init__(self) -> None:
            self.dque = deque()

        def _run(self):
            while True:
                if not self.dque:
                    time.sleep(0)
                    continue
                val = self.dque.popleft()
                if val:
                    yield val
                else:
                    break

        def on_open(self):
            pass

        def on_complete(self):
            self.dque.append(None)

        def on_error(self, response: SpeechSynthesisResponse):
            print("Qwen tts error", str(response))
            raise RuntimeError(str(response))

        def on_close(self):
            pass

        def on_event(self, result: TTSSpeechSynthesisResult):
            if result.get_audio_frame() is not None:
                self.dque.append(result.get_audio_frame())

    # --------------------------

    class Callback_Cosy(CosyResultCallback):
        def __init__(self, on_audio_data) -> None:
            self.dque = deque()
            self.on_audio_data = on_audio_data

        def _run(self):
            while True:
                if not self.dque:
                    time.sleep(0)
                    continue
                val = self.dque.popleft()
                if val:
                    yield val
                else:
                    break

        def on_open(self):
            logging.info("---Qwen tts on_open---")

        def on_complete(self):
            self.dque.append(None)

        def on_error(self, response: SpeechSynthesisResponse):
            print("Qwen tts error", str(response))
            raise RuntimeError(str(response))

        def on_close(self):
            # print("---Qwen call back close") # cyx
            logging.info("---Qwen tts on_close---")

        """ disabled while testing the CosyVoice speech LLM
        def on_event(self, result: SpeechSynthesisResult):
            if result.get_audio_frame() is not None:
                self.dque.append(result.get_audio_frame())
        """

        def on_event(self, message):
            # print(f"recv speech synthesis message {message}")
            pass

        # the following applies to the CosyVoice speech LLM
        def on_data(self, data: bytes) -> None:
            if len(data) > 0:
                if self.on_audio_data:
                    self.on_audio_data(data)
                else:
                    self.dque.append(data)

    # --------------------------

    def tts(self, text):
        print(f"--QwenTTS--tts_stream begin-- {text} {self.is_cosyvoice} {self.voice}")  # cyx
        # text = self.normalize_text(text)
        try:
            # if self.model_name != 'cosyvoice-v1':
            if self.is_cosyvoice is False:
                self.callback = self.Callback()
                TTSSpeechSynthesizer.call(model=self.model_name,
                                          text=text,
                                          callback=self.callback,
                                          format="wav")  # format="mp3")
            else:
                self.callback = self.Callback_Cosy(None)
                format = self.get_audio_format(self.format, self.sample_rate)
                self.synthesizer = CosySpeechSynthesizer(
                    model='cosyvoice-v1',
                    # voice="longyuan", #"longfei",
                    voice=self.voice,
                    callback=self.callback,
                    format=format
                )
                self.synthesizer.call(text)
        except Exception as e:
            print(f"---dale---20 error {e}")  # cyx
        # -----------------------------------
        try:
            for data in self.callback._run():
                # logging.info(f"dashscope return data {len(data)}")
                yield data
                # print(f"---Qwen return data {num_tokens_from_string(text)}")
                # yield num_tokens_from_string(text)

        except Exception as e:
            raise RuntimeError(f"**ERROR**: {e}")

    def init_streaming_call(self, on_data):
        try:
            self.callback = self.Callback_Cosy(on_data)
            format = self.get_audio_format(self.format, self.sample_rate)
            self.synthesizer = CosySpeechSynthesizer(
                model='cosyvoice-v1',
                # voice="longyuan", #"longfei",
                voice=self.voice,
                callback=self.callback,
                format=format
            )
        except Exception as e:
            print(f"---dale---30 error {e}")  # cyx
        # -----------------------------------

    def streaming_call(self, text):
        if self.synthesizer:
            self.synthesizer.streaming_call(text)

    def end_streaming_call(self):
        if self.synthesizer:
            self.synthesizer.streaming_complete()

    def get_audio_format(self, format: str, sample_rate: int):
        """Map (sample_rate, format) to the matching dashscope AudioFormat."""
        from dashscope.audio.tts_v2 import AudioFormat
        format_map = {
            (8000, 'mp3'): AudioFormat.MP3_8000HZ_MONO_128KBPS,
            (8000, 'pcm'): AudioFormat.PCM_8000HZ_MONO_16BIT,
            (8000, 'wav'): AudioFormat.WAV_8000HZ_MONO_16BIT,
            (16000, 'pcm'): AudioFormat.PCM_16000HZ_MONO_16BIT,
            (22050, 'mp3'): AudioFormat.MP3_22050HZ_MONO_256KBPS,
            (22050, 'pcm'): AudioFormat.PCM_22050HZ_MONO_16BIT,
            (22050, 'wav'): AudioFormat.WAV_22050HZ_MONO_16BIT,
            (44100, 'mp3'): AudioFormat.MP3_44100HZ_MONO_256KBPS,
            (44100, 'pcm'): AudioFormat.PCM_44100HZ_MONO_16BIT,
            (44100, 'wav'): AudioFormat.WAV_44100HZ_MONO_16BIT,
            (48000, 'mp3'): AudioFormat.MP3_48000HZ_MONO_256KBPS,
            (48000, 'pcm'): AudioFormat.PCM_48000HZ_MONO_16BIT,
            (48000, 'wav'): AudioFormat.WAV_48000HZ_MONO_16BIT,
        }
        return format_map.get((sample_rate, format), AudioFormat.MP3_16000HZ_MONO_128KBPS)

class StreamSessionManager:
    def __init__(self):
        self.sessions = {}  # {session_id: {'tts_model': obj, 'buffer': queue, 'task_queue': Queue}}
        self.lock = threading.Lock()
        self.executor = ThreadPoolExecutor(max_workers=30)  # fixed-size thread pool
        self.gc_interval = 300  # clean up every 5 minutes (5 x 60 = 300 s)
        self.gc_tts = 10  # 10 s; the LLM may take a while before it starts emitting text (raised from 3 s to 10 s on 2025-05-24)

    def create_session(self, tts_model, sample_rate=8000, stream_format='mp3', voice='cosyvoice-v1/longxiaochun'):
        session_id = str(uuid.uuid4())

        def on_audio_data(chunk):
            session = self.sessions.get(session_id)
            first_chunk = not session['tts_chunk_data_valid']
            if session['stream_format'] == 'wav':
                if first_chunk:
                    chunk_len = len(chunk)
                    if chunk_len > 2048:
                        session['buffer'].put(audio_fade_in(chunk, 1024))
                    else:
                        session['buffer'].put(audio_fade_in(chunk, chunk_len))
                else:
                    session['buffer'].put(chunk)
            else:
                session['buffer'].put(chunk)
            session['last_active'] = time.time()
            session['audio_chunk_count'] = session['audio_chunk_count'] + 1
            if session['tts_chunk_data_valid'] is False:
                session['tts_chunk_data_valid'] = True  # added 2025-05-10: the TTS backend has started returning data, so the frontend can be notified

        with self.lock:
            ali_tts_model = QwenTTS(ALI_KEY, stream_format, sample_rate, voice.split('@')[0])
            self.sessions[session_id] = {
                'tts_model': ali_tts_model,  # tts_model,
                'buffer': queue.Queue(maxsize=300),  # thread-safe queue
                'task_queue': queue.Queue(),
                'active': True,
                'last_active': time.time(),
                'audio_chunk_count': 0,
                'finished': threading.Event(),  # completion event
                'sample_rate': sample_rate,
                'stream_format': stream_format,
                'tts_chunk_data_valid': False,
                'voice': voice,
            }
            self.sessions[session_id]['tts_model'].init_streaming_call(on_audio_data)
            # start the per-session task-processing thread
            threading.Thread(target=self._process_tasks, args=(session_id,), daemon=True).start()
        return session_id

    def append_text(self, session_id, text):
        with self.lock:
            session = self.sessions.get(session_id)
            if not session:
                return
            # put the text into the task queue (non-blocking)
            # logging.info(f"StreamSessionManager append_text {text}")
            try:
                session['task_queue'].put(text, block=False)
            except queue.Full:
                logging.warning(f"Session {session_id} task queue full")

    def _process_tasks(self, session_id):
        """Task-processing loop (one thread per session)."""
        while True:
            session = self.sessions.get(session_id)
            if not session or not session['active']:
                break
            try:
                # logging.info(f"StreamSessionManager _process_tasks {session['task_queue'].qsize()}")
                # merge several text chunks (each get waits up to 100 ms)
                texts = []
                while len(texts) < 5:  # merge at most 5 text chunks
                    try:
                        text = session['task_queue'].get(timeout=0.1)
                        # logging.info(f"StreamSessionManager _process_tasks --0 {len(texts)}")
                        texts.append(text)
                    except queue.Empty:
                        break

                if texts:
                    session['last_active'] = time.time()  # text was processed, so reset the last-active time
                    # submit to the thread pool
                    # future = self.executor.submit(
                    #     self._generate_audio,
                    #     session_id,
                    #     ' '.join(texts)  # merge texts to reduce the number of requests
                    # )
                    # future.result()  # wait for the conversion task to finish
                    session['tts_model'].streaming_call(''.join(texts))
                    session['last_active'] = time.time()
                # session timeout checks
                if time.time() - session['last_active'] > self.gc_interval:
                    self.close_session(session_id)
                    break
                if time.time() - session['last_active'] > self.gc_tts:
                    session['tts_model'].end_streaming_call()
                    session['finished'].set()
                    break

            except Exception as e:
                logging.error(f"Task processing error: {str(e)}")

    def _generate_audio(self, session_id, text):
        """Actually generate audio (meant to run in the thread pool; currently unused, see the commented-out executor.submit above)."""
        session = self.sessions.get(session_id)
        if not session:
            return
        # logging.info(f"_generate_audio:{text}")
        first_chunk = True
        logging.info(f"Conversion started!!! {text}")
        try:
            # QwenTTS.tts() takes only the text; format and sample rate are set in the constructor
            for chunk in session['tts_model'].tts(text):
                if session['stream_format'] == 'wav':
                    if first_chunk:
                        chunk_len = len(chunk)
                        if chunk_len > 2048:
                            session['buffer'].put(audio_fade_in(chunk, 1024))
                        else:
                            session['buffer'].put(audio_fade_in(chunk, chunk_len))
                        first_chunk = False
                    else:
                        session['buffer'].put(chunk)
                else:
                    session['buffer'].put(chunk)
                session['last_active'] = time.time()
                session['audio_chunk_count'] = session['audio_chunk_count'] + 1
                if session['tts_chunk_data_valid'] is False:
                    session['tts_chunk_data_valid'] = True  # added 2025-05-10: the TTS backend has started returning data, so the frontend can be notified
            logging.info(f"Conversion finished!!! {session['audio_chunk_count']}")
        except Exception as e:
            session['buffer'].put(f"ERROR:{str(e)}")
            logging.info(f"--_generate_audio--error {str(e)}")

    def close_session(self, session_id):
        with self.lock:
            if session_id in self.sessions:
                logging.info(f"--Session {session_id} close_session")
                # mark the session as inactive
                self.sessions[session_id]['active'] = False
                # clean up resources after a 1-second delay
                threading.Timer(1, self._clean_session, args=[session_id]).start()

    def _clean_session(self, session_id):
        with self.lock:
            if session_id in self.sessions:
                del self.sessions[session_id]

    def get_session(self, session_id):
        return self.sessions.get(session_id)


stream_manager_w_stream = StreamSessionManager()

def audio_fade_in(audio_data, fade_length):
    # assume the audio data is 16-bit mono PCM
    # convert the raw bytes to an array of samples
    samples = array.array('h', audio_data)

    # fade in over the first fade_length samples (clamped to the sample count,
    # since callers pass byte lengths while samples are 2 bytes each)
    fade_length = min(fade_length, len(samples))
    for i in range(fade_length):
        fade_factor = i / fade_length
        samples[i] = int(samples[i] * fade_factor)

    # convert the sample array back to bytes
    return samples.tobytes()
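
A minimal usage sketch of the streaming pipeline added above, assuming the module is importable as api.db.services.ali_tts_service: create a session, feed it text as the LLM streams it out, drain the audio buffer, then close. The consume_audio helper, output path, and sample text are illustrative assumptions, not part of this commit.

# illustrative sketch only; consume_audio and llm_text_chunks are hypothetical
import queue
from api.db.services.ali_tts_service import stream_manager_w_stream

def consume_audio(session_id, out_path="reply.mp3"):
    session = stream_manager_w_stream.get_session(session_id)
    with open(out_path, "wb") as f:
        while True:
            try:
                chunk = session['buffer'].get(timeout=1)
            except queue.Empty:
                # the worker sets 'finished' after gc_tts (~10 s) of idle text input
                if session['finished'].is_set():
                    break
                continue
            if isinstance(chunk, str) and chunk.startswith("ERROR:"):
                raise RuntimeError(chunk)
            f.write(chunk)

# text pieces as they stream out of the LLM (hypothetical data)
llm_text_chunks = ["你好,", "这里是小程序的语音回复。"]
session_id = stream_manager_w_stream.create_session(
    None, sample_rate=16000, stream_format='mp3', voice='cosyvoice-v1/longxiaochun')
for piece in llm_text_chunks:
    stream_manager_w_stream.append_text(session_id, piece)
consume_audio(session_id)
stream_manager_w_stream.close_session(session_id)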