feat: add SiliconFlow TTS service

This commit is contained in:
yyhhyyyyyy 2025-05-09 23:31:04 +08:00
parent 22f47d90de
commit 45f32756a3
9 changed files with 273 additions and 17 deletions

View File

@ -36,6 +36,7 @@ def save_config():
with open(config_file, "w", encoding="utf-8") as f:
_cfg["app"] = app
_cfg["azure"] = azure
_cfg["siliconflow"] = siliconflow
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))
@ -45,9 +46,13 @@ app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
proxy = _cfg.get("proxy", {})
azure = _cfg.get("azure", {})
ui = _cfg.get("ui", {
"hide_log": False,
})
siliconflow = _cfg.get("siliconflow", {})
ui = _cfg.get(
"ui",
{
"hide_log": False,
},
)
hostname = socket.gethostname()

View File

@ -6,6 +6,7 @@ from typing import Union
from xml.sax.saxutils import unescape
import edge_tts
import requests
from edge_tts import SubMaker, submaker
from edge_tts.submaker import mktimestamp
from loguru import logger
@ -15,6 +16,32 @@ from app.config import config
from app.utils import utils
def get_siliconflow_voices() -> list[str]:
    """Return the selectable SiliconFlow voices as display/identifier strings.

    Each entry has the form ``"siliconflow:<model>:<voice>-<Gender>"``, for
    example ``"siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex-Male"``. The
    trailing gender label is there for display; ``tts`` strips it again before
    calling the API.

    Returns:
        The list of voice identifiers, in the order they are declared below.
    """
    # (model, voice id, gender label shown in the UI)
    voices_with_gender = [
        ("FunAudioLLM/CosyVoice2-0.5B", "alex", "Male"),
        ("FunAudioLLM/CosyVoice2-0.5B", "anna", "Female"),
        ("FunAudioLLM/CosyVoice2-0.5B", "bella", "Female"),
        ("FunAudioLLM/CosyVoice2-0.5B", "benjamin", "Male"),
        ("FunAudioLLM/CosyVoice2-0.5B", "charles", "Male"),
        ("FunAudioLLM/CosyVoice2-0.5B", "claire", "Female"),
        ("FunAudioLLM/CosyVoice2-0.5B", "david", "Male"),
        ("FunAudioLLM/CosyVoice2-0.5B", "diana", "Female"),
    ]

    # Prefix with "siliconflow:" so is_siliconflow_voice() can recognise them.
    return [
        f"siliconflow:{model}:{voice}-{gender}"
        for model, voice, gender in voices_with_gender
    ]
def get_all_azure_voices(filter_locals=None) -> list[str]:
azure_voices_str = """
Name: af-ZA-AdriNeural
@ -1045,11 +1072,37 @@ def is_azure_v2_voice(voice_name: str):
return ""
def is_siliconflow_voice(voice_name: str) -> bool:
    """Return True if ``voice_name`` identifies a SiliconFlow voice.

    SiliconFlow voices are recognised by their ``"siliconflow:"`` prefix.
    Non-string input (e.g. ``None`` when no voice has been selected yet) is
    reported as ``False`` instead of raising ``AttributeError``.
    """
    return isinstance(voice_name, str) and voice_name.startswith("siliconflow:")
def tts(
    text: str,
    voice_name: str,
    voice_rate: float,
    voice_file: str,
    voice_volume: float = 1.0,
) -> Union[SubMaker, None]:
    """Synthesize ``text`` into ``voice_file`` using the backend implied by ``voice_name``.

    Dispatch order: Azure TTS V2 voices, then SiliconFlow voices
    (``"siliconflow:<model>:<voice>-<Gender>"``), and finally Azure TTS V1 for
    everything else.

    Args:
        text: The text to speak.
        voice_name: Voice identifier as shown in the UI.
        voice_rate: Speech rate, forwarded to the V1 and SiliconFlow backends.
        voice_file: Path of the audio file to write.
        voice_volume: Speech volume; only forwarded to the SiliconFlow backend.

    Returns:
        The SubMaker produced by the selected backend, or None on failure
        (including a malformed SiliconFlow voice name).
    """
    if is_azure_v2_voice(voice_name):
        return azure_tts_v2(text, voice_name, voice_file)

    if is_siliconflow_voice(voice_name):
        # Expected format: siliconflow:model:voice-Gender
        parts = voice_name.split(":")
        if len(parts) < 3:
            logger.error(f"Invalid siliconflow voice name format: {voice_name}")
            return None

        model = parts[1]
        # Drop only the trailing gender label ("alex-Male" -> "alex") so that a
        # voice id which itself contains a hyphen is not truncated.
        voice = parts[2].rsplit("-", 1)[0]
        if not model or not voice:
            logger.error(f"Invalid siliconflow voice name format: {voice_name}")
            return None

        # The API expects the voice parameter as "model:voice".
        full_voice = f"{model}:{voice}"
        return siliconflow_tts(
            text, model, full_voice, voice_rate, voice_file, voice_volume
        )

    return azure_tts_v1(text, voice_name, voice_rate, voice_file)
@ -1098,6 +1151,144 @@ def azure_tts_v1(
return None
def siliconflow_tts(
    text: str,
    model: str,
    voice: str,
    voice_rate: float,
    voice_file: str,
    voice_volume: float = 1.0,
) -> Union[SubMaker, None]:
    """Generate speech with the SiliconFlow API and estimate subtitle timings.

    The audio is requested as MP3 and written to ``voice_file``. The API
    response carries no word timings, so subtitles are approximated by
    splitting the text into sentences and distributing the measured audio
    duration proportionally to sentence length.

    Args:
        text: Text to convert to speech; surrounding whitespace is stripped.
        model: Model name, e.g. "FunAudioLLM/CosyVoice2-0.5B".
        voice: Voice name in "model:voice" form, e.g.
            "FunAudioLLM/CosyVoice2-0.5B:alex".
        voice_rate: Speech speed, sent as ``speed`` (documented range
            [0.25, 4.0]).
        voice_file: Path of the audio file to write.
        voice_volume: Speech volume (UI range [0.6, 5.0]); sent as ``gain``
            after subtracting 1.0 and clamping to [-10, 10].

    Returns:
        A SubMaker holding the estimated subtitles, or None if the text is
        empty, the API key is missing, or all three attempts fail.
    """
    text = text.strip()
    if not text:
        logger.error("siliconflow tts failed: text is empty")
        return None

    api_key = config.siliconflow.get("api_key", "")
    if not api_key:
        logger.error("SiliconFlow API key is not set")
        return None

    # The default voice_volume of 1.0 maps to a gain of 0.
    # NOTE(review): this is a linear offset, not a dB conversion — confirm it
    # is the intended mapping onto the API's gain parameter.
    gain = max(-10, min(10, voice_volume - 1.0))

    url = "https://api.siliconflow.cn/v1/audio/speech"
    payload = {
        "model": model,
        "input": text,
        "voice": voice,
        "response_format": "mp3",
        "sample_rate": 32000,
        "stream": False,
        "speed": voice_rate,
        "gain": gain,
    }
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}

    for attempt in range(1, 4):  # up to 3 attempts
        try:
            logger.info(
                f"start siliconflow tts, model: {model}, voice: {voice}, try: {attempt}"
            )
            # (connect, read) timeouts: without them a stalled connection
            # would block the caller forever.
            response = requests.post(
                url, json=payload, headers=headers, timeout=(10, 300)
            )
            if response.status_code != 200:
                logger.error(
                    f"siliconflow tts failed with status code {response.status_code}: {response.text}"
                )
                continue
            if not response.content:
                logger.error("siliconflow tts failed: empty audio response")
                continue

            with open(voice_file, "wb") as f:
                f.write(response.content)

            sub_maker = _siliconflow_estimate_subtitles(text, voice_file)
            logger.success(f"siliconflow tts succeeded: {voice_file}")
            logger.debug(
                f"siliconflow subtitles: {sub_maker.subs}, offsets: {sub_maker.offset}"
            )
            return sub_maker
        except Exception as e:
            logger.error(f"siliconflow tts failed: {str(e)}")

    return None


def _siliconflow_estimate_subtitles(text: str, voice_file: str) -> SubMaker:
    """Build a SubMaker for ``voice_file`` with timings estimated from sentence lengths."""
    units_per_second = 10000000  # edge_tts offsets are in 100-nanosecond units
    sub_maker = SubMaker()
    audio_duration_100ns = None

    try:
        # Measure the real length of the generated audio with moviepy.
        from moviepy import AudioFileClip

        audio_clip = AudioFileClip(voice_file)
        try:
            audio_duration_100ns = int(audio_clip.duration * units_per_second)
        finally:
            audio_clip.close()

        # Split on punctuation and ignore blank pieces so they neither consume
        # duration nor leave the subtitle list empty.
        sentences = [
            s for s in utils.split_string_by_punctuations(text) if s.strip()
        ]
        if not sentences:
            # Nothing to split on: use the whole text as a single subtitle.
            sub_maker.subs = [text]
            sub_maker.offset = [(0, audio_duration_100ns)]
            return sub_maker

        total_chars = sum(len(s) for s in sentences)
        consumed_chars = 0
        start = 0
        for sentence in sentences:
            consumed_chars += len(sentence)
            # Derive each end point from the cumulative character count so
            # rounding never accumulates and the last subtitle ends exactly
            # at the audio duration.
            end = audio_duration_100ns * consumed_chars // total_chars
            sub_maker.subs.append(sentence)
            sub_maker.offset.append((start, end))
            start = end
    except Exception as e:
        logger.warning(f"Failed to create accurate subtitles: {str(e)}")
        # Fall back to one subtitle covering the whole audio; if the duration
        # could not be measured, assume 10 seconds.
        fallback_end = (
            audio_duration_100ns
            if audio_duration_100ns is not None
            else 10 * units_per_second
        )
        sub_maker.subs = [text]
        sub_maker.offset = [(0, fallback_end)]

    return sub_maker
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
voice_name = is_azure_v2_voice(voice_name)
if not voice_name:

View File

@ -193,6 +193,11 @@ compute_type = "int8"
speech_key = ""
speech_region = ""
[siliconflow]
# SiliconFlow API Key
# Get your API key at https://siliconflow.cn
api_key = ""
[ui]
# UI related settings
# 是否隐藏日志信息

View File

@ -621,6 +621,7 @@ with middle_panel:
tts_servers = [
("azure-tts-v1", "Azure TTS V1"),
("azure-tts-v2", "Azure TTS V2"),
("siliconflow", "SiliconFlow TTS"),
]
# 获取保存的TTS服务器默认为v1
@ -641,20 +642,26 @@ with middle_panel:
selected_tts_server = tts_servers[selected_tts_server_index][0]
config.ui["tts_server"] = selected_tts_server
# 获取所有声音
all_voices = voice.get_all_azure_voices(filter_locals=None)
# 根据选择的TTS服务器筛选声音
# 根据选择的TTS服务器获取声音列表
filtered_voices = []
for v in all_voices:
if selected_tts_server == "azure-tts-v2":
# V2版本的声音名称中包含"v2"
if "V2" in v:
filtered_voices.append(v)
else:
# V1版本的声音名称中不包含"v2"
if "V2" not in v:
filtered_voices.append(v)
if selected_tts_server == "siliconflow":
# 获取硅基流动的声音列表
filtered_voices = voice.get_siliconflow_voices()
else:
# 获取Azure的声音列表
all_voices = voice.get_all_azure_voices(filter_locals=None)
# 根据选择的TTS服务器筛选声音
for v in all_voices:
if selected_tts_server == "azure-tts-v2":
# V2版本的声音名称中包含"v2"
if "V2" in v:
filtered_voices.append(v)
else:
# V1版本的声音名称中不包含"v2"
if "V2" not in v:
filtered_voices.append(v)
friendly_names = {
v: v.replace("Female", tr("Female"))
@ -720,6 +727,7 @@ with middle_panel:
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_file=audio_file,
voice_volume=params.voice_volume,
)
# if the voice file generation failed, try again with a default content.
if not sub_maker:
@ -729,6 +737,7 @@ with middle_panel:
voice_name=voice_name,
voice_rate=params.voice_rate,
voice_file=audio_file,
voice_volume=params.voice_volume,
)
if sub_maker and os.path.exists(audio_file):
@ -756,6 +765,32 @@ with middle_panel:
config.azure["speech_region"] = azure_speech_region
config.azure["speech_key"] = azure_speech_key
# When SiliconFlow is the selected TTS server (or the chosen voice is a
# SiliconFlow voice), show the API key input and the usage notes.
if selected_tts_server == "siliconflow" or (
voice_name and voice.is_siliconflow_voice(voice_name)
):
saved_siliconflow_api_key = config.siliconflow.get("api_key", "")
siliconflow_api_key = st.text_input(
tr("SiliconFlow API Key"),
value=saved_siliconflow_api_key,
type="password",
key="siliconflow_api_key_input",
)
# Show the SiliconFlow usage notes (speed range and volume-to-gain mapping).
st.info(
tr("SiliconFlow TTS Settings")
+ ":\n"
+ "- "
+ tr("Speed: Range [0.25, 4.0], default is 1.0")
+ "\n"
+ "- "
+ tr("Volume: Uses Speech Volume setting, default 1.0 maps to gain 0")
)
# Store the entered key back into the in-memory config.
config.siliconflow["api_key"] = siliconflow_api_key
params.voice_volume = st.selectbox(
tr("Speech Volume"),
options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0],

View File

@ -93,6 +93,10 @@
"TTS Provider": "Sprachsynthese-Anbieter auswählen",
"TTS Servers": "TTS-Server",
"No voices available for the selected TTS server. Please select another server.": "Keine Stimmen für den ausgewählten TTS-Server verfügbar. Bitte wählen Sie einen anderen Server.",
"SiliconFlow API Key": "SiliconFlow API-Schlüssel",
"SiliconFlow TTS Settings": "SiliconFlow TTS-Einstellungen",
"Speed: Range [0.25, 4.0], default is 1.0": "Geschwindigkeit: Bereich [0.25, 4.0], Standardwert ist 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Lautstärke: Verwendet die Sprachlautstärke-Einstellung, Standardwert 1.0 entspricht Verstärkung 0",
"Hide Log": "Protokoll ausblenden",
"Hide Basic Settings": "Basis-Einstellungen ausblenden\n\nWenn diese Option deaktiviert ist, wird die Basis-Einstellungen-Leiste nicht auf der Seite angezeigt.\n\nWenn Sie sie erneut anzeigen möchten, setzen Sie `hide_config = false` in `config.toml`",
"LLM Settings": "**LLM-Einstellungen**",

View File

@ -93,6 +93,10 @@
"TTS Provider": "Select the voice synthesis provider",
"TTS Servers": "TTS Servers",
"No voices available for the selected TTS server. Please select another server.": "No voices available for the selected TTS server. Please select another server.",
"SiliconFlow API Key": "SiliconFlow API Key",
"SiliconFlow TTS Settings": "SiliconFlow TTS Settings",
"Speed: Range [0.25, 4.0], default is 1.0": "Speed: Range [0.25, 4.0], default is 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0",
"Hide Log": "Hide Log",
"Hide Basic Settings": "Hide Basic Settings\n\nHidden, the basic settings panel will not be displayed on the page.\n\nIf you need to display it again, please set `hide_config = false` in `config.toml`",
"LLM Settings": "**LLM Settings**",

View File

@ -93,6 +93,10 @@
"TTS Provider": "Selecione o provedor de síntese de voz",
"TTS Servers": "Servidores TTS",
"No voices available for the selected TTS server. Please select another server.": "Não há vozes disponíveis para o servidor TTS selecionado. Por favor, selecione outro servidor.",
"SiliconFlow API Key": "Chave API do SiliconFlow",
"SiliconFlow TTS Settings": "Configurações do SiliconFlow TTS",
"Speed: Range [0.25, 4.0], default is 1.0": "Velocidade: Intervalo [0.25, 4.0], o padrão é 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Volume: Usa a configuração de Volume de Fala, o padrão 1.0 corresponde ao ganho 0",
"Hide Log": "Ocultar Log",
"Hide Basic Settings": "Ocultar Configurações Básicas\n\nOculto, o painel de configurações básicas não será exibido na página.\n\nSe precisar exibi-lo novamente, defina `hide_config = false` em `config.toml`",
"LLM Settings": "**Configurações do LLM**",

View File

@ -93,6 +93,10 @@
"TTS Provider": "Chọn nhà cung cấp tổng hợp giọng nói",
"TTS Servers": "Máy chủ TTS",
"No voices available for the selected TTS server. Please select another server.": "Không có giọng nói nào cho máy chủ TTS đã chọn. Vui lòng chọn máy chủ khác.",
"SiliconFlow API Key": "Khóa API SiliconFlow",
"SiliconFlow TTS Settings": "Cài đặt SiliconFlow TTS",
"Speed: Range [0.25, 4.0], default is 1.0": "Tốc độ: Phạm vi [0.25, 4.0], mặc định là 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Âm lượng: Sử dụng cài đặt Âm lượng Giọng nói, mặc định 1.0 tương ứng với tăng ích 0",
"Hide Log": "Ẩn Nhật Ký",
"Hide Basic Settings": "Ẩn Cài Đặt Cơ Bản\n\nẨn, thanh cài đặt cơ bản sẽ không hiển thị trên trang web.\n\nNếu bạn muốn hiển thị lại, vui lòng đặt `hide_config = false` trong `config.toml`",
"LLM Settings": "**Cài Đặt LLM**",

View File

@ -93,6 +93,10 @@
"TTS Provider": "语音合成提供商",
"TTS Servers": "TTS服务器",
"No voices available for the selected TTS server. Please select another server.": "当前选择的TTS服务器没有可用的声音，请选择其他服务器。",
"SiliconFlow API Key": "硅基流动API密钥",
"SiliconFlow TTS Settings": "硅基流动TTS设置",
"Speed: Range [0.25, 4.0], default is 1.0": "语速：范围 [0.25, 4.0]，默认值为1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "音量：使用朗读音量设置，默认值1.0对应增益0",
"Hide Log": "隐藏日志",
"Hide Basic Settings": "隐藏基础设置\n\n隐藏后，基础设置面板将不会显示在页面中。\n\n如需要再次显示，请在 `config.toml` 中设置 `hide_config = false`",
"LLM Settings": "**大模型设置**",