diff --git a/app/config/config.py b/app/config/config.py index e1534a4..aabf3bd 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -36,6 +36,7 @@ def save_config(): with open(config_file, "w", encoding="utf-8") as f: _cfg["app"] = app _cfg["azure"] = azure + _cfg["siliconflow"] = siliconflow _cfg["ui"] = ui f.write(toml.dumps(_cfg)) @@ -45,9 +46,13 @@ app = _cfg.get("app", {}) whisper = _cfg.get("whisper", {}) proxy = _cfg.get("proxy", {}) azure = _cfg.get("azure", {}) -ui = _cfg.get("ui", { - "hide_log": False, -}) +siliconflow = _cfg.get("siliconflow", {}) +ui = _cfg.get( + "ui", + { + "hide_log": False, + }, +) hostname = socket.gethostname() diff --git a/app/services/voice.py b/app/services/voice.py index e2d9fe9..8e9d824 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -6,6 +6,7 @@ from typing import Union from xml.sax.saxutils import unescape import edge_tts +import requests from edge_tts import SubMaker, submaker from edge_tts.submaker import mktimestamp from loguru import logger @@ -15,6 +16,32 @@ from app.config import config from app.utils import utils +def get_siliconflow_voices() -> list[str]: + """ + 获取硅基流动的声音列表 + + Returns: + 声音列表,格式为 ["siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex", ...] + """ + # 硅基流动的声音列表和对应的性别(用于显示) + voices_with_gender = [ + ("FunAudioLLM/CosyVoice2-0.5B", "alex", "Male"), + ("FunAudioLLM/CosyVoice2-0.5B", "anna", "Female"), + ("FunAudioLLM/CosyVoice2-0.5B", "bella", "Female"), + ("FunAudioLLM/CosyVoice2-0.5B", "benjamin", "Male"), + ("FunAudioLLM/CosyVoice2-0.5B", "charles", "Male"), + ("FunAudioLLM/CosyVoice2-0.5B", "claire", "Female"), + ("FunAudioLLM/CosyVoice2-0.5B", "david", "Male"), + ("FunAudioLLM/CosyVoice2-0.5B", "diana", "Female"), + ] + + # 添加siliconflow:前缀,并格式化为显示名称 + return [ + f"siliconflow:{model}:{voice}-{gender}" + for model, voice, gender in voices_with_gender + ] + + def get_all_azure_voices(filter_locals=None) -> list[str]: azure_voices_str = """ Name: af-ZA-AdriNeural @@ -1045,11 +1072,37 @@ def is_azure_v2_voice(voice_name: str): return "" +def is_siliconflow_voice(voice_name: str): + """检查是否是硅基流动的声音""" + return voice_name.startswith("siliconflow:") + + def tts( - text: str, voice_name: str, voice_rate: float, voice_file: str + text: str, + voice_name: str, + voice_rate: float, + voice_file: str, + voice_volume: float = 1.0, ) -> Union[SubMaker, None]: if is_azure_v2_voice(voice_name): return azure_tts_v2(text, voice_name, voice_file) + elif is_siliconflow_voice(voice_name): + # 从voice_name中提取模型和声音 + # 格式: siliconflow:model:voice-Gender + parts = voice_name.split(":") + if len(parts) >= 3: + model = parts[1] + # 移除性别后缀,例如 "alex-Male" -> "alex" + voice_with_gender = parts[2] + voice = voice_with_gender.split("-")[0] + # 构建完整的voice参数,格式为 "model:voice" + full_voice = f"{model}:{voice}" + return siliconflow_tts( + text, model, full_voice, voice_rate, voice_file, voice_volume + ) + else: + logger.error(f"Invalid siliconflow voice name format: {voice_name}") + return None return azure_tts_v1(text, voice_name, voice_rate, voice_file) @@ -1098,6 +1151,144 @@ def azure_tts_v1( return None +def siliconflow_tts( + text: str, + model: str, + voice: str, + voice_rate: float, + voice_file: str, + voice_volume: float = 1.0, +) -> Union[SubMaker, None]: + """ + 使用硅基流动的API生成语音 + + Args: + text: 要转换为语音的文本 + model: 模型名称,如 "FunAudioLLM/CosyVoice2-0.5B" + voice: 声音名称,如 "FunAudioLLM/CosyVoice2-0.5B:alex" + voice_rate: 语音速度,范围[0.25, 4.0] + voice_file: 输出的音频文件路径 + voice_volume: 语音音量,范围[0.6, 5.0],需要转换为硅基流动的增益范围[-10, 10] + + Returns: + SubMaker对象或None + """ + text = text.strip() + api_key = config.siliconflow.get("api_key", "") + + if not api_key: + logger.error("SiliconFlow API key is not set") + return None + + # 将voice_volume转换为硅基流动的增益范围 + # 默认voice_volume为1.0,对应gain为0 + gain = voice_volume - 1.0 + # 确保gain在[-10, 10]范围内 + gain = max(-10, min(10, gain)) + + url = "https://api.siliconflow.cn/v1/audio/speech" + + payload = { + "model": model, + "input": text, + "voice": voice, + "response_format": "mp3", + "sample_rate": 32000, + "stream": False, + "speed": voice_rate, + "gain": gain, + } + + headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"} + + for i in range(3): # 尝试3次 + try: + logger.info( + f"start siliconflow tts, model: {model}, voice: {voice}, try: {i + 1}" + ) + + response = requests.post(url, json=payload, headers=headers) + + if response.status_code == 200: + # 保存音频文件 + with open(voice_file, "wb") as f: + f.write(response.content) + + # 创建一个空的SubMaker对象 + sub_maker = SubMaker() + + # 获取音频文件的实际长度 + try: + # 尝试使用moviepy获取音频长度 + from moviepy import AudioFileClip + + audio_clip = AudioFileClip(voice_file) + audio_duration = audio_clip.duration + audio_clip.close() + + # 将音频长度转换为100纳秒单位(与edge_tts兼容) + audio_duration_100ns = int(audio_duration * 10000000) + + # 使用文本分割来创建更准确的字幕 + # 将文本按标点符号分割成句子 + sentences = utils.split_string_by_punctuations(text) + + if sentences: + # 计算每个句子的大致时长(按字符数比例分配) + total_chars = sum(len(s) for s in sentences) + char_duration = ( + audio_duration_100ns / total_chars if total_chars > 0 else 0 + ) + + current_offset = 0 + for sentence in sentences: + if not sentence.strip(): + continue + + # 计算当前句子的时长 + sentence_chars = len(sentence) + sentence_duration = int(sentence_chars * char_duration) + + # 添加到SubMaker + sub_maker.subs.append(sentence) + sub_maker.offset.append( + (current_offset, current_offset + sentence_duration) + ) + + # 更新偏移量 + current_offset += sentence_duration + else: + # 如果无法分割,则使用整个文本作为一个字幕 + sub_maker.subs = [text] + sub_maker.offset = [(0, audio_duration_100ns)] + + except Exception as e: + logger.warning(f"Failed to create accurate subtitles: {str(e)}") + # 回退到简单的字幕 + sub_maker.subs = [text] + # 使用音频文件的实际长度,如果无法获取,则假设为10秒 + sub_maker.offset = [ + ( + 0, + audio_duration_100ns + if "audio_duration_100ns" in locals() + else 10000000, + ) + ] + + logger.success(f"siliconflow tts succeeded: {voice_file}") + print("s", sub_maker.subs, sub_maker.offset) + return sub_maker + else: + logger.error( + f"siliconflow tts failed with status code {response.status_code}: {response.text}" + ) + except Exception as e: + logger.error(f"siliconflow tts failed: {str(e)}") + + return None + + def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]: voice_name = is_azure_v2_voice(voice_name) if not voice_name: diff --git a/config.example.toml b/config.example.toml index 9fa0049..ecea468 100644 --- a/config.example.toml +++ b/config.example.toml @@ -193,6 +193,11 @@ compute_type = "int8" speech_key = "" speech_region = "" +[siliconflow] +# SiliconFlow API Key +# Get your API key at https://siliconflow.cn +api_key = "" + [ui] # UI related settings # 是否隐藏日志信息 diff --git a/webui/Main.py b/webui/Main.py index 4e4cee6..aafed1b 100644 --- a/webui/Main.py +++ b/webui/Main.py @@ -621,6 +621,7 @@ with middle_panel: tts_servers = [ ("azure-tts-v1", "Azure TTS V1"), ("azure-tts-v2", "Azure TTS V2"), + ("siliconflow", "SiliconFlow TTS"), ] # 获取保存的TTS服务器,默认为v1 @@ -641,20 +642,26 @@ with middle_panel: selected_tts_server = tts_servers[selected_tts_server_index][0] config.ui["tts_server"] = selected_tts_server - # 获取所有声音 - all_voices = voice.get_all_azure_voices(filter_locals=None) - - # 根据选择的TTS服务器筛选声音 + # 根据选择的TTS服务器获取声音列表 filtered_voices = [] - for v in all_voices: - if selected_tts_server == "azure-tts-v2": - # V2版本的声音名称中包含"v2" - if "V2" in v: - filtered_voices.append(v) - else: - # V1版本的声音名称中不包含"v2" - if "V2" not in v: - filtered_voices.append(v) + + if selected_tts_server == "siliconflow": + # 获取硅基流动的声音列表 + filtered_voices = voice.get_siliconflow_voices() + else: + # 获取Azure的声音列表 + all_voices = voice.get_all_azure_voices(filter_locals=None) + + # 根据选择的TTS服务器筛选声音 + for v in all_voices: + if selected_tts_server == "azure-tts-v2": + # V2版本的声音名称中包含"v2" + if "V2" in v: + filtered_voices.append(v) + else: + # V1版本的声音名称中不包含"v2" + if "V2" not in v: + filtered_voices.append(v) friendly_names = { v: v.replace("Female", tr("Female")) @@ -720,6 +727,7 @@ with middle_panel: voice_name=voice_name, voice_rate=params.voice_rate, voice_file=audio_file, + voice_volume=params.voice_volume, ) # if the voice file generation failed, try again with a default content. if not sub_maker: @@ -729,6 +737,7 @@ with middle_panel: voice_name=voice_name, voice_rate=params.voice_rate, voice_file=audio_file, + voice_volume=params.voice_volume, ) if sub_maker and os.path.exists(audio_file): @@ -756,6 +765,32 @@ with middle_panel: config.azure["speech_region"] = azure_speech_region config.azure["speech_key"] = azure_speech_key + # 当选择硅基流动时,显示API key输入框和说明信息 + if selected_tts_server == "siliconflow" or ( + voice_name and voice.is_siliconflow_voice(voice_name) + ): + saved_siliconflow_api_key = config.siliconflow.get("api_key", "") + + siliconflow_api_key = st.text_input( + tr("SiliconFlow API Key"), + value=saved_siliconflow_api_key, + type="password", + key="siliconflow_api_key_input", + ) + + # 显示硅基流动的说明信息 + st.info( + tr("SiliconFlow TTS Settings") + + ":\n" + + "- " + + tr("Speed: Range [0.25, 4.0], default is 1.0") + + "\n" + + "- " + + tr("Volume: Uses Speech Volume setting, default 1.0 maps to gain 0") + ) + + config.siliconflow["api_key"] = siliconflow_api_key + params.voice_volume = st.selectbox( tr("Speech Volume"), options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0], diff --git a/webui/i18n/de.json b/webui/i18n/de.json index 159154c..cedc3b7 100644 --- a/webui/i18n/de.json +++ b/webui/i18n/de.json @@ -93,6 +93,10 @@ "TTS Provider": "Sprachsynthese-Anbieter auswählen", "TTS Servers": "TTS-Server", "No voices available for the selected TTS server. Please select another server.": "Keine Stimmen für den ausgewählten TTS-Server verfügbar. Bitte wählen Sie einen anderen Server.", + "SiliconFlow API Key": "SiliconFlow API-Schlüssel", + "SiliconFlow TTS Settings": "SiliconFlow TTS-Einstellungen", + "Speed: Range [0.25, 4.0], default is 1.0": "Geschwindigkeit: Bereich [0.25, 4.0], Standardwert ist 1.0", + "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Lautstärke: Verwendet die Sprachlautstärke-Einstellung, Standardwert 1.0 entspricht Verstärkung 0", "Hide Log": "Protokoll ausblenden", "Hide Basic Settings": "Basis-Einstellungen ausblenden\n\nWenn diese Option deaktiviert ist, wird die Basis-Einstellungen-Leiste nicht auf der Seite angezeigt.\n\nWenn Sie sie erneut anzeigen möchten, setzen Sie `hide_config = false` in `config.toml`", "LLM Settings": "**LLM-Einstellungen**", diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 2116844..0f3c2c1 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -93,6 +93,10 @@ "TTS Provider": "Select the voice synthesis provider", "TTS Servers": "TTS Servers", "No voices available for the selected TTS server. Please select another server.": "No voices available for the selected TTS server. Please select another server.", + "SiliconFlow API Key": "SiliconFlow API Key", + "SiliconFlow TTS Settings": "SiliconFlow TTS Settings", + "Speed: Range [0.25, 4.0], default is 1.0": "Speed: Range [0.25, 4.0], default is 1.0", + "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0", "Hide Log": "Hide Log", "Hide Basic Settings": "Hide Basic Settings\n\nHidden, the basic settings panel will not be displayed on the page.\n\nIf you need to display it again, please set `hide_config = false` in `config.toml`", "LLM Settings": "**LLM Settings**", diff --git a/webui/i18n/pt.json b/webui/i18n/pt.json index 55339c9..6a9d47b 100644 --- a/webui/i18n/pt.json +++ b/webui/i18n/pt.json @@ -93,6 +93,10 @@ "TTS Provider": "Selecione o provedor de síntese de voz", "TTS Servers": "Servidores TTS", "No voices available for the selected TTS server. Please select another server.": "Não há vozes disponíveis para o servidor TTS selecionado. Por favor, selecione outro servidor.", + "SiliconFlow API Key": "Chave API do SiliconFlow", + "SiliconFlow TTS Settings": "Configurações do SiliconFlow TTS", + "Speed: Range [0.25, 4.0], default is 1.0": "Velocidade: Intervalo [0.25, 4.0], o padrão é 1.0", + "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Volume: Usa a configuração de Volume de Fala, o padrão 1.0 corresponde ao ganho 0", "Hide Log": "Ocultar Log", "Hide Basic Settings": "Ocultar Configurações Básicas\n\nOculto, o painel de configurações básicas não será exibido na página.\n\nSe precisar exibi-lo novamente, defina `hide_config = false` em `config.toml`", "LLM Settings": "**Configurações do LLM**", diff --git a/webui/i18n/vi.json b/webui/i18n/vi.json index e71fa6d..c1a604b 100644 --- a/webui/i18n/vi.json +++ b/webui/i18n/vi.json @@ -93,6 +93,10 @@ "TTS Provider": "Chọn nhà cung cấp tổng hợp giọng nói", "TTS Servers": "Máy chủ TTS", "No voices available for the selected TTS server. Please select another server.": "Không có giọng nói nào cho máy chủ TTS đã chọn. Vui lòng chọn máy chủ khác.", + "SiliconFlow API Key": "Khóa API SiliconFlow", + "SiliconFlow TTS Settings": "Cài đặt SiliconFlow TTS", + "Speed: Range [0.25, 4.0], default is 1.0": "Tốc độ: Phạm vi [0.25, 4.0], mặc định là 1.0", + "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Âm lượng: Sử dụng cài đặt Âm lượng Giọng nói, mặc định 1.0 tương ứng với tăng ích 0", "Hide Log": "Ẩn Nhật Ký", "Hide Basic Settings": "Ẩn Cài Đặt Cơ Bản\n\nẨn, thanh cài đặt cơ bản sẽ không hiển thị trên trang web.\n\nNếu bạn muốn hiển thị lại, vui lòng đặt `hide_config = false` trong `config.toml`", "LLM Settings": "**Cài Đặt LLM**", diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index e48d560..3811ae8 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -93,6 +93,10 @@ "TTS Provider": "语音合成提供商", "TTS Servers": "TTS服务器", "No voices available for the selected TTS server. Please select another server.": "当前选择的TTS服务器没有可用的声音,请选择其他服务器。", + "SiliconFlow API Key": "硅基流动API密钥", + "SiliconFlow TTS Settings": "硅基流动TTS设置", + "Speed: Range [0.25, 4.0], default is 1.0": "语速范围 [0.25, 4.0],默认值为1.0", + "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "音量:使用朗读音量设置,默认值1.0对应增益0", "Hide Log": "隐藏日志", "Hide Basic Settings": "隐藏基础设置\n\n隐藏后,基础设置面板将不会显示在页面中。\n\n如需要再次显示,请在 `config.toml` 中设置 `hide_config = false`", "LLM Settings": "**大模型设置**",