From d65e126486ad34099176ff0619c05153bd7dff57 Mon Sep 17 00:00:00 2001
From: zhangxindong
Date: Tue, 8 Jul 2025 10:39:22 +0800
Subject: [PATCH] feat: integrate Google Gemini TTS with 15 voice options

- Add gemini_tts() function with proper PCM audio handling
- Support 15 Gemini voices (Zephyr, Puck, Kore, etc.)
- Fix audio data format issue preventing video generation
- Add Gemini TTS option to WebUI settings
- Update .gitignore to exclude debug files
---
 .gitignore            |   9 ++-
 app/services/voice.py | 174 ++++++++++++++++++++++++++++++++++++++++++
 webui/Main.py         |   4 +
 3 files changed, 186 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 6aa0ca7..066562a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,11 @@ node_modules
 ./models/*
 
 venv/
-.venv
\ No newline at end of file
+.venv
+
+# Debug and test files
+CLAUDE.md
+debug/
+debug_*.py
+test_*.py
+streamlit.log
\ No newline at end of file
diff --git a/app/services/voice.py b/app/services/voice.py
index e6b4d59..81f796b 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -42,6 +42,39 @@ def get_siliconflow_voices() -> list[str]:
     ]
 
 
+def get_gemini_voices() -> list[str]:
+    """
+    Return the list of voices offered for Gemini TTS.
+
+    Returns:
+        Voice identifiers formatted as ["gemini:Zephyr-Female", "gemini:Puck-Male", ...]
+    """
+    # Prebuilt voice names accepted by the Gemini TTS API, paired with a gender label
+    voices_with_gender = [
+        ("Zephyr", "Female"),
+        ("Puck", "Male"),
+        ("Charon", "Male"),
+        ("Kore", "Female"),
+        ("Fenrir", "Male"),
+        ("Aoede", "Female"),
+        ("Leda", "Female"),
+        ("Orus", "Male"),
+        ("Callirrhoe", "Female"),
+        ("Autonoe", "Female"),
+        ("Despina", "Female"),
+        ("Erinome", "Female"),
+        ("Enceladus", "Male"),
+        ("Iapetus", "Male"),
+        ("Umbriel", "Male"),
+    ]
+
+    # Add the "gemini:" prefix and append the gender label for display
+    return [
+        f"gemini:{voice}-{gender}"
+        for voice, gender in voices_with_gender
+    ]
+
+
 def get_all_azure_voices(filter_locals=None) -> list[str]:
     azure_voices_str = """
 Name: af-ZA-AdriNeural
@@ -1077,6 +1110,11 @@ def is_siliconflow_voice(voice_name: str):
     return voice_name.startswith("siliconflow:")
 
 
+def is_gemini_voice(voice_name: str):
+    """Return True if voice_name refers to a Gemini TTS voice ("gemini:" prefix)."""
+    return voice_name.startswith("gemini:")
+
+
 def tts(
     text: str,
     voice_name: str,
@@ -1103,6 +1141,18 @@ def tts(
         else:
             logger.error(f"Invalid siliconflow voice name format: {voice_name}")
             return None
+    elif is_gemini_voice(voice_name):
+        # Extract the voice name from voice_name
+        # Format: gemini:voice-Gender
+        parts = voice_name.split(":")
+        if len(parts) >= 2:
+            # Strip the gender suffix, e.g. "Zephyr-Female" -> "Zephyr"
+            voice_with_gender = parts[1]
+            voice = voice_with_gender.split("-")[0]
+            return gemini_tts(text, voice, voice_rate, voice_file, voice_volume)
+        else:
+            logger.error(f"Invalid gemini voice name format: {voice_name}")
+            return None
     return azure_tts_v1(text, voice_name, voice_rate, voice_file)
 
 
@@ -1384,6 +1434,130 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
     return None
 
 
+def gemini_tts(
+    text: str,
+    voice_name: str,
+    voice_rate: float,
+    voice_file: str,
+    voice_volume: float = 1.0,
+) -> Union[SubMaker, None]:
+    """
+    Generate speech with Google Gemini TTS and write it to voice_file as MP3.
+
+    Args:
+        text: Text to synthesize
+        voice_name: Prebuilt voice name, e.g. "Zephyr", "Puck"
+        voice_rate: Speech rate (currently unused)
+        voice_file: Output audio file path
+        voice_volume: Audio volume (currently unused)
+
+    Returns:
+        A SubMaker with one entry spanning the whole audio, or None on any failure
+    """
+    import base64
+    import json
+    import io
+    from pydub import AudioSegment
+    import google.generativeai as genai
+
+    try:
+        # Configure the Gemini API
+        api_key = config.app.get("gemini_api_key", "")
+        if not api_key:
+            logger.error("Gemini API key is not set")
+            return None
+
+        genai.configure(api_key=api_key)
+
+        logger.info(f"start, voice name: {voice_name}, try: 1")
+
+        # Use the Gemini TTS model
+        model = genai.GenerativeModel("gemini-2.5-flash-preview-tts")
+
+        generation_config = {
+            "response_modalities": ["AUDIO"],
+            "speech_config": {
+                "voice_config": {
+                    "prebuilt_voice_config": {
+                        "voice_name": voice_name
+                    }
+                }
+            }
+        }
+
+        response = model.generate_content(
+            contents=text,
+            generation_config=generation_config
+        )
+
+        # Validate the response
+        if not response.candidates or not response.candidates[0].content:
+            logger.error("No audio content received from Gemini TTS")
+            return None
+
+        # Take the first part that carries inline audio data
+        audio_data = None
+        for part in response.candidates[0].content.parts:
+            if hasattr(part, 'inline_data') and part.inline_data:
+                audio_data = part.inline_data.data
+                break
+
+        if not audio_data:
+            logger.error("No audio data found in response")
+            return None
+
+        # The payload may arrive as raw bytes or as a base64 string; normalize to bytes
+        if isinstance(audio_data, str):
+            # A string payload is base64-encoded and must be decoded
+            audio_bytes = base64.b64decode(audio_data)
+        else:
+            # Already bytes: use as-is
+            audio_bytes = audio_data
+
+        # Decoded audio; only assigned when PCM parsing succeeds
+        audio_segment = None
+
+        # Gemini returns headerless linear PCM; parse it with the documented parameters
+        try:
+            audio_segment = AudioSegment.from_file(
+                io.BytesIO(audio_bytes),
+                format="raw",
+                frame_rate=24000,  # Gemini TTS output sample rate
+                channels=1,  # mono
+                sample_width=2  # 16-bit
+            )
+        except Exception as e:
+            logger.error(f"Failed to load PCM audio: {e}")
+            return None
+
+        # Export as MP3; export() returns an open file handle, so close it explicitly
+        audio_segment.export(voice_file, format="mp3").close()
+
+        logger.info(f"completed, output file: {voice_file}")
+
+        # Build a SubMaker for subtitles
+        sub_maker = SubMaker()
+        audio_duration = len(audio_segment) / 1000.0  # milliseconds -> seconds
+
+        # Convert the duration to 100-nanosecond units (compatible with edge_tts)
+        audio_duration_100ns = int(audio_duration * 10000000)
+
+        # Register a single subtitle entry covering the whole clip
+        sub_maker.create_sub(
+            (0, audio_duration_100ns),
+            text
+        )
+
+        return sub_maker
+
+    except ImportError as e:
+        logger.error(f"Missing required package for Gemini TTS: {str(e)}. Please install: pip install pydub google-generativeai")
+        return None
+    except Exception as e:
+        logger.error(f"Gemini TTS failed, error: {str(e)}")
+        return None
+
+
 def _format_text(text: str) -> str:
     # text = text.replace("\n", " ")
     text = text.replace("[", " ")
diff --git a/webui/Main.py b/webui/Main.py
index 1b55abe..b600f37 100644
--- a/webui/Main.py
+++ b/webui/Main.py
@@ -634,6 +634,7 @@ with middle_panel:
             ("azure-tts-v1", "Azure TTS V1"),
             ("azure-tts-v2", "Azure TTS V2"),
             ("siliconflow", "SiliconFlow TTS"),
+            ("gemini-tts", "Google Gemini TTS"),
         ]
 
         # 获取保存的TTS服务器,默认为v1
@@ -660,6 +661,9 @@ with middle_panel:
         if selected_tts_server == "siliconflow":
             # 获取硅基流动的声音列表
             filtered_voices = voice.get_siliconflow_voices()
+        elif selected_tts_server == "gemini-tts":
+            # Fetch the Gemini TTS voice list
+            filtered_voices = voice.get_gemini_voices()
         else:
             # 获取Azure的声音列表
             all_voices = voice.get_all_azure_voices(filter_locals=None)