From d65e126486ad34099176ff0619c05153bd7dff57 Mon Sep 17 00:00:00 2001
From: zhangxindong <zhangxindong888@163.com>
Date: Tue, 8 Jul 2025 10:39:22 +0800
Subject: [PATCH] feat: integrate Google Gemini TTS with 15 voice options

- Add gemini_tts() function with proper PCM audio handling
- Support 15 Gemini voices (Zephyr, Puck, Kore, etc.)
- Fix audio data format issue preventing video generation
- Add Gemini TTS option to WebUI settings
- Update .gitignore to exclude debug files
---
 .gitignore            |   9 ++-
 app/services/voice.py | 174 ++++++++++++++++++++++++++++++++++++++++++
 webui/Main.py         |   4 +
 3 files changed, 186 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 6aa0ca7..066562a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,11 @@ node_modules
 ./models/*
 
 venv/
-.venv
\ No newline at end of file
+.venv
+
+# Debug and test files
+CLAUDE.md
+debug/
+debug_*.py
+test_*.py
+streamlit.log
\ No newline at end of file
diff --git a/app/services/voice.py b/app/services/voice.py
index e6b4d59..81f796b 100644
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -42,6 +42,39 @@ def get_siliconflow_voices() -> list[str]:
     ]
 
 
+def get_gemini_voices() -> list[str]:
+    """
+    Return the selectable Gemini TTS voices as display names.
+
+    Returns:
+        Voice names formatted as ["gemini:Zephyr-Female", "gemini:Puck-Male", ...]
+    """
+    # Names must match Gemini's prebuilt voice names exactly; the gender label is display-only (tts() strips it).
+    voices_with_gender = [
+        ("Zephyr", "Female"),
+        ("Puck", "Male"),
+        ("Charon", "Male"),
+        ("Kore", "Female"),
+        ("Fenrir", "Male"),
+        ("Aoede", "Female"),
+        ("Leda", "Female"),
+        ("Orus", "Male"),
+        ("Callirrhoe", "Female"),
+        ("Autonoe", "Female"),
+        ("Despina", "Female"),
+        ("Enceladus", "Male"),
+        ("Iapetus", "Male"),
+        ("Umbriel", "Male"),
+        ("Algieba", "Male"),
+    ]
+
+    # Add the "gemini:" routing prefix and append the gender to form the display name.
+    return [
+        f"gemini:{voice}-{gender}"
+        for voice, gender in voices_with_gender
+    ]
+
+
 def get_all_azure_voices(filter_locals=None) -> list[str]:
     azure_voices_str = """
 Name: af-ZA-AdriNeural
@@ -1077,6 +1110,11 @@ def is_siliconflow_voice(voice_name: str):
     return voice_name.startswith("siliconflow:")
 
 
+def is_gemini_voice(voice_name: str):
+    """Return True if ``voice_name`` starts with the ``gemini:`` prefix used for Gemini TTS voices."""
+    return voice_name.startswith("gemini:")
+
+
 def tts(
     text: str,
     voice_name: str,
@@ -1103,6 +1141,18 @@ def tts(
         else:
             logger.error(f"Invalid siliconflow voice name format: {voice_name}")
             return None
+    elif is_gemini_voice(voice_name):
+        # 从voice_name中提取声音名称
+        # 格式: gemini:voice-Gender
+        parts = voice_name.split(":")
+        if len(parts) >= 2:
+            # 移除性别后缀，例如 "Zephyr-Female" -> "Zephyr"
+            voice_with_gender = parts[1]
+            voice = voice_with_gender.split("-")[0]
+            return gemini_tts(text, voice, voice_rate, voice_file, voice_volume)
+        else:
+            logger.error(f"Invalid gemini voice name format: {voice_name}")
+            return None
     return azure_tts_v1(text, voice_name, voice_rate, voice_file)
 
 
@@ -1384,6 +1434,130 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
     return None
 
 
+def gemini_tts(
+    text: str,
+    voice_name: str,
+    voice_rate: float,
+    voice_file: str,
+    voice_volume: float = 1.0,
+) -> Union[SubMaker, None]:
+    """
+    Generate speech with Google Gemini TTS and write it to ``voice_file`` as MP3.
+
+    Args:
+        text: Text to synthesize.
+        voice_name: Prebuilt voice name, e.g. "Zephyr" or "Puck".
+        voice_rate: Speech rate (currently unused).
+        voice_file: Path of the output audio file.
+        voice_volume: Audio volume (currently unused).
+
+    Returns:
+        A SubMaker holding one subtitle that spans the whole clip, or None on any failure.
+    """
+    import base64
+    import io
+
+    try:
+        # Third-party imports live inside the try so that a missing package is
+        # reported by the ImportError handler below instead of escaping to the caller.
+        from pydub import AudioSegment
+        import google.generativeai as genai
+        api_key = config.app.get("gemini_api_key", "")
+        if not api_key:
+            logger.error("Gemini API key is not set")
+            return None
+
+        genai.configure(api_key=api_key)
+
+        logger.info(f"start, voice name: {voice_name}, try: 1")
+
+        # Ask the Gemini TTS preview model for audio output in the chosen voice.
+        model = genai.GenerativeModel("gemini-2.5-flash-preview-tts")
+
+        generation_config = {
+            "response_modalities": ["AUDIO"],
+            "speech_config": {
+                "voice_config": {
+                    "prebuilt_voice_config": {
+                        "voice_name": voice_name
+                    }
+                }
+            }
+        }
+
+        response = model.generate_content(
+            contents=text,
+            generation_config=generation_config
+        )
+
+        # Bail out early when the response carries no candidate content.
+        if not response.candidates or not response.candidates[0].content:
+            logger.error("No audio content received from Gemini TTS")
+            return None
+
+        # Take the first part that carries inline (binary) data.
+        audio_data = None
+        for part in response.candidates[0].content.parts:
+            if hasattr(part, 'inline_data') and part.inline_data:
+                audio_data = part.inline_data.data
+                break
+
+        if not audio_data:
+            logger.error("No audio data found in response")
+            return None
+
+        # The payload is normally raw bytes already; only a str needs base64 decoding.
+        if isinstance(audio_data, str):
+            # A string payload is treated as base64 text.
+            audio_bytes = base64.b64decode(audio_data)
+        else:
+            # Bytes are used as-is.
+            audio_bytes = audio_data
+
+        # The audio is parsed as headerless linear PCM, so the sample layout has
+        # to be supplied explicitly; a failure here is logged and aborts the call.
+
+        # NOTE(review): 24 kHz / mono / 16-bit are hard-coded from the Gemini docs; confirm if the model changes.
+        try:
+            audio_segment = AudioSegment.from_file(
+                io.BytesIO(audio_bytes), 
+                format="raw",
+                frame_rate=24000,  # sample rate assumed for Gemini TTS output
+                channels=1,        # mono
+                sample_width=2     # 16-bit
+            )
+        except Exception as e:
+            logger.error(f"Failed to load PCM audio: {e}")
+            return None
+
+        # Export as MP3.
+        audio_segment.export(voice_file, format="mp3")
+
+        logger.info(f"completed, output file: {voice_file}")
+
+        # Build the SubMaker used for subtitles.
+        sub_maker = SubMaker()
+        audio_duration = len(audio_segment) / 1000.0  # milliseconds -> seconds
+
+        # Convert the duration to 100-nanosecond units (compatible with edge_tts).
+        audio_duration_100ns = int(audio_duration * 10000000)
+
+        # One subtitle entry covering the full clip: (offset, duration), text.
+        sub_maker.create_sub(
+            (0, audio_duration_100ns), 
+            text
+        )
+
+        return sub_maker
+
+    except ImportError as e:
+        logger.error(f"Missing required package for Gemini TTS: {str(e)}. Please install: pip install pydub google-generativeai")
+        return None
+    except Exception as e:
+        logger.error(f"Gemini TTS failed, error: {str(e)}")
+        return None
+
+
 def _format_text(text: str) -> str:
     # text = text.replace("\n", " ")
     text = text.replace("[", " ")
diff --git a/webui/Main.py b/webui/Main.py
index 1b55abe..b600f37 100644
--- a/webui/Main.py
+++ b/webui/Main.py
@@ -634,6 +634,7 @@ with middle_panel:
             ("azure-tts-v1", "Azure TTS V1"),
             ("azure-tts-v2", "Azure TTS V2"),
             ("siliconflow", "SiliconFlow TTS"),
+            ("gemini-tts", "Google Gemini TTS"),
         ]
 
         # 获取保存的TTS服务器，默认为v1
@@ -660,6 +661,9 @@ with middle_panel:
         if selected_tts_server == "siliconflow":
             # 获取硅基流动的声音列表
             filtered_voices = voice.get_siliconflow_voices()
+        elif selected_tts_server == "gemini-tts":
+            # 获取Gemini TTS的声音列表
+            filtered_voices = voice.get_gemini_voices()
         else:
             # 获取Azure的声音列表
             all_voices = voice.get_all_azure_voices(filter_locals=None)