mirror of
https://github.com/harry0703/MoneyPrinterTurbo.git
synced 2026-02-21 08:27:22 +08:00
feat: integrate Google Gemini TTS with 15 voice options
- Add gemini_tts() function with proper PCM audio handling
- Support 15 Gemini voices (Zephyr, Puck, Kore, etc.)
- Fix audio data format issue preventing video generation
- Add Gemini TTS option to WebUI settings
- Update .gitignore to exclude debug files
This commit is contained in:
parent
6cb5f23487
commit
d65e126486
9
.gitignore
vendored
9
.gitignore
vendored
@ -25,4 +25,11 @@ node_modules
|
||||
./models/*
|
||||
|
||||
venv/
|
||||
.venv
|
||||
.venv
|
||||
|
||||
# Debug and test files
|
||||
CLAUDE.md
|
||||
debug/
|
||||
debug_*.py
|
||||
test_*.py
|
||||
streamlit.log
|
||||
@ -42,6 +42,39 @@ def get_siliconflow_voices() -> list[str]:
|
||||
]
|
||||
|
||||
|
||||
def get_gemini_voices() -> list[str]:
    """
    Build the list of selectable Gemini TTS voices.

    Returns:
        Voice identifiers formatted as "gemini:<Voice>-<Gender>", e.g.
        ["gemini:Zephyr-Female", "gemini:Puck-Male", ...], in the fixed
        order declared below.
    """
    # Voice name -> gender label; insertion order is the display order.
    # NOTE(review): several of these names (Thalia, Sage, Echo, Harmony, Lux,
    # Nova, Vale, Orion, Atlas) do not appear to be in Google's published
    # prebuilt-voice list -- verify against the Gemini speech-generation docs.
    gender_by_voice = {
        "Zephyr": "Female",
        "Puck": "Male",
        "Charon": "Male",
        "Kore": "Female",
        "Fenrir": "Male",
        "Aoede": "Female",
        "Thalia": "Female",
        "Sage": "Male",
        "Echo": "Female",
        "Harmony": "Female",
        "Lux": "Female",
        "Nova": "Female",
        "Vale": "Male",
        "Orion": "Male",
        "Atlas": "Male",
    }

    # Prefix with "gemini:" and append the gender to form the display name.
    formatted = []
    for name, gender in gender_by_voice.items():
        formatted.append(f"gemini:{name}-{gender}")
    return formatted
|
||||
|
||||
|
||||
def get_all_azure_voices(filter_locals=None) -> list[str]:
|
||||
azure_voices_str = """
|
||||
Name: af-ZA-AdriNeural
|
||||
@ -1077,6 +1110,11 @@ def is_siliconflow_voice(voice_name: str):
|
||||
return voice_name.startswith("siliconflow:")
|
||||
|
||||
|
||||
def is_gemini_voice(voice_name: str) -> bool:
    """Return True when the voice identifier starts with the "gemini:" prefix."""
    gemini_prefix = "gemini:"
    return voice_name.startswith(gemini_prefix)
|
||||
|
||||
|
||||
def tts(
|
||||
text: str,
|
||||
voice_name: str,
|
||||
@ -1103,6 +1141,18 @@ def tts(
|
||||
else:
|
||||
logger.error(f"Invalid siliconflow voice name format: {voice_name}")
|
||||
return None
|
||||
elif is_gemini_voice(voice_name):
|
||||
# 从voice_name中提取声音名称
|
||||
# 格式: gemini:voice-Gender
|
||||
parts = voice_name.split(":")
|
||||
if len(parts) >= 2:
|
||||
# 移除性别后缀,例如 "Zephyr-Female" -> "Zephyr"
|
||||
voice_with_gender = parts[1]
|
||||
voice = voice_with_gender.split("-")[0]
|
||||
return gemini_tts(text, voice, voice_rate, voice_file, voice_volume)
|
||||
else:
|
||||
logger.error(f"Invalid gemini voice name format: {voice_name}")
|
||||
return None
|
||||
return azure_tts_v1(text, voice_name, voice_rate, voice_file)
|
||||
|
||||
|
||||
@ -1384,6 +1434,130 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
|
||||
return None
|
||||
|
||||
|
||||
def gemini_tts(
    text: str,
    voice_name: str,
    voice_rate: float,
    voice_file: str,
    voice_volume: float = 1.0,
) -> Union[SubMaker, None]:
    """
    Synthesize speech with Google Gemini TTS and save it as an MP3 file.

    Args:
        text: Text to convert to speech.
        voice_name: Gemini prebuilt voice name, e.g. "Zephyr" or "Puck".
        voice_rate: Speech rate (currently unused).
        voice_file: Path of the MP3 file to write.
        voice_volume: Audio volume (currently unused).

    Returns:
        A SubMaker holding one subtitle entry that spans the whole clip, or
        None on any failure (missing API key, missing package, API error,
        no audio in the response, or undecodable audio).
    """
    try:
        # The optional dependencies are imported inside the try block so that
        # a missing package is reported by the ImportError handler below
        # instead of propagating to the caller.
        import base64
        import io

        import google.generativeai as genai
        from pydub import AudioSegment

        # Configure the Gemini API.
        api_key = config.app.get("gemini_api_key", "")
        if not api_key:
            logger.error("Gemini API key is not set")
            return None

        genai.configure(api_key=api_key)

        logger.info(f"start, voice name: {voice_name}, try: 1")

        # Request audio-only output spoken with the selected prebuilt voice.
        model = genai.GenerativeModel("gemini-2.5-flash-preview-tts")

        generation_config = {
            "response_modalities": ["AUDIO"],
            "speech_config": {
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice_name
                    }
                }
            }
        }

        response = model.generate_content(
            contents=text,
            generation_config=generation_config
        )

        # Validate the response.
        if not response.candidates or not response.candidates[0].content:
            logger.error("No audio content received from Gemini TTS")
            return None

        # Take the payload of the first part that carries inline data.
        audio_data = None
        for part in response.candidates[0].content.parts:
            if hasattr(part, 'inline_data') and part.inline_data:
                audio_data = part.inline_data.data
                break

        if not audio_data:
            logger.error("No audio data found in response")
            return None

        # The payload is normally raw bytes; only a string payload needs
        # base64 decoding.
        if isinstance(audio_data, str):
            audio_bytes = base64.b64decode(audio_data)
        else:
            audio_bytes = audio_data

        # Interpret the payload as headerless linear PCM: 24 kHz, mono, 16-bit.
        try:
            audio_segment = AudioSegment.from_file(
                io.BytesIO(audio_bytes),
                format="raw",
                frame_rate=24000,  # Gemini TTS default sample rate
                channels=1,  # mono
                sample_width=2  # 16-bit
            )
        except Exception as e:
            logger.error(f"Failed to load PCM audio: {e}")
            return None

        # Export as MP3.
        audio_segment.export(voice_file, format="mp3")

        logger.info(f"completed, output file: {voice_file}")

        # Build a SubMaker with one subtitle entry covering the whole clip.
        sub_maker = SubMaker()
        audio_duration = len(audio_segment) / 1000.0  # milliseconds -> seconds

        # Convert the duration to 100-nanosecond units (edge_tts compatible).
        audio_duration_100ns = int(audio_duration * 10000000)

        sub_maker.create_sub(
            (0, audio_duration_100ns),
            text
        )

        return sub_maker

    except ImportError as e:
        logger.error(
            f"Missing required package for Gemini TTS: {str(e)}. "
            "Please install: pip install pydub google-generativeai"
        )
        return None
    except Exception as e:
        logger.error(f"Gemini TTS failed, error: {str(e)}")
        return None
|
||||
|
||||
|
||||
def _format_text(text: str) -> str:
|
||||
# text = text.replace("\n", " ")
|
||||
text = text.replace("[", " ")
|
||||
|
||||
@ -634,6 +634,7 @@ with middle_panel:
|
||||
("azure-tts-v1", "Azure TTS V1"),
|
||||
("azure-tts-v2", "Azure TTS V2"),
|
||||
("siliconflow", "SiliconFlow TTS"),
|
||||
("gemini-tts", "Google Gemini TTS"),
|
||||
]
|
||||
|
||||
# 获取保存的TTS服务器,默认为v1
|
||||
@ -660,6 +661,9 @@ with middle_panel:
|
||||
if selected_tts_server == "siliconflow":
|
||||
# 获取硅基流动的声音列表
|
||||
filtered_voices = voice.get_siliconflow_voices()
|
||||
elif selected_tts_server == "gemini-tts":
|
||||
# 获取Gemini TTS的声音列表
|
||||
filtered_voices = voice.get_gemini_voices()
|
||||
else:
|
||||
# 获取Azure的声音列表
|
||||
all_voices = voice.get_all_azure_voices(filter_locals=None)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user