Merge pull request #448 from yyhhyyyyyy/add-rate

feat(azure_tts_v1): Allows to control the speed of speech generation.
2026-02-21 16:37:21 +08:00 · 2024-07-19 14:17:15 +08:00 · 2024-07-19 14:17:15 +08:00 · 36a367d713
commit 36a367d713
parent 6853163905 77b304537a
8 changed files with 28 additions and 7 deletions
--- a/app/models/schema.py
+++ b/app/models/schema.py
@ -108,6 +108,7 @@ class VideoParams(BaseModel):

    voice_name: Optional[str] = ""
    voice_volume: Optional[float] = 1.0
+    voice_rate: Optional[float] = 1.0
    bgm_type: Optional[str] = "random"
    bgm_file: Optional[str] = ""
    bgm_volume: Optional[float] = 0.2
--- a/app/services/task.py
+++ b/app/services/task.py
@ -32,6 +32,7 @@ def start(task_id, params: VideoParams):

    video_subject = params.video_subject
    voice_name = voice.parse_voice_name(params.voice_name)
+    voice_rate = params.voice_rate
    paragraph_number = params.paragraph_number
    n_threads = params.n_threads
    max_clip_duration = params.video_clip_duration
@ -84,7 +85,7 @@ def start(task_id, params: VideoParams):

    logger.info("\n\n## generating audio")
    audio_file = path.join(utils.task_dir(task_id), f"audio.mp3")
-    sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file)
+    sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_rate=voice_rate, voice_file=audio_file)
    if sub_maker is None:
        sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
        logger.error(
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -1028,21 +1028,32 @@ def is_azure_v2_voice(voice_name: str):
    return ""


-def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+def tts(text: str, voice_name: str, voice_rate: float, voice_file: str) -> [SubMaker, None]:
    if is_azure_v2_voice(voice_name):
        return azure_tts_v2(text, voice_name, voice_file)
-    return azure_tts_v1(text, voice_name, voice_file)
+    return azure_tts_v1(text, voice_name, voice_rate, voice_file)


-def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+def convert_rate_to_percent(rate: float) -> str:
+    if rate == 1.0:
+        return "+0%"
+    percent = round((rate - 1.0) * 100)
+    if percent > 0:
+        return f"+{percent}%"
+    else:
+        return f"{percent}%"
+    
+
+def azure_tts_v1(text: str, voice_name: str, voice_rate: float, voice_file: str) -> [SubMaker, None]:
    voice_name = parse_voice_name(voice_name)
    text = text.strip()
+    rate_str = convert_rate_to_percent(voice_rate)
    for i in range(3):
        try:
            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")

            async def _do() -> SubMaker:
-                communicate = edge_tts.Communicate(text, voice_name)
+                communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
                sub_maker = edge_tts.SubMaker()
                with open(voice_file, "wb") as file:
                    async for chunk in communicate.stream():
--- a/webui/Main.py
+++ b/webui/Main.py
@ -510,11 +510,11 @@ with middle_panel:
            with st.spinner(tr("Synthesizing Voice")):
                temp_dir = utils.storage_dir("temp", create=True)
                audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
-                sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_file=audio_file)
+                sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_rate=params.voice_rate, voice_file=audio_file)
                # if the voice file generation failed, try again with a default content.
                if not sub_maker:
                    play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content."
-                    sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_file=audio_file)
+                    sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_rate=params.voice_rate, voice_file=audio_file)

                if sub_maker and os.path.exists(audio_file):
                    st.audio(audio_file, format="audio/mp3")
@ -531,6 +531,10 @@ with middle_panel:

        params.voice_volume = st.selectbox(tr("Speech Volume"),
                                           options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0], index=2)
+        
+        params.voice_rate = st.selectbox(tr("Speech Rate"),
+                                           options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0], index=2)
+        
        bgm_options = [
            (tr("No Background Music"), ""),
            (tr("Random Background Music"), "random"),
--- a/webui/i18n/de.json
+++ b/webui/i18n/de.json
@ -26,6 +26,7 @@
    "Speech Region": "Region(:red[Required，[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Key": "API Key(:red[Required，[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Volume": "Lautstärke der Sprachausgabe",
+    "Speech Rate": "Lesegeschwindigkeit (1,0 bedeutet 1x)",
    "Male": "Männlich",
    "Female": "Weiblich",
    "Background Music": "Hintergrundmusik",
--- a/webui/i18n/en.json
+++ b/webui/i18n/en.json
@ -26,6 +26,7 @@
    "Speech Region": "Region(:red[Required，[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Key": "API Key(:red[Required，[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Volume": "Speech Volume (1.0 represents 100%)",
+    "Speech Rate": "Speech Rate (1.0 means 1x speed)",
    "Male": "Male",
    "Female": "Female",
    "Background Music": "Background Music",
--- a/webui/i18n/vi.json
+++ b/webui/i18n/vi.json
@ -26,6 +26,7 @@
    "Speech Region": "Vùng(:red[Bắt Buộc，[Lấy Vùng](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Key": "Khóa API(:red[Bắt Buộc，[Lấy Khóa API](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Volume": "Âm Lượng Giọng Đọc (1.0 đại diện cho 100%)",
+    "Speech Rate": "Tốc độ đọc (1.0 biểu thị tốc độ gốc)",
    "Male": "Nam",
    "Female": "Nữ",
    "Background Music": "Âm Nhạc Nền",
--- a/webui/i18n/zh.json
+++ b/webui/i18n/zh.json
@ -26,6 +26,7 @@
    "Speech Region": "服务区域 (:red[必填，[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Key": "API Key (:red[必填，密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
    "Speech Volume": "朗读音量（1.0表示100%）",
+    "Speech Rate": "朗读速度（1.0表示1倍速）",
    "Male": "男性",
    "Female": "女性",
    "Background Music": "背景音乐",