From 63fb848a1731ea58209630051b00deb6ff36345d Mon Sep 17 00:00:00 2001 From: yyhhyyyyyy <95077259+yyhhyyyyyy@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:06:34 +0800 Subject: [PATCH 1/2] 1. Add azure_tts_v1 to control the speed of speech --- app/models/schema.py | 1 + app/services/task.py | 3 ++- app/services/voice.py | 19 +++++++++++++++---- webui/Main.py | 8 ++++++-- webui/i18n/zh.json | 1 + 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/app/models/schema.py b/app/models/schema.py index e7d6576..c45d7d9 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -108,6 +108,7 @@ class VideoParams(BaseModel): voice_name: Optional[str] = "" voice_volume: Optional[float] = 1.0 + voice_rate: Optional[float] = 1.0 bgm_type: Optional[str] = "random" bgm_file: Optional[str] = "" bgm_volume: Optional[float] = 0.2 diff --git a/app/services/task.py b/app/services/task.py index c413303..cf396c4 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -32,6 +32,7 @@ def start(task_id, params: VideoParams): video_subject = params.video_subject voice_name = voice.parse_voice_name(params.voice_name) + voice_rate = params.voice_rate paragraph_number = params.paragraph_number n_threads = params.n_threads max_clip_duration = params.video_clip_duration @@ -84,7 +85,7 @@ def start(task_id, params: VideoParams): logger.info("\n\n## generating audio") audio_file = path.join(utils.task_dir(task_id), f"audio.mp3") - sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file) + sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_rate=voice_rate, voice_file=audio_file) if sub_maker is None: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) logger.error( diff --git a/app/services/voice.py b/app/services/voice.py index 2611835..2a31637 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -1028,21 +1028,32 @@ def is_azure_v2_voice(voice_name: str): return "" -def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]: +def tts(text: str, voice_name: str, voice_rate: float, voice_file: str) -> [SubMaker, None]: if is_azure_v2_voice(voice_name): return azure_tts_v2(text, voice_name, voice_file) - return azure_tts_v1(text, voice_name, voice_file) + return azure_tts_v1(text, voice_name, voice_rate, voice_file) -def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]: +def convert_rate_to_percent(rate: float) -> str: + if rate == 1.0: + return "+0%" + percent = round((rate - 1.0) * 100) + if percent > 0: + return f"+{percent}%" + else: + return f"{percent}%" + + +def azure_tts_v1(text: str, voice_name: str, voice_rate: float, voice_file: str) -> [SubMaker, None]: voice_name = parse_voice_name(voice_name) text = text.strip() + rate_str = convert_rate_to_percent(voice_rate) for i in range(3): try: logger.info(f"start, voice name: {voice_name}, try: {i + 1}") async def _do() -> SubMaker: - communicate = edge_tts.Communicate(text, voice_name) + communicate = edge_tts.Communicate(text, voice_name, rate=rate_str) sub_maker = edge_tts.SubMaker() with open(voice_file, "wb") as file: async for chunk in communicate.stream(): diff --git a/webui/Main.py b/webui/Main.py index 69979e2..1cec842 100644 --- a/webui/Main.py +++ b/webui/Main.py @@ -510,11 +510,11 @@ with middle_panel: with st.spinner(tr("Synthesizing Voice")): temp_dir = utils.storage_dir("temp", create=True) audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3") - sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_file=audio_file) + sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_rate=params.voice_rate, voice_file=audio_file) # if the voice file generation failed, try again with a default content. if not sub_maker: play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content." - sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_file=audio_file) + sub_maker = voice.tts(text=play_content, voice_name=voice_name, voice_rate=params.voice_rate, voice_file=audio_file) if sub_maker and os.path.exists(audio_file): st.audio(audio_file, format="audio/mp3") @@ -531,6 +531,10 @@ with middle_panel: params.voice_volume = st.selectbox(tr("Speech Volume"), options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0], index=2) + + params.voice_rate = st.selectbox(tr("Speech Rate"), + options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0], index=2) + bgm_options = [ (tr("No Background Music"), ""), (tr("Random Background Music"), "random"), diff --git a/webui/i18n/zh.json b/webui/i18n/zh.json index 4d560da..019e7c1 100644 --- a/webui/i18n/zh.json +++ b/webui/i18n/zh.json @@ -26,6 +26,7 @@ "Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Key": "API Key (:red[必填,密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Volume": "朗读音量(1.0表示100%)", + "Speech Rate": "朗读速度(1.0表示1倍速)", "Male": "男性", "Female": "女性", "Background Music": "背景音乐", From 77b304537ade3bf98db0d3d52963b02ce869e729 Mon Sep 17 00:00:00 2001 From: yyhhyyyyyy <95077259+yyhhyyyyyy@users.noreply.github.com> Date: Fri, 19 Jul 2024 11:15:36 +0800 Subject: [PATCH 2/2] Speech Rate --- webui/i18n/de.json | 1 + webui/i18n/en.json | 1 + webui/i18n/vi.json | 1 + 3 files changed, 3 insertions(+) diff --git a/webui/i18n/de.json b/webui/i18n/de.json index ca4af0a..42a26c3 100644 --- a/webui/i18n/de.json +++ b/webui/i18n/de.json @@ -26,6 +26,7 @@ "Speech Region": "Region(:red[Required,[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Key": "API Key(:red[Required,[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Volume": "Lautstärke der Sprachausgabe", + "Speech Rate": "Lesegeschwindigkeit (1,0 bedeutet 1x)", "Male": "Männlich", "Female": "Weiblich", "Background Music": "Hintergrundmusik", diff --git a/webui/i18n/en.json b/webui/i18n/en.json index 3ca37ca..11d4831 100644 --- a/webui/i18n/en.json +++ b/webui/i18n/en.json @@ -26,6 +26,7 @@ "Speech Region": "Region(:red[Required,[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Key": "API Key(:red[Required,[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Volume": "Speech Volume (1.0 represents 100%)", + "Speech Rate": "Speech Rate (1.0 means 1x speed)", "Male": "Male", "Female": "Female", "Background Music": "Background Music", diff --git a/webui/i18n/vi.json b/webui/i18n/vi.json index c1fd822..2cd6a98 100644 --- a/webui/i18n/vi.json +++ b/webui/i18n/vi.json @@ -26,6 +26,7 @@ "Speech Region": "Vùng(:red[Bắt Buộc,[Lấy Vùng](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Key": "Khóa API(:red[Bắt Buộc,[Lấy Khóa API](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Volume": "Âm Lượng Giọng Đọc (1.0 đại diện cho 100%)", + "Speech Rate": "Tốc độ đọc (1.0 biểu thị tốc độ gốc)", "Male": "Nam", "Female": "Nữ", "Background Music": "Âm Nhạc Nền",