fix: Handle Edge TTS failures with Azure fallback and stabilize media pipeline

This commit is contained in:
JAAAACCCCCCKKKK 2025-10-01 21:18:52 +01:00
parent 6cb5f23487
commit f76e19ed52
No known key found for this signature in database
GPG Key ID: 031A2F4F241F588B
4 changed files with 161 additions and 42 deletions

View File

@ -249,6 +249,9 @@ def combine_videos(
# if there is only one clip, use it directly
if len(processed_clips) == 1:
logger.info("using single clip directly")
# remove existing file to avoid FileExistsError
if os.path.exists(combined_video_path):
delete_files(combined_video_path)
shutil.copy(processed_clips[0].file_path, combined_video_path)
delete_files(processed_clips)
logger.info("video combining completed")
@ -260,6 +263,7 @@ def combine_videos(
temp_merged_next = f"{output_dir}/temp-merged-next.mp4"
# copy first clip as initial merged video
delete_files([temp_merged_video, temp_merged_next])
shutil.copy(base_clip_path, temp_merged_video)
# merge remaining video clips one by one
@ -289,14 +293,16 @@ def combine_videos(
# replace base file with new merged file
delete_files(temp_merged_video)
os.rename(temp_merged_next, temp_merged_video)
os.replace(temp_merged_next, temp_merged_video)
except Exception as e:
logger.error(f"failed to merge clip: {str(e)}")
continue
# after merging, rename final result to target file name
os.rename(temp_merged_video, combined_video_path)
if os.path.exists(combined_video_path):
delete_files(combined_video_path)
os.replace(temp_merged_video, combined_video_path)
# clean temp files
clip_files = [clip.file_path for clip in processed_clips]
@ -490,42 +496,44 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
continue
ext = utils.parse_extension(material.url)
clip = None
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required")
continue
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required")
continue
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
# Create an image clip and set its duration to 3 seconds
clip = (
ImageClip(material.url)
.with_duration(clip_duration)
.with_position("center")
)
# Apply a zoom effect using the resize method.
# A lambda function is used to make the zoom effect dynamic over time.
# The zoom effect starts from the original size and gradually scales up to 120%.
# t represents the current time, and clip.duration is the total duration of the clip (3 seconds).
# Note: 1 represents 100% size, so 1.2 represents 120% size.
zoom_clip = clip.resized(
lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
)
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
image_clip = None
zoom_clip = None
final_clip = None
try:
image_clip = (
ImageClip(material.url)
.with_duration(clip_duration)
.with_position("center")
)
zoom_clip = image_clip.resized(
lambda t: 1 + (clip_duration * 0.03) * (t / image_clip.duration)
)
# Optionally, create a composite video clip containing the zoomed clip.
# This is useful when you want to add other elements to the video.
final_clip = CompositeVideoClip([zoom_clip])
final_clip = CompositeVideoClip([zoom_clip])
# Output the video to a file.
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
material.url = video_file
logger.success(f"image processed: {video_file}")
finally:
close_clip(final_clip)
close_clip(zoom_clip)
close_clip(image_clip)
finally:
close_clip(clip)
material.url = video_file
logger.success(f"image processed: {video_file}")
return materials

View File

@ -7,6 +7,7 @@ from xml.sax.saxutils import unescape
import edge_tts
import requests
from aiohttp import ClientConnectorError
from edge_tts import SubMaker, submaker
from edge_tts.submaker import mktimestamp
from loguru import logger
@ -1115,20 +1116,34 @@ def convert_rate_to_percent(rate: float) -> str:
else:
return f"{percent}%"
def _ensure_voice_directory(voice_file: str) -> None:
dir_path = os.path.dirname(voice_file)
if dir_path:
os.makedirs(dir_path, exist_ok=True)
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_file: str
) -> Union[SubMaker, None]:
voice_name = parse_voice_name(voice_name)
norm_voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
azure_key = config.azure.get("speech_key", "")
azure_region = config.azure.get("speech_region", "")
fallback_available = bool(azure_key and azure_region)
fallback_attempted = False
for i in range(3):
try:
logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
logger.info(
f"Edge TTS start, voice name: {norm_voice_name}, try: {i + 1}"
)
async def _do() -> SubMaker:
communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
communicate = edge_tts.Communicate(
text, norm_voice_name, rate=rate_str
)
sub_maker = edge_tts.SubMaker()
_ensure_voice_directory(voice_file)
with open(voice_file, "wb") as file:
async for chunk in communicate.stream():
if chunk["type"] == "audio":
@ -1141,13 +1156,53 @@ def azure_tts_v1(
sub_maker = asyncio.run(_do())
if not sub_maker or not sub_maker.subs:
logger.warning("failed, sub_maker is None or sub_maker.subs is None")
logger.warning(
"Edge TTS failed, sub_maker is None or sub_maker.subs is None"
)
continue
logger.info(f"completed, output file: {voice_file}")
logger.info(f"Edge TTS completed, output file: {voice_file}")
return sub_maker
except (asyncio.TimeoutError, ClientConnectorError) as network_error:
logger.warning(
"Edge TTS encountered a network issue: {}".format(network_error)
)
if fallback_available and not fallback_attempted:
fallback_attempted = True
fallback_voice_name = f"{norm_voice_name}-V2"
logger.info(
"Attempting Azure Speech SDK fallback with voice: {}".format(
fallback_voice_name
)
)
fallback_sub_maker = azure_tts_v2(
text=text,
voice_name=fallback_voice_name,
voice_file=voice_file,
)
if fallback_sub_maker and getattr(
fallback_sub_maker, "subs", None
):
logger.info(
f"Azure Speech SDK fallback completed, output file: {voice_file}"
)
return fallback_sub_maker
logger.error("Azure Speech SDK fallback failed to synthesize audio")
elif not fallback_available:
logger.warning(
"Azure Speech SDK fallback unavailable - missing credentials"
)
except Exception as e:
logger.error(f"failed, error: {str(e)}")
logger.error(f"Edge TTS failed, error: {str(e)}")
if fallback_available and fallback_attempted:
logger.error(
f"Edge TTS and Azure Speech SDK fallback both failed for voice: {norm_voice_name}"
)
else:
logger.error(
f"Edge TTS failed for voice: {norm_voice_name} after retries"
)
return None
@ -1211,6 +1266,7 @@ def siliconflow_tts(
if response.status_code == 200:
# 保存音频文件
_ensure_voice_directory(voice_file)
with open(voice_file, "wb") as f:
f.write(response.content)
@ -1341,6 +1397,8 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
logger.error("Azure speech key or region is not set")
return None
_ensure_voice_directory(voice_file)
audio_config = speechsdk.audio.AudioOutputConfig(
filename=voice_file, use_default_speaker=True
)

View File

@ -41,8 +41,11 @@ class TestVideoService(unittest.TestCase):
# moviepy get video info
clip = VideoFileClip(materials[0].url)
print(clip)
try:
print(clip)
finally:
clip.close()
# clean generated test video file
if os.path.exists(materials[0].url):
os.remove(materials[0].url)

View File

@ -3,6 +3,7 @@ import unittest
import os
import sys
from pathlib import Path
from unittest import mock
# add project root to python path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
@ -101,6 +102,55 @@ class TestVoiceService(unittest.TestCase):
self.loop.run_until_complete(_do())
def test_azure_tts_v1_fallback_to_v2(self):
    """An Edge TTS timeout should trigger the Azure Speech SDK (v2) fallback."""
    voice_name = "en-US-JennyNeural-Female"
    normalized_voice_name = vs.parse_voice_name(voice_name)
    voice_file = f"{temp_dir}/tts-azure-fallback-{normalized_voice_name}.mp3"
    text_value = " hello world "

    # Canned result the mocked azure_tts_v2 fallback will hand back.
    fallback_sub_maker = vs.SubMaker()
    fallback_sub_maker.subs = ["hello"]

    def raise_timeout(coro):
        # Close the never-awaited coroutine to avoid a RuntimeWarning,
        # then simulate an Edge TTS network timeout.
        coro.close()
        raise asyncio.TimeoutError()

    with mock.patch(
        "app.services.voice.asyncio.run", side_effect=raise_timeout
    ) as mock_asyncio_run, mock.patch(
        "app.services.voice.azure_tts_v2", return_value=fallback_sub_maker
    ) as mock_azure_v2:
        # Install dummy Azure credentials so the fallback path is available,
        # remembering the previous values for restoration afterwards.
        saved_key = vs.config.azure.get("speech_key")
        saved_region = vs.config.azure.get("speech_region")
        vs.config.azure["speech_key"] = "dummy-key"
        vs.config.azure["speech_region"] = "dummy-region"
        try:
            sub_maker = vs.azure_tts_v1(
                text=text_value,
                voice_name=voice_name,
                voice_rate=1.0,
                voice_file=voice_file,
            )
        finally:
            # Restore the original config so other tests are unaffected.
            if saved_key is None:
                vs.config.azure.pop("speech_key", None)
            else:
                vs.config.azure["speech_key"] = saved_key
            if saved_region is None:
                vs.config.azure.pop("speech_region", None)
            else:
                vs.config.azure["speech_region"] = saved_region

    self.assertIs(sub_maker, fallback_sub_maker)
    mock_asyncio_run.assert_called_once()
    mock_azure_v2.assert_called_once_with(
        text=text_value.strip(),
        voice_name=f"{normalized_voice_name}-V2",
        voice_file=voice_file,
    )
if __name__ == "__main__":
# python -m unittest test.services.test_voice.TestVoiceService.test_azure_tts_v1
# python -m unittest test.services.test_voice.TestVoiceService.test_azure_tts_v2