fix: Handle Edge TTS failures with Azure fallback and stabilize media pipeline

This commit is contained in:
JAAAACCCCCCKKKK 2025-10-01 21:18:52 +01:00
parent 6cb5f23487
commit f76e19ed52
No known key found for this signature in database
GPG Key ID: 031A2F4F241F588B
4 changed files with 161 additions and 42 deletions

View File

@ -249,6 +249,9 @@ def combine_videos(
# if there is only one clip, use it directly
if len(processed_clips) == 1:
logger.info("using single clip directly")
# remove existing file to avoid FileExistsError
if os.path.exists(combined_video_path):
delete_files(combined_video_path)
shutil.copy(processed_clips[0].file_path, combined_video_path)
delete_files(processed_clips)
logger.info("video combining completed")
@ -260,6 +263,7 @@ def combine_videos(
temp_merged_next = f"{output_dir}/temp-merged-next.mp4"
# copy first clip as initial merged video
delete_files([temp_merged_video, temp_merged_next])
shutil.copy(base_clip_path, temp_merged_video)
# merge remaining video clips one by one
@ -289,14 +293,16 @@ def combine_videos(
# replace base file with new merged file
delete_files(temp_merged_video)
os.rename(temp_merged_next, temp_merged_video)
os.replace(temp_merged_next, temp_merged_video)
except Exception as e:
logger.error(f"failed to merge clip: {str(e)}")
continue
# after merging, rename final result to target file name
os.rename(temp_merged_video, combined_video_path)
if os.path.exists(combined_video_path):
delete_files(combined_video_path)
os.replace(temp_merged_video, combined_video_path)
# clean temp files
clip_files = [clip.file_path for clip in processed_clips]
@ -490,42 +496,44 @@ def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
continue
ext = utils.parse_extension(material.url)
clip = None
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required")
continue
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required")
continue
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
# Create an image clip and set its duration to 3 seconds
clip = (
ImageClip(material.url)
.with_duration(clip_duration)
.with_position("center")
)
# Apply a zoom effect using the resize method.
# A lambda function is used to make the zoom effect dynamic over time.
# The zoom effect starts from the original size and gradually scales up to 120%.
# t represents the current time, and clip.duration is the total duration of the clip (3 seconds).
# Note: 1 represents 100% size, so 1.2 represents 120% size.
zoom_clip = clip.resized(
lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
)
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
image_clip = None
zoom_clip = None
final_clip = None
try:
image_clip = (
ImageClip(material.url)
.with_duration(clip_duration)
.with_position("center")
)
zoom_clip = image_clip.resized(
lambda t: 1 + (clip_duration * 0.03) * (t / image_clip.duration)
)
# Optionally, create a composite video clip containing the zoomed clip.
# This is useful when you want to add other elements to the video.
final_clip = CompositeVideoClip([zoom_clip])
final_clip = CompositeVideoClip([zoom_clip])
# Output the video to a file.
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
material.url = video_file
logger.success(f"image processed: {video_file}")
finally:
close_clip(final_clip)
close_clip(zoom_clip)
close_clip(image_clip)
finally:
close_clip(clip)
material.url = video_file
logger.success(f"image processed: {video_file}")
return materials

View File

@ -7,6 +7,7 @@ from xml.sax.saxutils import unescape
import edge_tts
import requests
from aiohttp import ClientConnectorError
from edge_tts import SubMaker, submaker
from edge_tts.submaker import mktimestamp
from loguru import logger
@ -1115,20 +1116,34 @@ def convert_rate_to_percent(rate: float) -> str:
else:
return f"{percent}%"
def _ensure_voice_directory(voice_file: str) -> None:
dir_path = os.path.dirname(voice_file)
if dir_path:
os.makedirs(dir_path, exist_ok=True)
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_file: str
) -> Union[SubMaker, None]:
voice_name = parse_voice_name(voice_name)
norm_voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
azure_key = config.azure.get("speech_key", "")
azure_region = config.azure.get("speech_region", "")
fallback_available = bool(azure_key and azure_region)
fallback_attempted = False
for i in range(3):
try:
logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
logger.info(
f"Edge TTS start, voice name: {norm_voice_name}, try: {i + 1}"
)
async def _do() -> SubMaker:
communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
communicate = edge_tts.Communicate(
text, norm_voice_name, rate=rate_str
)
sub_maker = edge_tts.SubMaker()
_ensure_voice_directory(voice_file)
with open(voice_file, "wb") as file:
async for chunk in communicate.stream():
if chunk["type"] == "audio":
@ -1141,13 +1156,53 @@ def azure_tts_v1(
sub_maker = asyncio.run(_do())
if not sub_maker or not sub_maker.subs:
logger.warning("failed, sub_maker is None or sub_maker.subs is None")
logger.warning(
"Edge TTS failed, sub_maker is None or sub_maker.subs is None"
)
continue
logger.info(f"completed, output file: {voice_file}")
logger.info(f"Edge TTS completed, output file: {voice_file}")
return sub_maker
except (asyncio.TimeoutError, ClientConnectorError) as network_error:
logger.warning(
"Edge TTS encountered a network issue: {}".format(network_error)
)
if fallback_available and not fallback_attempted:
fallback_attempted = True
fallback_voice_name = f"{norm_voice_name}-V2"
logger.info(
"Attempting Azure Speech SDK fallback with voice: {}".format(
fallback_voice_name
)
)
fallback_sub_maker = azure_tts_v2(
text=text,
voice_name=fallback_voice_name,
voice_file=voice_file,
)
if fallback_sub_maker and getattr(
fallback_sub_maker, "subs", None
):
logger.info(
f"Azure Speech SDK fallback completed, output file: {voice_file}"
)
return fallback_sub_maker
logger.error("Azure Speech SDK fallback failed to synthesize audio")
elif not fallback_available:
logger.warning(
"Azure Speech SDK fallback unavailable - missing credentials"
)
except Exception as e:
logger.error(f"failed, error: {str(e)}")
logger.error(f"Edge TTS failed, error: {str(e)}")
if fallback_available and fallback_attempted:
logger.error(
f"Edge TTS and Azure Speech SDK fallback both failed for voice: {norm_voice_name}"
)
else:
logger.error(
f"Edge TTS failed for voice: {norm_voice_name} after retries"
)
return None
@ -1211,6 +1266,7 @@ def siliconflow_tts(
if response.status_code == 200:
# 保存音频文件
_ensure_voice_directory(voice_file)
with open(voice_file, "wb") as f:
f.write(response.content)
@ -1341,6 +1397,8 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker,
logger.error("Azure speech key or region is not set")
return None
_ensure_voice_directory(voice_file)
audio_config = speechsdk.audio.AudioOutputConfig(
filename=voice_file, use_default_speaker=True
)

View File

@ -41,8 +41,11 @@ class TestVideoService(unittest.TestCase):
# moviepy get video info
clip = VideoFileClip(materials[0].url)
print(clip)
try:
print(clip)
finally:
clip.close()
# clean generated test video file
if os.path.exists(materials[0].url):
os.remove(materials[0].url)

View File

@ -3,6 +3,7 @@ import unittest
import os
import sys
from pathlib import Path
from unittest import mock
# add project root to python path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
@ -101,6 +102,55 @@ class TestVoiceService(unittest.TestCase):
self.loop.run_until_complete(_do())
def test_azure_tts_v1_fallback_to_v2(self):
    """An Edge TTS timeout should trigger the Azure Speech SDK (v2) fallback."""
    voice_name = "en-US-JennyNeural-Female"
    normalized_voice_name = vs.parse_voice_name(voice_name)
    voice_file = f"{temp_dir}/tts-azure-fallback-{normalized_voice_name}.mp3"
    text_value = " hello world "

    # Canned result the mocked azure_tts_v2 fallback will hand back.
    fallback_sub_maker = vs.SubMaker()
    fallback_sub_maker.subs = ["hello"]

    def raise_timeout(coro):
        # Close the never-awaited coroutine to avoid a RuntimeWarning,
        # then simulate an Edge TTS network timeout.
        coro.close()
        raise asyncio.TimeoutError()

    with mock.patch(
        "app.services.voice.asyncio.run", side_effect=raise_timeout
    ) as mock_asyncio_run, mock.patch(
        "app.services.voice.azure_tts_v2", return_value=fallback_sub_maker
    ) as mock_azure_v2:
        # Install dummy Azure credentials so the fallback path is available,
        # remembering the previous values for restoration afterwards.
        saved_key = vs.config.azure.get("speech_key")
        saved_region = vs.config.azure.get("speech_region")
        vs.config.azure["speech_key"] = "dummy-key"
        vs.config.azure["speech_region"] = "dummy-region"
        try:
            sub_maker = vs.azure_tts_v1(
                text=text_value,
                voice_name=voice_name,
                voice_rate=1.0,
                voice_file=voice_file,
            )
        finally:
            # Restore the original config so other tests are unaffected.
            if saved_key is None:
                vs.config.azure.pop("speech_key", None)
            else:
                vs.config.azure["speech_key"] = saved_key
            if saved_region is None:
                vs.config.azure.pop("speech_region", None)
            else:
                vs.config.azure["speech_region"] = saved_region

    self.assertIs(sub_maker, fallback_sub_maker)
    mock_asyncio_run.assert_called_once()
    mock_azure_v2.assert_called_once_with(
        text=text_value.strip(),
        voice_name=f"{normalized_voice_name}-V2",
        voice_file=voice_file,
    )
if __name__ == "__main__":
# python -m unittest test.services.test_voice.TestVoiceService.test_azure_tts_v1
# python -m unittest test.services.test_voice.TestVoiceService.test_azure_tts_v2