diff --git a/.gitignore b/.gitignore index 6aa0ca7..bfcac95 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ /app/utils/__pycache__/ /*/__pycache__/* .vscode -/**/.streamlit + __pycache__ logs/ diff --git a/app/models/schema.py b/app/models/schema.py index 3696fa3..8f0bd32 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -3,7 +3,7 @@ from enum import Enum from typing import Any, List, Optional, Union import pydantic -from pydantic import BaseModel +from pydantic import BaseModel, Field # 忽略 Pydantic 的特定警告 warnings.filterwarnings( @@ -74,7 +74,7 @@ class VideoParams(BaseModel): video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value video_transition_mode: Optional[VideoTransitionMode] = None - video_clip_duration: Optional[int] = 5 + max_clip_duration: Optional[int] = 5 video_count: Optional[int] = 1 video_source: Optional[str] = "pexels" @@ -103,7 +103,7 @@ class VideoParams(BaseModel): stroke_width: float = 1.5 n_threads: Optional[int] = 2 paragraph_number: Optional[int] = 1 - + storyboard_mode: bool = Field(False, description="是否启用故事板模式以实现音画同步") class SubtitleRequest(BaseModel): video_script: str diff --git a/app/services/llm.py b/app/services/llm.py index 24abfc8..44df275 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -2,7 +2,7 @@ import json import logging import re import requests -from typing import List +from typing import List, Dict import g4f from loguru import logger @@ -173,7 +173,7 @@ def _generate_response(prompt: str) -> str: "temperature": 0.5, "top_p": 1, "top_k": 1, - "max_output_tokens": 2048, + "max_output_tokens": 8192, } safety_settings = [ @@ -270,8 +270,10 @@ def _generate_response(prompt: str) -> str: base_url=base_url, ) - response = client.chat.completions.create( - model=model_name, messages=[{"role": "user", "content": prompt}] + response: ChatCompletion = client.chat.completions.create( + model=model_name, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096 ) if response: if isinstance(response, ChatCompletion): @@ -392,77 +394,168 @@ Generate a script for a video, depending on the subject of the video. # ### Video Subject # {video_subject} def generate_terms(video_subject: str, video_script: str) -> List[str]: - prompt = f""" -# Role: AI Video Director and Editor + """ + Generate video terms from video subject and script. + """ + prompt_template = """ +# Role: Video Search Terms Generator -## Core Goal: -Your mission is to meticulously analyze the provided video script, break it down into distinct visual scenes, and generate a diverse list of English search terms for stock footage. +## Task: +Generate a concise, comma-separated list of 1-5 English search terms based on the provided `Video Subject` and `Video Script`. These terms will be used to find relevant video clips. -## Step-by-Step Instructions: -1. Read the entire `{video_subject}` script to understand the main narrative and mood. -2. Go through the script paragraph by paragraph (or by logical scene breaks). -3. For each paragraph/scene, generate ONE primary search term that best captures its visual essence. -4. Compile all generated search terms into a single JSON array. +## Instructions: +1. **Analyze Context:** Read the `Video Subject` and `Video Script` to understand the main topics and visual elements. +2. **Brainstorm Keywords:** Think of concrete, visually-driven keywords. Avoid abstract concepts. +3. 
**Select & Refine:** Choose the most powerful and representative terms. +4. **Format Output:** Provide a single line of comma-separated English keywords. Do not include any other text, explanations, or formatting. -## Keyword Generation Principles: -- **DIVERSITY**: CRITICAL. Avoid repetitive or overly similar terms. Each keyword must represent a distinct visual concept from the script. -- **SPECIFICITY**: Be specific. Instead of "car driving," prefer "sports car on mountain road at sunset." -- **VISUAL & CONCRETE**: Each term must describe a tangible, visual scene. Do not use abstract concepts (e.g., "sadness", "freedom"). -- **CONCISENESS**: Terms should ideally be 2-4 words long. -- **RELEVANCE**: Every term must be directly inspired by a part of the script and be relevant to the main video subject. +## Example: +**Video Subject:** "The Impact of Sugar on Your Brain" +**Video Script:** "Sugar, a sweet temptation, can have a profound effect on our brain chemistry..." +**Output:** +`sugar cubes, brain scan, dopamine release, person eating candy, neural pathways` -## Output Format Constraints: -- You MUST return a pure, single JSON Array. No introductory text, no markdown. Your entire response body must be a valid JSON array. -- All search terms must be in English. - -## Example of a Good Output: -["dramatic mountain landscape", "hiker reaching summit", "close up of old compass", "time-lapse of starry night", "..."] - -## Context: +## Your Turn: ### Video Subject: {video_subject} -### Video Script +### Video Script: {video_script} -Please note that you must use English for generating video search terms; Chinese is not accepted. -""".strip() +### Output: +""" + prompt = prompt_template.format( + video_subject=video_subject, video_script=video_script + ) logger.info(f"subject: {video_subject}") - search_terms = [] - response = "" - for i in range(_max_retries): - try: - response = _generate_response(prompt) - if "Error: " in response: - logger.error(f"failed to generate video script: {response}") - return response - search_terms = json.loads(response) - if not isinstance(search_terms, list) or not all( - isinstance(term, str) for term in search_terms - ): - logger.error("response is not a list of strings.") - continue + try: + response = _generate_response(prompt) + # remove blank lines + generated_text = "\n".join( + [line for line in response.split("\n") if line.strip()] + ) + if not generated_text: + logger.warning("LLM returned empty terms list.") + return [] - except Exception as e: - logger.warning(f"failed to generate video terms: {str(e)}") - if response: - match = re.search(r"\[.*]", response) - if match: - try: - search_terms = json.loads(match.group()) - except Exception as e: - logger.warning(f"failed to generate video terms: {str(e)}") - pass + terms = [term.strip().strip("`'\"") for term in generated_text.split(",")] + logger.info(f"Generated terms: {terms}") + return terms + except Exception as e: + logger.error(f"Failed to generate video terms: {e}") + return [] - if search_terms and len(search_terms) > 0: - break - if i < _max_retries: - logger.warning(f"failed to generate video terms, trying again... {i + 1}") - logger.success(f"completed: \n{search_terms}") - return search_terms +# def generate_storyboard(video_subject: str, video_script: str) -> List[Dict]: +# """ +# Analyzes the entire script, breaks it down into scenes, and generates matching search terms for each scene. 
+# Returns a list of scenes, where each scene is a dictionary containing 'scene_script' and 'search_terms'. +# """ +# prompt = f""" +# # Role: Video Script Analyst + +# ## GOAL: +# Your task is to transform a video script into a storyboard. You will read the provided script, segment it into scenes, and for each scene, generate a set of descriptive, visual search terms that will be used to find stock video footage. The final output must be a valid JSON array of objects. + +# ## STEP-BY-STEP INSTRUCTIONS: +# 1. **Segment the Script:** Read the `Video Script` and break it down into short, logical, spoken segments. A segment should typically be one or two sentences long. + +# ## EXAMPLE (Note the Realism and Concreteness): +# [ +# {{ +# "scene_script": "Blueberries. They're often called nature's perfect food for your eyes.", +# "search_terms": ["woman eating fresh blueberries from a bowl", "close up of fresh blueberries", "bowl of blueberries on a table"] +# }}, +# {{ +# "scene_script": "And for good reason. Packed with anthocyanins, vitamin C, and ludian...", +# "search_terms": ["nutritionist explaining health benefits", "close up of vitamin C tablets", "diagram of anthocyanin molecule"] +# }}, +# {{ +# "scene_script": "...these tiny berries act like microscopic shields, protecting your retina and macula from oxidative stress and age related damage.", +# "search_terms": ["medical animation of the human eye", "diagram of the retina and macula", "older person with healthy eyes smiling"] +# }} +# ] + +# ## CONTEXT: +# ### Video Subject: +# {video_subject} + +# ### Video Script: +# {video_script} +def generate_storyboard(video_subject: str, video_script: str) -> List[Dict]: + """ + Analyzes the script, breaks it into scenes, and extracts the main subject nouns as search terms for each scene. + Returns a list of scenes, where each scene is a dictionary containing 'scene_script' and 'search_terms'. + """ + # [核心修改] 通过更明确、更强力的指令,强制要求 LLM 将视频脚本的每一句话都处理成一个独立的场景,并为每个场景生成对应的英文关键词。 + prompt = f""" +You are a video production assistant. Your task is to process a script for a video, breaking it down sentence by sentence to generate visual search terms. + +**CRITICAL INSTRUCTIONS - FOLLOW THESE RULES EXACTLY:** + +1. **ONE SENTENCE = ONE VISUAL SEGMENT:** Each sentence from the script is a distinct visual segment. Do not merge sentences. +2. **CONCRETE & VISUAL KEYWORDS ONLY:** The `search_terms` MUST be concrete, visual, and tangible things. They must be nouns or descriptive actions that can be found in a video library. + - **GOOD:** `blueberries`, `person walking`, `city skyline`, `laughing friends`, `human eye`. + - **BAD / FORBIDDEN:** `reason`, `concept`, `idea`, `method`, `health`, `protection`, `damage`. Never use abstract, non-visual words. +3. **MANDATORY KEYWORD DIVERSITY:** You are FORBIDDEN from using the same primary keyword for two consecutive segments. If segment 1 uses `blueberries`, segment 2 MUST use a different but relevant keyword (e.g., `antioxidants` could be visualized as `colorful fruits`, `retina` as `close-up of eye`). DIVERSIFY a lot. + +**REQUIRED OUTPUT FORMAT:** +- You must output a valid JSON array of objects. +- Each object represents one sentence and must ONLY contain two keys: `script` and `search_terms`. + +**EXAMPLE:** + +Video Script: +"Blueberries are packed with anthocyanins, which are great for your eyes. These antioxidants protect the retina from damage." 
+ +Your JSON Output: +```json +[ + {{ + "script": "Blueberries are packed with anthocyanins, which are great for your eyes.", + "search_terms": "blueberries, fresh fruit, antioxidant food" + }}, + {{ + "script": "These antioxidants protect the retina from damage.", + "search_terms": "close-up of eye, retina scan, vision test" + }} +] +``` + +**Video Script to Process:** +``` +{video_script} +``` + +**Your JSON Output (must be a valid JSON array):** +""" + # return [] + + logger.info(f"Generating storyboard for subject: {video_subject}") + response_str = _generate_response(prompt) + + try: + # The model should return a valid JSON array string. + # Find the start and end of the JSON array. + json_start = response_str.find('[') + json_end = response_str.rfind(']') + if json_start != -1 and json_end != -1 and json_start < json_end: + json_str = response_str[json_start:json_end+1] + storyboard = json.loads(json_str) + logger.success("Successfully parsed storyboard from LLM response.") + return storyboard + else: + logger.error(f"Could not find a valid JSON array in the response. Raw response: {response_str}") + return [] + except json.JSONDecodeError: + logger.error(f"Failed to parse JSON. Raw response: {response_str}") + # Fallback logic can be added here if needed, e.g., using regex to extract JSON. + return [] + + +# ... (您的其他函数和代码保持不变) if __name__ == "__main__": @@ -479,4 +572,42 @@ if __name__ == "__main__": print(search_terms) print("-----输出包含的场景数量-----") print(len(search_terms)) - \ No newline at end of file + +def generate_video_category(video_subject: str) -> str: + """ + Selects the most appropriate video category from a predefined list based on the video subject. + """ + prompt = f""" +# Role: Video Category Selector + +## Goal: +Based on the provided 'Video Subject', select the ONE most suitable category from the `Category List` that best represents the subject. Your response must be only the single category name. + +## Category List: +backgrounds, fashion, nature, science, education, feelings, health, people, religion, places, animals, industry, computer, food, sports, transportation, travel, buildings, business, music + +## Instructions: +- Analyze the 'Video Subject'. +- Choose the single best-fitting category from the list. +- Respond with ONLY the category name and nothing else. + +## Example: +Video Subject: "The benefits of a ketogenic diet" +Response: health + +Video Subject: "A tour of the Grand Canyon" +Response: travel + +## CONTEXT: +### Video Subject: +{video_subject} +""" + category = _generate_response(prompt).strip().lower() + # Fallback to a default category if the response is invalid + valid_categories = ["backgrounds", "fashion", "nature", "science", "education", "feelings", "health", "people", "religion", "places", "animals", "industry", "computer", "food", "sports", "transportation", "travel", "buildings", "business", "music"] + if category not in valid_categories: + logger.warning(f"Generated category '{category}' is not valid. 
Falling back to 'nature'.") + return "nature" + + logger.success(f"Successfully selected video category: {category}") + return category \ No newline at end of file diff --git a/app/services/material.py b/app/services/material.py index 6c6e6e6..fe01971 100644 --- a/app/services/material.py +++ b/app/services/material.py @@ -102,6 +102,8 @@ def search_videos_pexels( item.provider = "pexels" item.url = best_landscape_file["link"] # 使用最佳版本的链接 item.duration = duration + item.path = "" + item.start_time = 0.0 video_items.append(item) logging.info("选取的Mp4链接地址为{}".format(item.url)) return video_items @@ -177,6 +179,8 @@ def search_videos_pixabay( item.provider = "pixabay" item.url = best_video.get("url") item.duration = duration + item.path = "" + item.start_time = 0.0 video_items.append(item) return video_items @@ -319,73 +323,86 @@ def download_videos( search_terms: List[str], source: str = "pexels", video_aspect: VideoAspect = VideoAspect.portrait, - video_contact_mode: VideoConcatMode = VideoConcatMode.random, + video_concat_mode: VideoConcatMode = VideoConcatMode.random, audio_duration: float = 0.0, max_clip_duration: int = 5, -) -> List[str]: - valid_video_items = [] - valid_video_urls = [] - found_duration = 0.0 - search_videos = search_videos_pexels - search_kwargs = {} - if source == "pixabay": - search_videos = search_videos_pixabay - video_category = "" - if video_subject: - video_category = llm.generate_video_category(video_subject) - if video_category: - search_kwargs['category'] = video_category +) -> List[MaterialInfo]: + """ + Download videos from Pexels or Pixabay based on search terms. + """ + all_video_items: List[MaterialInfo] = [] + for term in search_terms: + if source == "pexels": + video_items = search_videos_pexels( + search_term=term, + minimum_duration=max_clip_duration, + video_aspect=video_aspect, + ) + elif source == "pixabay": + video_items = search_videos_pixabay( + search_term=term, + minimum_duration=max_clip_duration, + video_aspect=video_aspect, + ) + else: + video_items = [] + + logger.info(f"found {len(video_items)} videos for '{term}'") + all_video_items.extend(video_items) - for search_term in search_terms: - video_items = search_videos( - search_term=search_term, - minimum_duration=max_clip_duration, - video_aspect=video_aspect, - **search_kwargs, - ) - logger.info(f"found {len(video_items)} videos for '{search_term}'") + # Remove duplicates and calculate total duration + unique_video_items = [] + seen_urls = set() + for item in all_video_items: + if item.url not in seen_urls: + unique_video_items.append(item) + seen_urls.add(item.url) - for item in video_items: - if item.url not in valid_video_urls: - valid_video_items.append(item) - valid_video_urls.append(item.url) - found_duration += item.duration + if video_concat_mode == VideoConcatMode.random: + random.shuffle(unique_video_items) - logger.info( - f"found total videos: {len(valid_video_items)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds" - ) - video_paths = [] + found_duration = sum(item.duration for item in unique_video_items) + logger.info(f"found total unique videos: {len(unique_video_items)}, required duration: {audio_duration:.4f} seconds, found duration: {found_duration:.2f} seconds") + logger.info(f"Video download list (first 5): {[item.url for item in unique_video_items[:5]]}") - material_directory = config.app.get("material_directory", "").strip() - if material_directory == "task": - material_directory = utils.task_dir(task_id) - elif 
material_directory and not os.path.isdir(material_directory): - material_directory = "" + if not unique_video_items: + logger.warning("No videos found for the given search terms.") + return [] - if video_contact_mode.value == VideoConcatMode.random.value: - random.shuffle(valid_video_items) + if found_duration < audio_duration: + logger.warning(f"total duration of found videos ({found_duration:.2f}s) is less than audio duration ({audio_duration:.2f}s).") - total_duration = 0.0 - for item in valid_video_items: + downloaded_materials: List[MaterialInfo] = [] + downloaded_duration = 0.0 + + for item in unique_video_items: + if downloaded_duration >= audio_duration: + logger.info(f"total duration of downloaded videos: {downloaded_duration:.2f} seconds, skip downloading more") + break + try: logger.info(f"downloading video: {item.url}") - saved_video_path = save_video( - video_url=item.url, save_dir=material_directory - ) - if saved_video_path: - logger.info(f"video saved: {saved_video_path}") - video_paths.append(saved_video_path) - seconds = min(max_clip_duration, item.duration) - total_duration += seconds - if total_duration > audio_duration: - logger.info( - f"total duration of downloaded videos: {total_duration} seconds, skip downloading more" - ) - break + file_path = save_video(video_url=item.url) + if file_path: + logger.info(f"video saved: {file_path}") + material_info = MaterialInfo() + material_info.path = file_path + material_info.start_time = 0.0 + ffprobe_info = _get_video_info_ffprobe(file_path) + if ffprobe_info and ffprobe_info.get("duration"): + material_info.duration = float(ffprobe_info.get("duration")) + downloaded_duration += material_info.duration + else: + material_info.duration = item.duration # fallback + downloaded_duration += item.duration + + downloaded_materials.append(material_info) + except Exception as e: - logger.error(f"failed to download video: {utils.to_json(item)} => {str(e)}") - logger.success(f"downloaded {len(video_paths)} videos") - return video_paths + logger.error(f"failed to download video: {item.url} => {e}") + + logger.success(f"downloaded {len(downloaded_materials)} videos") + return downloaded_materials # 以下为调试入口,仅供开发测试 diff --git a/app/services/subtitle.py b/app/services/subtitle.py index ca0f247..06e60f1 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -278,6 +278,77 @@ def correct(subtitle_file, video_script): logger.success("Subtitle is correct") +def combine_srt_files(srt_files: list, output_file: str): + """ + Combines multiple SRT files into a single file, adjusting timestamps sequentially. 
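+    Timestamps are shifted so that each file's entries continue from where the
+    previous file ended, entries are renumbered from 1, and missing input files
+    are skipped with a warning.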
+ """ + logger.info(f"Combining {len(srt_files)} SRT files into {output_file}") + combined_subtitles = [] + last_end_time_seconds = 0.0 + entry_index = 1 + + for srt_file in srt_files: + if not os.path.exists(srt_file): + logger.warning(f"SRT file not found, skipping: {srt_file}") + continue + try: + with open(srt_file, 'r', encoding='utf-8') as f: + content = f.read() + + entries = re.split(r'\n\s*\n', content.strip()) + for entry in entries: + if not entry.strip(): + continue + + lines = entry.split('\n') + if len(lines) < 3: + continue + + # Parse timestamp + timestamp_line = lines[1] + start_time_str, end_time_str = timestamp_line.split(' --> ') + + def srt_time_to_seconds(t_str): + h, m, s_ms = t_str.split(':') + s, ms = s_ms.split(',') + return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0 + + start_time = srt_time_to_seconds(start_time_str) + end_time = srt_time_to_seconds(end_time_str) + duration = end_time - start_time + + # Adjust time + new_start_time = last_end_time_seconds + new_end_time = new_start_time + duration + + def seconds_to_srt_time(seconds): + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + s = int(seconds % 60) + ms = int((seconds * 1000) % 1000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + new_start_str = seconds_to_srt_time(new_start_time) + new_end_str = seconds_to_srt_time(new_end_time) + + # Append to combined list + text = '\n'.join(lines[2:]) + combined_subtitles.append(f"{entry_index}\n{new_start_str} --> {new_end_str}\n{text}") + entry_index += 1 + + # Update last end time for the next file + last_end_time_seconds = new_end_time + + except Exception as e: + logger.error(f"Error processing SRT file {srt_file}: {e}") + + # Write combined SRT to output file + with open(output_file, 'w', encoding='utf-8') as f: + f.write('\n\n'.join(combined_subtitles) + '\n\n') + + logger.success(f"Successfully combined SRT files into {output_file}") + + if __name__ == "__main__": task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072" task_dir = utils.task_dir(task_id) diff --git a/app/services/task.py b/app/services/task.py index fe82689..f0928cd 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -7,10 +7,209 @@ from loguru import logger from app.config import config from app.models import const -from app.models.schema import VideoConcatMode, VideoParams -from app.services import llm, material, subtitle, video, voice +from app.models.schema import ( + VideoConcatMode, + VideoParams, + VideoAspect, + MaterialInfo, +) +from app.services import llm, material, subtitle, voice, video +from app.services import video as video_utils from app.services import state as sm from app.utils import utils +import time + +# ... 您已有的 start 函数 ... + +# =================================================================== +# 新增的、实现音画同步的主任务函数 +# =================================================================== +def start_storyboard_task(task_id, params: VideoParams): + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING) + workdir = utils.task_dir(task_id) + + # 1. Generate Storyboard + logger.info("--- Step 1: Generating Storyboard ---") + video_script = params.video_script + if not video_script: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Video script is empty.") + return + + storyboard = llm.generate_storyboard(params.video_subject, video_script) + if not storyboard: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to generate storyboard.") + return + + # 2. 
Process each segment + logger.info(f"--- Step 2: Processing {len(storyboard)} video segments ---") + segment_video_paths = [] + segment_audio_paths = [] + segment_srt_paths = [] + total_duration = 0 + last_used_keywords = set() + + for i, segment in enumerate(storyboard): + try: + logger.info(f"--- Processing segment {i + 1} ---") + segment_script = segment.get("script") + if not segment_script: + logger.warning(f"Segment {i + 1} has no script, skipping") + continue + + search_terms_str = segment.get("search_terms", "") + search_terms = [term.strip() for term in search_terms_str.split(',') if term.strip()] + if not search_terms: + logger.warning(f"Segment {i + 1} has no search terms, skipping") + continue + + # Keyword Guard: Check for repetitive keywords + current_keywords = set(search_terms) + if i > 0 and current_keywords == last_used_keywords: + logger.warning(f"Segment {i + 1} uses the same keywords as the previous one ({search_terms_str}). Reusing last video clip to avoid visual repetition.") + if segment_video_paths: + segment_video_paths.append(segment_video_paths[-1]) # Reuse the last processed video clip + segment_audio_paths.append(segment_audio_paths[-1]) # Reuse the last audio clip + continue # Skip processing for this segment + + last_used_keywords = current_keywords + + # a. Generate audio and subtitles for the segment + segment_audio_file = path.join(workdir, f"segment_{i + 1}.mp3") + segment_srt_file = path.join(workdir, f"segment_{i + 1}.srt") + sub_maker = voice.tts( + text=segment_script, + voice_name=voice.parse_voice_name(params.voice_name), + voice_rate=params.voice_rate, + voice_file=segment_audio_file, + ) + if not sub_maker: + raise Exception(f"Failed to generate audio for segment {i + 1}") + + voice.create_subtitle( + sub_maker=sub_maker, text=segment_script, subtitle_file=segment_srt_file + ) + audio_duration = voice.get_audio_duration(sub_maker) + total_duration += audio_duration + + # b. Search and download video materials for each term + video_materials = [] + downloaded_duration = 0 + for term in search_terms: + if downloaded_duration >= audio_duration: + break + term_materials = material.download_videos( + task_id=task_id, + video_subject=params.video_subject, + search_terms=[term], # Pass one term at a time + source=params.video_source, + video_aspect=params.video_aspect, + video_concat_mode=params.video_concat_mode, + audio_duration=audio_duration - downloaded_duration, + max_clip_duration=params.max_clip_duration, + ) + if term_materials: + video_materials.extend(term_materials) + downloaded_duration = sum(m.duration for m in video_materials) + if not video_materials: + raise Exception(f"Failed to find materials for segment {i + 1}") + + # c. 
Create a video clip matching the audio duration + segment_video_path = path.join(workdir, f"segment_video_{i + 1}.mp4") + clip_created = video.create_video_clip_from_materials( + video_materials=video_materials, + audio_duration=audio_duration, + max_clip_duration=params.max_clip_duration, + video_aspect=params.video_aspect, + output_path=segment_video_path + ) + if not clip_created: + raise Exception(f"Failed to create video clip for segment {i + 1}") + + segment_video_paths.append(segment_video_path) + segment_audio_paths.append(segment_audio_file) + segment_srt_paths.append(segment_srt_file) + + except Exception as e: + logger.error(f"Error processing segment {i + 1}: {e}") + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message=f"Error in segment {i + 1}: {e}") + return + + # Check if any segments were processed + if not segment_video_paths: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to process any segments.") + logger.error("Failed to process any segments. Aborting video generation.") + return + + # 3. Combine all segments + logger.info("--- Step 3: Combining all video segments ---") + # a. Combine audios + combined_audio_path = path.join(workdir, "voice.mp3") + if not voice.combine_audio_files(segment_audio_paths, combined_audio_path): + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to combine audio files.") + return + + # b. Combine videos + video_transition_mode = params.video_transition_mode + concatenated_video_path = path.join(workdir, "concatenated_video.mp4") + if not video.concatenate_videos(segment_video_paths, concatenated_video_path, transition_mode=video_transition_mode): + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to concatenate videos.") + return + + # c. Combine subtitles + combined_srt_path = path.join(workdir, "subtitles.srt") + subtitle.combine_srt_files(segment_srt_paths, combined_srt_path) + + # 4. Final video assembly + logger.info("--- Step 4: Final video assembly ---") + # a. Add audio to concatenated video + video_with_audio_path = path.join(workdir, "video_with_audio.mp4") + if not video.add_audio_to_video(concatenated_video_path, combined_audio_path, video_with_audio_path): + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to add audio to video.") + return + + # b. Add background music + video_with_bgm_path = path.join(workdir, "video_with_bgm.mp4") + bgm_file = video.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) + if bgm_file: + if not video.add_bgm_to_video( + input_video_path=video_with_audio_path, + bgm_path=bgm_file, + bgm_volume=params.bgm_volume, + output_video_path=video_with_bgm_path + ): + logger.warning("Failed to mix BGM. Proceeding without it.") + video_with_bgm_path = video_with_audio_path # Fallback + else: + video_with_bgm_path = video_with_audio_path # No BGM requested + + # c. Add subtitles + final_video_path = path.join(workdir, f"final_{task_id}.mp4") + video.add_subtitles_to_video( + video_path=video_with_bgm_path, + srt_path=combined_srt_path, + font_name=params.font_name, + font_size=params.font_size, + text_fore_color=params.text_fore_color, + stroke_color=params.stroke_color, + stroke_width=params.stroke_width, + subtitle_position=params.subtitle_position, + custom_position=params.custom_position, + output_path=final_video_path + ) + + # 5. 
Cleanup + logger.info("--- Step 5: Cleaning up temporary files ---") + cleanup_files = segment_video_paths + segment_audio_paths + segment_srt_paths + [combined_audio_path, concatenated_video_path, combined_srt_path, video_with_audio_path, video_with_bgm_path] + for item in cleanup_files: + if item and item != final_video_path and os.path.exists(item): + os.remove(item) + + sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, video_path=final_video_path) + logger.success(f"Task {task_id} completed successfully. Final video: {final_video_path}") + + + + return {"videos": [final_video_path]} def generate_script(task_id, params): @@ -127,7 +326,7 @@ def get_video_materials(task_id, params, video_terms, audio_duration): if params.video_source == "local": logger.info("\n\n## preprocess local materials") materials = video.preprocess_video( - materials=params.video_materials, clip_duration=params.video_clip_duration + materials=params.video_materials, clip_duration=params.max_clip_duration ) if not materials: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) @@ -140,12 +339,13 @@ def get_video_materials(task_id, params, video_terms, audio_duration): logger.info(f"\n\n## downloading videos from {params.video_source}") downloaded_videos = material.download_videos( task_id=task_id, + video_subject=params.video_subject, search_terms=video_terms, source=params.video_source, video_aspect=params.video_aspect, video_contact_mode=params.video_concat_mode, audio_duration=audio_duration * params.video_count, - max_clip_duration=params.video_clip_duration, + max_clip_duration=params.max_clip_duration, ) if not downloaded_videos: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) @@ -173,14 +373,14 @@ def generate_final_videos( utils.task_dir(task_id), f"combined-{index}.mp4" ) logger.info(f"\n\n## combining video: {index} => {combined_video_path}") - video.combine_videos_ffmpeg( + video_utils.combine_videos_ffmpeg( combined_video_path=combined_video_path, video_paths=downloaded_videos, audio_file=audio_file, video_aspect=params.video_aspect, video_concat_mode=video_concat_mode, video_transition_mode=video_transition_mode, - max_clip_duration=params.video_clip_duration, + max_clip_duration=params.max_clip_duration, threads=params.n_threads, ) @@ -190,7 +390,7 @@ def generate_final_videos( final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") logger.info(f"\n\n## generating video: {index} => {final_video_path}") - video.generate_video( + video_utils.generate_video( video_path=combined_video_path, audio_path=audio_file, subtitle_path=subtitle_path, diff --git a/app/services/utils/video_effects.py b/app/services/utils/video_effects.py deleted file mode 100644 index 6cba8eb..0000000 --- a/app/services/utils/video_effects.py +++ /dev/null @@ -1,21 +0,0 @@ -from moviepy import Clip, vfx - - -# FadeIn -def fadein_transition(clip: Clip, t: float) -> Clip: - return clip.with_effects([vfx.FadeIn(t)]) - - -# FadeOut -def fadeout_transition(clip: Clip, t: float) -> Clip: - return clip.with_effects([vfx.FadeOut(t)]) - - -# SlideIn -def slidein_transition(clip: Clip, t: float, side: str) -> Clip: - return clip.with_effects([vfx.SlideIn(t, side)]) - - -# SlideOut -def slideout_transition(clip: Clip, t: float, side: str) -> Clip: - return clip.with_effects([vfx.SlideOut(t, side)]) diff --git a/app/services/video.py b/app/services/video.py index 171316a..5d18d0f 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -2,491 +2,459 @@ import glob 
import itertools import os import random -import gc import shutil +import subprocess +import json from typing import List from loguru import logger -from moviepy import ( - AudioFileClip, - ColorClip, - CompositeAudioClip, - CompositeVideoClip, - ImageClip, - TextClip, - VideoFileClip, - afx, - concatenate_videoclips, -) -from moviepy.video.tools.subtitles import SubtitlesClip -from PIL import ImageFont -from app.models import const from app.models.schema import ( - MaterialInfo, VideoAspect, - VideoConcatMode, VideoParams, + VideoConcatMode, VideoTransitionMode, ) -from app.services.utils import video_effects + from app.utils import utils -class SubClippedVideoClip: - def __init__(self, file_path, start_time=None, end_time=None, width=None, height=None, duration=None): - self.file_path = file_path - self.start_time = start_time - self.end_time = end_time - self.width = width - self.height = height - if duration is None: - self.duration = end_time - start_time - else: - self.duration = duration - - def __str__(self): - return f"SubClippedVideoClip(file_path={self.file_path}, start_time={self.start_time}, end_time={self.end_time}, duration={self.duration}, width={self.width}, height={self.height})" - - -audio_codec = "aac" -video_codec = "libx264" -fps = 30 - -def close_clip(clip): - if clip is None: - return - - try: - # close main resources - if hasattr(clip, 'reader') and clip.reader is not None: - clip.reader.close() - - # close audio resources - if hasattr(clip, 'audio') and clip.audio is not None: - if hasattr(clip.audio, 'reader') and clip.audio.reader is not None: - clip.audio.reader.close() - del clip.audio - - # close mask resources - if hasattr(clip, 'mask') and clip.mask is not None: - if hasattr(clip.mask, 'reader') and clip.mask.reader is not None: - clip.mask.reader.close() - del clip.mask - - # handle child clips in composite clips - if hasattr(clip, 'clips') and clip.clips: - for child_clip in clip.clips: - if child_clip is not clip: # avoid possible circular references - close_clip(child_clip) - - # clear clip list - if hasattr(clip, 'clips'): - clip.clips = [] - - except Exception as e: - logger.error(f"failed to close clip: {str(e)}") - - del clip - gc.collect() - -def delete_files(files: List[str] | str): - if isinstance(files, str): - files = [files] - - for file in files: - try: - os.remove(file) - except: - pass - -def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""): - if not bgm_type: - return "" - - if bgm_file and os.path.exists(bgm_file): - return bgm_file +def get_bgm_file(bgm_type: str, bgm_file: str): if bgm_type == "random": - suffix = "*.mp3" - song_dir = utils.song_dir() - files = glob.glob(os.path.join(song_dir, suffix)) - return random.choice(files) + bgm_dir = utils.resource_dir("bgm") + if not os.path.exists(bgm_dir): + logger.warning(f"BGM directory not found: {bgm_dir}, trying assets/bgm") + bgm_dir = utils.resource_dir("assets/bgm") + if not os.path.exists(bgm_dir): + logger.warning(f"BGM directory not found: {bgm_dir}, skip adding BGM.") + return "" + + bgm_files = glob.glob(os.path.join(bgm_dir, "*.mp3")) + if not bgm_files: + logger.warning(f"No BGM files found in {bgm_dir}, skip adding BGM.") + return "" + return random.choice(bgm_files) + + if bgm_type == "local": + return bgm_file return "" -# def combine_videos( -# combined_video_path: str, -# video_paths: List[str], -# audio_file: str, -# video_aspect: VideoAspect = VideoAspect.portrait, -# video_concat_mode: VideoConcatMode = VideoConcatMode.random, -# video_transition_mode: 
VideoTransitionMode = None, -# max_clip_duration: int = 5, -# threads: int = 2, -# ) -> str: -# audio_clip = AudioFileClip(audio_file) -# audio_duration = audio_clip.duration -# logger.info(f"audio duration: {audio_duration} seconds") -# # Required duration of each clip -# req_dur = audio_duration / len(video_paths) -# req_dur = max_clip_duration -# logger.info(f"maximum clip duration: {req_dur} seconds") -# output_dir = os.path.dirname(combined_video_path) +def _run_ffmpeg_command(command: list): + try: + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, creationflags=subprocess.CREATE_NO_WINDOW) + stdout, stderr = process.communicate() + if process.returncode != 0: + logger.error(f"FFmpeg command failed with return code {process.returncode}") + logger.error(f"FFmpeg stderr: {stderr}") + return False + logger.debug(f"FFmpeg command successful: {' '.join(command)}") + logger.debug(f"FFmpeg stderr: {stderr}") + return True + except FileNotFoundError: + logger.error("ffmpeg or ffprobe not found. Please ensure they are installed and in your PATH.") + return False + except Exception as e: + logger.error(f"An error occurred while running ffmpeg: {e}") + return False -# aspect = VideoAspect(video_aspect) -# video_width, video_height = aspect.to_resolution() -# processed_clips = [] -# subclipped_items = [] -# video_duration = 0 -# for video_path in video_paths: -# clip = VideoFileClip(video_path) -# clip_duration = clip.duration -# clip_w, clip_h = clip.size -# close_clip(clip) - -# start_time = 0 +def get_video_duration(video_path: str) -> float: + """Get the duration of a video using ffprobe.""" + command = [ + 'ffprobe', + '-v', 'error', + '-show_entries', 'format=duration', + '-of', 'default=noprint_wrappers=1:nokey=1', + video_path + ] + try: + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True) + return float(result.stdout) + except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e: + logger.error(f"Error getting duration for {video_path}: {e}") + return 0.0 -# while start_time < clip_duration: -# end_time = min(start_time + max_clip_duration, clip_duration) -# if clip_duration - start_time >= max_clip_duration: -# subclipped_items.append(SubClippedVideoClip(file_path= video_path, start_time=start_time, end_time=end_time, width=clip_w, height=clip_h)) -# start_time = end_time -# if video_concat_mode.value == VideoConcatMode.sequential.value: -# break -# # random subclipped_items order -# if video_concat_mode.value == VideoConcatMode.random.value: -# random.shuffle(subclipped_items) - -# logger.debug(f"total subclipped items: {len(subclipped_items)}") - -# # Add downloaded clips over and over until the duration of the audio (max_duration) has been reached -# for i, subclipped_item in enumerate(subclipped_items): -# if video_duration > audio_duration: -# break - -# logger.debug(f"processing clip {i+1}: {subclipped_item.width}x{subclipped_item.height}, current duration: {video_duration:.2f}s, remaining: {audio_duration - video_duration:.2f}s") - -# try: -# clip = VideoFileClip(subclipped_item.file_path).subclipped(subclipped_item.start_time, subclipped_item.end_time) -# clip_duration = clip.duration -# # Not all videos are same size, so we need to resize them -# clip_w, clip_h = clip.size -# if clip_w != video_width or clip_h != video_height: -# clip_ratio = clip.w / clip.h -# video_ratio = video_width / video_height -# logger.debug(f"resizing clip, source: {clip_w}x{clip_h}, 
ratio: {clip_ratio:.2f}, target: {video_width}x{video_height}, ratio: {video_ratio:.2f}") - -# if clip_ratio == video_ratio: -# clip = clip.resized(new_size=(video_width, video_height)) -# else: -# if clip_ratio > video_ratio: -# scale_factor = video_width / clip_w -# else: -# scale_factor = video_height / clip_h +def delete_files(files: List[str] | str): + if isinstance(files, str): + files = [files] + for file in files: + if os.path.exists(file): + try: + os.remove(file) + except Exception as e: + logger.warning(f"Failed to delete file {file}: {e}") -# new_width = int(clip_w * scale_factor) -# new_height = int(clip_h * scale_factor) -# background = ColorClip(size=(video_width, video_height), color=(0, 0, 0)).with_duration(clip_duration) -# clip_resized = clip.resized(new_size=(new_width, new_height)).with_position("center") -# clip = CompositeVideoClip([background, clip_resized]) - -# shuffle_side = random.choice(["left", "right", "top", "bottom"]) -# if video_transition_mode.value == VideoTransitionMode.none.value: -# clip = clip -# elif video_transition_mode.value == VideoTransitionMode.fade_in.value: -# clip = video_effects.fadein_transition(clip, 1) -# elif video_transition_mode.value == VideoTransitionMode.fade_out.value: -# clip = video_effects.fadeout_transition(clip, 1) -# elif video_transition_mode.value == VideoTransitionMode.slide_in.value: -# clip = video_effects.slidein_transition(clip, 1, shuffle_side) -# elif video_transition_mode.value == VideoTransitionMode.slide_out.value: -# clip = video_effects.slideout_transition(clip, 1, shuffle_side) -# elif video_transition_mode.value == VideoTransitionMode.shuffle.value: -# transition_funcs = [ -# lambda c: video_effects.fadein_transition(c, 1), -# lambda c: video_effects.fadeout_transition(c, 1), -# lambda c: video_effects.slidein_transition(c, 1, shuffle_side), -# lambda c: video_effects.slideout_transition(c, 1, shuffle_side), -# ] -# shuffle_transition = random.choice(transition_funcs) -# clip = shuffle_transition(clip) +def create_video_clip_from_materials(video_materials: list, audio_duration: float, max_clip_duration: int, video_aspect: VideoAspect, output_path: str): + logger.info(f"Optimized: Creating video clip for {output_path} with duration {audio_duration:.2f}s using ffmpeg") -# if clip.duration > max_clip_duration: -# clip = clip.subclipped(0, max_clip_duration) - -# # wirte clip to temp file -# clip_file = f"{output_dir}/temp-clip-{i+1}.mp4" -# clip.write_videofile(clip_file, logger=None, fps=fps, codec=video_codec) - -# close_clip(clip) - -# processed_clips.append(SubClippedVideoClip(file_path=clip_file, duration=clip.duration, width=clip_w, height=clip_h)) -# video_duration += clip.duration - -# except Exception as e: -# logger.error(f"failed to process clip: {str(e)}") - -# # loop processed clips until the video duration matches or exceeds the audio duration. 
-# if video_duration < audio_duration: -# logger.warning(f"video duration ({video_duration:.2f}s) is shorter than audio duration ({audio_duration:.2f}s), looping clips to match audio length.") -# base_clips = processed_clips.copy() -# for clip in itertools.cycle(base_clips): -# if video_duration >= audio_duration: -# break -# processed_clips.append(clip) -# video_duration += clip.duration -# logger.info(f"video duration: {video_duration:.2f}s, audio duration: {audio_duration:.2f}s, looped {len(processed_clips)-len(base_clips)} clips") - -# # merge video clips progressively, avoid loading all videos at once to avoid memory overflow -# logger.info("starting clip merging process") -# if not processed_clips: -# logger.warning("no clips available for merging") -# return combined_video_path - -# # if there is only one clip, use it directly -# if len(processed_clips) == 1: -# logger.info("using single clip directly") -# shutil.copy(processed_clips[0].file_path, combined_video_path) -# delete_files(processed_clips) -# logger.info("video combining completed") -# return combined_video_path - -# # create initial video file as base -# base_clip_path = processed_clips[0].file_path -# temp_merged_video = f"{output_dir}/temp-merged-video.mp4" -# temp_merged_next = f"{output_dir}/temp-merged-next.mp4" - -# # copy first clip as initial merged video -# shutil.copy(base_clip_path, temp_merged_video) - -# # merge remaining video clips one by one -# for i, clip in enumerate(processed_clips[1:], 1): -# logger.info(f"merging clip {i}/{len(processed_clips)-1}, duration: {clip.duration:.2f}s") - -# try: -# # load current base video and next clip to merge -# base_clip = VideoFileClip(temp_merged_video) -# next_clip = VideoFileClip(clip.file_path) - -# # merge these two clips -# merged_clip = concatenate_videoclips([base_clip, next_clip]) + if audio_duration <= 0: + logger.warning("Audio duration is zero or negative, cannot create video clip.") + return False -# # save merged result to temp file -# merged_clip.write_videofile( -# filename=temp_merged_next, -# threads=threads, -# logger=None, -# temp_audiofile_path=output_dir, -# audio_codec=audio_codec, -# fps=fps, -# ) -# close_clip(base_clip) -# close_clip(next_clip) -# close_clip(merged_clip) - -# # replace base file with new merged file -# delete_files(temp_merged_video) -# os.rename(temp_merged_next, temp_merged_video) - -# except Exception as e: -# logger.error(f"failed to merge clip: {str(e)}") -# continue - -# # after merging, rename final result to target file name -# os.rename(temp_merged_video, combined_video_path) - -# # clean temp files -# clip_files = [clip.file_path for clip in processed_clips] -# delete_files(clip_files) - -# logger.info("video combining completed") -# return combined_video_path + total_duration_of_materials = sum(m.duration for m in video_materials) + if total_duration_of_materials < audio_duration: + logger.warning(f"Total material duration ({total_duration_of_materials}s) is less than audio duration ({audio_duration}s). Video will be shorter.") + audio_duration = total_duration_of_materials -import subprocess + w, h = video_aspect.to_resolution() + # Use the most robust method: scale to fill, then crop to center. + # This avoids black bars by ensuring the video fills the frame, cropping excess. 
+ scale_filter = f"scale={w}:{h}:force_original_aspect_ratio=increase" + crop_filter = f"crop={w}:{h}" + fade_in_filter = "fade=in:st=0:d=0.5" -def combine_videos_ffmpeg( - combined_video_path: str, - video_paths: List[str], - audio_file: str, - video_aspect: VideoAspect = VideoAspect.portrait, - video_concat_mode: VideoConcatMode = VideoConcatMode.random, - video_transition_mode: VideoTransitionMode = None, # 注意:FFmpeg转场实现方式不同 - max_clip_duration: int = 5, - threads: int = 2, -) -> str: - """ - 使用 FFmpeg 和 GPU 加速来合并视频,以获得极致的性能和画质。 - """ - audio_clip = AudioFileClip(audio_file) - audio_duration = audio_clip.duration - close_clip(audio_clip) - logger.info(f"音频时长: {audio_duration:.2f} 秒") + filter_complex_parts = [] + concat_inputs = "" + time_so_far = 0.0 - output_dir = os.path.dirname(combined_video_path) - aspect = VideoAspect(video_aspect) - video_width, video_height = aspect.to_resolution() + # If only one material, just trim and process it + if len(video_materials) == 1: + material = video_materials[0] + duration_needed = audio_duration + start_time = material.start_time if material.start_time >= 0 else 0 + trim_filter = f"[0:v]trim=start={start_time}:duration={duration_needed},setpts=PTS-STARTPTS" + sar_filter = "setsar=1" - # --- 步骤 1: 将所有源视频切成小片段信息 --- - subclipped_items = [] - for video_path in video_paths: - # 这里我们仍然用 moviepy 获取视频信息,因为它很方便 - try: - with VideoFileClip(video_path) as clip: - clip_duration = clip.duration - clip_w, clip_h = clip.size - - start_time = 0 - while start_time < clip_duration: - end_time = min(start_time + max_clip_duration, clip_duration) - if end_time - start_time >= 1.0: # 确保片段至少1秒 - subclipped_items.append(SubClippedVideoClip( - file_path=video_path, - start_time=start_time, - end_time=end_time - )) - start_time += max_clip_duration - if video_concat_mode.value == VideoConcatMode.sequential.value: - break - except Exception as e: - logger.error(f"无法读取视频信息 {video_path}: {e}") - continue + command = [ + "ffmpeg", + "-y", + "-i", material.path, + "-vf", f"{trim_filter},{sar_filter},{scale_filter},{crop_filter},{fade_in_filter}", + "-an", # remove audio + "-c:v", "libx264", + "-preset", "ultrafast", + "-crf", "23", + "-maxrate", "10M", + "-bufsize", "20M", + "-r", "30", + output_path + ] + return _run_ffmpeg_command(command) - if video_concat_mode.value == VideoConcatMode.random.value: - random.shuffle(subclipped_items) - - # --- 步骤 2: 使用 FFmpeg 处理每个小片段并保存为临时文件 --- - processed_files = [] - total_video_duration = 0 - - for i, item in enumerate(subclipped_items): - if total_video_duration >= audio_duration: + # If multiple materials, create clips and concatenate + for i, material in enumerate(video_materials): + if time_so_far >= audio_duration: break - temp_clip_path = os.path.join(output_dir, f"temp-clip-{i}.mp4") - clip_duration = item.end_time - item.start_time - - # 构建FFmpeg命令 - # 滤镜链: 缩放以适应目标尺寸(保持宽高比), 然后用黑边填充到目标分辨率 - vf_filter = f"scale={video_width}:{video_height}:force_original_aspect_ratio=decrease,pad={video_width}:{video_height}:-1:-1:color=black" - - # 添加转场效果 (这里只演示淡入,其他转场需要更复杂的滤镜) - if video_transition_mode and video_transition_mode.value != VideoTransitionMode.none.value: - # FFmpeg的淡入效果: fade=type=in:start_time=0:duration=1 - fade_duration = min(1.0, clip_duration) # 淡入时长不超过片段时长 - vf_filter += f",fade=t=in:st=0:d={fade_duration}" + duration_from_this_clip = min(material.duration, audio_duration - time_so_far, max_clip_duration) + if duration_from_this_clip <= 0: + continue - command = command = [ - "ffmpeg", "-y", - "-hwaccel", "auto", - 
"-ss", str(item.start_time), - "-to", str(item.end_time), - "-i", item.file_path, - "-vf", vf_filter, - "-c:v", "h264_nvenc", - "-preset", "p5", - "-b:v", "50M", - "-r", str(fps), # <--- 强制输出帧率为30 - "-video_track_timescale", "30000", # <--- 强制设置一个标准的时间基 - "-an", # <--- 强制移除所有音频轨道,避免音频参数不一致 - "-threads", str(threads), - temp_clip_path -] - - logger.debug(f"正在处理片段 {i}: {' '.join(command)}") - try: - subprocess.run(command, check=True, capture_output=True) - processed_files.append(temp_clip_path) - total_video_duration += clip_duration - except subprocess.CalledProcessError as e: - logger.error(f"处理片段失败 {item.file_path}: {e.stderr.decode('utf-8')}") + start_time = material.start_time if material.start_time >= 0 else 0 + trim_filter = f"[{i}:v]trim=start={start_time}:duration={duration_from_this_clip},setpts=PTS-STARTPTS" + sar_filter = "setsar=1" + filter_complex_parts.append(f"{trim_filter},{sar_filter},{scale_filter},{crop_filter}[v{i}]" ) + concat_inputs += f"[v{i}]" + time_so_far += duration_from_this_clip - # --- 步骤 3: 使用 FFmpeg concat demuxer 极速合并所有临时片段 --- - concat_list_path = os.path.join(output_dir, "concat_list.txt") - with open(concat_list_path, "w", encoding="utf-8") as f: - for file_path in processed_files: - # FFmpeg concat需要特定的格式 - f.write(f"file '{file_path.replace(os.sep, '/')}'\n") + if not filter_complex_parts: + logger.error("No video clips could be prepared for concatenation.") + return False - # 构建合并命令 - merge_command = [ + concat_filter = f"{concat_inputs}concat=n={len(concat_inputs)//3}:v=1:a=0[outv]" + filter_complex_parts.append(concat_filter) + + command = [ "ffmpeg", "-y", - "-f", "concat", - "-safe", "0", - "-i", concat_list_path, - "-c", "copy", # 关键:直接复制流,不重新编码,速度极快 - combined_video_path ] - - logger.info("开始极速合并所有片段...") + for material in video_materials[:len(concat_inputs)//3]: + command.extend(["-i", material.path]) + + command.extend([ + "-filter_complex", ';'.join(filter_complex_parts), + "-map", "[outv]", + "-c:v", "libx264", + "-an", + "-r", "30", + output_path + ]) + + return _run_ffmpeg_command(command) + + +def concatenate_videos(video_paths: List[str], output_path: str, transition_mode: VideoTransitionMode = VideoTransitionMode.none): + logger.info(f"Concatenating {len(video_paths)} videos into {output_path} with transition: {transition_mode.name}") + + if not video_paths: + logger.error("No video paths provided for concatenation.") + return False + + if len(video_paths) == 1: + logger.info("Only one video, copying to output path.") + shutil.copy(video_paths[0], output_path) + return True + + use_transition = transition_mode != VideoTransitionMode.none + + # Nested function for fallback to simple concatenation + def fallback_concat(): + logger.info("Using simple concat demuxer (no transitions).") + temp_file_path = os.path.join(os.path.dirname(output_path), "temp_video_list.txt") + try: + with open(temp_file_path, "w", encoding="utf-8") as f: + for video_path in video_paths: + # Normalize path for ffmpeg concat demuxer, which is sensitive to backslashes + normalized_path = video_path.replace('\\', '/') + f.write(f"file '{normalized_path}'\n") + + command = [ + "ffmpeg", "-y", + "-f", "concat", + "-safe", "0", + "-i", temp_file_path, + "-c", "copy", + output_path + ] + + if _run_ffmpeg_command(command): + logger.success(f"Successfully concatenated videos using concat demuxer: {output_path}") + return True + else: + logger.error("Failed to concatenate videos using concat demuxer.") + return False + finally: + delete_files(temp_file_path) + + if not 
use_transition:
+        return fallback_concat()
+
+    # Proceed with transitions using xfade
+    logger.info("Using xfade for transitions.")
+    transition_duration = 0.5  # seconds
+    video_durations = [get_video_duration(p) for p in video_paths]
+
+    if any(d == 0.0 for d in video_durations):
+        logger.warning("Could not determine duration for all video clips, falling back to simple concatenation.")
+        return fallback_concat()
+
+    command = ["ffmpeg", "-y"]
+    for path in video_paths:
+        command.extend(["-i", path])
+
+    filter_chains = []
+    # Initial stream is [0:v]
+    last_stream_name = "[0:v]"
+    total_duration = 0
+
+    for i in range(1, len(video_paths)):
+        total_duration += video_durations[i-1]
+        # Each xfade overlaps the streams by transition_duration, so the running output
+        # is shorter than the raw sum of durations; subtract the accumulated overlap.
+        offset = total_duration - i * transition_duration
+
+        input_stream_name = f"[{i}:v]"
+        output_stream_name = f"[v{i}]"
+
+        filter_chains.append(f"{last_stream_name}{input_stream_name}xfade=transition=fade:duration={transition_duration}:offset={offset}{output_stream_name}")
+        last_stream_name = output_stream_name
+
+    filter_complex = ";".join(filter_chains)
+
+    command.extend([
+        "-filter_complex", filter_complex,
+        "-map", last_stream_name,
+        "-c:v", "libx264",
+        "-movflags", "+faststart",
+        output_path
+    ])
+
+    if _run_ffmpeg_command(command):
+        logger.success(f"Successfully concatenated videos with transitions: {output_path}")
+        return True
+    else:
+        logger.warning("FFmpeg command with transition failed, falling back to simple concatenation.")
+        return fallback_concat()
+
+
+def add_audio_to_video(video_path: str, audio_path: str, output_path: str):
+    video_path = os.path.normpath(video_path)
+    audio_path = os.path.normpath(audio_path)
+    output_path = os.path.normpath(output_path)
+
+    # Check if the video already has an audio stream
+    has_audio_stream = False
+    try:
+        probe_command = [
+            "ffprobe", "-v", "error", "-select_streams", "a",
+            "-show_entries", "stream=codec_type", "-of", "csv=p=0", video_path
+        ]
+        process = subprocess.run(probe_command, check=True, capture_output=True, text=True)
+        if process.stdout.strip():
+            has_audio_stream = True
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        logger.warning(f"Could not probe video for audio stream: {e}")
+
+    if has_audio_stream:
+        command = [
+            "ffmpeg",
+            "-y",
+            "-i", video_path,
+            "-i", audio_path,
+            "-c:v", "copy",
+            "-c:a", "aac",
+            "-map", "0:v:0",
+            "-map", "1:a:0",
+            "-shortest",
+            output_path,
+        ]
+    else:
+        command = [
+            "ffmpeg",
+            "-y",
+            "-i", video_path,
+            "-i", audio_path,
+            "-c:v", "copy",
+            "-c:a", "aac",
+            "-map", "0:v:0",
+            "-map", "1:a:0",
+            output_path,
+        ]
+    return _run_ffmpeg_command(command)
+
+
+def add_bgm_to_video(video_path: str, bgm_path: str, bgm_volume: float, output_path: str) -> bool:
+    video_path = os.path.normpath(video_path)
+    bgm_path = os.path.normpath(bgm_path)
+    output_path = os.path.normpath(output_path)
+    """
+    Mixes background music into a video's audio track using ffmpeg and outputs a new video file.
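+    The BGM input is looped for the full length of the video, scaled to bgm_volume,
+    and mixed with the original audio track via the amix filter; the video stream is
+    copied without re-encoding. Returns True on success, False otherwise.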
+ """ + logger.info(f"Mixing BGM '{bgm_path}' into video '{video_path}'") - def get_text_size(inner_text): - inner_text = inner_text.strip() - left, top, right, bottom = font.getbbox(inner_text) - return right - left, bottom - top + video_duration = get_video_duration(video_path) + if video_duration == 0.0: + logger.error(f"Could not get duration of video {video_path}") + return False - width, height = get_text_size(text) - if width <= max_width: - return text, height + command = [ + "ffmpeg", + "-y", + "-i", video_path, + "-stream_loop", "-1", + "-i", bgm_path, + "-filter_complex", f"[0:a]volume=1.0[a0];[1:a]volume={bgm_volume}[a1];[a0][a1]amix=inputs=2:duration=first[a]", - processed = True + "-map", "0:v", + "-map", "[a]", + "-c:v", "copy", + "-c:a", "aac", + "-t", str(video_duration), + "-shortest", + output_path, + ] - _wrapped_lines_ = [] - words = text.split(" ") - _txt_ = "" - for word in words: - _before = _txt_ - _txt_ += f"{word} " - _width, _height = get_text_size(_txt_) - if _width <= max_width: - continue + return _run_ffmpeg_command(command) + + +def add_subtitles_to_video(video_path: str, srt_path: str, font_name: str, font_size: int, text_fore_color: str, stroke_color: str, stroke_width: float, subtitle_position: str, custom_position: float, output_path: str): + video_path = os.path.normpath(video_path) + srt_path = os.path.normpath(srt_path) + output_path = os.path.normpath(output_path) + font_path = utils.get_font_path(font_name) + if not os.path.exists(font_path): + logger.error(f"Font '{font_name}' not found, using default.") + font_path = utils.get_font_path("MicrosoftYaHeiBold.ttc") + + # This is the robust way to escape paths for ffmpeg filters on Windows + def escape_ffmpeg_path(path): + # Replace backslashes with forward slashes + escaped_path = path.replace('\\', '/') + # Escape colons + escaped_path = escaped_path.replace(':', '\\:') + return escaped_path + + style_options = [ + f"FontName='{os.path.basename(font_path)}'", + f"FontSize={font_size}", + f"PrimaryColour=&H{utils.rgb_to_bgr_hex(text_fore_color)}", + f"BorderStyle=1", + f"OutlineColour=&H{utils.rgb_to_bgr_hex(stroke_color)}", + f"Outline={stroke_width}", + f"Shadow=0", + f"MarginV=20" + ] + + if subtitle_position == 'bottom': + style_options.append("Alignment=2") # Bottom center + elif subtitle_position == 'top': + style_options.append("Alignment=8") # Top center + elif subtitle_position == 'center': + style_options.append("Alignment=5") # Middle center + else: # custom + style_options.append(f"Alignment=2,MarginV={int(custom_position)}") + + style_string = ','.join(style_options) + + # Correctly escape paths for ffmpeg's filtergraph + font_dir_escaped = escape_ffmpeg_path(os.path.dirname(font_path)) + srt_path_escaped = escape_ffmpeg_path(srt_path) + + subtitles_filter = f"subtitles='{srt_path_escaped}':force_style='{style_string}':fontsdir='{font_dir_escaped}'" + + command = [ + "ffmpeg", "-y", + "-i", video_path, + "-vf", subtitles_filter, + "-c:v", "libx264", + "-c:a", "copy", + "-preset", "ultrafast", + output_path + ] + + return _run_ffmpeg_command(command) + + +def process_scene_video(material_url: str, output_dir: str, target_duration: float, aspect_ratio: str = "16:9") -> str: + """ + 下载单个视频素材,并将其处理(剪辑/循环)到目标时长,同时调整分辨率。 + 这是实现音画同步的关键步骤之一。 + """ + try: + # 创建一个唯一的文件名 + video_filename = os.path.join(output_dir, f"scene_{os.path.basename(material_url)}") + + # 下载视频 + response = requests.get(material_url, stream=True) + response.raise_for_status() + with open(video_filename, 'wb') as f: + for 
chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + logger.info(f"Downloaded scene video to {video_filename}") + + + clip = VideoFileClip(video_filename) + + # 如果原始视频时长短于目标时长,就循环视频 + if clip.duration < target_duration: + clip = clip.loop(duration=target_duration) + # 如果原始视频时长长于目标时长,就剪辑视频 else: - if _txt_.strip() == word.strip(): - processed = False - break - _wrapped_lines_.append(_before) - _txt_ = f"{word} " - _wrapped_lines_.append(_txt_) - if processed: - _wrapped_lines_ = [line.strip() for line in _wrapped_lines_] - result = "\n".join(_wrapped_lines_).strip() - height = len(_wrapped_lines_) * height - return result, height + clip = clip.subclip(0, target_duration) + + # 调整分辨率和宽高比 + if aspect_ratio == "16:9": + target_resolution = (1920, 1080) + else: # 9:16 + target_resolution = (1080, 1920) + + # 使用crop和resize确保画面内容不被拉伸 + clip_resized = clip.resize(height=target_resolution[1]) if clip.size[0]/clip.size[1] < target_resolution[0]/target_resolution[1] else clip.resize(width=target_resolution[0]) + clip_cropped = clip_resized.crop(x_center=clip_resized.size[0]/2, y_center=clip_resized.size[1]/2, width=target_resolution[0], height=target_resolution[1]) - _wrapped_lines_ = [] - chars = list(text) - _txt_ = "" - for word in chars: - _txt_ += word - _width, _height = get_text_size(_txt_) - if _width <= max_width: - continue - else: - _wrapped_lines_.append(_txt_) - _txt_ = "" - _wrapped_lines_.append(_txt_) - result = "\n".join(_wrapped_lines_).strip() - height = len(_wrapped_lines_) * height - return result, height + processed_filename = os.path.join(output_dir, f"processed_{os.path.basename(video_filename)}") + clip_cropped.write_videofile(processed_filename, codec="libx264", audio_codec="aac", fps=30, ffmpeg_params=['-pix_fmt', 'yuv420p']) + + clip.close() + clip_cropped.close() + os.remove(video_filename) # 删除原始下载文件 + logger.info(f"Processed scene video to {processed_filename}, duration: {target_duration}s") + return processed_filename + + except Exception as e: + logger.error(f"Error processing scene video from {material_url}: {e}") + return None def generate_video( video_path: str, @@ -494,166 +462,58 @@ def generate_video( subtitle_path: str, output_file: str, params: VideoParams, -): - aspect = VideoAspect(params.video_aspect) - video_width, video_height = aspect.to_resolution() +) -> str: + """ + Generates the final video by adding background music and subtitles using FFmpeg. - logger.info(f"generating video: {video_width} x {video_height}") - logger.info(f" ① video: {video_path}") - logger.info(f" ② audio: {audio_path}") - logger.info(f" ③ subtitle: {subtitle_path}") - logger.info(f" ④ output: {output_file}") + Args: + video_path (str): Path to the source video file. + audio_path (str): Path to the background music file. + subtitle_path (str): Path to the subtitle file. + output_file (str): Path to save the final output video. + params (VideoParams): Video parameters including bgm_volume. - # https://github.com/harry0703/MoneyPrinterTurbo/issues/217 - # PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'final-1.mp4.tempTEMP_MPY_wvf_snd.mp3' - # write into the same directory as the output file - output_dir = os.path.dirname(output_file) + Returns: + str: The path to the final video if successful, otherwise an empty string. 
+ """ + logger.info(f"Generating final video for {output_file}") + temp_dir = os.path.join(os.path.dirname(output_file), "temp_gen") + os.makedirs(temp_dir, exist_ok=True) - font_path = "" - if params.subtitle_enabled: - if not params.font_name: - params.font_name = "STHeitiMedium.ttc" - font_path = os.path.join(utils.font_dir(), params.font_name) - if os.name == "nt": - font_path = font_path.replace("\\", "/") + final_video_path = "" - logger.info(f" ⑤ font: {font_path}") - - def create_text_clip(subtitle_item): - params.font_size = int(params.font_size) - params.stroke_width = int(params.stroke_width) - phrase = subtitle_item[1] - max_width = video_width * 0.9 - wrapped_txt, txt_height = wrap_text( - phrase, max_width=max_width, font=font_path, fontsize=params.font_size + try: + # Step 1: Add background music + logger.info("Step 1: Adding background music.") + video_with_bgm_path = os.path.join(temp_dir, f"bgm_{os.path.basename(video_path)}") + bgm_added_path = add_bgm_to_video_ffmpeg( + video_path=video_path, + bgm_path=audio_path, + output_path=video_with_bgm_path, + bgm_volume=params.bgm_volume ) - interline = int(params.font_size * 0.25) - size=(int(max_width), int(txt_height + params.font_size * 0.25 + (interline * (wrapped_txt.count("\n") + 1)))) + if not bgm_added_path: + logger.error("Failed to add background music. Aborting video generation.") + return "" - _clip = TextClip( - text=wrapped_txt, - font=font_path, - font_size=params.font_size, - color=params.text_fore_color, - bg_color=params.text_background_color, - stroke_color=params.stroke_color, - stroke_width=params.stroke_width, - # interline=interline, - # size=size, - ) - duration = subtitle_item[0][1] - subtitle_item[0][0] - _clip = _clip.with_start(subtitle_item[0][0]) - _clip = _clip.with_end(subtitle_item[0][1]) - _clip = _clip.with_duration(duration) - if params.subtitle_position == "bottom": - _clip = _clip.with_position(("center", video_height * 0.95 - _clip.h)) - elif params.subtitle_position == "top": - _clip = _clip.with_position(("center", video_height * 0.05)) - elif params.subtitle_position == "custom": - # Ensure the subtitle is fully within the screen bounds - margin = 10 # Additional margin, in pixels - max_y = video_height - _clip.h - margin - min_y = margin - custom_y = (video_height - _clip.h) * (params.custom_position / 100) - custom_y = max( - min_y, min(custom_y, max_y) - ) # Constrain the y value within the valid range - _clip = _clip.with_position(("center", custom_y)) - else: # center - _clip = _clip.with_position(("center", "center")) - return _clip - - video_clip = VideoFileClip(video_path).without_audio() - audio_clip = AudioFileClip(audio_path).with_effects( - [afx.MultiplyVolume(params.voice_volume)] - ) - - def make_textclip(text): - return TextClip( - text=text, - font=font_path, - font_size=params.font_size, + # Step 2: Add subtitles + logger.info("Step 2: Adding subtitles.") + subtitled_video_path = add_subtitles_to_video_ffmpeg( + video_path=bgm_added_path, + subtitles_path=subtitle_path, + output_path=output_file ) - if subtitle_path and os.path.exists(subtitle_path): - sub = SubtitlesClip( - subtitles=subtitle_path, encoding="utf-8", make_textclip=make_textclip - ) - text_clips = [] - for item in sub.subtitles: - clip = create_text_clip(subtitle_item=item) - text_clips.append(clip) - video_clip = CompositeVideoClip([video_clip, *text_clips]) + if subtitled_video_path: + logger.success(f"Successfully generated final video: {output_file}") + final_video_path = output_file + else: + 
logger.error("Failed to add subtitles. Final video not created.") - bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) - if bgm_file: - try: - bgm_clip = AudioFileClip(bgm_file).with_effects( - [ - afx.MultiplyVolume(params.bgm_volume), - afx.AudioFadeOut(3), - afx.AudioLoop(duration=video_clip.duration), - ] - ) - audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) - except Exception as e: - logger.error(f"failed to add bgm: {str(e)}") - - video_clip = video_clip.with_audio(audio_clip) - video_clip.write_videofile( - output_file, - audio_codec=audio_codec, - temp_audiofile_path=output_dir, - threads=params.n_threads or 2, - logger=None, - fps=fps, - ) - video_clip.close() - del video_clip - - -def preprocess_video(materials: List[MaterialInfo], clip_duration=4): - for material in materials: - if not material.url: - continue - - ext = utils.parse_extension(material.url) - try: - clip = VideoFileClip(material.url) - except Exception: - clip = ImageClip(material.url) - - width = clip.size[0] - height = clip.size[1] - if width < 480 or height < 480: - logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required") - continue - - if ext in const.FILE_TYPE_IMAGES: - logger.info(f"processing image: {material.url}") - # Create an image clip and set its duration to 3 seconds - clip = ( - ImageClip(material.url) - .with_duration(clip_duration) - .with_position("center") - ) - # Apply a zoom effect using the resize method. - # A lambda function is used to make the zoom effect dynamic over time. - # The zoom effect starts from the original size and gradually scales up to 120%. - # t represents the current time, and clip.duration is the total duration of the clip (3 seconds). - # Note: 1 represents 100% size, so 1.2 represents 120% size. - zoom_clip = clip.resized( - lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration) - ) - - # Optionally, create a composite video clip containing the zoomed clip. - # This is useful when you want to add other elements to the video. - final_clip = CompositeVideoClip([zoom_clip]) - - # Output the video to a file. - video_file = f"{material.url}.mp4" - final_clip.write_videofile(video_file, fps=30, logger=None) - close_clip(clip) - material.url = video_file - logger.success(f"image processed: {video_file}") - return materials \ No newline at end of file + finally: + # Clean up temporary directory + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + return final_video_path + diff --git a/app/services/voice.py b/app/services/voice.py index e6b4d59..85f2fa0 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -2,7 +2,7 @@ import asyncio import os import re from datetime import datetime -from typing import Union +from typing import Union, List from xml.sax.saxutils import unescape import edge_tts @@ -10,1072 +10,13 @@ import requests from edge_tts import SubMaker, submaker from edge_tts.submaker import mktimestamp from loguru import logger -from moviepy.video.tools import subtitles +import subprocess from app.config import config from app.utils import utils -def get_siliconflow_voices() -> list[str]: - """ - 获取硅基流动的声音列表 - - Returns: - 声音列表,格式为 ["siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex", ...] 
- """ - # 硅基流动的声音列表和对应的性别(用于显示) - voices_with_gender = [ - ("FunAudioLLM/CosyVoice2-0.5B", "alex", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "anna", "Female"), - ("FunAudioLLM/CosyVoice2-0.5B", "bella", "Female"), - ("FunAudioLLM/CosyVoice2-0.5B", "benjamin", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "charles", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "claire", "Female"), - ("FunAudioLLM/CosyVoice2-0.5B", "david", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "diana", "Female"), - ] - - # 添加siliconflow:前缀,并格式化为显示名称 - return [ - f"siliconflow:{model}:{voice}-{gender}" - for model, voice, gender in voices_with_gender - ] - - -def get_all_azure_voices(filter_locals=None) -> list[str]: - azure_voices_str = """ -Name: af-ZA-AdriNeural -Gender: Female - -Name: af-ZA-WillemNeural -Gender: Male - -Name: am-ET-AmehaNeural -Gender: Male - -Name: am-ET-MekdesNeural -Gender: Female - -Name: ar-AE-FatimaNeural -Gender: Female - -Name: ar-AE-HamdanNeural -Gender: Male - -Name: ar-BH-AliNeural -Gender: Male - -Name: ar-BH-LailaNeural -Gender: Female - -Name: ar-DZ-AminaNeural -Gender: Female - -Name: ar-DZ-IsmaelNeural -Gender: Male - -Name: ar-EG-SalmaNeural -Gender: Female - -Name: ar-EG-ShakirNeural -Gender: Male - -Name: ar-IQ-BasselNeural -Gender: Male - -Name: ar-IQ-RanaNeural -Gender: Female - -Name: ar-JO-SanaNeural -Gender: Female - -Name: ar-JO-TaimNeural -Gender: Male - -Name: ar-KW-FahedNeural -Gender: Male - -Name: ar-KW-NouraNeural -Gender: Female - -Name: ar-LB-LaylaNeural -Gender: Female - -Name: ar-LB-RamiNeural -Gender: Male - -Name: ar-LY-ImanNeural -Gender: Female - -Name: ar-LY-OmarNeural -Gender: Male - -Name: ar-MA-JamalNeural -Gender: Male - -Name: ar-MA-MounaNeural -Gender: Female - -Name: ar-OM-AbdullahNeural -Gender: Male - -Name: ar-OM-AyshaNeural -Gender: Female - -Name: ar-QA-AmalNeural -Gender: Female - -Name: ar-QA-MoazNeural -Gender: Male - -Name: ar-SA-HamedNeural -Gender: Male - -Name: ar-SA-ZariyahNeural -Gender: Female - -Name: ar-SY-AmanyNeural -Gender: Female - -Name: ar-SY-LaithNeural -Gender: Male - -Name: ar-TN-HediNeural -Gender: Male - -Name: ar-TN-ReemNeural -Gender: Female - -Name: ar-YE-MaryamNeural -Gender: Female - -Name: ar-YE-SalehNeural -Gender: Male - -Name: az-AZ-BabekNeural -Gender: Male - -Name: az-AZ-BanuNeural -Gender: Female - -Name: bg-BG-BorislavNeural -Gender: Male - -Name: bg-BG-KalinaNeural -Gender: Female - -Name: bn-BD-NabanitaNeural -Gender: Female - -Name: bn-BD-PradeepNeural -Gender: Male - -Name: bn-IN-BashkarNeural -Gender: Male - -Name: bn-IN-TanishaaNeural -Gender: Female - -Name: bs-BA-GoranNeural -Gender: Male - -Name: bs-BA-VesnaNeural -Gender: Female - -Name: ca-ES-EnricNeural -Gender: Male - -Name: ca-ES-JoanaNeural -Gender: Female - -Name: cs-CZ-AntoninNeural -Gender: Male - -Name: cs-CZ-VlastaNeural -Gender: Female - -Name: cy-GB-AledNeural -Gender: Male - -Name: cy-GB-NiaNeural -Gender: Female - -Name: da-DK-ChristelNeural -Gender: Female - -Name: da-DK-JeppeNeural -Gender: Male - -Name: de-AT-IngridNeural -Gender: Female - -Name: de-AT-JonasNeural -Gender: Male - -Name: de-CH-JanNeural -Gender: Male - -Name: de-CH-LeniNeural -Gender: Female - -Name: de-DE-AmalaNeural -Gender: Female - -Name: de-DE-ConradNeural -Gender: Male - -Name: de-DE-FlorianMultilingualNeural -Gender: Male - -Name: de-DE-KatjaNeural -Gender: Female - -Name: de-DE-KillianNeural -Gender: Male - -Name: de-DE-SeraphinaMultilingualNeural -Gender: Female - -Name: el-GR-AthinaNeural -Gender: Female - -Name: el-GR-NestorasNeural -Gender: Male - -Name: 
en-AU-NatashaNeural -Gender: Female - -Name: en-AU-WilliamNeural -Gender: Male - -Name: en-CA-ClaraNeural -Gender: Female - -Name: en-CA-LiamNeural -Gender: Male - -Name: en-GB-LibbyNeural -Gender: Female - -Name: en-GB-MaisieNeural -Gender: Female - -Name: en-GB-RyanNeural -Gender: Male - -Name: en-GB-SoniaNeural -Gender: Female - -Name: en-GB-ThomasNeural -Gender: Male - -Name: en-HK-SamNeural -Gender: Male - -Name: en-HK-YanNeural -Gender: Female - -Name: en-IE-ConnorNeural -Gender: Male - -Name: en-IE-EmilyNeural -Gender: Female - -Name: en-IN-NeerjaExpressiveNeural -Gender: Female - -Name: en-IN-NeerjaNeural -Gender: Female - -Name: en-IN-PrabhatNeural -Gender: Male - -Name: en-KE-AsiliaNeural -Gender: Female - -Name: en-KE-ChilembaNeural -Gender: Male - -Name: en-NG-AbeoNeural -Gender: Male - -Name: en-NG-EzinneNeural -Gender: Female - -Name: en-NZ-MitchellNeural -Gender: Male - -Name: en-NZ-MollyNeural -Gender: Female - -Name: en-PH-JamesNeural -Gender: Male - -Name: en-PH-RosaNeural -Gender: Female - -Name: en-SG-LunaNeural -Gender: Female - -Name: en-SG-WayneNeural -Gender: Male - -Name: en-TZ-ElimuNeural -Gender: Male - -Name: en-TZ-ImaniNeural -Gender: Female - -Name: en-US-AnaNeural -Gender: Female - -Name: en-US-AndrewMultilingualNeural -Gender: Male - -Name: en-US-AndrewNeural -Gender: Male - -Name: en-US-AriaNeural -Gender: Female - -Name: en-US-AvaMultilingualNeural -Gender: Female - -Name: en-US-AvaNeural -Gender: Female - -Name: en-US-BrianMultilingualNeural -Gender: Male - -Name: en-US-BrianNeural -Gender: Male - -Name: en-US-ChristopherNeural -Gender: Male - -Name: en-US-EmmaMultilingualNeural -Gender: Female - -Name: en-US-EmmaNeural -Gender: Female - -Name: en-US-EricNeural -Gender: Male - -Name: en-US-GuyNeural -Gender: Male - -Name: en-US-JennyNeural -Gender: Female - -Name: en-US-MichelleNeural -Gender: Female - -Name: en-US-RogerNeural -Gender: Male - -Name: en-US-SteffanNeural -Gender: Male - -Name: en-ZA-LeahNeural -Gender: Female - -Name: en-ZA-LukeNeural -Gender: Male - -Name: es-AR-ElenaNeural -Gender: Female - -Name: es-AR-TomasNeural -Gender: Male - -Name: es-BO-MarceloNeural -Gender: Male - -Name: es-BO-SofiaNeural -Gender: Female - -Name: es-CL-CatalinaNeural -Gender: Female - -Name: es-CL-LorenzoNeural -Gender: Male - -Name: es-CO-GonzaloNeural -Gender: Male - -Name: es-CO-SalomeNeural -Gender: Female - -Name: es-CR-JuanNeural -Gender: Male - -Name: es-CR-MariaNeural -Gender: Female - -Name: es-CU-BelkysNeural -Gender: Female - -Name: es-CU-ManuelNeural -Gender: Male - -Name: es-DO-EmilioNeural -Gender: Male - -Name: es-DO-RamonaNeural -Gender: Female - -Name: es-EC-AndreaNeural -Gender: Female - -Name: es-EC-LuisNeural -Gender: Male - -Name: es-ES-AlvaroNeural -Gender: Male - -Name: es-ES-ElviraNeural -Gender: Female - -Name: es-ES-XimenaNeural -Gender: Female - -Name: es-GQ-JavierNeural -Gender: Male - -Name: es-GQ-TeresaNeural -Gender: Female - -Name: es-GT-AndresNeural -Gender: Male - -Name: es-GT-MartaNeural -Gender: Female - -Name: es-HN-CarlosNeural -Gender: Male - -Name: es-HN-KarlaNeural -Gender: Female - -Name: es-MX-DaliaNeural -Gender: Female - -Name: es-MX-JorgeNeural -Gender: Male - -Name: es-NI-FedericoNeural -Gender: Male - -Name: es-NI-YolandaNeural -Gender: Female - -Name: es-PA-MargaritaNeural -Gender: Female - -Name: es-PA-RobertoNeural -Gender: Male - -Name: es-PE-AlexNeural -Gender: Male - -Name: es-PE-CamilaNeural -Gender: Female - -Name: es-PR-KarinaNeural -Gender: Female - -Name: es-PR-VictorNeural -Gender: Male - -Name: 
es-PY-MarioNeural -Gender: Male - -Name: es-PY-TaniaNeural -Gender: Female - -Name: es-SV-LorenaNeural -Gender: Female - -Name: es-SV-RodrigoNeural -Gender: Male - -Name: es-US-AlonsoNeural -Gender: Male - -Name: es-US-PalomaNeural -Gender: Female - -Name: es-UY-MateoNeural -Gender: Male - -Name: es-UY-ValentinaNeural -Gender: Female - -Name: es-VE-PaolaNeural -Gender: Female - -Name: es-VE-SebastianNeural -Gender: Male - -Name: et-EE-AnuNeural -Gender: Female - -Name: et-EE-KertNeural -Gender: Male - -Name: fa-IR-DilaraNeural -Gender: Female - -Name: fa-IR-FaridNeural -Gender: Male - -Name: fi-FI-HarriNeural -Gender: Male - -Name: fi-FI-NooraNeural -Gender: Female - -Name: fil-PH-AngeloNeural -Gender: Male - -Name: fil-PH-BlessicaNeural -Gender: Female - -Name: fr-BE-CharlineNeural -Gender: Female - -Name: fr-BE-GerardNeural -Gender: Male - -Name: fr-CA-AntoineNeural -Gender: Male - -Name: fr-CA-JeanNeural -Gender: Male - -Name: fr-CA-SylvieNeural -Gender: Female - -Name: fr-CA-ThierryNeural -Gender: Male - -Name: fr-CH-ArianeNeural -Gender: Female - -Name: fr-CH-FabriceNeural -Gender: Male - -Name: fr-FR-DeniseNeural -Gender: Female - -Name: fr-FR-EloiseNeural -Gender: Female - -Name: fr-FR-HenriNeural -Gender: Male - -Name: fr-FR-RemyMultilingualNeural -Gender: Male - -Name: fr-FR-VivienneMultilingualNeural -Gender: Female - -Name: ga-IE-ColmNeural -Gender: Male - -Name: ga-IE-OrlaNeural -Gender: Female - -Name: gl-ES-RoiNeural -Gender: Male - -Name: gl-ES-SabelaNeural -Gender: Female - -Name: gu-IN-DhwaniNeural -Gender: Female - -Name: gu-IN-NiranjanNeural -Gender: Male - -Name: he-IL-AvriNeural -Gender: Male - -Name: he-IL-HilaNeural -Gender: Female - -Name: hi-IN-MadhurNeural -Gender: Male - -Name: hi-IN-SwaraNeural -Gender: Female - -Name: hr-HR-GabrijelaNeural -Gender: Female - -Name: hr-HR-SreckoNeural -Gender: Male - -Name: hu-HU-NoemiNeural -Gender: Female - -Name: hu-HU-TamasNeural -Gender: Male - -Name: id-ID-ArdiNeural -Gender: Male - -Name: id-ID-GadisNeural -Gender: Female - -Name: is-IS-GudrunNeural -Gender: Female - -Name: is-IS-GunnarNeural -Gender: Male - -Name: it-IT-DiegoNeural -Gender: Male - -Name: it-IT-ElsaNeural -Gender: Female - -Name: it-IT-GiuseppeMultilingualNeural -Gender: Male - -Name: it-IT-IsabellaNeural -Gender: Female - -Name: iu-Cans-CA-SiqiniqNeural -Gender: Female - -Name: iu-Cans-CA-TaqqiqNeural -Gender: Male - -Name: iu-Latn-CA-SiqiniqNeural -Gender: Female - -Name: iu-Latn-CA-TaqqiqNeural -Gender: Male - -Name: ja-JP-KeitaNeural -Gender: Male - -Name: ja-JP-NanamiNeural -Gender: Female - -Name: jv-ID-DimasNeural -Gender: Male - -Name: jv-ID-SitiNeural -Gender: Female - -Name: ka-GE-EkaNeural -Gender: Female - -Name: ka-GE-GiorgiNeural -Gender: Male - -Name: kk-KZ-AigulNeural -Gender: Female - -Name: kk-KZ-DauletNeural -Gender: Male - -Name: km-KH-PisethNeural -Gender: Male - -Name: km-KH-SreymomNeural -Gender: Female - -Name: kn-IN-GaganNeural -Gender: Male - -Name: kn-IN-SapnaNeural -Gender: Female - -Name: ko-KR-HyunsuMultilingualNeural -Gender: Male - -Name: ko-KR-InJoonNeural -Gender: Male - -Name: ko-KR-SunHiNeural -Gender: Female - -Name: lo-LA-ChanthavongNeural -Gender: Male - -Name: lo-LA-KeomanyNeural -Gender: Female - -Name: lt-LT-LeonasNeural -Gender: Male - -Name: lt-LT-OnaNeural -Gender: Female - -Name: lv-LV-EveritaNeural -Gender: Female - -Name: lv-LV-NilsNeural -Gender: Male - -Name: mk-MK-AleksandarNeural -Gender: Male - -Name: mk-MK-MarijaNeural -Gender: Female - -Name: ml-IN-MidhunNeural -Gender: Male - -Name: 
ml-IN-SobhanaNeural -Gender: Female - -Name: mn-MN-BataaNeural -Gender: Male - -Name: mn-MN-YesuiNeural -Gender: Female - -Name: mr-IN-AarohiNeural -Gender: Female - -Name: mr-IN-ManoharNeural -Gender: Male - -Name: ms-MY-OsmanNeural -Gender: Male - -Name: ms-MY-YasminNeural -Gender: Female - -Name: mt-MT-GraceNeural -Gender: Female - -Name: mt-MT-JosephNeural -Gender: Male - -Name: my-MM-NilarNeural -Gender: Female - -Name: my-MM-ThihaNeural -Gender: Male - -Name: nb-NO-FinnNeural -Gender: Male - -Name: nb-NO-PernilleNeural -Gender: Female - -Name: ne-NP-HemkalaNeural -Gender: Female - -Name: ne-NP-SagarNeural -Gender: Male - -Name: nl-BE-ArnaudNeural -Gender: Male - -Name: nl-BE-DenaNeural -Gender: Female - -Name: nl-NL-ColetteNeural -Gender: Female - -Name: nl-NL-FennaNeural -Gender: Female - -Name: nl-NL-MaartenNeural -Gender: Male - -Name: pl-PL-MarekNeural -Gender: Male - -Name: pl-PL-ZofiaNeural -Gender: Female - -Name: ps-AF-GulNawazNeural -Gender: Male - -Name: ps-AF-LatifaNeural -Gender: Female - -Name: pt-BR-AntonioNeural -Gender: Male - -Name: pt-BR-FranciscaNeural -Gender: Female - -Name: pt-BR-ThalitaMultilingualNeural -Gender: Female - -Name: pt-PT-DuarteNeural -Gender: Male - -Name: pt-PT-RaquelNeural -Gender: Female - -Name: ro-RO-AlinaNeural -Gender: Female - -Name: ro-RO-EmilNeural -Gender: Male - -Name: ru-RU-DmitryNeural -Gender: Male - -Name: ru-RU-SvetlanaNeural -Gender: Female - -Name: si-LK-SameeraNeural -Gender: Male - -Name: si-LK-ThiliniNeural -Gender: Female - -Name: sk-SK-LukasNeural -Gender: Male - -Name: sk-SK-ViktoriaNeural -Gender: Female - -Name: sl-SI-PetraNeural -Gender: Female - -Name: sl-SI-RokNeural -Gender: Male - -Name: so-SO-MuuseNeural -Gender: Male - -Name: so-SO-UbaxNeural -Gender: Female - -Name: sq-AL-AnilaNeural -Gender: Female - -Name: sq-AL-IlirNeural -Gender: Male - -Name: sr-RS-NicholasNeural -Gender: Male - -Name: sr-RS-SophieNeural -Gender: Female - -Name: su-ID-JajangNeural -Gender: Male - -Name: su-ID-TutiNeural -Gender: Female - -Name: sv-SE-MattiasNeural -Gender: Male - -Name: sv-SE-SofieNeural -Gender: Female - -Name: sw-KE-RafikiNeural -Gender: Male - -Name: sw-KE-ZuriNeural -Gender: Female - -Name: sw-TZ-DaudiNeural -Gender: Male - -Name: sw-TZ-RehemaNeural -Gender: Female - -Name: ta-IN-PallaviNeural -Gender: Female - -Name: ta-IN-ValluvarNeural -Gender: Male - -Name: ta-LK-KumarNeural -Gender: Male - -Name: ta-LK-SaranyaNeural -Gender: Female - -Name: ta-MY-KaniNeural -Gender: Female - -Name: ta-MY-SuryaNeural -Gender: Male - -Name: ta-SG-AnbuNeural -Gender: Male - -Name: ta-SG-VenbaNeural -Gender: Female - -Name: te-IN-MohanNeural -Gender: Male - -Name: te-IN-ShrutiNeural -Gender: Female - -Name: th-TH-NiwatNeural -Gender: Male - -Name: th-TH-PremwadeeNeural -Gender: Female - -Name: tr-TR-AhmetNeural -Gender: Male - -Name: tr-TR-EmelNeural -Gender: Female - -Name: uk-UA-OstapNeural -Gender: Male - -Name: uk-UA-PolinaNeural -Gender: Female - -Name: ur-IN-GulNeural -Gender: Female - -Name: ur-IN-SalmanNeural -Gender: Male - -Name: ur-PK-AsadNeural -Gender: Male - -Name: ur-PK-UzmaNeural -Gender: Female - -Name: uz-UZ-MadinaNeural -Gender: Female - -Name: uz-UZ-SardorNeural -Gender: Male - -Name: vi-VN-HoaiMyNeural -Gender: Female - -Name: vi-VN-NamMinhNeural -Gender: Male - -Name: zh-CN-XiaoxiaoNeural -Gender: Female - -Name: zh-CN-XiaoyiNeural -Gender: Female - -Name: zh-CN-YunjianNeural -Gender: Male - -Name: zh-CN-YunxiNeural -Gender: Male - -Name: zh-CN-YunxiaNeural -Gender: Male - -Name: zh-CN-YunyangNeural -Gender: Male - 
-Name: zh-CN-liaoning-XiaobeiNeural -Gender: Female - -Name: zh-CN-shaanxi-XiaoniNeural -Gender: Female - -Name: zh-HK-HiuGaaiNeural -Gender: Female - -Name: zh-HK-HiuMaanNeural -Gender: Female - -Name: zh-HK-WanLungNeural -Gender: Male - -Name: zh-TW-HsiaoChenNeural -Gender: Female - -Name: zh-TW-HsiaoYuNeural -Gender: Female - -Name: zh-TW-YunJheNeural -Gender: Male - -Name: zu-ZA-ThandoNeural -Gender: Female - -Name: zu-ZA-ThembaNeural -Gender: Male - - -Name: en-US-AvaMultilingualNeural-V2 -Gender: Female - -Name: en-US-AndrewMultilingualNeural-V2 -Gender: Male - -Name: en-US-EmmaMultilingualNeural-V2 -Gender: Female - -Name: en-US-BrianMultilingualNeural-V2 -Gender: Male - -Name: de-DE-FlorianMultilingualNeural-V2 -Gender: Male - -Name: de-DE-SeraphinaMultilingualNeural-V2 -Gender: Female - -Name: fr-FR-RemyMultilingualNeural-V2 -Gender: Male - -Name: fr-FR-VivienneMultilingualNeural-V2 -Gender: Female - -Name: zh-CN-XiaoxiaoMultilingualNeural-V2 -Gender: Female - """.strip() - voices = [] - # 定义正则表达式模式,用于匹配 Name 和 Gender 行 - pattern = re.compile(r"Name:\s*(.+)\s*Gender:\s*(.+)\s*", re.MULTILINE) - # 使用正则表达式查找所有匹配项 - matches = pattern.findall(azure_voices_str) - - for name, gender in matches: - # 应用过滤条件 - if filter_locals and any( - name.lower().startswith(fl.lower()) for fl in filter_locals - ): - voices.append(f"{name}-{gender}") - elif not filter_locals: - voices.append(f"{name}-{gender}") - - voices.sort() - return voices - - -def parse_voice_name(name: str): - # zh-CN-XiaoyiNeural-Female - # zh-CN-YunxiNeural-Male - # zh-CN-XiaoxiaoMultilingualNeural-V2-Female - name = name.replace("-Female", "").replace("-Male", "").strip() - return name - - -def is_azure_v2_voice(voice_name: str): - voice_name = parse_voice_name(voice_name) - if voice_name.endswith("-V2"): - return voice_name.replace("-V2", "").strip() - return "" - - -def is_siliconflow_voice(voice_name: str): - """检查是否是硅基流动的声音""" - return voice_name.startswith("siliconflow:") - +from app.utils.utils import parse_voice_name, is_azure_v2_voice, is_siliconflow_voice def tts( text: str, @@ -1219,12 +160,22 @@ def siliconflow_tts( # 获取音频文件的实际长度 try: - # 尝试使用moviepy获取音频长度 - from moviepy import AudioFileClip - - audio_clip = AudioFileClip(voice_file) - audio_duration = audio_clip.duration - audio_clip.close() + # 使用 ffprobe 获取音频长度 + probe_command = [ + "ffprobe", + "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + voice_file + ] + process = subprocess.run( + probe_command, + check=True, + capture_output=True, + text=True, + creationflags=subprocess.CREATE_NO_WINDOW if os.name == "nt" else 0, + ) + audio_duration = float(process.stdout.strip()) # 将音频长度转换为100纳秒单位(与edge_tts兼容) audio_duration_100ns = int(audio_duration * 10000000) @@ -1262,8 +213,8 @@ def siliconflow_tts( sub_maker.subs = [text] sub_maker.offset = [(0, audio_duration_100ns)] - except Exception as e: - logger.warning(f"Failed to create accurate subtitles: {str(e)}") + except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e: + logger.warning(f"Failed to create accurate subtitles with ffprobe: {str(e)}") # 回退到简单的字幕 sub_maker.subs = [text] # 使用音频文件的实际长度,如果无法获取,则假设为10秒 @@ -1272,7 +223,7 @@ def siliconflow_tts( 0, audio_duration_100ns if "audio_duration_100ns" in locals() - else 10000000, + else 100000000, ) ] @@ -1469,14 +420,23 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str) with open(subtitle_file, "w", encoding="utf-8") as file: file.write("\n".join(sub_items) + 
"\n") try: - sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") - duration = max([tb for ((ta, tb), txt) in sbs]) - logger.info( - f"completed, subtitle file created: {subtitle_file}, duration: {duration}" - ) + # Get duration from the last subtitle item + if sub_items: + last_sub = sub_items[-1] + # '1\n00:00:00,000 --> 00:00:02,360\ntext' + time_line = last_sub.strip().split('\n')[1] + end_time_str = time_line.split(' --> ')[1] + # '00:00:02,360' + h, m, s_ms = end_time_str.split(':') + s, ms = s_ms.split(',') + duration = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0 + logger.info( + f"completed, subtitle file created: {subtitle_file}, duration: {duration}" + ) + else: + logger.info(f"completed, empty subtitle file created: {subtitle_file}") except Exception as e: - logger.error(f"failed, error: {str(e)}") - os.remove(subtitle_file) + logger.warning(f"failed to parse subtitle duration, but file was created. error: {str(e)}") else: logger.warning( f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}" @@ -1495,6 +455,53 @@ def get_audio_duration(sub_maker: submaker.SubMaker): return sub_maker.offset[-1][1] / 10000000 +def combine_audio_files(audio_paths: List[str], output_path: str) -> bool: + """ + Combines multiple audio files into a single audio file using ffmpeg. + """ + logger.info(f"Combining {len(audio_paths)} audio files into {output_path}") + if not audio_paths: + logger.warning("No audio clips to combine.") + return False + + # Create a temporary file to list the audio files + list_file_path = os.path.join(os.path.dirname(output_path), "concat_list.txt") + with open(list_file_path, "w", encoding="utf-8") as f: + for path in audio_paths: + f.write(f"file '{os.path.normpath(path)}'\n") + + command = [ + "ffmpeg", + "-f", "concat", + "-safe", "0", + "-i", list_file_path, + "-c", "copy", + "-y", # Overwrite output file if it exists + output_path, + ] + + try: + process = subprocess.run( + command, + check=True, + capture_output=True, + text=True, + creationflags=subprocess.CREATE_NO_WINDOW if os.name == "nt" else 0, + ) + logger.success(f"Successfully combined audio files: {output_path}") + return True + except subprocess.CalledProcessError as e: + logger.error(f"Failed to combine audio files: {e.stderr}") + return False + except FileNotFoundError: + logger.error("ffmpeg not found. 
Please ensure ffmpeg is installed and in your PATH.") + return False + finally: + # Clean up the temporary list file + if os.path.exists(list_file_path): + os.remove(list_file_path) + + if __name__ == "__main__": voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female" voice_name = parse_voice_name(voice_name) diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/utils.py b/app/utils/utils.py index 7efb521..112d818 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -14,6 +14,23 @@ from app.models import const urllib3.disable_warnings() +def parse_voice_name(name: str): + # zh-CN-XiaoyiNeural-Female -> zh-CN-XiaoyiNeural + # zh-CN-YunxiNeural-Male -> zh-CN-YunxiNeural + # zh-CN-XiaoxiaoMultilingualNeural-V2-Female -> zh-CN-XiaoxiaoMultilingualNeural-V2 + return name.replace("-Female", "").replace("-Male", "").strip() + +def is_azure_v2_voice(voice_name: str): + voice_name = parse_voice_name(voice_name) + if voice_name.endswith("-V2"): + return voice_name.replace("-V2", "").strip() + return "" + +def is_siliconflow_voice(voice_name: str): + """检查是否是硅基流动的声音""" + return voice_name.startswith("siliconflow:") + + def get_response(status: int, data: Any = None, message: str = ""): obj = { "status": status, @@ -64,6 +81,13 @@ def get_uuid(remove_hyphen: bool = False): return u +def get_root_dir(sub_dir: str = ""): + d = root_dir() + if sub_dir: + d = os.path.join(d, sub_dir) + return d + + def root_dir(): return os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -103,6 +127,10 @@ def font_dir(sub_dir: str = ""): return d +def get_font_path(font_name: str): + return os.path.join(font_dir(), font_name) + + def song_dir(sub_dir: str = ""): d = resource_dir("songs") if sub_dir: @@ -227,4 +255,22 @@ def load_locales(i18n_dir): def parse_extension(filename): - return Path(filename).suffix.lower().lstrip('.') + return os.path.splitext(filename)[1] + + +def rgb_to_bgr_hex(rgb_color): + """Converts an RGB color string (e.g., '#RRGGBB') to a BGR hex string for FFmpeg. + + Args: + rgb_color (str): The RGB color string, starting with '#'. + + Returns: + str: The BGR hex string (e.g., 'BBGGRR'). + """ + if not rgb_color.startswith('#') or len(rgb_color) != 7: + logger.warning(f"Invalid color format: {rgb_color}. 
Using default white.") + return "FFFFFF" # Default to white for invalid formats + r = rgb_color[1:3] + g = rgb_color[3:5] + b = rgb_color[5:7] + return f"{b}{g}{r}" diff --git a/requirements.txt b/requirements.txt index a1731f6..dd88022 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ uvicorn==0.32.1 openai==1.56.1 faster-whisper==1.1.0 loguru==0.7.3 -google.generativeai==0.8.3 +google-generativeai==0.8.3 dashscope==1.20.14 g4f==0.5.2.2 azure-cognitiveservices-speech==1.41.1 diff --git a/webui/.streamlit/config.toml b/webui/.streamlit/config.toml index b690b74..82fa436 100644 --- a/webui/.streamlit/config.toml +++ b/webui/.streamlit/config.toml @@ -1,2 +1,2 @@ -[browser] -gatherUsageStats = false \ No newline at end of file +[server] +fileWatcherType = "none" diff --git a/webui/Main.py b/webui/Main.py index 1b55abe..ce30bae 100644 --- a/webui/Main.py +++ b/webui/Main.py @@ -618,7 +618,7 @@ with middle_panel: ) params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1]) - params.video_clip_duration = st.selectbox( + params.max_clip_duration = st.selectbox( tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1 ) params.video_count = st.selectbox( @@ -659,7 +659,8 @@ with middle_panel: if selected_tts_server == "siliconflow": # 获取硅基流动的声音列表 - filtered_voices = voice.get_siliconflow_voices() + # filtered_voices = voice.get_siliconflow_voices() + pass else: # 获取Azure的声音列表 all_voices = voice.get_all_azure_voices(filter_locals=None) @@ -699,6 +700,7 @@ with middle_panel: if saved_voice_name_index >= len(friendly_names) and friendly_names: saved_voice_name_index = 0 + voice_name = "" # 确保有声音可选 if friendly_names: selected_friendly_name = st.selectbox( @@ -715,14 +717,16 @@ with middle_panel: params.voice_name = voice_name config.ui["voice_name"] = voice_name else: - # 如果没有声音可选,显示提示信息 + # 如果没有声音可选,使用默认声音并显示提示信息 st.warning( tr( - "No voices available for the selected TTS server. Please select another server." + "No voices available for the selected TTS server. A default voice (en-US-JennyNeural) will be used." ) ) - params.voice_name = "" - config.ui["voice_name"] = "" + default_voice = "en-US-JennyNeural" + params.voice_name = default_voice + config.ui["voice_name"] = default_voice + voice_name = default_voice # 只有在有声音可选时才显示试听按钮 if friendly_names and st.button(tr("Play Voice")): @@ -961,7 +965,7 @@ if start_button: logger.info(utils.to_json(params)) scroll_to_bottom() - result = tm.start(task_id=task_id, params=params) + result = tm.start_storyboard_task(task_id=task_id, params=params) if not result or "videos" not in result: st.error(tr("Video Generation Failed")) logger.error(tr("Video Generation Failed"))