diff --git a/.gitignore b/.gitignore index 6aa0ca7..bfcac95 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ /app/utils/__pycache__/ /*/__pycache__/* .vscode -/**/.streamlit + __pycache__ logs/ diff --git a/app/models/schema.py b/app/models/schema.py index 3696fa3..8f0bd32 100644 --- a/app/models/schema.py +++ b/app/models/schema.py @@ -3,7 +3,7 @@ from enum import Enum from typing import Any, List, Optional, Union import pydantic -from pydantic import BaseModel +from pydantic import BaseModel, Field # 忽略 Pydantic 的特定警告 warnings.filterwarnings( @@ -74,7 +74,7 @@ class VideoParams(BaseModel): video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value video_transition_mode: Optional[VideoTransitionMode] = None - video_clip_duration: Optional[int] = 5 + max_clip_duration: Optional[int] = 5 video_count: Optional[int] = 1 video_source: Optional[str] = "pexels" @@ -103,7 +103,7 @@ class VideoParams(BaseModel): stroke_width: float = 1.5 n_threads: Optional[int] = 2 paragraph_number: Optional[int] = 1 - + storyboard_mode: bool = Field(False, description="是否启用故事板模式以实现音画同步") class SubtitleRequest(BaseModel): video_script: str diff --git a/app/services/llm.py b/app/services/llm.py index 24abfc8..44df275 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -2,7 +2,7 @@ import json import logging import re import requests -from typing import List +from typing import List, Dict import g4f from loguru import logger @@ -173,7 +173,7 @@ def _generate_response(prompt: str) -> str: "temperature": 0.5, "top_p": 1, "top_k": 1, - "max_output_tokens": 2048, + "max_output_tokens": 8192, } safety_settings = [ @@ -270,8 +270,10 @@ def _generate_response(prompt: str) -> str: base_url=base_url, ) - response = client.chat.completions.create( - model=model_name, messages=[{"role": "user", "content": prompt}] + response: ChatCompletion = client.chat.completions.create( + model=model_name, + messages=[{"role": "user", "content": prompt}], + max_tokens=4096 ) if response: if isinstance(response, ChatCompletion): @@ -392,77 +394,168 @@ Generate a script for a video, depending on the subject of the video. # ### Video Subject # {video_subject} def generate_terms(video_subject: str, video_script: str) -> List[str]: - prompt = f""" -# Role: AI Video Director and Editor + """ + Generate video terms from video subject and script. + """ + prompt_template = """ +# Role: Video Search Terms Generator -## Core Goal: -Your mission is to meticulously analyze the provided video script, break it down into distinct visual scenes, and generate a diverse list of English search terms for stock footage. +## Task: +Generate a concise, comma-separated list of 1-5 English search terms based on the provided `Video Subject` and `Video Script`. These terms will be used to find relevant video clips. -## Step-by-Step Instructions: -1. Read the entire `{video_subject}` script to understand the main narrative and mood. -2. Go through the script paragraph by paragraph (or by logical scene breaks). -3. For each paragraph/scene, generate ONE primary search term that best captures its visual essence. -4. Compile all generated search terms into a single JSON array. +## Instructions: +1. **Analyze Context:** Read the `Video Subject` and `Video Script` to understand the main topics and visual elements. +2. **Brainstorm Keywords:** Think of concrete, visually-driven keywords. Avoid abstract concepts. +3. 
**Select & Refine:** Choose the most powerful and representative terms. +4. **Format Output:** Provide a single line of comma-separated English keywords. Do not include any other text, explanations, or formatting. -## Keyword Generation Principles: -- **DIVERSITY**: CRITICAL. Avoid repetitive or overly similar terms. Each keyword must represent a distinct visual concept from the script. -- **SPECIFICITY**: Be specific. Instead of "car driving," prefer "sports car on mountain road at sunset." -- **VISUAL & CONCRETE**: Each term must describe a tangible, visual scene. Do not use abstract concepts (e.g., "sadness", "freedom"). -- **CONCISENESS**: Terms should ideally be 2-4 words long. -- **RELEVANCE**: Every term must be directly inspired by a part of the script and be relevant to the main video subject. +## Example: +**Video Subject:** "The Impact of Sugar on Your Brain" +**Video Script:** "Sugar, a sweet temptation, can have a profound effect on our brain chemistry..." +**Output:** +`sugar cubes, brain scan, dopamine release, person eating candy, neural pathways` -## Output Format Constraints: -- You MUST return a pure, single JSON Array. No introductory text, no markdown. Your entire response body must be a valid JSON array. -- All search terms must be in English. - -## Example of a Good Output: -["dramatic mountain landscape", "hiker reaching summit", "close up of old compass", "time-lapse of starry night", "..."] - -## Context: +## Your Turn: ### Video Subject: {video_subject} -### Video Script +### Video Script: {video_script} -Please note that you must use English for generating video search terms; Chinese is not accepted. -""".strip() +### Output: +""" + prompt = prompt_template.format( + video_subject=video_subject, video_script=video_script + ) logger.info(f"subject: {video_subject}") - search_terms = [] - response = "" - for i in range(_max_retries): - try: - response = _generate_response(prompt) - if "Error: " in response: - logger.error(f"failed to generate video script: {response}") - return response - search_terms = json.loads(response) - if not isinstance(search_terms, list) or not all( - isinstance(term, str) for term in search_terms - ): - logger.error("response is not a list of strings.") - continue + try: + response = _generate_response(prompt) + # remove blank lines + generated_text = "\n".join( + [line for line in response.split("\n") if line.strip()] + ) + if not generated_text: + logger.warning("LLM returned empty terms list.") + return [] - except Exception as e: - logger.warning(f"failed to generate video terms: {str(e)}") - if response: - match = re.search(r"\[.*]", response) - if match: - try: - search_terms = json.loads(match.group()) - except Exception as e: - logger.warning(f"failed to generate video terms: {str(e)}") - pass + terms = [term.strip().strip("`'\"") for term in generated_text.split(",")] + logger.info(f"Generated terms: {terms}") + return terms + except Exception as e: + logger.error(f"Failed to generate video terms: {e}") + return [] - if search_terms and len(search_terms) > 0: - break - if i < _max_retries: - logger.warning(f"failed to generate video terms, trying again... {i + 1}") - logger.success(f"completed: \n{search_terms}") - return search_terms +# def generate_storyboard(video_subject: str, video_script: str) -> List[Dict]: +# """ +# Analyzes the entire script, breaks it down into scenes, and generates matching search terms for each scene. 
+# Returns a list of scenes, where each scene is a dictionary containing 'scene_script' and 'search_terms'. +# """ +# prompt = f""" +# # Role: Video Script Analyst + +# ## GOAL: +# Your task is to transform a video script into a storyboard. You will read the provided script, segment it into scenes, and for each scene, generate a set of descriptive, visual search terms that will be used to find stock video footage. The final output must be a valid JSON array of objects. + +# ## STEP-BY-STEP INSTRUCTIONS: +# 1. **Segment the Script:** Read the `Video Script` and break it down into short, logical, spoken segments. A segment should typically be one or two sentences long. + +# ## EXAMPLE (Note the Realism and Concreteness): +# [ +# {{ +# "scene_script": "Blueberries. They're often called nature's perfect food for your eyes.", +# "search_terms": ["woman eating fresh blueberries from a bowl", "close up of fresh blueberries", "bowl of blueberries on a table"] +# }}, +# {{ +# "scene_script": "And for good reason. Packed with anthocyanins, vitamin C, and ludian...", +# "search_terms": ["nutritionist explaining health benefits", "close up of vitamin C tablets", "diagram of anthocyanin molecule"] +# }}, +# {{ +# "scene_script": "...these tiny berries act like microscopic shields, protecting your retina and macula from oxidative stress and age related damage.", +# "search_terms": ["medical animation of the human eye", "diagram of the retina and macula", "older person with healthy eyes smiling"] +# }} +# ] + +# ## CONTEXT: +# ### Video Subject: +# {video_subject} + +# ### Video Script: +# {video_script} +def generate_storyboard(video_subject: str, video_script: str) -> List[Dict]: + """ + Analyzes the script, breaks it into scenes, and extracts the main subject nouns as search terms for each scene. + Returns a list of scenes, where each scene is a dictionary containing 'scene_script' and 'search_terms'. + """ + # [核心修改] 通过更明确、更强力的指令,强制要求 LLM 将视频脚本的每一句话都处理成一个独立的场景,并为每个场景生成对应的英文关键词。 + prompt = f""" +You are a video production assistant. Your task is to process a script for a video, breaking it down sentence by sentence to generate visual search terms. + +**CRITICAL INSTRUCTIONS - FOLLOW THESE RULES EXACTLY:** + +1. **ONE SENTENCE = ONE VISUAL SEGMENT:** Each sentence from the script is a distinct visual segment. Do not merge sentences. +2. **CONCRETE & VISUAL KEYWORDS ONLY:** The `search_terms` MUST be concrete, visual, and tangible things. They must be nouns or descriptive actions that can be found in a video library. + - **GOOD:** `blueberries`, `person walking`, `city skyline`, `laughing friends`, `human eye`. + - **BAD / FORBIDDEN:** `reason`, `concept`, `idea`, `method`, `health`, `protection`, `damage`. Never use abstract, non-visual words. +3. **MANDATORY KEYWORD DIVERSITY:** You are FORBIDDEN from using the same primary keyword for two consecutive segments. If segment 1 uses `blueberries`, segment 2 MUST use a different but relevant keyword (e.g., `antioxidants` could be visualized as `colorful fruits`, `retina` as `close-up of eye`). DIVERSIFY a lot. + +**REQUIRED OUTPUT FORMAT:** +- You must output a valid JSON array of objects. +- Each object represents one sentence and must ONLY contain two keys: `script` and `search_terms`. + +**EXAMPLE:** + +Video Script: +"Blueberries are packed with anthocyanins, which are great for your eyes. These antioxidants protect the retina from damage." 
+ +Your JSON Output: +```json +[ + {{ + "script": "Blueberries are packed with anthocyanins, which are great for your eyes.", + "search_terms": "blueberries, fresh fruit, antioxidant food" + }}, + {{ + "script": "These antioxidants protect the retina from damage.", + "search_terms": "close-up of eye, retina scan, vision test" + }} +] +``` + +**Video Script to Process:** +``` +{video_script} +``` + +**Your JSON Output (must be a valid JSON array):** +""" + # return [] + + logger.info(f"Generating storyboard for subject: {video_subject}") + response_str = _generate_response(prompt) + + try: + # The model should return a valid JSON array string. + # Find the start and end of the JSON array. + json_start = response_str.find('[') + json_end = response_str.rfind(']') + if json_start != -1 and json_end != -1 and json_start < json_end: + json_str = response_str[json_start:json_end+1] + storyboard = json.loads(json_str) + logger.success("Successfully parsed storyboard from LLM response.") + return storyboard + else: + logger.error(f"Could not find a valid JSON array in the response. Raw response: {response_str}") + return [] + except json.JSONDecodeError: + logger.error(f"Failed to parse JSON. Raw response: {response_str}") + # Fallback logic can be added here if needed, e.g., using regex to extract JSON. + return [] + + +# ... (您的其他函数和代码保持不变) if __name__ == "__main__": @@ -479,4 +572,42 @@ if __name__ == "__main__": print(search_terms) print("-----输出包含的场景数量-----") print(len(search_terms)) - \ No newline at end of file + +def generate_video_category(video_subject: str) -> str: + """ + Selects the most appropriate video category from a predefined list based on the video subject. + """ + prompt = f""" +# Role: Video Category Selector + +## Goal: +Based on the provided 'Video Subject', select the ONE most suitable category from the `Category List` that best represents the subject. Your response must be only the single category name. + +## Category List: +backgrounds, fashion, nature, science, education, feelings, health, people, religion, places, animals, industry, computer, food, sports, transportation, travel, buildings, business, music + +## Instructions: +- Analyze the 'Video Subject'. +- Choose the single best-fitting category from the list. +- Respond with ONLY the category name and nothing else. + +## Example: +Video Subject: "The benefits of a ketogenic diet" +Response: health + +Video Subject: "A tour of the Grand Canyon" +Response: travel + +## CONTEXT: +### Video Subject: +{video_subject} +""" + category = _generate_response(prompt).strip().lower() + # Fallback to a default category if the response is invalid + valid_categories = ["backgrounds", "fashion", "nature", "science", "education", "feelings", "health", "people", "religion", "places", "animals", "industry", "computer", "food", "sports", "transportation", "travel", "buildings", "business", "music"] + if category not in valid_categories: + logger.warning(f"Generated category '{category}' is not valid. 
Falling back to 'nature'.") + return "nature" + + logger.success(f"Successfully selected video category: {category}") + return category \ No newline at end of file diff --git a/app/services/material.py b/app/services/material.py index 6c6e6e6..fe01971 100644 --- a/app/services/material.py +++ b/app/services/material.py @@ -102,6 +102,8 @@ def search_videos_pexels( item.provider = "pexels" item.url = best_landscape_file["link"] # 使用最佳版本的链接 item.duration = duration + item.path = "" + item.start_time = 0.0 video_items.append(item) logging.info("选取的Mp4链接地址为{}".format(item.url)) return video_items @@ -177,6 +179,8 @@ def search_videos_pixabay( item.provider = "pixabay" item.url = best_video.get("url") item.duration = duration + item.path = "" + item.start_time = 0.0 video_items.append(item) return video_items @@ -319,73 +323,86 @@ def download_videos( search_terms: List[str], source: str = "pexels", video_aspect: VideoAspect = VideoAspect.portrait, - video_contact_mode: VideoConcatMode = VideoConcatMode.random, + video_concat_mode: VideoConcatMode = VideoConcatMode.random, audio_duration: float = 0.0, max_clip_duration: int = 5, -) -> List[str]: - valid_video_items = [] - valid_video_urls = [] - found_duration = 0.0 - search_videos = search_videos_pexels - search_kwargs = {} - if source == "pixabay": - search_videos = search_videos_pixabay - video_category = "" - if video_subject: - video_category = llm.generate_video_category(video_subject) - if video_category: - search_kwargs['category'] = video_category +) -> List[MaterialInfo]: + """ + Download videos from Pexels or Pixabay based on search terms. + """ + all_video_items: List[MaterialInfo] = [] + for term in search_terms: + if source == "pexels": + video_items = search_videos_pexels( + search_term=term, + minimum_duration=max_clip_duration, + video_aspect=video_aspect, + ) + elif source == "pixabay": + video_items = search_videos_pixabay( + search_term=term, + minimum_duration=max_clip_duration, + video_aspect=video_aspect, + ) + else: + video_items = [] + + logger.info(f"found {len(video_items)} videos for '{term}'") + all_video_items.extend(video_items) - for search_term in search_terms: - video_items = search_videos( - search_term=search_term, - minimum_duration=max_clip_duration, - video_aspect=video_aspect, - **search_kwargs, - ) - logger.info(f"found {len(video_items)} videos for '{search_term}'") + # Remove duplicates and calculate total duration + unique_video_items = [] + seen_urls = set() + for item in all_video_items: + if item.url not in seen_urls: + unique_video_items.append(item) + seen_urls.add(item.url) - for item in video_items: - if item.url not in valid_video_urls: - valid_video_items.append(item) - valid_video_urls.append(item.url) - found_duration += item.duration + if video_concat_mode == VideoConcatMode.random: + random.shuffle(unique_video_items) - logger.info( - f"found total videos: {len(valid_video_items)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds" - ) - video_paths = [] + found_duration = sum(item.duration for item in unique_video_items) + logger.info(f"found total unique videos: {len(unique_video_items)}, required duration: {audio_duration:.4f} seconds, found duration: {found_duration:.2f} seconds") + logger.info(f"Video download list (first 5): {[item.url for item in unique_video_items[:5]]}") - material_directory = config.app.get("material_directory", "").strip() - if material_directory == "task": - material_directory = utils.task_dir(task_id) - elif 
material_directory and not os.path.isdir(material_directory): - material_directory = "" + if not unique_video_items: + logger.warning("No videos found for the given search terms.") + return [] - if video_contact_mode.value == VideoConcatMode.random.value: - random.shuffle(valid_video_items) + if found_duration < audio_duration: + logger.warning(f"total duration of found videos ({found_duration:.2f}s) is less than audio duration ({audio_duration:.2f}s).") - total_duration = 0.0 - for item in valid_video_items: + downloaded_materials: List[MaterialInfo] = [] + downloaded_duration = 0.0 + + for item in unique_video_items: + if downloaded_duration >= audio_duration: + logger.info(f"total duration of downloaded videos: {downloaded_duration:.2f} seconds, skip downloading more") + break + try: logger.info(f"downloading video: {item.url}") - saved_video_path = save_video( - video_url=item.url, save_dir=material_directory - ) - if saved_video_path: - logger.info(f"video saved: {saved_video_path}") - video_paths.append(saved_video_path) - seconds = min(max_clip_duration, item.duration) - total_duration += seconds - if total_duration > audio_duration: - logger.info( - f"total duration of downloaded videos: {total_duration} seconds, skip downloading more" - ) - break + file_path = save_video(video_url=item.url) + if file_path: + logger.info(f"video saved: {file_path}") + material_info = MaterialInfo() + material_info.path = file_path + material_info.start_time = 0.0 + ffprobe_info = _get_video_info_ffprobe(file_path) + if ffprobe_info and ffprobe_info.get("duration"): + material_info.duration = float(ffprobe_info.get("duration")) + downloaded_duration += material_info.duration + else: + material_info.duration = item.duration # fallback + downloaded_duration += item.duration + + downloaded_materials.append(material_info) + except Exception as e: - logger.error(f"failed to download video: {utils.to_json(item)} => {str(e)}") - logger.success(f"downloaded {len(video_paths)} videos") - return video_paths + logger.error(f"failed to download video: {item.url} => {e}") + + logger.success(f"downloaded {len(downloaded_materials)} videos") + return downloaded_materials # 以下为调试入口,仅供开发测试 diff --git a/app/services/subtitle.py b/app/services/subtitle.py index ca0f247..06e60f1 100644 --- a/app/services/subtitle.py +++ b/app/services/subtitle.py @@ -278,6 +278,77 @@ def correct(subtitle_file, video_script): logger.success("Subtitle is correct") +def combine_srt_files(srt_files: list, output_file: str): + """ + Combines multiple SRT files into a single file, adjusting timestamps sequentially. 
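+    Timestamps are shifted so that each file's entries continue from where the
+    previous file ended, entries are renumbered from 1, and missing input files
+    are skipped with a warning.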
+ """ + logger.info(f"Combining {len(srt_files)} SRT files into {output_file}") + combined_subtitles = [] + last_end_time_seconds = 0.0 + entry_index = 1 + + for srt_file in srt_files: + if not os.path.exists(srt_file): + logger.warning(f"SRT file not found, skipping: {srt_file}") + continue + try: + with open(srt_file, 'r', encoding='utf-8') as f: + content = f.read() + + entries = re.split(r'\n\s*\n', content.strip()) + for entry in entries: + if not entry.strip(): + continue + + lines = entry.split('\n') + if len(lines) < 3: + continue + + # Parse timestamp + timestamp_line = lines[1] + start_time_str, end_time_str = timestamp_line.split(' --> ') + + def srt_time_to_seconds(t_str): + h, m, s_ms = t_str.split(':') + s, ms = s_ms.split(',') + return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0 + + start_time = srt_time_to_seconds(start_time_str) + end_time = srt_time_to_seconds(end_time_str) + duration = end_time - start_time + + # Adjust time + new_start_time = last_end_time_seconds + new_end_time = new_start_time + duration + + def seconds_to_srt_time(seconds): + h = int(seconds // 3600) + m = int((seconds % 3600) // 60) + s = int(seconds % 60) + ms = int((seconds * 1000) % 1000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + new_start_str = seconds_to_srt_time(new_start_time) + new_end_str = seconds_to_srt_time(new_end_time) + + # Append to combined list + text = '\n'.join(lines[2:]) + combined_subtitles.append(f"{entry_index}\n{new_start_str} --> {new_end_str}\n{text}") + entry_index += 1 + + # Update last end time for the next file + last_end_time_seconds = new_end_time + + except Exception as e: + logger.error(f"Error processing SRT file {srt_file}: {e}") + + # Write combined SRT to output file + with open(output_file, 'w', encoding='utf-8') as f: + f.write('\n\n'.join(combined_subtitles) + '\n\n') + + logger.success(f"Successfully combined SRT files into {output_file}") + + if __name__ == "__main__": task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072" task_dir = utils.task_dir(task_id) diff --git a/app/services/task.py b/app/services/task.py index fe82689..f0928cd 100644 --- a/app/services/task.py +++ b/app/services/task.py @@ -7,10 +7,209 @@ from loguru import logger from app.config import config from app.models import const -from app.models.schema import VideoConcatMode, VideoParams -from app.services import llm, material, subtitle, video, voice +from app.models.schema import ( + VideoConcatMode, + VideoParams, + VideoAspect, + MaterialInfo, +) +from app.services import llm, material, subtitle, voice, video +from app.services import video as video_utils from app.services import state as sm from app.utils import utils +import time + +# ... 您已有的 start 函数 ... + +# =================================================================== +# 新增的、实现音画同步的主任务函数 +# =================================================================== +def start_storyboard_task(task_id, params: VideoParams): + sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING) + workdir = utils.task_dir(task_id) + + # 1. Generate Storyboard + logger.info("--- Step 1: Generating Storyboard ---") + video_script = params.video_script + if not video_script: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Video script is empty.") + return + + storyboard = llm.generate_storyboard(params.video_subject, video_script) + if not storyboard: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to generate storyboard.") + return + + # 2. 
Process each segment + logger.info(f"--- Step 2: Processing {len(storyboard)} video segments ---") + segment_video_paths = [] + segment_audio_paths = [] + segment_srt_paths = [] + total_duration = 0 + last_used_keywords = set() + + for i, segment in enumerate(storyboard): + try: + logger.info(f"--- Processing segment {i + 1} ---") + segment_script = segment.get("script") + if not segment_script: + logger.warning(f"Segment {i + 1} has no script, skipping") + continue + + search_terms_str = segment.get("search_terms", "") + search_terms = [term.strip() for term in search_terms_str.split(',') if term.strip()] + if not search_terms: + logger.warning(f"Segment {i + 1} has no search terms, skipping") + continue + + # Keyword Guard: Check for repetitive keywords + current_keywords = set(search_terms) + if i > 0 and current_keywords == last_used_keywords: + logger.warning(f"Segment {i + 1} uses the same keywords as the previous one ({search_terms_str}). Reusing last video clip to avoid visual repetition.") + if segment_video_paths: + segment_video_paths.append(segment_video_paths[-1]) # Reuse the last processed video clip + segment_audio_paths.append(segment_audio_paths[-1]) # Reuse the last audio clip + continue # Skip processing for this segment + + last_used_keywords = current_keywords + + # a. Generate audio and subtitles for the segment + segment_audio_file = path.join(workdir, f"segment_{i + 1}.mp3") + segment_srt_file = path.join(workdir, f"segment_{i + 1}.srt") + sub_maker = voice.tts( + text=segment_script, + voice_name=voice.parse_voice_name(params.voice_name), + voice_rate=params.voice_rate, + voice_file=segment_audio_file, + ) + if not sub_maker: + raise Exception(f"Failed to generate audio for segment {i + 1}") + + voice.create_subtitle( + sub_maker=sub_maker, text=segment_script, subtitle_file=segment_srt_file + ) + audio_duration = voice.get_audio_duration(sub_maker) + total_duration += audio_duration + + # b. Search and download video materials for each term + video_materials = [] + downloaded_duration = 0 + for term in search_terms: + if downloaded_duration >= audio_duration: + break + term_materials = material.download_videos( + task_id=task_id, + video_subject=params.video_subject, + search_terms=[term], # Pass one term at a time + source=params.video_source, + video_aspect=params.video_aspect, + video_concat_mode=params.video_concat_mode, + audio_duration=audio_duration - downloaded_duration, + max_clip_duration=params.max_clip_duration, + ) + if term_materials: + video_materials.extend(term_materials) + downloaded_duration = sum(m.duration for m in video_materials) + if not video_materials: + raise Exception(f"Failed to find materials for segment {i + 1}") + + # c. 
Create a video clip matching the audio duration + segment_video_path = path.join(workdir, f"segment_video_{i + 1}.mp4") + clip_created = video.create_video_clip_from_materials( + video_materials=video_materials, + audio_duration=audio_duration, + max_clip_duration=params.max_clip_duration, + video_aspect=params.video_aspect, + output_path=segment_video_path + ) + if not clip_created: + raise Exception(f"Failed to create video clip for segment {i + 1}") + + segment_video_paths.append(segment_video_path) + segment_audio_paths.append(segment_audio_file) + segment_srt_paths.append(segment_srt_file) + + except Exception as e: + logger.error(f"Error processing segment {i + 1}: {e}") + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message=f"Error in segment {i + 1}: {e}") + return + + # Check if any segments were processed + if not segment_video_paths: + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to process any segments.") + logger.error("Failed to process any segments. Aborting video generation.") + return + + # 3. Combine all segments + logger.info("--- Step 3: Combining all video segments ---") + # a. Combine audios + combined_audio_path = path.join(workdir, "voice.mp3") + if not voice.combine_audio_files(segment_audio_paths, combined_audio_path): + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to combine audio files.") + return + + # b. Combine videos + video_transition_mode = params.video_transition_mode + concatenated_video_path = path.join(workdir, "concatenated_video.mp4") + if not video.concatenate_videos(segment_video_paths, concatenated_video_path, transition_mode=video_transition_mode): + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to concatenate videos.") + return + + # c. Combine subtitles + combined_srt_path = path.join(workdir, "subtitles.srt") + subtitle.combine_srt_files(segment_srt_paths, combined_srt_path) + + # 4. Final video assembly + logger.info("--- Step 4: Final video assembly ---") + # a. Add audio to concatenated video + video_with_audio_path = path.join(workdir, "video_with_audio.mp4") + if not video.add_audio_to_video(concatenated_video_path, combined_audio_path, video_with_audio_path): + sm.state.update_task(task_id, state=const.TASK_STATE_FAILED, status_message="Failed to add audio to video.") + return + + # b. Add background music + video_with_bgm_path = path.join(workdir, "video_with_bgm.mp4") + bgm_file = video.get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) + if bgm_file: + if not video.add_bgm_to_video( + input_video_path=video_with_audio_path, + bgm_path=bgm_file, + bgm_volume=params.bgm_volume, + output_video_path=video_with_bgm_path + ): + logger.warning("Failed to mix BGM. Proceeding without it.") + video_with_bgm_path = video_with_audio_path # Fallback + else: + video_with_bgm_path = video_with_audio_path # No BGM requested + + # c. Add subtitles + final_video_path = path.join(workdir, f"final_{task_id}.mp4") + video.add_subtitles_to_video( + video_path=video_with_bgm_path, + srt_path=combined_srt_path, + font_name=params.font_name, + font_size=params.font_size, + text_fore_color=params.text_fore_color, + stroke_color=params.stroke_color, + stroke_width=params.stroke_width, + subtitle_position=params.subtitle_position, + custom_position=params.custom_position, + output_path=final_video_path + ) + + # 5. 
Cleanup + logger.info("--- Step 5: Cleaning up temporary files ---") + cleanup_files = segment_video_paths + segment_audio_paths + segment_srt_paths + [combined_audio_path, concatenated_video_path, combined_srt_path, video_with_audio_path, video_with_bgm_path] + for item in cleanup_files: + if item and item != final_video_path and os.path.exists(item): + os.remove(item) + + sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, video_path=final_video_path) + logger.success(f"Task {task_id} completed successfully. Final video: {final_video_path}") + + + + return {"videos": [final_video_path]} def generate_script(task_id, params): @@ -127,7 +326,7 @@ def get_video_materials(task_id, params, video_terms, audio_duration): if params.video_source == "local": logger.info("\n\n## preprocess local materials") materials = video.preprocess_video( - materials=params.video_materials, clip_duration=params.video_clip_duration + materials=params.video_materials, clip_duration=params.max_clip_duration ) if not materials: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) @@ -140,12 +339,13 @@ def get_video_materials(task_id, params, video_terms, audio_duration): logger.info(f"\n\n## downloading videos from {params.video_source}") downloaded_videos = material.download_videos( task_id=task_id, + video_subject=params.video_subject, search_terms=video_terms, source=params.video_source, video_aspect=params.video_aspect, video_contact_mode=params.video_concat_mode, audio_duration=audio_duration * params.video_count, - max_clip_duration=params.video_clip_duration, + max_clip_duration=params.max_clip_duration, ) if not downloaded_videos: sm.state.update_task(task_id, state=const.TASK_STATE_FAILED) @@ -173,14 +373,14 @@ def generate_final_videos( utils.task_dir(task_id), f"combined-{index}.mp4" ) logger.info(f"\n\n## combining video: {index} => {combined_video_path}") - video.combine_videos_ffmpeg( + video_utils.combine_videos_ffmpeg( combined_video_path=combined_video_path, video_paths=downloaded_videos, audio_file=audio_file, video_aspect=params.video_aspect, video_concat_mode=video_concat_mode, video_transition_mode=video_transition_mode, - max_clip_duration=params.video_clip_duration, + max_clip_duration=params.max_clip_duration, threads=params.n_threads, ) @@ -190,7 +390,7 @@ def generate_final_videos( final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4") logger.info(f"\n\n## generating video: {index} => {final_video_path}") - video.generate_video( + video_utils.generate_video( video_path=combined_video_path, audio_path=audio_file, subtitle_path=subtitle_path, diff --git a/app/services/utils/video_effects.py b/app/services/utils/video_effects.py deleted file mode 100644 index 6cba8eb..0000000 --- a/app/services/utils/video_effects.py +++ /dev/null @@ -1,21 +0,0 @@ -from moviepy import Clip, vfx - - -# FadeIn -def fadein_transition(clip: Clip, t: float) -> Clip: - return clip.with_effects([vfx.FadeIn(t)]) - - -# FadeOut -def fadeout_transition(clip: Clip, t: float) -> Clip: - return clip.with_effects([vfx.FadeOut(t)]) - - -# SlideIn -def slidein_transition(clip: Clip, t: float, side: str) -> Clip: - return clip.with_effects([vfx.SlideIn(t, side)]) - - -# SlideOut -def slideout_transition(clip: Clip, t: float, side: str) -> Clip: - return clip.with_effects([vfx.SlideOut(t, side)]) diff --git a/app/services/video.py b/app/services/video.py index 171316a..5d18d0f 100644 --- a/app/services/video.py +++ b/app/services/video.py @@ -2,491 +2,459 @@ import glob 
import itertools import os import random -import gc import shutil +import subprocess +import json from typing import List from loguru import logger -from moviepy import ( - AudioFileClip, - ColorClip, - CompositeAudioClip, - CompositeVideoClip, - ImageClip, - TextClip, - VideoFileClip, - afx, - concatenate_videoclips, -) -from moviepy.video.tools.subtitles import SubtitlesClip -from PIL import ImageFont -from app.models import const from app.models.schema import ( - MaterialInfo, VideoAspect, - VideoConcatMode, VideoParams, + VideoConcatMode, VideoTransitionMode, ) -from app.services.utils import video_effects + from app.utils import utils -class SubClippedVideoClip: - def __init__(self, file_path, start_time=None, end_time=None, width=None, height=None, duration=None): - self.file_path = file_path - self.start_time = start_time - self.end_time = end_time - self.width = width - self.height = height - if duration is None: - self.duration = end_time - start_time - else: - self.duration = duration - - def __str__(self): - return f"SubClippedVideoClip(file_path={self.file_path}, start_time={self.start_time}, end_time={self.end_time}, duration={self.duration}, width={self.width}, height={self.height})" - - -audio_codec = "aac" -video_codec = "libx264" -fps = 30 - -def close_clip(clip): - if clip is None: - return - - try: - # close main resources - if hasattr(clip, 'reader') and clip.reader is not None: - clip.reader.close() - - # close audio resources - if hasattr(clip, 'audio') and clip.audio is not None: - if hasattr(clip.audio, 'reader') and clip.audio.reader is not None: - clip.audio.reader.close() - del clip.audio - - # close mask resources - if hasattr(clip, 'mask') and clip.mask is not None: - if hasattr(clip.mask, 'reader') and clip.mask.reader is not None: - clip.mask.reader.close() - del clip.mask - - # handle child clips in composite clips - if hasattr(clip, 'clips') and clip.clips: - for child_clip in clip.clips: - if child_clip is not clip: # avoid possible circular references - close_clip(child_clip) - - # clear clip list - if hasattr(clip, 'clips'): - clip.clips = [] - - except Exception as e: - logger.error(f"failed to close clip: {str(e)}") - - del clip - gc.collect() - -def delete_files(files: List[str] | str): - if isinstance(files, str): - files = [files] - - for file in files: - try: - os.remove(file) - except: - pass - -def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""): - if not bgm_type: - return "" - - if bgm_file and os.path.exists(bgm_file): - return bgm_file +def get_bgm_file(bgm_type: str, bgm_file: str): if bgm_type == "random": - suffix = "*.mp3" - song_dir = utils.song_dir() - files = glob.glob(os.path.join(song_dir, suffix)) - return random.choice(files) + bgm_dir = utils.resource_dir("bgm") + if not os.path.exists(bgm_dir): + logger.warning(f"BGM directory not found: {bgm_dir}, trying assets/bgm") + bgm_dir = utils.resource_dir("assets/bgm") + if not os.path.exists(bgm_dir): + logger.warning(f"BGM directory not found: {bgm_dir}, skip adding BGM.") + return "" + + bgm_files = glob.glob(os.path.join(bgm_dir, "*.mp3")) + if not bgm_files: + logger.warning(f"No BGM files found in {bgm_dir}, skip adding BGM.") + return "" + return random.choice(bgm_files) + + if bgm_type == "local": + return bgm_file return "" -# def combine_videos( -# combined_video_path: str, -# video_paths: List[str], -# audio_file: str, -# video_aspect: VideoAspect = VideoAspect.portrait, -# video_concat_mode: VideoConcatMode = VideoConcatMode.random, -# video_transition_mode: 
VideoTransitionMode = None, -# max_clip_duration: int = 5, -# threads: int = 2, -# ) -> str: -# audio_clip = AudioFileClip(audio_file) -# audio_duration = audio_clip.duration -# logger.info(f"audio duration: {audio_duration} seconds") -# # Required duration of each clip -# req_dur = audio_duration / len(video_paths) -# req_dur = max_clip_duration -# logger.info(f"maximum clip duration: {req_dur} seconds") -# output_dir = os.path.dirname(combined_video_path) +def _run_ffmpeg_command(command: list): + try: + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, creationflags=subprocess.CREATE_NO_WINDOW) + stdout, stderr = process.communicate() + if process.returncode != 0: + logger.error(f"FFmpeg command failed with return code {process.returncode}") + logger.error(f"FFmpeg stderr: {stderr}") + return False + logger.debug(f"FFmpeg command successful: {' '.join(command)}") + logger.debug(f"FFmpeg stderr: {stderr}") + return True + except FileNotFoundError: + logger.error("ffmpeg or ffprobe not found. Please ensure they are installed and in your PATH.") + return False + except Exception as e: + logger.error(f"An error occurred while running ffmpeg: {e}") + return False -# aspect = VideoAspect(video_aspect) -# video_width, video_height = aspect.to_resolution() -# processed_clips = [] -# subclipped_items = [] -# video_duration = 0 -# for video_path in video_paths: -# clip = VideoFileClip(video_path) -# clip_duration = clip.duration -# clip_w, clip_h = clip.size -# close_clip(clip) - -# start_time = 0 +def get_video_duration(video_path: str) -> float: + """Get the duration of a video using ffprobe.""" + command = [ + 'ffprobe', + '-v', 'error', + '-show_entries', 'format=duration', + '-of', 'default=noprint_wrappers=1:nokey=1', + video_path + ] + try: + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True) + return float(result.stdout) + except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e: + logger.error(f"Error getting duration for {video_path}: {e}") + return 0.0 -# while start_time < clip_duration: -# end_time = min(start_time + max_clip_duration, clip_duration) -# if clip_duration - start_time >= max_clip_duration: -# subclipped_items.append(SubClippedVideoClip(file_path= video_path, start_time=start_time, end_time=end_time, width=clip_w, height=clip_h)) -# start_time = end_time -# if video_concat_mode.value == VideoConcatMode.sequential.value: -# break -# # random subclipped_items order -# if video_concat_mode.value == VideoConcatMode.random.value: -# random.shuffle(subclipped_items) - -# logger.debug(f"total subclipped items: {len(subclipped_items)}") - -# # Add downloaded clips over and over until the duration of the audio (max_duration) has been reached -# for i, subclipped_item in enumerate(subclipped_items): -# if video_duration > audio_duration: -# break - -# logger.debug(f"processing clip {i+1}: {subclipped_item.width}x{subclipped_item.height}, current duration: {video_duration:.2f}s, remaining: {audio_duration - video_duration:.2f}s") - -# try: -# clip = VideoFileClip(subclipped_item.file_path).subclipped(subclipped_item.start_time, subclipped_item.end_time) -# clip_duration = clip.duration -# # Not all videos are same size, so we need to resize them -# clip_w, clip_h = clip.size -# if clip_w != video_width or clip_h != video_height: -# clip_ratio = clip.w / clip.h -# video_ratio = video_width / video_height -# logger.debug(f"resizing clip, source: {clip_w}x{clip_h}, 
ratio: {clip_ratio:.2f}, target: {video_width}x{video_height}, ratio: {video_ratio:.2f}") - -# if clip_ratio == video_ratio: -# clip = clip.resized(new_size=(video_width, video_height)) -# else: -# if clip_ratio > video_ratio: -# scale_factor = video_width / clip_w -# else: -# scale_factor = video_height / clip_h +def delete_files(files: List[str] | str): + if isinstance(files, str): + files = [files] + for file in files: + if os.path.exists(file): + try: + os.remove(file) + except Exception as e: + logger.warning(f"Failed to delete file {file}: {e}") -# new_width = int(clip_w * scale_factor) -# new_height = int(clip_h * scale_factor) -# background = ColorClip(size=(video_width, video_height), color=(0, 0, 0)).with_duration(clip_duration) -# clip_resized = clip.resized(new_size=(new_width, new_height)).with_position("center") -# clip = CompositeVideoClip([background, clip_resized]) - -# shuffle_side = random.choice(["left", "right", "top", "bottom"]) -# if video_transition_mode.value == VideoTransitionMode.none.value: -# clip = clip -# elif video_transition_mode.value == VideoTransitionMode.fade_in.value: -# clip = video_effects.fadein_transition(clip, 1) -# elif video_transition_mode.value == VideoTransitionMode.fade_out.value: -# clip = video_effects.fadeout_transition(clip, 1) -# elif video_transition_mode.value == VideoTransitionMode.slide_in.value: -# clip = video_effects.slidein_transition(clip, 1, shuffle_side) -# elif video_transition_mode.value == VideoTransitionMode.slide_out.value: -# clip = video_effects.slideout_transition(clip, 1, shuffle_side) -# elif video_transition_mode.value == VideoTransitionMode.shuffle.value: -# transition_funcs = [ -# lambda c: video_effects.fadein_transition(c, 1), -# lambda c: video_effects.fadeout_transition(c, 1), -# lambda c: video_effects.slidein_transition(c, 1, shuffle_side), -# lambda c: video_effects.slideout_transition(c, 1, shuffle_side), -# ] -# shuffle_transition = random.choice(transition_funcs) -# clip = shuffle_transition(clip) +def create_video_clip_from_materials(video_materials: list, audio_duration: float, max_clip_duration: int, video_aspect: VideoAspect, output_path: str): + logger.info(f"Optimized: Creating video clip for {output_path} with duration {audio_duration:.2f}s using ffmpeg") -# if clip.duration > max_clip_duration: -# clip = clip.subclipped(0, max_clip_duration) - -# # wirte clip to temp file -# clip_file = f"{output_dir}/temp-clip-{i+1}.mp4" -# clip.write_videofile(clip_file, logger=None, fps=fps, codec=video_codec) - -# close_clip(clip) - -# processed_clips.append(SubClippedVideoClip(file_path=clip_file, duration=clip.duration, width=clip_w, height=clip_h)) -# video_duration += clip.duration - -# except Exception as e: -# logger.error(f"failed to process clip: {str(e)}") - -# # loop processed clips until the video duration matches or exceeds the audio duration. 
-# if video_duration < audio_duration: -# logger.warning(f"video duration ({video_duration:.2f}s) is shorter than audio duration ({audio_duration:.2f}s), looping clips to match audio length.") -# base_clips = processed_clips.copy() -# for clip in itertools.cycle(base_clips): -# if video_duration >= audio_duration: -# break -# processed_clips.append(clip) -# video_duration += clip.duration -# logger.info(f"video duration: {video_duration:.2f}s, audio duration: {audio_duration:.2f}s, looped {len(processed_clips)-len(base_clips)} clips") - -# # merge video clips progressively, avoid loading all videos at once to avoid memory overflow -# logger.info("starting clip merging process") -# if not processed_clips: -# logger.warning("no clips available for merging") -# return combined_video_path - -# # if there is only one clip, use it directly -# if len(processed_clips) == 1: -# logger.info("using single clip directly") -# shutil.copy(processed_clips[0].file_path, combined_video_path) -# delete_files(processed_clips) -# logger.info("video combining completed") -# return combined_video_path - -# # create initial video file as base -# base_clip_path = processed_clips[0].file_path -# temp_merged_video = f"{output_dir}/temp-merged-video.mp4" -# temp_merged_next = f"{output_dir}/temp-merged-next.mp4" - -# # copy first clip as initial merged video -# shutil.copy(base_clip_path, temp_merged_video) - -# # merge remaining video clips one by one -# for i, clip in enumerate(processed_clips[1:], 1): -# logger.info(f"merging clip {i}/{len(processed_clips)-1}, duration: {clip.duration:.2f}s") - -# try: -# # load current base video and next clip to merge -# base_clip = VideoFileClip(temp_merged_video) -# next_clip = VideoFileClip(clip.file_path) - -# # merge these two clips -# merged_clip = concatenate_videoclips([base_clip, next_clip]) + if audio_duration <= 0: + logger.warning("Audio duration is zero or negative, cannot create video clip.") + return False -# # save merged result to temp file -# merged_clip.write_videofile( -# filename=temp_merged_next, -# threads=threads, -# logger=None, -# temp_audiofile_path=output_dir, -# audio_codec=audio_codec, -# fps=fps, -# ) -# close_clip(base_clip) -# close_clip(next_clip) -# close_clip(merged_clip) - -# # replace base file with new merged file -# delete_files(temp_merged_video) -# os.rename(temp_merged_next, temp_merged_video) - -# except Exception as e: -# logger.error(f"failed to merge clip: {str(e)}") -# continue - -# # after merging, rename final result to target file name -# os.rename(temp_merged_video, combined_video_path) - -# # clean temp files -# clip_files = [clip.file_path for clip in processed_clips] -# delete_files(clip_files) - -# logger.info("video combining completed") -# return combined_video_path + total_duration_of_materials = sum(m.duration for m in video_materials) + if total_duration_of_materials < audio_duration: + logger.warning(f"Total material duration ({total_duration_of_materials}s) is less than audio duration ({audio_duration}s). Video will be shorter.") + audio_duration = total_duration_of_materials -import subprocess + w, h = video_aspect.to_resolution() + # Use the most robust method: scale to fill, then crop to center. + # This avoids black bars by ensuring the video fills the frame, cropping excess. 
+ scale_filter = f"scale={w}:{h}:force_original_aspect_ratio=increase" + crop_filter = f"crop={w}:{h}" + fade_in_filter = "fade=in:st=0:d=0.5" -def combine_videos_ffmpeg( - combined_video_path: str, - video_paths: List[str], - audio_file: str, - video_aspect: VideoAspect = VideoAspect.portrait, - video_concat_mode: VideoConcatMode = VideoConcatMode.random, - video_transition_mode: VideoTransitionMode = None, # 注意:FFmpeg转场实现方式不同 - max_clip_duration: int = 5, - threads: int = 2, -) -> str: - """ - 使用 FFmpeg 和 GPU 加速来合并视频,以获得极致的性能和画质。 - """ - audio_clip = AudioFileClip(audio_file) - audio_duration = audio_clip.duration - close_clip(audio_clip) - logger.info(f"音频时长: {audio_duration:.2f} 秒") + filter_complex_parts = [] + concat_inputs = "" + time_so_far = 0.0 - output_dir = os.path.dirname(combined_video_path) - aspect = VideoAspect(video_aspect) - video_width, video_height = aspect.to_resolution() + # If only one material, just trim and process it + if len(video_materials) == 1: + material = video_materials[0] + duration_needed = audio_duration + start_time = material.start_time if material.start_time >= 0 else 0 + trim_filter = f"[0:v]trim=start={start_time}:duration={duration_needed},setpts=PTS-STARTPTS" + sar_filter = "setsar=1" - # --- 步骤 1: 将所有源视频切成小片段信息 --- - subclipped_items = [] - for video_path in video_paths: - # 这里我们仍然用 moviepy 获取视频信息,因为它很方便 - try: - with VideoFileClip(video_path) as clip: - clip_duration = clip.duration - clip_w, clip_h = clip.size - - start_time = 0 - while start_time < clip_duration: - end_time = min(start_time + max_clip_duration, clip_duration) - if end_time - start_time >= 1.0: # 确保片段至少1秒 - subclipped_items.append(SubClippedVideoClip( - file_path=video_path, - start_time=start_time, - end_time=end_time - )) - start_time += max_clip_duration - if video_concat_mode.value == VideoConcatMode.sequential.value: - break - except Exception as e: - logger.error(f"无法读取视频信息 {video_path}: {e}") - continue + command = [ + "ffmpeg", + "-y", + "-i", material.path, + "-vf", f"{trim_filter},{sar_filter},{scale_filter},{crop_filter},{fade_in_filter}", + "-an", # remove audio + "-c:v", "libx264", + "-preset", "ultrafast", + "-crf", "23", + "-maxrate", "10M", + "-bufsize", "20M", + "-r", "30", + output_path + ] + return _run_ffmpeg_command(command) - if video_concat_mode.value == VideoConcatMode.random.value: - random.shuffle(subclipped_items) - - # --- 步骤 2: 使用 FFmpeg 处理每个小片段并保存为临时文件 --- - processed_files = [] - total_video_duration = 0 - - for i, item in enumerate(subclipped_items): - if total_video_duration >= audio_duration: + # If multiple materials, create clips and concatenate + for i, material in enumerate(video_materials): + if time_so_far >= audio_duration: break - temp_clip_path = os.path.join(output_dir, f"temp-clip-{i}.mp4") - clip_duration = item.end_time - item.start_time - - # 构建FFmpeg命令 - # 滤镜链: 缩放以适应目标尺寸(保持宽高比), 然后用黑边填充到目标分辨率 - vf_filter = f"scale={video_width}:{video_height}:force_original_aspect_ratio=decrease,pad={video_width}:{video_height}:-1:-1:color=black" - - # 添加转场效果 (这里只演示淡入,其他转场需要更复杂的滤镜) - if video_transition_mode and video_transition_mode.value != VideoTransitionMode.none.value: - # FFmpeg的淡入效果: fade=type=in:start_time=0:duration=1 - fade_duration = min(1.0, clip_duration) # 淡入时长不超过片段时长 - vf_filter += f",fade=t=in:st=0:d={fade_duration}" + duration_from_this_clip = min(material.duration, audio_duration - time_so_far, max_clip_duration) + if duration_from_this_clip <= 0: + continue - command = command = [ - "ffmpeg", "-y", - "-hwaccel", "auto", - 
"-ss", str(item.start_time), - "-to", str(item.end_time), - "-i", item.file_path, - "-vf", vf_filter, - "-c:v", "h264_nvenc", - "-preset", "p5", - "-b:v", "50M", - "-r", str(fps), # <--- 强制输出帧率为30 - "-video_track_timescale", "30000", # <--- 强制设置一个标准的时间基 - "-an", # <--- 强制移除所有音频轨道,避免音频参数不一致 - "-threads", str(threads), - temp_clip_path -] - - logger.debug(f"正在处理片段 {i}: {' '.join(command)}") - try: - subprocess.run(command, check=True, capture_output=True) - processed_files.append(temp_clip_path) - total_video_duration += clip_duration - except subprocess.CalledProcessError as e: - logger.error(f"处理片段失败 {item.file_path}: {e.stderr.decode('utf-8')}") + start_time = material.start_time if material.start_time >= 0 else 0 + trim_filter = f"[{i}:v]trim=start={start_time}:duration={duration_from_this_clip},setpts=PTS-STARTPTS" + sar_filter = "setsar=1" + filter_complex_parts.append(f"{trim_filter},{sar_filter},{scale_filter},{crop_filter}[v{i}]" ) + concat_inputs += f"[v{i}]" + time_so_far += duration_from_this_clip - # --- 步骤 3: 使用 FFmpeg concat demuxer 极速合并所有临时片段 --- - concat_list_path = os.path.join(output_dir, "concat_list.txt") - with open(concat_list_path, "w", encoding="utf-8") as f: - for file_path in processed_files: - # FFmpeg concat需要特定的格式 - f.write(f"file '{file_path.replace(os.sep, '/')}'\n") + if not filter_complex_parts: + logger.error("No video clips could be prepared for concatenation.") + return False - # 构建合并命令 - merge_command = [ + concat_filter = f"{concat_inputs}concat=n={len(concat_inputs)//3}:v=1:a=0[outv]" + filter_complex_parts.append(concat_filter) + + command = [ "ffmpeg", "-y", - "-f", "concat", - "-safe", "0", - "-i", concat_list_path, - "-c", "copy", # 关键:直接复制流,不重新编码,速度极快 - combined_video_path ] - - logger.info("开始极速合并所有片段...") + for material in video_materials[:len(concat_inputs)//3]: + command.extend(["-i", material.path]) + + command.extend([ + "-filter_complex", ';'.join(filter_complex_parts), + "-map", "[outv]", + "-c:v", "libx264", + "-an", + "-r", "30", + output_path + ]) + + return _run_ffmpeg_command(command) + + +def concatenate_videos(video_paths: List[str], output_path: str, transition_mode: VideoTransitionMode = VideoTransitionMode.none): + logger.info(f"Concatenating {len(video_paths)} videos into {output_path} with transition: {transition_mode.name}") + + if not video_paths: + logger.error("No video paths provided for concatenation.") + return False + + if len(video_paths) == 1: + logger.info("Only one video, copying to output path.") + shutil.copy(video_paths[0], output_path) + return True + + use_transition = transition_mode != VideoTransitionMode.none + + # Nested function for fallback to simple concatenation + def fallback_concat(): + logger.info("Using simple concat demuxer (no transitions).") + temp_file_path = os.path.join(os.path.dirname(output_path), "temp_video_list.txt") + try: + with open(temp_file_path, "w", encoding="utf-8") as f: + for video_path in video_paths: + # Normalize path for ffmpeg concat demuxer, which is sensitive to backslashes + normalized_path = video_path.replace('\\', '/') + f.write(f"file '{normalized_path}'\n") + + command = [ + "ffmpeg", "-y", + "-f", "concat", + "-safe", "0", + "-i", temp_file_path, + "-c", "copy", + output_path + ] + + if _run_ffmpeg_command(command): + logger.success(f"Successfully concatenated videos using concat demuxer: {output_path}") + return True + else: + logger.error("Failed to concatenate videos using concat demuxer.") + return False + finally: + delete_files(temp_file_path) + + if not 
use_transition:
+        return fallback_concat()
+
+    # Proceed with transitions using xfade
+    logger.info("Using xfade for transitions.")
+    transition_duration = 0.5  # seconds
+    video_durations = [get_video_duration(p) for p in video_paths]
+
+    if any(d == 0.0 for d in video_durations):
+        logger.warning("Could not determine duration for all video clips, falling back to simple concatenation.")
+        return fallback_concat()
+
+    command = ["ffmpeg", "-y"]
+    for path in video_paths:
+        command.extend(["-i", path])
+
+    filter_chains = []
+    # Initial stream is [0:v]
+    last_stream_name = "[0:v]"
+    total_duration = 0
+
+    for i in range(1, len(video_paths)):
+        total_duration += video_durations[i-1]
+        # Each xfade overlaps the streams by transition_duration, so the running output
+        # is shorter than the raw sum of durations; subtract the accumulated overlap.
+        offset = total_duration - i * transition_duration
+
+        input_stream_name = f"[{i}:v]"
+        output_stream_name = f"[v{i}]"
+
+        filter_chains.append(f"{last_stream_name}{input_stream_name}xfade=transition=fade:duration={transition_duration}:offset={offset}{output_stream_name}")
+        last_stream_name = output_stream_name
+
+    filter_complex = ";".join(filter_chains)
+
+    command.extend([
+        "-filter_complex", filter_complex,
+        "-map", last_stream_name,
+        "-c:v", "libx264",
+        "-movflags", "+faststart",
+        output_path
+    ])
+
+    if _run_ffmpeg_command(command):
+        logger.success(f"Successfully concatenated videos with transitions: {output_path}")
+        return True
+    else:
+        logger.warning("FFmpeg command with transition failed, falling back to simple concatenation.")
+        return fallback_concat()
+
+
+def add_audio_to_video(video_path: str, audio_path: str, output_path: str):
+    video_path = os.path.normpath(video_path)
+    audio_path = os.path.normpath(audio_path)
+    output_path = os.path.normpath(output_path)
+
+    # Check if the video already has an audio stream
+    has_audio_stream = False
+    try:
+        probe_command = [
+            "ffprobe", "-v", "error", "-select_streams", "a",
+            "-show_entries", "stream=codec_type", "-of", "csv=p=0", video_path
+        ]
+        process = subprocess.run(probe_command, check=True, capture_output=True, text=True)
+        if process.stdout.strip():
+            has_audio_stream = True
+    except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        logger.warning(f"Could not probe video for audio stream: {e}")
+
+    if has_audio_stream:
+        command = [
+            "ffmpeg",
+            "-y",
+            "-i", video_path,
+            "-i", audio_path,
+            "-c:v", "copy",
+            "-c:a", "aac",
+            "-map", "0:v:0",
+            "-map", "1:a:0",
+            "-shortest",
+            output_path,
+        ]
+    else:
+        command = [
+            "ffmpeg",
+            "-y",
+            "-i", video_path,
+            "-i", audio_path,
+            "-c:v", "copy",
+            "-c:a", "aac",
+            "-map", "0:v:0",
+            "-map", "1:a:0",
+            output_path,
+        ]
+    return _run_ffmpeg_command(command)
+
+
+def add_bgm_to_video(video_path: str, bgm_path: str, bgm_volume: float, output_path: str) -> bool:
+    video_path = os.path.normpath(video_path)
+    bgm_path = os.path.normpath(bgm_path)
+    output_path = os.path.normpath(output_path)
+    """
+    Mixes background music into a video's audio track using ffmpeg and outputs a new video file.
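+    The BGM input is looped for the full length of the video, scaled to bgm_volume,
+    and mixed with the original audio track via the amix filter; the video stream is
+    copied without re-encoding. Returns True on success, False otherwise.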
+ """ + logger.info(f"Mixing BGM '{bgm_path}' into video '{video_path}'") - def get_text_size(inner_text): - inner_text = inner_text.strip() - left, top, right, bottom = font.getbbox(inner_text) - return right - left, bottom - top + video_duration = get_video_duration(video_path) + if video_duration == 0.0: + logger.error(f"Could not get duration of video {video_path}") + return False - width, height = get_text_size(text) - if width <= max_width: - return text, height + command = [ + "ffmpeg", + "-y", + "-i", video_path, + "-stream_loop", "-1", + "-i", bgm_path, + "-filter_complex", f"[0:a]volume=1.0[a0];[1:a]volume={bgm_volume}[a1];[a0][a1]amix=inputs=2:duration=first[a]", - processed = True + "-map", "0:v", + "-map", "[a]", + "-c:v", "copy", + "-c:a", "aac", + "-t", str(video_duration), + "-shortest", + output_path, + ] - _wrapped_lines_ = [] - words = text.split(" ") - _txt_ = "" - for word in words: - _before = _txt_ - _txt_ += f"{word} " - _width, _height = get_text_size(_txt_) - if _width <= max_width: - continue + return _run_ffmpeg_command(command) + + +def add_subtitles_to_video(video_path: str, srt_path: str, font_name: str, font_size: int, text_fore_color: str, stroke_color: str, stroke_width: float, subtitle_position: str, custom_position: float, output_path: str): + video_path = os.path.normpath(video_path) + srt_path = os.path.normpath(srt_path) + output_path = os.path.normpath(output_path) + font_path = utils.get_font_path(font_name) + if not os.path.exists(font_path): + logger.error(f"Font '{font_name}' not found, using default.") + font_path = utils.get_font_path("MicrosoftYaHeiBold.ttc") + + # This is the robust way to escape paths for ffmpeg filters on Windows + def escape_ffmpeg_path(path): + # Replace backslashes with forward slashes + escaped_path = path.replace('\\', '/') + # Escape colons + escaped_path = escaped_path.replace(':', '\\:') + return escaped_path + + style_options = [ + f"FontName='{os.path.basename(font_path)}'", + f"FontSize={font_size}", + f"PrimaryColour=&H{utils.rgb_to_bgr_hex(text_fore_color)}", + f"BorderStyle=1", + f"OutlineColour=&H{utils.rgb_to_bgr_hex(stroke_color)}", + f"Outline={stroke_width}", + f"Shadow=0", + f"MarginV=20" + ] + + if subtitle_position == 'bottom': + style_options.append("Alignment=2") # Bottom center + elif subtitle_position == 'top': + style_options.append("Alignment=8") # Top center + elif subtitle_position == 'center': + style_options.append("Alignment=5") # Middle center + else: # custom + style_options.append(f"Alignment=2,MarginV={int(custom_position)}") + + style_string = ','.join(style_options) + + # Correctly escape paths for ffmpeg's filtergraph + font_dir_escaped = escape_ffmpeg_path(os.path.dirname(font_path)) + srt_path_escaped = escape_ffmpeg_path(srt_path) + + subtitles_filter = f"subtitles='{srt_path_escaped}':force_style='{style_string}':fontsdir='{font_dir_escaped}'" + + command = [ + "ffmpeg", "-y", + "-i", video_path, + "-vf", subtitles_filter, + "-c:v", "libx264", + "-c:a", "copy", + "-preset", "ultrafast", + output_path + ] + + return _run_ffmpeg_command(command) + + +def process_scene_video(material_url: str, output_dir: str, target_duration: float, aspect_ratio: str = "16:9") -> str: + """ + 下载单个视频素材,并将其处理(剪辑/循环)到目标时长,同时调整分辨率。 + 这是实现音画同步的关键步骤之一。 + """ + try: + # 创建一个唯一的文件名 + video_filename = os.path.join(output_dir, f"scene_{os.path.basename(material_url)}") + + # 下载视频 + response = requests.get(material_url, stream=True) + response.raise_for_status() + with open(video_filename, 'wb') as f: + for 
chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + logger.info(f"Downloaded scene video to {video_filename}") + + + clip = VideoFileClip(video_filename) + + # 如果原始视频时长短于目标时长,就循环视频 + if clip.duration < target_duration: + clip = clip.loop(duration=target_duration) + # 如果原始视频时长长于目标时长,就剪辑视频 else: - if _txt_.strip() == word.strip(): - processed = False - break - _wrapped_lines_.append(_before) - _txt_ = f"{word} " - _wrapped_lines_.append(_txt_) - if processed: - _wrapped_lines_ = [line.strip() for line in _wrapped_lines_] - result = "\n".join(_wrapped_lines_).strip() - height = len(_wrapped_lines_) * height - return result, height + clip = clip.subclip(0, target_duration) + + # 调整分辨率和宽高比 + if aspect_ratio == "16:9": + target_resolution = (1920, 1080) + else: # 9:16 + target_resolution = (1080, 1920) + + # 使用crop和resize确保画面内容不被拉伸 + clip_resized = clip.resize(height=target_resolution[1]) if clip.size[0]/clip.size[1] < target_resolution[0]/target_resolution[1] else clip.resize(width=target_resolution[0]) + clip_cropped = clip_resized.crop(x_center=clip_resized.size[0]/2, y_center=clip_resized.size[1]/2, width=target_resolution[0], height=target_resolution[1]) - _wrapped_lines_ = [] - chars = list(text) - _txt_ = "" - for word in chars: - _txt_ += word - _width, _height = get_text_size(_txt_) - if _width <= max_width: - continue - else: - _wrapped_lines_.append(_txt_) - _txt_ = "" - _wrapped_lines_.append(_txt_) - result = "\n".join(_wrapped_lines_).strip() - height = len(_wrapped_lines_) * height - return result, height + processed_filename = os.path.join(output_dir, f"processed_{os.path.basename(video_filename)}") + clip_cropped.write_videofile(processed_filename, codec="libx264", audio_codec="aac", fps=30, ffmpeg_params=['-pix_fmt', 'yuv420p']) + + clip.close() + clip_cropped.close() + os.remove(video_filename) # 删除原始下载文件 + logger.info(f"Processed scene video to {processed_filename}, duration: {target_duration}s") + return processed_filename + + except Exception as e: + logger.error(f"Error processing scene video from {material_url}: {e}") + return None def generate_video( video_path: str, @@ -494,166 +462,58 @@ def generate_video( subtitle_path: str, output_file: str, params: VideoParams, -): - aspect = VideoAspect(params.video_aspect) - video_width, video_height = aspect.to_resolution() +) -> str: + """ + Generates the final video by adding background music and subtitles using FFmpeg. - logger.info(f"generating video: {video_width} x {video_height}") - logger.info(f" ① video: {video_path}") - logger.info(f" ② audio: {audio_path}") - logger.info(f" ③ subtitle: {subtitle_path}") - logger.info(f" ④ output: {output_file}") + Args: + video_path (str): Path to the source video file. + audio_path (str): Path to the background music file. + subtitle_path (str): Path to the subtitle file. + output_file (str): Path to save the final output video. + params (VideoParams): Video parameters including bgm_volume. - # https://github.com/harry0703/MoneyPrinterTurbo/issues/217 - # PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'final-1.mp4.tempTEMP_MPY_wvf_snd.mp3' - # write into the same directory as the output file - output_dir = os.path.dirname(output_file) + Returns: + str: The path to the final video if successful, otherwise an empty string. 
+ """ + logger.info(f"Generating final video for {output_file}") + temp_dir = os.path.join(os.path.dirname(output_file), "temp_gen") + os.makedirs(temp_dir, exist_ok=True) - font_path = "" - if params.subtitle_enabled: - if not params.font_name: - params.font_name = "STHeitiMedium.ttc" - font_path = os.path.join(utils.font_dir(), params.font_name) - if os.name == "nt": - font_path = font_path.replace("\\", "/") + final_video_path = "" - logger.info(f" ⑤ font: {font_path}") - - def create_text_clip(subtitle_item): - params.font_size = int(params.font_size) - params.stroke_width = int(params.stroke_width) - phrase = subtitle_item[1] - max_width = video_width * 0.9 - wrapped_txt, txt_height = wrap_text( - phrase, max_width=max_width, font=font_path, fontsize=params.font_size + try: + # Step 1: Add background music + logger.info("Step 1: Adding background music.") + video_with_bgm_path = os.path.join(temp_dir, f"bgm_{os.path.basename(video_path)}") + bgm_added_path = add_bgm_to_video_ffmpeg( + video_path=video_path, + bgm_path=audio_path, + output_path=video_with_bgm_path, + bgm_volume=params.bgm_volume ) - interline = int(params.font_size * 0.25) - size=(int(max_width), int(txt_height + params.font_size * 0.25 + (interline * (wrapped_txt.count("\n") + 1)))) + if not bgm_added_path: + logger.error("Failed to add background music. Aborting video generation.") + return "" - _clip = TextClip( - text=wrapped_txt, - font=font_path, - font_size=params.font_size, - color=params.text_fore_color, - bg_color=params.text_background_color, - stroke_color=params.stroke_color, - stroke_width=params.stroke_width, - # interline=interline, - # size=size, - ) - duration = subtitle_item[0][1] - subtitle_item[0][0] - _clip = _clip.with_start(subtitle_item[0][0]) - _clip = _clip.with_end(subtitle_item[0][1]) - _clip = _clip.with_duration(duration) - if params.subtitle_position == "bottom": - _clip = _clip.with_position(("center", video_height * 0.95 - _clip.h)) - elif params.subtitle_position == "top": - _clip = _clip.with_position(("center", video_height * 0.05)) - elif params.subtitle_position == "custom": - # Ensure the subtitle is fully within the screen bounds - margin = 10 # Additional margin, in pixels - max_y = video_height - _clip.h - margin - min_y = margin - custom_y = (video_height - _clip.h) * (params.custom_position / 100) - custom_y = max( - min_y, min(custom_y, max_y) - ) # Constrain the y value within the valid range - _clip = _clip.with_position(("center", custom_y)) - else: # center - _clip = _clip.with_position(("center", "center")) - return _clip - - video_clip = VideoFileClip(video_path).without_audio() - audio_clip = AudioFileClip(audio_path).with_effects( - [afx.MultiplyVolume(params.voice_volume)] - ) - - def make_textclip(text): - return TextClip( - text=text, - font=font_path, - font_size=params.font_size, + # Step 2: Add subtitles + logger.info("Step 2: Adding subtitles.") + subtitled_video_path = add_subtitles_to_video_ffmpeg( + video_path=bgm_added_path, + subtitles_path=subtitle_path, + output_path=output_file ) - if subtitle_path and os.path.exists(subtitle_path): - sub = SubtitlesClip( - subtitles=subtitle_path, encoding="utf-8", make_textclip=make_textclip - ) - text_clips = [] - for item in sub.subtitles: - clip = create_text_clip(subtitle_item=item) - text_clips.append(clip) - video_clip = CompositeVideoClip([video_clip, *text_clips]) + if subtitled_video_path: + logger.success(f"Successfully generated final video: {output_file}") + final_video_path = output_file + else: + 
logger.error("Failed to add subtitles. Final video not created.") - bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file) - if bgm_file: - try: - bgm_clip = AudioFileClip(bgm_file).with_effects( - [ - afx.MultiplyVolume(params.bgm_volume), - afx.AudioFadeOut(3), - afx.AudioLoop(duration=video_clip.duration), - ] - ) - audio_clip = CompositeAudioClip([audio_clip, bgm_clip]) - except Exception as e: - logger.error(f"failed to add bgm: {str(e)}") - - video_clip = video_clip.with_audio(audio_clip) - video_clip.write_videofile( - output_file, - audio_codec=audio_codec, - temp_audiofile_path=output_dir, - threads=params.n_threads or 2, - logger=None, - fps=fps, - ) - video_clip.close() - del video_clip - - -def preprocess_video(materials: List[MaterialInfo], clip_duration=4): - for material in materials: - if not material.url: - continue - - ext = utils.parse_extension(material.url) - try: - clip = VideoFileClip(material.url) - except Exception: - clip = ImageClip(material.url) - - width = clip.size[0] - height = clip.size[1] - if width < 480 or height < 480: - logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required") - continue - - if ext in const.FILE_TYPE_IMAGES: - logger.info(f"processing image: {material.url}") - # Create an image clip and set its duration to 3 seconds - clip = ( - ImageClip(material.url) - .with_duration(clip_duration) - .with_position("center") - ) - # Apply a zoom effect using the resize method. - # A lambda function is used to make the zoom effect dynamic over time. - # The zoom effect starts from the original size and gradually scales up to 120%. - # t represents the current time, and clip.duration is the total duration of the clip (3 seconds). - # Note: 1 represents 100% size, so 1.2 represents 120% size. - zoom_clip = clip.resized( - lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration) - ) - - # Optionally, create a composite video clip containing the zoomed clip. - # This is useful when you want to add other elements to the video. - final_clip = CompositeVideoClip([zoom_clip]) - - # Output the video to a file. - video_file = f"{material.url}.mp4" - final_clip.write_videofile(video_file, fps=30, logger=None) - close_clip(clip) - material.url = video_file - logger.success(f"image processed: {video_file}") - return materials \ No newline at end of file + finally: + # Clean up temporary directory + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + return final_video_path + diff --git a/app/services/voice.py b/app/services/voice.py index e6b4d59..85f2fa0 100644 --- a/app/services/voice.py +++ b/app/services/voice.py @@ -2,7 +2,7 @@ import asyncio import os import re from datetime import datetime -from typing import Union +from typing import Union, List from xml.sax.saxutils import unescape import edge_tts @@ -10,1072 +10,13 @@ import requests from edge_tts import SubMaker, submaker from edge_tts.submaker import mktimestamp from loguru import logger -from moviepy.video.tools import subtitles +import subprocess from app.config import config from app.utils import utils -def get_siliconflow_voices() -> list[str]: - """ - 获取硅基流动的声音列表 - - Returns: - 声音列表,格式为 ["siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex", ...] 
- """ - # 硅基流动的声音列表和对应的性别(用于显示) - voices_with_gender = [ - ("FunAudioLLM/CosyVoice2-0.5B", "alex", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "anna", "Female"), - ("FunAudioLLM/CosyVoice2-0.5B", "bella", "Female"), - ("FunAudioLLM/CosyVoice2-0.5B", "benjamin", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "charles", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "claire", "Female"), - ("FunAudioLLM/CosyVoice2-0.5B", "david", "Male"), - ("FunAudioLLM/CosyVoice2-0.5B", "diana", "Female"), - ] - - # 添加siliconflow:前缀,并格式化为显示名称 - return [ - f"siliconflow:{model}:{voice}-{gender}" - for model, voice, gender in voices_with_gender - ] - - -def get_all_azure_voices(filter_locals=None) -> list[str]: - azure_voices_str = """ -Name: af-ZA-AdriNeural -Gender: Female - -Name: af-ZA-WillemNeural -Gender: Male - -Name: am-ET-AmehaNeural -Gender: Male - -Name: am-ET-MekdesNeural -Gender: Female - -Name: ar-AE-FatimaNeural -Gender: Female - -Name: ar-AE-HamdanNeural -Gender: Male - -Name: ar-BH-AliNeural -Gender: Male - -Name: ar-BH-LailaNeural -Gender: Female - -Name: ar-DZ-AminaNeural -Gender: Female - -Name: ar-DZ-IsmaelNeural -Gender: Male - -Name: ar-EG-SalmaNeural -Gender: Female - -Name: ar-EG-ShakirNeural -Gender: Male - -Name: ar-IQ-BasselNeural -Gender: Male - -Name: ar-IQ-RanaNeural -Gender: Female - -Name: ar-JO-SanaNeural -Gender: Female - -Name: ar-JO-TaimNeural -Gender: Male - -Name: ar-KW-FahedNeural -Gender: Male - -Name: ar-KW-NouraNeural -Gender: Female - -Name: ar-LB-LaylaNeural -Gender: Female - -Name: ar-LB-RamiNeural -Gender: Male - -Name: ar-LY-ImanNeural -Gender: Female - -Name: ar-LY-OmarNeural -Gender: Male - -Name: ar-MA-JamalNeural -Gender: Male - -Name: ar-MA-MounaNeural -Gender: Female - -Name: ar-OM-AbdullahNeural -Gender: Male - -Name: ar-OM-AyshaNeural -Gender: Female - -Name: ar-QA-AmalNeural -Gender: Female - -Name: ar-QA-MoazNeural -Gender: Male - -Name: ar-SA-HamedNeural -Gender: Male - -Name: ar-SA-ZariyahNeural -Gender: Female - -Name: ar-SY-AmanyNeural -Gender: Female - -Name: ar-SY-LaithNeural -Gender: Male - -Name: ar-TN-HediNeural -Gender: Male - -Name: ar-TN-ReemNeural -Gender: Female - -Name: ar-YE-MaryamNeural -Gender: Female - -Name: ar-YE-SalehNeural -Gender: Male - -Name: az-AZ-BabekNeural -Gender: Male - -Name: az-AZ-BanuNeural -Gender: Female - -Name: bg-BG-BorislavNeural -Gender: Male - -Name: bg-BG-KalinaNeural -Gender: Female - -Name: bn-BD-NabanitaNeural -Gender: Female - -Name: bn-BD-PradeepNeural -Gender: Male - -Name: bn-IN-BashkarNeural -Gender: Male - -Name: bn-IN-TanishaaNeural -Gender: Female - -Name: bs-BA-GoranNeural -Gender: Male - -Name: bs-BA-VesnaNeural -Gender: Female - -Name: ca-ES-EnricNeural -Gender: Male - -Name: ca-ES-JoanaNeural -Gender: Female - -Name: cs-CZ-AntoninNeural -Gender: Male - -Name: cs-CZ-VlastaNeural -Gender: Female - -Name: cy-GB-AledNeural -Gender: Male - -Name: cy-GB-NiaNeural -Gender: Female - -Name: da-DK-ChristelNeural -Gender: Female - -Name: da-DK-JeppeNeural -Gender: Male - -Name: de-AT-IngridNeural -Gender: Female - -Name: de-AT-JonasNeural -Gender: Male - -Name: de-CH-JanNeural -Gender: Male - -Name: de-CH-LeniNeural -Gender: Female - -Name: de-DE-AmalaNeural -Gender: Female - -Name: de-DE-ConradNeural -Gender: Male - -Name: de-DE-FlorianMultilingualNeural -Gender: Male - -Name: de-DE-KatjaNeural -Gender: Female - -Name: de-DE-KillianNeural -Gender: Male - -Name: de-DE-SeraphinaMultilingualNeural -Gender: Female - -Name: el-GR-AthinaNeural -Gender: Female - -Name: el-GR-NestorasNeural -Gender: Male - -Name: 
en-AU-NatashaNeural -Gender: Female - -Name: en-AU-WilliamNeural -Gender: Male - -Name: en-CA-ClaraNeural -Gender: Female - -Name: en-CA-LiamNeural -Gender: Male - -Name: en-GB-LibbyNeural -Gender: Female - -Name: en-GB-MaisieNeural -Gender: Female - -Name: en-GB-RyanNeural -Gender: Male - -Name: en-GB-SoniaNeural -Gender: Female - -Name: en-GB-ThomasNeural -Gender: Male - -Name: en-HK-SamNeural -Gender: Male - -Name: en-HK-YanNeural -Gender: Female - -Name: en-IE-ConnorNeural -Gender: Male - -Name: en-IE-EmilyNeural -Gender: Female - -Name: en-IN-NeerjaExpressiveNeural -Gender: Female - -Name: en-IN-NeerjaNeural -Gender: Female - -Name: en-IN-PrabhatNeural -Gender: Male - -Name: en-KE-AsiliaNeural -Gender: Female - -Name: en-KE-ChilembaNeural -Gender: Male - -Name: en-NG-AbeoNeural -Gender: Male - -Name: en-NG-EzinneNeural -Gender: Female - -Name: en-NZ-MitchellNeural -Gender: Male - -Name: en-NZ-MollyNeural -Gender: Female - -Name: en-PH-JamesNeural -Gender: Male - -Name: en-PH-RosaNeural -Gender: Female - -Name: en-SG-LunaNeural -Gender: Female - -Name: en-SG-WayneNeural -Gender: Male - -Name: en-TZ-ElimuNeural -Gender: Male - -Name: en-TZ-ImaniNeural -Gender: Female - -Name: en-US-AnaNeural -Gender: Female - -Name: en-US-AndrewMultilingualNeural -Gender: Male - -Name: en-US-AndrewNeural -Gender: Male - -Name: en-US-AriaNeural -Gender: Female - -Name: en-US-AvaMultilingualNeural -Gender: Female - -Name: en-US-AvaNeural -Gender: Female - -Name: en-US-BrianMultilingualNeural -Gender: Male - -Name: en-US-BrianNeural -Gender: Male - -Name: en-US-ChristopherNeural -Gender: Male - -Name: en-US-EmmaMultilingualNeural -Gender: Female - -Name: en-US-EmmaNeural -Gender: Female - -Name: en-US-EricNeural -Gender: Male - -Name: en-US-GuyNeural -Gender: Male - -Name: en-US-JennyNeural -Gender: Female - -Name: en-US-MichelleNeural -Gender: Female - -Name: en-US-RogerNeural -Gender: Male - -Name: en-US-SteffanNeural -Gender: Male - -Name: en-ZA-LeahNeural -Gender: Female - -Name: en-ZA-LukeNeural -Gender: Male - -Name: es-AR-ElenaNeural -Gender: Female - -Name: es-AR-TomasNeural -Gender: Male - -Name: es-BO-MarceloNeural -Gender: Male - -Name: es-BO-SofiaNeural -Gender: Female - -Name: es-CL-CatalinaNeural -Gender: Female - -Name: es-CL-LorenzoNeural -Gender: Male - -Name: es-CO-GonzaloNeural -Gender: Male - -Name: es-CO-SalomeNeural -Gender: Female - -Name: es-CR-JuanNeural -Gender: Male - -Name: es-CR-MariaNeural -Gender: Female - -Name: es-CU-BelkysNeural -Gender: Female - -Name: es-CU-ManuelNeural -Gender: Male - -Name: es-DO-EmilioNeural -Gender: Male - -Name: es-DO-RamonaNeural -Gender: Female - -Name: es-EC-AndreaNeural -Gender: Female - -Name: es-EC-LuisNeural -Gender: Male - -Name: es-ES-AlvaroNeural -Gender: Male - -Name: es-ES-ElviraNeural -Gender: Female - -Name: es-ES-XimenaNeural -Gender: Female - -Name: es-GQ-JavierNeural -Gender: Male - -Name: es-GQ-TeresaNeural -Gender: Female - -Name: es-GT-AndresNeural -Gender: Male - -Name: es-GT-MartaNeural -Gender: Female - -Name: es-HN-CarlosNeural -Gender: Male - -Name: es-HN-KarlaNeural -Gender: Female - -Name: es-MX-DaliaNeural -Gender: Female - -Name: es-MX-JorgeNeural -Gender: Male - -Name: es-NI-FedericoNeural -Gender: Male - -Name: es-NI-YolandaNeural -Gender: Female - -Name: es-PA-MargaritaNeural -Gender: Female - -Name: es-PA-RobertoNeural -Gender: Male - -Name: es-PE-AlexNeural -Gender: Male - -Name: es-PE-CamilaNeural -Gender: Female - -Name: es-PR-KarinaNeural -Gender: Female - -Name: es-PR-VictorNeural -Gender: Male - -Name: 
es-PY-MarioNeural -Gender: Male - -Name: es-PY-TaniaNeural -Gender: Female - -Name: es-SV-LorenaNeural -Gender: Female - -Name: es-SV-RodrigoNeural -Gender: Male - -Name: es-US-AlonsoNeural -Gender: Male - -Name: es-US-PalomaNeural -Gender: Female - -Name: es-UY-MateoNeural -Gender: Male - -Name: es-UY-ValentinaNeural -Gender: Female - -Name: es-VE-PaolaNeural -Gender: Female - -Name: es-VE-SebastianNeural -Gender: Male - -Name: et-EE-AnuNeural -Gender: Female - -Name: et-EE-KertNeural -Gender: Male - -Name: fa-IR-DilaraNeural -Gender: Female - -Name: fa-IR-FaridNeural -Gender: Male - -Name: fi-FI-HarriNeural -Gender: Male - -Name: fi-FI-NooraNeural -Gender: Female - -Name: fil-PH-AngeloNeural -Gender: Male - -Name: fil-PH-BlessicaNeural -Gender: Female - -Name: fr-BE-CharlineNeural -Gender: Female - -Name: fr-BE-GerardNeural -Gender: Male - -Name: fr-CA-AntoineNeural -Gender: Male - -Name: fr-CA-JeanNeural -Gender: Male - -Name: fr-CA-SylvieNeural -Gender: Female - -Name: fr-CA-ThierryNeural -Gender: Male - -Name: fr-CH-ArianeNeural -Gender: Female - -Name: fr-CH-FabriceNeural -Gender: Male - -Name: fr-FR-DeniseNeural -Gender: Female - -Name: fr-FR-EloiseNeural -Gender: Female - -Name: fr-FR-HenriNeural -Gender: Male - -Name: fr-FR-RemyMultilingualNeural -Gender: Male - -Name: fr-FR-VivienneMultilingualNeural -Gender: Female - -Name: ga-IE-ColmNeural -Gender: Male - -Name: ga-IE-OrlaNeural -Gender: Female - -Name: gl-ES-RoiNeural -Gender: Male - -Name: gl-ES-SabelaNeural -Gender: Female - -Name: gu-IN-DhwaniNeural -Gender: Female - -Name: gu-IN-NiranjanNeural -Gender: Male - -Name: he-IL-AvriNeural -Gender: Male - -Name: he-IL-HilaNeural -Gender: Female - -Name: hi-IN-MadhurNeural -Gender: Male - -Name: hi-IN-SwaraNeural -Gender: Female - -Name: hr-HR-GabrijelaNeural -Gender: Female - -Name: hr-HR-SreckoNeural -Gender: Male - -Name: hu-HU-NoemiNeural -Gender: Female - -Name: hu-HU-TamasNeural -Gender: Male - -Name: id-ID-ArdiNeural -Gender: Male - -Name: id-ID-GadisNeural -Gender: Female - -Name: is-IS-GudrunNeural -Gender: Female - -Name: is-IS-GunnarNeural -Gender: Male - -Name: it-IT-DiegoNeural -Gender: Male - -Name: it-IT-ElsaNeural -Gender: Female - -Name: it-IT-GiuseppeMultilingualNeural -Gender: Male - -Name: it-IT-IsabellaNeural -Gender: Female - -Name: iu-Cans-CA-SiqiniqNeural -Gender: Female - -Name: iu-Cans-CA-TaqqiqNeural -Gender: Male - -Name: iu-Latn-CA-SiqiniqNeural -Gender: Female - -Name: iu-Latn-CA-TaqqiqNeural -Gender: Male - -Name: ja-JP-KeitaNeural -Gender: Male - -Name: ja-JP-NanamiNeural -Gender: Female - -Name: jv-ID-DimasNeural -Gender: Male - -Name: jv-ID-SitiNeural -Gender: Female - -Name: ka-GE-EkaNeural -Gender: Female - -Name: ka-GE-GiorgiNeural -Gender: Male - -Name: kk-KZ-AigulNeural -Gender: Female - -Name: kk-KZ-DauletNeural -Gender: Male - -Name: km-KH-PisethNeural -Gender: Male - -Name: km-KH-SreymomNeural -Gender: Female - -Name: kn-IN-GaganNeural -Gender: Male - -Name: kn-IN-SapnaNeural -Gender: Female - -Name: ko-KR-HyunsuMultilingualNeural -Gender: Male - -Name: ko-KR-InJoonNeural -Gender: Male - -Name: ko-KR-SunHiNeural -Gender: Female - -Name: lo-LA-ChanthavongNeural -Gender: Male - -Name: lo-LA-KeomanyNeural -Gender: Female - -Name: lt-LT-LeonasNeural -Gender: Male - -Name: lt-LT-OnaNeural -Gender: Female - -Name: lv-LV-EveritaNeural -Gender: Female - -Name: lv-LV-NilsNeural -Gender: Male - -Name: mk-MK-AleksandarNeural -Gender: Male - -Name: mk-MK-MarijaNeural -Gender: Female - -Name: ml-IN-MidhunNeural -Gender: Male - -Name: 
ml-IN-SobhanaNeural -Gender: Female - -Name: mn-MN-BataaNeural -Gender: Male - -Name: mn-MN-YesuiNeural -Gender: Female - -Name: mr-IN-AarohiNeural -Gender: Female - -Name: mr-IN-ManoharNeural -Gender: Male - -Name: ms-MY-OsmanNeural -Gender: Male - -Name: ms-MY-YasminNeural -Gender: Female - -Name: mt-MT-GraceNeural -Gender: Female - -Name: mt-MT-JosephNeural -Gender: Male - -Name: my-MM-NilarNeural -Gender: Female - -Name: my-MM-ThihaNeural -Gender: Male - -Name: nb-NO-FinnNeural -Gender: Male - -Name: nb-NO-PernilleNeural -Gender: Female - -Name: ne-NP-HemkalaNeural -Gender: Female - -Name: ne-NP-SagarNeural -Gender: Male - -Name: nl-BE-ArnaudNeural -Gender: Male - -Name: nl-BE-DenaNeural -Gender: Female - -Name: nl-NL-ColetteNeural -Gender: Female - -Name: nl-NL-FennaNeural -Gender: Female - -Name: nl-NL-MaartenNeural -Gender: Male - -Name: pl-PL-MarekNeural -Gender: Male - -Name: pl-PL-ZofiaNeural -Gender: Female - -Name: ps-AF-GulNawazNeural -Gender: Male - -Name: ps-AF-LatifaNeural -Gender: Female - -Name: pt-BR-AntonioNeural -Gender: Male - -Name: pt-BR-FranciscaNeural -Gender: Female - -Name: pt-BR-ThalitaMultilingualNeural -Gender: Female - -Name: pt-PT-DuarteNeural -Gender: Male - -Name: pt-PT-RaquelNeural -Gender: Female - -Name: ro-RO-AlinaNeural -Gender: Female - -Name: ro-RO-EmilNeural -Gender: Male - -Name: ru-RU-DmitryNeural -Gender: Male - -Name: ru-RU-SvetlanaNeural -Gender: Female - -Name: si-LK-SameeraNeural -Gender: Male - -Name: si-LK-ThiliniNeural -Gender: Female - -Name: sk-SK-LukasNeural -Gender: Male - -Name: sk-SK-ViktoriaNeural -Gender: Female - -Name: sl-SI-PetraNeural -Gender: Female - -Name: sl-SI-RokNeural -Gender: Male - -Name: so-SO-MuuseNeural -Gender: Male - -Name: so-SO-UbaxNeural -Gender: Female - -Name: sq-AL-AnilaNeural -Gender: Female - -Name: sq-AL-IlirNeural -Gender: Male - -Name: sr-RS-NicholasNeural -Gender: Male - -Name: sr-RS-SophieNeural -Gender: Female - -Name: su-ID-JajangNeural -Gender: Male - -Name: su-ID-TutiNeural -Gender: Female - -Name: sv-SE-MattiasNeural -Gender: Male - -Name: sv-SE-SofieNeural -Gender: Female - -Name: sw-KE-RafikiNeural -Gender: Male - -Name: sw-KE-ZuriNeural -Gender: Female - -Name: sw-TZ-DaudiNeural -Gender: Male - -Name: sw-TZ-RehemaNeural -Gender: Female - -Name: ta-IN-PallaviNeural -Gender: Female - -Name: ta-IN-ValluvarNeural -Gender: Male - -Name: ta-LK-KumarNeural -Gender: Male - -Name: ta-LK-SaranyaNeural -Gender: Female - -Name: ta-MY-KaniNeural -Gender: Female - -Name: ta-MY-SuryaNeural -Gender: Male - -Name: ta-SG-AnbuNeural -Gender: Male - -Name: ta-SG-VenbaNeural -Gender: Female - -Name: te-IN-MohanNeural -Gender: Male - -Name: te-IN-ShrutiNeural -Gender: Female - -Name: th-TH-NiwatNeural -Gender: Male - -Name: th-TH-PremwadeeNeural -Gender: Female - -Name: tr-TR-AhmetNeural -Gender: Male - -Name: tr-TR-EmelNeural -Gender: Female - -Name: uk-UA-OstapNeural -Gender: Male - -Name: uk-UA-PolinaNeural -Gender: Female - -Name: ur-IN-GulNeural -Gender: Female - -Name: ur-IN-SalmanNeural -Gender: Male - -Name: ur-PK-AsadNeural -Gender: Male - -Name: ur-PK-UzmaNeural -Gender: Female - -Name: uz-UZ-MadinaNeural -Gender: Female - -Name: uz-UZ-SardorNeural -Gender: Male - -Name: vi-VN-HoaiMyNeural -Gender: Female - -Name: vi-VN-NamMinhNeural -Gender: Male - -Name: zh-CN-XiaoxiaoNeural -Gender: Female - -Name: zh-CN-XiaoyiNeural -Gender: Female - -Name: zh-CN-YunjianNeural -Gender: Male - -Name: zh-CN-YunxiNeural -Gender: Male - -Name: zh-CN-YunxiaNeural -Gender: Male - -Name: zh-CN-YunyangNeural -Gender: Male - 
-Name: zh-CN-liaoning-XiaobeiNeural -Gender: Female - -Name: zh-CN-shaanxi-XiaoniNeural -Gender: Female - -Name: zh-HK-HiuGaaiNeural -Gender: Female - -Name: zh-HK-HiuMaanNeural -Gender: Female - -Name: zh-HK-WanLungNeural -Gender: Male - -Name: zh-TW-HsiaoChenNeural -Gender: Female - -Name: zh-TW-HsiaoYuNeural -Gender: Female - -Name: zh-TW-YunJheNeural -Gender: Male - -Name: zu-ZA-ThandoNeural -Gender: Female - -Name: zu-ZA-ThembaNeural -Gender: Male - - -Name: en-US-AvaMultilingualNeural-V2 -Gender: Female - -Name: en-US-AndrewMultilingualNeural-V2 -Gender: Male - -Name: en-US-EmmaMultilingualNeural-V2 -Gender: Female - -Name: en-US-BrianMultilingualNeural-V2 -Gender: Male - -Name: de-DE-FlorianMultilingualNeural-V2 -Gender: Male - -Name: de-DE-SeraphinaMultilingualNeural-V2 -Gender: Female - -Name: fr-FR-RemyMultilingualNeural-V2 -Gender: Male - -Name: fr-FR-VivienneMultilingualNeural-V2 -Gender: Female - -Name: zh-CN-XiaoxiaoMultilingualNeural-V2 -Gender: Female - """.strip() - voices = [] - # 定义正则表达式模式,用于匹配 Name 和 Gender 行 - pattern = re.compile(r"Name:\s*(.+)\s*Gender:\s*(.+)\s*", re.MULTILINE) - # 使用正则表达式查找所有匹配项 - matches = pattern.findall(azure_voices_str) - - for name, gender in matches: - # 应用过滤条件 - if filter_locals and any( - name.lower().startswith(fl.lower()) for fl in filter_locals - ): - voices.append(f"{name}-{gender}") - elif not filter_locals: - voices.append(f"{name}-{gender}") - - voices.sort() - return voices - - -def parse_voice_name(name: str): - # zh-CN-XiaoyiNeural-Female - # zh-CN-YunxiNeural-Male - # zh-CN-XiaoxiaoMultilingualNeural-V2-Female - name = name.replace("-Female", "").replace("-Male", "").strip() - return name - - -def is_azure_v2_voice(voice_name: str): - voice_name = parse_voice_name(voice_name) - if voice_name.endswith("-V2"): - return voice_name.replace("-V2", "").strip() - return "" - - -def is_siliconflow_voice(voice_name: str): - """检查是否是硅基流动的声音""" - return voice_name.startswith("siliconflow:") - +from app.utils.utils import parse_voice_name, is_azure_v2_voice, is_siliconflow_voice def tts( text: str, @@ -1219,12 +160,22 @@ def siliconflow_tts( # 获取音频文件的实际长度 try: - # 尝试使用moviepy获取音频长度 - from moviepy import AudioFileClip - - audio_clip = AudioFileClip(voice_file) - audio_duration = audio_clip.duration - audio_clip.close() + # 使用 ffprobe 获取音频长度 + probe_command = [ + "ffprobe", + "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + voice_file + ] + process = subprocess.run( + probe_command, + check=True, + capture_output=True, + text=True, + creationflags=subprocess.CREATE_NO_WINDOW if os.name == "nt" else 0, + ) + audio_duration = float(process.stdout.strip()) # 将音频长度转换为100纳秒单位(与edge_tts兼容) audio_duration_100ns = int(audio_duration * 10000000) @@ -1262,8 +213,8 @@ def siliconflow_tts( sub_maker.subs = [text] sub_maker.offset = [(0, audio_duration_100ns)] - except Exception as e: - logger.warning(f"Failed to create accurate subtitles: {str(e)}") + except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e: + logger.warning(f"Failed to create accurate subtitles with ffprobe: {str(e)}") # 回退到简单的字幕 sub_maker.subs = [text] # 使用音频文件的实际长度,如果无法获取,则假设为10秒 @@ -1272,7 +223,7 @@ def siliconflow_tts( 0, audio_duration_100ns if "audio_duration_100ns" in locals() - else 10000000, + else 100000000, ) ] @@ -1469,14 +420,23 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str) with open(subtitle_file, "w", encoding="utf-8") as file: file.write("\n".join(sub_items) + 
"\n") try: - sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") - duration = max([tb for ((ta, tb), txt) in sbs]) - logger.info( - f"completed, subtitle file created: {subtitle_file}, duration: {duration}" - ) + # Get duration from the last subtitle item + if sub_items: + last_sub = sub_items[-1] + # '1\n00:00:00,000 --> 00:00:02,360\ntext' + time_line = last_sub.strip().split('\n')[1] + end_time_str = time_line.split(' --> ')[1] + # '00:00:02,360' + h, m, s_ms = end_time_str.split(':') + s, ms = s_ms.split(',') + duration = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0 + logger.info( + f"completed, subtitle file created: {subtitle_file}, duration: {duration}" + ) + else: + logger.info(f"completed, empty subtitle file created: {subtitle_file}") except Exception as e: - logger.error(f"failed, error: {str(e)}") - os.remove(subtitle_file) + logger.warning(f"failed to parse subtitle duration, but file was created. error: {str(e)}") else: logger.warning( f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}" @@ -1495,6 +455,53 @@ def get_audio_duration(sub_maker: submaker.SubMaker): return sub_maker.offset[-1][1] / 10000000 +def combine_audio_files(audio_paths: List[str], output_path: str) -> bool: + """ + Combines multiple audio files into a single audio file using ffmpeg. + """ + logger.info(f"Combining {len(audio_paths)} audio files into {output_path}") + if not audio_paths: + logger.warning("No audio clips to combine.") + return False + + # Create a temporary file to list the audio files + list_file_path = os.path.join(os.path.dirname(output_path), "concat_list.txt") + with open(list_file_path, "w", encoding="utf-8") as f: + for path in audio_paths: + f.write(f"file '{os.path.normpath(path)}'\n") + + command = [ + "ffmpeg", + "-f", "concat", + "-safe", "0", + "-i", list_file_path, + "-c", "copy", + "-y", # Overwrite output file if it exists + output_path, + ] + + try: + process = subprocess.run( + command, + check=True, + capture_output=True, + text=True, + creationflags=subprocess.CREATE_NO_WINDOW if os.name == "nt" else 0, + ) + logger.success(f"Successfully combined audio files: {output_path}") + return True + except subprocess.CalledProcessError as e: + logger.error(f"Failed to combine audio files: {e.stderr}") + return False + except FileNotFoundError: + logger.error("ffmpeg not found. 
Please ensure ffmpeg is installed and in your PATH.") + return False + finally: + # Clean up the temporary list file + if os.path.exists(list_file_path): + os.remove(list_file_path) + + if __name__ == "__main__": voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female" voice_name = parse_voice_name(voice_name) diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/utils.py b/app/utils/utils.py index 7efb521..112d818 100644 --- a/app/utils/utils.py +++ b/app/utils/utils.py @@ -14,6 +14,23 @@ from app.models import const urllib3.disable_warnings() +def parse_voice_name(name: str): + # zh-CN-XiaoyiNeural-Female -> zh-CN-XiaoyiNeural + # zh-CN-YunxiNeural-Male -> zh-CN-YunxiNeural + # zh-CN-XiaoxiaoMultilingualNeural-V2-Female -> zh-CN-XiaoxiaoMultilingualNeural-V2 + return name.replace("-Female", "").replace("-Male", "").strip() + +def is_azure_v2_voice(voice_name: str): + voice_name = parse_voice_name(voice_name) + if voice_name.endswith("-V2"): + return voice_name.replace("-V2", "").strip() + return "" + +def is_siliconflow_voice(voice_name: str): + """检查是否是硅基流动的声音""" + return voice_name.startswith("siliconflow:") + + def get_response(status: int, data: Any = None, message: str = ""): obj = { "status": status, @@ -64,6 +81,13 @@ def get_uuid(remove_hyphen: bool = False): return u +def get_root_dir(sub_dir: str = ""): + d = root_dir() + if sub_dir: + d = os.path.join(d, sub_dir) + return d + + def root_dir(): return os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -103,6 +127,10 @@ def font_dir(sub_dir: str = ""): return d +def get_font_path(font_name: str): + return os.path.join(font_dir(), font_name) + + def song_dir(sub_dir: str = ""): d = resource_dir("songs") if sub_dir: @@ -227,4 +255,22 @@ def load_locales(i18n_dir): def parse_extension(filename): - return Path(filename).suffix.lower().lstrip('.') + return os.path.splitext(filename)[1] + + +def rgb_to_bgr_hex(rgb_color): + """Converts an RGB color string (e.g., '#RRGGBB') to a BGR hex string for FFmpeg. + + Args: + rgb_color (str): The RGB color string, starting with '#'. + + Returns: + str: The BGR hex string (e.g., 'BBGGRR'). + """ + if not rgb_color.startswith('#') or len(rgb_color) != 7: + logger.warning(f"Invalid color format: {rgb_color}. 
Using default white.") + return "FFFFFF" # Default to white for invalid formats + r = rgb_color[1:3] + g = rgb_color[3:5] + b = rgb_color[5:7] + return f"{b}{g}{r}" diff --git a/requirements.txt b/requirements.txt index a1731f6..dd88022 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ uvicorn==0.32.1 openai==1.56.1 faster-whisper==1.1.0 loguru==0.7.3 -google.generativeai==0.8.3 +google-generativeai==0.8.3 dashscope==1.20.14 g4f==0.5.2.2 azure-cognitiveservices-speech==1.41.1 diff --git a/webui/.streamlit/config.toml b/webui/.streamlit/config.toml index b690b74..82fa436 100644 --- a/webui/.streamlit/config.toml +++ b/webui/.streamlit/config.toml @@ -1,2 +1,2 @@ -[browser] -gatherUsageStats = false \ No newline at end of file +[server] +fileWatcherType = "none" diff --git a/webui/Main.py b/webui/Main.py index 1b55abe..ce30bae 100644 --- a/webui/Main.py +++ b/webui/Main.py @@ -618,7 +618,7 @@ with middle_panel: ) params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1]) - params.video_clip_duration = st.selectbox( + params.max_clip_duration = st.selectbox( tr("Clip Duration"), options=[2, 3, 4, 5, 6, 7, 8, 9, 10], index=1 ) params.video_count = st.selectbox( @@ -659,7 +659,8 @@ with middle_panel: if selected_tts_server == "siliconflow": # 获取硅基流动的声音列表 - filtered_voices = voice.get_siliconflow_voices() + # filtered_voices = voice.get_siliconflow_voices() + pass else: # 获取Azure的声音列表 all_voices = voice.get_all_azure_voices(filter_locals=None) @@ -699,6 +700,7 @@ with middle_panel: if saved_voice_name_index >= len(friendly_names) and friendly_names: saved_voice_name_index = 0 + voice_name = "" # 确保有声音可选 if friendly_names: selected_friendly_name = st.selectbox( @@ -715,14 +717,16 @@ with middle_panel: params.voice_name = voice_name config.ui["voice_name"] = voice_name else: - # 如果没有声音可选,显示提示信息 + # 如果没有声音可选,使用默认声音并显示提示信息 st.warning( tr( - "No voices available for the selected TTS server. Please select another server." + "No voices available for the selected TTS server. A default voice (en-US-JennyNeural) will be used." ) ) - params.voice_name = "" - config.ui["voice_name"] = "" + default_voice = "en-US-JennyNeural" + params.voice_name = default_voice + config.ui["voice_name"] = default_voice + voice_name = default_voice # 只有在有声音可选时才显示试听按钮 if friendly_names and st.button(tr("Play Voice")): @@ -961,7 +965,7 @@ if start_button: logger.info(utils.to_json(params)) scroll_to_bottom() - result = tm.start(task_id=task_id, params=params) + result = tm.start_storyboard_task(task_id=task_id, params=params) if not result or "videos" not in result: st.error(tr("Video Generation Failed")) logger.error(tr("Video Generation Failed"))