1. Optimize subtitle generation in edge mode.

2. Optimize the LLM prompt so that the script is written in the same language as the video subject.
2026-02-21 16:37:21 +08:00 · 2024-03-24 17:50:50 +08:00 · 2024-03-24 17:50:50 +08:00 · b471a272b6
commit b471a272b6
parent 296a1370d3
4 changed files with 20 additions and 9 deletions
--- a/app/models/schema.py
+++ b/app/models/schema.py
@ -89,7 +89,7 @@ class VideoParams:
    """
    video_subject: str
    video_script: str = ""  # 用于生成视频的脚本
-    video_terms: str = ""  # 用于生成视频的关键词
+    video_terms: Optional[str | list] = None  # 用于生成视频的关键词
    video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
    video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
    video_clip_duration: Optional[int] = 5
@ -98,7 +98,7 @@ class VideoParams:
    bgm_type: Optional[str] = "random"
    bgm_file: Optional[str] = ""
    bgm_volume: Optional[float] = 0.2
-    
+
    subtitle_enabled: Optional[bool] = True
    subtitle_position: Optional[str] = "bottom"  # top, bottom, center
    font_name: Optional[str] = "STHeitiMedium.ttc"
--- a/app/services/llm.py
+++ b/app/services/llm.py
@ -77,13 +77,13 @@ Generate a script for a video, depending on the subject of the video.
 5. only return the raw content of the script. 
 6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line. 
 7. you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script.
+8. respond in the same language as the video subject.

 ## Output Example:
 What is the meaning of life. This question has puzzled philosophers.

 # Initialization:
 - video subject: {video_subject}
- output language: {language}
 - number of paragraphs: {paragraph_number}
 """.strip()

--- a/app/services/task.py
+++ b/app/services/task.py
@ -58,7 +58,13 @@ def start(task_id, params: VideoParams):
    if not video_terms:
        video_terms = llm.generate_terms(video_subject=video_subject, video_script=video_script, amount=5)
    else:
-        video_terms = [term.strip() for term in re.split(r'[,，]', video_terms)]
+        if isinstance(video_terms, str):
+            video_terms = [term.strip() for term in re.split(r'[,，]', video_terms)]
+        elif isinstance(video_terms, list):
+            video_terms = [term.strip() for term in video_terms]
+        else:
+            raise ValueError("video_terms must be a string or a list of strings.")
+
        logger.debug(f"video terms: {utils.to_json(video_terms)}")

    script_file = path.join(utils.task_dir(task_id), f"script.json")
@ -95,7 +101,7 @@ def start(task_id, params: VideoParams):
            else:
                subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
                if not subtitle_lines:
-                    logger.warning(f"subtitle file is invalid: {subtitle_path}")
+                    logger.warning(f"subtitle file is invalid, fallback to whisper : {subtitle_path}")
                    subtitle_fallback = True

        if subtitle_provider == "whisper" or subtitle_fallback:
--- a/app/services/voice.py
+++ b/app/services/voice.py
@ -57,6 +57,8 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
    sub_index = 0

    script_lines = utils.split_string_by_punctuations(text)
+    # remove space in every word
+    script_lines_without_space = [line.replace(" ", "") for line in script_lines]

    sub_line = ""
    for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
@ -66,14 +68,17 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)

        sub = unescape(sub)
        sub_line += sub
-        if sub_line == script_lines[sub_index]:
+        if sub_line == script_lines[sub_index] or sub_line == script_lines_without_space[sub_index]:
+            sub_text = script_lines[sub_index]
            sub_index += 1
-            sub_items.append(formatter(
+            line = formatter(
                idx=sub_index,
                start_time=start_time,
                end_time=end_time,
-                sub_text=sub_line,
-            ))
+                sub_text=sub_text,
+            )
+            # logger.debug(line.strip())
+            sub_items.append(line)
            start_time = -1.0
            sub_line = ""