From b033e074e1bcf5322e8e39157c1fedd3577cd66c Mon Sep 17 00:00:00 2001
From: cystal-dot
Date: Tue, 13 May 2025 23:49:08 +0900
Subject: [PATCH] feat: use images generated by stable-diffusion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore           | 61 +++++++++++++++++++------------
 app/services/llm.py  | 87 +++++++++++++++++---------------------------
 app/services/task.py | 18 ++++++++-
 webui/Main.py        |  5 ++-
 4 files changed, 91 insertions(+), 80 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6aa0ca7..4244085 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,28 +1,41 @@
+# OS-specific
 .DS_Store
-/config.toml
-/storage/
-/.idea/
-/app/services/__pycache__
-/app/__pycache__/
-/app/config/__pycache__/
-/app/models/__pycache__/
-/app/utils/__pycache__/
-/*/__pycache__/*
-.vscode
-/**/.streamlit
-__pycache__
-logs/
-node_modules
-# VuePress 默认临时文件目录
-/sites/docs/.vuepress/.temp
-# VuePress 默认缓存目录
-/sites/docs/.vuepress/.cache
-# VuePress 默认构建生成的静态文件目录
-/sites/docs/.vuepress/dist
-# 模型目录
-/models/
-./models/*
+
+# Python caches
+__pycache__/
+**/__pycache__/
+
+# Python virtual environments
 venv/
-.venv
\ No newline at end of file
+.venv/
+
+# Config, logs, and temporary files
+config.toml
+logs/
+.idea/
+.vscode/
+storage/
+*.log
+
+# Streamlit
+**/.streamlit/
+
+# Node.js
+node_modules/
+
+# VuePress
+/sites/docs/.vuepress/.temp/
+/sites/docs/.vuepress/.cache/
+/sites/docs/.vuepress/dist/
+
+# Models
+models/
+models/*
+
+# Misc
+app/services/__pycache__/
+app/config/__pycache__/
+app/models/__pycache__/
+app/utils/__pycache__/
+.pdm-python
+forme
diff --git a/app/services/llm.py b/app/services/llm.py
index 2c45ef9..5d87f3d 100644
--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -332,70 +332,51 @@ Generate a script for a video, depending on the subject of the video.
     logger.success(f"completed: \n{final_script}")
     return final_script.strip()


-def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
+def generate_terms(
+    video_subject: str,
+    video_script: str,
+    amount: int = 5
+) -> List[str]:
+    """
+    Generate AI image-generation prompts from the video subject and script.
+    The prompt explicitly instructs the LLM to return only a raw JSON array.
+    """
     prompt = f"""
-# Role: Video Search Terms Generator
+Generate exactly {amount} image prompts as a raw JSON array of strings. No markdown, code fences, or extra characters.

-## Goals:
-Generate {amount} search terms for stock videos, depending on the subject of a video.
-
-## Constrains:
-1. the search terms are to be returned as a json-array of strings.
-2. each search term should consist of 1-3 words, always add the main subject of the video.
-3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
-4. the search terms must be related to the subject of the video.
-5. reply with english search terms only.
-
-## Output Example:
-["search term 1", "search term 2", "search term 3","search term 4","search term 5"]
-
-## Context:
-### Video Subject
+Video Subject:
 {video_subject}

-### Video Script
+Video Script:
 {video_script}

-Please note that you must use English for generating video search terms; Chinese is not accepted.
+Rules:
+1. Each prompt must start with either:
+   - "1girl, solo, anatomically correct" for character prompts, or
+   - a concise scene noun phrase for scene prompts (e.g., "moonlit forest clearing").
+2. Include at least one abstract symbol (hourglass, gear, DNA helix, etc.) and one environmental element (lantern, river, ancient ruins, etc.).
+3. Specify mood & lighting (e.g., golden hour, moody fog).
+4. Append: "portrait, 9:16, masterpiece:1.1, high detail, beautiful lighting, cinematic".
+
+Return only the JSON array, for example:
+["prompt1", "prompt2", ..., "promptN"]
 """.strip()

-    logger.info(f"subject: {video_subject}")
+    # Call the LLM; the prompt above requires a raw JSON array.
+    response = _generate_response(prompt)

-    search_terms = []
-    response = ""
-    for i in range(_max_retries):
-        try:
-            response = _generate_response(prompt)
-            if "Error: " in response:
-                logger.error(f"failed to generate video script: {response}")
-                return response
-            search_terms = json.loads(response)
-            if not isinstance(search_terms, list) or not all(
-                isinstance(term, str) for term in search_terms
-            ):
-                logger.error("response is not a list of strings.")
-                continue
-
-        except Exception as e:
-            logger.warning(f"failed to generate video terms: {str(e)}")
-            if response:
-                match = re.search(r"\[.*]", response)
-                if match:
-                    try:
-                        search_terms = json.loads(match.group())
-                    except Exception as e:
-                        logger.warning(f"failed to generate video terms: {str(e)}")
-                        pass
-
-        if search_terms and len(search_terms) > 0:
-            break
-        if i < _max_retries:
-            logger.warning(f"failed to generate video terms, trying again... {i + 1}")
-
-    logger.success(f"completed: \n{search_terms}")
-    return search_terms
+    # Parse and validate the response as JSON.
+    try:
+        prompts = json.loads(response)
+    except json.JSONDecodeError:
+        raise ValueError(f"LLM response is not valid JSON: {response}")
+    if not isinstance(prompts, list) or not all(isinstance(p, str) for p in prompts):
+        raise ValueError(f"LLM response is not a list of strings: {prompts}")
+    return prompts


 if __name__ == "__main__":
     video_subject = "生命的意义是什么"
diff --git a/app/services/task.py b/app/services/task.py
index 77ca908..ffabe4b 100644
--- a/app/services/task.py
+++ b/app/services/task.py
@@ -8,7 +8,7 @@ from loguru import logger
 from app.config import config
 from app.models import const
 from app.models.schema import VideoConcatMode, VideoParams
-from app.services import llm, material, subtitle, video, voice
+from app.services import llm, material, subtitle, video, voice, imagegen
 from app.services import state as sm
 from app.utils import utils

@@ -124,7 +124,21 @@ def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):


 def get_video_materials(task_id, params, video_terms, audio_duration):
-    if params.video_source == "local":
+    if params.video_source == "local-ai":
+        logger.info("\n\n## generating AI images as materials")
+        prompts = video_terms if isinstance(video_terms, list) else [video_terms]
+        materials = []
+        for i, prompt in enumerate(prompts):
+            output_path = path.join(utils.task_dir(task_id), f"aiimg_{i}.png")
+            imagegen.generate_image(prompt, output_path)
+            item = material.MaterialInfo(provider="local-ai", url=output_path, duration=0)
+            materials.append(item)
+        # Run the generated images through the same preprocessing as local materials.
+        materials = video.preprocess_video(
+            materials=materials, clip_duration=params.video_clip_duration
+        )
+        return [material_info.url for material_info in materials]
+    elif params.video_source == "local":
         logger.info("\n\n## preprocess local materials")
         materials = video.preprocess_video(
             materials=params.video_materials, clip_duration=params.video_clip_duration
diff --git a/webui/Main.py b/webui/Main.py
index aafed1b..9adb772 100644
--- a/webui/Main.py
+++ b/webui/Main.py
@@ -532,6 +532,7 @@ with middle_panel:
(tr("Pexels"), "pexels"), (tr("Pixabay"), "pixabay"), (tr("Local file"), "local"), + (tr("AI Image (local)"), "local-ai"), (tr("TikTok"), "douyin"), (tr("Bilibili"), "bilibili"), (tr("Xiaohongshu"), "xiaohongshu"), @@ -904,7 +905,7 @@ if start_button: scroll_to_bottom() st.stop() - if params.video_source not in ["pexels", "pixabay", "local"]: + if params.video_source not in ["pexels", "pixabay", "local", "local-ai", "douyin", "bilibili", "xiaohongshu"]: st.error(tr("Please Select a Valid Video Source")) scroll_to_bottom() st.stop() @@ -966,6 +967,8 @@ if start_button: except Exception: pass + # logger.info(tr("!!! Uploading Video Files To Tiktok!!!")) + open_task_folder(task_id) logger.info(tr("Video Generation Completed")) scroll_to_bottom()