MoneyPrinterTurbo/app/services/voice.py
harry b471a272b6 1, optimize the subtitle generation in edge mode
2, optimize the llm prompt, use the same language as the video subject
2024-03-24 17:52:12 +08:00

131 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from xml.sax.saxutils import unescape

import edge_tts
from edge_tts import submaker, SubMaker
from edge_tts.submaker import mktimestamp
from loguru import logger

from app.utils import utils
def tts(text: str, voice_name: str, voice_file: str) -> Optional[SubMaker]:
    """
    Synthesize ``text`` to speech with edge-tts and save the audio to ``voice_file``.

    Audio chunks from the stream are written to ``voice_file`` as they arrive,
    and every ``WordBoundary`` event is recorded in a ``SubMaker`` so that
    subtitles can be generated from the word timings afterwards.

    :param text: the text to read aloud
    :param voice_name: edge-tts voice name, e.g. "zh-CN-XiaoxiaoNeural"
    :param voice_file: path of the audio file to write (opened in binary mode)
    :return: the populated ``SubMaker``, or ``None`` if anything went wrong
             (the error is logged, never raised)

    NOTE: the coroutine is driven with ``asyncio.run()``, which raises
    ``RuntimeError`` when the calling thread already has a running event loop.
    Call this function from synchronous code only; from inside a coroutine it
    will log the error and return ``None``.
    """
    logger.info(f"start, voice name: {voice_name}")
    try:
        async def _do() -> SubMaker:
            communicate = edge_tts.Communicate(text, voice_name)
            sub_maker = edge_tts.SubMaker()
            with open(voice_file, "wb") as file:
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        file.write(chunk["data"])
                    elif chunk["type"] == "WordBoundary":
                        # keep (offset, duration) + text for subtitle alignment
                        sub_maker.create_sub(
                            (chunk["offset"], chunk["duration"]), chunk["text"]
                        )
            return sub_maker

        sub_maker = asyncio.run(_do())
        logger.info(f"completed, output file: {voice_file}")
        return sub_maker
    except Exception as e:
        # Deliberate best-effort boundary: callers check for None instead of
        # handling network / IO / event-loop errors themselves.
        logger.error(f"failed, error: {e}")
        return None
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Build a sentence-level SRT subtitle file from word-level TTS timings.

    1. Split ``text`` into script lines at punctuation marks.
    2. Accumulate the word-boundary texts recorded in ``sub_maker`` until they
       spell out the next script line (compared with and without spaces).
    3. Emit one SRT cue per matched line and write them to ``subtitle_file``.

    If the word boundaries cannot be aligned with every script line, the cues
    matched so far are still written and a warning is logged.

    :param sub_maker: SubMaker filled by the TTS stream (``offset`` and ``subs``)
    :param text: the script that was synthesized
    :param subtitle_file: path of the .srt file to write (UTF-8)
    """

    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
        """
        Render a single SRT cue, for example:

        1
        00:00:00,000 --> 00:00:02,360
        Running is a simple and easy sport
        """
        # SRT uses a comma as the decimal separator in timestamps.
        start_t = mktimestamp(start_time).replace(".", ",")
        end_t = mktimestamp(end_time).replace(".", ",")
        return (
            f"{idx}\n"
            f"{start_t} --> {end_t}\n"
            f"{sub_text}\n"
        )

    script_lines = utils.split_string_by_punctuations(text)
    # Word boundaries may come back without the spaces present in the script,
    # so keep a space-free copy of every line to compare against as well.
    script_lines_without_space = [line.replace(" ", "") for line in script_lines]

    sub_items = []
    sub_index = 0
    start_time = -1.0  # negative means "no cue in progress"
    sub_line = ""
    for offset, sub in zip(sub_maker.offset, sub_maker.subs):
        if sub_index >= len(script_lines):
            # Every script line is already matched; indexing further would
            # raise IndexError, and the leftover words have nothing to align to.
            break

        _start_time, end_time = offset
        if start_time < 0:
            start_time = _start_time

        sub_line += unescape(sub)
        if sub_line in (script_lines[sub_index], script_lines_without_space[sub_index]):
            sub_text = script_lines[sub_index]
            sub_index += 1
            sub_items.append(
                formatter(
                    idx=sub_index,
                    start_time=start_time,
                    end_time=end_time,
                    sub_text=sub_text,
                )
            )
            # reset the accumulator for the next script line
            start_time = -1.0
            sub_line = ""

    if len(sub_items) != len(script_lines):
        logger.warning(
            f"subtitle is incomplete, matched {len(sub_items)} of "
            f"{len(script_lines)} script lines, file: {subtitle_file}"
        )

    with open(subtitle_file, "w", encoding="utf-8") as file:
        file.write("\n".join(sub_items) + "\n")
def get_audio_duration(sub_maker: submaker.SubMaker):
    """
    Return the audio duration derived from the last recorded word boundary.

    The second element of the final ``sub_maker.offset`` entry is divided by
    10,000,000 (presumably converting 100-nanosecond ticks to seconds — confirm
    against edge-tts). Returns 0.0 when no offsets were recorded.
    """
    offsets = sub_maker.offset
    return offsets[-1][1] / 10000000 if offsets else 0.0
if __name__ == "__main__":
    def _do():
        """
        Manual smoke test: synthesize a sample text with several voices,
        build the subtitle file and print the resulting audio duration.

        This is a plain function on purpose: ``tts()`` calls ``asyncio.run()``
        internally, which fails when invoked from inside a running event loop.
        """
        temp_dir = utils.storage_dir("temp")

        voice_names = [
            # female
            "zh-CN-XiaoxiaoNeural",
            "zh-CN-XiaoyiNeural",
            # male
            "zh-CN-YunyangNeural",
            "zh-CN-YunxiNeural",
        ]
        text = """
预计未来3天深圳冷空气活动频繁未来两天持续阴天有小雨出门带好雨具
10-11日持续阴天有小雨日温差小气温在13-17℃之间体感阴凉
12日天气短暂好转早晚清凉
"""

        for voice_name in voice_names:
            voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
            # NOTE: the same subtitle path is reused, so each voice overwrites
            # the subtitle file produced by the previous one.
            subtitle_file = f"{temp_dir}/tts.mp3.srt"
            sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
            if sub_maker is None:
                # tts() already logged the error; skip instead of crashing on None
                print(f"voice: {voice_name}, tts failed")
                continue
            create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
            audio_duration = get_audio_duration(sub_maker)
            print(f"voice: {voice_name}, audio duration: {audio_duration}s")

    _do()