# AI / video-create
import os
import json
from typing import Optional, Dict, Any

from urllib3 import response
from utils.tools import string_to_json, save_json_file
from tools.text_generator import media_captioner


# System prompt (written in Chinese) that CameraTreeCreator.create_camera_tree sends
# to the LLM. It asks the model to act as a video-editing expert, read shot
# descriptions grouped per camera inside <CAMERA_SEQ>/<CAMERA_N> tags, and answer with
# a JSON object whose "camera_tree" list holds, for every camera, the parent camera
# index, the parent shot index, a reason, a full-coverage flag and any missing info.
# NOTE(review): the prompt tells the model to emit None/True/False, which are not
# valid JSON literals; presumably string_to_json tolerates them — confirm.
# NOTE(review): the example below labels shots as "shot N：" while the code formats
# them as "Shot N: " — harmless for the model, but worth aligning eventually.
system_prompt_select_reference_camera = \
"""
[角色]  
你是一位专业的视频剪辑专家，擅长多机位镜头分析与场景结构建模。你深谙影视语言，能够理解景别（如全景、中景、特写）与内容包含关系，并能根据镜头描述推断机位间的层级结构。  

[任务]  
你的任务是分析输入的机位数据，构建"机位树"。该树状结构表示父机位内容包含子机位内容的关系。具体而言，你需要为每个机位识别其父机位（若存在），并确定依赖镜头索引（即父机位素材中包含子机位内容的具体镜头）。若某机位无父机位，则输出None。  

[输入]  
输入为一系列机位数据，序列由<CAMERA_SEQ>和</CAMERA_SEQ>包裹。  
每个机位包含该机位拍摄的镜头序列，由<CAMERA_N>和</CAMERA_N>包裹，其中N为机位索引。  

以下为输入格式示例：  

<CAMERA_SEQ>  
<CAMERA_0>  
shot 0：街道中景。爱丽丝和鲍勃相向而行。  
shot 2：街道中景。爱丽丝和鲍勃相拥。  
</CAMERA_0>  
<CAMERA_1>  
shot 1：爱丽丝面部特写。她认出鲍勃时表情从惊讶转为欣喜。  
</CAMERA_1>  
</CAMERA_SEQ>  

[输出]
严格遵循以下JSON格式输出：
```json
{
    "camera_tree": [
        {
            "parent_cam_idx": // 父机位的索引。如果机位没有父级（例如根机位），则设置为**None**。例如：0、1、None。
            "parent_shot_idx": // 依赖镜头的索引。如果机位没有父级（例如根机位），则设置为**None**。例如：0、3、None。
            "reason": // 选择父机位的原因。如果机位没有父级，应解释为什么它是根机位。例如：父机位的视野涵盖了子机位的视野（从中景到特写）
            "is_parent_fully_covers_child": // 父机位是否完全覆盖子机位的内容。如果机位没有父级，则设置为**None**。例如：True、False、None。
            "missing_info": // 子镜头中父镜头未涵盖的缺失元素。如果父镜头完全覆盖子镜头，则设置为**None**。例如：罗宇尘的正面视角、None。
        },
        // 更多机位的父机位信息
    ]
}
```

[要求]
- 所有输出值（不包括键名）的语言必须与输入语言保持一致。
- 内容包容性检查：父机位应尽可能在特定画面中完全包含子机位内容（例如父中景双人镜头应涵盖子过肩反打镜头）。通过对比关键词（如角色、动作、场景）分析镜头描述，确保父镜头视场能覆盖子镜头。
- 过渡流畅度优先：优先选择更大景别作为父机位，例如全景→中景或中景→特写。相邻父子节点的景别差异应尽可能小，严禁直接从远景跳切到特写（除非绝对必要）。
- 时间邻近性：每个机位由其对应的首个画面描述确定父机位位置，父机位的画面索引应尽可能接近子机位的首个画面索引。
- 逻辑一致性：机位树必须无环，避免循环依赖。若某镜头被多个潜在父机位包含，则选择最佳匹配（基于景别和内容）。若无合适父机位则输出None。
- 当缺乏更广视角时，选择视场重叠最大的镜头作为父镜头（信息重合度最高者），或正反打镜头可互为父子。当两个机位可互为父子时，索引较小者作为索引较大者的父机位。
- 仅允许存在一个无父机位的根机位。
- 描述镜头缺失元素时，需仔细比对父子镜头细节。例如父镜头是角色A与B侧身相对的中景，子镜头是角色A的正脸特写时，需注明子镜头缺失角色A的正面视角信息。
- 首个机位必须作为机位树的根节点。
- **camera_tree**中每个元素代表一个机位的父机位信息；如果机位没有父级（例如根机位），则设置为None。列表的长度应与机位数量相同。
"""

# User-message template: the `{camera_seq_str}` placeholder is filled through
# str.format with the per-camera <CAMERA_N> blocks, wrapped in <CAMERA_SEQ> tags.
human_prompt_select_reference_camera = \
"""
<CAMERA_SEQ>
{camera_seq_str}
</CAMERA_SEQ>
"""

class CameraTreeCreator:
    """Ask an LLM to arrange the cameras of a storyboard into a parent/child tree.

    Shots are grouped by the camera that filmed them, rendered into the
    ``<CAMERA_N>`` text format expected by the system prompt, sent to the
    model, and the parsed reply is annotated with each camera's shot indices.
    """

    def __init__(self) -> None:
        pass

    def create_camera_tree(
        self,
        shot_descriptions: list[Dict[str, Any]],
        output_path: str = "./camera_tree.json",
    ):
        """Build the camera tree for ``shot_descriptions`` and save it as JSON.

        Args:
            shot_descriptions: One dict per shot. Each must provide ``"idx"``
                (shot index), ``"cam_idx"`` (index of the camera that filmed
                it) and ``"visual_desc"`` (text description of the shot).
            output_path: Where the resulting tree is written. Defaults to the
                previously hard-coded ``./camera_tree.json``.

        Returns:
            The parsed model reply. Every entry of its ``"camera_tree"`` list
            gains an ``"active_shot_idxs"`` key holding the shot indices of
            the camera at the same position (cameras are ordered by first
            appearance in ``shot_descriptions``).

        Raises:
            KeyError: If a shot lacks a required key or the reply has no
                ``"camera_tree"`` entry.
            IndexError: If the reply lists more cameras than were sent.
        """
        cameras = self._group_shots_by_camera(shot_descriptions)
        camera_seq_str = self._format_camera_seq(cameras, shot_descriptions)

        user_prompt = human_prompt_select_reference_camera.format(camera_seq_str=camera_seq_str)
        system_prompt = system_prompt_select_reference_camera

        # Named `raw_reply` rather than `response` so the module-level
        # `from urllib3 import response` import is not shadowed.
        raw_reply = media_captioner.generate_text_understanding(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
        )

        camera_tree = string_to_json(raw_reply)
        # Reply entries are matched to cameras by position: the prompt asks
        # for exactly one entry per camera, in the order they were sent.
        for position, node in enumerate(camera_tree["camera_tree"]):
            node["active_shot_idxs"] = cameras[position]["active_shot_idxs"]

        save_json_file(camera_tree, output_path)

        return camera_tree

    @staticmethod
    def _group_shots_by_camera(
        shot_descriptions: list[Dict[str, Any]],
    ) -> list[Dict[str, Any]]:
        """Group shot indices per camera, keeping cameras in first-seen order."""
        # Keyed by cam_idx so the lookup stays correct when camera indices are
        # non-contiguous or do not first appear in ascending order (indexing a
        # list by cam_idx is only right when cam_idx equals list position).
        by_cam_idx: Dict[Any, Dict[str, Any]] = {}
        for shot in shot_descriptions:
            cam_idx = shot["cam_idx"]
            camera = by_cam_idx.setdefault(
                cam_idx, {"idx": cam_idx, "active_shot_idxs": []}
            )
            camera["active_shot_idxs"].append(shot["idx"])
        return list(by_cam_idx.values())

    @staticmethod
    def _format_camera_seq(
        cameras: list[Dict[str, Any]],
        shot_descriptions: list[Dict[str, Any]],
    ) -> str:
        """Render the cameras and their shots as ``<CAMERA_N>`` text blocks."""
        # Look descriptions up by the shot's own "idx" instead of assuming the
        # shot index equals its position in `shot_descriptions`.
        desc_by_shot_idx = {
            shot["idx"]: shot["visual_desc"] for shot in shot_descriptions
        }
        parts = []
        for cam in cameras:
            parts.append(f"<CAMERA_{cam['idx']}>\n")
            parts.extend(
                f"Shot {shot_idx}: {desc_by_shot_idx[shot_idx]}\n"
                for shot_idx in cam["active_shot_idxs"]
            )
            parts.append(f"</CAMERA_{cam['idx']}>\n")
        return "".join(parts)

# Module-level instance; the __main__ entry point below calls it directly.
camera_tree_creator = CameraTreeCreator()

if __name__ == "__main__":
    # Manual run: read a storyboard from ./output.json and build its camera tree.
    # The encoding is pinned to UTF-8 (the JSON interchange default) because the
    # storyboard text is likely non-ASCII and the platform default encoding
    # (e.g. on Windows) may not be able to decode it.
    with open("./output.json", "r", encoding="utf-8") as f:
        shot_descriptions = json.load(f)

    shot = shot_descriptions["storyboard"]
    camera_tree_creator.create_camera_tree(shot)