| | import os
|
| | import subprocess
|
| | import gradio as gr
|
| | from retinaface import RetinaFace
|
| | from PIL import Image
|
| | import filetype
|
| | from datetime import datetime
|
| | import re
|
| | import sys
|
| | import torch
|
| | import argparse
|
| |
|
| | import platform, os
|
| |
|
def open_folder():
    """Open the local ``outputs`` directory in the OS file browser.

    Supports Windows, macOS (Darwin) and Linux; silently does nothing on
    any other platform.  (The original handled only Windows and Linux.)
    """
    open_folder_path = os.path.abspath("outputs")
    system = platform.system()
    if system == "Windows":
        os.startfile(open_folder_path)
    elif system == "Darwin":
        # macOS: `open` reveals the folder in Finder.
        subprocess.call(["open", open_folder_path])
    elif system == "Linux":
        os.system(f'xdg-open "{open_folder_path}"')
|
| |
|
| |
|
| |
|
# Interpreter running this app; used to spawn helper scripts so the
# subprocesses run in the same (possibly virtualenv) environment.
python_executable = sys.executable
|
| |
|
def display_media(file):
    """Route an uploaded file to the matching preview widget.

    Returns a pair of Gradio updates ``(video_update, audio_update)``:
    the video preview is shown for video MIME types, the audio preview
    for audio MIME types, and both are hidden for anything else (or when
    no file / an unrecognizable file was supplied).
    """
    if file is None:
        return gr.update(visible=False), gr.update(visible=False)

    kind = filetype.guess(file.name)
    if kind is None:
        return gr.update(visible=False), gr.update(visible=False)

    mime = kind.mime
    if mime.startswith('video'):
        return gr.update(value=file.name, visible=True), gr.update(visible=False)
    if mime.startswith('audio'):
        return gr.update(visible=False), gr.update(value=file.name, visible=True)
    return gr.update(visible=False), gr.update(visible=False)
|
| |
|
| |
|
def _str_to_bool(value):
    """Interpret common truthy CLI strings ("true", "1", "yes", ...) as True.

    Fixes the original ``type=str`` flag: any non-empty string — including
    "False" — was truthy, so ``--share False`` still shared the app.
    """
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("true", "1", "yes", "y", "on")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--share",
    type=_str_to_bool,
    default=False,
    help="Set to True to share the app publicly.",
)
args = parser.parse_args()
|
| |
|
| |
|
| |
|
def extract_audio(video_path, audio_path):
    """Extract the audio track of *video_path* into an MP3 at *audio_path*.

    Bug fix: the original ran ``python -m ffmpeg ...``, but ffmpeg is a
    standalone binary, not a runnable Python module — the call could never
    work.  Invoke the ``ffmpeg`` CLI directly, consistent with
    ``convert_audio_to_mp3`` elsewhere in this file.
    """
    command = ["ffmpeg", "-i", video_path, "-vn",
               "-acodec", "libmp3lame", "-q:a", "2", audio_path]
    subprocess.call(command)
|
| |
|
| |
|
def convert_audio_to_mp3(audio_path, mp3_path):
    """Re-encode *audio_path* as an MP3 at *mp3_path* via the ffmpeg CLI."""
    subprocess.call([
        "ffmpeg",
        "-i", audio_path,
        "-acodec", "libmp3lame",
        "-q:a", "2",
        mp3_path,
    ])
|
| |
|
def crop_and_save_image(image_path, auto_crop, crop_width, crop_height, crop_expansion):
    """Face-crop *image_path* and save the result under outputs/cropped_images.

    Returns the path of the saved PNG, or None when no face was detected.
    NOTE(review): the *auto_crop* flag is accepted for interface
    compatibility but is not consulted here — confirm against callers.
    """
    cropped_image = auto_crop_image(image_path, crop_expansion,
                                    crop_size=(crop_width, crop_height))
    if cropped_image is None:
        return None

    cropped_folder = os.path.join("outputs", "cropped_images")
    os.makedirs(cropped_folder, exist_ok=True)

    base_name, extension = os.path.splitext(os.path.basename(image_path))

    # Probe for the first free <name>_NNNN suffix so existing crops are
    # never overwritten.
    counter = 1
    while True:
        cropped_image_path = os.path.join(
            cropped_folder, f"{base_name}_{counter:04d}{extension}")
        if not os.path.exists(cropped_image_path):
            break
        counter += 1

    cropped_image.save(cropped_image_path, format='PNG')
    return cropped_image_path
|
| |
|
| |
|
def generate_kps_sequence_and_audio(video_path, kps_sequence_save_path, audio_save_path):
    """Run the project's extraction script to pull a keypoint sequence and
    audio track out of *video_path*, saving them at the given paths."""
    subprocess.call([
        python_executable, "scripts/extract_kps_sequence_and_audio.py",
        "--video_path", video_path,
        "--kps_sequence_save_path", kps_sequence_save_path,
        "--audio_save_path", audio_save_path,
    ])
|
| |
|
def auto_crop_image(image_path, expand_percent, crop_size=(512, 512)):
    """Crop *image_path* around the first detected face and resize it.

    The file at *image_path* is OVERWRITTEN with the final crop.  Three
    debug PNGs (assumed_head.png, expanded_face.png, final_cropped_img.png)
    are written to the current working directory along the way.

    Args:
        image_path: Path of the image to crop (modified in place).
        expand_percent: Extra margin around the estimated head box, as a
            fraction (e.g. 0.15 adds 15%).
        crop_size: Target (width, height) of the returned image.

    Returns:
        The resized PIL.Image, or None when no face was detected.
    """
    # NOTE(review): the CUDA probe only informs the user — RetinaFace picks
    # its own backend.  (The original assigned an unused `device` variable
    # here and also read the two mouth landmarks without using them; both
    # dead locals have been removed.)
    if torch.cuda.is_available():
        print("Using GPU for RetinaFace detection.")
    else:
        print("Using CPU for RetinaFace detection.")

    img = Image.open(image_path)

    faces = RetinaFace.detect_faces(image_path)
    if not faces:
        print("No faces detected.")
        return None

    # Only the first detected face is used.
    face = list(faces.values())[0]
    landmarks = face['landmarks']

    right_eye = landmarks['right_eye']
    left_eye = landmarks['left_eye']

    # Heuristic head box sized from the inter-eye distance.
    eye_distance = abs(right_eye[0] - left_eye[0])
    head_width = eye_distance * 4.5
    head_height = eye_distance * 6.5

    eye_center_x = (right_eye[0] + left_eye[0]) // 2
    eye_center_y = (right_eye[1] + left_eye[1]) // 2

    # Clamp the head box to the image bounds.
    head_left = max(0, int(eye_center_x - head_width // 2))
    head_top = max(0, int(eye_center_y - head_height // 2))
    head_right = min(img.width, int(eye_center_x + head_width // 2))
    head_bottom = min(img.height, int(eye_center_y + head_height // 2))

    assumed_head_img = img.crop((head_left, head_top, head_right, head_bottom))
    assumed_head_img.save("assumed_head.png", format='PNG')

    # Expand the box by expand_percent, again clamped to the image.
    expanded_w = int(head_width * (1 + expand_percent))
    expanded_h = int(head_height * (1 + expand_percent))

    center_x, center_y = head_left + head_width // 2, head_top + head_height // 2
    left = max(0, center_x - expanded_w // 2)
    right = min(img.width, center_x + expanded_w // 2)
    top = max(0, center_y - expanded_h // 2)
    bottom = min(img.height, center_y + expanded_h // 2)

    cropped_img = img.crop((left, top, right, bottom))
    cropped_img.save("expanded_face.png", format='PNG')

    # Center-crop to the target aspect ratio first so the final resize
    # does not distort the face.
    cropped_width, cropped_height = cropped_img.size
    aspect_ratio = cropped_width / cropped_height

    target_width = crop_size[0]
    target_height = crop_size[1]

    if aspect_ratio > target_width / target_height:
        # Too wide: trim equally from left and right.
        new_width = int(cropped_height * target_width / target_height)
        left_crop = (cropped_width - new_width) // 2
        right_crop = left_crop + new_width
        top_crop = 0
        bottom_crop = cropped_height
    else:
        # Too tall: trim equally from top and bottom.
        new_height = int(cropped_width * target_height / target_width)
        top_crop = (cropped_height - new_height) // 2
        bottom_crop = top_crop + new_height
        left_crop = 0
        right_crop = cropped_width

    final_cropped_img = cropped_img.crop((left_crop, top_crop, right_crop, bottom_crop))
    final_cropped_img.save("final_cropped_img.png", format='PNG')

    resized_img = final_cropped_img.resize(crop_size, resample=Image.LANCZOS)

    # Overwrite the source image with the final crop.
    resized_img.save(image_path, format='PNG')
    return resized_img
|
| |
|
| |
|
def generate_output_video(reference_image_path, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion, image_width, image_height, low_vram):
    """Synthesize the talking-head video by running inference.py as a subprocess.

    Optionally auto-crops the reference image in place first.  The exact
    command line is mirrored to executed_command.txt for troubleshooting.

    Returns:
        (output_path, reference_image_path) — echoed back for the UI.
    """
    print("auto cropping...")
    if auto_crop:
        auto_crop_image(reference_image_path, crop_expansion,
                        crop_size=(crop_width, crop_height))

    print("starting inference...")
    command = [python_executable, "inference.py"]
    command += ["--reference_image_path", reference_image_path]
    command += ["--audio_path", audio_path]
    command += ["--kps_path", kps_path]
    command += ["--output_path", output_path]
    command += ["--retarget_strategy", retarget_strategy]
    command += ["--num_inference_steps", str(num_inference_steps)]
    command += ["--reference_attention_weight", str(reference_attention_weight)]
    command += ["--audio_attention_weight", str(audio_attention_weight)]
    command += ["--image_width", str(image_width)]
    command += ["--image_height", str(image_height)]
    if low_vram:
        command.append("--save_gpu_memory")

    # Keep a copy of the exact invocation for debugging.
    with open("executed_command.txt", "w") as file:
        file.write(" ".join(command))

    subprocess.call(command)
    return output_path, reference_image_path
|
| |
|
def sanitize_folder_name(name):
    """Return *name* with characters that are illegal in Windows file
    names (plus control characters) replaced by underscores."""
    return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', name)
|
| |
|
| |
|
def process_input(reference_image, target_input, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion,image_width,image_height,low_vram):
    """Gradio click handler: prepare audio/keypoint inputs for the target
    file, pick a fresh output path, and run the generation pipeline.

    Returns:
        (output_video_path, cropped_image_path) as produced by
        generate_output_video().

    Raises:
        ValueError: if the target file's type cannot be sniffed or is
            neither video nor audio.
    """
    # Per-run scratch directory named after the reference image + timestamp.
    temp_process_dir = "temp_process"
    os.makedirs(temp_process_dir, exist_ok=True)

    input_file_name = os.path.splitext(os.path.basename(reference_image))[0]
    input_file_name=sanitize_folder_name(input_file_name)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    temp_dir = os.path.join(temp_process_dir, f"{input_file_name}_{timestamp}")
    os.makedirs(temp_dir, exist_ok=True)

    # Sniff the target file's real type from its bytes, not its extension.
    kind = filetype.guess(target_input)
    if not kind:
        raise ValueError("Cannot determine file type. Please provide a valid video or audio file.")

    mime_type = kind.mime

    if mime_type.startswith("video/"):
        # Video target: extract both the keypoint sequence and its audio.
        audio_path = os.path.join(temp_dir, "target_audio.mp3")
        kps_path = os.path.join(temp_dir, "kps.pth")
        print("generating generate_kps_sequence_and_audio...")
        generate_kps_sequence_and_audio(target_input, kps_path, audio_path)
    elif mime_type.startswith("audio/"):
        # Audio target: no keypoints; re-encode to MP3 unless already MP3.
        audio_path = target_input
        if mime_type != "audio/mpeg":
            mp3_path = os.path.join(temp_dir, "target_audio_converted.mp3")
            convert_audio_to_mp3(target_input, mp3_path)
            audio_path = mp3_path
        kps_path = ""
    else:
        raise ValueError("Unsupported file type. Please provide a video or audio file.")

    # Pick the first free outputs/<name>_result_NNNN.mp4 path.
    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_file_name = f"{input_file_name}_result_"
    output_file_name=sanitize_folder_name(output_file_name)
    output_file_ext = ".mp4"
    output_file_count = 1
    while os.path.exists(os.path.join(output_dir, f"{output_file_name}{output_file_count:04d}{output_file_ext}")):
        output_file_count += 1
    output_path = os.path.join(output_dir, f"{output_file_name}{output_file_count:04d}{output_file_ext}")

    output_video_path, cropped_image_path = generate_output_video(reference_image, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop,crop_width,crop_height, crop_expansion,image_width,image_height,low_vram)

    return output_video_path, cropped_image_path
|
| |
|
def launch_interface():
    """Build the Gradio Blocks UI and launch the app.

    Layout: three columns — (1) reference image plus generation/crop
    controls, (2) target video/audio upload with previews and help text,
    (3) generated video and cropped-image outputs.
    """
    retarget_strategies = ["fix_face", "no_retarget", "offset_retarget", "naive_retarget"]

    with gr.Blocks() as demo:
        gr.Markdown("# Tencent AI Lab - V-Express Image to Animation V4 : https://www.patreon.com/posts/105251204")
        with gr.Row():
            # Column 1: reference image and all tuning controls.
            with gr.Column():
                input_image = gr.Image(label="Reference Image", format="png", type="filepath", height=512)
                generate_button = gr.Button("Generate Talking Video")
                # Hidden toggle; forwarded to inference as --save_gpu_memory.
                low_vram = gr.Checkbox(label="Low VRAM - Greatly reduces VRAM usage but takes longer", value=False,visible=False)
                crop_button = gr.Button("Crop Image")
                with gr.Row():
                    with gr.Column(min_width=0):
                        image_width = gr.Number(label="Target Video Width", value=512)
                    with gr.Column(min_width=0):
                        image_height = gr.Number(label="Target Video Height", value=512)

                with gr.Row():
                    with gr.Column(min_width=0):
                        retarget_strategy = gr.Dropdown(retarget_strategies, label="Retarget Strategy", value="fix_face")
                    with gr.Column(min_width=0):
                        inference_steps = gr.Slider(10, 90, step=1, label="Number of Inference Steps", value=30)

                with gr.Row():
                    with gr.Column(min_width=0):
                        reference_attention = gr.Slider(0.80, 1.1, step=0.01, label="Reference Attention Weight", value=0.95)
                    with gr.Column(min_width=0):
                        audio_attention = gr.Slider(1.0, 5.0, step=0.1, label="Audio Attention Weight", value=3.0)

                with gr.Row(visible=True) as crop_size_row:
                    with gr.Column(min_width=0):
                        auto_crop = gr.Checkbox(label="Auto Crop Image", value=True)
                    with gr.Column(min_width=0):
                        crop_expansion = gr.Slider(0.0, 1.0, step=0.01, label="Face Focus Expansion Percent", value=0.15)
                with gr.Row():
                    with gr.Column(min_width=0):
                        crop_width = gr.Number(label="Crop Width", value=512)
                    with gr.Column(min_width=0):
                        crop_height = gr.Number(label="Crop Height", value=512)

            # Column 2: target media upload, previews and usage notes.
            with gr.Column():
                input_video = gr.File(
                    label="Target Input (Image or Video)",
                    type="filepath",
                    file_count="single",
                    file_types=[
                        ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv", ".webm",
                        ".3gp", ".m4v", ".mpg", ".mpeg", ".m2v", ".m4v", ".mts",
                        ".mp3", ".wav", ".aac", ".flac", ".m4a", ".wma", ".ogg"
                    ],
                    height=512 )
                # Hidden previews; display_media() reveals the right one.
                video_output = gr.Video(visible=False)
                audio_output = gr.Audio(visible=False)

                input_video.change(display_media, inputs=input_video, outputs=[video_output, audio_output])
                btn_open_outputs = gr.Button("Open Outputs Folder")
                btn_open_outputs.click(fn=open_folder)
                gr.Markdown("""
                Retarget Strategies

                Only target audio : fix_face

                Input picture and target video (same person - best practice) select : no_retarget

                Input picture and target video (different person) select : offset_retarget or naive_retarget

                Please look examples in Tests folder to see which settings you like most. I feel like offset_retarget is best

                You can turn up reference_attention_weight to make the model maintain higher character consistency, and turn down audio_attention_weight to reduce mouth artifacts. E.g. setting both values to 1.0
                """)

            # Column 3: pipeline outputs.
            with gr.Column():
                output_video = gr.Video(label="Generated Video", height=512)
                output_image = gr.Image(label="Cropped Image")

        generate_button.click(
            fn=process_input,
            inputs=[
                input_image,
                input_video,
                retarget_strategy,
                inference_steps,
                reference_attention,
                audio_attention,
                auto_crop,
                crop_width,
                crop_height,
                crop_expansion,
                image_width,
                image_height,
                low_vram
            ],
            outputs=[output_video, output_image]
        )

        crop_button.click(
            fn=crop_and_save_image,
            inputs=[
                input_image,
                auto_crop,
                crop_width,
                crop_height,
                crop_expansion
            ],
            outputs=output_image
        )

    demo.queue()
    demo.launch(inbrowser=True,share=args.share)
|
| |
|
| |
|
# Build and start the Gradio app when this module is executed.
launch_interface()