From c70b45bd39881237de1957a0e8134eedf9516bb6 Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Wed, 29 Jan 2025 12:50:29 +0100
Subject: [PATCH] Feat/improve content analyser (#861)

* Introduce fit_frame to improve content analyser, rename resize_frame_resolution to restrict_frame

* Fix CI, Add some spaces

* Normalize according to face detector
---
 facefusion/content_analyser.py       |  9 +++------
 facefusion/face_detector.py          | 22 ++++++++++++++++------
 facefusion/uis/components/preview.py |  6 +++---
 facefusion/vision.py                 | 24 ++++++++++++++++++++----
 4 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/facefusion/content_analyser.py b/facefusion/content_analyser.py
index 151bf67..7e2bbf8 100644
--- a/facefusion/content_analyser.py
+++ b/facefusion/content_analyser.py
@@ -9,7 +9,7 @@ from facefusion.download import conditional_download_hashes, conditional_downloa
 from facefusion.filesystem import resolve_relative_path
 from facefusion.thread_helper import conditional_thread_semaphore
 from facefusion.typing import Detection, DownloadScope, Fps, InferencePool, ModelOptions, ModelSet, Score, VisionFrame
-from facefusion.vision import detect_video_fps, read_image, read_video_frame, resize_frame_resolution
+from facefusion.vision import detect_video_fps, fit_frame, read_image, read_video_frame
 
 STREAM_COUNTER = 0
 
@@ -106,7 +106,7 @@ def analyse_video(video_path : str, trim_frame_start : int, trim_frame_end : int
 def detect_nsfw(vision_frame : VisionFrame) -> List[Score]:
 	nsfw_scores = []
 	model_size = get_model_options().get('size')
-	temp_vision_frame = resize_frame_resolution(vision_frame, model_size)
+	temp_vision_frame = fit_frame(vision_frame, model_size)
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame)
 	detection = forward(detect_vision_frame)
 	detection = numpy.squeeze(detection).T
@@ -133,9 +133,6 @@ def forward(vision_frame : VisionFrame) -> Detection:
 
 
 def prepare_detect_frame(temp_vision_frame : VisionFrame) -> VisionFrame:
-	model_size = get_model_options().get('size')
-	detect_vision_frame = numpy.zeros((model_size[0], model_size[1], 3))
-	detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
-	detect_vision_frame = detect_vision_frame / 255.0
+	detect_vision_frame = temp_vision_frame / 255.0
 	detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
 	return detect_vision_frame
diff --git a/facefusion/face_detector.py b/facefusion/face_detector.py
index 5f57622..c5d29d2 100644
--- a/facefusion/face_detector.py
+++ b/facefusion/face_detector.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import List, Tuple
+from typing import List, Sequence, Tuple
 
 import cv2
 import numpy
@@ -10,7 +10,7 @@ from facefusion.face_helper import create_rotated_matrix_and_size, create_static
 from facefusion.filesystem import resolve_relative_path
 from facefusion.thread_helper import thread_semaphore
 from facefusion.typing import Angle, BoundingBox, Detection, DownloadScope, DownloadSet, FaceLandmark5, InferencePool, ModelSet, Score, VisionFrame
-from facefusion.vision import resize_frame_resolution, unpack_resolution
+from facefusion.vision import restrict_frame, unpack_resolution
 
 
 @lru_cache(maxsize = None)
@@ -151,10 +151,11 @@ def detect_with_retinaface(vision_frame : VisionFrame, face_detector_size : str)
 	anchor_total = 2
 	face_detector_score = state_manager.get_item('face_detector_score')
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
-	temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
+	temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
 	ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
 	ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
+	detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
 	detection = forward_with_retinaface(detect_vision_frame)
 
 	for index, feature_stride in enumerate(feature_strides):
@@ -194,10 +195,11 @@ def detect_with_scrfd(vision_frame : VisionFrame, face_detector_size : str) -> T
 	anchor_total = 2
 	face_detector_score = state_manager.get_item('face_detector_score')
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
-	temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
+	temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
 	ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
 	ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
+	detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
 	detection = forward_with_scrfd(detect_vision_frame)
 
 	for index, feature_stride in enumerate(feature_strides):
@@ -234,10 +236,11 @@ def detect_with_yolo_face(vision_frame : VisionFrame, face_detector_size : str)
 	face_landmarks_5 = []
 	face_detector_score = state_manager.get_item('face_detector_score')
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
-	temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
+	temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
 	ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
 	ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
+	detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ 0, 1 ])
 	detection = forward_with_yolo_face(detect_vision_frame)
 	detection = numpy.squeeze(detection).T
 	bounding_boxes_raw, face_scores_raw, face_landmarks_5_raw = numpy.split(detection, [ 4, 5 ], axis = 1)
@@ -305,6 +308,13 @@ def prepare_detect_frame(temp_vision_frame : VisionFrame, face_detector_size : s
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
 	detect_vision_frame = numpy.zeros((face_detector_height, face_detector_width, 3))
 	detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
-	detect_vision_frame = (detect_vision_frame - 127.5) / 128.0
 	detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
 	return detect_vision_frame
+
+
+def normalize_detect_frame(detect_vision_frame : VisionFrame, normalize_range : Sequence[int]) -> VisionFrame:
+	if normalize_range == [ -1, 1 ]:
+		return (detect_vision_frame - 127.5) / 128.0
+	if normalize_range == [ 0, 1 ]:
+		return detect_vision_frame / 255.0
+	return detect_vision_frame
diff --git a/facefusion/uis/components/preview.py b/facefusion/uis/components/preview.py
index e3fb85b..8b8cfc7 100755
--- a/facefusion/uis/components/preview.py
+++ b/facefusion/uis/components/preview.py
@@ -18,7 +18,7 @@ from facefusion.processors.core import get_processors_modules
 from facefusion.typing import AudioFrame, Face, FaceSet, VisionFrame
 from facefusion.uis.core import get_ui_component, get_ui_components, register_ui_component
 from facefusion.uis.typing import ComponentOptions
-from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, resize_frame_resolution
+from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, restrict_frame
 
 PREVIEW_IMAGE : Optional[gradio.Image] = None
 PREVIEW_FRAME_SLIDER : Optional[gradio.Slider] = None
@@ -185,7 +185,7 @@ def clear_and_update_preview_image(frame_number : int = 0) -> gradio.Image:
 def slide_preview_image(frame_number : int = 0) -> gradio.Image:
 	if is_video(state_manager.get_item('target_path')):
 		preview_vision_frame = normalize_frame_color(read_video_frame(state_manager.get_item('target_path'), frame_number))
-		preview_vision_frame = resize_frame_resolution(preview_vision_frame, (1024, 1024))
+		preview_vision_frame = restrict_frame(preview_vision_frame, (1024, 1024))
 		return gradio.Image(value = preview_vision_frame)
 	return gradio.Image(value = None)
 
@@ -237,7 +237,7 @@ def update_preview_frame_slider() -> gradio.Slider:
 
 
 def process_preview_frame(reference_faces : FaceSet, source_face : Face, source_audio_frame : AudioFrame, target_vision_frame : VisionFrame) -> VisionFrame:
-	target_vision_frame = resize_frame_resolution(target_vision_frame, (1024, 1024))
+	target_vision_frame = restrict_frame(target_vision_frame, (1024, 1024))
 	source_vision_frame = target_vision_frame.copy()
 	if analyse_frame(target_vision_frame):
 		return cv2.GaussianBlur(target_vision_frame, (99, 99), 0)
diff --git a/facefusion/vision.py b/facefusion/vision.py
index 02b7aed..f5a0bf7 100644
--- a/facefusion/vision.py
+++ b/facefusion/vision.py
@@ -217,18 +217,31 @@ def detect_frame_orientation(vision_frame : VisionFrame) -> Orientation:
 	return 'portrait'
 
 
-def resize_frame_resolution(vision_frame : VisionFrame, max_resolution : Resolution) -> VisionFrame:
+def restrict_frame(vision_frame : VisionFrame, resolution : Resolution) -> VisionFrame:
 	height, width = vision_frame.shape[:2]
-	max_width, max_height = max_resolution
+	restrict_width, restrict_height = resolution
 
-	if height > max_height or width > max_width:
-		scale = min(max_height / height, max_width / width)
+	if height > restrict_height or width > restrict_width:
+		scale = min(restrict_height / height, restrict_width / width)
 		new_width = int(width * scale)
 		new_height = int(height * scale)
 		return cv2.resize(vision_frame, (new_width, new_height))
 	return vision_frame
 
 
+def fit_frame(vision_frame: VisionFrame, resolution: Resolution) -> VisionFrame:
+	fit_width, fit_height = resolution
+	height, width = vision_frame.shape[:2]
+	scale = min(fit_height / height, fit_width / width)
+	new_width = int(width * scale)
+	new_height = int(height * scale)
+	paste_vision_frame = cv2.resize(vision_frame, (new_width, new_height))
+	x_pad = (fit_width - new_width) // 2
+	y_pad = (fit_height - new_height) // 2
+	temp_vision_frame = numpy.pad(paste_vision_frame, ((y_pad, fit_height - new_height - y_pad), (x_pad, fit_width - new_width - x_pad), (0, 0)))
+	return temp_vision_frame
+
+
 def normalize_frame_color(vision_frame : VisionFrame) -> VisionFrame:
 	return cv2.cvtColor(vision_frame, cv2.COLOR_BGR2RGB)
 
@@ -283,10 +296,12 @@ def create_tile_frames(vision_frame : VisionFrame, size : Size) -> Tuple[List[Vi
 	for row_vision_frame in row_range:
 		top = row_vision_frame - size[2]
 		bottom = row_vision_frame + size[2] + tile_width
+
 		for column_vision_frame in col_range:
 			left = column_vision_frame - size[2]
 			right = column_vision_frame + size[2] + tile_width
 			tile_vision_frames.append(pad_vision_frame[top:bottom, left:right, :])
+
 	return tile_vision_frames, pad_width, pad_height
 
 
@@ -304,5 +319,6 @@ def merge_tile_frames(tile_vision_frames : List[VisionFrame], temp_width : int,
 		left = col_index * tile_vision_frame.shape[1]
 		right = left + tile_vision_frame.shape[1]
 		merge_vision_frame[top:bottom, left:right, :] = tile_vision_frame
+
 	merge_vision_frame = merge_vision_frame[size[1] : size[1] + temp_height, size[1]: size[1] + temp_width, :]
 	return merge_vision_frame