From c70b45bd39881237de1957a0e8134eedf9516bb6 Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Wed, 29 Jan 2025 12:50:29 +0100
Subject: [PATCH] Feat/improve content analyser (#861)

* Introduce fit_frame to improve content analyser, rename resize_frame_resolution to restrict_frame

* Fix CI, Add some spaces

* Normalize according to face detector
---
 facefusion/content_analyser.py       |  9 +++------
 facefusion/face_detector.py          | 22 ++++++++++++++++------
 facefusion/uis/components/preview.py |  6 +++---
 facefusion/vision.py                 | 24 ++++++++++++++++++++----
 4 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/facefusion/content_analyser.py b/facefusion/content_analyser.py
index 151bf67..7e2bbf8 100644
--- a/facefusion/content_analyser.py
+++ b/facefusion/content_analyser.py
@@ -9,7 +9,7 @@ from facefusion.download import conditional_download_hashes, conditional_downloa
 from facefusion.filesystem import resolve_relative_path
 from facefusion.thread_helper import conditional_thread_semaphore
 from facefusion.typing import Detection, DownloadScope, Fps, InferencePool, ModelOptions, ModelSet, Score, VisionFrame
-from facefusion.vision import detect_video_fps, read_image, read_video_frame, resize_frame_resolution
+from facefusion.vision import detect_video_fps, fit_frame, read_image, read_video_frame
 
 STREAM_COUNTER = 0
 
@@ -106,7 +106,7 @@ def analyse_video(video_path : str, trim_frame_start : int, trim_frame_end : int
 def detect_nsfw(vision_frame : VisionFrame) -> List[Score]:
 	nsfw_scores = []
 	model_size = get_model_options().get('size')
-	temp_vision_frame = resize_frame_resolution(vision_frame, model_size)
+	temp_vision_frame = fit_frame(vision_frame, model_size)
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame)
 	detection = forward(detect_vision_frame)
 	detection = numpy.squeeze(detection).T
@@ -133,9 +133,6 @@ def forward(vision_frame : VisionFrame) -> Detection:
 
 
 def prepare_detect_frame(temp_vision_frame : VisionFrame) -> VisionFrame:
-	model_size = get_model_options().get('size')
-	detect_vision_frame = numpy.zeros((model_size[0], model_size[1], 3))
-	detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
-	detect_vision_frame = detect_vision_frame / 255.0
+	detect_vision_frame = temp_vision_frame / 255.0
 	detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
 	return detect_vision_frame
diff --git a/facefusion/face_detector.py b/facefusion/face_detector.py
index 5f57622..c5d29d2 100644
--- a/facefusion/face_detector.py
+++ b/facefusion/face_detector.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import List, Tuple
+from typing import List, Sequence, Tuple
 
 import cv2
 import numpy
@@ -10,7 +10,7 @@ from facefusion.face_helper import create_rotated_matrix_and_size, create_static
 from facefusion.filesystem import resolve_relative_path
 from facefusion.thread_helper import thread_semaphore
 from facefusion.typing import Angle, BoundingBox, Detection, DownloadScope, DownloadSet, FaceLandmark5, InferencePool, ModelSet, Score, VisionFrame
-from facefusion.vision import resize_frame_resolution, unpack_resolution
+from facefusion.vision import restrict_frame, unpack_resolution
 
 
 @lru_cache(maxsize = None)
@@ -151,10 +151,11 @@ def detect_with_retinaface(vision_frame : VisionFrame, face_detector_size : str)
 	anchor_total = 2
 	face_detector_score = state_manager.get_item('face_detector_score')
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
-	temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
+	temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
 	ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
 	ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
+	detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
 	detection = forward_with_retinaface(detect_vision_frame)
 
 	for index, feature_stride in enumerate(feature_strides):
@@ -194,10 +195,11 @@ def detect_with_scrfd(vision_frame : VisionFrame, face_detector_size : str) -> T
 	anchor_total = 2
 	face_detector_score = state_manager.get_item('face_detector_score')
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
-	temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
+	temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
 	ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
 	ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
+	detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
 	detection = forward_with_scrfd(detect_vision_frame)
 
 	for index, feature_stride in enumerate(feature_strides):
@@ -234,10 +236,11 @@ def detect_with_yolo_face(vision_frame : VisionFrame, face_detector_size : str)
 	face_landmarks_5 = []
 	face_detector_score = state_manager.get_item('face_detector_score')
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
-	temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
+	temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
 	ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
 	ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
 	detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
+	detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ 0, 1 ])
 	detection = forward_with_yolo_face(detect_vision_frame)
 	detection = numpy.squeeze(detection).T
 	bounding_boxes_raw, face_scores_raw, face_landmarks_5_raw = numpy.split(detection, [ 4, 5 ], axis = 1)
@@ -305,6 +308,13 @@ def prepare_detect_frame(temp_vision_frame : VisionFrame, face_detector_size : s
 	face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
 	detect_vision_frame = numpy.zeros((face_detector_height, face_detector_width, 3))
 	detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
-	detect_vision_frame = (detect_vision_frame - 127.5) / 128.0
 	detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
 	return detect_vision_frame
+
+
+def normalize_detect_frame(detect_vision_frame : VisionFrame, normalize_range : Sequence[int]) -> VisionFrame:
+	if normalize_range == [ -1, 1 ]:
+		return (detect_vision_frame - 127.5) / 128.0
+	if normalize_range == [ 0, 1 ]:
+		return detect_vision_frame / 255.0
+	return detect_vision_frame
diff --git a/facefusion/uis/components/preview.py b/facefusion/uis/components/preview.py
index e3fb85b..8b8cfc7 100755
--- a/facefusion/uis/components/preview.py
+++ b/facefusion/uis/components/preview.py
@@ -18,7 +18,7 @@ from facefusion.processors.core import get_processors_modules
 from facefusion.typing import AudioFrame, Face, FaceSet, VisionFrame
 from facefusion.uis.core import get_ui_component, get_ui_components, register_ui_component
 from facefusion.uis.typing import ComponentOptions
-from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, resize_frame_resolution
+from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, restrict_frame
 
 PREVIEW_IMAGE : Optional[gradio.Image] = None
 PREVIEW_FRAME_SLIDER : Optional[gradio.Slider] = None
@@ -185,7 +185,7 @@ def clear_and_update_preview_image(frame_number : int = 0) -> gradio.Image:
 def slide_preview_image(frame_number : int = 0) -> gradio.Image:
 	if is_video(state_manager.get_item('target_path')):
 		preview_vision_frame = normalize_frame_color(read_video_frame(state_manager.get_item('target_path'), frame_number))
-		preview_vision_frame = resize_frame_resolution(preview_vision_frame, (1024, 1024))
+		preview_vision_frame = restrict_frame(preview_vision_frame, (1024, 1024))
 		return gradio.Image(value = preview_vision_frame)
 	return gradio.Image(value = None)
 
@@ -237,7 +237,7 @@ def update_preview_frame_slider() -> gradio.Slider:
 
 
 def process_preview_frame(reference_faces : FaceSet, source_face : Face, source_audio_frame : AudioFrame, target_vision_frame : VisionFrame) -> VisionFrame:
-	target_vision_frame = resize_frame_resolution(target_vision_frame, (1024, 1024))
+	target_vision_frame = restrict_frame(target_vision_frame, (1024, 1024))
 	source_vision_frame = target_vision_frame.copy()
 	if analyse_frame(target_vision_frame):
 		return cv2.GaussianBlur(target_vision_frame, (99, 99), 0)
diff --git a/facefusion/vision.py b/facefusion/vision.py
index 02b7aed..f5a0bf7 100644
--- a/facefusion/vision.py
+++ b/facefusion/vision.py
@@ -217,18 +217,31 @@ def detect_frame_orientation(vision_frame : VisionFrame) -> Orientation:
 	return 'portrait'
 
 
-def resize_frame_resolution(vision_frame : VisionFrame, max_resolution : Resolution) -> VisionFrame:
+def restrict_frame(vision_frame : VisionFrame, resolution : Resolution) -> VisionFrame:
 	height, width = vision_frame.shape[:2]
-	max_width, max_height = max_resolution
+	restrict_width, restrict_height = resolution
 
-	if height > max_height or width > max_width:
-		scale = min(max_height / height, max_width / width)
+	if height > restrict_height or width > restrict_width:
+		scale = min(restrict_height / height, restrict_width / width)
 		new_width = int(width * scale)
 		new_height = int(height * scale)
 		return cv2.resize(vision_frame, (new_width, new_height))
 	return vision_frame
 
 
+def fit_frame(vision_frame: VisionFrame, resolution: Resolution) -> VisionFrame:
+	fit_width, fit_height = resolution
+	height, width = vision_frame.shape[:2]
+	scale = min(fit_height / height, fit_width / width)
+	new_width = int(width * scale)
+	new_height = int(height * scale)
+	paste_vision_frame = cv2.resize(vision_frame, (new_width, new_height))
+	x_pad = (fit_width - new_width) // 2
+	y_pad = (fit_height - new_height) // 2
+	temp_vision_frame = numpy.pad(paste_vision_frame, ((y_pad, fit_height - new_height - y_pad), (x_pad, fit_width - new_width - x_pad), (0, 0)))
+	return temp_vision_frame
+
+
 def normalize_frame_color(vision_frame : VisionFrame) -> VisionFrame:
 	return cv2.cvtColor(vision_frame, cv2.COLOR_BGR2RGB)
 
@@ -283,10 +296,12 @@ def create_tile_frames(vision_frame : VisionFrame, size : Size) -> Tuple[List[Vi
 	for row_vision_frame in row_range:
 		top = row_vision_frame - size[2]
 		bottom = row_vision_frame + size[2] + tile_width
+
 		for column_vision_frame in col_range:
 			left = column_vision_frame - size[2]
 			right = column_vision_frame + size[2] + tile_width
 			tile_vision_frames.append(pad_vision_frame[top:bottom, left:right, :])
+
 	return tile_vision_frames, pad_width, pad_height
 
 
@@ -304,5 +319,6 @@ def merge_tile_frames(tile_vision_frames : List[VisionFrame], temp_width : int,
 		left = col_index * tile_vision_frame.shape[1]
 		right = left + tile_vision_frame.shape[1]
 		merge_vision_frame[top:bottom, left:right, :] = tile_vision_frame
+
 	merge_vision_frame = merge_vision_frame[size[1] : size[1] + temp_height, size[1]: size[1] + temp_width, :]
 	return merge_vision_frame