Feat/improve content analyser (#861)

* Introduce fit_frame to improve content analyser, rename resize_frame_resolution to restrict_frame

* Fix CI, Add some spaces

* Normalize according to face detector
This commit is contained in:
Henry Ruhs
2025-01-29 12:50:29 +01:00
committed by henryruhs
parent e79a99fac4
commit c70b45bd39
4 changed files with 42 additions and 19 deletions

View File

@@ -9,7 +9,7 @@ from facefusion.download import conditional_download_hashes, conditional_downloa
from facefusion.filesystem import resolve_relative_path from facefusion.filesystem import resolve_relative_path
from facefusion.thread_helper import conditional_thread_semaphore from facefusion.thread_helper import conditional_thread_semaphore
from facefusion.typing import Detection, DownloadScope, Fps, InferencePool, ModelOptions, ModelSet, Score, VisionFrame from facefusion.typing import Detection, DownloadScope, Fps, InferencePool, ModelOptions, ModelSet, Score, VisionFrame
from facefusion.vision import detect_video_fps, read_image, read_video_frame, resize_frame_resolution from facefusion.vision import detect_video_fps, fit_frame, read_image, read_video_frame
STREAM_COUNTER = 0 STREAM_COUNTER = 0
@@ -106,7 +106,7 @@ def analyse_video(video_path : str, trim_frame_start : int, trim_frame_end : int
def detect_nsfw(vision_frame : VisionFrame) -> List[Score]: def detect_nsfw(vision_frame : VisionFrame) -> List[Score]:
nsfw_scores = [] nsfw_scores = []
model_size = get_model_options().get('size') model_size = get_model_options().get('size')
temp_vision_frame = resize_frame_resolution(vision_frame, model_size) temp_vision_frame = fit_frame(vision_frame, model_size)
detect_vision_frame = prepare_detect_frame(temp_vision_frame) detect_vision_frame = prepare_detect_frame(temp_vision_frame)
detection = forward(detect_vision_frame) detection = forward(detect_vision_frame)
detection = numpy.squeeze(detection).T detection = numpy.squeeze(detection).T
@@ -133,9 +133,6 @@ def forward(vision_frame : VisionFrame) -> Detection:
def prepare_detect_frame(temp_vision_frame : VisionFrame) -> VisionFrame:
	# Reconstructed post-change side of the diff: the frame is already letterboxed
	# to the model size by fit_frame() upstream, so no zero-canvas paste is needed here.
	# Scale pixel values from [0, 255] to [0, 1] as the NSFW detector expects.
	detect_vision_frame = temp_vision_frame / 255.0
	# HWC -> NCHW, add batch dimension and cast to float32 for ONNX inference.
	detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
	return detect_vision_frame

View File

@@ -1,5 +1,5 @@
from functools import lru_cache from functools import lru_cache
from typing import List, Tuple from typing import List, Sequence, Tuple
import cv2 import cv2
import numpy import numpy
@@ -10,7 +10,7 @@ from facefusion.face_helper import create_rotated_matrix_and_size, create_static
from facefusion.filesystem import resolve_relative_path from facefusion.filesystem import resolve_relative_path
from facefusion.thread_helper import thread_semaphore from facefusion.thread_helper import thread_semaphore
from facefusion.typing import Angle, BoundingBox, Detection, DownloadScope, DownloadSet, FaceLandmark5, InferencePool, ModelSet, Score, VisionFrame from facefusion.typing import Angle, BoundingBox, Detection, DownloadScope, DownloadSet, FaceLandmark5, InferencePool, ModelSet, Score, VisionFrame
from facefusion.vision import resize_frame_resolution, unpack_resolution from facefusion.vision import restrict_frame, unpack_resolution
@lru_cache(maxsize = None) @lru_cache(maxsize = None)
@@ -151,10 +151,11 @@ def detect_with_retinaface(vision_frame : VisionFrame, face_detector_size : str)
anchor_total = 2 anchor_total = 2
face_detector_score = state_manager.get_item('face_detector_score') face_detector_score = state_manager.get_item('face_detector_score')
face_detector_width, face_detector_height = unpack_resolution(face_detector_size) face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height)) temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0] ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1] ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size) detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
detection = forward_with_retinaface(detect_vision_frame) detection = forward_with_retinaface(detect_vision_frame)
for index, feature_stride in enumerate(feature_strides): for index, feature_stride in enumerate(feature_strides):
@@ -194,10 +195,11 @@ def detect_with_scrfd(vision_frame : VisionFrame, face_detector_size : str) -> T
anchor_total = 2 anchor_total = 2
face_detector_score = state_manager.get_item('face_detector_score') face_detector_score = state_manager.get_item('face_detector_score')
face_detector_width, face_detector_height = unpack_resolution(face_detector_size) face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height)) temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0] ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1] ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size) detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
detection = forward_with_scrfd(detect_vision_frame) detection = forward_with_scrfd(detect_vision_frame)
for index, feature_stride in enumerate(feature_strides): for index, feature_stride in enumerate(feature_strides):
@@ -234,10 +236,11 @@ def detect_with_yolo_face(vision_frame : VisionFrame, face_detector_size : str)
face_landmarks_5 = [] face_landmarks_5 = []
face_detector_score = state_manager.get_item('face_detector_score') face_detector_score = state_manager.get_item('face_detector_score')
face_detector_width, face_detector_height = unpack_resolution(face_detector_size) face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height)) temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0] ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1] ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size) detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ 0, 1 ])
detection = forward_with_yolo_face(detect_vision_frame) detection = forward_with_yolo_face(detect_vision_frame)
detection = numpy.squeeze(detection).T detection = numpy.squeeze(detection).T
bounding_boxes_raw, face_scores_raw, face_landmarks_5_raw = numpy.split(detection, [ 4, 5 ], axis = 1) bounding_boxes_raw, face_scores_raw, face_landmarks_5_raw = numpy.split(detection, [ 4, 5 ], axis = 1)
@@ -305,6 +308,13 @@ def prepare_detect_frame(temp_vision_frame : VisionFrame, face_detector_size : s
face_detector_width, face_detector_height = unpack_resolution(face_detector_size) face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
detect_vision_frame = numpy.zeros((face_detector_height, face_detector_width, 3)) detect_vision_frame = numpy.zeros((face_detector_height, face_detector_width, 3))
detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
detect_vision_frame = (detect_vision_frame - 127.5) / 128.0
detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32) detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
return detect_vision_frame return detect_vision_frame
def normalize_detect_frame(detect_vision_frame : VisionFrame, normalize_range : Sequence[int]) -> VisionFrame:
	# Normalize pixel values to the range the face detector was trained on.
	# Compare via tuple() so any Sequence[int] (list or tuple) matches the
	# declared parameter type; a bare list-literal equality would silently
	# skip normalization for tuple callers.
	if tuple(normalize_range) == (-1, 1):
		# retinaface / scrfd convention: roughly [-1, 1]
		return (detect_vision_frame - 127.5) / 128.0
	if tuple(normalize_range) == (0, 1):
		# yolo_face convention: [0, 1]
		return detect_vision_frame / 255.0
	# Unknown range: pass the frame through unchanged (best-effort, matches original).
	return detect_vision_frame

View File

@@ -18,7 +18,7 @@ from facefusion.processors.core import get_processors_modules
from facefusion.typing import AudioFrame, Face, FaceSet, VisionFrame from facefusion.typing import AudioFrame, Face, FaceSet, VisionFrame
from facefusion.uis.core import get_ui_component, get_ui_components, register_ui_component from facefusion.uis.core import get_ui_component, get_ui_components, register_ui_component
from facefusion.uis.typing import ComponentOptions from facefusion.uis.typing import ComponentOptions
from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, resize_frame_resolution from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, restrict_frame
PREVIEW_IMAGE : Optional[gradio.Image] = None PREVIEW_IMAGE : Optional[gradio.Image] = None
PREVIEW_FRAME_SLIDER : Optional[gradio.Slider] = None PREVIEW_FRAME_SLIDER : Optional[gradio.Slider] = None
@@ -185,7 +185,7 @@ def clear_and_update_preview_image(frame_number : int = 0) -> gradio.Image:
def slide_preview_image(frame_number : int = 0) -> gradio.Image: def slide_preview_image(frame_number : int = 0) -> gradio.Image:
if is_video(state_manager.get_item('target_path')): if is_video(state_manager.get_item('target_path')):
preview_vision_frame = normalize_frame_color(read_video_frame(state_manager.get_item('target_path'), frame_number)) preview_vision_frame = normalize_frame_color(read_video_frame(state_manager.get_item('target_path'), frame_number))
preview_vision_frame = resize_frame_resolution(preview_vision_frame, (1024, 1024)) preview_vision_frame = restrict_frame(preview_vision_frame, (1024, 1024))
return gradio.Image(value = preview_vision_frame) return gradio.Image(value = preview_vision_frame)
return gradio.Image(value = None) return gradio.Image(value = None)
@@ -237,7 +237,7 @@ def update_preview_frame_slider() -> gradio.Slider:
def process_preview_frame(reference_faces : FaceSet, source_face : Face, source_audio_frame : AudioFrame, target_vision_frame : VisionFrame) -> VisionFrame: def process_preview_frame(reference_faces : FaceSet, source_face : Face, source_audio_frame : AudioFrame, target_vision_frame : VisionFrame) -> VisionFrame:
target_vision_frame = resize_frame_resolution(target_vision_frame, (1024, 1024)) target_vision_frame = restrict_frame(target_vision_frame, (1024, 1024))
source_vision_frame = target_vision_frame.copy() source_vision_frame = target_vision_frame.copy()
if analyse_frame(target_vision_frame): if analyse_frame(target_vision_frame):
return cv2.GaussianBlur(target_vision_frame, (99, 99), 0) return cv2.GaussianBlur(target_vision_frame, (99, 99), 0)

View File

@@ -217,18 +217,31 @@ def detect_frame_orientation(vision_frame : VisionFrame) -> Orientation:
return 'portrait' return 'portrait'
def resize_frame_resolution(vision_frame : VisionFrame, max_resolution : Resolution) -> VisionFrame: def restrict_frame(vision_frame : VisionFrame, resolution : Resolution) -> VisionFrame:
height, width = vision_frame.shape[:2] height, width = vision_frame.shape[:2]
max_width, max_height = max_resolution restrict_width, restrict_height = resolution
if height > max_height or width > max_width: if height > restrict_height or width > restrict_width:
scale = min(max_height / height, max_width / width) scale = min(restrict_height / height, restrict_width / width)
new_width = int(width * scale) new_width = int(width * scale)
new_height = int(height * scale) new_height = int(height * scale)
return cv2.resize(vision_frame, (new_width, new_height)) return cv2.resize(vision_frame, (new_width, new_height))
return vision_frame return vision_frame
def fit_frame(vision_frame : VisionFrame, resolution : Resolution) -> VisionFrame:
	# Letterbox the frame into the target resolution: scale to fit (aspect ratio
	# preserved), then zero-pad symmetrically so the content is centered.
	fit_width, fit_height = resolution
	height, width = vision_frame.shape[:2]
	scale = min(fit_height / height, fit_width / width)
	scaled_width = int(width * scale)
	scaled_height = int(height * scale)
	scaled_vision_frame = cv2.resize(vision_frame, (scaled_width, scaled_height))
	pad_left = (fit_width - scaled_width) // 2
	pad_top = (fit_height - scaled_height) // 2
	pad_right = fit_width - scaled_width - pad_left
	pad_bottom = fit_height - scaled_height - pad_top
	# numpy.pad defaults to constant zeros, producing black letterbox borders.
	return numpy.pad(scaled_vision_frame, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)))
def normalize_frame_color(vision_frame : VisionFrame) -> VisionFrame: def normalize_frame_color(vision_frame : VisionFrame) -> VisionFrame:
return cv2.cvtColor(vision_frame, cv2.COLOR_BGR2RGB) return cv2.cvtColor(vision_frame, cv2.COLOR_BGR2RGB)
@@ -283,10 +296,12 @@ def create_tile_frames(vision_frame : VisionFrame, size : Size) -> Tuple[List[Vi
for row_vision_frame in row_range: for row_vision_frame in row_range:
top = row_vision_frame - size[2] top = row_vision_frame - size[2]
bottom = row_vision_frame + size[2] + tile_width bottom = row_vision_frame + size[2] + tile_width
for column_vision_frame in col_range: for column_vision_frame in col_range:
left = column_vision_frame - size[2] left = column_vision_frame - size[2]
right = column_vision_frame + size[2] + tile_width right = column_vision_frame + size[2] + tile_width
tile_vision_frames.append(pad_vision_frame[top:bottom, left:right, :]) tile_vision_frames.append(pad_vision_frame[top:bottom, left:right, :])
return tile_vision_frames, pad_width, pad_height return tile_vision_frames, pad_width, pad_height
@@ -304,5 +319,6 @@ def merge_tile_frames(tile_vision_frames : List[VisionFrame], temp_width : int,
left = col_index * tile_vision_frame.shape[1] left = col_index * tile_vision_frame.shape[1]
right = left + tile_vision_frame.shape[1] right = left + tile_vision_frame.shape[1]
merge_vision_frame[top:bottom, left:right, :] = tile_vision_frame merge_vision_frame[top:bottom, left:right, :] = tile_vision_frame
merge_vision_frame = merge_vision_frame[size[1] : size[1] + temp_height, size[1]: size[1] + temp_width, :] merge_vision_frame = merge_vision_frame[size[1] : size[1] + temp_height, size[1]: size[1] + temp_width, :]
return merge_vision_frame return merge_vision_frame