Feat/improve content analyser (#861)

* Introduce fit_frame to improve content analyser, rename resize_frame_resolution to restrict_frame

* Fix CI, Add some spaces

* Normalize according to face detector
This commit is contained in:
Henry Ruhs
2025-01-29 12:50:29 +01:00
committed by henryruhs
parent e79a99fac4
commit c70b45bd39
4 changed files with 42 additions and 19 deletions

View File

@@ -9,7 +9,7 @@ from facefusion.download import conditional_download_hashes, conditional_downloa
from facefusion.filesystem import resolve_relative_path
from facefusion.thread_helper import conditional_thread_semaphore
from facefusion.typing import Detection, DownloadScope, Fps, InferencePool, ModelOptions, ModelSet, Score, VisionFrame
from facefusion.vision import detect_video_fps, read_image, read_video_frame, resize_frame_resolution
from facefusion.vision import detect_video_fps, fit_frame, read_image, read_video_frame
STREAM_COUNTER = 0
@@ -106,7 +106,7 @@ def analyse_video(video_path : str, trim_frame_start : int, trim_frame_end : int
def detect_nsfw(vision_frame : VisionFrame) -> List[Score]:
nsfw_scores = []
model_size = get_model_options().get('size')
temp_vision_frame = resize_frame_resolution(vision_frame, model_size)
temp_vision_frame = fit_frame(vision_frame, model_size)
detect_vision_frame = prepare_detect_frame(temp_vision_frame)
detection = forward(detect_vision_frame)
detection = numpy.squeeze(detection).T
@@ -133,9 +133,6 @@ def forward(vision_frame : VisionFrame) -> Detection:
def prepare_detect_frame(temp_vision_frame : VisionFrame) -> VisionFrame:
model_size = get_model_options().get('size')
detect_vision_frame = numpy.zeros((model_size[0], model_size[1], 3))
detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
detect_vision_frame = detect_vision_frame / 255.0
detect_vision_frame = temp_vision_frame / 255.0
detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
return detect_vision_frame

View File

@@ -1,5 +1,5 @@
from functools import lru_cache
from typing import List, Tuple
from typing import List, Sequence, Tuple
import cv2
import numpy
@@ -10,7 +10,7 @@ from facefusion.face_helper import create_rotated_matrix_and_size, create_static
from facefusion.filesystem import resolve_relative_path
from facefusion.thread_helper import thread_semaphore
from facefusion.typing import Angle, BoundingBox, Detection, DownloadScope, DownloadSet, FaceLandmark5, InferencePool, ModelSet, Score, VisionFrame
from facefusion.vision import resize_frame_resolution, unpack_resolution
from facefusion.vision import restrict_frame, unpack_resolution
@lru_cache(maxsize = None)
@@ -151,10 +151,11 @@ def detect_with_retinaface(vision_frame : VisionFrame, face_detector_size : str)
anchor_total = 2
face_detector_score = state_manager.get_item('face_detector_score')
face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
detection = forward_with_retinaface(detect_vision_frame)
for index, feature_stride in enumerate(feature_strides):
@@ -194,10 +195,11 @@ def detect_with_scrfd(vision_frame : VisionFrame, face_detector_size : str) -> T
anchor_total = 2
face_detector_score = state_manager.get_item('face_detector_score')
face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ -1, 1 ])
detection = forward_with_scrfd(detect_vision_frame)
for index, feature_stride in enumerate(feature_strides):
@@ -234,10 +236,11 @@ def detect_with_yolo_face(vision_frame : VisionFrame, face_detector_size : str)
face_landmarks_5 = []
face_detector_score = state_manager.get_item('face_detector_score')
face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
temp_vision_frame = resize_frame_resolution(vision_frame, (face_detector_width, face_detector_height))
temp_vision_frame = restrict_frame(vision_frame, (face_detector_width, face_detector_height))
ratio_height = vision_frame.shape[0] / temp_vision_frame.shape[0]
ratio_width = vision_frame.shape[1] / temp_vision_frame.shape[1]
detect_vision_frame = prepare_detect_frame(temp_vision_frame, face_detector_size)
detect_vision_frame = normalize_detect_frame(detect_vision_frame, [ 0, 1 ])
detection = forward_with_yolo_face(detect_vision_frame)
detection = numpy.squeeze(detection).T
bounding_boxes_raw, face_scores_raw, face_landmarks_5_raw = numpy.split(detection, [ 4, 5 ], axis = 1)
@@ -305,6 +308,13 @@ def prepare_detect_frame(temp_vision_frame : VisionFrame, face_detector_size : s
face_detector_width, face_detector_height = unpack_resolution(face_detector_size)
detect_vision_frame = numpy.zeros((face_detector_height, face_detector_width, 3))
detect_vision_frame[:temp_vision_frame.shape[0], :temp_vision_frame.shape[1], :] = temp_vision_frame
detect_vision_frame = (detect_vision_frame - 127.5) / 128.0
detect_vision_frame = numpy.expand_dims(detect_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
return detect_vision_frame
def normalize_detect_frame(detect_vision_frame : VisionFrame, normalize_range : Sequence[int]) -> VisionFrame:
	# Map 8-bit pixel values (0..255) into the value range the face detector model expects.
	# Coerce to list first: the annotation promises any Sequence, but a plain `==`
	# against a list literal would silently skip normalization for tuple inputs.
	normalize_range = list(normalize_range)
	if normalize_range == [ -1, 1 ]:
		# symmetric range: center on 127.5, scale to roughly [-1, 1]
		return (detect_vision_frame - 127.5) / 128.0
	if normalize_range == [ 0, 1 ]:
		# unit range: plain division by the 8-bit maximum
		return detect_vision_frame / 255.0
	# unrecognized range: return the frame unchanged (best-effort pass-through)
	return detect_vision_frame

View File

@@ -18,7 +18,7 @@ from facefusion.processors.core import get_processors_modules
from facefusion.typing import AudioFrame, Face, FaceSet, VisionFrame
from facefusion.uis.core import get_ui_component, get_ui_components, register_ui_component
from facefusion.uis.typing import ComponentOptions
from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, resize_frame_resolution
from facefusion.vision import count_video_frame_total, detect_frame_orientation, normalize_frame_color, read_static_image, read_static_images, read_video_frame, restrict_frame
PREVIEW_IMAGE : Optional[gradio.Image] = None
PREVIEW_FRAME_SLIDER : Optional[gradio.Slider] = None
@@ -185,7 +185,7 @@ def clear_and_update_preview_image(frame_number : int = 0) -> gradio.Image:
def slide_preview_image(frame_number : int = 0) -> gradio.Image:
if is_video(state_manager.get_item('target_path')):
preview_vision_frame = normalize_frame_color(read_video_frame(state_manager.get_item('target_path'), frame_number))
preview_vision_frame = resize_frame_resolution(preview_vision_frame, (1024, 1024))
preview_vision_frame = restrict_frame(preview_vision_frame, (1024, 1024))
return gradio.Image(value = preview_vision_frame)
return gradio.Image(value = None)
@@ -237,7 +237,7 @@ def update_preview_frame_slider() -> gradio.Slider:
def process_preview_frame(reference_faces : FaceSet, source_face : Face, source_audio_frame : AudioFrame, target_vision_frame : VisionFrame) -> VisionFrame:
target_vision_frame = resize_frame_resolution(target_vision_frame, (1024, 1024))
target_vision_frame = restrict_frame(target_vision_frame, (1024, 1024))
source_vision_frame = target_vision_frame.copy()
if analyse_frame(target_vision_frame):
return cv2.GaussianBlur(target_vision_frame, (99, 99), 0)

View File

@@ -217,18 +217,31 @@ def detect_frame_orientation(vision_frame : VisionFrame) -> Orientation:
return 'portrait'
def resize_frame_resolution(vision_frame : VisionFrame, max_resolution : Resolution) -> VisionFrame:
def restrict_frame(vision_frame : VisionFrame, resolution : Resolution) -> VisionFrame:
height, width = vision_frame.shape[:2]
max_width, max_height = max_resolution
restrict_width, restrict_height = resolution
if height > max_height or width > max_width:
scale = min(max_height / height, max_width / width)
if height > restrict_height or width > restrict_width:
scale = min(restrict_height / height, restrict_width / width)
new_width = int(width * scale)
new_height = int(height * scale)
return cv2.resize(vision_frame, (new_width, new_height))
return vision_frame
def fit_frame(vision_frame: VisionFrame, resolution: Resolution) -> VisionFrame:
	# Letterbox the frame into the target resolution: scale to fit while keeping
	# the aspect ratio, then zero-pad the remainder, centering the image.
	target_width, target_height = resolution
	frame_height, frame_width = vision_frame.shape[:2]
	ratio = min(target_height / frame_height, target_width / frame_width)
	scaled_width = int(frame_width * ratio)
	scaled_height = int(frame_height * ratio)
	scaled_vision_frame = cv2.resize(vision_frame, (scaled_width, scaled_height))
	pad_left = (target_width - scaled_width) // 2
	pad_top = (target_height - scaled_height) // 2
	pad_right = target_width - scaled_width - pad_left
	pad_bottom = target_height - scaled_height - pad_top
	# pad rows and columns only; leave the channel axis untouched
	return numpy.pad(scaled_vision_frame, ((pad_top, pad_bottom), (pad_left, pad_right), (0, 0)))
def normalize_frame_color(vision_frame : VisionFrame) -> VisionFrame:
	# Convert from OpenCV's native BGR channel order to RGB.
	rgb_vision_frame = cv2.cvtColor(vision_frame, cv2.COLOR_BGR2RGB)
	return rgb_vision_frame
@@ -283,10 +296,12 @@ def create_tile_frames(vision_frame : VisionFrame, size : Size) -> Tuple[List[Vi
for row_vision_frame in row_range:
top = row_vision_frame - size[2]
bottom = row_vision_frame + size[2] + tile_width
for column_vision_frame in col_range:
left = column_vision_frame - size[2]
right = column_vision_frame + size[2] + tile_width
tile_vision_frames.append(pad_vision_frame[top:bottom, left:right, :])
return tile_vision_frames, pad_width, pad_height
@@ -304,5 +319,6 @@ def merge_tile_frames(tile_vision_frames : List[VisionFrame], temp_width : int,
left = col_index * tile_vision_frame.shape[1]
right = left + tile_vision_frame.shape[1]
merge_vision_frame[top:bottom, left:right, :] = tile_vision_frame
merge_vision_frame = merge_vision_frame[size[1] : size[1] + temp_height, size[1]: size[1] + temp_width, :]
return merge_vision_frame