From 8aec81d63b0c1c7718480261e4b03211b852fa7e Mon Sep 17 00:00:00 2001
From: harisreedhar
Date: Thu, 12 Jun 2025 17:07:51 +0530
Subject: [PATCH 1/4] re-arrange to methods

---
 facefusion/processors/modules/lip_syncer.py | 46 ++++++++++++++-------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/facefusion/processors/modules/lip_syncer.py b/facefusion/processors/modules/lip_syncer.py
index 8fc1023..8e7a3b4 100755
--- a/facefusion/processors/modules/lip_syncer.py
+++ b/facefusion/processors/modules/lip_syncer.py
@@ -23,7 +23,7 @@ from facefusion.processors import choices as processors_choices
 from facefusion.processors.types import LipSyncerInputs
 from facefusion.program_helper import find_argument_group
 from facefusion.thread_helper import conditional_thread_semaphore
-from facefusion.types import ApplyStateItem, Args, AudioFrame, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
+from facefusion.types import ApplyStateItem, Args, AudioFrame, BoundingBox, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
 from facefusion.vision import read_image, read_static_image, restrict_video_fps, write_image
 
 
@@ -144,12 +144,9 @@ def post_process() -> None:
 
 
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
-	model_size = get_model_options().get('size')
 	temp_audio_frame = prepare_audio_frame(temp_audio_frame)
 	crop_vision_frame, affine_matrix = warp_face_by_face_landmark_5(temp_vision_frame, target_face.landmark_set.get('5/68'), 'ffhq_512', (512, 512))
 	face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
-	bounding_box = create_bounding_box(face_landmark_68)
-	bounding_box[1] -= numpy.abs(bounding_box[3] - bounding_box[1]) * 0.125
 	area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
 	crop_masks =\
 	[
@@ -160,16 +157,24 @@ def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_fram
 		occlusion_mask = create_occlusion_mask(crop_vision_frame)
 		crop_masks.append(occlusion_mask)
 
-	close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
-	close_vision_frame = prepare_crop_frame(close_vision_frame)
-	close_vision_frame = forward(temp_audio_frame, close_vision_frame)
-	close_vision_frame = normalize_close_frame(close_vision_frame)
-	crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
+	bounding_box = create_bounding_box(face_landmark_68)
+	bounding_box = prepare_bounding_box(bounding_box)
+	crop_vision_frame = process_wav2lip(crop_vision_frame, temp_audio_frame, bounding_box)
 	crop_mask = numpy.minimum.reduce(crop_masks)
 	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
 	return paste_vision_frame
 
 
+def process_wav2lip(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame, bounding_box : BoundingBox) -> VisionFrame:
+	model_size = get_model_options().get('size')
+	close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
+	close_vision_frame = prepare_close_frame(close_vision_frame)
+	close_vision_frame = forward(temp_audio_frame, close_vision_frame)
+	close_vision_frame = normalize_close_frame(close_vision_frame)
+	crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
+	return crop_vision_frame
+
+
 def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
@@ -192,13 +197,22 @@ def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 	return temp_audio_frame
 
 
-def prepare_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
-	crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
-	prepare_vision_frame = crop_vision_frame.copy()
-	prepare_vision_frame[:, 48:] = 0
-	crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
-	crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
-	return crop_vision_frame
+def prepare_close_frame(close_vision_frame : VisionFrame) -> VisionFrame:
+	model_size = get_model_options().get('size')
+	close_vision_frame = numpy.expand_dims(close_vision_frame, axis=0)
+	prepare_vision_frame = close_vision_frame.copy()
+	prepare_vision_frame[:, model_size[0] // 2:] = 0
+	close_vision_frame = numpy.concatenate((prepare_vision_frame, close_vision_frame), axis=3)
+	close_vision_frame = close_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
+	return close_vision_frame
+
+
+def prepare_bounding_box(bounding_box : BoundingBox) -> BoundingBox:
+	bounding_box[3] = min(bounding_box[3] + 8, 511)
+	x1, y1, x2, y2 = bounding_box
+	y1 = y2 - (4 / 3) * (x2 - x1)
+	bounding_box[1] = max(y1, 0)
+	return bounding_box
 
 
 def normalize_close_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
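
For review context, not part of the patch: prepare_bounding_box() turns the landmark box into a mouth-centred box whose height is 4/3 of its width, anchored at a chin line nudged 8 px down and clamped to the 512x512 aligned crop. A standalone sketch with made-up coordinates, assuming the BoundingBox layout [ x1, y1, x2, y2 ]:

	import numpy

	def sketch_prepare_bounding_box(bounding_box):
		bounding_box = bounding_box.copy()
		bounding_box[3] = min(bounding_box[3] + 8, 511)	# nudge the chin edge down, clamped inside the crop
		x1, y1, x2, y2 = bounding_box
		y1 = y2 - (4 / 3) * (x2 - x1)	# grow upward until the height is 4/3 of the width
		bounding_box[1] = max(y1, 0)
		return bounding_box

	print(sketch_prepare_bounding_box(numpy.array([ 96.0, 128.0, 416.0, 448.0 ])))
	# [ 96. 29.33 416. 456.] -- the 426.67 height is 4/3 of the 320 width
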
From fb05361dc3be7bc4eaf30047ec6bec240260a363 Mon Sep 17 00:00:00 2001
From: harisreedhar
Date: Thu, 12 Jun 2025 17:18:37 +0530
Subject: [PATCH 2/4] space

---
 facefusion/processors/modules/lip_syncer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/facefusion/processors/modules/lip_syncer.py b/facefusion/processors/modules/lip_syncer.py
index 8e7a3b4..80c77d3 100755
--- a/facefusion/processors/modules/lip_syncer.py
+++ b/facefusion/processors/modules/lip_syncer.py
@@ -199,10 +199,10 @@ def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 
 def prepare_close_frame(close_vision_frame : VisionFrame) -> VisionFrame:
 	model_size = get_model_options().get('size')
-	close_vision_frame = numpy.expand_dims(close_vision_frame, axis=0)
+	close_vision_frame = numpy.expand_dims(close_vision_frame, axis = 0)
 	prepare_vision_frame = close_vision_frame.copy()
 	prepare_vision_frame[:, model_size[0] // 2:] = 0
-	close_vision_frame = numpy.concatenate((prepare_vision_frame, close_vision_frame), axis=3)
+	close_vision_frame = numpy.concatenate((prepare_vision_frame, close_vision_frame), axis = 3)
 	close_vision_frame = close_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
 	return close_vision_frame
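
Not part of the patch: a standalone sketch of the input layout prepare_close_frame() builds for the 96x96 wav2lip models, with a zero image standing in for the warped close-up:

	import numpy

	close_vision_frame = numpy.zeros((96, 96, 3), dtype = numpy.uint8)	# stand-in for the 96x96 BGR close-up
	close_vision_frame = numpy.expand_dims(close_vision_frame, axis = 0)	# (1, 96, 96, 3)
	prepare_vision_frame = close_vision_frame.copy()
	prepare_vision_frame[:, 96 // 2:] = 0	# zero the lower half, which wav2lip repaints from audio
	close_vision_frame = numpy.concatenate((prepare_vision_frame, close_vision_frame), axis = 3)
	close_vision_frame = close_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
	print(close_vision_frame.shape)	# (1, 6, 96, 96): masked frame stacked with the full reference frame
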
From 36cad4d1b7a8c2a484e7261e3b0ee2cff7b6783b Mon Sep 17 00:00:00 2001
From: harisreedhar
Date: Thu, 12 Jun 2025 19:08:26 +0530
Subject: [PATCH 3/4] add edtalk

---
 facefusion/processors/choices.py            |   2 +-
 facefusion/processors/modules/lip_syncer.py | 104 ++++++++++++++++----
 facefusion/processors/types.py              |   2 +-
 3 files changed, 87 insertions(+), 21 deletions(-)

diff --git a/facefusion/processors/choices.py b/facefusion/processors/choices.py
index 4bc4b92..342c2f4 100755
--- a/facefusion/processors/choices.py
+++ b/facefusion/processors/choices.py
@@ -195,7 +195,7 @@ face_swapper_models : List[FaceSwapperModel] = list(face_swapper_set.keys())
 frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_colorizer_sizes : List[str] = [ '192x192', '256x256', '384x384', '512x512' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4' ]
-lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_96', 'wav2lip_gan_96' ]
+lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_96', 'wav2lip_gan_96', 'edtalk_256' ]
 
 age_modifier_direction_range : Sequence[int] = create_int_range(-100, 100, 1)
 deep_swapper_morph_range : Sequence[int] = create_int_range(0, 100, 1)
diff --git a/facefusion/processors/modules/lip_syncer.py b/facefusion/processors/modules/lip_syncer.py
index 80c77d3..9932538 100755
--- a/facefusion/processors/modules/lip_syncer.py
+++ b/facefusion/processors/modules/lip_syncer.py
@@ -15,7 +15,7 @@ from facefusion.common_helper import get_first
 from facefusion.download import conditional_download_hashes, conditional_download_sources, resolve_download_url
 from facefusion.face_analyser import get_many_faces, get_one_face
 from facefusion.face_helper import create_bounding_box, paste_back, warp_face_by_bounding_box, warp_face_by_face_landmark_5
-from facefusion.face_masker import create_area_mask, create_occlusion_mask
+from facefusion.face_masker import create_area_mask, create_occlusion_mask, create_static_box_mask
 from facefusion.face_selector import find_similar_faces, sort_and_filter_faces
 from facefusion.face_store import get_reference_faces
 from facefusion.filesystem import filter_audio_paths, has_audio, in_directory, is_image, is_video, resolve_relative_path, same_file_extension
@@ -49,6 +49,7 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 					'path': resolve_relative_path('../.assets/models/wav2lip_96.onnx')
 				}
 			},
+			'type': 'wav2lip',
 			'size': (96, 96)
 		},
 		'wav2lip_gan_96':
@@ -69,7 +70,29 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 					'path': resolve_relative_path('../.assets/models/wav2lip_gan_96.onnx')
 				}
 			},
+			'type': 'wav2lip',
 			'size': (96, 96)
+		},
+		'edtalk_256':
+		{
+			'hashes':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
+				}
+			},
+			'sources':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
+				}
+			},
+			'type': 'edtalk',
+			'size': (256, 256)
 		}
 	}
 
@@ -144,22 +167,29 @@ def post_process() -> None:
 
 
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
+	model_name = state_manager.get_item('lip_syncer_model')
 	temp_audio_frame = prepare_audio_frame(temp_audio_frame)
 	crop_vision_frame, affine_matrix = warp_face_by_face_landmark_5(temp_vision_frame, target_face.landmark_set.get('5/68'), 'ffhq_512', (512, 512))
-	face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
-	area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
+	box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
 	crop_masks =\
 	[
-		area_mask
+		box_mask
 	]
 
 	if 'occlusion' in state_manager.get_item('face_mask_types'):
 		occlusion_mask = create_occlusion_mask(crop_vision_frame)
 		crop_masks.append(occlusion_mask)
 
-	bounding_box = create_bounding_box(face_landmark_68)
-	bounding_box = prepare_bounding_box(bounding_box)
-	crop_vision_frame = process_wav2lip(crop_vision_frame, temp_audio_frame, bounding_box)
+	if model_name.startswith('wav2lip'):
+		face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
+		area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
+		crop_masks.append(area_mask)
+		bounding_box = create_bounding_box(face_landmark_68)
+		bounding_box = prepare_bounding_box(bounding_box)
+		crop_vision_frame = process_wav2lip(crop_vision_frame, temp_audio_frame, bounding_box)
+	elif model_name == 'edtalk_256':
+		crop_vision_frame = process_edtalk(crop_vision_frame, temp_audio_frame)
+
 	crop_mask = numpy.minimum.reduce(crop_masks)
 	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
 	return paste_vision_frame
@@ -168,14 +198,22 @@ def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_fram
 def process_wav2lip(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame, bounding_box : BoundingBox) -> VisionFrame:
 	model_size = get_model_options().get('size')
 	close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
-	close_vision_frame = prepare_close_frame(close_vision_frame)
-	close_vision_frame = forward(temp_audio_frame, close_vision_frame)
-	close_vision_frame = normalize_close_frame(close_vision_frame)
+	close_vision_frame = prepare_crop_frame(close_vision_frame)
+	close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
+	close_vision_frame = normalize_crop_frame(close_vision_frame)
 	crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
 	return crop_vision_frame
 
 
-def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
+def process_edtalk(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame) -> VisionFrame:
+	lip_syncer_weight = state_manager.get_item('lip_syncer_weight') * 1.25
+	crop_vision_frame = prepare_crop_frame(crop_vision_frame)
+	crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
+	crop_vision_frame = normalize_crop_frame(crop_vision_frame)
+	return crop_vision_frame
+
+
+def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
 	with conditional_thread_semaphore():
@@ -188,6 +226,20 @@ def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) ->
 	return close_vision_frame
 
 
+def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : float) -> VisionFrame:
+	lip_syncer = get_inference_pool().get('lip_syncer')
+
+	with conditional_thread_semaphore():
+		crop_vision_frame = lip_syncer.run(None,
+		{
+			'source': temp_audio_frame,
+			'target': crop_vision_frame,
+			'weight': [ numpy.float32(lip_syncer_weight) ]
+		})[0]
+
+	return crop_vision_frame
+
+
 def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 	temp_audio_frame = numpy.maximum(numpy.exp(-5 * numpy.log(10)), temp_audio_frame)
 	temp_audio_frame = numpy.log10(temp_audio_frame) * 1.6 + 3.2
@@ -197,14 +249,22 @@ def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 	return temp_audio_frame
 
 
-def prepare_close_frame(close_vision_frame : VisionFrame) -> VisionFrame:
+def prepare_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+	model_type = get_model_options().get('type')
 	model_size = get_model_options().get('size')
-	close_vision_frame = numpy.expand_dims(close_vision_frame, axis = 0)
-	prepare_vision_frame = close_vision_frame.copy()
-	prepare_vision_frame[:, model_size[0] // 2:] = 0
-	close_vision_frame = numpy.concatenate((prepare_vision_frame, close_vision_frame), axis = 3)
-	close_vision_frame = close_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
-	return close_vision_frame
+
+	if model_type == 'wav2lip':
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
+		prepare_vision_frame = crop_vision_frame.copy()
+		prepare_vision_frame[:, model_size[0] // 2:] = 0
+		crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
+		crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
+	elif model_type == 'edtalk':
+		crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
+		crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
+
+	return crop_vision_frame
 
 
 def prepare_bounding_box(bounding_box : BoundingBox) -> BoundingBox:
@@ -215,10 +275,16 @@ def prepare_bounding_box(bounding_box : BoundingBox) -> BoundingBox:
 	return bounding_box
 
 
-def normalize_close_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+	model_type = get_model_options().get('type')
 	crop_vision_frame = crop_vision_frame[0].transpose(1, 2, 0)
 	crop_vision_frame = crop_vision_frame.clip(0, 1) * 255
 	crop_vision_frame = crop_vision_frame.astype(numpy.uint8)
+
+	if model_type == 'edtalk':
+		crop_vision_frame = crop_vision_frame[:, :, ::-1]
+		crop_vision_frame = cv2.resize(crop_vision_frame, (512, 512), interpolation = cv2.INTER_CUBIC)
+
 	return crop_vision_frame
diff --git a/facefusion/processors/types.py b/facefusion/processors/types.py
index a499c30..ed7d535 100644
--- a/facefusion/processors/types.py
+++ b/facefusion/processors/types.py
@@ -13,7 +13,7 @@ FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.
 FaceSwapperModel = Literal['blendswap_256', 'ghost_1_256', 'ghost_2_256', 'ghost_3_256', 'hififace_unofficial_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_unofficial_512', 'uniface_256']
 FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4']
-LipSyncerModel = Literal['wav2lip_96', 'wav2lip_gan_96']
+LipSyncerModel = Literal['wav2lip_96', 'wav2lip_gan_96', 'edtalk_256']
 
 FaceSwapperSet : TypeAlias = Dict[FaceSwapperModel, List[str]]
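
Not part of the patch: a standalone sketch of the edtalk_256 image round trip that the new prepare_crop_frame() and normalize_crop_frame() branches implement, with a zero image standing in for the 512x512 BGR crop and an identity function standing in for the ONNX session:

	import cv2
	import numpy

	crop_vision_frame = numpy.zeros((512, 512, 3), dtype = numpy.uint8)

	# prepare_crop_frame(), edtalk branch: 256x256, RGB, CHW, batched, scaled to [0, 1]
	model_input = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
	model_input = model_input[:, :, ::-1] / 255.0
	model_input = numpy.expand_dims(model_input.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
	print(model_input.shape)	# (1, 3, 256, 256)

	# normalize_crop_frame(), edtalk branch: undo everything so paste_back() gets a 512x512 BGR frame
	model_output = model_input	# pretend the lip syncer returned its input
	restored = model_output[0].transpose(1, 2, 0).clip(0, 1) * 255
	restored = restored.astype(numpy.uint8)[:, :, ::-1]
	restored = cv2.resize(restored, (512, 512), interpolation = cv2.INTER_CUBIC)
	print(restored.shape)	# (512, 512, 3)
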
From 7905cfe6a33d4af69a819787aa006c9b4f6827e5 Mon Sep 17 00:00:00 2001
From: harisreedhar
Date: Thu, 12 Jun 2025 20:07:41 +0530
Subject: [PATCH 4/4] stage 1 OCD

---
 facefusion/processors/choices.py            |   2 +-
 facefusion/processors/modules/lip_syncer.py | 103 +++++++++-----------
 facefusion/processors/types.py              |   3 +-
 3 files changed, 49 insertions(+), 59 deletions(-)

diff --git a/facefusion/processors/choices.py b/facefusion/processors/choices.py
index 342c2f4..3e1222f 100755
--- a/facefusion/processors/choices.py
+++ b/facefusion/processors/choices.py
@@ -195,7 +195,7 @@ face_swapper_models : List[FaceSwapperModel] = list(face_swapper_set.keys())
 frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_colorizer_sizes : List[str] = [ '192x192', '256x256', '384x384', '512x512' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4' ]
-lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_96', 'wav2lip_gan_96', 'edtalk_256' ]
+lip_syncer_models : List[LipSyncerModel] = [ 'edtalk_256', 'wav2lip_96', 'wav2lip_gan_96' ]
 
 age_modifier_direction_range : Sequence[int] = create_int_range(-100, 100, 1)
 deep_swapper_morph_range : Sequence[int] = create_int_range(0, 100, 1)
diff --git a/facefusion/processors/modules/lip_syncer.py b/facefusion/processors/modules/lip_syncer.py
index 9932538..9fd3f82 100755
--- a/facefusion/processors/modules/lip_syncer.py
+++ b/facefusion/processors/modules/lip_syncer.py
@@ -20,7 +20,7 @@ from facefusion.face_selector import find_similar_faces, sort_and_filter_faces
 from facefusion.face_store import get_reference_faces
 from facefusion.filesystem import filter_audio_paths, has_audio, in_directory, is_image, is_video, resolve_relative_path, same_file_extension
 from facefusion.processors import choices as processors_choices
-from facefusion.processors.types import LipSyncerInputs
+from facefusion.processors.types import LipSyncerInputs, LipSyncerWeight
 from facefusion.program_helper import find_argument_group
 from facefusion.thread_helper import conditional_thread_semaphore
 from facefusion.types import ApplyStateItem, Args, AudioFrame, BoundingBox, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
@@ -31,6 +31,27 @@ from facefusion.vision import read_image, read_static_image, restrict_video_fps,
 def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 	return\
 	{
+		'edtalk_256':
+		{
+			'hashes':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
+				}
+			},
+			'sources':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
+				}
+			},
+			'type': 'edtalk',
+			'size': (256, 256)
+		},
 		'wav2lip_96':
 		{
 			'hashes':
@@ -72,27 +93,6 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 			},
 			'type': 'wav2lip',
 			'size': (96, 96)
-		},
-		'edtalk_256':
-		{
-			'hashes':
-			{
-				'lip_syncer':
-				{
-					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
-					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
-				}
-			},
-			'sources':
-			{
-				'lip_syncer':
-				{
-					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
-					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
-				}
-			},
-			'type': 'edtalk',
-			'size': (256, 256)
 		}
 	}
 
@@ -168,51 +168,40 @@ def post_process() -> None:
 
 
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
 	model_name = state_manager.get_item('lip_syncer_model')
+	model_size = get_model_options().get('size')
 	temp_audio_frame = prepare_audio_frame(temp_audio_frame)
 	crop_vision_frame, affine_matrix = warp_face_by_face_landmark_5(temp_vision_frame, target_face.landmark_set.get('5/68'), 'ffhq_512', (512, 512))
-	box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
-	crop_masks =\
-	[
-		box_mask
-	]
+	crop_masks = []
 
 	if 'occlusion' in state_manager.get_item('face_mask_types'):
 		occlusion_mask = create_occlusion_mask(crop_vision_frame)
 		crop_masks.append(occlusion_mask)
 
+	if model_name == 'edtalk_256':
+		lip_syncer_weight = numpy.array([ state_manager.get_item('lip_syncer_weight') ]).astype(numpy.float32) * 1.25
+		box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
+		crop_masks.append(box_mask)
+		crop_vision_frame = prepare_crop_frame(crop_vision_frame)
+		crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
+		crop_vision_frame = normalize_crop_frame(crop_vision_frame)
+
 	if model_name.startswith('wav2lip'):
 		face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
 		area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
 		crop_masks.append(area_mask)
 		bounding_box = create_bounding_box(face_landmark_68)
-		bounding_box = prepare_bounding_box(bounding_box)
-		crop_vision_frame = process_wav2lip(crop_vision_frame, temp_audio_frame, bounding_box)
-	elif model_name == 'edtalk_256':
-		crop_vision_frame = process_edtalk(crop_vision_frame, temp_audio_frame)
+		bounding_box = resize_bounding_box(bounding_box, 4 / 3)
+		close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
+		close_vision_frame = prepare_crop_frame(close_vision_frame)
+		close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
+		close_vision_frame = normalize_crop_frame(close_vision_frame)
+		crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
 
 	crop_mask = numpy.minimum.reduce(crop_masks)
 	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
 	return paste_vision_frame
 
 
-def process_wav2lip(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame, bounding_box : BoundingBox) -> VisionFrame:
-	model_size = get_model_options().get('size')
-	close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
-	close_vision_frame = prepare_crop_frame(close_vision_frame)
-	close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
-	close_vision_frame = normalize_crop_frame(close_vision_frame)
-	crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
-	return crop_vision_frame
-
-
-def process_edtalk(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame) -> VisionFrame:
-	lip_syncer_weight = state_manager.get_item('lip_syncer_weight') * 1.25
-	crop_vision_frame = prepare_crop_frame(crop_vision_frame)
-	crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
-	crop_vision_frame = normalize_crop_frame(crop_vision_frame)
-	return crop_vision_frame
-
-
 def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
@@ -226,7 +215,7 @@ def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFr
 	return close_vision_frame
 
 
-def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : float) -> VisionFrame:
+def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : LipSyncerWeight) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
 	with conditional_thread_semaphore():
@@ -234,7 +223,7 @@ def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFram
 		{
 			'source': temp_audio_frame,
 			'target': crop_vision_frame,
-			'weight': [ numpy.float32(lip_syncer_weight) ]
+			'weight': lip_syncer_weight
 		})[0]
 
 	return crop_vision_frame
@@ -253,24 +242,24 @@ def prepare_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
 	model_type = get_model_options().get('type')
 	model_size = get_model_options().get('size')
 
+	if model_type == 'edtalk':
+		crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
+		crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
 	if model_type == 'wav2lip':
 		crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
 		prepare_vision_frame = crop_vision_frame.copy()
 		prepare_vision_frame[:, model_size[0] // 2:] = 0
 		crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
 		crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
-	elif model_type == 'edtalk':
-		crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
-		crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
-		crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
 
 	return crop_vision_frame
 
 
-def prepare_bounding_box(bounding_box : BoundingBox) -> BoundingBox:
+def resize_bounding_box(bounding_box : BoundingBox, aspect_ratio : float) -> BoundingBox:
 	bounding_box[3] = min(bounding_box[3] + 8, 511)
 	x1, y1, x2, y2 = bounding_box
-	y1 = y2 - (4 / 3) * (x2 - x1)
+	y1 = y2 - aspect_ratio * (x2 - x1)
 	bounding_box[1] = max(y1, 0)
 	return bounding_box
diff --git a/facefusion/processors/types.py b/facefusion/processors/types.py
index ed7d535..5bfc132 100644
--- a/facefusion/processors/types.py
+++ b/facefusion/processors/types.py
@@ -13,7 +13,7 @@ FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.
 FaceSwapperModel = Literal['blendswap_256', 'ghost_1_256', 'ghost_2_256', 'ghost_3_256', 'hififace_unofficial_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_unofficial_512', 'uniface_256']
 FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4']
-LipSyncerModel = Literal['wav2lip_96', 'wav2lip_gan_96', 'edtalk_256']
+LipSyncerModel = Literal['edtalk_256', 'wav2lip_96', 'wav2lip_gan_96']
 
 FaceSwapperSet : TypeAlias = Dict[FaceSwapperModel, List[str]]
@@ -147,6 +147,7 @@ ProcessorStateSet : TypeAlias = Dict[AppContext, ProcessorState]
 AgeModifierDirection : TypeAlias = NDArray[Any]
 DeepSwapperMorph : TypeAlias = NDArray[Any]
 FaceEnhancerWeight : TypeAlias = NDArray[Any]
+LipSyncerWeight : TypeAlias = NDArray[Any]
 LivePortraitPitch : TypeAlias = float
 LivePortraitYaw : TypeAlias = float
 LivePortraitRoll : TypeAlias = float
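
Two observations on this last patch, not part of it: resize_bounding_box(bounding_box, 4 / 3) reproduces the old prepare_bounding_box() behaviour while making the aspect ratio a parameter, and the lip syncer weight now reaches forward_edtalk() as a ready-made float32 array matching the new LipSyncerWeight alias. A standalone sketch of that weight tensor, with 0.8 as a made-up slider value:

	import numpy

	lip_syncer_weight = numpy.array([ 0.8 ]).astype(numpy.float32) * 1.25
	print(lip_syncer_weight.shape, lip_syncer_weight.dtype, lip_syncer_weight)	# (1,) float32 [1.]
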