Merge pull request #897 from facefusion/organize-lipsyncer

Add EDTalk and reorganize lip syncer
Henry Ruhs
2025-06-12 17:13:30 +02:00
committed by GitHub
3 changed files with 95 additions and 25 deletions

facefusion/processors/choices.py

@@ -195,7 +195,7 @@ face_swapper_models : List[FaceSwapperModel] = list(face_swapper_set.keys())
 frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_colorizer_sizes : List[str] = [ '192x192', '256x256', '384x384', '512x512' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4' ]
-lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_96', 'wav2lip_gan_96' ]
+lip_syncer_models : List[LipSyncerModel] = [ 'edtalk_256', 'wav2lip_96', 'wav2lip_gan_96' ]
 age_modifier_direction_range : Sequence[int] = create_int_range(-100, 100, 1)
 deep_swapper_morph_range : Sequence[int] = create_int_range(0, 100, 1)
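Note: these module-level lists are what the CLI and UI expose as valid choices, so adding 'edtalk_256' here is what makes the model selectable. A minimal, hypothetical sketch of the pattern (the argparse wiring below is illustrative, not the repository's exact registration code):

```python
# Illustrative sketch only: gate a --lip-syncer-model argument on the
# extended choices list, the way processor arguments are typically registered.
from argparse import ArgumentParser

lip_syncer_models = [ 'edtalk_256', 'wav2lip_96', 'wav2lip_gan_96' ]

parser = ArgumentParser()
parser.add_argument('--lip-syncer-model', choices = lip_syncer_models, default = 'wav2lip_gan_96')

args = parser.parse_args([ '--lip-syncer-model', 'edtalk_256' ])
print(args.lip_syncer_model)  # edtalk_256
```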

facefusion/processors/modules/lip_syncer.py

@@ -15,15 +15,15 @@ from facefusion.common_helper import get_first
 from facefusion.download import conditional_download_hashes, conditional_download_sources, resolve_download_url
 from facefusion.face_analyser import get_many_faces, get_one_face
 from facefusion.face_helper import create_bounding_box, paste_back, warp_face_by_bounding_box, warp_face_by_face_landmark_5
-from facefusion.face_masker import create_area_mask, create_occlusion_mask
+from facefusion.face_masker import create_area_mask, create_occlusion_mask, create_static_box_mask
 from facefusion.face_selector import find_similar_faces, sort_and_filter_faces
 from facefusion.face_store import get_reference_faces
 from facefusion.filesystem import filter_audio_paths, has_audio, in_directory, is_image, is_video, resolve_relative_path, same_file_extension
 from facefusion.processors import choices as processors_choices
-from facefusion.processors.types import LipSyncerInputs
+from facefusion.processors.types import LipSyncerInputs, LipSyncerWeight
 from facefusion.program_helper import find_argument_group
 from facefusion.thread_helper import conditional_thread_semaphore
-from facefusion.types import ApplyStateItem, Args, AudioFrame, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
+from facefusion.types import ApplyStateItem, Args, AudioFrame, BoundingBox, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
 from facefusion.vision import read_image, read_static_image, restrict_video_fps, write_image
@@ -31,6 +31,27 @@ from facefusion.vision import read_image, read_static_image, restrict_video_fps,
 def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 	return\
 	{
+		'edtalk_256':
+		{
+			'hashes':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
+				}
+			},
+			'sources':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
+				}
+			},
+			'type': 'edtalk',
+			'size': (256, 256)
+		},
 		'wav2lip_96':
 		{
 			'hashes':
@@ -49,6 +70,7 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 					'path': resolve_relative_path('../.assets/models/wav2lip_96.onnx')
 				}
 			},
+			'type': 'wav2lip',
 			'size': (96, 96)
 		},
 		'wav2lip_gan_96':
@@ -69,6 +91,7 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 					'path': resolve_relative_path('../.assets/models/wav2lip_gan_96.onnx')
 				}
 			},
+			'type': 'wav2lip',
 			'size': (96, 96)
 		}
 	}
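The new 'type' key is the organizational point of this commit: pre- and post-processing now branch on the model family ('edtalk' vs 'wav2lip') rather than on individual checkpoint names, so both wav2lip checkpoints share one code path. A small sketch of the lookup this enables (placeholder dict with only the fields relevant here, not the module's full model set):

```python
# Sketch: branch on the model family, not the checkpoint name, so
# wav2lip_96 and wav2lip_gan_96 share preprocessing. Placeholder data.
model_set = {
	'edtalk_256': { 'type': 'edtalk', 'size': (256, 256) },
	'wav2lip_96': { 'type': 'wav2lip', 'size': (96, 96) },
	'wav2lip_gan_96': { 'type': 'wav2lip', 'size': (96, 96) }
}

def describe(model_name : str) -> str:
	model_options = model_set[model_name]
	return f"{model_options.get('type')} at {model_options.get('size')}"

print(describe('wav2lip_gan_96'))  # wav2lip at (96, 96)
```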
@@ -144,33 +167,42 @@ def post_process() -> None:
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
+	model_name = state_manager.get_item('lip_syncer_model')
 	model_size = get_model_options().get('size')
 	temp_audio_frame = prepare_audio_frame(temp_audio_frame)
 	crop_vision_frame, affine_matrix = warp_face_by_face_landmark_5(temp_vision_frame, target_face.landmark_set.get('5/68'), 'ffhq_512', (512, 512))
-	face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
-	bounding_box = create_bounding_box(face_landmark_68)
-	bounding_box[1] -= numpy.abs(bounding_box[3] - bounding_box[1]) * 0.125
-	area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
-	crop_masks =\
-	[
-		area_mask
-	]
+	crop_masks = []
 	if 'occlusion' in state_manager.get_item('face_mask_types'):
 		occlusion_mask = create_occlusion_mask(crop_vision_frame)
 		crop_masks.append(occlusion_mask)
-	close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
-	close_vision_frame = prepare_crop_frame(close_vision_frame)
-	close_vision_frame = forward(temp_audio_frame, close_vision_frame)
-	close_vision_frame = normalize_close_frame(close_vision_frame)
-	crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
+	if model_name == 'edtalk_256':
+		lip_syncer_weight = numpy.array([ state_manager.get_item('lip_syncer_weight') ]).astype(numpy.float32) * 1.25
+		box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
+		crop_masks.append(box_mask)
+		crop_vision_frame = prepare_crop_frame(crop_vision_frame)
+		crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
+		crop_vision_frame = normalize_crop_frame(crop_vision_frame)
+	if model_name.startswith('wav2lip'):
+		face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
+		area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
+		crop_masks.append(area_mask)
+		bounding_box = create_bounding_box(face_landmark_68)
+		bounding_box = resize_bounding_box(bounding_box, 4 / 3)
+		close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
+		close_vision_frame = prepare_crop_frame(close_vision_frame)
+		close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
+		close_vision_frame = normalize_crop_frame(close_vision_frame)
+		crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
 	crop_mask = numpy.minimum.reduce(crop_masks)
 	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
 	return paste_vision_frame


-def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
+def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 	with conditional_thread_semaphore():
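All applicable masks (a static box mask on the EDTalk path, the lower-face area mask on the wav2lip path, plus the optional occlusion mask) end up in crop_masks and are intersected by numpy.minimum.reduce before paste_back. A toy illustration of that reduction (synthetic 4x4 masks, not the module's helpers):

```python
import numpy

# Masks are float arrays in [0, 1]; numpy.minimum.reduce keeps each pixel
# only as strongly as its weakest mask, i.e. an intersection.
box_mask = numpy.ones((4, 4), dtype = numpy.float32)
occlusion_mask = numpy.ones((4, 4), dtype = numpy.float32)
occlusion_mask[0, :] = 0.0  # pretend the top row is occluded

crop_mask = numpy.minimum.reduce([ box_mask, occlusion_mask ])
print(crop_mask[0, 0], crop_mask[3, 3])  # 0.0 1.0
```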
@@ -183,6 +215,20 @@ def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) ->
 	return close_vision_frame


+def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : LipSyncerWeight) -> VisionFrame:
+	lip_syncer = get_inference_pool().get('lip_syncer')
+	with conditional_thread_semaphore():
+		crop_vision_frame = lip_syncer.run(None,
+		{
+			'source': temp_audio_frame,
+			'target': crop_vision_frame,
+			'weight': lip_syncer_weight
+		})[0]
+	return crop_vision_frame
+
+
 def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 	temp_audio_frame = numpy.maximum(numpy.exp(-5 * numpy.log(10)), temp_audio_frame)
 	temp_audio_frame = numpy.log10(temp_audio_frame) * 1.6 + 3.2
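For readability: numpy.exp(-5 * numpy.log(10)) is exactly 10 ** -5, so prepare_audio_frame floors the mel spectrogram at 1e-5, and the affine log10 rescale then maps that floor to -4.8 and a magnitude of 1.0 to 3.2. A quick check of the arithmetic:

```python
import numpy

# exp(-5 * ln 10) == 10 ** -5; after log10(x) * 1.6 + 3.2 the floor
# lands at -4.8 and a magnitude of 1.0 lands at 3.2.
floor = numpy.exp(-5 * numpy.log(10))
print(numpy.isclose(floor, 1e-5))  # True

temp_audio_frame = numpy.array([ 0.0, 1e-5, 1.0 ])
temp_audio_frame = numpy.maximum(floor, temp_audio_frame)
temp_audio_frame = numpy.log10(temp_audio_frame) * 1.6 + 3.2
print(temp_audio_frame)  # [-4.8 -4.8  3.2]
```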
@@ -193,18 +239,41 @@ def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 def prepare_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+	model_type = get_model_options().get('type')
+	model_size = get_model_options().get('size')
-	crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
-	prepare_vision_frame = crop_vision_frame.copy()
-	prepare_vision_frame[:, 48:] = 0
-	crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
-	crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
+	if model_type == 'edtalk':
+		crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
+		crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
+	if model_type == 'wav2lip':
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
+		prepare_vision_frame = crop_vision_frame.copy()
+		prepare_vision_frame[:, model_size[0] // 2:] = 0
+		crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
+		crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
 	return crop_vision_frame


-def normalize_close_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+def resize_bounding_box(bounding_box : BoundingBox, aspect_ratio : float) -> BoundingBox:
+	bounding_box[3] += min(8, 511)
+	x1, y1, x2, y2 = bounding_box
+	y1 = y2 - aspect_ratio * (x2 - x1)
+	bounding_box[1] = max(y1, 0)
+	return bounding_box
+
+
+def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+	model_type = get_model_options().get('type')
 	crop_vision_frame = crop_vision_frame[0].transpose(1, 2, 0)
 	crop_vision_frame = crop_vision_frame.clip(0, 1) * 255
 	crop_vision_frame = crop_vision_frame.astype(numpy.uint8)
+	if model_type == 'edtalk':
+		crop_vision_frame = crop_vision_frame[:, :, ::-1]
+		crop_vision_frame = cv2.resize(crop_vision_frame, (512, 512), interpolation = cv2.INTER_CUBIC)
 	return crop_vision_frame
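The wav2lip branch of prepare_crop_frame builds the model's six-channel input: a copy of the crop with its lower half zeroed (the region the model must re-synthesize) is concatenated channel-wise with the original, then laid out as NCHW in [0, 1]. A shape walkthrough on a dummy 96x96 crop (toy data only):

```python
import numpy

model_size = (96, 96)
crop_vision_frame = numpy.zeros((96, 96, 3), dtype = numpy.uint8)

# (96, 96, 3) -> (1, 96, 96, 3); axis 1 is now the image height
crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
prepare_vision_frame = crop_vision_frame.copy()
prepare_vision_frame[:, model_size[0] // 2:] = 0  # zero rows 48 and below

# channel-wise concat -> (1, 96, 96, 6), then NHWC -> NCHW, scaled to [0, 1]
crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
print(crop_vision_frame.shape)  # (1, 6, 96, 96)
```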

facefusion/processors/types.py

@@ -13,7 +13,7 @@ FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.
 FaceSwapperModel = Literal['blendswap_256', 'ghost_1_256', 'ghost_2_256', 'ghost_3_256', 'hififace_unofficial_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_unofficial_512', 'uniface_256']
 FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4']
-LipSyncerModel = Literal['wav2lip_96', 'wav2lip_gan_96']
+LipSyncerModel = Literal['edtalk_256', 'wav2lip_96', 'wav2lip_gan_96']
 FaceSwapperSet : TypeAlias = Dict[FaceSwapperModel, List[str]]
@@ -147,6 +147,7 @@ ProcessorStateSet : TypeAlias = Dict[AppContext, ProcessorState]
 AgeModifierDirection : TypeAlias = NDArray[Any]
 DeepSwapperMorph : TypeAlias = NDArray[Any]
 FaceEnhancerWeight : TypeAlias = NDArray[Any]
+LipSyncerWeight : TypeAlias = NDArray[Any]
 LivePortraitPitch : TypeAlias = float
 LivePortraitYaw : TypeAlias = float
 LivePortraitRoll : TypeAlias = float
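LipSyncerWeight follows the same convention as FaceEnhancerWeight: a small float32 array passed straight to the ONNX session. A sketch of constructing it the way sync_lip does for edtalk_256 (make_lip_syncer_weight is a hypothetical helper and the 0.8 input an arbitrary example value):

```python
from typing import Any

import numpy
from numpy.typing import NDArray

LipSyncerWeight = NDArray[Any]

# Mirror sync_lip: wrap the user weight in a one-element float32 array
# and scale it by 1.25 before it reaches the 'weight' model input.
def make_lip_syncer_weight(weight : float) -> LipSyncerWeight:
	return numpy.array([ weight ]).astype(numpy.float32) * 1.25

print(make_lip_syncer_weight(0.8))  # [1.]
```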