Merge pull request #897 from facefusion/organize-lipsyncer
Add Edtalk and reorganize lipsyncer
@@ -195,7 +195,7 @@ face_swapper_models : List[FaceSwapperModel] = list(face_swapper_set.keys())
 frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_colorizer_sizes : List[str] = [ '192x192', '256x256', '384x384', '512x512' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4' ]
-lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_96', 'wav2lip_gan_96' ]
+lip_syncer_models : List[LipSyncerModel] = [ 'edtalk_256', 'wav2lip_96', 'wav2lip_gan_96' ]
 
 age_modifier_direction_range : Sequence[int] = create_int_range(-100, 100, 1)
 deep_swapper_morph_range : Sequence[int] = create_int_range(0, 100, 1)
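For orientation, a minimal sketch (not part of this diff) of how this list is typically consumed: the lip syncer registers its model argument with the list as the allowed CLI choices, so 'edtalk_256' becomes selectable once it appears here. The exact flag name and default below are assumptions for the sketch.

from argparse import ArgumentParser

from facefusion.processors import choices as processors_choices

# hypothetical wiring: expose the model list as CLI choices
program = ArgumentParser()
program.add_argument('--lip-syncer-model', default = 'wav2lip_gan_96', choices = processors_choices.lip_syncer_models)
args = program.parse_args([ '--lip-syncer-model', 'edtalk_256' ])
print(args.lip_syncer_model) # edtalk_256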
@@ -15,15 +15,15 @@ from facefusion.common_helper import get_first
 from facefusion.download import conditional_download_hashes, conditional_download_sources, resolve_download_url
 from facefusion.face_analyser import get_many_faces, get_one_face
 from facefusion.face_helper import create_bounding_box, paste_back, warp_face_by_bounding_box, warp_face_by_face_landmark_5
-from facefusion.face_masker import create_area_mask, create_occlusion_mask
+from facefusion.face_masker import create_area_mask, create_occlusion_mask, create_static_box_mask
 from facefusion.face_selector import find_similar_faces, sort_and_filter_faces
 from facefusion.face_store import get_reference_faces
 from facefusion.filesystem import filter_audio_paths, has_audio, in_directory, is_image, is_video, resolve_relative_path, same_file_extension
 from facefusion.processors import choices as processors_choices
-from facefusion.processors.types import LipSyncerInputs
+from facefusion.processors.types import LipSyncerInputs, LipSyncerWeight
 from facefusion.program_helper import find_argument_group
 from facefusion.thread_helper import conditional_thread_semaphore
-from facefusion.types import ApplyStateItem, Args, AudioFrame, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
+from facefusion.types import ApplyStateItem, Args, AudioFrame, BoundingBox, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
 from facefusion.vision import read_image, read_static_image, restrict_video_fps, write_image
 
 
@@ -31,6 +31,27 @@ from facefusion.vision import read_image, read_static_image, restrict_video_fps, write_image
 def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 	return\
 	{
+		'edtalk_256':
+		{
+			'hashes':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
+				}
+			},
+			'sources':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
+				}
+			},
+			'type': 'edtalk',
+			'size': (256, 256)
+		},
 		'wav2lip_96':
 		{
 			'hashes':
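The new entry follows the existing ModelSet layout: 'hashes' and 'sources' drive the download helpers, while 'type' and 'size' are read back by the processing code further down. Illustrative only, assuming 'full' is a valid DownloadScope value:

model_options = create_static_model_set('full').get('edtalk_256')
print(model_options.get('type')) # edtalk
print(model_options.get('size')) # (256, 256)
print(model_options.get('sources').get('lip_syncer').get('path')) # resolved path to edtalk_256.onnx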
@@ -49,6 +70,7 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 					'path': resolve_relative_path('../.assets/models/wav2lip_96.onnx')
 				}
 			},
+			'type': 'wav2lip',
 			'size': (96, 96)
 		},
 		'wav2lip_gan_96':
@@ -69,6 +91,7 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 					'path': resolve_relative_path('../.assets/models/wav2lip_gan_96.onnx')
 				}
 			},
+			'type': 'wav2lip',
 			'size': (96, 96)
 		}
 	}
@@ -144,33 +167,42 @@ def post_process() -> None:
 
 
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
+	model_name = state_manager.get_item('lip_syncer_model')
 	model_size = get_model_options().get('size')
 	temp_audio_frame = prepare_audio_frame(temp_audio_frame)
 	crop_vision_frame, affine_matrix = warp_face_by_face_landmark_5(temp_vision_frame, target_face.landmark_set.get('5/68'), 'ffhq_512', (512, 512))
-	face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
-	bounding_box = create_bounding_box(face_landmark_68)
-	bounding_box[1] -= numpy.abs(bounding_box[3] - bounding_box[1]) * 0.125
-	area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
-	crop_masks =\
-	[
-		area_mask
-	]
+	crop_masks = []
 
 	if 'occlusion' in state_manager.get_item('face_mask_types'):
 		occlusion_mask = create_occlusion_mask(crop_vision_frame)
 		crop_masks.append(occlusion_mask)
 
-	close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
-	close_vision_frame = prepare_crop_frame(close_vision_frame)
-	close_vision_frame = forward(temp_audio_frame, close_vision_frame)
-	close_vision_frame = normalize_close_frame(close_vision_frame)
-	crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
+	if model_name == 'edtalk_256':
+		lip_syncer_weight = numpy.array([ state_manager.get_item('lip_syncer_weight') ]).astype(numpy.float32) * 1.25
+		box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
+		crop_masks.append(box_mask)
+		crop_vision_frame = prepare_crop_frame(crop_vision_frame)
+		crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
+		crop_vision_frame = normalize_crop_frame(crop_vision_frame)
+
+	if model_name.startswith('wav2lip'):
+		face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
+		area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
+		crop_masks.append(area_mask)
+		bounding_box = create_bounding_box(face_landmark_68)
+		bounding_box = resize_bounding_box(bounding_box, 4 / 3)
+		close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
+		close_vision_frame = prepare_crop_frame(close_vision_frame)
+		close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
+		close_vision_frame = normalize_crop_frame(close_vision_frame)
+		crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
 
 	crop_mask = numpy.minimum.reduce(crop_masks)
 	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
 	return paste_vision_frame
 
 
-def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
+def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
 	with conditional_thread_semaphore():
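Both branches append their masks to crop_masks, and numpy.minimum.reduce keeps a pixel only where every mask keeps it. A standalone sketch with toy values:

import numpy

box_mask = numpy.ones((4, 4), dtype = numpy.float32)
occlusion_mask = numpy.ones((4, 4), dtype = numpy.float32)
occlusion_mask[1:3, 1:3] = 0.0 # pretend something occludes the centre

# elementwise minimum across all masks: the occluded centre stays masked out
crop_mask = numpy.minimum.reduce([ box_mask, occlusion_mask ])
assert crop_mask[0, 0] == 1.0 and crop_mask[1, 1] == 0.0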
@@ -183,6 +215,20 @@ def forward(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) ->
 	return close_vision_frame
 
 
+def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : LipSyncerWeight) -> VisionFrame:
+	lip_syncer = get_inference_pool().get('lip_syncer')
+
+	with conditional_thread_semaphore():
+		crop_vision_frame = lip_syncer.run(None,
+		{
+			'source': temp_audio_frame,
+			'target': crop_vision_frame,
+			'weight': lip_syncer_weight
+		})[0]
+
+	return crop_vision_frame
+
+
 def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 	temp_audio_frame = numpy.maximum(numpy.exp(-5 * numpy.log(10)), temp_audio_frame)
 	temp_audio_frame = numpy.log10(temp_audio_frame) * 1.6 + 3.2
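The prepare_audio_frame context above clamps the audio frame at numpy.exp(-5 * numpy.log(10)), which is exactly 1e-5, before the log10 rescale. A quick numeric check of the mapping:

import numpy

floor = numpy.exp(-5 * numpy.log(10))
assert numpy.isclose(floor, 1e-5)
assert numpy.isclose(numpy.log10(floor) * 1.6 + 3.2, -4.8) # quietest possible input
assert numpy.isclose(numpy.log10(1.0) * 1.6 + 3.2, 3.2) # unit energy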
@@ -193,18 +239,41 @@ def prepare_audio_frame(temp_audio_frame : AudioFrame) -> AudioFrame:
 
 
 def prepare_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
-	crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
-	prepare_vision_frame = crop_vision_frame.copy()
-	prepare_vision_frame[:, 48:] = 0
-	crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
-	crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
+	model_type = get_model_options().get('type')
+	model_size = get_model_options().get('size')
+
+	if model_type == 'edtalk':
+		crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
+		crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
+
+	if model_type == 'wav2lip':
+		crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
+		prepare_vision_frame = crop_vision_frame.copy()
+		prepare_vision_frame[:, model_size[0] // 2:] = 0
+		crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
+		crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
 
 	return crop_vision_frame
 
 
-def normalize_close_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+def resize_bounding_box(bounding_box : BoundingBox, aspect_ratio : float) -> BoundingBox:
+	bounding_box[3] = min(bounding_box[3] + 8, 511)
+	x1, y1, x2, y2 = bounding_box
+	y1 = y2 - aspect_ratio * (x2 - x1)
+	bounding_box[1] = max(y1, 0)
+	return bounding_box
+
+
+def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
+	model_type = get_model_options().get('type')
 	crop_vision_frame = crop_vision_frame[0].transpose(1, 2, 0)
 	crop_vision_frame = crop_vision_frame.clip(0, 1) * 255
 	crop_vision_frame = crop_vision_frame.astype(numpy.uint8)
 
+	if model_type == 'edtalk':
+		crop_vision_frame = crop_vision_frame[:, :, ::-1]
+		crop_vision_frame = cv2.resize(crop_vision_frame, (512, 512), interpolation = cv2.INTER_CUBIC)
+
 	return crop_vision_frame
 
 
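Worked example of resize_bounding_box with the 4 / 3 ratio passed by the wav2lip branch: the chin edge is extended by 8 pixels (clamped to the bottom of the 512x512 crop), then the top edge is moved so the box height becomes 4/3 of its width. Standalone, with the function body inlined:

import numpy

bounding_box = numpy.array([ 100.0, 200.0, 300.0, 350.0 ]) # x1, y1, x2, y2
bounding_box[3] = min(bounding_box[3] + 8, 511) # y2 -> 358
x1, y1, x2, y2 = bounding_box
bounding_box[1] = max(y2 - 4 / 3 * (x2 - x1), 0) # y1 -> 358 - 266.67 = 91.33
assert numpy.isclose(bounding_box[1], 91.3333, atol = 1e-3)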
@@ -13,7 +13,7 @@ FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.
 FaceSwapperModel = Literal['blendswap_256', 'ghost_1_256', 'ghost_2_256', 'ghost_3_256', 'hififace_unofficial_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_unofficial_512', 'uniface_256']
 FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4']
-LipSyncerModel = Literal['wav2lip_96', 'wav2lip_gan_96']
+LipSyncerModel = Literal['edtalk_256', 'wav2lip_96', 'wav2lip_gan_96']
 
 FaceSwapperSet : TypeAlias = Dict[FaceSwapperModel, List[str]]
 
@@ -147,6 +147,7 @@ ProcessorStateSet : TypeAlias = Dict[AppContext, ProcessorState]
 AgeModifierDirection : TypeAlias = NDArray[Any]
 DeepSwapperMorph : TypeAlias = NDArray[Any]
 FaceEnhancerWeight : TypeAlias = NDArray[Any]
+LipSyncerWeight : TypeAlias = NDArray[Any]
 LivePortraitPitch : TypeAlias = float
 LivePortraitYaw : TypeAlias = float
 LivePortraitRoll : TypeAlias = float
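LipSyncerWeight is a plain NDArray alias like its neighbours; the sync_lip change above builds it from the user weight, e.g. for a hypothetical weight of 0.8:

import numpy

lip_syncer_weight = numpy.array([ 0.8 ]).astype(numpy.float32) * 1.25
print(lip_syncer_weight.shape, lip_syncer_weight.dtype) # (1,) float32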