diff --git a/facefusion/processors/modules/lip_syncer.py b/facefusion/processors/modules/lip_syncer.py
index 9fd3f82..315e1db 100755
--- a/facefusion/processors/modules/lip_syncer.py
+++ b/facefusion/processors/modules/lip_syncer.py
@@ -184,35 +184,34 @@ def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_fram
 		crop_vision_frame = prepare_crop_frame(crop_vision_frame)
 		crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
 		crop_vision_frame = normalize_crop_frame(crop_vision_frame)
-
 	if model_name.startswith('wav2lip'):
 		face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
 		area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
 		crop_masks.append(area_mask)
 		bounding_box = create_bounding_box(face_landmark_68)
 		bounding_box = resize_bounding_box(bounding_box, 4 / 3)
-		close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
-		close_vision_frame = prepare_crop_frame(close_vision_frame)
-		close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
-		close_vision_frame = normalize_crop_frame(close_vision_frame)
-		crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
+		temp_vision_frame, temp_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
+		temp_vision_frame = prepare_crop_frame(temp_vision_frame)
+		temp_vision_frame = forward_wav2lip(temp_audio_frame, temp_vision_frame)
+		temp_vision_frame = normalize_crop_frame(temp_vision_frame)
+		crop_vision_frame = cv2.warpAffine(temp_vision_frame, cv2.invertAffineTransform(temp_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
 
 	crop_mask = numpy.minimum.reduce(crop_masks)
 	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
 	return paste_vision_frame
 
 
-def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
+def forward_wav2lip(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
 	with conditional_thread_semaphore():
-		close_vision_frame = lip_syncer.run(None,
+		crop_vision_frame = lip_syncer.run(None,
 		{
 			'source': temp_audio_frame,
-			'target': close_vision_frame
+			'target': crop_vision_frame
 		})[0]
 
-	return close_vision_frame
+	return crop_vision_frame
 
 
 def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : LipSyncerWeight) -> VisionFrame:
@@ -272,7 +271,7 @@ def normalize_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
 	if model_type == 'edtalk':
 		crop_vision_frame = crop_vision_frame[:, :, ::-1]
 
-	crop_vision_frame = cv2.resize(crop_vision_frame, (512, 512), interpolation = cv2.INTER_CUBIC)
+	crop_vision_frame = cv2.resize(crop_vision_frame, (512, 512), interpolation=cv2.INTER_CUBIC)
 	return crop_vision_frame
 
 