stage 1 OCD

Author: harisreedhar
Date: 2025-06-12 20:07:41 +05:30
Commit: 7905cfe6a3
Parent: 36cad4d1b7

3 changed files with 49 additions and 59 deletions

facefusion/processors/choices.py

@@ -195,7 +195,7 @@ face_swapper_models : List[FaceSwapperModel] = list(face_swapper_set.keys())
 frame_colorizer_models : List[FrameColorizerModel] = [ 'ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable' ]
 frame_colorizer_sizes : List[str] = [ '192x192', '256x256', '384x384', '512x512' ]
 frame_enhancer_models : List[FrameEnhancerModel] = [ 'clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4' ]
-lip_syncer_models : List[LipSyncerModel] = [ 'wav2lip_96', 'wav2lip_gan_96', 'edtalk_256' ]
+lip_syncer_models : List[LipSyncerModel] = [ 'edtalk_256', 'wav2lip_96', 'wav2lip_gan_96' ]
 age_modifier_direction_range : Sequence[int] = create_int_range(-100, 100, 1)
 deep_swapper_morph_range : Sequence[int] = create_int_range(0, 100, 1)
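
Note: the only change in this file is ordering; 'edtalk_256' moves ahead of the wav2lip entries so the list reads alphabetically, matching the other model lists. A hypothetical sanity check (not part of the commit) makes the new invariant explicit:

lip_syncer_models = [ 'edtalk_256', 'wav2lip_96', 'wav2lip_gan_96' ]
assert lip_syncer_models == sorted(lip_syncer_models)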

facefusion/processors/modules/lip_syncer.py

@@ -20,7 +20,7 @@ from facefusion.face_selector import find_similar_faces, sort_and_filter_faces
 from facefusion.face_store import get_reference_faces
 from facefusion.filesystem import filter_audio_paths, has_audio, in_directory, is_image, is_video, resolve_relative_path, same_file_extension
 from facefusion.processors import choices as processors_choices
-from facefusion.processors.types import LipSyncerInputs
+from facefusion.processors.types import LipSyncerInputs, LipSyncerWeight
 from facefusion.program_helper import find_argument_group
 from facefusion.thread_helper import conditional_thread_semaphore
 from facefusion.types import ApplyStateItem, Args, AudioFrame, BoundingBox, DownloadScope, Face, InferencePool, ModelOptions, ModelSet, ProcessMode, QueuePayload, UpdateProgress, VisionFrame
@@ -31,6 +31,27 @@ from facefusion.vision import read_image, read_static_image, restrict_video_fps,
 def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 	return\
 	{
+		'edtalk_256':
+		{
+			'hashes':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
+				}
+			},
+			'sources':
+			{
+				'lip_syncer':
+				{
+					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
+					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
+				}
+			},
+			'type': 'edtalk',
+			'size': (256, 256)
+		},
 		'wav2lip_96':
 		{
 			'hashes':
@@ -72,27 +93,6 @@ def create_static_model_set(download_scope : DownloadScope) -> ModelSet:
 			},
 			'type': 'wav2lip',
 			'size': (96, 96)
-		},
-		'edtalk_256':
-		{
-			'hashes':
-			{
-				'lip_syncer':
-				{
-					'url': resolve_download_url('models-3.3.0', 'edtalk_256.hash'),
-					'path': resolve_relative_path('../.assets/models/edtalk_256.hash')
-				}
-			},
-			'sources':
-			{
-				'lip_syncer':
-				{
-					'url': resolve_download_url('models-3.3.0', 'edtalk_256.onnx'),
-					'path': resolve_relative_path('../.assets/models/edtalk_256.onnx')
-				}
-			},
-			'type': 'edtalk',
-			'size': (256, 256)
 		}
 	}
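
Note: the 'edtalk_256' entry is unchanged by these two hunks; it only moves from the bottom of the model set to the top, keeping the keys alphabetical. A minimal sketch of how such an entry is read back, assuming 'full' is a valid DownloadScope value:

model_options = create_static_model_set('full').get('edtalk_256')
assert model_options.get('type') == 'edtalk'
assert model_options.get('size') == (256, 256)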
@@ -168,49 +168,38 @@ def post_process() -> None:
 def sync_lip(target_face : Face, temp_audio_frame : AudioFrame, temp_vision_frame : VisionFrame) -> VisionFrame:
 	model_name = state_manager.get_item('lip_syncer_model')
+	model_size = get_model_options().get('size')
 	temp_audio_frame = prepare_audio_frame(temp_audio_frame)
 	crop_vision_frame, affine_matrix = warp_face_by_face_landmark_5(temp_vision_frame, target_face.landmark_set.get('5/68'), 'ffhq_512', (512, 512))
-	box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
-	crop_masks =\
-	[
-		box_mask
-	]
+	crop_masks = []
+
 	if 'occlusion' in state_manager.get_item('face_mask_types'):
 		occlusion_mask = create_occlusion_mask(crop_vision_frame)
 		crop_masks.append(occlusion_mask)
+
+	if model_name == 'edtalk_256':
+		lip_syncer_weight = numpy.array([ state_manager.get_item('lip_syncer_weight') ]).astype(numpy.float32) * 1.25
+		box_mask = create_static_box_mask(crop_vision_frame.shape[:2][::-1], state_manager.get_item('face_mask_blur'), state_manager.get_item('face_mask_padding'))
+		crop_masks.append(box_mask)
+		crop_vision_frame = prepare_crop_frame(crop_vision_frame)
+		crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
+		crop_vision_frame = normalize_crop_frame(crop_vision_frame)
+
 	if model_name.startswith('wav2lip'):
 		face_landmark_68 = cv2.transform(target_face.landmark_set.get('68').reshape(1, -1, 2), affine_matrix).reshape(-1, 2)
 		area_mask = create_area_mask(face_landmark_68, [ 'lower-face' ])
 		crop_masks.append(area_mask)
 		bounding_box = create_bounding_box(face_landmark_68)
-		bounding_box = prepare_bounding_box(bounding_box)
-		crop_vision_frame = process_wav2lip(crop_vision_frame, temp_audio_frame, bounding_box)
-	elif model_name == 'edtalk_256':
-		crop_vision_frame = process_edtalk(crop_vision_frame, temp_audio_frame)
-	crop_mask = numpy.minimum.reduce(crop_masks)
-	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
-	return paste_vision_frame
-
-
-def process_wav2lip(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame, bounding_box : BoundingBox) -> VisionFrame:
-	model_size = get_model_options().get('size')
+		bounding_box = resize_bounding_box(bounding_box, 4 / 3)
 		close_vision_frame, close_matrix = warp_face_by_bounding_box(crop_vision_frame, bounding_box, model_size)
 		close_vision_frame = prepare_crop_frame(close_vision_frame)
 		close_vision_frame = forward_wav2lip(temp_audio_frame, close_vision_frame)
 		close_vision_frame = normalize_crop_frame(close_vision_frame)
 		crop_vision_frame = cv2.warpAffine(close_vision_frame, cv2.invertAffineTransform(close_matrix), (512, 512), borderMode = cv2.BORDER_REPLICATE)
-	return crop_vision_frame
-
-
-def process_edtalk(crop_vision_frame : VisionFrame, temp_audio_frame : AudioFrame) -> VisionFrame:
-	lip_syncer_weight = state_manager.get_item('lip_syncer_weight') * 1.25
-	crop_vision_frame = prepare_crop_frame(crop_vision_frame)
-	crop_vision_frame = forward_edtalk(temp_audio_frame, crop_vision_frame, lip_syncer_weight)
-	crop_vision_frame = normalize_crop_frame(crop_vision_frame)
-	return crop_vision_frame
+
+	crop_mask = numpy.minimum.reduce(crop_masks)
+	paste_vision_frame = paste_back(temp_vision_frame, crop_vision_frame, crop_mask, affine_matrix)
+	return paste_vision_frame
 
 
 def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFrame) -> VisionFrame:
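
Note: this hunk folds process_wav2lip() and process_edtalk() back into sync_lip(), so both model paths now share a single mask-combination and paste-back tail, and the static box mask is only added on the edtalk path. A toy illustration (not repo code) of how numpy.minimum.reduce merges the collected masks; a pixel survives only if every mask keeps it:

import numpy

box_mask = numpy.array([ 1.0, 1.0, 0.5 ], dtype = numpy.float32)
occlusion_mask = numpy.array([ 1.0, 0.0, 1.0 ], dtype = numpy.float32)
crop_mask = numpy.minimum.reduce([ box_mask, occlusion_mask ])
print(crop_mask) # [1.  0.  0.5]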
@@ -226,7 +215,7 @@ def forward_wav2lip(temp_audio_frame : AudioFrame, close_vision_frame : VisionFr
 	return close_vision_frame
 
 
-def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : float) -> VisionFrame:
+def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFrame, lip_syncer_weight : LipSyncerWeight) -> VisionFrame:
 	lip_syncer = get_inference_pool().get('lip_syncer')
 
 	with conditional_thread_semaphore():
@@ -234,7 +223,7 @@ def forward_edtalk(temp_audio_frame : AudioFrame, crop_vision_frame : VisionFram
 		{
 			'source': temp_audio_frame,
 			'target': crop_vision_frame,
-			'weight': [ numpy.float32(lip_syncer_weight) ]
+			'weight': lip_syncer_weight
 		})[0]
 
 	return crop_vision_frame
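
Note: forward_edtalk() no longer wraps the weight at call time; sync_lip() now builds it once as a float32 array, which is the form ONNX Runtime expects for a tensor input. A short sketch with an assumed weight value of 0.8:

import numpy

# Illustration only: the weight is prepared once as a float32 array.
lip_syncer_weight = numpy.array([ 0.8 ]).astype(numpy.float32) * 1.25
print(lip_syncer_weight.shape, lip_syncer_weight.dtype) # (1,) float32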
@@ -253,24 +242,24 @@ def prepare_crop_frame(crop_vision_frame : VisionFrame) -> VisionFrame:
model_type = get_model_options().get('type') model_type = get_model_options().get('type')
model_size = get_model_options().get('size') model_size = get_model_options().get('size')
if model_type == 'edtalk':
crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
if model_type == 'wav2lip': if model_type == 'wav2lip':
crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0) crop_vision_frame = numpy.expand_dims(crop_vision_frame, axis = 0)
prepare_vision_frame = crop_vision_frame.copy() prepare_vision_frame = crop_vision_frame.copy()
prepare_vision_frame[:, model_size[0] // 2:] = 0 prepare_vision_frame[:, model_size[0] // 2:] = 0
crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3) crop_vision_frame = numpy.concatenate((prepare_vision_frame, crop_vision_frame), axis = 3)
crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0 crop_vision_frame = crop_vision_frame.transpose(0, 3, 1, 2).astype('float32') / 255.0
elif model_type == 'edtalk':
crop_vision_frame = cv2.resize(crop_vision_frame, (256, 256), interpolation = cv2.INTER_AREA)
crop_vision_frame = crop_vision_frame[:, :, ::-1] / 255.0
crop_vision_frame = numpy.expand_dims(crop_vision_frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
return crop_vision_frame return crop_vision_frame
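
Note: the edtalk branch moves ahead of the wav2lip branch (and becomes a plain if), consistent with the ordering theme of the commit. An illustration-only walk-through of the shapes the edtalk preprocessing produces, using a stand-in frame:

import cv2
import numpy

frame = numpy.zeros((512, 512, 3), dtype = numpy.uint8)               # stand-in BGR crop
frame = cv2.resize(frame, (256, 256), interpolation = cv2.INTER_AREA) # (256, 256, 3)
frame = frame[:, :, ::-1] / 255.0                                     # BGR to RGB, scaled to [0, 1]
frame = numpy.expand_dims(frame.transpose(2, 0, 1), axis = 0).astype(numpy.float32)
print(frame.shape) # (1, 3, 256, 256), NCHW with a batch of one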
-def prepare_bounding_box(bounding_box : BoundingBox) -> BoundingBox:
+def resize_bounding_box(bounding_box : BoundingBox, aspect_ratio : float) -> BoundingBox:
 	bounding_box[3] += min(8, 511)
 	x1, y1, x2, y2 = bounding_box
-	y1 = y2 - (4 / 3) * (x2 - x1)
+	y1 = y2 - aspect_ratio * (x2 - x1)
 	bounding_box[1] = max(y1, 0)
 	return bounding_box
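
Note: the rename from prepare_bounding_box() to resize_bounding_box() lifts the hard-coded 4 / 3 into an aspect_ratio parameter; sync_lip() passes 4 / 3, so behaviour is unchanged. A worked example with an assumed box, using a float array for clarity:

import numpy

bounding_box = numpy.array([ 100.0, 150.0, 300.0, 400.0 ]) # x1, y1, x2, y2
bounding_box[3] += min(8, 511)                              # y2 becomes 408
y1 = bounding_box[3] - (4 / 3) * (bounding_box[2] - bounding_box[0])
bounding_box[1] = max(y1, 0)                                # y1 becomes about 141.33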

facefusion/processors/types.py

@@ -13,7 +13,7 @@ FaceEnhancerModel = Literal['codeformer', 'gfpgan_1.2', 'gfpgan_1.3', 'gfpgan_1.
 FaceSwapperModel = Literal['blendswap_256', 'ghost_1_256', 'ghost_2_256', 'ghost_3_256', 'hififace_unofficial_256', 'inswapper_128', 'inswapper_128_fp16', 'simswap_256', 'simswap_unofficial_512', 'uniface_256']
 FrameColorizerModel = Literal['ddcolor', 'ddcolor_artistic', 'deoldify', 'deoldify_artistic', 'deoldify_stable']
 FrameEnhancerModel = Literal['clear_reality_x4', 'lsdir_x4', 'nomos8k_sc_x4', 'real_esrgan_x2', 'real_esrgan_x2_fp16', 'real_esrgan_x4', 'real_esrgan_x4_fp16', 'real_esrgan_x8', 'real_esrgan_x8_fp16', 'real_hatgan_x4', 'real_web_photo_x4', 'realistic_rescaler_x4', 'remacri_x4', 'siax_x4', 'span_kendata_x4', 'swin2_sr_x4', 'ultra_sharp_x4', 'ultra_sharp_2_x4']
-LipSyncerModel = Literal['wav2lip_96', 'wav2lip_gan_96', 'edtalk_256']
+LipSyncerModel = Literal['edtalk_256', 'wav2lip_96', 'wav2lip_gan_96']
 
 FaceSwapperSet : TypeAlias = Dict[FaceSwapperModel, List[str]]
@@ -147,6 +147,7 @@ ProcessorStateSet : TypeAlias = Dict[AppContext, ProcessorState]
 AgeModifierDirection : TypeAlias = NDArray[Any]
 DeepSwapperMorph : TypeAlias = NDArray[Any]
 FaceEnhancerWeight : TypeAlias = NDArray[Any]
+LipSyncerWeight : TypeAlias = NDArray[Any]
 LivePortraitPitch : TypeAlias = float
 LivePortraitYaw : TypeAlias = float
 LivePortraitRoll : TypeAlias = float
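
Note: LipSyncerWeight slots in alphabetically among the other NDArray-backed aliases and documents that the lip syncer weight travels as a numpy array rather than a plain float. A self-contained sketch (illustration only, Python 3.10+) of how the alias reads in a signature:

from typing import Any, TypeAlias

import numpy
from numpy.typing import NDArray

LipSyncerWeight : TypeAlias = NDArray[Any]

def scale_weight(weight : LipSyncerWeight) -> LipSyncerWeight:
	# Same 1.25 scaling as sync_lip() applies to the user-set weight.
	return weight * 1.25

print(scale_weight(numpy.array([ 0.8 ], dtype = numpy.float32))) # [1.]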