From d260c28cf35f194cb5618e514e2060198456ac58 Mon Sep 17 00:00:00 2001
From: Henry Ruhs
Date: Sun, 26 Jan 2025 22:54:07 +0100
Subject: [PATCH] Feat/available encoders (#860)

* Introduce available audio encoders and video encoders

* Introduce available audio encoders and video encoders

* Introduce available audio encoders and video encoders

* Introduce available audio encoders and video encoders

* Add flac to audio encoders
---
 facefusion/choices.py                       | 11 +++++---
 facefusion/ffmpeg.py                        | 29 ++++++++++++++++++++-
 facefusion/ffmpeg_builder.py                |  4 +++
 facefusion/installer.py                     |  1 -
 facefusion/program.py                       |  8 +++---
 facefusion/typing.py                        |  7 ++++-
 facefusion/uis/components/output_options.py |  6 +++--
 tests/test_ffmpeg.py                        |  9 ++++++-
 tests/test_ffmpeg_builder.py                |  3 +++
 9 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/facefusion/choices.py b/facefusion/choices.py
index 4f1cddd..652dcb2 100755
--- a/facefusion/choices.py
+++ b/facefusion/choices.py
@@ -2,7 +2,7 @@ import logging
 from typing import List, Sequence
 
 from facefusion.common_helper import create_float_range, create_int_range
-from facefusion.typing import Angle, AudioEncoder, AudioFormat, AudioTypeSet, DownloadProvider, DownloadProviderSet, DownloadScope, ExecutionProvider, ExecutionProviderSet, FaceDetectorModel, FaceDetectorSet, FaceLandmarkerModel, FaceMaskRegion, FaceMaskRegionSet, FaceMaskType, FaceOccluderModel, FaceParserModel, FaceSelectorMode, FaceSelectorOrder, Gender, ImageFormat, ImageTypeSet, JobStatus, LogLevel, LogLevelSet, Race, Score, UiWorkflow, VideoEncoder, VideoFormat, VideoMemoryStrategy, VideoPreset, VideoTypeSet, WebcamMode
+from facefusion.typing import Angle, AudioEncoder, AudioFormat, AudioTypeSet, DownloadProvider, DownloadProviderSet, DownloadScope, EncoderSet, ExecutionProvider, ExecutionProviderSet, FaceDetectorModel, FaceDetectorSet, FaceLandmarkerModel, FaceMaskRegion, FaceMaskRegionSet, FaceMaskType, FaceOccluderModel, FaceParserModel, FaceSelectorMode, FaceSelectorOrder, Gender, ImageFormat, ImageTypeSet, JobStatus, LogLevel, LogLevelSet, Race, Score, UiWorkflow, VideoEncoder, VideoFormat, VideoMemoryStrategy, VideoPreset, VideoTypeSet, WebcamMode
 
 face_detector_set : FaceDetectorSet =\
 {
@@ -62,8 +62,13 @@ image_formats : List[ImageFormat] = list(image_type_set.keys())
 video_formats : List[VideoFormat] = list(video_type_set.keys())
 temp_frame_formats : List[ImageFormat] = [ 'bmp', 'jpeg', 'png', 'tiff' ]
 
-output_audio_encoders : List[AudioEncoder] = [ 'aac', 'libmp3lame', 'libopus', 'libvorbis' ]
-output_video_encoders : List[VideoEncoder] = [ 'libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf', 'h264_qsv', 'hevc_qsv', 'h264_videotoolbox', 'hevc_videotoolbox' ]
+output_encoder_set : EncoderSet =\
+{
+	'audio': [ 'aac', 'libmp3lame', 'libopus', 'libvorbis', 'flac' ],
+	'video': [ 'libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf', 'h264_qsv', 'hevc_qsv', 'h264_videotoolbox', 'hevc_videotoolbox' ]
+}
+output_audio_encoders : List[AudioEncoder] = output_encoder_set.get('audio')
+output_video_encoders : List[VideoEncoder] = output_encoder_set.get('video')
 output_video_presets : List[VideoPreset] = [ 'ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow' ]
 
 image_template_sizes : List[float] = [ 0.25, 0.5, 0.75, 1, 1.5, 2, 2.5, 3, 3.5, 4 ]
diff --git a/facefusion/ffmpeg.py b/facefusion/ffmpeg.py
index 0463534..b1f743b 100644
--- a/facefusion/ffmpeg.py
+++ b/facefusion/ffmpeg.py
@@ -5,10 +5,11 @@ from typing import List, Optional
 
 from tqdm import tqdm
 
+import facefusion.choices
 from facefusion import ffmpeg_builder, logger, process_manager, state_manager, wording
 from facefusion.filesystem import get_file_format, remove_file
 from facefusion.temp_helper import get_temp_file_path, get_temp_frames_pattern, resolve_temp_frame_paths
-from facefusion.typing import AudioBuffer, Commands, Fps, UpdateProgress
+from facefusion.typing import AudioBuffer, Commands, EncoderSet, Fps, UpdateProgress
 from facefusion.vision import count_trim_frame_total, detect_video_duration, restrict_video_fps
 
 
@@ -72,6 +73,32 @@ def log_debug(process : subprocess.Popen[bytes]) -> None:
 			logger.debug(error.strip(), __name__)
 
 
+def get_available_encoder_set() -> EncoderSet:
+	available_encoder_set : EncoderSet =\
+	{
+		'audio': [],
+		'video': []
+	}
+	commands = ffmpeg_builder.chain(
+		ffmpeg_builder.get_encoders()
+	)
+	process = run_ffmpeg(commands)
+
+	while line := process.stdout.readline().decode().lower():
+		if line.startswith(' a'):
+			audio_encoder = line.split()[1]
+
+			if audio_encoder in facefusion.choices.output_audio_encoders:
+				available_encoder_set['audio'].append(audio_encoder) #type:ignore[arg-type]
+		if line.startswith(' v'):
+			video_encoder = line.split()[1]
+
+			if video_encoder in facefusion.choices.output_video_encoders:
+				available_encoder_set['video'].append(video_encoder) #type:ignore[arg-type]
+
+	return available_encoder_set
+
+
 def extract_frames(target_path : str, temp_video_resolution : str, temp_video_fps : Fps, trim_frame_start : int, trim_frame_end : int) -> bool:
 	extract_frame_total = count_trim_frame_total(target_path, trim_frame_start, trim_frame_end)
 	temp_frames_pattern = get_temp_frames_pattern(target_path, '%08d')
diff --git a/facefusion/ffmpeg_builder.py b/facefusion/ffmpeg_builder.py
index 58a95f6..b2ac2f2 100644
--- a/facefusion/ffmpeg_builder.py
+++ b/facefusion/ffmpeg_builder.py
@@ -16,6 +16,10 @@ def chain(*commands : Commands) -> Commands:
 	return list(itertools.chain(*commands))
 
 
+def get_encoders() -> Commands:
+	return [ '-encoders' ]
+
+
 def set_progress() -> Commands:
 	return [ '-progress' ]
 
diff --git a/facefusion/installer.py b/facefusion/installer.py
index d4a4a1f..ee84b57 100644
--- a/facefusion/installer.py
+++ b/facefusion/installer.py
@@ -8,7 +8,6 @@ from argparse import ArgumentParser, HelpFormatter
 from facefusion import metadata, wording
 from facefusion.common_helper import is_linux, is_windows
 
-
 ONNXRUNTIME_SET =\
 {
 	'default': ('onnxruntime', '1.20.1')
diff --git a/facefusion/program.py b/facefusion/program.py
index ea0d832..496647f 100755
--- a/facefusion/program.py
+++ b/facefusion/program.py
@@ -3,8 +3,9 @@ from argparse import ArgumentParser, HelpFormatter
 
 import facefusion.choices
 from facefusion import config, metadata, state_manager, wording
-from facefusion.common_helper import create_float_metavar, create_int_metavar, get_last
+from facefusion.common_helper import create_float_metavar, create_int_metavar, get_first, get_last
 from facefusion.execution import get_available_execution_providers, suggest_execution_provider
+from facefusion.ffmpeg import get_available_encoder_set
 from facefusion.filesystem import get_file_name, resolve_file_paths
 from facefusion.jobs import job_store
 from facefusion.processors.core import get_processors_modules
@@ -155,13 +156,14 @@ def create_frame_extraction_program() -> ArgumentParser:
 
 def create_output_creation_program() -> ArgumentParser:
 	program = ArgumentParser(add_help = False)
+	available_encoder_set = get_available_encoder_set()
 	group_output_creation = program.add_argument_group('output creation')
 	group_output_creation.add_argument('--output-image-quality', help = wording.get('help.output_image_quality'), type = int, default = config.get_int_value('output_creation.output_image_quality', '80'), choices = facefusion.choices.output_image_quality_range, metavar = create_int_metavar(facefusion.choices.output_image_quality_range))
 	group_output_creation.add_argument('--output-image-resolution', help = wording.get('help.output_image_resolution'), default = config.get_str_value('output_creation.output_image_resolution'))
-	group_output_creation.add_argument('--output-audio-encoder', help = wording.get('help.output_audio_encoder'), default = config.get_str_value('output_creation.output_audio_encoder', 'aac'), choices = facefusion.choices.output_audio_encoders)
+	group_output_creation.add_argument('--output-audio-encoder', help = wording.get('help.output_audio_encoder'), default = config.get_str_value('output_creation.output_audio_encoder', get_first(available_encoder_set.get('audio'))), choices = available_encoder_set.get('audio'))
 	group_output_creation.add_argument('--output-audio-quality', help = wording.get('help.output_audio_quality'), type = int, default = config.get_int_value('output_creation.output_audio_quality', '80'), choices = facefusion.choices.output_audio_quality_range, metavar = create_int_metavar(facefusion.choices.output_audio_quality_range))
 	group_output_creation.add_argument('--output-audio-volume', help = wording.get('help.output_audio_volume'), type = int, default = config.get_int_value('output_creation.output_audio_volume', '100'), choices = facefusion.choices.output_audio_volume_range, metavar = create_int_metavar(facefusion.choices.output_audio_volume_range))
-	group_output_creation.add_argument('--output-video-encoder', help = wording.get('help.output_video_encoder'), default = config.get_str_value('output_creation.output_video_encoder', 'libx264'), choices = facefusion.choices.output_video_encoders)
+	group_output_creation.add_argument('--output-video-encoder', help = wording.get('help.output_video_encoder'), default = config.get_str_value('output_creation.output_video_encoder', get_first(available_encoder_set.get('video'))), choices = available_encoder_set.get('video'))
 	group_output_creation.add_argument('--output-video-preset', help = wording.get('help.output_video_preset'), default = config.get_str_value('output_creation.output_video_preset', 'veryfast'), choices = facefusion.choices.output_video_presets)
 	group_output_creation.add_argument('--output-video-quality', help = wording.get('help.output_video_quality'), type = int, default = config.get_int_value('output_creation.output_video_quality', '80'), choices = facefusion.choices.output_video_quality_range, metavar = create_int_metavar(facefusion.choices.output_video_quality_range))
 	group_output_creation.add_argument('--output-video-resolution', help = wording.get('help.output_video_resolution'), default = config.get_str_value('output_creation.output_video_resolution'))
diff --git a/facefusion/typing.py b/facefusion/typing.py
index 1882d68..10ef6ec 100755
--- a/facefusion/typing.py
+++ b/facefusion/typing.py
@@ -117,8 +117,13 @@ AudioTypeSet = Dict[AudioFormat, str]
 ImageTypeSet = Dict[ImageFormat, str]
 VideoTypeSet = Dict[VideoFormat, str]
 
-AudioEncoder = Literal['aac', 'libmp3lame', 'libopus', 'libvorbis']
+AudioEncoder = Literal['aac', 'libmp3lame', 'libopus', 'libvorbis', 'flac']
 VideoEncoder = Literal['libx264', 'libx265', 'libvpx-vp9', 'h264_nvenc', 'hevc_nvenc', 'h264_amf', 'hevc_amf', 'h264_qsv', 'hevc_qsv', 'h264_videotoolbox', 'hevc_videotoolbox']
+EncoderSet = TypedDict('EncoderSet',
+{
+	'audio' : List[AudioEncoder],
+	'video' : List[VideoEncoder]
+})
 VideoPreset = Literal['ultrafast', 'superfast', 'veryfast', 'faster', 'fast', 'medium', 'slow', 'slower', 'veryslow']
 WebcamMode = Literal['inline', 'udp', 'v4l2']
 
diff --git a/facefusion/uis/components/output_options.py b/facefusion/uis/components/output_options.py
index 69175a2..c65489f 100644
--- a/facefusion/uis/components/output_options.py
+++ b/facefusion/uis/components/output_options.py
@@ -5,6 +5,7 @@ import gradio
 import facefusion.choices
 from facefusion import state_manager, wording
 from facefusion.common_helper import calc_int_step
+from facefusion.ffmpeg import get_available_encoder_set
 from facefusion.filesystem import is_image, is_video
 from facefusion.typing import AudioEncoder, Fps, VideoEncoder, VideoPreset
 from facefusion.uis.core import get_ui_components, register_ui_component
@@ -36,6 +37,7 @@ def render() -> None:
 
 	output_image_resolutions = []
 	output_video_resolutions = []
+	available_encoder_set = get_available_encoder_set()
 	if is_image(state_manager.get_item('target_path')):
 		output_image_resolution = detect_image_resolution(state_manager.get_item('target_path'))
 		output_image_resolutions = create_image_resolutions(output_image_resolution)
@@ -58,7 +60,7 @@ def render() -> None:
 	)
 	OUTPUT_AUDIO_ENCODER_DROPDOWN = gradio.Dropdown(
 		label = wording.get('uis.output_audio_encoder_dropdown'),
-		choices = facefusion.choices.output_audio_encoders,
+		choices = available_encoder_set.get('audio'),
 		value = state_manager.get_item('output_audio_encoder'),
 		visible = is_video(state_manager.get_item('target_path'))
 	)
@@ -80,7 +82,7 @@ def render() -> None:
 	)
 	OUTPUT_VIDEO_ENCODER_DROPDOWN = gradio.Dropdown(
 		label = wording.get('uis.output_video_encoder_dropdown'),
-		choices = facefusion.choices.output_video_encoders,
+		choices = available_encoder_set.get('video'),
 		value = state_manager.get_item('output_video_encoder'),
 		visible = is_video(state_manager.get_item('target_path'))
 	)
diff --git a/tests/test_ffmpeg.py b/tests/test_ffmpeg.py
index 855cfff..054db8a 100644
--- a/tests/test_ffmpeg.py
+++ b/tests/test_ffmpeg.py
@@ -5,7 +5,7 @@ import pytest
 
 from facefusion import process_manager, state_manager
 from facefusion.download import conditional_download
-from facefusion.ffmpeg import concat_video, extract_frames, read_audio_buffer, replace_audio, restore_audio
+from facefusion.ffmpeg import concat_video, extract_frames, get_available_encoder_set, read_audio_buffer, replace_audio, restore_audio
 from facefusion.filesystem import copy_file
 from facefusion.temp_helper import clear_temp_directory, create_temp_directory, get_temp_file_path, resolve_temp_frame_paths
 from .helper import get_test_example_file, get_test_examples_directory, get_test_output_file, prepare_test_output_directory
@@ -38,6 +38,13 @@ def before_each() -> None:
 	prepare_test_output_directory()
 
 
+def test_get_available_encoder_set() -> None:
+	available_encoder_set = get_available_encoder_set()
+
+	assert 'aac' in available_encoder_set.get('audio')
+	assert 'libx264' in available_encoder_set.get('video')
+
+
 def test_extract_frames() -> None:
 	extract_set =\
 	[
diff --git a/tests/test_ffmpeg_builder.py b/tests/test_ffmpeg_builder.py
index 66b1aab..1869a1f 100644
--- a/tests/test_ffmpeg_builder.py
+++ b/tests/test_ffmpeg_builder.py
@@ -42,6 +42,9 @@ def test_set_audio_quality() -> None:
 	assert set_audio_quality('libvorbis', 0) == [ '-q:a', '-1.0' ]
 	assert set_audio_quality('libvorbis', 50) == [ '-q:a', '4.5' ]
 	assert set_audio_quality('libvorbis', 100) == [ '-q:a', '10.0' ]
+	assert set_audio_quality('flac', 0) == []
+	assert set_audio_quality('flac', 50) == []
+	assert set_audio_quality('flac', 100) == []
 
 
 def test_set_video_quality() -> None: