Feat/more audio settings (#849)
* Add more audio settings, revamp some ffmpeg commands
This commit is contained in:
@@ -49,6 +49,8 @@ keep_temp =
|
||||
output_image_quality =
|
||||
output_image_resolution =
|
||||
output_audio_encoder =
|
||||
output_audio_quality =
|
||||
output_audio_volume =
|
||||
output_video_encoder =
|
||||
output_video_preset =
|
||||
output_video_quality =
|
||||
|
||||
@@ -92,6 +92,8 @@ def apply_args(args : Args, apply_state_item : ApplyStateItem) -> None:
|
||||
else:
|
||||
apply_state_item('output_image_resolution', pack_resolution(output_image_resolution))
|
||||
apply_state_item('output_audio_encoder', args.get('output_audio_encoder'))
|
||||
apply_state_item('output_audio_quality', args.get('output_audio_quality'))
|
||||
apply_state_item('output_audio_volume', args.get('output_audio_volume'))
|
||||
apply_state_item('output_video_encoder', args.get('output_video_encoder'))
|
||||
apply_state_item('output_video_preset', args.get('output_video_preset'))
|
||||
apply_state_item('output_video_quality', args.get('output_video_quality'))
|
||||
@@ -105,7 +107,6 @@ def apply_args(args : Args, apply_state_item : ApplyStateItem) -> None:
|
||||
if args.get('output_video_fps') or is_video(args.get('target_path')):
|
||||
output_video_fps = normalize_fps(args.get('output_video_fps')) or detect_video_fps(args.get('target_path'))
|
||||
apply_state_item('output_video_fps', output_video_fps)
|
||||
apply_state_item('skip_audio', args.get('skip_audio'))
|
||||
# processors
|
||||
available_processors = [ get_file_name(file_path) for file_path in resolve_file_paths('facefusion/processors/modules') ]
|
||||
apply_state_item('processors', args.get('processors'))
|
||||
|
||||
@@ -127,4 +127,6 @@ face_mask_padding_range : Sequence[int] = create_int_range(0, 100, 1)
|
||||
face_selector_age_range : Sequence[int] = create_int_range(0, 100, 1)
|
||||
reference_face_distance_range : Sequence[float] = create_float_range(0.0, 1.5, 0.05)
|
||||
output_image_quality_range : Sequence[int] = create_int_range(0, 100, 1)
|
||||
output_audio_quality_range : Sequence[int] = create_int_range(0, 100, 1)
|
||||
output_audio_volume_range : Sequence[int] = create_int_range(0, 100, 1)
|
||||
output_video_quality_range : Sequence[int] = create_int_range(0, 100, 1)
|
||||
|
||||
@@ -437,7 +437,7 @@ def process_video(start_time : float) -> ErrorCode:
|
||||
process_manager.end()
|
||||
return 1
|
||||
# handle audio
|
||||
if state_manager.get_item('skip_audio'):
|
||||
if state_manager.get_item('output_audio_volume') == 0:
|
||||
logger.info(wording.get('skipping_audio'), __name__)
|
||||
move_temp_file(state_manager.get_item('target_path'), state_manager.get_item('output_path'))
|
||||
else:
|
||||
|
||||
@@ -94,6 +94,87 @@ def extract_frames(target_path : str, temp_video_resolution : str, temp_video_fp
|
||||
return process.returncode == 0
|
||||
|
||||
|
||||
def copy_image(target_path : str, temp_image_resolution : str) -> bool:
    """
    Copy the target image into its temporary location at near-lossless quality.

    :param target_path: path of the source image
    :param temp_image_resolution: resolution string passed to ffmpeg via -s
    :return: True when ffmpeg exits successfully
    """
    destination_path = get_temp_file_path(target_path)
    # webp uses a 0-100 -q:v scale where higher is better, while the default
    # (mjpeg-style) scale is 2-31 where lower is better — pick the best of each
    is_webp = get_file_format(target_path) == 'webp'
    compression = 100 if is_webp else 1
    return run_ffmpeg([ '-i', target_path, '-s', str(temp_image_resolution), '-q:v', str(compression), '-y', destination_path ]).returncode == 0
|
||||
|
||||
|
||||
def finalize_image(target_path : str, output_path : str, output_image_resolution : str) -> bool:
    """
    Render the processed temporary image to the output path using the configured quality.

    :param target_path: path of the original target image, used to locate the temp file and detect the format
    :param output_path: path the final image is written to
    :param output_image_resolution: resolution string passed to ffmpeg via -s
    :return: True when ffmpeg exits successfully
    """
    quality = state_manager.get_item('output_image_quality')
    temp_file_path = get_temp_file_path(target_path)
    # webp accepts the 0-100 quality directly; other formats use the inverted 0-31 q:v scale
    is_webp = get_file_format(target_path) == 'webp'
    compression = quality if is_webp else round(31 - (quality * 0.31))
    return run_ffmpeg([ '-i', temp_file_path, '-s', str(output_image_resolution), '-q:v', str(compression), '-y', output_path ]).returncode == 0
|
||||
|
||||
|
||||
def read_audio_buffer(target_path : str, sample_rate : int, channel_total : int) -> Optional[AudioBuffer]:
    """
    Decode the audio track of a media file into raw signed 16-bit little-endian PCM.

    :param target_path: path of the media file to decode
    :param sample_rate: sample rate to resample the audio to
    :param channel_total: number of audio channels to produce
    :return: the raw PCM bytes, or None when ffmpeg fails
    """
    ffmpeg_arguments = [ '-i', target_path, '-vn', '-f', 's16le', '-acodec', 'pcm_s16le', '-ar', str(sample_rate), '-ac', str(channel_total), '-' ]
    ffmpeg_process = open_ffmpeg(ffmpeg_arguments)
    raw_pcm, _ = ffmpeg_process.communicate()
    return raw_pcm if ffmpeg_process.returncode == 0 else None
|
||||
|
||||
|
||||
def restore_audio(target_path : str, output_path : str, output_video_fps : Fps, trim_frame_start : int, trim_frame_end : int) -> bool:
    """
    Mux the (optionally trimmed) audio of the original target video onto the
    processed temporary video and write the combined result to the output path.

    :param target_path: path of the original target video that provides the audio
    :param output_path: path the final video is written to
    :param output_video_fps: frame rate used to convert trim frame indices into seconds
    :param trim_frame_start: first frame of audio to keep, or a non-int to keep from the start
    :param trim_frame_end: last frame of audio to keep, or a non-int to keep until the end
    :return: True when ffmpeg exits successfully
    """
    output_audio_encoder = state_manager.get_item('output_audio_encoder')
    output_audio_quality = state_manager.get_item('output_audio_quality')
    output_audio_volume = state_manager.get_item('output_audio_volume')
    temp_file_path = get_temp_file_path(target_path)
    temp_video_duration = detect_video_duration(temp_file_path)
    commands = [ '-i', temp_file_path ]

    if isinstance(trim_frame_start, int):
        commands.extend([ '-ss', str(trim_frame_start / output_video_fps) ])
    if isinstance(trim_frame_end, int):
        commands.extend([ '-to', str(trim_frame_end / output_video_fps) ])
    commands.extend([ '-i', target_path, '-c:v', 'copy', '-c:a', output_audio_encoder ])
    # translate the 0-100 quality setting onto the encoder specific -q:a scale
    # (the branches were mutually exclusive before, so an elif chain is equivalent)
    if output_audio_encoder == 'aac':
        commands.extend([ '-q:a', str(round(10 - (output_audio_quality * 0.9))) ])
    elif output_audio_encoder == 'libmp3lame':
        commands.extend([ '-q:a', str(round(9 - (output_audio_quality * 0.9))) ])
    elif output_audio_encoder in [ 'libopus', 'libvorbis' ]:
        commands.extend([ '-q:a', str(round((100 - output_audio_quality) / 10)) ])
    # use a dedicated name instead of shadowing output_audio_volume with the 0.0-1.0 factor
    audio_volume_factor = output_audio_volume / 100
    commands.extend([ '-filter:a', 'volume=' + str(audio_volume_factor), '-map', '0:v:0', '-map', '1:a:0', '-t', str(temp_video_duration), '-y', output_path ])
    return run_ffmpeg(commands).returncode == 0
|
||||
|
||||
|
||||
def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
    """
    Mux an external audio file onto the processed temporary video and write the
    combined result to the output path, truncated to the video duration.

    :param target_path: path of the original target video, used to locate the temp video
    :param audio_path: path of the audio file to mux in
    :param output_path: path the final video is written to
    :return: True when ffmpeg exits successfully
    """
    output_audio_encoder = state_manager.get_item('output_audio_encoder')
    output_audio_quality = state_manager.get_item('output_audio_quality')
    output_audio_volume = state_manager.get_item('output_audio_volume')
    temp_file_path = get_temp_file_path(target_path)
    temp_video_duration = detect_video_duration(temp_file_path)
    commands = [ '-i', temp_file_path, '-i', audio_path, '-c:v', 'copy', '-c:a', output_audio_encoder ]

    # translate the 0-100 quality setting onto the encoder specific -q:a scale
    # (the branches were mutually exclusive before, so an elif chain is equivalent)
    if output_audio_encoder == 'aac':
        commands.extend([ '-q:a', str(round(10 - (output_audio_quality * 0.9))) ])
    elif output_audio_encoder == 'libmp3lame':
        commands.extend([ '-q:a', str(round(9 - (output_audio_quality * 0.9))) ])
    elif output_audio_encoder in [ 'libopus', 'libvorbis' ]:
        commands.extend([ '-q:a', str(round((100 - output_audio_quality) / 10)) ])
    # use a dedicated name instead of shadowing output_audio_volume with the 0.0-1.0 factor
    audio_volume_factor = output_audio_volume / 100
    commands.extend([ '-filter:a', 'volume=' + str(audio_volume_factor), '-t', str(temp_video_duration), '-y', output_path ])
    return run_ffmpeg(commands).returncode == 0
|
||||
|
||||
|
||||
def merge_video(target_path : str, output_video_resolution : str, output_video_fps: Fps) -> bool:
|
||||
output_video_encoder = state_manager.get_item('output_video_encoder')
|
||||
output_video_quality = state_manager.get_item('output_video_quality')
|
||||
@@ -128,7 +209,6 @@ def merge_video(target_path : str, output_video_resolution : str, output_video_f
|
||||
|
||||
|
||||
def concat_video(output_path : str, temp_output_paths : List[str]) -> bool:
|
||||
output_audio_encoder = state_manager.get_item('output_audio_encoder')
|
||||
concat_video_path = tempfile.mktemp()
|
||||
|
||||
with open(concat_video_path, 'w') as concat_video_file:
|
||||
@@ -136,67 +216,13 @@ def concat_video(output_path : str, temp_output_paths : List[str]) -> bool:
|
||||
concat_video_file.write('file \'' + os.path.abspath(temp_output_path) + '\'' + os.linesep)
|
||||
concat_video_file.flush()
|
||||
concat_video_file.close()
|
||||
commands = [ '-f', 'concat', '-safe', '0', '-i', concat_video_file.name, '-c:v', 'copy', '-c:a', output_audio_encoder, '-y', os.path.abspath(output_path) ]
|
||||
commands = [ '-f', 'concat', '-safe', '0', '-i', concat_video_file.name, '-c:v', 'copy', '-c:a', 'copy', '-y', os.path.abspath(output_path) ]
|
||||
process = run_ffmpeg(commands)
|
||||
process.communicate()
|
||||
remove_file(concat_video_path)
|
||||
return process.returncode == 0
|
||||
|
||||
|
||||
def copy_image(target_path : str, temp_image_resolution : str) -> bool:
    """
    Copy the target image into its temporary location at maximum quality.

    :param target_path: path of the source image
    :param temp_image_resolution: resolution string passed to ffmpeg via -s
    :return: True when ffmpeg exits successfully
    """
    destination_path = get_temp_file_path(target_path)
    # quality 100 keeps the temporary copy effectively lossless
    compression = calc_image_compression(target_path, 100)
    return run_ffmpeg([ '-i', target_path, '-s', str(temp_image_resolution), '-q:v', str(compression), '-y', destination_path ]).returncode == 0
|
||||
|
||||
|
||||
def finalize_image(target_path : str, output_path : str, output_image_resolution : str) -> bool:
    """
    Render the processed temporary image to the output path using the configured quality.

    :param target_path: path of the original target image, used to locate the temp file
    :param output_path: path the final image is written to
    :param output_image_resolution: resolution string passed to ffmpeg via -s
    :return: True when ffmpeg exits successfully
    """
    quality = state_manager.get_item('output_image_quality')
    source_path = get_temp_file_path(target_path)
    compression = calc_image_compression(target_path, quality)
    return run_ffmpeg([ '-i', source_path, '-s', str(output_image_resolution), '-q:v', str(compression), '-y', output_path ]).returncode == 0
|
||||
|
||||
|
||||
def calc_image_compression(image_path : str, image_quality : int) -> int:
    """
    Translate a 0-100 image quality into the ffmpeg -q:v compression factor.

    :param image_path: path of the image, used to detect the file format
    :param image_quality: quality between 0 and 100, higher means better
    :return: compression factor on the 0-31 ffmpeg scale
    """
    # BUGFIX: the format check used 'webm', which is a video container and can
    # never match an image here — the inverted-quality branch is meant for webp,
    # whose -q:v scale runs opposite to the mjpeg scale (see the sibling
    # copy_image/finalize_image implementations, which branch on 'webp')
    if get_file_format(image_path) == 'webp':
        image_quality = 100 - image_quality
    return round(31 - (image_quality * 0.31))
|
||||
|
||||
|
||||
def read_audio_buffer(target_path : str, sample_rate : int, channel_total : int) -> Optional[AudioBuffer]:
    """
    Decode the audio track of a media file into raw signed 16-bit little-endian PCM.

    :param target_path: path of the media file to decode
    :param sample_rate: sample rate to resample the audio to
    :param channel_total: number of audio channels to produce
    :return: the raw PCM bytes, or None when ffmpeg fails
    """
    process = open_ffmpeg([ '-i', target_path, '-vn', '-f', 's16le', '-acodec', 'pcm_s16le', '-ar', str(sample_rate), '-ac', str(channel_total), '-' ])
    buffer, _ = process.communicate()
    if process.returncode != 0:
        return None
    return buffer
|
||||
|
||||
|
||||
def restore_audio(target_path : str, output_path : str, output_video_fps : Fps, trim_frame_start : int, trim_frame_end : int) -> bool:
    """
    Copy the processed temporary video and mux the original target audio onto it.

    :param target_path: path of the original target video that provides the audio
    :param output_path: path the combined video is written to
    :param output_video_fps: frame rate used to convert trim frame indices into seconds
    :param trim_frame_start: first frame to keep, or a non-int to keep from the start
    :param trim_frame_end: last frame to keep, or a non-int to keep until the end
    :return: True when ffmpeg exits successfully
    """
    audio_encoder = state_manager.get_item('output_audio_encoder')
    temp_file_path = get_temp_file_path(target_path)
    temp_video_duration = detect_video_duration(temp_file_path)
    commands = [ '-i', temp_file_path ]

    if isinstance(trim_frame_start, int):
        commands.extend([ '-ss', str(trim_frame_start / output_video_fps) ])
    if isinstance(trim_frame_end, int):
        commands.extend([ '-to', str(trim_frame_end / output_video_fps) ])
    commands += [ '-i', target_path, '-c:v', 'copy', '-c:a', audio_encoder, '-map', '0:v:0', '-map', '1:a:0', '-t', str(temp_video_duration), '-y', output_path ]
    return run_ffmpeg(commands).returncode == 0
|
||||
|
||||
|
||||
def replace_audio(target_path : str, audio_path : str, output_path : str) -> bool:
    """
    Mux an external audio file onto the processed temporary video, truncated to
    the video duration, and write the result to the output path.

    :param target_path: path of the original target video, used to locate the temp video
    :param audio_path: path of the audio file to mux in
    :param output_path: path the final video is written to
    :return: True when ffmpeg exits successfully
    """
    audio_encoder = state_manager.get_item('output_audio_encoder')
    source_video_path = get_temp_file_path(target_path)
    source_video_duration = detect_video_duration(source_video_path)
    ffmpeg_arguments = [ '-i', source_video_path, '-i', audio_path, '-c:v', 'copy', '-c:a', audio_encoder, '-t', str(source_video_duration), '-y', output_path ]
    return run_ffmpeg(ffmpeg_arguments).returncode == 0
|
||||
|
||||
|
||||
def map_nvenc_preset(output_video_preset : OutputVideoPreset) -> Optional[str]:
|
||||
if output_video_preset in [ 'ultrafast', 'superfast', 'veryfast', 'faster', 'fast' ]:
|
||||
return 'fast'
|
||||
|
||||
@@ -159,13 +159,14 @@ def create_output_creation_program() -> ArgumentParser:
|
||||
group_output_creation.add_argument('--output-image-quality', help = wording.get('help.output_image_quality'), type = int, default = config.get_int_value('output_creation.output_image_quality', '80'), choices = facefusion.choices.output_image_quality_range, metavar = create_int_metavar(facefusion.choices.output_image_quality_range))
|
||||
group_output_creation.add_argument('--output-image-resolution', help = wording.get('help.output_image_resolution'), default = config.get_str_value('output_creation.output_image_resolution'))
|
||||
group_output_creation.add_argument('--output-audio-encoder', help = wording.get('help.output_audio_encoder'), default = config.get_str_value('output_creation.output_audio_encoder', 'aac'), choices = facefusion.choices.output_audio_encoders)
|
||||
group_output_creation.add_argument('--output-audio-quality', help = wording.get('help.output_audio_quality'), type = int, default = config.get_int_value('output_creation.output_audio_quality', '80'), choices = facefusion.choices.output_audio_quality_range, metavar = create_int_metavar(facefusion.choices.output_audio_quality_range))
|
||||
group_output_creation.add_argument('--output-audio-volume', help = wording.get('help.output_audio_volume'), type = int, default = config.get_int_value('output_creation.output_audio_volume', '100'), choices = facefusion.choices.output_audio_volume_range, metavar = create_int_metavar(facefusion.choices.output_audio_volume_range))
|
||||
group_output_creation.add_argument('--output-video-encoder', help = wording.get('help.output_video_encoder'), default = config.get_str_value('output_creation.output_video_encoder', 'libx264'), choices = facefusion.choices.output_video_encoders)
|
||||
group_output_creation.add_argument('--output-video-preset', help = wording.get('help.output_video_preset'), default = config.get_str_value('output_creation.output_video_preset', 'veryfast'), choices = facefusion.choices.output_video_presets)
|
||||
group_output_creation.add_argument('--output-video-quality', help = wording.get('help.output_video_quality'), type = int, default = config.get_int_value('output_creation.output_video_quality', '80'), choices = facefusion.choices.output_video_quality_range, metavar = create_int_metavar(facefusion.choices.output_video_quality_range))
|
||||
group_output_creation.add_argument('--output-video-resolution', help = wording.get('help.output_video_resolution'), default = config.get_str_value('output_creation.output_video_resolution'))
|
||||
group_output_creation.add_argument('--output-video-fps', help = wording.get('help.output_video_fps'), type = float, default = config.get_str_value('output_creation.output_video_fps'))
|
||||
group_output_creation.add_argument('--skip-audio', help = wording.get('help.skip_audio'), action = 'store_true', default = config.get_bool_value('output_creation.skip_audio'))
|
||||
job_store.register_step_keys([ 'output_image_quality', 'output_image_resolution', 'output_audio_encoder', 'output_video_encoder', 'output_video_preset', 'output_video_quality', 'output_video_resolution', 'output_video_fps', 'skip_audio' ])
|
||||
job_store.register_step_keys([ 'output_image_quality', 'output_image_resolution', 'output_audio_encoder', 'output_audio_quality', 'output_audio_volume', 'output_video_encoder', 'output_video_preset', 'output_video_quality', 'output_video_resolution', 'output_video_fps' ])
|
||||
return program
|
||||
|
||||
|
||||
|
||||
@@ -252,12 +252,13 @@ StateKey = Literal\
|
||||
'output_image_quality',
|
||||
'output_image_resolution',
|
||||
'output_audio_encoder',
|
||||
'output_audio_quality',
|
||||
'output_audio_volume',
|
||||
'output_video_encoder',
|
||||
'output_video_preset',
|
||||
'output_video_quality',
|
||||
'output_video_resolution',
|
||||
'output_video_fps',
|
||||
'skip_audio',
|
||||
'processors',
|
||||
'open_browser',
|
||||
'ui_layouts',
|
||||
@@ -315,12 +316,13 @@ State = TypedDict('State',
|
||||
'output_image_quality' : int,
|
||||
'output_image_resolution' : str,
|
||||
'output_audio_encoder' : OutputAudioEncoder,
|
||||
'output_audio_quality' : int,
|
||||
'output_audio_volume' : int,
|
||||
'output_video_encoder' : OutputVideoEncoder,
|
||||
'output_video_preset' : OutputVideoPreset,
|
||||
'output_video_quality' : int,
|
||||
'output_video_resolution' : str,
|
||||
'output_video_fps' : float,
|
||||
'skip_audio' : bool,
|
||||
'processors' : List[str],
|
||||
'open_browser' : bool,
|
||||
'ui_layouts' : List[str],
|
||||
|
||||
@@ -5,7 +5,7 @@ from facefusion.uis.typing import JobManagerAction, JobRunnerAction, WebcamMode
|
||||
job_manager_actions : List[JobManagerAction] = [ 'job-create', 'job-submit', 'job-delete', 'job-add-step', 'job-remix-step', 'job-insert-step', 'job-remove-step' ]
|
||||
job_runner_actions : List[JobRunnerAction] = [ 'job-run', 'job-run-all', 'job-retry', 'job-retry-all' ]
|
||||
|
||||
common_options : List[str] = [ 'keep-temp', 'skip-audio' ]
|
||||
common_options : List[str] = [ 'keep-temp' ]
|
||||
|
||||
webcam_modes : List[WebcamMode] = [ 'inline', 'udp', 'v4l2' ]
|
||||
webcam_resolutions : List[str] = [ '320x240', '640x480', '800x600', '1024x768', '1280x720', '1280x960', '1920x1080', '2560x1440', '3840x2160' ]
|
||||
|
||||
@@ -81,8 +81,8 @@ def start(benchmark_runs : List[str], benchmark_cycles : int) -> Generator[List[
|
||||
state_manager.init_item('source_paths', [ '.assets/examples/source.jpg', '.assets/examples/source.mp3' ])
|
||||
state_manager.init_item('face_landmarker_score', 0)
|
||||
state_manager.init_item('temp_frame_format', 'bmp')
|
||||
state_manager.init_item('output_audio_volume', 0)
|
||||
state_manager.init_item('output_video_preset', 'ultrafast')
|
||||
state_manager.init_item('skip_audio', True)
|
||||
state_manager.sync_item('execution_providers')
|
||||
state_manager.sync_item('execution_thread_count')
|
||||
state_manager.sync_item('execution_queue_count')
|
||||
|
||||
@@ -15,8 +15,6 @@ def render() -> None:
|
||||
|
||||
if state_manager.get_item('keep_temp'):
|
||||
common_options.append('keep-temp')
|
||||
if state_manager.get_item('skip_audio'):
|
||||
common_options.append('skip-audio')
|
||||
|
||||
COMMON_OPTIONS_CHECKBOX_GROUP = gradio.Checkboxgroup(
|
||||
label = wording.get('uis.common_options_checkbox_group'),
|
||||
@@ -31,6 +29,4 @@ def listen() -> None:
|
||||
|
||||
def update(common_options : List[str]) -> None:
|
||||
keep_temp = 'keep-temp' in common_options
|
||||
skip_audio = 'skip-audio' in common_options
|
||||
state_manager.set_item('keep_temp', keep_temp)
|
||||
state_manager.set_item('skip_audio', skip_audio)
|
||||
|
||||
@@ -13,6 +13,8 @@ from facefusion.vision import create_image_resolutions, create_video_resolutions
|
||||
OUTPUT_IMAGE_QUALITY_SLIDER : Optional[gradio.Slider] = None
|
||||
OUTPUT_IMAGE_RESOLUTION_DROPDOWN : Optional[gradio.Dropdown] = None
|
||||
OUTPUT_AUDIO_ENCODER_DROPDOWN : Optional[gradio.Dropdown] = None
|
||||
OUTPUT_AUDIO_QUALITY_SLIDER : Optional[gradio.Slider] = None
|
||||
OUTPUT_AUDIO_VOLUME_SLIDER : Optional[gradio.Slider] = None
|
||||
OUTPUT_VIDEO_ENCODER_DROPDOWN : Optional[gradio.Dropdown] = None
|
||||
OUTPUT_VIDEO_PRESET_DROPDOWN : Optional[gradio.Dropdown] = None
|
||||
OUTPUT_VIDEO_RESOLUTION_DROPDOWN : Optional[gradio.Dropdown] = None
|
||||
@@ -24,6 +26,8 @@ def render() -> None:
|
||||
global OUTPUT_IMAGE_QUALITY_SLIDER
|
||||
global OUTPUT_IMAGE_RESOLUTION_DROPDOWN
|
||||
global OUTPUT_AUDIO_ENCODER_DROPDOWN
|
||||
global OUTPUT_AUDIO_QUALITY_SLIDER
|
||||
global OUTPUT_AUDIO_VOLUME_SLIDER
|
||||
global OUTPUT_VIDEO_ENCODER_DROPDOWN
|
||||
global OUTPUT_VIDEO_PRESET_DROPDOWN
|
||||
global OUTPUT_VIDEO_RESOLUTION_DROPDOWN
|
||||
@@ -58,6 +62,22 @@ def render() -> None:
|
||||
value = state_manager.get_item('output_audio_encoder'),
|
||||
visible = is_video(state_manager.get_item('target_path'))
|
||||
)
|
||||
OUTPUT_AUDIO_QUALITY_SLIDER = gradio.Slider(
|
||||
label = wording.get('uis.output_audio_quality_slider'),
|
||||
value = state_manager.get_item('output_audio_quality'),
|
||||
step = calc_int_step(facefusion.choices.output_audio_quality_range),
|
||||
minimum = facefusion.choices.output_audio_quality_range[0],
|
||||
maximum = facefusion.choices.output_audio_quality_range[-1],
|
||||
visible = is_video(state_manager.get_item('target_path'))
|
||||
)
|
||||
OUTPUT_AUDIO_VOLUME_SLIDER = gradio.Slider(
|
||||
label = wording.get('uis.output_audio_volume_slider'),
|
||||
value = state_manager.get_item('output_audio_volume'),
|
||||
step = calc_int_step(facefusion.choices.output_audio_volume_range),
|
||||
minimum = facefusion.choices.output_audio_volume_range[0],
|
||||
maximum = facefusion.choices.output_audio_volume_range[-1],
|
||||
visible = is_video(state_manager.get_item('target_path'))
|
||||
)
|
||||
OUTPUT_VIDEO_ENCODER_DROPDOWN = gradio.Dropdown(
|
||||
label = wording.get('uis.output_video_encoder_dropdown'),
|
||||
choices = facefusion.choices.output_video_encoders,
|
||||
@@ -99,6 +119,8 @@ def listen() -> None:
|
||||
OUTPUT_IMAGE_QUALITY_SLIDER.release(update_output_image_quality, inputs = OUTPUT_IMAGE_QUALITY_SLIDER)
|
||||
OUTPUT_IMAGE_RESOLUTION_DROPDOWN.change(update_output_image_resolution, inputs = OUTPUT_IMAGE_RESOLUTION_DROPDOWN)
|
||||
OUTPUT_AUDIO_ENCODER_DROPDOWN.change(update_output_audio_encoder, inputs = OUTPUT_AUDIO_ENCODER_DROPDOWN)
|
||||
OUTPUT_AUDIO_QUALITY_SLIDER.release(update_output_audio_quality, inputs = OUTPUT_AUDIO_QUALITY_SLIDER)
|
||||
OUTPUT_AUDIO_VOLUME_SLIDER.release(update_output_audio_volume, inputs = OUTPUT_AUDIO_VOLUME_SLIDER)
|
||||
OUTPUT_VIDEO_ENCODER_DROPDOWN.change(update_output_video_encoder, inputs = OUTPUT_VIDEO_ENCODER_DROPDOWN)
|
||||
OUTPUT_VIDEO_PRESET_DROPDOWN.change(update_output_video_preset, inputs = OUTPUT_VIDEO_PRESET_DROPDOWN)
|
||||
OUTPUT_VIDEO_QUALITY_SLIDER.release(update_output_video_quality, inputs = OUTPUT_VIDEO_QUALITY_SLIDER)
|
||||
@@ -111,22 +133,22 @@ def listen() -> None:
|
||||
'target_video'
|
||||
]):
|
||||
for method in [ 'upload', 'change', 'clear' ]:
|
||||
getattr(ui_component, method)(remote_update, outputs = [ OUTPUT_IMAGE_QUALITY_SLIDER, OUTPUT_IMAGE_RESOLUTION_DROPDOWN, OUTPUT_AUDIO_ENCODER_DROPDOWN, OUTPUT_VIDEO_ENCODER_DROPDOWN, OUTPUT_VIDEO_PRESET_DROPDOWN, OUTPUT_VIDEO_QUALITY_SLIDER, OUTPUT_VIDEO_RESOLUTION_DROPDOWN, OUTPUT_VIDEO_FPS_SLIDER ])
|
||||
getattr(ui_component, method)(remote_update, outputs = [ OUTPUT_IMAGE_QUALITY_SLIDER, OUTPUT_IMAGE_RESOLUTION_DROPDOWN, OUTPUT_AUDIO_ENCODER_DROPDOWN, OUTPUT_AUDIO_QUALITY_SLIDER, OUTPUT_AUDIO_VOLUME_SLIDER, OUTPUT_VIDEO_ENCODER_DROPDOWN, OUTPUT_VIDEO_PRESET_DROPDOWN, OUTPUT_VIDEO_QUALITY_SLIDER, OUTPUT_VIDEO_RESOLUTION_DROPDOWN, OUTPUT_VIDEO_FPS_SLIDER ])
|
||||
|
||||
|
||||
def remote_update() -> Tuple[gradio.Slider, gradio.Dropdown, gradio.Dropdown, gradio.Dropdown, gradio.Dropdown, gradio.Slider, gradio.Dropdown, gradio.Slider]:
|
||||
def remote_update() -> Tuple[gradio.Slider, gradio.Dropdown, gradio.Dropdown, gradio.Slider, gradio.Slider, gradio.Dropdown, gradio.Dropdown, gradio.Slider, gradio.Dropdown, gradio.Slider]:
|
||||
if is_image(state_manager.get_item('target_path')):
|
||||
output_image_resolution = detect_image_resolution(state_manager.get_item('target_path'))
|
||||
output_image_resolutions = create_image_resolutions(output_image_resolution)
|
||||
state_manager.set_item('output_image_resolution', pack_resolution(output_image_resolution))
|
||||
return gradio.Slider(visible = True), gradio.Dropdown(value = state_manager.get_item('output_image_resolution'), choices = output_image_resolutions, visible = True), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False)
|
||||
return gradio.Slider(visible = True), gradio.Dropdown(value = state_manager.get_item('output_image_resolution'), choices = output_image_resolutions, visible = True), gradio.Dropdown(visible = False), gradio.Slider(visible = False), gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False)
|
||||
if is_video(state_manager.get_item('target_path')):
|
||||
output_video_resolution = detect_video_resolution(state_manager.get_item('target_path'))
|
||||
output_video_resolutions = create_video_resolutions(output_video_resolution)
|
||||
state_manager.set_item('output_video_resolution', pack_resolution(output_video_resolution))
|
||||
state_manager.set_item('output_video_fps', detect_video_fps(state_manager.get_item('target_path')))
|
||||
return gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = True), gradio.Dropdown(visible = True), gradio.Dropdown(visible = True), gradio.Slider(visible = True), gradio.Dropdown(value = state_manager.get_item('output_video_resolution'), choices = output_video_resolutions, visible = True), gradio.Slider(value = state_manager.get_item('output_video_fps'), visible = True)
|
||||
return gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False)
|
||||
return gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = True), gradio.Slider(visible = True), gradio.Slider(visible = True), gradio.Dropdown(visible = True), gradio.Dropdown(visible = True), gradio.Slider(visible = True), gradio.Dropdown(value = state_manager.get_item('output_video_resolution'), choices = output_video_resolutions, visible = True), gradio.Slider(value = state_manager.get_item('output_video_fps'), visible = True)
|
||||
return gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False), gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False), gradio.Dropdown(visible = False), gradio.Slider(visible = False)
|
||||
|
||||
|
||||
def update_output_image_quality(output_image_quality : float) -> None:
|
||||
@@ -141,6 +163,14 @@ def update_output_audio_encoder(output_audio_encoder : OutputAudioEncoder) -> No
|
||||
state_manager.set_item('output_audio_encoder', output_audio_encoder)
|
||||
|
||||
|
||||
def update_output_audio_quality(output_audio_quality : float) -> None:
|
||||
state_manager.set_item('output_audio_quality', int(output_audio_quality))
|
||||
|
||||
|
||||
def update_output_audio_volume(output_audio_volume: float) -> None:
|
||||
state_manager.set_item('output_audio_volume', int(output_audio_volume))
|
||||
|
||||
|
||||
def update_output_video_encoder(output_video_encoder : OutputVideoEncoder) -> None:
|
||||
state_manager.set_item('output_video_encoder', output_video_encoder)
|
||||
|
||||
|
||||
@@ -139,14 +139,15 @@ WORDING : Dict[str, Any] =\
|
||||
'keep_temp': 'keep the temporary resources after processing',
|
||||
# output creation
|
||||
'output_image_quality': 'specify the image quality which translates to the compression factor',
|
||||
'output_image_resolution': 'specify the image output resolution based on the target image',
|
||||
'output_audio_encoder': 'specify the encoder used for the audio output',
|
||||
'output_video_encoder': 'specify the encoder used for the video output',
|
||||
'output_image_resolution': 'specify the image resolution based on the target image',
|
||||
'output_audio_encoder': 'specify the encoder used for the audio',
|
||||
'output_audio_quality': 'specify the audio quality which translates to the compression factor',
|
||||
'output_audio_volume': 'specify the audio volume based on the target video',
|
||||
'output_video_encoder': 'specify the encoder used for the video',
|
||||
'output_video_preset': 'balance fast video processing and video file size',
|
||||
'output_video_quality': 'specify the video quality which translates to the compression factor',
|
||||
'output_video_resolution': 'specify the video output resolution based on the target video',
|
||||
'output_video_fps': 'specify the video output fps based on the target video',
|
||||
'skip_audio': 'omit the audio from the target video',
|
||||
'output_video_resolution': 'specify the video resolution based on the target video',
|
||||
'output_video_fps': 'specify the video fps based on the target video',
|
||||
# processors
|
||||
'processors': 'load a single or multiple processors (choices: {choices}, ...)',
|
||||
'age_modifier_model': 'choose the model responsible for aging the face',
|
||||
@@ -303,6 +304,8 @@ WORDING : Dict[str, Any] =\
|
||||
'lip_syncer_model_dropdown': 'LIP SYNCER MODEL',
|
||||
'log_level_dropdown': 'LOG LEVEL',
|
||||
'output_audio_encoder_dropdown': 'OUTPUT AUDIO ENCODER',
|
||||
'output_audio_quality_slider': 'OUTPUT AUDIO QUALITY',
|
||||
'output_audio_volume_slider': 'OUTPUT AUDIO VOLUME',
|
||||
'output_image_or_video': 'OUTPUT',
|
||||
'output_image_quality_slider': 'OUTPUT IMAGE QUALITY',
|
||||
'output_image_resolution_dropdown': 'OUTPUT IMAGE RESOLUTION',
|
||||
|
||||
@@ -29,6 +29,8 @@ def before_all() -> None:
|
||||
state_manager.init_item('temp_path', tempfile.gettempdir())
|
||||
state_manager.init_item('temp_frame_format', 'png')
|
||||
state_manager.init_item('output_audio_encoder', 'aac')
|
||||
state_manager.init_item('output_audio_quality', 80)
|
||||
state_manager.init_item('output_audio_volume', 100)
|
||||
|
||||
|
||||
@pytest.fixture(scope = 'function', autouse = True)
|
||||
|
||||
@@ -2,7 +2,6 @@ import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from facefusion import state_manager
|
||||
from facefusion.download import conditional_download
|
||||
from facefusion.filesystem import copy_file
|
||||
from facefusion.jobs.job_manager import add_step, clear_jobs, create_job, init_jobs, submit_job, submit_jobs
|
||||
@@ -19,7 +18,6 @@ def before_all() -> None:
|
||||
'https://github.com/facefusion/facefusion-assets/releases/download/examples-3.0.0/target-240p.mp4'
|
||||
])
|
||||
subprocess.run([ 'ffmpeg', '-i', get_test_example_file('target-240p.mp4'), '-vframes', '1', get_test_example_file('target-240p.jpg') ])
|
||||
state_manager.init_item('output_audio_encoder', 'aac')
|
||||
|
||||
|
||||
@pytest.fixture(scope = 'function', autouse = True)
|
||||
|
||||
Reference in New Issue
Block a user