2024-02-14 14:08:29 +01:00
|
|
|
from typing import Optional, Any, List
|
|
|
|
|
from functools import lru_cache
|
|
|
|
|
import numpy
|
|
|
|
|
import scipy
|
|
|
|
|
|
|
|
|
|
from facefusion.filesystem import is_audio
|
|
|
|
|
from facefusion.ffmpeg import read_audio_buffer
|
2024-04-09 15:40:55 +02:00
|
|
|
from facefusion.typing import Fps, Audio, AudioFrame, Spectrogram, MelFilterBank
|
|
|
|
|
from facefusion.voice_extractor import batch_extract_voice
|
2024-02-14 14:08:29 +01:00
|
|
|
|
|
|
|
|
|
2024-04-09 15:40:55 +02:00
|
|
|
@lru_cache(maxsize = 128)
def read_static_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Memoized variant of read_audio() so repeated lookups for the same
	# audio path and fps do not decode and process the file again.
	audio_frames = read_audio(audio_path, fps)
	return audio_frames
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_audio(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	"""
	Read an audio file and convert it into a list of spectrogram frames.

	:param audio_path: path to the audio file
	:param fps: frames per second used to slice the spectrogram
	:return: list of audio frames, or None when audio_path is not an audio file
	"""
	sample_rate = 48000
	channel_total = 2

	if is_audio(audio_path):
		audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
		# reshape by channel_total instead of a hard-coded 2 so the reshape
		# always matches the channel count requested from the decoder above
		audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, channel_total)
		audio = prepare_audio(audio)
		spectrogram = create_spectrogram(audio)
		audio_frames = extract_audio_frames(spectrogram, fps)
		return audio_frames
	return None
|
|
|
|
|
|
|
|
|
|
|
2024-04-09 15:40:55 +02:00
|
|
|
@lru_cache(maxsize = 128)
def read_static_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	# Memoized variant of read_voice() so repeated lookups for the same
	# audio path and fps do not extract the voice again.
	voice_frames = read_voice(audio_path, fps)
	return voice_frames
|
Next (#436)
* Rename landmark 5 variables
* Mark as NEXT
* Render tabs for multiple ui layout usage
* Allow many face detectors at once, Add face detector tweaks
* Remove face detector tweaks for now (kinda placebo)
* Fix lint issues
* Allow rendering the landmark-5 and landmark-5/68 via debugger
* Fix naming
* Convert face landmark based on confidence score
* Convert face landmark based on confidence score
* Add scrfd face detector model (#397)
* Add scrfd face detector model
* Switch to scrfd_2.5g.onnx model
* Just some renaming
* Downgrade OpenCV, Add SYSTEM_VERSION_COMPAT=0 for MacOS
* Improve naming
* prepare detect frame outside of semaphore
* Feat/process manager (#399)
* Minor naming
* Introduce process manager to start and stop
* Introduce process manager to start and stop
* Introduce process manager to start and stop
* Introduce process manager to start and stop
* Introduce process manager to start and stop
* Remove useless test for now
* Avoid useless variables
* Show stop once is_processing is True
* Allow to stop ffmpeg processing too
* Implement output image resolution (#403)
* Implement output image resolution
* Reorder code
* Simplify output logic and therefore fix bug
* Frame-enhancer-onnx (#404)
* changes
* changes
* changes
* changes
* add models
* update workflow
* Some cleanup
* Some cleanup
* Feat/frame enhancer polishing (#410)
* Some cleanup
* Polish the frame enhancer
* Frame Enhancer: Add more models, optimize processing
* Minor changes
* Improve readability of create_tile_frames and merge_tile_frames
* We don't have enough models yet
* Feat/face landmarker score (#413)
* Introduce face landmarker score
* Fix testing
* Fix testing
* Use release for score related sliders
* Reduce face landmark fallbacks
* Scores and landmarks in Face dict, Change color-theme in face debugger
* Scores and landmarks in Face dict, Change color-theme in face debugger
* Fix some naming
* Add 8K support (for whatever reasons)
* Fix testing
* Using get() for face.landmarks
* Introduce statistics
* More statistics
* Limit the histogram equalization
* Enable queue() for default layout
* Improve copy_image()
* Fix error when switching detector model
* Always set UI values with globals if possible
* Use different logic for output image and output video resolutions
* Enforce re-download if file size is off
* Remove unused method
* Remove unused method
* Remove unused warning filter
* Improved output path normalization (#419)
* Handle some exceptions
* Handle some exceptions
* Cleanup
* Prevent countless thread locks
* Listen to user feedback
* Fix webp edge case
* Feat/cuda device detection (#424)
* Introduce cuda device detection
* Introduce cuda device detection
* it's gtx
* Move logic to run_nvidia_smi()
* Finalize execution device naming
* Finalize execution device naming
* Merge execution_helper.py to execution.py
* Undo lowercase of values
* Undo lowercase of values
* Finalize naming
* Add missing entry to ini
* fix lip_syncer preview (#426)
* fix lip_syncer preview
* change
* Refresh preview on trim changes
* Cleanup frame enhancers and remove useless scale in merge_video() (#428)
* Keep lips over the whole video once lip syncer is enabled (#430)
* Keep lips over the whole video once lip syncer is enabled
* changes
* changes
* Fix spacing
* Use empty audio frame on silence
* Use empty audio frame on silence
* Fix ConfigParser encoding (#431)
facefusion.ini is UTF8 encoded but config.py doesn't specify encoding which results in corrupted entries when non english characters are used.
Affected entries:
source_paths
target_path
output_path
* Adjust spacing
* Improve the GTX 16 series detection
* Use general exception to catch ParseError
* Use general exception to catch ParseError
* Host frame enhancer models
* Use latest onnxruntime
* Minor changes in benchmark UI
* Different approach to cancel ffmpeg process
* Add support for amd amf encoders (#433)
* Add amd_amf encoders
* remove -rc cqp from amf encoder parameters
* Improve terminal output, move success messages to debug mode
* Improve terminal output, move success messages to debug mode
* Minor update
* Minor update
* onnxruntime 1.17.1 matches cuda 12.2
* Feat/improved scaling (#435)
* Prevent useless temp upscaling, Show resolution and fps in terminal output
* Remove temp frame quality
* Remove temp frame quality
* Tiny cleanup
* Default back to png for temp frames, Remove pix_fmt from frame extraction due to a mjpeg error
* Fix inswapper fallback by onnxruntime
* Fix inswapper fallback by major onnxruntime
* Fix inswapper fallback by major onnxruntime
* Add testing for vision restrict methods
* Fix left / right face mask regions, add left-ear and right-ear
* Flip right and left again
* Undo ears - does not work with box mask
* Prepare next release
* Fix spacing
* 100% quality when using jpg for temp frames
* Use span_kendata_x4 as default as of speed
* benchmark optimal tile and pad
* Undo commented out code
* Add real_esrgan_x4_fp16 model
* Be strict when using many face detectors
---------
Co-authored-by: Harisreedhar <46858047+harisreedhar@users.noreply.github.com>
Co-authored-by: aldemoth <159712934+aldemoth@users.noreply.github.com>
2024-03-14 19:56:54 +01:00
|
|
|
|
|
|
|
|
|
2024-04-09 15:40:55 +02:00
|
|
|
def read_voice(audio_path : str, fps : Fps) -> Optional[List[AudioFrame]]:
	"""
	Read an audio file, isolate the voice and convert it into a list of spectrogram frames.

	:param audio_path: path to the audio file
	:param fps: frames per second used to slice the spectrogram
	:return: list of audio frames, or None when audio_path is not an audio file
	"""
	sample_rate = 48000
	channel_total = 2
	chunk_size = 1024 * 240
	step_size = 1024 * 180

	if is_audio(audio_path):
		audio_buffer = read_audio_buffer(audio_path, sample_rate, channel_total)
		# reshape by channel_total instead of a hard-coded 2 so the reshape
		# always matches the channel count requested from the decoder above
		audio = numpy.frombuffer(audio_buffer, dtype = numpy.int16).reshape(-1, channel_total)
		audio = batch_extract_voice(audio, chunk_size, step_size)
		audio = prepare_voice(audio)
		spectrogram = create_spectrogram(audio)
		audio_frames = extract_audio_frames(spectrogram, fps)
		return audio_frames
	return None
|
|
|
|
|
|
|
|
|
|
|
2024-04-09 15:40:55 +02:00
|
|
|
def get_audio_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Fetch a single frame from the cached audio frames of audio_path.
	# Yields None for non-audio paths and for out-of-range frame numbers.
	if not is_audio(audio_path):
		return None
	audio_frames = read_static_audio(audio_path, fps)
	if frame_number in range(len(audio_frames)):
		return audio_frames[frame_number]
	return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_voice_frame(audio_path : str, fps : Fps, frame_number : int = 0) -> Optional[AudioFrame]:
	# Fetch a single frame from the cached voice frames of audio_path.
	# Yields None for non-audio paths and for out-of-range frame numbers.
	if not is_audio(audio_path):
		return None
	voice_frames = read_static_voice(audio_path, fps)
	if frame_number in range(len(voice_frames)):
		return voice_frames[frame_number]
	return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_empty_audio_frame() -> AudioFrame:
	# A silent frame: 80 mel filters by 16 steps of int16 zeros, matching
	# the frame dimensions produced by extract_audio_frames().
	mel_filter_total = 80
	step_size = 16
	return numpy.zeros((mel_filter_total, step_size)).astype(numpy.int16)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_audio(audio : numpy.ndarray[Any, Any]) -> Audio:
	"""
	Prepare raw PCM samples for spectrogram creation.

	Downmixes multi-channel audio to mono, peak-normalizes to [-1, 1]
	and applies a pre-emphasis filter.

	:param audio: raw samples, either mono or (samples, channels)
	:return: filtered mono float audio
	"""
	if audio.ndim > 1:
		audio = numpy.mean(audio, axis = 1)
	peak = numpy.max(numpy.abs(audio), axis = 0)
	# guard against silent input: dividing by a zero peak would fill the
	# signal with NaN values
	if peak > 0:
		audio = audio / peak
	audio = scipy.signal.lfilter([ 1.0, -0.97 ], [ 1.0 ], audio)
	return audio
|
|
|
|
|
|
|
|
|
|
|
2024-04-13 11:27:55 +02:00
|
|
|
def prepare_voice(audio : numpy.ndarray[Any, Any]) -> Audio:
	# Downsample from the 48kHz decoder rate to 16kHz, then run the
	# regular audio preparation.
	sample_rate = 48000
	resample_rate = 16000
	target_length = int(len(audio) * resample_rate / sample_rate)
	audio = scipy.signal.resample(audio, target_length)
	return prepare_audio(audio)
|
|
|
|
|
|
|
|
|
|
|
2024-02-14 14:08:29 +01:00
|
|
|
def convert_hertz_to_mel(hertz : float) -> float:
	# HTK-style hertz to mel conversion.
	return 2595 * numpy.log10(hertz / 700 + 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_mel_to_hertz(mel : numpy.ndarray[Any, Any]) -> numpy.ndarray[Any, Any]:
	# Inverse of convert_hertz_to_mel() for arrays of mel values.
	return 700 * (numpy.power(10, mel / 2595) - 1)
|
|
|
|
|
|
|
|
|
|
|
2024-04-09 15:40:55 +02:00
|
|
|
def create_mel_filter_bank() -> MelFilterBank:
	# Build a bank of 80 triangular mel filters over the bins of an
	# 800 point transform at 16kHz, spanning 55Hz to 7600Hz.
	mel_filter_total = 80
	mel_bin_total = 800
	sample_rate = 16000
	min_frequency = 55.0
	max_frequency = 7600.0
	mel_filter_bank = numpy.zeros((mel_filter_total, mel_bin_total // 2 + 1))
	mel_frequency_range = numpy.linspace(convert_hertz_to_mel(min_frequency), convert_hertz_to_mel(max_frequency), mel_filter_total + 2)
	indices = numpy.floor((mel_bin_total + 1) * convert_mel_to_hertz(mel_frequency_range) / sample_rate).astype(numpy.int16)
	# pair each filter with its (start, end) bin range from consecutive indices
	filter_starts = indices[:mel_filter_total]
	filter_ends = indices[1:mel_filter_total + 1]

	for index, (start, end) in enumerate(zip(filter_starts, filter_ends)):
		mel_filter_bank[index, start:end] = scipy.signal.windows.triang(end - start)
	return mel_filter_bank
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_spectrogram(audio : Audio) -> Spectrogram:
	# Short-time Fourier transform followed by a projection onto the
	# mel filter bank (nperseg 800 with an overlap of 600 samples).
	mel_bin_total = 800
	mel_bin_overlap = 600
	mel_filter_bank = create_mel_filter_bank()
	_, _, stft_result = scipy.signal.stft(audio, nperseg = mel_bin_total, nfft = mel_bin_total, noverlap = mel_bin_overlap)
	return numpy.dot(mel_filter_bank, numpy.abs(stft_result))
|
|
|
|
|
|
|
|
|
|
|
2024-04-09 15:40:55 +02:00
|
|
|
def extract_audio_frames(spectrogram : Spectrogram, fps : Fps) -> List[AudioFrame]:
	"""
	Slice a spectrogram into fixed-width frames, one per video frame.

	:param spectrogram: mel spectrogram of shape (mel_filter_total, time)
	:param fps: frames per second used to space the slices
	:return: list of (80, 16) spectrogram slices
	"""
	mel_filter_total = 80
	step_size = 16
	audio_frames = []
	# int32 instead of int16: a spectrogram with more than 32767 columns
	# would overflow int16 and yield negative indices that slice the
	# wrong frames
	indices = numpy.arange(0, spectrogram.shape[1], mel_filter_total / fps).astype(numpy.int32)
	indices = indices[indices >= step_size]

	for index in indices:
		start = max(0, index - step_size)
		audio_frames.append(spectrogram[:, start:index])
	return audio_frames
|