Spaces:
Runtime error
Runtime error
| # coding=utf-8 | |
| # Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ | |
| Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks | |
| and remove unnecessary dependencies. | |
| """ | |
| import warnings | |
| import numpy as np | |
| from typing import Union, Optional | |
def hertz_to_mel(
    freq: Union[float, np.ndarray], mel_scale: str = "htk"
) -> Union[float, np.ndarray]:
    """
    Convert frequency from hertz to mels.

    Args:
        freq (`float` or `np.ndarray`):
            The frequency, or multiple frequencies, in hertz (Hz). Plain lists and tuples of numbers are also
            accepted; they are converted to a NumPy array before the conversion.
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.

    Returns:
        `float` or `np.ndarray`: The frequencies on the mel scale.

    Raises:
        ValueError: If `mel_scale` is not one of `"htk"`, `"slaney"` or `"kaldi"`.
    """
    if mel_scale not in ["slaney", "htk", "kaldi"]:
        raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')

    # Python sequences do not support the arithmetic below; promote them to arrays.
    if isinstance(freq, (list, tuple)):
        freq = np.asarray(freq)

    if mel_scale == "htk":
        return 2595.0 * np.log10(1.0 + (freq / 700.0))
    elif mel_scale == "kaldi":
        return 1127.0 * np.log(1.0 + (freq / 700.0))

    # Slaney scale: linear below `min_log_hertz` (3 mels per 200 Hz), logarithmic
    # at and above it, with 27 mels per factor of 6.4 in frequency.
    min_log_hertz = 1000.0
    min_log_mel = 15.0
    logstep = 27.0 / np.log(6.4)
    mels = 3.0 * freq / 200.0

    if isinstance(freq, np.ndarray):
        # Only the entries in the logarithmic region are overwritten, so the
        # logarithm is never evaluated on values from the linear region.
        log_region = freq >= min_log_hertz
        mels[log_region] = (
            min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
        )
    elif freq >= min_log_hertz:
        mels = min_log_mel + np.log(freq / min_log_hertz) * logstep

    return mels
def mel_to_hertz(
    mels: Union[float, np.ndarray], mel_scale: str = "htk"
) -> Union[float, np.ndarray]:
    """
    Convert frequency from mels to hertz.

    Args:
        mels (`float` or `np.ndarray`):
            The frequency, or multiple frequencies, in mels. Plain lists and tuples of numbers are also accepted;
            they are converted to a NumPy array before the conversion.
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.

    Returns:
        `float` or `np.ndarray`: The frequencies in hertz.

    Raises:
        ValueError: If `mel_scale` is not one of `"htk"`, `"slaney"` or `"kaldi"`.
    """
    if mel_scale not in ["slaney", "htk", "kaldi"]:
        raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')

    # Python sequences do not support the arithmetic below; promote them to arrays.
    if isinstance(mels, (list, tuple)):
        mels = np.asarray(mels)

    if mel_scale == "htk":
        return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
    elif mel_scale == "kaldi":
        return 700.0 * (np.exp(mels / 1127.0) - 1.0)

    # Slaney scale: linear below `min_log_mel` (200 Hz per 3 mels), exponential
    # at and above it, with a factor of 6.4 in frequency per 27 mels.
    min_log_hertz = 1000.0
    min_log_mel = 15.0
    logstep = np.log(6.4) / 27.0
    freq = 200.0 * mels / 3.0

    if isinstance(mels, np.ndarray):
        # Only entries in the exponential region are overwritten.
        log_region = mels >= min_log_mel
        freq[log_region] = min_log_hertz * np.exp(
            logstep * (mels[log_region] - min_log_mel)
        )
    elif mels >= min_log_mel:
        freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))

    return freq
| def _create_triangular_filter_bank( | |
| fft_freqs: np.ndarray, filter_freqs: np.ndarray | |
| ) -> np.ndarray: | |
| """ | |
| Creates a triangular filter bank. | |
| Adapted from *torchaudio* and *librosa*. | |
| Args: | |
| fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): | |
| Discrete frequencies of the FFT bins in Hz. | |
| filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): | |
| Center frequencies of the triangular filters to create, in Hz. | |
| Returns: | |
| `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` | |
| """ | |
| filter_diff = np.diff(filter_freqs) | |
| slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) | |
| down_slopes = -slopes[:, :-2] / filter_diff[:-1] | |
| up_slopes = slopes[:, 2:] / filter_diff[1:] | |
| return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) | |
def mel_filter_bank(
    num_frequency_bins: int,
    num_mel_filters: int,
    min_frequency: float,
    max_frequency: float,
    sampling_rate: int,
    norm: Optional[str] = None,
    mel_scale: str = "htk",
    triangularize_in_mel_space: bool = False,
) -> np.ndarray:
    """
    Create a matrix of triangular mel filters.

    The filter breakpoints are `num_mel_filters + 2` points spaced evenly on the chosen mel scale between
    `min_frequency` and `max_frequency`.

    Args:
        num_frequency_bins (`int`):
            Number of FFT frequency bins; this is the number of rows of the returned matrix. Must be at least 2.
        num_mel_filters (`int`):
            Number of mel filters to generate; this is the number of columns of the returned matrix.
        min_frequency (`float`):
            Lowest frequency of interest in Hz.
        max_frequency (`float`):
            Highest frequency of interest in Hz. Must not be smaller than `min_frequency`.
        sampling_rate (`int`):
            Sample rate of the audio waveform.
        norm (`str`, *optional*):
            If `"slaney"`, scale each filter by `2 / (upper_edge - lower_edge)` of its triangle.
        mel_scale (`str`, *optional*, defaults to `"htk"`):
            The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
        triangularize_in_mel_space (`bool`, *optional*, defaults to `False`):
            If `True`, the FFT bin frequencies are converted to mels and the triangles are built directly on the
            mel axis instead of on the hertz axis.

    Returns:
        `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)`: the triangular filter bank.

    Raises:
        ValueError: If `norm` is neither `None` nor `"slaney"`, if `num_frequency_bins` is smaller than 2, if
            `min_frequency` is greater than `max_frequency`, or if `mel_scale` is rejected by the conversion helpers.

    Warns:
        UserWarning: If at least one of the generated filters is zero everywhere.
    """
    if norm is not None and norm != "slaney":
        raise ValueError('norm must be one of None or "slaney"')

    # Fail early on inputs that would otherwise silently yield degenerate filters.
    if num_frequency_bins < 2:
        raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2")

    if min_frequency > max_frequency:
        raise ValueError(
            f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}"
        )

    # center points of the triangular mel filters
    mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
    mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
    mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
    filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)

    if triangularize_in_mel_space:
        # frequencies of FFT bins in Hz, but filters triangularized in mel space
        fft_bin_width = sampling_rate / (num_frequency_bins * 2)
        fft_freqs = hertz_to_mel(
            fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale
        )
        # From here on the breakpoints are expressed in mels, including for the
        # Slaney normalization below.
        filter_freqs = mel_freqs
    else:
        # frequencies of FFT bins in Hz
        fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)

    mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)

    if norm == "slaney":
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (
            filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]
        )
        mel_filters *= np.expand_dims(enorm, 0)

    if (mel_filters.max(axis=0) == 0.0).any():
        warnings.warn(
            "At least one mel filter has all zero values. "
            f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. "
            f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low."
        )

    return mel_filters