代码拉取完成,页面将自动刷新
import logging
import os
from collections import defaultdict
from pathlib import Path
import librosa
import numpy as np
from python_speech_features import fbank
from tqdm import tqdm
from constants import SAMPLE_RATE, NUM_FBANKS
from utils import find_files, ensures_dir
logger = logging.getLogger(__name__)
def read_mfcc(input_filename, sample_rate):
audio = Audio.read(input_filename, sample_rate)
energy = np.abs(audio)
silence_threshold = np.percentile(energy, 95)
offsets = np.where(energy > silence_threshold)[0]
# left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate # frame_id to duration (ms)
# right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
# TODO: could use trim_silence() here or a better VAD.
audio_voice_only = audio[offsets[0]:offsets[-1]]
mfcc = mfcc_fbank(audio_voice_only, sample_rate)
return mfcc
def extract_speaker_and_utterance_ids(filename: str): # LIBRI.
# 'audio/dev-other/116/288045/116-288045-0000.flac'
speaker, _, basename = Path(filename).parts[-3:]
filename.split('-')
utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
assert basename.split('-')[0] == speaker
return speaker, utterance
class Audio:
def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAMPLE_RATE, ext='flac'):
self.ext = ext
self.cache_dir = os.path.join(cache_dir, 'audio-fbanks')
ensures_dir(self.cache_dir)
if audio_dir is not None:
self.build_cache(os.path.expanduser(audio_dir), sample_rate)
self.speakers_to_utterances = defaultdict(dict)
for cache_file in find_files(self.cache_dir, ext='npy'):
# /path/to/speaker_utterance.npy
speaker_id, utterance_id = Path(cache_file).stem.split('_')
self.speakers_to_utterances[speaker_id][utterance_id] = cache_file
@property
def speaker_ids(self):
return sorted(self.speakers_to_utterances)
@staticmethod
def trim_silence(audio, threshold):
"""Removes silence at the beginning and end of a sample."""
energy = librosa.feature.rms(audio)
frames = np.nonzero(np.array(energy > threshold))
indices = librosa.core.frames_to_samples(frames)[1]
# Note: indices can be an empty array, if the whole audio was silence.
audio_trim = audio[0:0]
left_blank = audio[0:0]
right_blank = audio[0:0]
if indices.size:
audio_trim = audio[indices[0]:indices[-1]]
left_blank = audio[:indices[0]] # slice before.
right_blank = audio[indices[-1]:] # slice after.
return audio_trim, left_blank, right_blank
@staticmethod
def read(filename, sample_rate=SAMPLE_RATE):
audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
assert sr == sample_rate
return audio
def build_cache(self, audio_dir, sample_rate):
logger.info(f'audio_dir: {audio_dir}.')
logger.info(f'sample_rate: {sample_rate:,} hz.')
audio_files = find_files(audio_dir, ext=self.ext)
audio_files_count = len(audio_files)
assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
logger.info(f'Found {audio_files_count:,} files in {audio_dir}.')
with tqdm(audio_files) as bar:
for audio_filename in bar:
bar.set_description(audio_filename)
self.cache_audio_file(audio_filename, sample_rate)
def cache_audio_file(self, input_filename, sample_rate):
sp, utt = extract_speaker_and_utterance_ids(input_filename)
cache_filename = os.path.join(self.cache_dir, f'{sp}_{utt}.npy')
if not os.path.isfile(cache_filename):
try:
mfcc = read_mfcc(input_filename, sample_rate)
np.save(cache_filename, mfcc)
except librosa.util.exceptions.ParameterError as e:
logger.error(e)
def pad_mfcc(mfcc, max_length): # num_frames, nfilt=64.
if len(mfcc) < max_length:
mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
return mfcc
def mfcc_fbank(signal: np.array, sample_rate: int): # 1D signal array.
# Returns MFCC with shape (num_frames, n_filters, 3).
filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
frames_features = normalize_frames(filter_banks)
# delta_1 = delta(filter_banks, N=1)
# delta_2 = delta(delta_1, N=1)
# frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
return np.array(frames_features, dtype=np.float32) # Float32 precision is enough here.
def normalize_frames(m, epsilon=1e-12):
return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。