Hello,
I’m attempting to build a script to help me identify poor audio/transcription pairs in my test/train data. The ideal solution will identify when the audio is entirely blank or mostly silence, and when the output (the transcript) is longer than the input (the audio frames), which the CTC loss function cannot handle. My first pass at this is is_this_bad_audio, which I added to the audio.py utility included in the repo. Does anyone use a different approach that does not require listening to every sample? Thanks!
import pydub
import numpy as np
import re
import scipy.io.wavfile as wav
from python_speech_features import mfcc
import librosa
import soundfile as sf
from util.text import text_to_char_array, Alphabet
# Module-level alphabet shared by is_this_bad_audio, which passes it to
# text_to_char_array to encode transcripts.
# NOTE(review): "/.../alphabet.txt" looks like a redacted placeholder path --
# confirm it points at the real alphabet file before running.
alphabet = Alphabet("/.../alphabet.txt")
def audiofile_to_input_vector(audio_filename, numcep, numcontext, winlen=0.032, winstep=0.02):
    r"""
    Given a WAV audio file at ``audio_filename``, calculates ``numcep`` MFCC features
    at every ``winstep`` seconds using a Hamming window of ``winlen`` seconds, then
    pads the result with ``numcontext`` all-zero frames before the first time step
    and ``numcontext`` all-zero frames after the last one.

    :param audio_filename: path of the WAV file to read.
    :param numcep: number of cepstral coefficients computed per frame.
    :param numcontext: number of zero frames added at each end of the sequence.
    :param winlen: analysis window length in seconds (default ``0.032``).
    :param winstep: step between successive windows in seconds (default ``0.02``).
    :return: numpy array with ``numcep`` columns and ``2 * numcontext`` more rows
        than the number of MFCC frames.
    """
    # Load wav files
    fs, audio = wav.read(audio_filename)

    # Get mfcc coefficients
    features = mfcc(
        audio,
        samplerate=fs,
        numcep=numcep,
        winlen=winlen,
        winstep=winstep,
        winfunc=np.hamming,
    )

    # Add empty initial and final contexts (same dtype so concatenate does not upcast)
    empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
    return np.concatenate((empty_context, features, empty_context))
def is_this_bad_audio(audio_filename, transcript, min_dbfs=-50):
    r"""
    A simple audio validation process to determine if an audio sample may be a good
    candidate for CTC loss, given the audio represented as MFCC frames and the
    transcript represented as a character vector. The average loudness (dBFS) of the
    file is also checked so that silent or near-silent recordings are flagged.

    This function does not remove or modify anything; it only returns a label.

    :param audio_filename: path of the WAV file to inspect.
    :param transcript: transcript text paired with the audio.
    :param min_dbfs: loudness threshold in dBFS; quieter files are flagged
        (default ``-50``).
    :return: one of the following strings:

        * ``"CTCError"``    -- fewer MFCC frames than transcript characters.
        * ``"TransLen<=1"`` -- at most one MFCC frame.
        * ``"dBFS-inf"``    -- loudness is negative infinity (pure silence).
        * ``"dBFS<good"``   -- loudness is below ``min_dbfs``.
        * ``"OK"``          -- none of the checks above fired.
        * ``"FileError"``   -- any exception was raised while reading or encoding.
    """
    numcep = 26
    numcontext = 9
    try:
        padded = audiofile_to_input_vector(audio_filename, numcep=numcep, numcontext=numcontext)
        # Discard the zero context frames added at both ends to get the real frame count.
        features_len = len(padded) - 2 * numcontext
        transcript_len = len(text_to_char_array(transcript, alphabet))

        if features_len < transcript_len:
            return "CTCError"
        # NOTE(review): the label mentions the transcript length, but the check is on
        # the number of audio frames -- confirm which one was intended.
        if features_len <= 1:
            return "TransLen<=1"

        loudness = pydub.AudioSegment.from_wav(audio_filename).dBFS
        if loudness == float("-inf"):
            return "dBFS-inf"
        if loudness < min_dbfs:
            return "dBFS<good"
        return "OK"
    except Exception:
        # Best-effort screening: any read/encode failure marks the sample as bad.
        # KeyboardInterrupt and SystemExit are deliberately allowed to propagate.
        return "FileError"
def standardize_audio_input(audio_filename, audio_filename_dest, sr=8000):
    r"""
    Given an audio file at ``audio_filename``, loads it with librosa at ``sr`` Hz,
    resamples it to 16kHz and saves it to ``audio_filename_dest`` as 16bit PCM.

    :param audio_filename: path of the audio file to read.
    :param audio_filename_dest: path the standardized file is written to.
    :param sr: sample rate passed to ``librosa.load`` before the 16kHz resample
        (default ``8000``).
        NOTE(review): with the default, the audio is reduced to 8kHz before being
        brought back up to 16kHz -- confirm this is intended.
    :return: None; the result is written to ``audio_filename_dest``.
    """
    target_sr = 16000
    audio, fs = librosa.load(audio_filename, sr=sr)
    # Sample rates are passed by keyword: newer librosa releases reject them positionally.
    audio = librosa.resample(audio, orig_sr=fs, target_sr=target_sr, res_type='kaiser_best')
    sf.write(audio_filename_dest, audio, target_sr, subtype='PCM_16')