diff --git a/README.md b/README.md index 20813f125360f3941d0cb1657851163514c6e504..9ee0488bc025e68b2482bd69ca19ff1bc8622d82 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ usage: gsrp_tdoa_hyperres.py [-h] [-c CHANNELS] [-i INVERSE] [-f FRAME_SIZE] Arguments ----- ``` +Computes TDOA estimates from a multi-channel recording. + positional arguments: infile The sound file to process. outfile The text or npy file to write results to. Each row @@ -34,6 +36,8 @@ positional arguments: optional arguments: -h, --help show this help message and exit + +Channels: -c CHANNELS, --channels CHANNELS The channels to cross-correlate. Accepts two or more, but beware of high memory use. To be given as a comma- @@ -43,15 +47,23 @@ optional arguments: Inverse the channel. To be given as a comma-separated list of numbers,with 0 referring to the first channel once channels have been chosen by --channels. + +Size settings: -f FRAME_SIZE, --frame-size FRAME_SIZE The size of the cross-correlation frames in seconds (default: 0.2) - -s HOP_SIZE, --hop-size HOP_SIZE + -s STRIDE, --stride STRIDE The step between the beginnings of sequential frames in seconds (default: 0.01), or the postion in second if csv file path is given. -m MAX_TDOA, --max-tdoa MAX_TDOA The maximum TDOA in seconds (default: 0.0011). + -S SECONDS, --start SECONDS + If given, only analyze from the given position. + -E SECONDS, --end SECONDS + If given, only analyze up to the given position. + +Filtering: -l LOW, --low LOW Bottom cutoff frequency. Disabled by default. -u UP, --up UP Top cutoff frequency. Disabled by default. -d DECIMATE, --decimate DECIMATE @@ -59,8 +71,23 @@ optional arguments: default -t, --temporal If given, any decimation will be applied in the time domain instead of the spectral domain. - -e, --erase Erase existing outfile. If outfile existe and --erase + +Other: + -e, --erase Erase existing outfile. If outfile exist and --erase is not provide, the script will exit. - --start SECONDS If given, only analyze from the given position. - --end SECONDS If given, only analyze up to the given position. + -n, --no-hyperres Disable the hyper resolution evalutation of the TDOA + -M {estimate,auto,on-the-fly,prepare}, --mode {estimate,auto,on-the-fly,prepare} + How to explore the TDOA space (default: prepare). + 'prepare' precomputes all the possible TDOA + pairs and then evaluate them. All the results are save + in memory. + 'on-the-fly' compute the TDOA pairs at the same + time as it compute the loss function. Only the maximum + is saved. Can be slower than 'prepare'. + 'estimate' try to reduce the number of TDOA to + evaluate. + 'auto' automatically try to pick the right + method. + +Process finished with exit code 0 ``` diff --git a/c_corr.pyx b/c_corr.pyx index a372c73ddf43136e6f60e32b9d175038ce08a5a0..0e67e22b99dacebe5aded8af5ec073f4f6beb480 100755 --- a/c_corr.pyx +++ b/c_corr.pyx @@ -20,7 +20,7 @@ ctypedef fused npType2: @cython.boundscheck(False) @cython.wraparound(False) -def c_corr(npType1[:,::1] cc, npType2[::1, :] taus): +def c_corr_at(npType1[:,::1] cc, npType2[::1, :] taus): total = taus.shape[1] n_pair = taus.shape[0] cdef double max_corr_val = 0.0 @@ -33,4 +33,48 @@ def c_corr(npType1[:,::1] cc, npType2[::1, :] taus): if corr_val > max_corr_val: max_corr_val = corr_val index = i - return np.log10(max_corr_val), index \ No newline at end of file + return np.log10(max_corr_val), index + + +@cython.boundscheck(False) +@cython.wraparound(False) +def c_corr_all(npType1[:,::1] cc, npType2 max_tdoa, npType2 base_tdoa): + cdef double corr_val = 0.0 + taus = np.zeros(cc.shape[0], int) + out_taus = np.zeros(cc.shape[0], int) + _c_corr_core(cc, taus, out_taus, corr_val, base_tdoa-1, base_tdoa, max_tdoa, cc.shape[1], cc.shape[0]) + return np.log10(corr_val), out_taus + + +@cython.boundscheck(False) +@cython.wraparound(False) +def _c_corr_core(npType1[:,::1] cc, npType2[::1] taus, npType2[::1] out_taus, double val, + npType2 curr_tdoa, npType2 ind_tdoa, npType2 max_tdoa, npType2 win_size, npType2 n_pair): + if curr_tdoa < 0: + cdef int i0 = 1 + cdef int j0 = 0 + for i in range(ind_tdoa, n_pair): + taus[i] = taus[i0] - taus[j0] + i0 += 1 + if i0 >= ind_tdoa: + j0 += 1 + i0 = j0 + 1 + val = cc[0, taus[0]] + for i in range(1, n_pair): + val *= cc[i, taus[i] % win_size] + return val + cdef int low = -max_tdoa + cdef int high = max_tdoa + cdef double tmp_val = 0 + for i in range(curr_tdoa+1, ind_tdoa): + if low < taus[i] - max_tdoa: + low = -max_tdoa + taus[i] + if high > max_tdoa + taus[i]: + high = max_tdoa + taus[i] + for t in range(low, high+1): + taus[curr_tdoa] = t + tmp_val = _c_corr_core(cc, taus, out_taus, val, curr_tdoa-1, ind_tdoa, max_size, win_size, n_pair) + if val < tmp_val: + val = tmp_val + out_taus[:] = taus + return val \ No newline at end of file diff --git a/gsrp_tdoa_hyperres.py b/gsrp_tdoa_hyperres.py index 0365e7d16abc53604eeef75eeacd1ea5c345317e..27f8d05f6dcb5eee4137c2c3cca1913c80d069f6 100755 --- a/gsrp_tdoa_hyperres.py +++ b/gsrp_tdoa_hyperres.py @@ -9,8 +9,9 @@ import numpy as np from numpy.fft import rfft, irfft import scipy.signal as sg import soundfile as sf -from c_corr import c_corr +import c_corr from scipy.signal.windows import tukey + try: from tqdm import trange except ImportError: @@ -41,46 +42,62 @@ def slicer(down, up, ndim, n): return slices[index].reshape(ndim, -1).T -def corr(data, sr, pos, w_size, max_tdoa, decimate=1): +def corr(data, pos, w_size, max_tdoa, decimate=1, mode='prepare', hyper=True): num_channels = data.shape[1] num_channel_pairs = num_channels * (num_channels - 1) // 2 - # apply zero-padding - data = np.pad(data, (((w_size - 1) // 2, (w_size - 1) // 2), (0, 0)), - 'constant') - - # prepare window - win = np.expand_dims(tukey(w_size, 0.2), -1) - - # prepare index matrices + data = np.pad(data, [((w_size - 1) // 2, (w_size - 1) // 2), (0, 0)], 'constant') + win = tukey(w_size, 0.2)[:, np.newaxis] + cc_size = min(w_size, int(2 * max_tdoa // decimate)) v1 = np.empty(num_channel_pairs, np.int8) v2 = np.empty(num_channel_pairs, np.int8) mat = np.zeros((num_channel_pairs, num_channels - 1), np.int8) for k, (i, j) in enumerate(itertools.combinations(range(num_channels), 2)): if i > 0: - mat[k, i-1] = -1 - mat[k, j-1] = 1 + mat[k, i - 1] = -1 + mat[k, j - 1] = 1 v1[k] = i v2[k] = j - cc_size = min(w_size, int(2 * max_tdoa // decimate)) - slices = slicer(-(cc_size // 2), cc_size // 2, (num_channels - 1), 16) - tausf = [] - for j in range(len(slices)): - taus = np.mgrid[slices[j]].reshape(num_channels - 1, -1).astype(np.int16) - taus2 = np.matmul(mat, taus) - tausf += [taus2[:, np.abs(taus2).max(0) <= cc_size // 2]] - tausf = np.hstack(tausf) - dw_size = w_size // decimate - tausf %= dw_size - - # run the TDOA search + if mode == 'prepare': + slices = slicer(-(cc_size // 2), cc_size // 2, (num_channels - 1), 16) + tausf = [] + for j in range(len(slices)): + taus = np.mgrid[slices[j]].reshape(num_channels - 1, -1).astype(np.int16) + taus2 = np.matmul(mat, taus) + tausf += [taus2[:, np.abs(taus2).max(0) <= cc_size // 2]] + tausf = np.hstack(tausf) + dw_size = w_size // decimate + tausf %= dw_size + elif mode == 'on-the-fly': + pass + else: + raise ValueError(f'Unknown mode {mode}') + tdoas = np.zeros((len(pos), num_channels), np.float32) - tdoas2 = np.zeros((len(pos), num_channels -1), np.float32) + + if hyper: # prepare hyper res + tdoas2 = np.zeros((len(pos), num_channels), np.float32) + poly = PolynomialFeatures(2) + lin = LinearRegression() + pipe = Pipeline([('poly', poly), ('lin', lin)]) + ind = np.triu_indices(num_channels - 1) + + def _hyperres(taus, cc): + taus = taus + np.stack(np.meshgrid(*(num_channels - 1) * (np.arange(-2, 3),)), 0).reshape( + num_channels - 1, -1).T + taus = np.matmul(mat, taus.T) + taus = taus[:, np.abs(taus).max(0) <= cc_size // 2] + mean = taus.mean(-1)[:3] + coef = pipe.fit(taus.T[:, :3] - mean, + cc[np.expand_dims(np.arange(num_channel_pairs), 1), taus.astype(np.int)].prod(0) + ).named_steps['lin'].coef_ + der = np.zeros((num_channels - 1, num_channels - 1)) + der[ind] = coef[4:] + poly_min = np.linalg.lstsq(der + der.T, -coef[1:4], rcond=None)[0] + mean + return pipe.predict(poly_min), poly_min + + cc = np.empty((num_channel_pairs, dw_size), np.float32) - poly = PolynomialFeatures(2) - lin = LinearRegression() - pipe = Pipeline([('poly',poly),('lin',lin)]) - ind = np.triu_indices(num_channels -1) for i in trange(len(pos)): fft = rfft(win * data[pos[i]:w_size + pos[i]], axis=0) if decimate > 1: @@ -88,22 +105,24 @@ def corr(data, sr, pos, w_size, max_tdoa, decimate=1): fft = np.asarray(fft, dtype=np.complex64) cc[:] = irfft(fft[:, v2] * np.conj(fft[:, v1]), axis=0).T cc -= cc.min(-1, keepdims=True) - val, index = c_corr(cc, tausf) - tdoas[i, 0] = val - tdoas[i, 1:] = ((tausf[:(num_channels - 1), index] + dw_size//2) % dw_size) - dw_size//2 - - # hyper resolution - taus = tdoas[i,1:num_channels] + np.stack(np.meshgrid(*(num_channels-1) * (np.arange(-2,3),)),0).reshape(num_channels-1, -1).T - taus = np.matmul(mat, taus.T) - taus = taus[:, np.abs(taus).max(0) <= cc_size // 2] - mean = taus.mean(-1)[:3] - coef = pipe.fit((taus).T[:,:3] - mean, - cc[np.expand_dims(np.arange(num_channel_pairs), 1), taus.astype(np.int)].prod(0) - ).named_steps['lin'].coef_ - der = np.zeros((num_channels -1, num_channels - 1)) - der[ind] = coef[4:] - tdoas2[i] = np.linalg.lstsq(der + der.T, -coef[1:4], rcond=None)[0] + mean - return np.hstack((np.expand_dims(pos, -1), tdoas)), np.hstack((np.expand_dims(pos, -1), tdoas2)) + if mode == 'prepare': + tdoas[i, 0], index = c_corr.c_corr_at(cc, tausf) + tdoas[i, 1:] = ((tausf[:(num_channels - 1), index] + dw_size // 2) % dw_size) - dw_size // 2 + elif mode == 'on-the-fly': + tdoas[i, 0], tdoas[i, 1:] = c_corr.c_corr_all(cc, cc_size//2, num_channels - 1) + else: + raise ValueError(f'Unknown mode {mode}') + + if hyper: + tdoas[i, 0], tdoas2[i, 1:] = _hyperres(tdoas[i, 1:], cc) + + if hyper: + return np.hstack((np.expand_dims(pos, -1), tdoas)), np.hstack((np.expand_dims(pos, -1), tdoas2)) + else: + return np.hstack((np.expand_dims(pos, -1), tdoas)) + + + def main(args): @@ -114,7 +133,8 @@ def main(args): # load audio file print(f'Loading {args.infile}...') sr = sf.info(args.infile).samplerate - sound, sr = sf.read(args.infile, start=int(sr*args.start), stop=int(sr*args.end) if args.end is not None else None, + sound, sr = sf.read(args.infile, start=int(sr * args.start), + stop=int(sr * args.end) if args.end is not None else None, dtype=np.float32, always_2d=True) if args.channels is not None: sound = sound[:, args.channels] @@ -132,11 +152,11 @@ def main(args): print("Filtering...") if args.low is not None: if args.highpass is None: - sos = sg.butter(3, 2*args.low/sr, 'highpass', output='sos') + sos = sg.butter(3, 2 * args.low / sr, 'highpass', output='sos') else: - sos = sg.butter(3, [2*args.low/sr, 2*args.up/sr], 'bandpass', output='sos') + sos = sg.butter(3, [2 * args.low / sr, 2 * args.up / sr], 'bandpass', output='sos') else: - sos = sg.butter(3, 2*args.up/sr, 'lowpass', output='sos') + sos = sg.butter(3, 2 * args.up / sr, 'lowpass', output='sos') sound = sg.sosfiltfilt(sos, sound, axis=0) if args.decimate and args.temporal: @@ -144,20 +164,17 @@ def main(args): sr /= args.decimate # Position where the TDOAs are computed - if os.path.isfile(args.hop_size): - pos = (sr * np.loadtxt(args.hop_size, delimiter=',')).astype(int).ravel() + if os.path.isfile(args.stride): + pos = (sr * np.loadtxt(args.stride, delimiter=',')).astype(int).ravel() else: try: - pos = np.arange(0, len(sound), int(sr * float(args.hop_size))) + pos = np.arange(0, len(sound), int(sr * float(args.stride))) except ValueError: - raise ValueError(f'Error: hop size {args.hop_size} is neither an existing file nor a float') - + raise ValueError(f'Error: hop size {args.stride} is neither an existing file nor a float') print("Computing TDOAs...") - result1, result2 = corr(sound, sr, pos, int(sr * args.frame_size), - max_tdoa=int(np.ceil(sr * args.max_tdoa)), - decimate=(args.decimate - if not args.temporal else 1)) + result1 = corr(sound, pos, int(sr * args.frame_size), max_tdoa=int(np.ceil(sr * args.max_tdoa)), + decimate=args.decimate if not args.temporal else 1, mode=args.mode, hyper=not args.no_hyperes) # compute additional non-independent TDOAs additional1 = [(b - a) for a, b in itertools.combinations(result1.T[2:], 2)] @@ -176,7 +193,21 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Computes TDOA estimates from a multi-channel recording.') + class SmartFormatter(argparse.HelpFormatter): + """ + Allow to change argparse formating behaviour for one option by adding \'R|\' at the start of the help + """ + + def _split_lines(self, text, width): + if text.startswith('R|'): + return [l for t in text[2:].splitlines() for l in argparse.HelpFormatter._split_lines(self, t, width)] + # this is the RawTextHelpFormatter._split_lines + return argparse.HelpFormatter._split_lines(self, text, width) + + + parser = argparse.ArgumentParser(description='Computes TDOA estimates from a multi-channel recording.', + formatter_class=SmartFormatter) + parser.add_argument('infile', type=str, help='The sound file to process.') parser.add_argument('outfile', type=str, help='The text or npy file to write results to. Each row gives the ' 'position (in samples), cross-correlation product, the independent ' @@ -184,38 +215,59 @@ if __name__ == "__main__": 'For plots, the panes give the cross-correlation product, ' 'independent TDOAS and derived TDOAs from top to bottom.') - parser.add_argument('-c', '--channels', type=intlist, default=None, + group1 = parser.add_argument_group('Channels') + group1.add_argument('-c', '--channels', type=intlist, default=None, help='The channels to cross-correlate. Accepts two or more, but beware of high memory use. To ' 'be given as a comma-separated list of numbers, with 0 referring to the first channel ' '(default: all channels).') - parser.add_argument('-i', '--inverse', type=intlist, default=None, + group1.add_argument('-i', '--inverse', type=intlist, default=None, help='Inverse the channel. To be given as a comma-separated list of numbers,' 'with 0 referring to the first channel once channels have been chosen by --channels.') - parser.add_argument('-f', '--frame-size', type=float, default=0.2, + group2 = parser.add_argument_group('Size settings') + group2.add_argument('-f', '--frame-size', type=float, default=0.2, help='The size of the cross-correlation frames in seconds (default: %(default)s)') - parser.add_argument('-s', '--hop-size', type=str, default=0.01, + group2.add_argument('-s', '--stride', type=str, default=0.01, help='The step between the beginnings of sequential frames in seconds (default: %(default)s), ' 'or the postion in second if csv file path is given.') - parser.add_argument('-m', '--max-tdoa', type=float, default=0.0011, + group2.add_argument('-m', '--max-tdoa', type=float, default=0.0011, help='The maximum TDOA in seconds (default: %(default)s).') - parser.add_argument('-l', '--low', type=float, default=None, help='Bottom cutoff frequency. Disabled by default.') - parser.add_argument('-u', '--up', type=float, default=None, help='Top cutoff frequency. Disabled by default.') + group2.add_argument('-S', '--start', metavar='SECONDS', type=float, default=0, + help='If given, only analyze from the given position.') + group2.add_argument('-E', '--end', metavar='SECONDS', type=float, default=None, + help='If given, only analyze up to the given position.') - parser.add_argument('-d', '--decimate', type=int, default=1, help='Downsample the signal by the given factor. ' + group3 = parser.add_argument_group('Filtering') + group3.add_argument('-l', '--low', type=float, default=None, help='Bottom cutoff frequency. Disabled by default.') + group3.add_argument('-u', '--up', type=float, default=None, help='Top cutoff frequency. Disabled by default.') + group3.add_argument('-d', '--decimate', type=int, default=1, help='Downsample the signal by the given factor. ' 'Disabled by default') - parser.add_argument('-t', '--temporal', action='store_true', help='If given, any decimation will be applied in the ' + group3.add_argument('-t', '--temporal', action='store_true', help='If given, any decimation will be applied in the ' 'time domain instead of the spectral domain.') - parser.add_argument('-e', '--erase', action='store_false', help='Erase existing outfile. If outfile existe and ' + group4 = parser.add_argument_group('Other') + group4.add_argument('-e', '--erase', action='store_false', help='Erase existing outfile. If outfile exist and ' '--erase is not provide, the script will exit.') - - parser.add_argument('--start', metavar='SECONDS', type=float, default=0, - help='If given, only analyze from the given position.') - parser.add_argument('--end', metavar='SECONDS', type=float, default=None, - help='If given, only analyze up to the given position.') + group4.add_argument('-n', '--no-hyperres', action='store_True', help='Disable the hyper resolution evalutation of ' + 'the TDOA') + group4.add_argument('-M', '--mode', choices={'prepare', 'on-the-fly', 'estimate', 'auto'}, default='prepare', + help=f'R|How to explore the TDOA space (default: %(default)s).\n' + f'{BColors.BOLD}prepare{BColors.ENDC} precomputes all the possible TDOA pairs and then ' + f'evaluate them. All the results are save in memory.\n' + f'{BColors.BOLD}on-the-fly{BColors.ENDC} compute the TDOA pairs at the same time as it ' + f'compute the loss function. Only the maximum is saved. Can be slower than ' + f'{BColors.BOLD}prepare{BColors.ENDC}.\n' + f'{BColors.BOLD}estimate{BColors.ENDC} try to reduce the number of TDOA to evaluate.\n' + f'{BColors.BOLD}auto{BColors.ENDC} automatically try to pick the right method.') args = parser.parse_args() + try: + if args.mode in ['estimate', 'auto']: + raise NotImplementedError(f'mode {args.mode} is not yet implemented') + + sys.exit(main(args)) - sys.exit(main(args)) + except Exception as e: + print(e) + sys.exit(2)