#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Audio analysis
'''
import os
import pandas as pd
import numpy as np
import soundfile as sf
import librosa
from .feature import Feature, FeatureCollection
from .timing import TimingList
class Audio(object):
    """
    The base Audio object: wraps the output from librosa, and provides access to features

    Attributes
    ----------
    file_path: string or None
        path the audio was loaded from (None when built from raw samples)
    sample_rate: float
        sample rate of `raw_samples`
    analysis_sample_rate: float
        sample rate of `analysis_samples`
    raw_samples: numpy array
        raw samples from the audio, always 2-D (one row per channel)
    analysis_samples: numpy array
        mono, resampled samples used for analysis
    num_channels: integer
        number of channels of the audio
    duration: float
        duration, in seconds
    zero_indexes: list of numpy arrays
        per-channel sample indexes of zero crossings
    features: FeatureCollection
        collection of named feature objects
    timings: dict
        TimingList objects keyed by 'track', 'beats' and 'segments'
    """

    def __init__(self, file_path=None, raw_samples=None, convert_to_mono=False,
                 sample_rate=44100, analysis_sample_rate=22050):
        """
        Audio constructor.
        Opens a file path, loads the audio with librosa, and prepares the features

        Parameters
        ----------
        file_path: string
            path to the audio file to load. Takes precedence over
            `raw_samples` when both are given.
        raw_samples: np.array
            samples to use for audio output
        convert_to_mono: boolean
            (optional) converts the file to mono on loading
        sample_rate: number > 0 [scalar]
            (optional) sample rate to pass to librosa.
        analysis_sample_rate: number > 0 [scalar]
            (optional) sample rate the analysis samples are resampled to.

        Returns
        ------
        An Audio object

        Raises
        ------
        ValueError
            if neither `file_path` nor `raw_samples` is provided
        """
        if file_path:
            y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
        elif raw_samples is not None:
            # This assumes that we're passing in raw_samples
            # directly from another Audio's raw_samples.
            y = raw_samples
            sr = sample_rate
        else:
            raise ValueError('Audio requires either file_path or raw_samples')

        self.file_path = file_path
        self.sample_rate = float(sr)
        self.analysis_sample_rate = float(analysis_sample_rate)

        # Always store samples as (channels, samples). The channel count is
        # taken from that 2-D view rather than from y.ndim: another Audio's
        # mono raw_samples has shape (1, N), whose ndim of 2 would otherwise
        # be mistaken for two channels.
        self.raw_samples = np.atleast_2d(y)
        self.num_channels = self.raw_samples.shape[0]

        self.duration = librosa.get_duration(y=y, sr=sr)
        # Sample rates are passed by keyword: newer librosa releases reject
        # them positionally, and older releases accept the keywords too.
        self.analysis_samples = librosa.resample(librosa.to_mono(y),
                                                 orig_sr=sr,
                                                 target_sr=self.analysis_sample_rate,
                                                 res_type='kaiser_best')

        self.zero_indexes = self._create_zero_indexes()
        self.features = self._create_features()
        self.timings = self._create_timings()

    def __repr__(self):
        # file_path is None when the object was built from raw samples;
        # os.path.split(None) would raise, so fall back to a placeholder.
        if self.file_path is None:
            file_name = 'None'
        else:
            file_name = os.path.split(self.file_path)[-1]
        args = file_name, self.duration
        return '<Audio, file: {0:s}, duration: {1:.2f}>'.format(*args)

    def output(self, filename, format=None):
        """
        Write the samples out to the given filename.

        Parameters
        ----------
        filename : str
            The path to write the audio on disk.
            This can be any format supported by `pysoundfile`, including
            `WAV`, `FLAC`, or `OGG` (but not `mp3`).
        format : str
            If provided, explicitly set the output encoding format.
            See `soundfile.available_formats`.
        """
        # raw_samples is (channels, samples); it is transposed before writing.
        sf.write(filename, self.raw_samples.T, int(self.sample_rate), format=format)

    def _create_zero_indexes(self):
        """
        Create zero crossing indexes.
        We use these in synthesis, and it is easier to make them here.

        Returns
        -----
        list
            one numpy array of zero-crossing sample indexes per channel
        """
        return [np.nonzero(librosa.zero_crossings(channel))[0]
                for channel in self.raw_samples]

    def _create_timings(self):
        """
        Create timings in a timings dict.

        Returns
        -----
        dict
            TimingList objects keyed by 'track', 'beats' and 'segments'
        """
        return {
            'track': TimingList('track', [(0, self.duration)], self),
            'beats': TimingList('beats', self._get_beats(), self),
            'segments': TimingList('segments', self._get_segments(), self),
        }

    def _get_beats(self):
        """
        Gets beats using librosa's beat tracker.

        Returns
        -----
        list
            (start, duration) tuples, in seconds
        """
        _, beat_frames = librosa.beat.beat_track(y=self.analysis_samples,
                                                 sr=self.analysis_sample_rate,
                                                 trim=False)

        # pad beat times to full duration
        f_max = librosa.time_to_frames(self.duration, sr=self.analysis_sample_rate)
        beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)

        # convert frames to times
        beat_times = librosa.frames_to_time(beat_frames, sr=self.analysis_sample_rate)

        # make the list of (start, duration) tuples that TimingList expects
        return [(start, end - start)
                for (start, end) in zip(beat_times, beat_times[1:])]

    def _get_segments(self):
        """
        Gets Echo Nest style segments using librosa's onset detection and backtracking.

        Returns
        -----
        list
            (start, duration) tuples, in seconds, one per pair of
            consecutive onsets
        """
        onset_frames = librosa.onset.onset_detect(y=self.analysis_samples,
                                                  sr=self.analysis_sample_rate,
                                                  backtrack=True)
        segment_times = librosa.frames_to_time(onset_frames,
                                               sr=self.analysis_sample_rate)

        # make the list of (start, duration) tuples that TimingList expects
        return [(start, end - start)
                for (start, end) in zip(segment_times, segment_times[1:])]

    def _create_features(self):
        """
        Creates the FeatureCollection, and loads each feature.

        Returns
        -----
        FeatureCollection
            FeatureCollection with each Amen.Feature object named correctly.
            Note that _get_timbre and _get_chroma each return a nested
            FeatureCollection.
        """
        features = FeatureCollection()
        features['centroid'] = self._get_centroid()
        features['amplitude'] = self._get_amplitude()
        features['timbre'] = self._get_timbre()
        features['chroma'] = self._get_chroma()
        features['tempo'] = self._get_tempo()
        return features

    def _get_centroid(self):
        """
        Gets spectral centroid data from librosa, and returns it as a Feature

        Returns
        -----
        Feature
        """
        # sr is passed explicitly so the centroid frequencies match the
        # analysis sample rate instead of librosa's default.
        centroids = librosa.feature.spectral_centroid(y=self.analysis_samples,
                                                      sr=self.analysis_sample_rate)
        data = self._convert_to_dataframe(centroids, ['spectral_centroid'],
                                          sr=self.analysis_sample_rate)
        return Feature(data)

    def _get_amplitude(self):
        """
        Gets amplitude data from librosa, and returns it as a Feature

        Returns
        -----
        Feature
        """
        # librosa renamed `rmse` to `rms`; use whichever this version provides.
        rms_fn = getattr(librosa.feature, 'rms', None)
        if rms_fn is None:
            rms_fn = librosa.feature.rmse
        amplitudes = rms_fn(y=self.analysis_samples)
        data = self._convert_to_dataframe(amplitudes, ['amplitude'],
                                          sr=self.analysis_sample_rate)
        return Feature(data)

    def _get_timbre(self):
        """
        Gets timbre (MFCC) data, taking the first 12.
        Note that the keys to the FeatureCollection are "mfcc_<index>",
        to avoid having a dict-like object with numeric keys.

        Returns
        -----
        FeatureCollection
        """
        mfccs = librosa.feature.mfcc(y=self.analysis_samples,
                                     sr=self.analysis_sample_rate,
                                     n_mfcc=12)
        feature = FeatureCollection()
        for index, mfcc in enumerate(mfccs):
            data = self._convert_to_dataframe(mfcc, ['timbre'],
                                              sr=self.analysis_sample_rate)
            key = 'mfcc_%s' % (index)
            feature[key] = Feature(data)
        return feature

    def _get_chroma(self):
        """
        Gets chroma data from librosa, and returns it as a FeatureCollection,
        with 12 features (plus enharmonic aliases for the flat/sharp names).

        Returns
        -----
        FeatureCollection
        """
        feature = FeatureCollection()
        pitch_names = ['c', 'c#', 'd', 'eb', 'e', 'f', 'f#', 'g', 'ab', 'a', 'bb', 'b']
        chroma_cq = librosa.feature.chroma_cqt(y=self.analysis_samples,
                                               sr=self.analysis_sample_rate)
        for chroma, pitch in zip(chroma_cq, pitch_names):
            data = self._convert_to_dataframe(chroma, [pitch],
                                              sr=self.analysis_sample_rate)
            feature[pitch] = Feature(data)

        # Enharmonic aliases
        feature['db'] = feature['c#']
        feature['d#'] = feature['eb']
        feature['gb'] = feature['f#']
        feature['g#'] = feature['ab']
        feature['a#'] = feature['bb']
        return feature

    def _get_tempo(self):
        """
        Gets tempo data from librosa, and returns it as a Feature.
        Note that the tempo feature uses median aggregation, as opposed to the
        default mean.

        Returns
        -----
        Feature
        """
        onset_env = librosa.onset.onset_strength(y=self.analysis_samples,
                                                 sr=self.analysis_sample_rate)
        # NOTE(review): newer librosa releases appear to expose this function
        # under librosa.feature as well - confirm before upgrading librosa.
        tempo = librosa.beat.tempo(onset_envelope=onset_env,
                                   sr=self.analysis_sample_rate,
                                   aggregate=None)
        data = self._convert_to_dataframe(tempo, ['tempo'],
                                          sr=self.analysis_sample_rate)
        return Feature(data, aggregate=np.median)

    @classmethod
    def _convert_to_dataframe(cls, feature_data, columns, sr=22050):
        """
        Take raw librosa feature data, convert to a pandas dataframe.

        Parameters
        ---------
        feature_data: numpy array
            a N by T array, where N is the number of features, and T is the number of time dimensions
        columns: list [strings]
            a list of column names of length N, the same as the N dimension of feature_data
        sr: number > 0 [scalar]
            (optional) sample rate used to turn frame numbers into times.
            librosa.frames_to_time's default hop length is used.

        Returns
        -----
        pandas.DataFrame
            one row per frame, indexed by the frame's time as a timedelta
        """
        feature_data = feature_data.transpose()
        frame_numbers = np.arange(len(feature_data))
        frame_times = librosa.frames_to_time(frame_numbers, sr=sr)
        index = pd.to_timedelta(frame_times, unit='s')
        return pd.DataFrame(data=feature_data, index=index, columns=columns)