Source code for amen.audio

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
Audio analysis
'''

import os
import pandas as pd
import numpy as np
import soundfile as sf

import librosa

from .feature import Feature, FeatureCollection
from .timing import TimingList

class Audio(object):
    """
    The base Audio object:
    wraps the output from librosa, and provides access to features.

    Attributes
    ----------
    sample_rate: number
        sample rate
    raw_samples: numpy array
        raw samples from the audio
    analysis_samples: numpy array
        downsampled samples for analysis
    num_channels: integer
        number of channels of the audio
    duration: float
        duration, in seconds
    features: dict
        collection of named feature objects
    """

    def __init__(self, file_path=None, raw_samples=None, convert_to_mono=False,
                 sample_rate=44100, analysis_sample_rate=22050):
        """
        Audio constructor.
        Opens a file path, loads the audio with librosa, and prepares the features.

        Parameters
        ----------
        file_path: string
            path to the audio file to load
        raw_samples: np.array
            samples to use for audio output
        convert_to_mono: boolean (optional)
            converts the file to mono on loading
        sample_rate: number > 0 [scalar] (optional)
            sample rate to pass to librosa
        analysis_sample_rate: number > 0 [scalar] (optional)
            sample rate of the downsampled analysis signal

        Returns
        -------
        An Audio object
        """
        if file_path:
            y, sr = librosa.load(file_path, mono=convert_to_mono, sr=sample_rate)
        elif raw_samples is not None:
            # This assumes that we're passing in raw_samples
            # directly from another Audio's raw_samples.
            y = raw_samples
            sr = sample_rate

        self.file_path = file_path
        self.sample_rate = float(sr)
        self.analysis_sample_rate = float(analysis_sample_rate)
        self.num_channels = y.ndim
        self.duration = librosa.get_duration(y=y, sr=sr)

        self.analysis_samples = librosa.resample(librosa.to_mono(y),
                                                 sr, self.analysis_sample_rate,
                                                 res_type='kaiser_best')
        self.raw_samples = np.atleast_2d(y)

        self.zero_indexes = self._create_zero_indexes()
        self.features = self._create_features()
        self.timings = self._create_timings()

    def __repr__(self):
        file_name = os.path.split(self.file_path)[-1]
        args = file_name, self.duration
        return '<Audio, file: {0:s}, duration: {1:.2f}>'.format(*args)

    def output(self, filename, format=None):
        """
        Write the samples out to the given filename.

        Parameters
        ----------
        filename : str
            The path to write the audio on disk.
            This can be any format supported by `pysoundfile`, including
            `WAV`, `FLAC`, or `OGG` (but not `mp3`).
        format : str
            If provided, explicitly set the output encoding format.
            See `soundfile.available_formats`.
        """
        sf.write(filename, self.raw_samples.T, int(self.sample_rate), format=format)

    def _create_zero_indexes(self):
        """
        Create zero crossing indexes.
        We use these in synthesis, and it is easier to make them here.
        """
        zero_indexes = []
        for channel_index in range(self.num_channels):
            channel = self.raw_samples[channel_index]
            zero_crossings = librosa.zero_crossings(channel)
            zero_index = np.nonzero(zero_crossings)[0]
            zero_indexes.append(zero_index)
        return zero_indexes

    def _create_timings(self):
        """
        Create timings in a timings dict.
        """
        timings = {}
        timings['track'] = TimingList('track', [(0, self.duration)], self)
        timings['beats'] = TimingList('beats', self._get_beats(), self)
        timings['segments'] = TimingList('segments', self._get_segments(), self)
        return timings

    def _get_beats(self):
        """
        Gets beats using librosa's beat tracker.
        """
        _, beat_frames = librosa.beat.beat_track(y=self.analysis_samples,
                                                 sr=self.analysis_sample_rate,
                                                 trim=False)

        # pad beat times to full duration
        f_max = librosa.time_to_frames(self.duration, sr=self.analysis_sample_rate)
        beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)

        # convert frames to times
        beat_times = librosa.frames_to_time(beat_frames, sr=self.analysis_sample_rate)

        # make the list of (start, duration) tuples that TimingList expects
        starts_durs = [(s, t - s) for (s, t) in zip(beat_times, beat_times[1:])]

        return starts_durs

    def _get_segments(self):
        """
        Gets Echo Nest-style segments using librosa's onset detection and backtracking.
        """
        onset_frames = librosa.onset.onset_detect(y=self.analysis_samples,
                                                  sr=self.analysis_sample_rate,
                                                  backtrack=True)

        segment_times = librosa.frames_to_time(onset_frames, sr=self.analysis_sample_rate)

        # make the list of (start, duration) tuples that TimingList expects
        starts_durs = [(s, t - s) for (s, t) in zip(segment_times, segment_times[1:])]

        return starts_durs

    def _create_features(self):
        """
        Creates the FeatureCollection, and loads each feature.

        Returns
        -------
        FeatureCollection
            FeatureCollection with each Amen.Feature object named correctly.
            Note that _get_chroma returns a FeatureCollection of chroma features.
        """
        features = FeatureCollection()
        features['centroid'] = self._get_centroid()
        features['amplitude'] = self._get_amplitude()
        features['timbre'] = self._get_timbre()
        features['chroma'] = self._get_chroma()
        features['tempo'] = self._get_tempo()
        return features

    def _get_centroid(self):
        """
        Gets spectral centroid data from librosa, and returns it as a Feature.

        Returns
        -------
        Feature
        """
        centroids = librosa.feature.spectral_centroid(self.analysis_samples)
        data = self._convert_to_dataframe(centroids, ['spectral_centroid'])
        feature = Feature(data)
        return feature

    def _get_amplitude(self):
        """
        Gets amplitude data from librosa, and returns it as a Feature.

        Returns
        -------
        Feature
        """
        amplitudes = librosa.feature.rmse(self.analysis_samples)
        data = self._convert_to_dataframe(amplitudes, ['amplitude'])
        feature = Feature(data)
        return feature

    def _get_timbre(self):
        """
        Gets timbre (MFCC) data, taking the first 12 MFCCs.
        Note that the keys to the Feature are "mfcc_<index>",
        to avoid having a dict-like object with numeric keys.

        Returns
        -------
        FeatureCollection
        """
        mfccs = librosa.feature.mfcc(y=self.analysis_samples,
                                     sr=self.analysis_sample_rate,
                                     n_mfcc=12)

        feature = FeatureCollection()
        for index, mfcc in enumerate(mfccs):
            data = self._convert_to_dataframe(mfcc, ['timbre'])
            key = 'mfcc_%s' % (index)
            feature[key] = Feature(data)

        return feature

    def _get_chroma(self):
        """
        Gets chroma data from librosa, and returns it as a FeatureCollection
        with 12 features.

        Returns
        -------
        FeatureCollection
        """
        feature = FeatureCollection()
        pitch_names = ['c', 'c#', 'd', 'eb', 'e', 'f', 'f#', 'g', 'ab', 'a', 'bb', 'b']
        chroma_cq = librosa.feature.chroma_cqt(self.analysis_samples)

        for chroma, pitch in zip(chroma_cq, pitch_names):
            data = self._convert_to_dataframe(chroma, [pitch])
            feature[pitch] = Feature(data)

        # Enharmonic aliases
        feature['db'] = feature['c#']
        feature['d#'] = feature['eb']
        feature['gb'] = feature['f#']
        feature['g#'] = feature['ab']
        feature['a#'] = feature['bb']

        return feature

    def _get_tempo(self):
        """
        Gets tempo data from librosa, and returns it as a Feature.
        Note that the tempo feature uses median aggregation,
        as opposed to the default mean.

        Returns
        -------
        Feature
        """
        onset_env = librosa.onset.onset_strength(self.analysis_samples,
                                                 sr=self.analysis_sample_rate)
        tempo = librosa.beat.tempo(onset_envelope=onset_env,
                                   sr=self.analysis_sample_rate,
                                   aggregate=None)
        data = self._convert_to_dataframe(tempo, ['tempo'])
        feature = Feature(data, aggregate=np.median)
        return feature

    @classmethod
    def _convert_to_dataframe(cls, feature_data, columns):
        """
        Take raw librosa feature data, and convert it to a pandas DataFrame.

        Parameters
        ----------
        feature_data: numpy array
            an N by T array, where N is the number of features,
            and T is the number of time dimensions
        columns: list [strings]
            a list of column names of length N, the same as
            the N dimension of feature_data

        Returns
        -------
        pandas.DataFrame
        """
        feature_data = feature_data.transpose()
        frame_numbers = np.arange(len(feature_data))
        indexes = librosa.frames_to_time(frame_numbers)
        indexes = pd.to_timedelta(indexes, unit='s')
        data = pd.DataFrame(data=feature_data, index=indexes, columns=columns)
        return data
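
A minimal usage sketch, not part of the module source above; it assumes a
hypothetical input file 'song.wav' and touches only the attributes and methods
defined in this module (features, timings, and output)::

    from amen.audio import Audio

    # Load and analyze a file at the default 44.1 kHz / 22.05 kHz rates.
    audio = Audio('song.wav')
    print(audio.duration, audio.num_channels)

    # `features` is a FeatureCollection keyed by name; 'chroma' and 'timbre'
    # are nested FeatureCollections, so they are indexed twice.
    centroid = audio.features['centroid']
    c_sharp = audio.features['chroma']['c#']

    # `timings` holds TimingLists for 'track', 'beats', and 'segments'.
    beats = audio.timings['beats']

    # Write the raw samples back out via pysoundfile (WAV, FLAC, OGG, ...).
    audio.output('copy.wav')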