Source code for amen.feature

#!/usr/bin/env python
'''Container classes for feature analysis'''

import numpy as np
import pandas as pd
import six

from .timing import TimeSlice
from .exceptions import FeatureError

class Feature(object):
    """
    Core feature container object.

    Wraps a ``pandas.DataFrame`` and exposes its first column through the
    iteration, indexing and ``len`` protocols. Also handles time-slicing.

    Attributes
    ----------
    data : pandas.DataFrame
        The wrapped data frame.
    aggregate : callable
        Invoked as ``aggregate(frame, axis=0)`` when resampling in `at`.
    base : Feature or None
        Feature that resampling requests are forwarded to, if any.
    time_slices : sequence or None
        Time slices associated with the rows of ``data``.
    name
        Label of the first column of ``data``.

    Methods
    -------
    at(time_slices)
        Resample the feature at the given TimeSlices
    with_time()
        Iterate over ``(time_slice, value)`` pairs
    """

    def __init__(self, data, aggregate=np.mean, base=None, time_slices=None):
        """
        Constructor for feature object

        Parameters
        ----------
        data : pandas.DataFrame
            Time-indexed data frame of features
        aggregate : callable
            Resample-aggregation function, called with ``axis=0``
        base : Feature, optional
            Feature this one is derived from; `at` delegates to it
        time_slices : sequence, optional
            Time slices matching the rows of ``data``; needed by `with_time`

        Returns
        -------
        A Feature object
        """
        # Argument type check
        assert isinstance(data, pd.DataFrame)

        self.time_slices = time_slices
        self.aggregate = aggregate
        self.data = data

        # NOTE: only the first column is exposed, although pandas itself
        # supports several keys per frame. Open question from the original
        # author: should FeatureCollection be replaced by a multi-column
        # frame instead?
        self.name = data.keys()[0]

        if base is not None:
            assert isinstance(base, Feature)
        self.base = base

    def _series(self):
        """Return the single column of ``data`` that this feature exposes."""
        return self.data[self.name]

    def __repr__(self):
        return '<Feature, {0}>'.format(self.name)

    def __iter__(self):
        """
        Iterate over the values of the feature column.
        """
        for value in self._series():
            yield value

    def __getitem__(self, x):
        """
        Index into the feature column; ``x`` is handed to pandas unchanged.
        """
        return self._series()[x]

    def __len__(self):
        """
        Number of entries in the feature column.
        """
        return len(self._series())

    def with_time(self):
        """
        Iterate over ``(time_slice, value)`` pairs.

        Raises
        ------
        FeatureError
            If the feature has no associated time slices.
        """
        if self.time_slices is None:
            raise FeatureError("Feature has no time reference.")

        for position, value in enumerate(self._series()):
            yield (self.time_slices[position], value)

    def at(self, time_slices):
        """
        Resample the data at a new time slice index.

        Parameters
        ----------
        time_slices : TimeSlice or TimeSlice collection
            The time slices at which to index this feature object

        Returns
        -------
        Feature
            The resampled feature data
        """
        # A feature that has a base forwards the request to that base.
        if self.base is not None:
            return self.base.at(time_slices)

        # Accept a single slice as well as a collection of slices.
        if isinstance(time_slices, TimeSlice):
            time_slices = [time_slices]

        source = self.data
        resampled = pd.DataFrame(columns=source.columns)

        for window in time_slices:
            # Rows whose index lies in [time, time + duration) form the slice.
            after_start = window.time <= source.index
            before_end = source.index < window.time + window.duration
            selected = source[after_start & before_end]
            resampled.loc[window.time] = self.aggregate(selected, axis=0)

        return Feature(data=resampled,
                       aggregate=self.aggregate,
                       base=self,
                       time_slices=time_slices)
class FeatureCollection(dict):
    """
    A dictionary of features.

    Delegates `.at` to the features it contains.
    Allows for selection of multiple keys, which returns a smaller feature
    collection.

    Note that iteration and ``len`` are overridden: they operate on the rows
    of the contained features, not on the dictionary keys. Row counts and
    time slices are taken from the first feature in the collection.
    """

    def _first_feature(self):
        """Return the first stored feature, or None for an empty collection."""
        for feature in self.values():
            return feature
        return None

    def _row(self, position):
        """Return ``{key: value}`` for row ``position`` of every feature."""
        # .iloc keeps the lookup positional whatever the index labels are;
        # plain ``series[position]`` would be a label lookup on integer
        # indexes and relies on a deprecated fallback on other indexes.
        return {key: feature.data[feature.name].iloc[position]
                for key, feature in self.items()}

    def at(self, time_slices):
        """
        Resample each feature at a new time slice index.

        Parameters
        ----------
        time_slices : TimeSlice or TimeSlice collection
            The time slices at which to index this feature object

        Returns
        -------
        new_features : FeatureCollection
            The resampled feature data
        """
        new_features = FeatureCollection()
        for key, feature in self.items():
            new_features[key] = feature.at(time_slices)
        return new_features

    def __iter__(self):
        """
        Wrapper to avoid making the user deal with parallel lists.

        Yields one ``{key: value}`` dict per row. An empty collection
        yields nothing.
        """
        first = self._first_feature()
        if first is None:
            return

        for position in range(len(first)):
            yield self._row(position)

    def __len__(self):
        """
        Wrapper to avoid making the user deal with parallel lists.

        Returns the length of the first feature (not the number of keys),
        or 0 for an empty collection.
        """
        first = self._first_feature()
        return 0 if first is None else len(first)

    def with_time(self):
        """
        Allows iteration over a time-indexed feature and the associated
        timeslices.

        Yields ``(time_slice, {key: value})`` pairs, using the time slices
        of the first feature.

        Raises
        ------
        FeatureError
            If the collection is empty or its first feature has no time
            slices.
        """
        first = self._first_feature()
        time_slices = None if first is None else first.time_slices

        if time_slices is None:
            raise FeatureError("FeatureCollection has no time reference.")

        for position in range(len(first)):
            yield (time_slices[position], self._row(position))

    def get(self, keys):
        """
        Get a subset of the keys in the current feature collection.

        Unlike ``dict.get``, this takes one key or a list of keys and always
        returns a collection; keys that are not present are skipped.

        Parameters
        ----------
        keys : A string or list of strings
            The keys to return from the current feature collection

        Returns
        -------
        new_features : FeatureCollection
            The subset of keys
        """
        if isinstance(keys, six.string_types):
            keys = [keys]

        new_features = FeatureCollection()
        for key in keys:
            if key in self:
                new_features[key] = self[key]
        return new_features