
elapid.train_test_split

Methods for geographically splitting data into train/test splits

BufferedLeaveOneOut

Bases: BaseCrossValidator

Leave-one-out CV that excludes training points within a buffered distance.

Source code in elapid/train_test_split.py
class BufferedLeaveOneOut(BaseCrossValidator):
    """Leave-one-out CV that excludes training points within a buffered distance."""

    def __init__(self, distance: float):
        """Buffered leave-one-out cross-validation strategy.

        Drops points from the training data based on a buffered distance
            to the left-out test point(s). Implemented from Ploton et al. 2020,
            https://www.nature.com/articles/s41467-020-18321-y

        Args:
            distance: drop training data points within this distance of test data.
        """
        self.distance = distance

    def _group_idxs(
        self, points: Vector, class_label: str = None, groups: str = None, count: bool = False
    ) -> List[int]:
        """Get test indices for grouped train/test splits."""
        if class_label is not None:
            in_class = points[class_label] == 1
            points = points.iloc[in_class]

        unique = points[groups].unique()
        if count:
            return len(unique)

        all_idxs = np.arange(len(points))
        test_idxs = []
        for group in unique:
            in_group = points[groups] == group
            test_idxs.append(all_idxs[in_group])

        return test_idxs

    def _point_idxs(self, points: Vector, class_label: str = None, count: bool = False) -> List[int]:
        """Get test indices for single point train/test splits."""
        if class_label is None:
            if count:
                return len(points)
            else:
                return range(len(points))

        else:
            in_class = points[class_label] == 1
            if count:
                return in_class.sum()
            else:
                return np.where(in_class)[0]

    def _iter_test_indices(self, points: Vector, class_label: str = None, groups: str = None, y: None = None):
        """Generate indices for test data samples."""
        if groups is None:
            test_idxs = self._point_idxs(points, class_label)

        else:
            test_idxs = self._group_idxs(points, class_label, groups)

        for indices in test_idxs:
            yield indices

    def _iter_test_masks(self, points: Vector, class_label: str = None, groups: str = None):
        """Generates boolean masks corresponding to test sets."""
        for test_index in self._iter_test_indices(points, class_label, groups):
            test_mask = np.zeros(_num_samples(points), dtype=bool)
            test_mask[test_index] = True
            yield test_mask

    def split(self, points: Vector, class_label: str = None, groups: str = None) -> Tuple[np.ndarray, np.ndarray]:
        """Split point data into train/test folds and return their array indices.

        Default behaviour is to perform leave-one-out cross-validation, meaning
            there will be as many train/test splits as there are samples.
            To run leave-one-out splits for each y==1 sample, use the
            `class_label` parameter to define which column includes the class
            to leave out. To run a grouped leave-one-out, use the `groups`
            parameter to define which column includes unique IDs to group by.

        Args:
            points: point-format GeoSeries or GeoDataFrame.
            class_label: column to specify presence locations (y==1).
            groups: column to group train/test splits by.

        Yields:
            (train_idxs, test_idxs) the train/test splits for each fold.
        """
        n_samples = len(points)
        indices = np.arange(n_samples)
        for test_index in self._iter_test_masks(points, class_label, groups):
            train_idx = indices[np.logical_not(test_index)]
            test_idx = indices[test_index]
            train_pts = points.iloc[train_idx]
            test_pts = points.iloc[test_idx]
            distances = nearest_point_distance(test_pts, train_pts)
            in_range = distances > self.distance
            buffered_train_idx = train_idx[in_range]
            yield buffered_train_idx, test_idx

    def get_n_splits(self, points: Vector, class_label: str = None, groups: str = None) -> int:
        """Return the number of splitting iterations in the cross-validator.

        Args:
            points: point-format GeoSeries or GeoDataFrame.
            class_label: column to specify presence locations (y==1).
            groups: column to group train/test splits by.

        Returns:
            Splitting iteration count.
        """
        if groups is None:
            return self._point_idxs(points, class_label, count=True)
        else:
            return self._group_idxs(points, class_label, groups, count=True)

__init__(distance)

Buffered leave-one-out cross-validation strategy.

Drops points from the training data based on a buffered distance to the left-out test point(s). Implemented from Ploton et al. 2020, www.nature.com/articles/s41467-020-18321-y

Parameters:

    distance (float): drop training data points within this distance of test data. Required.
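
A minimal usage sketch (not part of the module source). The toy GeoDataFrame, its CRS, and the "class" column name are illustrative assumptions; only the BufferedLeaveOneOut constructor and split signature come from the code shown here.

import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from elapid.train_test_split import BufferedLeaveOneOut

# toy presence/background points in a projected CRS (units: meters)
rng = np.random.default_rng(42)
xy = rng.uniform(0, 10_000, size=(200, 2))
points = gpd.GeoDataFrame(
    {"class": rng.integers(0, 2, size=200)},  # 1 = presence, 0 = background
    geometry=[Point(x, y) for x, y in xy],
    crs="EPSG:32610",
)

# hold out one presence record per fold; training points within 1 km of it are dropped
bloo = BufferedLeaveOneOut(distance=1_000)
for train_idx, test_idx in bloo.split(points, class_label="class"):
    train, test = points.iloc[train_idx], points.iloc[test_idx]
    # fit and evaluate a model on each train/test fold here
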
Source code in elapid/train_test_split.py
def __init__(self, distance: float):
    """Buffered leave-one-out cross-validation strategy.

    Drops points from the training data based on a buffered distance
        to the left-out test point(s). Implemented from Ploton et al. 2020,
        https://www.nature.com/articles/s41467-020-18321-y

    Args:
        distance: drop training data points within this distance of test data.
    """
    self.distance = distance

get_n_splits(points, class_label=None, groups=None)

Return the number of splitting iterations in the cross-validator.

Parameters:

    points (Vector): point-format GeoSeries or GeoDataFrame. Required.
    class_label (str): column to specify presence locations (y==1). Default: None.
    groups (str): column to group train/test splits by. Default: None.

Returns:

    int: Splitting iteration count.
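
A short sketch of how the split count behaves. The toy frame and its "class" column are illustrative assumptions, not from the source.

import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from elapid.train_test_split import BufferedLeaveOneOut

rng = np.random.default_rng(0)
points = gpd.GeoDataFrame(
    {"class": [1, 1, 1, 0, 0]},
    geometry=[Point(x, y) for x, y in rng.uniform(0, 1_000, size=(5, 2))],
    crs="EPSG:32610",
)

bloo = BufferedLeaveOneOut(distance=100)
print(bloo.get_n_splits(points))                       # 5: one fold per point
print(bloo.get_n_splits(points, class_label="class"))  # 3: one fold per presence (class == 1) record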

Source code in elapid/train_test_split.py
def get_n_splits(self, points: Vector, class_label: str = None, groups: str = None) -> int:
    """Return the number of splitting iterations in the cross-validator.

    Args:
        points: point-format GeoSeries or GeoDataFrame.
        class_label: column to specify presence locations (y==1).
        groups: column to group train/test splits by.

    Returns:
        Splitting iteration count.
    """
    if groups is None:
        return self._point_idxs(points, class_label, count=True)
    else:
        return self._group_idxs(points, class_label, groups, count=True)

split(points, class_label=None, groups=None)

Split point data into train/test folds and return their array indices.

Default behaviour is to perform leave-one-out cross-validation, meaning there will be as many train/test splits as there are samples. To run leave-one-out splits for each y==1 sample, use the class_label parameter to define which column includes the class to leave out. To run a grouped leave-one-out, use the groups parameter to define which column includes unique IDs to group by.

Parameters:

    points (Vector): point-format GeoSeries or GeoDataFrame. Required.
    class_label (str): column to specify presence locations (y==1). Default: None.
    groups (str): column to group train/test splits by. Default: None.

Yields:

    Tuple[np.ndarray, np.ndarray]: (train_idxs, test_idxs) the train/test splits for each fold.
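
A sketch of a grouped leave-one-out run. The "site" column and toy data are illustrative assumptions; one fold is generated per unique site ID, and training points within the buffer distance of the held-out points are dropped.

import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from elapid.train_test_split import BufferedLeaveOneOut

# toy points tagged with a "site" ID; one fold is generated per unique site
rng = np.random.default_rng(1)
xy = rng.uniform(0, 10_000, size=(60, 2))
points = gpd.GeoDataFrame(
    {"site": rng.integers(0, 6, size=60)},
    geometry=[Point(x, y) for x, y in xy],
    crs="EPSG:32610",
)

bloo = BufferedLeaveOneOut(distance=500)
for train_idx, test_idx in bloo.split(points, groups="site"):
    # all points from one site are held out; training points within 500 m
    # of any held-out point are also dropped
    train, test = points.iloc[train_idx], points.iloc[test_idx]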

Source code in elapid/train_test_split.py
def split(self, points: Vector, class_label: str = None, groups: str = None) -> Tuple[np.ndarray, np.ndarray]:
    """Split point data into train/test folds and return their array indices.

    Default behaviour is to perform leave-one-out cross-validation, meaning
        there will be as many train/test splits as there are samples.
        To run leave-one-out splits for each y==1 sample, use the
        `class_label` parameter to define which column includes the class
        to leave out. To run a grouped leave-one-out, use the `groups`
        parameter to define which column includes unique IDs to group by.

    Args:
        points: point-format GeoSeries or GeoDataFrame.
        class_label: column to specify presence locations (y==1).
        groups: column to group train/test splits by.

    Yields:
        (train_idxs, test_idxs) the train/test splits for each fold.
    """
    n_samples = len(points)
    indices = np.arange(n_samples)
    for test_index in self._iter_test_masks(points, class_label, groups):
        train_idx = indices[np.logical_not(test_index)]
        test_idx = indices[test_index]
        train_pts = points.iloc[train_idx]
        test_pts = points.iloc[test_idx]
        distances = nearest_point_distance(test_pts, train_pts)
        in_range = distances > self.distance
        buffered_train_idx = train_idx[in_range]
        yield buffered_train_idx, test_idx

GeographicKFold

Bases: BaseCrossValidator

Compute geographically-clustered train/test folds using KMeans clustering

Source code in elapid/train_test_split.py
class GeographicKFold(BaseCrossValidator):
    """Compute geographically-clustered train/test folds using KMeans clustering"""

    def __init__(self, n_splits: int = 4):
        """Cluster x/y points into separate cross-validation folds.

        Args:
            n_splits: Number of geographic clusters to split the data into.
        """
        self.n_splits = n_splits

    def _iter_test_indices(self, points: Vector, y: None = None, groups: None = None):
        """Generate indices for test data samples."""
        kmeans = KMeans(n_clusters=self.n_splits)
        xy = np.array(list(zip(points.geometry.x, points.geometry.y)))
        kmeans.fit(xy)
        clusters = kmeans.predict(xy)
        indices = np.arange(len(xy))
        for cluster in range(self.n_splits):
            test = clusters == cluster
            yield indices[test]

    def split(self, points: Vector) -> Tuple[np.ndarray, np.ndarray]:
        """Split point data into geographically-clustered train/test folds and
            return their array indices.

        Args:
            points: point-format GeoSeries or GeoDataFrame.

        Yields:
            (train_idxs, test_idxs) the train/test splits for each geo fold.
        """
        for train, test in super().split(points):
            yield train, test

    def get_n_splits(self) -> int:
        """Return the number of splitting iterations in the cross-validator.

        Returns:
            Splitting iteration count.
        """
        return self.n_splits

__init__(n_splits=4)

Cluster x/y points into separate cross-validation folds.

Parameters:

    n_splits (int): Number of geographic clusters to split the data into. Default: 4.
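
A minimal usage sketch (the toy occurrence points and CRS are illustrative assumptions, not part of the source).

import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from elapid.train_test_split import GeographicKFold

# toy occurrence points in a projected CRS
rng = np.random.default_rng(7)
xy = rng.uniform(0, 100_000, size=(500, 2))
points = gpd.GeoDataFrame(geometry=[Point(x, y) for x, y in xy], crs="EPSG:32610")

gkf = GeographicKFold(n_splits=4)
print(gkf.get_n_splits())  # 4
for train_idx, test_idx in gkf.split(points):
    # each fold holds out one KMeans cluster of points as the test set
    train, test = points.iloc[train_idx], points.iloc[test_idx]
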
Source code in elapid/train_test_split.py
def __init__(self, n_splits: int = 4):
    """Cluster x/y points into separate cross-validation folds.

    Args:
        n_splits: Number of geographic clusters to split the data into.
    """
    self.n_splits = n_splits

get_n_splits()

Return the number of splitting iterations in the cross-validator.

Returns:

    int: Splitting iteration count.

Source code in elapid/train_test_split.py
def get_n_splits(self) -> int:
    """Return the number of splitting iterations in the cross-validator.

    Returns:
        Splitting iteration count.
    """
    return self.n_splits

split(points)

Split point data into geographically-clustered train/test folds and return their array indices.

Parameters:

    points (Vector): point-format GeoSeries or GeoDataFrame. Required.

Yields:

    Tuple[np.ndarray, np.ndarray]: (train_idxs, test_idxs) the train/test splits for each geo fold.
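
A sketch showing that fold sizes follow the spatial clusters rather than splitting the data evenly. The two synthetic point clouds below are illustrative assumptions.

import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from elapid.train_test_split import GeographicKFold

# two spatially separated clouds of points: the geographic folds follow the
# clusters, so fold sizes are typically uneven (unlike a standard KFold)
rng = np.random.default_rng(3)
west = rng.normal(loc=(10_000, 10_000), scale=500, size=(80, 2))
east = rng.normal(loc=(90_000, 90_000), scale=500, size=(40, 2))
xy = np.vstack([west, east])
points = gpd.GeoDataFrame(geometry=[Point(x, y) for x, y in xy], crs="EPSG:32610")

gkf = GeographicKFold(n_splits=2)
for train_idx, test_idx in gkf.split(points):
    print(len(train_idx), len(test_idx))  # e.g. 40 80, then 80 40 (cluster order is arbitrary)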

Source code in elapid/train_test_split.py
def split(self, points: Vector) -> Tuple[np.ndarray, np.ndarray]:
    """Split point data into geographically-clustered train/test folds and
        return their array indices.

    Args:
        points: point-format GeoSeries or GeoDataFrame.

    Yields:
        (train_idxs, test_idxs) the train/test splits for each geo fold.
    """
    for train, test in super().split(points):
        yield train, test

checkerboard_split(points, grid_size, buffer=0, bounds=None)

Create train/test splits with a spatially-gridded checkerboard.

Parameters:

    points (Vector): point-format GeoSeries or GeoDataFrame. Required.
    grid_size (float): the height and width of each checkerboard cell used to split the
        data. Should match the units of the points CRS (i.e. grid_size=1000 is a 1km
        grid for UTM data). Required.
    buffer (float): add an x/y buffer around the initial checkerboard bounds. Default: 0.
    bounds (Tuple[float, float, float, float]): instead of deriving the checkerboard bounds
        from points, use this tuple of [xmin, ymin, xmax, ymax] values. Default: None.

Returns:

    Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]: (train_points, test_points) split using a
        checkerboard grid.
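
A minimal usage sketch, assuming a projected CRS with units of meters; the toy points are illustrative, not from the source.

import geopandas as gpd
import numpy as np
from shapely.geometry import Point

from elapid.train_test_split import checkerboard_split

# toy points in a UTM-like projected CRS (meters)
rng = np.random.default_rng(5)
xy = rng.uniform(0, 10_000, size=(300, 2))
points = gpd.GeoDataFrame(geometry=[Point(x, y) for x, y in xy], crs="EPSG:32610")

# alternate 1 km x 1 km cells between the train and test sets
train_points, test_points = checkerboard_split(points, grid_size=1_000)
print(len(train_points), len(test_points))  # roughly a 50/50 split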

Source code in elapid/train_test_split.py
def checkerboard_split(
    points: Vector, grid_size: float, buffer: float = 0, bounds: Tuple[float, float, float, float] = None
) -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """Create train/test splits with a spatially-gridded checkerboard.

    Args:
        points: point-format GeoSeries or GeoDataFrame
        grid_size: the height and width of each checkerboard side to split
            data using. Should match the units of the points CRS
            (i.e. grid_size=1000 is a 1km grid for UTM data)
        buffer: add an x/y buffer around the initial checkerboard bounds
        bounds: instead of deriving the checkerboard bounds from `points`,
            use this tuple of [xmin, ymin, xmax, ymax] values.

    Returns:
        (train_points, test_points) split using a checkerboard grid.
    """
    if isinstance(points, gpd.GeoSeries):
        points = points.to_frame("geometry")

    bounds = points.total_bounds if bounds is None else bounds
    xmin, ymin, xmax, ymax = bounds

    x0s = np.arange(xmin - buffer, xmax + buffer + grid_size, grid_size)
    y0s = np.arange(ymin - buffer, ymax + buffer + grid_size, grid_size)

    train_cells = []
    test_cells = []
    for idy, y0 in enumerate(y0s):
        offset = 0 if idy % 2 == 0 else 1
        for idx, x0 in enumerate(x0s):
            cell = box(x0, y0, x0 + grid_size, y0 + grid_size)
            cell_type = 0 if (idx + offset) % 2 == 0 else 1
            if cell_type == 0:
                train_cells.append(cell)
            else:
                test_cells.append(cell)

    grid_crs = points.crs
    train_grid = gpd.GeoDataFrame(geometry=train_cells, crs=grid_crs)
    test_grid = gpd.GeoDataFrame(geometry=test_cells, crs=grid_crs)
    train_points = (
        gpd.sjoin(points, train_grid, how="left", predicate="within")
        .dropna()
        .drop(columns="index_right")
        .reset_index(drop=True)
    )
    test_points = (
        gpd.sjoin(points, test_grid, how="left", predicate="within")
        .dropna()
        .drop(columns="index_right")
        .reset_index(drop=True)
    )

    return train_points, test_points