Source code for trackintel.preprocessing.trips

import warnings
from datetime import timedelta

import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm

import trackintel as ti
from trackintel import Tours, Trips


def get_trips_grouped(trips, tours):
    """Group trips by the tours they are part of.

    Parameters
    ----------
    trips: Trips
        Trips dataframe. An existing "tour_id" column is ignored; the input
        itself is not modified.

    tours: Tours
        Output of generate_tours function, must contain column "trips" with
        the list of trip ids on each tour.

    Returns
    -------
    trips_grouped_by_tour: DataFrameGroupBy object
        Trips grouped by tour id.

    Examples
    --------
    >>> get_trips_grouped(trips, tours)

    Notes
    -----
    The grouping is rebuilt from the "trips" column of `tours` and not from a
    "tour_id" column on the trips. Every tour therefore yields all of its
    trips, including trips that also belong to another (e.g. nested) tour.
    """
    # work on a copy without a (possibly stale) tour assignment
    if "tour_id" in trips.columns:
        trips_without_tour = trips.drop(columns=["tour_id"])
    else:
        trips_without_tour = trips.copy()

    # one row per (tour, trip) pair
    tour_trip_pairs = (
        tours.reset_index()[["id", "trips"]]
        .rename(columns={"id": "tour_id", "trips": "trip_id"})
        .explode("trip_id")
        .reset_index(drop=True)
    )

    # attach the trip attributes to every pair; "id" is looked up on the trips table
    pairs_with_trip_info = tour_trip_pairs.merge(
        trips_without_tour, left_on="trip_id", right_on="id", how="left"
    )
    return pairs_with_trip_info.groupby("tour_id")
def generate_tours(
    trips,
    staypoints=None,
    max_dist=100,
    max_time="1d",
    max_nr_gaps=0,
    print_progress=False,
):
    """
    Generate trackintel-tours from trips

    Parameters
    ----------
    trips : Trips

    staypoints : Staypoints, default None
        Must have `location_id` column to connect trips via locations to a tour. If None, trips will be connected
        based only by the set distance threshold `max_dist`.

    max_dist: float, default 100 (meters)
        Maximum distance between the end point of one trip and the start point of the next trip within a tour.
        This parameter is only used if staypoints is `None`!
        Also, if `max_nr_gaps > 0`, a tour can contain larger spatial gaps (see Notes below for more detail)

    max_time: str, pd.Timedelta or datetime.timedelta, default "1d" (1 day)
        Maximum time that a tour is allowed to take

    max_nr_gaps: int, default 0
        Maximum number of spatial gaps on the tour. Use with caution - see notes below.

    print_progress : bool, default False
        If print_progress is True, the progress bar is displayed

    Returns
    -------
    trips_with_tours: Trips
        Same as `trips`, but with column `tour_id`, containing a list of the tours that the trip is part of (see
        notes). If no tour is found at all, the trips are returned without a `tour_id` column.

    tours: Tours
        The generated tours. If no tour is found at all, an empty plain DataFrame is returned instead.

    Raises
    ------
    TypeError
        If `max_time` is neither a string nor a timedelta.

    Warns
    -----
    UserWarning
        If an existing `tour_id` column was dropped from `trips`, or if no tours could be generated.

    Examples
    --------
    >>> trips.generate_tours(staypoints)

    Notes
    -------
    - Tours are defined as a collection of trips in a certain time frame that start and end at the same point
    - Tours and trips have an N:N relationship: One tour consists of multiple trips, but also one trip can be part of
      multiple tours, due to nested tours or overlapping tours.
    - This function implements two possibilities to generate tours of trips: Via the location ID in the `staypoints`
      df, or via a maximum distance. Thus, note that only one of the parameters `staypoints` or `max_dist` is used!
    - Nested tours are possible and will be regarded as 2 (or more tours).
    - It is possible to allow spatial gaps to occur on the tour, which might be useful to deal with missing data.
      Example: The two trips home-work, supermarket-home would still be detected as a tour when max_nr_gaps >= 1,
      although the work-supermarket trip is missing.
      Warning: This only counts the number of gaps, but neither temporal or spatial distance of gaps, nor the number
      of missing trips in a gap are bounded. Thus, this parameter should be set with caution, because trips that are
      hours apart might still be connected to a tour if `max_nr_gaps > 0`.
    """
    # Two options: either the location IDs for staypoints on the trips are provided, or a maximum distance threshold
    # between end and start of trips is used
    if staypoints is not None:
        assert (
            "location_id" in staypoints.columns
        ), "Staypoints with location ID is required, otherwise tours are generated without location using max_dist"
        geom_col = None  # not used
        crs_is_projected = False  # not used
        ti.Staypoints.validate(staypoints)
        ti.TripsDataFrame.validate(trips)
    else:
        # if no location is given, we need the trips table to have a geometry column
        ti.TripsGeoDataFrame.validate(trips)
        geom_col = trips.geometry.name
        # get crs
        crs_is_projected = ti.geogr.check_gdf_planar(trips)

    # Normalise max_time to pd.Timedelta. pd.Timedelta is itself a datetime.timedelta subclass, so this single check
    # covers strings, pandas timedeltas and plain datetime timedeltas; everything else is rejected.
    if isinstance(max_time, (str, timedelta)):
        max_time = pd.to_timedelta(max_time)
    else:
        raise TypeError("Parameter max_time must be either of type String or pd.Timedelta!")

    trips_input = trips.copy()
    # If the trips already have a column "tour_id", we drop it
    if "tour_id" in trips_input:
        trips_input.drop(columns="tour_id", inplace=True)
        warnings.warn("Deleted existing column 'tour_id' from trips.")

    kwargs = {
        "max_dist": max_dist,
        "max_nr_gaps": max_nr_gaps,
        "max_time": max_time,
        "staypoints": staypoints,
        "geom_col": geom_col,
        "crs_is_projected": crs_is_projected,
    }

    # tours are generated independently for every user
    trips_by_user = trips_input.groupby(["user_id"], group_keys=False, as_index=False)
    if print_progress:
        # tqdm.pandas registers `progress_apply`, so it has to run before the attribute is accessed
        tqdm.pandas(desc="User tour generation")
        apply_per_user = trips_by_user.progress_apply
    else:
        apply_per_user = trips_by_user.apply
    tours = apply_per_user(_generate_tours_user, **kwargs).reset_index(drop=True)

    # No tours found
    if len(tours) == 0:
        warnings.warn("No tours can be generated, return empty tours")
        return trips_input, tours

    # index management
    tours["id"] = np.arange(len(tours))
    tours.set_index("id", inplace=True)

    # assign tour ids to trips: one row per (tour, trip) pair ...
    tour2trip_map = tours.reset_index().explode("trips").rename(columns={"id": "tour_id"})
    # ... collapsed into the list of all tours that each trip is part of
    temp = tour2trip_map.groupby("trips").agg({"tour_id": list})
    # trips that are not on any tour get a missing value
    trips_with_tours = trips_input.join(temp, how="left")

    # tours id (generated by this function) should be int64
    tours.index = tours.index.astype("int64")

    return trips_with_tours, Tours(tours)
def _generate_tours_user(
    user_trip_df,
    staypoints=None,
    max_dist=100,
    max_nr_gaps=0,
    max_time=timedelta(days=1),
    geom_col="geom",
    crs_is_projected=False,
):
    """
    Compute tours from trips for one user

    Parameters
    ----------
    user_trip_df : Trips
        The trips have to follow the standard definition for trips DataFrames

    staypoints : Staypoints, optional
        Must contain location ID column to connect trips via locations to a tour. If None, trips will be connected
        based only on a distance threshold `max_dist`.

    max_dist: float, default 100 (meters)
        Maximum distance between the end point of one trip and the start point of the next trip on a tour.
        However, if `max_nr_gaps > 0`, a tour can contain larger spatial gaps (see notes in `generate_tours`)

    max_nr_gaps: int, default 0
        Maximum number of spatial gaps on the tour. Use with caution - see notes in `generate_tours`.

    max_time: Timedelta, default 1 day
        Maximum time that a tour is allowed to take

    geom_col : str, optional
        Name of geometry column of user_trip_df, by default "geom"

    crs_is_projected : bool, optional
        Whether the crs of user_trip_df is projected, by default False

    Returns
    -------
    tours_df: DataFrame
        Tours for one user. Empty (but with the tour columns) if no tour was found.
    """
    # all trips must belong to one single user
    assert len(user_trip_df["user_id"].unique()) == 1

    # sort by time
    user_trip_df = user_trip_df.sort_values(by=["started_at"])

    # Possible first trips of a tour, in chronological order. Only the trip id (row.name) is stored;
    # a NaN entry marks a spatial gap between the neighbouring trips.
    start_candidates = []

    # collect tours
    tours = []

    # Iterate over trips
    for _, row in user_trip_df.iterrows():
        end_time = row["finished_at"]

        if len(start_candidates) > 0:
            # Check if there is a spatial gap between the previous and current trip:
            if staypoints is not None:
                # If staypoints with locations are available, check whether they share the same location
                end_start_at_same_loc = _check_same_loc(
                    user_trip_df.loc[start_candidates[-1], "destination_staypoint_id"],  # dest. stp of previous trip
                    row["origin_staypoint_id"],  # start stp of current trip
                    staypoints,
                )
            else:
                # If no locations are available, check whether the distance is smaller than max_dist
                end_start_at_same_loc = _check_max_dist(
                    user_trip_df.loc[start_candidates[-1], geom_col].geoms[1],  # destination point of previous trip
                    row[geom_col].geoms[0],  # start point of current trip
                    max_dist,
                    crs_is_projected,
                )

            # if the current trip does not start at the end of the previous trip, there is a gap
            if not end_start_at_same_loc:
                if max_nr_gaps == 0:
                    # option 1: no gaps allowed - start search again
                    # NOTE(review): the `continue` also skips the tour check for the current trip itself.
                    start_candidates = [row.name]
                    continue
                # option 2: gaps allowed - search further
                start_candidates.append(np.nan)

        # Add this point as a candidate
        start_candidates.append(row.name)

        # Check whether endpoint would be an unknown activity
        if pd.isna(row["destination_staypoint_id"]):
            continue

        # index from which on the candidates are kept after this trip was processed
        new_list_start = 0
        # keep track of how many gaps we encountered, if greater than max_nr_gaps then stop
        gap_counter = 0

        # Check for all candidates whether they form a tour with the current trip. The candidates are visited from
        # newest to oldest: `j` counts from the END of start_candidates, so the forward index of `cand` is
        # len(start_candidates) - 1 - j.
        for j, cand in enumerate(start_candidates[::-1]):
            # gap
            if pd.isna(cand):
                gap_counter += 1
                if gap_counter > max_nr_gaps:
                    # These gaps won't vanish, so everything up to and including this gap marker can be dropped.
                    # Candidates after the marker stay: they may still start a tour with an allowed number of gaps.
                    new_list_start = len(start_candidates) - j
                    break
                continue

            # check time difference - if time too long, we can remove the older candidates
            cand_start_time = user_trip_df.loc[cand, "started_at"]
            if end_time - cand_start_time > max_time:
                new_list_start = len(start_candidates) - j - 1
                break

            # check whether the start-end candidate of a tour is an unknown activity
            if pd.isna(user_trip_df.loc[cand, "origin_staypoint_id"]):
                continue

            # check if endpoint of trip = start location of cand
            if staypoints is not None:
                end_start_at_same_loc = _check_same_loc(
                    user_trip_df.loc[cand, "origin_staypoint_id"],  # start stp of first trip
                    row["destination_staypoint_id"],  # destination stp of current trip
                    staypoints,
                )
            else:
                # if no locations are available, check whether the distance is smaller than max_dist
                end_start_at_same_loc = _check_max_dist(
                    user_trip_df.loc[cand, geom_col].geoms[0],  # start point of first trip
                    row[geom_col].geoms[1],  # destination point of current trip
                    max_dist,
                    crs_is_projected=crs_is_projected,
                )

            if end_start_at_same_loc:
                # Tour found!
                # collect the trips on the tour in a list (gap markers are not trips)
                non_gap_trip_idxs = [c for c in start_candidates[-j - 1 :] if not pd.isna(c)]
                tour_candidate = user_trip_df[user_trip_df.index.isin(non_gap_trip_idxs)]
                tours.append(_create_tour_from_stack(tour_candidate, staypoints, max_time))

                # do not consider the other trips - one trip cannot close two tours at a time
                break

        # remove candidates that can no longer start a tour (time window or gap budget exceeded)
        start_candidates = start_candidates[new_list_start:]

    if len(tours) == 0:
        # keep the tour columns so that the per-user results can still be combined
        return pd.DataFrame(
            tours,
            columns=[
                "user_id",
                "started_at",
                "finished_at",
                "origin_staypoint_id",
                "destination_staypoint_id",
                "trips",
                "location_id",
            ],
        )

    tours_df = pd.DataFrame(tours)
    return tours_df


def _check_same_loc(stp1, stp2, staypoints):
    """Check whether two staypoints are at the same location

    Parameters
    ----------
    stp1 : int
        First staypoint id
    stp2 : int
        Second staypoint id
    staypoints : Staypoints
        Staypoints indexed by staypoint id, with a "location_id" column

    Returns
    -------
    share_location: bool
        If True, stp1 and stp2 are at the same location. Always False if one of the ids is missing.
    """
    # a missing staypoint id means an unknown place, which never matches
    if pd.isna(stp1) or pd.isna(stp2):
        return False
    share_location = staypoints.loc[stp1, "location_id"] == staypoints.loc[stp2, "location_id"]
    return share_location


def _check_max_dist(p1, p2, max_dist, crs_is_projected=False):
    """
    Check whether two points p1, p2 are less or equal than max_dist apart

    Parameters
    ----------
    p1, p2: shapely Point objects
    max_dist: int
        Distance threshold
    crs_is_projected: bool, default False
        If True, the planar distance between the points is used, otherwise the haversine distance.

    Returns
    -------
    dist_below_thresh: bool
        Indicates whether p1 and p2 are at most max_dist apart
    """
    if crs_is_projected:
        dist = p1.distance(p2)
    else:
        dist = ti.geogr.point_haversine_dist(p1.x, p1.y, p2.x, p2.y)
    dist_below_thresh = dist <= max_dist
    return dist_below_thresh


def _create_tour_from_stack(temp_tour_stack, staypoints, max_time):
    """
    Aggregate the trips of one tour into a structured dictionary.

    Parameters
    ----------
    temp_tour_stack : DataFrame
        All trips that will be aggregated into a tour, in the order they appear on the tour.
        Must contain at least one trip.
    staypoints : Staypoints or None
        If given, used to look up the location ID of the tour.
    max_time : Timedelta
        Maximum duration of a tour, only used as a sanity check.

    Returns
    -------
    tour_dict_entry: dictionary

    Raises
    ------
    AssertionError
        If the tour does not start and end at the same location, contains trips of more than one user or takes
        longer than `max_time`.
    """
    first_trip = temp_tour_stack.iloc[0]
    last_trip = temp_tour_stack.iloc[-1]

    # get location ID if available:
    if staypoints is not None:
        start_loc = staypoints.loc[first_trip["origin_staypoint_id"], "location_id"]
        # double check whether start and end location are the same
        end_loc = staypoints.loc[last_trip["destination_staypoint_id"], "location_id"]
        assert start_loc == end_loc
    else:
        # set location to NaN since not available
        start_loc = pd.NA

    # all data has to be from the same user
    assert len(temp_tour_stack["user_id"].unique()) == 1

    # double check if tour requirements are fulfilled
    assert last_trip["finished_at"] - first_trip["started_at"] <= max_time

    tour_dict_entry = {
        "user_id": first_trip["user_id"],
        "started_at": first_trip["started_at"],
        "finished_at": last_trip["finished_at"],
        "origin_staypoint_id": first_trip["origin_staypoint_id"],
        "destination_staypoint_id": last_trip["destination_staypoint_id"],
        "trips": list(temp_tour_stack.index),
        "location_id": start_loc,
    }

    return tour_dict_entry