R - ggmap - calculate shortest distance between cities via geocoding
Asked Answered
O

3

5

I have a list of cities and related information that I've placed in a dataframe, like so:

library(plyr)
library(dplyr)
library(ggmap)
library(Imap)

cities <- c("washington, dc", "wilmington, de", "amarillo, tx", 
            "denver, co", "needham, ma", "philadelphia, pa", 
            "doylestown, pa", "galveston, tx", "tuscaloosa, al", 
            "hollywood, fl"
            )

id <- c(156952, 154222, 785695, 154423, 971453, 149888, 1356987,
        178946, 169944, 136421)

month <- c(201811, 201811, 201912, 201912, 202005, 202005, 
           202005, 202106, 202106, 202106 )

category<- c("home", "work", "home", "home", "home", "work",
             "cell", "home", "work", "cell")

places <- data.frame(cities, id, category, month)

Using the Imap and ggmap packages, I can retrieve the longitude and latitudes for each city:

lat <- geocode(location = places$cities, source = "google")$lat
lon <- geocode(location = places$cities, source = "google")$lon

places <- cbind(places, lat, lon)

What I would like to do is the following:

  1. Calculate the distance between each city by month and category
  2. return the second shortest distance and the corresponding city and id in separate columns in places

I wrote a for loop to calculate the distances:

for (i in 1:nrow(places)) {




dist_list[[i]] <- gdist(lon.1 = places$lon[i], 
                          lat.1 = places$lat[i], 
                          lon.2 = places$lon, 
                          lat.2 = places$lat,
                          units="miles")
  
}

Which produces the following data:

dput(dist_list)
list(c(0, 98.3464717885451, 1386.25425677199, 1489.87718040776, 
383.083760289456, 123.232894969413, 140.284537078237, 1209.23510542932, 
706.670452283757, 906.79542720295), c(98.4762434610638, 0, 1472.06660056474, 
1560.93398322985, 285.23618862797, 24.9195071209828, 44.8853561530985, 
1308.60741637919, 805.755084908157, 983.102810248198), c(1389.07354011351, 
1472.06660056474, 0, 356.573530670257, 1712.29111612461, 1493.39302974566, 
1497.2125164277, 579.329313217289, 827.577713357261, 1434.82691622332
), c(1492.80130415651, 1560.93398322985, 356.573530670257, 0, 
1761.3773163288, 1578.71125031146, 1576.80713231756, 923.725006795209, 
1067.04809350934, 1717.32991551111), c(383.551997010915, 285.23618862797, 
1712.29111612461, 1761.3773163288, 0, 260.382178510916, 243.947043197789, 
1588.85470703957, 1088.38640303169, 1230.47219244291), c(123.395655314093, 
24.9195071209827, 1493.39302974566, 1578.71125031146, 260.382178510916, 
0, 24.7382114555287, 1333.29925285915, 830.581742827321, 1002.94777739349
), c(140.431447025301, 44.8853561530986, 1497.2125164277, 1576.80713231756, 
243.947043197789, 24.7382114555285, 0, 1346.44527983873, 844.827513981938, 
1026.98263808807), c(1211.16392416136, 1308.60741637919, 579.329313217289, 
923.725006795209, 1588.85470703957, 1333.29925285915, 1346.44527983873, 
0, 505.292529136012, 925.512554201542), c(707.73957320737, 805.755084908157, 
827.577713357261, 1067.04809350934, 1088.38640303169, 830.581742827321, 
844.827513981938, 505.292529136012, 0, 666.837848781548), c(906.880841903584, 
983.102810248198, 1434.82691622332, 1717.32991551111, 1230.47219244291, 
1002.94777739349, 1026.98263808807, 925.512554201542, 666.837848781548, 
0))

The desired result would look like this (first row):

cities          id         category  month      lat        lon   min.dist  closest city  closest city id  
washington, dc  156952     home      201811 38.90719  -77.03687   98.34647  wilmington, de  154222 

And via the nth function in Rfast I can get the second smallest distance

nth(dist_list[[1]], 2)

The problem I have is I don't know how to connect the info from the list to the df places. Any help or suggestions would be greatly appreciated.

Orcinol answered 18/6, 2021 at 14:46 Comment(0)
U
3
# get min distance:
min_d <- sapply(dist_list, function(x) sort(x)[2])
places$min_dist <- min_d
# index:
i <- sapply(dist_list, function(x) which(sort(x)[2] == x))
# add name:
places$min_name <- places$cities[i]

With grouping:

# prepare dist matrix outside loop
m <- t(as.data.frame(dist_list))
row.names(m) <- NULL
diag(m) <- NA

# create grouping variable:
gv <- as.integer(factor(places$month)) # or:
# gv <- as.integer(factor(paste(places$month, places$category)))

# set distance to NA if not in relevant group:
i <- sapply(gv, function(x) gv == x)
m[!i] <- NA

l <- sapply(as.data.frame(t(m)), function(x) {
  if (all(is.na(x))) return(list(NA, NA))
  mv <- min(x, na.rm = T)
  i <- which(x == mv)
  list(mv, i)
})
l
places <- cbind(places, min_dist = unlist(l[1, ]), min_nr = unlist(l[2, ]))

places$min_name <- places$cities[places$min_nr] # add name
places$min_id <- places$id[places$min_nr] # add id
places

result:

              cities      id category  month  min_dist min_nr         min_name  min_id
V1    washington, dc  156952     home 201811  98.34647      2   wilmington, de  154222
V2    wilmington, de  154222     work 201811  98.47624      1   washington, dc  156952
V3      amarillo, tx  785695     home 201912 356.57353      4       denver, co  154423
V4        denver, co  154423     home 201912 356.57353      3     amarillo, tx  785695
V5       needham, ma  971453     home 202005 243.94704      7   doylestown, pa 1356987
V6  philadelphia, pa  149888     work 202005  24.73821      7   doylestown, pa 1356987
V7    doylestown, pa 1356987     cell 202005  24.73821      6 philadelphia, pa  149888
V8     galveston, tx  178946     home 202106 505.29253      9   tuscaloosa, al  169944
V9    tuscaloosa, al  169944     work 202106 505.29253      8    galveston, tx  178946
V10    hollywood, fl  136421     cell 202106 666.83785      9   tuscaloosa, al  169944
Urceolate answered 21/6, 2021 at 15:42 Comment(6)
this works pretty well but what if i need to calculate the distance using a by variable?Orcinol
@Orcinol what do you mean? it is not clear... by groups? for example calculate closes city where month is equal?Urceolate
@minem...for example, by month and id....also the id of the closest city should be returned as well in placesOrcinol
one more question: what if dist_list and places had different lengths? As of now, i <- sapply(dist_list, function(x) which(sort(x)[2] == x)) doesn't work if the lengths are of different.Orcinol
@Orcinol in your question you specified that you create dist_list from places, so they should relate one to one. If you are using some subset of one or other then you need to subset everything...Urceolate
ok thanks, I realized I wanted to take the lat <- geocode(location = unique(places$cities), source = "google")$latOrcinol
H
3

Update

Assuming we group by month only, we can try the code below

f <- function(df) {
    r <- list()
    for (i in 1:nrow(df)) {
        x <- c()
        for (j in 1:nrow(df)) {
            x <- append(
                x,
                with(df, gdist(lat[i], lon[i], lat[j], lon[j], units = "miles"))
            )
        }
        x <- replace(x, x == 0, Inf)
        idx <- which.min(x)
        r[[i]] <- data.frame(
            min.dist = min(x),
            closest_city = df$cities[idx],
            closest_city_id = df$id[idx]
        )
    }
    do.call(rbind, r)
}

places %>%
    group_by(month) %>%
    do(cbind(., f(.))) %>%
    ungroup()

which gives

# A tibble: 10 x 9
   cities               id category  month   lat    lon min.dist closest_city   
   <chr>             <int> <chr>     <int> <dbl>  <dbl>    <dbl> <chr>
 1 washington, dc   156952 home     201811  38.9  -77.0   104.   wilmington, de
 2 wilmington, de   154222 work     201811  39.7  -75.5   104.   washington, dc
 3 amarillo, tx     785695 home     201912  35.2 -102.    232.   denver, co
 4 denver, co       154423 home     201912  39.8 -105.    232.   amarillo, tx
 5 needham, ma      971453 home     202005  42.3  -71.2   273.   doylestown, pa
 6 philadelphia, ~  149888 work     202005  40.0  -75.2     6.81 doylestown, pa
 7 doylestown, pa  1356987 cell     202005  40.3  -75.1     6.81 philadelphia, ~
 8 galveston, tx    178946 home     202106  29.2  -94.9 11405.   hollywood, fl
 9 tuscaloosa, al   169944 work     202106  33.2  -87.6   517.   hollywood, fl
10 hollywood, fl    136421 cell     202106  26.0  -80.1   517.   tuscaloosa, al
# ... with 1 more variable: closest_city_id <int>

Based on your obtained dist_list, we can try the code below

closest <- do.call(
    rbind,
    lapply(
        dist_list,
        function(x) {
            x <- replace(x, x == 0, Inf)
            idx <- which.min(x)
            with(
                places,
                data.frame(
                    min.dist = min(x),
                    closest_city = cities[idx],
                    closest_city_id = id[idx]
                )
            )
        }
    )
)

which gives

    min.dist     closest_city closest_city_id
1   98.34647   wilmington, de          154222
2   24.91951 philadelphia, pa          149888
3  356.57353       denver, co          154423
4  356.57353     amarillo, tx          785695
5  243.94704   doylestown, pa         1356987
6   24.73821   doylestown, pa         1356987
7   24.73821 philadelphia, pa          149888
8  505.29253   tuscaloosa, al          169944
9  505.29253    galveston, tx          178946
10 666.83785   tuscaloosa, al          169944

Also, if you want to append the above data frame to your existing places, you can use

places <- cbind(places, closest)
Horrendous answered 21/6, 2021 at 19:54 Comment(4)
this looks great...what if i need to make the calc by id, month?Orcinol
@Orcinol see my udpateHorrendous
If month is the grouping variable, washington, dc should be the second row. Also, can you explain what is min_nr?Orcinol
@Orcinol No, I don't have min_nrHorrendous
P
3

use sf::st_distance()

Given that you're working with spatial data, I advise an approach based on spatial libraries like {sf}.

library(tidyverse)
library(tidygeocoder)
library(sf)

# clean location, geocode, and convert to sf object
places <- places %>% 
  separate(cities, into = c("city", "state"), sep = ", ") %>% 
  geocode(city = city, state = state) %>% 
  st_as_sf(coords = c("long", "lat"), crs = 4269)

# sanity check
mapview::mapview(places)

enter image description here

# calculate distances between point pairs with st_distance()
compute_close_city <- function(i){
  # compute distances btwn a point and its neighbors (excluding itself)
  distances = st_distance(places[i, ], places[-i, ])
  # index of the nearest neighbor
  j = which.min(distances)
  
  # organize and return the result
  result <- tibble(
    close_city   = places$city[-i][j],  # closest city
    close_state  = places$state[-i][j], # closest state
    close_dist_m = distances[j]         # distance in meters
  )
  
  return(result)
}

# calculate close cities and distances, bind results into dataframe
close_df <- map_df(1:nrow(places), ~compute_close_city(.x))

# bind the result to the places data frame
places <- bind_cols(places, close_df)

# view the result and verify it works
select(places, city, close_city, close_dist_m)

Returns:

Simple feature collection with 10 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -104.9849 ymin: 26.0112 xmax: -75.13046 ymax: 40.31004
Geodetic CRS:  NAD83
# A tibble: 10 x 4
   city         close_city   close_dist_m             geometry
   <chr>        <chr>                 [m]          <POINT [°]>
 1 washington   wilmington      159476.85 (-77.03656 38.89499)
 2 wilmington   philadelphia     40022.81 (-75.54659 39.74595)
 3 amarillo     denver          574956.04 (-101.8338 35.20722)
 4 denver       amarillo        574956.04 (-104.9849 39.73924)
 5 needham      tuscaloosa      153463.74 (-88.33309 31.98683)
 6 philadelphia doylestown       39775.87 (-75.16353 39.95272)
 7 doylestown   philadelphia     39775.87 (-75.13046 40.31004)
 8 galveston    needham         687140.47 (-94.79459 29.29933)
 9 tuscaloosa   needham         153463.74 (-87.56753 33.20956)
10 hollywood    needham        1035934.60  (-80.14949 26.0112)
Pearsall answered 22/6, 2021 at 22:43 Comment(2)
this is pretty elegant...but entries for rows 4, 8-10 aren't right.Orcinol
I think the problem here is the distances are not calculated by month, category.Orcinol

© 2022 - 2024 — McMap. All rights reserved.