I have downloaded a list of all the towns and cities etc in the US from the census bureau. Here is a random sample:
dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L,
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L,
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L,
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L,
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L,
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L,
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L,
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L,
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co",
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi",
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh",
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa",
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L,
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L,
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L,
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L,
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L,
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L,
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L,
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L,
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L,
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L,
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L,
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L,
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L,
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L,
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L),
ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L,
2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L,
2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L,
2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L,
2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L,
2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L,
2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L,
2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L,
2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L,
2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L,
2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L,
1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L,
2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L,
2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L,
1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L,
2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L,
2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald",
"warminster heights", "san juan capistrano", "littlestown",
"port republic", "taylor", "merriam", "northlakes", "julesburg",
"california junction", "lower allen", "antwerp", "pleasantville",
"el rancho", "santa clarita", "willacoochee", "kennard",
"effingham", "la france", "beechwood", "keys", "orange grove mobile manor",
"shiloh", "west mineral", "stony point", "east salem", "heath",
"stamps", "haworth", "rio grande", "ester", "clayton", "hackberry",
"middle amana", "new baden", "melville", "rolland colony",
"hannahs mill", "germantown hills", "la fontaine", "aurora",
"green meadows", "kaiminani", "pinecroft", "dawson", "park city",
"hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville",
"palos verdes estates", "wyandotte", "meigs", "waverly",
"sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel",
"chatsworth", "smith village", "glenbrook", "rock spring",
"villalba", "bayfield", "waynesfield", "utica", "sunset",
"milford square", "lithium", "swall meadows", "unalaska",
"martinsburg", "ashland", "leawood", "hindsboro", "gray",
"turley", "trimble", "falcon", "linn", "olympia fields",
"mitchell", "mount pleasant mills", "greenville", "park city",
"arkabutla", "new river", "huntsville", "boulder junction",
"potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks",
"chassell", "edgewood", "lindsay"), lsad = c("25", "57",
"25", "21", "25", "25", "25", "57", "43", "57", "57", "47",
"25", "57", "25", "25", "47", "25", "57", "57", "57", "57",
"21", "25", "57", "57", "43", "25", "43", "57", "57", "25",
"57", "57", "47", "57", "57", "57", "47", "43", "25", "57",
"57", "57", "25", "25", "47", "57", "57", "25", "43", "25",
"43", "25", "57", "57", "57", "57", "57", "47", "25", "43",
"57", "57", "62", "25", "47", "47", "25", "57", "57", "57",
"25", "21", "25", "25", "47", "25", "57", "25", "43", "25",
"47", "25", "57", "25", "57", "57", "57", "25", "57", "25",
"25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a",
"s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a",
"s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s",
"s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s",
"s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s",
"a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a",
"a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a",
"a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s",
"a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a",
"s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889,
39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864,
41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083,
31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786,
32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839,
33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958,
41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954,
40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864,
37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065,
33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599,
46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123,
34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806,
40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464,
38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804,
38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946,
34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081,
38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208,
47.224885), longitude = c(-122.837813, -75.084089, -117.654388,
-77.089213, -74.476099, -97.427116, -94.693955, -81.367835,
-102.262708, -95.994752, -76.902769, -84.736099, -93.272787,
-119.068357, -118.494729, -83.044003, -96.203696, -88.550859,
-82.770697, -90.808692, -94.941358, -114.660588, -75.29244,
-94.926801, -81.044121, -77.23694, -86.46905, -93.497879,
-94.657035, -74.87787, -148.041153, -100.176484, -93.410178,
-91.901539, -89.707193, -71.301933, -96.59226, -84.340945,
-89.462982, -85.722023, -97.509615, -83.945334, -156.001765,
-78.353464, -84.443499, -86.048077, -87.928172, -86.832128,
-98.454026, -95.634011, -83.084305, -118.425754, -94.729305,
-84.092683, -81.625304, -102.762746, -105.666602, -120.092812,
-75.579197, -82.180426, -96.514499, -97.457006, -119.927289,
-85.238869, -66.481897, -90.822546, -83.973881, -97.345349,
-112.028388, -75.405024, -89.88325, -118.642656, -166.529029,
-78.324286, -92.239531, -94.62524, -88.134729, -94.985863,
-107.792147, -94.561898, -78.65389, -91.844989, -87.691648,
-98.029974, -77.026451, -96.110256, -108.925311, -90.121565,
-80.595817, -86.532599, -89.654438, -97.01835, -94.161474,
-89.289973, -97.05148, -77.893875, -97.08933, -88.530745,
-85.737461, -105.152791), designation = c("city", "cdp",
"city", "borough", "city", "city", "city", "cdp", "town",
"cdp", "cdp", "village", "city", "cdp", "city", "city", "village",
"city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp",
"cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp",
"cdp", "village", "cdp", "cdp", "cdp", "village", "town",
"city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp",
"cdp", "city", "town", "city", "town", "city", "cdp", "cdp",
"cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp",
"urbana", "city", "village", "village", "city", "cdp", "cdp",
"cdp", "city", "borough", "city", "city", "village", "city",
"cdp", "city", "town", "city", "village", "city", "cdp",
"city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city",
"village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L,
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L,
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L,
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L,
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L,
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L,
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L,
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L,
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L,
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L,
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L,
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L,
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")
I would like to calculate the distance between each entry in the city
column using the longitude
and latitude
columns and the gdist
function. I know that the following for
loop works and is easy to read:
dist_list <- list()
for (i in 1:nrow(somewhere)) {
dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i],
lat.1 = somewhere$latitude[i],
lon.2 = somewhere$longitude,
lat.2 = somewhere$latitude,
units="miles")
}
However: IT TAKES FOREVER to run on the full dataset (31K+ rows)--as in hours. I'm looking for something that will speed up this calculation. I figured something in the split-apply-combine
approach would work well, since I would like to eventually involve a pair of grouping variables, geo_block
and ansi_block
but honestly anything would be better than what I have.
I have tried the following:
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)
somewhere <- somewhere %>%
split(.$geo_block, .$ansi_block) %>%
mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))
But am unsure of how to specify the second set of long-lat inputs outside of the standard for
loop.
My question:
- How do I use the
split-apply-combine
approach to solve this problem withgeo_block
andansi_block
as a grouping variable as above? I would like to return the shortest distance and the name of thecity
and the value ofgeo_block
corresponding to this distance.
All suggestions are welcome. Ideally the desired result would be fairly quick because the actual dataset I'm working with is quite large. Since I'm a bit in the woods here, I've added a bounty to the question to generate a little more interest and hopefully a wide set of potential answers that I can learn from. Thanks so much!
state
as a grouping variable? Some states will have many many more locations than others. Instead you should use even-sized groups, say every 500 or 1000 rows. – Angstromn
-by-n
explosion withouter
. I'll assume that the OP doesn't care that a state boundary could easily separate closest neighbors. – Leonstate
or not) is going to be dimensionally much larger than the number of rows, so it will not fit cleanly into a frame. Do you want the closest? A statistic of distances? If grouping bystate
, do you care that two locations may be separated by feet but will not be paired? – Leongdist
is, but one answer there shows thatspatialrisk::haversine
is an order of magnitude faster thangeosphere::distm
. – Angstrom