Skip to content

Commit

Permalink
Historic intersections (#185)
Browse files Browse the repository at this point in the history
* add nyc-streets.geojson

* 4239 intersections, tossing 16 ambiguous ones

* 2679 historic intersections that no longer exist

* normalize Tenth -> 10th; down to 1802 intersections

* need to de-dupe within historic_grid.py

* 1637 after de-duping

* tests passing

* update test logs

* update stats; looks great!

* add historic grid to e2e

* more image sizes

* site stats

* force rounding
  • Loading branch information
danvk authored Nov 27, 2024
1 parent 7c0c638 commit 7c30363
Show file tree
Hide file tree
Showing 15 changed files with 2,766 additions and 79 deletions.
1 change: 1 addition & 0 deletions .github/workflows/e2etest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ jobs:
- name: Generate intersections
run: |
export PYTHONPATH=.
poetry run python oldnyc/geocode/historic_grid.py
poetry run python oldnyc/geocode/osm/generate_intersections.py
- name: Generate truth data
run: |
Expand Down
1,637 changes: 1,637 additions & 0 deletions data/historic-intersections.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data/lat-lon-to-ids.json

Large diffs are not rendered by default.

839 changes: 839 additions & 0 deletions data/originals/nyc-streets.geojson

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions data/self-hosted-sizes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
2040741,614,760
2040746,760,595
2040755,760,561
2040756,760,556
2040759,760,567
2040760,760,553
2040762,760,570
Expand Down Expand Up @@ -142,10 +143,17 @@
2040871,760,617
2040872,760,617
2040873,760,618
2040883,760,670
2040884,760,679
2040885,760,675
2040886,760,669
2040893,760,675
2040894,760,671
3984206,760,545
3984207,760,544
3984209,760,546
3984214,544,760
3984215,760,546
3984216,546,760
3984217,760,550
3984220,760,546
Expand All @@ -171,6 +179,7 @@
3984250,546,760
3984251,760,543
3984252,760,544
3984253,546,760
3984254,546,760
3984255,760,546
3984256,760,549
Expand All @@ -190,14 +199,20 @@
3984301,760,546
3984302,760,546
3984305,760,544
3984311,760,546
3984314,760,546
3984318,760,546
3984322,760,546
3984323,760,546
3984328,760,541
3984329,760,549
3984330,760,544
3984331,760,547
3984332,546,760
3984333,760,546
3984334,760,547
3984336,546,760
3984337,760,548
3984351,760,546
3984352,760,546
3984354,760,546
Expand Down Expand Up @@ -229,6 +244,7 @@
3984426,546,760
3984436,760,546
3984438,546,760
3984439,546,760
3984440,546,760
3984443,546,760
3984445,760,546
Expand Down Expand Up @@ -266,6 +282,7 @@
3984511,549,760
3984514,543,760
3984515,760,549
3984517,549,760
3984519,760,549
3984520,760,545
3984521,760,549
Expand Down Expand Up @@ -364,6 +381,7 @@
3984791,545,760
3984793,545,760
3984794,549,760
3984800,549,760
3984807,541,760
3984812,546,760
3984816,548,760
Expand Down Expand Up @@ -786,15 +804,20 @@
485729,479,760
485730,476,760
485731,482,760
485732,505,760
485735,760,509
485737,510,760
485738,760,506
485739,760,509
485740,504,760
485741,504,760
485742,481,760
485744,511,760
485745,509,760
485746,509,760
485747,760,512
485750,760,508
485751,760,508
485752,509,760
485753,760,484
485754,510,760
Expand Down Expand Up @@ -1114,6 +1137,7 @@
703511f,760,489
703517f,760,489
703518f,485,760
703519f,760,490
703520f,489,760
703524f,760,491
703525f,760,489
Expand Down
8 changes: 6 additions & 2 deletions oldnyc/geocode/grid.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import csv
import itertools
import re
import statistics
import sys
Expand Down Expand Up @@ -99,10 +100,13 @@ def strip_ave(street: str) -> str:
).strip()


def load_all_intersections():
def load_all_intersections(exclude_historic=False):
# Completely unambiguous intersections
ints = dict[Intersection, Point]()
for row in csv.DictReader(open("data/nyc-intersections.csv")):
inputs = csv.DictReader(open("data/nyc-intersections.csv"))
if not exclude_historic:
inputs = itertools.chain(inputs, csv.DictReader(open("data/historic-intersections.csv")))
for row in inputs:
# Street1,Street2,Borough,Lat,Lon,Nodes
str1 = row["Street1"]
str2 = row["Street2"]
Expand Down
16 changes: 12 additions & 4 deletions oldnyc/geocode/grid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
def assert_close(ll1: tuple[float, float] | None, ll2: tuple[float, float]):
"""Assert that two latitude & longitude tuples are "close"."""
assert ll1 is not None
assert ll1[0] == pytest.approx(ll2[0], 1e-6)
assert ll1[1] == pytest.approx(ll2[1], 1e-6)
assert ll1 == pytest.approx(ll2, 1e-6)


def test_exact():
Expand Down Expand Up @@ -73,9 +72,9 @@ def test_parse_street_ave():
def test_geocode_broadway():
g = Grid()
# this is an exact intersection
assert_close(g.geocode_intersection("Broadway", "59th Street", M), (40.767696, -73.981679))
assert_close(g.geocode_intersection("Broadway", "59th Street", M), (40.76801, -73.98194))
# these are interpolations
assert_close(g.geocode_intersection("Broadway", "24th Street", M), (40.7422035, -73.989151))
assert_close(g.geocode_intersection("Broadway", "24th Street", M), (40.74229, -73.98919))
assert_close(g.geocode_intersection("Broadway", "124th St", M), (40.8140625, -73.959421))
# this isn't great, but probably not too far off the intended location.
assert_close(g.geocode_intersection("Broadway", "127th St.", M), (40.816311, -73.957802))
Expand Down Expand Up @@ -175,3 +174,12 @@ def test_preserved_streets():
def test_cursed_intersection():
g = Grid()
assert g.geocode_intersection("Broadway", "Amsterdam Ave", "Manhattan") is None


def test_geocode_historic():
# Historic intersections from nyc-streets.geojson
g = Grid()
assert_close(
g.geocode_intersection("Broome Street", "Tompkins Street", "Manhattan"),
(40.71418, -73.97697),
)
115 changes: 115 additions & 0 deletions oldnyc/geocode/historic_grid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Construct historic intersections from nyc-streets.geojson"""

import csv
import itertools
import sys
from collections import defaultdict

import pygeojson
import shapely
from haversine import haversine
from tqdm import tqdm

from oldnyc.geocode.geocode_types import Point
from oldnyc.geocode.grid import (
Intersection,
load_all_intersections,
normalize_street_for_osm,
strip_dir,
)


def geom_to_shape(geom: pygeojson.GeometryObject | None):
if geom is None:
raise ValueError("Expected geometry, got None")
if isinstance(geom, pygeojson.LineString):
return shapely.geometry.LineString(geom.coordinates)
elif isinstance(geom, pygeojson.MultiLineString):
return shapely.geometry.MultiLineString(geom.coordinates)
else:
raise ValueError(f"Unexpected geometry type: {geom.type}")


def bounds_diam_m(g: shapely.geometry.base.BaseGeometry) -> float:
lng1, lat1, lng2, lat2 = g.bounds
return 1000 * haversine((lat1, lng1), (lat2, lng2))


def centroid_if_small(g: shapely.geometry.base.BaseGeometry) -> shapely.geometry.Point | None:
d_m = bounds_diam_m(g)
if d_m < 60:
return g.centroid
return None


def main():
n_ambig, n_match_current, n_match_stripped = 0, 0, 0
streets = pygeojson.load_feature_collection(open("data/originals/nyc-streets.geojson")).features
geoms = [geom_to_shape(f.geometry) for f in streets]

current_ixs, current_stripped_ixs = load_all_intersections(exclude_historic=True)

ix_to_pt = defaultdict[Intersection, list[Point]](list)
for i, j in tqdm(itertools.combinations(range(len(geoms)), 2)):
name1 = normalize_street_for_osm(streets[i].properties["name"])
name2 = normalize_street_for_osm(streets[j].properties["name"])
if name1 == name2 or strip_dir(name1) == strip_dir(name2):
continue
g1, g2 = geoms[i], geoms[j]
if not g1.intersects(g2):
continue
pt = g1.intersection(g2)
full_ix = pt
if not isinstance(pt, shapely.geometry.Point):
pt = centroid_if_small(pt)
if not pt:
sys.stderr.write(
f'Multiple intersections between "{name1}" ({i}) and "{name2}" ({j}): {full_ix}: {bounds_diam_m(full_ix):.2f}m\n'
)
n_ambig += 1
continue

boro1 = streets[i].properties["data"]["borough"]
boro2 = streets[i].properties["data"]["borough"]
assert boro1 == boro2, f"{boro1} != {boro2}"

ix = Intersection(name1, name2, boro1)
strip_ix = Intersection(strip_dir(name1), strip_dir(name2), boro1)

if ix in current_ixs:
n_match_current += 1
continue

if strip_ix in current_stripped_ixs:
n_match_stripped += 1
continue

ix_to_pt[ix].append((pt.y, pt.x))

out = csv.DictWriter(
open("data/historic-intersections.csv", "w"),
fieldnames=["Street1", "Street2", "Borough", "Lat", "Lon"],
)
out.writeheader()
n_ambig_dedupe = 0
for ix, pts in ix_to_pt.items():
mp = shapely.geometry.MultiPoint([(x, y) for (y, x) in pts])
pt = centroid_if_small(mp)
if not pt:
sys.stderr.write(f"Skipping {ix} with {len(pts)} points: {mp} / {bounds_diam_m(mp)}m\n")
n_ambig_dedupe += 1
continue
out.writerow(
{
"Street1": ix.str1,
"Street2": ix.str2,
"Borough": ix.boro,
"Lat": "%.6f" % pt.y,
"Lon": "%.6f" % pt.x,
}
)
sys.stderr.write(f"{n_ambig=}, {n_match_current=}, {n_match_stripped=}, {n_ambig_dedupe=}\n")


if __name__ == "__main__":
main()
60 changes: 59 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ imagehash = "^4.3.1"
pygeojson = "^0.2.0"
natsort = "^8.4.0"
word2number = "^1.1"
shapely = "^2.0.6"

[tool.ruff]
line-length = 100
Expand Down
4 changes: 2 additions & 2 deletions test/geocode-performance.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

Results:
Geocodes
229 / 269 = 85.13% of locatable images correctly located.
12 / 241 = 4.98% incorrectly located.
232 / 269 = 86.25% of locatable images correctly located.
10 / 242 = 4.13% incorrectly located.

Loading

0 comments on commit 7c30363

Please sign in to comment.