Add gitignore
This commit is contained in:
commit
b2de3304f3
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
__pycache__/*
|
||||||
99
checkoa.py
Executable file
99
checkoa.py
Executable file
@ -0,0 +1,99 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import os, json, traceback
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
# Stop reading a file after this many rows; speeds up analysis of many/large address files.
rowstocheck = 15000

# Report buckets, filled in by checkGeojson() as files are analyzed.
oklist = []             # files that passed every check
emptygeometrylist = []  # files with a high share of missing coordinates
emptyaddresslist = []   # files with a high share of missing number/street
nocitylist = []         # files with a high share of missing city names
noziplist = []          # files with a high share of missing postal codes
totallyemptylist = []   # files where (almost) every row lacks an address
||||||
|
def checkGeojson(filename):
    """Scan one OpenAddresses line-delimited GeoJSON file and record problems.

    Reads up to `rowstocheck` rows, counts rows with missing geometry,
    street address, city, or postal code, and appends `filename` to the
    matching module-level report lists when a category exceeds its
    threshold. Prints a one-line running summary while scanning.
    """
    linecount = 0
    okcount = 0
    emptygeometrycount = 0
    emptyaddresscount = 0
    emptycitycount = 0
    emptyzipcount = 0
    # FIX: `with` guarantees the file is closed even if a row raises
    # (the original used open()/close() and leaked the handle on error).
    with open(filename, 'r') as filedata:
        for line in filedata:
            linecount = linecount + 1
            if linecount > rowstocheck:
                break
            try:
                data = json.loads(line)
                bad = False
                if not data["properties"]["number"] or not data["properties"]["street"]:
                    emptyaddresscount = emptyaddresscount + 1
                    bad = True
                if not data["geometry"] or not data["geometry"]["coordinates"][0] or not data["geometry"]["coordinates"][1]:
                    emptygeometrycount = emptygeometrycount + 1
                    bad = True
                if not data["properties"]["city"] and not data["properties"]["postcode"]: # Flag missing city unless postal code exists, because city can probably be filled from that
                    emptycitycount = emptycitycount + 1
                    bad = True
                if not data["properties"]["postcode"]:
                    emptyzipcount = emptyzipcount + 1
                    bad = True
                if bad == False:
                    okcount = okcount + 1
            except Exception:
                traceback.print_exc()
                print("Error encountered while processing", filename, "at", line)
            print(filename, ": OK:", str(okcount), "Bad: geometry:", str(emptygeometrycount), "address:", str(emptyaddresscount), "city:", str(emptycitycount), "zip:", str(emptyzipcount), "      ", end="\r", flush=True)
    # FIX: an empty file previously crashed with ZeroDivisionError below.
    # Classify it as having no usable addresses instead.
    if linecount == 0:
        totallyemptylist.append(filename)
        return
    bad = False
    if emptygeometrycount / linecount > .25:
        emptygeometrylist.append(filename)
        bad = True
    if emptyaddresscount / linecount > .67:
        emptyaddresslist.append(filename)
        bad = True
    if emptycitycount / linecount > .67:
        nocitylist.append(filename)
        bad = True
    if emptyzipcount / linecount > .75:
        noziplist.append(filename)
        bad = True
    if emptyaddresscount >= (linecount - 10): # Allow a couple not-fully-empty addresses, otherwise some broken ones won't be reported
        totallyemptylist.append(filename)
        bad = True
    if bad == False:
        oklist.append(filename)
|
|
||||||
|
# Command-line interface: one or more GeoJSON files to analyze.
parser = ArgumentParser(
    description="Check OpenAddresses GeoJSON files and report on any problems found."
)
parser.add_argument(
    "source",
    help="File(s) to check.",
    nargs='+'
)

if __name__ == "__main__":
    args = parser.parse_args()
    print("Checking " + str(len(args.source)) + " OpenAddresses data files.")
    for filename in args.source:
        checkGeojson(filename)
    print("                                        ")
    print()
    print("== Report ==")
    # Each (heading, bucket) pair becomes one section of the report.
    report_sections = [
        ("  Files missing geometry:", emptygeometrylist),
        ("  Files missing street address:", emptyaddresslist),
        ("  Files missing city:", nocitylist),
        ("  Files missing postal code:", noziplist),
        ("  Files missing all street addresses:", totallyemptylist),
    ]
    for heading, bucket in report_sections:
        print(heading)
        for filename in bucket:
            print("    ", filename)
||||||
63
downloadoa.py
Executable file
63
downloadoa.py
Executable file
@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import shutil
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
import requests, tempfile, os, pathlib
|
||||||
|
|
||||||
|
sourceList = {}
|
||||||
|
|
||||||
|
def getSourceList():
    """Fetch (and memoize) the OpenAddresses source catalog.

    Downloads the batch API listing on first call, keeps only entries in
    the "addresses" layer, and for duplicate source IDs keeps the most
    recently updated run. Later calls return the cached dict.
    """
    global sourceList
    if not sourceList:
        print("Fetching sources list")
        catalog = requests.get(
            "https://batch.openaddresses.io/api/data"
        ).json()
        for entry in catalog:
            if entry["layer"] != "addresses":
                continue
            existing = sourceList.get(entry["source"])
            # Keep the newest run for each source ID.
            if existing is None or entry["updated"] > existing["updated"]:
                sourceList[entry["source"]] = entry
    return sourceList
||||||
|
|
||||||
|
|
||||||
|
def downloadSources(id, outfolder):
    """Download every OpenAddresses dataset whose source ID starts with `id`.

    Datasets already present in `outfolder` are skipped. Each dataset is
    streamed gzipped into a temp file, then decompressed into
    `<outfolder>/<source>-addresses-<name>.geojson`.
    """
    for sourceName in getSourceList():
        s = getSourceList()[sourceName]
        if not s["source"].startswith(id):
            continue
        outfilename = outfolder + "/" + s["source"] + "-addresses-" + s["name"] + ".geojson"
        outfoldername = os.path.dirname(outfilename)
        if os.path.isfile(outfilename):
            print("Skipping " + s["source"] + ", already on disk.")
            continue
        print("Downloading " + s["source"])
        gzdl = requests.get("https://v2.openaddresses.io/batch-prod/job/" + str(s["job"]) + "/source.geojson.gz", stream=True)
        # FIX: fail loudly on HTTP errors; previously an error page would be
        # saved and then fed to gzip as if it were data.
        gzdl.raise_for_status()
        tmp = tempfile.NamedTemporaryFile()
        with open(tmp.name, 'wb') as tf:
            for chunk in gzdl.iter_content(chunk_size=16*1024):
                tf.write(chunk)
        pathlib.Path(outfoldername).mkdir(parents=True, exist_ok=True)
        # Decompress the temp download into its final destination.
        with gzip.open(tmp.name) as gzf, open(outfilename, 'wb') as outf:
            shutil.copyfileobj(gzf, outf)
||||||
|
|
||||||
|
# Command-line interface: which source(s) to fetch and where to put them.
parser = ArgumentParser(
    description="Download address data from OpenAddresses.io"
)
parser.add_argument(
    "source",
    help="Source dataset ID, or partial ID. For example: us/al/ will download all Alabama datasets, us/mt/statewide will download the Montana statewide dataset.",
)
parser.add_argument(
    "outfolder",
    help="Output folder",
)

if __name__ == "__main__":
    args = parser.parse_args()
    downloadSources(args.source, args.outfolder)
||||||
780
main.py
Executable file
780
main.py
Executable file
@ -0,0 +1,780 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Banner printed before the heavy imports below so the user gets
    # immediate feedback on startup.
    print("Address Database Builder 2025")
    print("Starting up...")
||||||
|
|
||||||
|
import argparse, csv, zipfile, gzip, os, re, json, traceback, sys, multiprocessing
|
||||||
|
import concurrent.futures
|
||||||
|
from collections import deque
|
||||||
|
import pandas as pd
|
||||||
|
import dask.dataframe as dd
|
||||||
|
import gc
|
||||||
|
from multiprocessing import get_context
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from src.addressfunctions import normalizeAddress
|
||||||
|
from src.constants import ValidationException
|
||||||
|
import src.config
|
||||||
|
|
||||||
|
# Worker-pool sizing: one process per CPU core, with a bounded number of
# queued chunks so producers can't race arbitrarily far ahead of workers.
maxthreads = multiprocessing.cpu_count()
MAX_IN_FLIGHT = maxthreads * 2

# Keep OpenBLAS from spinning up its own thread pool in every worker.
os.environ["OPENBLAS_MAIN_FREE"] = "1"

# Serializes appends to the shared output CSV across processes.
writelock = multiprocessing.Lock()

# Running totals for the current import pass.
badcount = 0
skippedcount = 0

# Default country assumed for incoming data.
countrycode = "US"
||||||
|
def init_worker(cfg: src.config.AppConfig):
    """Process-pool initializer: install the shared AppConfig in this worker."""
    src.config.set_config(cfg)
|
||||||
|
def fixLatLon(filepath):
    """Detect and repair rows whose latitude/longitude values were swapped.

    Reads `filepath` as CSV and flips any coordinate pair that is
    impossible: latitude outside +/-90, or — for US data outside the
    exempt territories — a value outside the contiguous-US bounding box.
    Appends the (possibly repaired) rows to `<filepath>.coordfix.csv`.
    """
    cfg = src.config.get_config()
    print("Repairing flipped latitude/longitude pairs in " + filepath)
    flipped = 0
    frame = pd.read_csv(filepath, keep_default_na=False, dtype="str")
    # These states/territories fall outside the contiguous-US bounding box,
    # so the range checks below would misfire on them.
    exempt = ("VI", "AK", "HI", "PR")
    for idx, rec in frame.iterrows():
        lat = float(rec.latitude)
        lon = float(rec.longitude)
        swap = False
        if lat < -90 or lat > 90:
            swap = True
        elif cfg.countryCode == "US" and rec.state not in exempt and (lon < -171.791110603 or lon > -66.96466):
            swap = True
        elif cfg.countryCode == "US" and rec.state not in exempt and (lat < 18.91619 or lat > 71.3577635769):
            swap = True
        if swap:
            frame.at[idx, "latitude"], frame.at[idx, "longitude"] = lon, lat
            flipped = flipped + 1
    frame.to_csv(filepath + ".coordfix.csv", mode='a', index=False, header=not os.path.exists(filepath + ".coordfix.csv"))
    print("\nDone flipping " + filepath + "! Fixed " + str(flipped) + " records.")
||||||
|
|
||||||
|
|
||||||
|
def normalize(number, street, street2, city, state, zipcode, latitude, longitude, zipprefix = False, plus4="", county = False):
    """Normalize one raw address, retrying with progressively looser inputs.

    Runs normalizeAddress() once, and while the result still lacks a full
    ZIP+4 retries with (1) a digits-only house number, (2) the city field
    blanked, (3) in advanced mode a deeper libpostal-based pass, followed
    by a final cleanup pass. Returns the normalized address dict.
    """
    cfg = src.config.get_config()
    street1 = street
    if len(city) > 4 and street1.endswith(" " + city):
        # City name leaked into street field (Albany County Wyoming, for one)
        street1 = street1.removesuffix(" " + city)
    addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
    if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:

        # Try removing letters from address numbers, and ignore city field
        addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)

        # If that didn't work, try instead stripping the city name because it might be wrong
        if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
            addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)

        # Use libpostal to analyze address deeper
        # BUG FIX: the or-clause is now parenthesized. Previously
        # `cfg.advancedMode and A or B` meant the advanced pass ran whenever
        # plus4 was incomplete, even with advancedMode disabled.
        if cfg.advancedMode and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
            try:
                addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
            except Exception:
                # Best-effort: fall back to the non-advanced result.
                pass
            # Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
            if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
                addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        else:
            addr = addrstrip
    return addr
||||||
|
|
||||||
|
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
    """Worker: normalize one DataFrame chunk of our own CSV format and
    append the results to `outfilename` under the shared write lock.

    Rows whose state is in `ignorestates` (or not in `keeponlystates`,
    when that list is non-empty) are skipped and counted.
    """
    global badcount, skippedcount, writelock
    cfg = src.config.get_config()
    rows_out = []
    print("   " + str(chunkcount) + "   ", end="\r", flush=True)
    for _, row in chunk.iterrows():
        if row.state in ignorestates:
            skippedcount += 1
            continue
        if keeponlystates != [] and row.state not in keeponlystates:
            skippedcount += 1
            continue
        try:
            if not cfg.noSkip4 and len(row.plus4 or "") == 4:
                # Record already carries a ZIP+4: trust it and skip the
                # (expensive) normalization pass.
                addr = {
                    "number": row.number,
                    "street": row.street,
                    "unit": row.street2,
                    "city": row.city,
                    "state": row.state,
                    "zip": row.zip,
                    "plus4": row.plus4,
                    "latitude": round(float(row.latitude),7),
                    "longitude": round(float(row.longitude), 7)
                }
            else:
                addr = normalize(row.number, row.street, row.street2, row.city, row.state, row.zip, round(float(row.latitude),7), round(float(row.longitude), 7), False, row.plus4)

            # Normalization may have corrected the state; re-apply the filter.
            if addr["state"] in ignorestates:
                skippedcount += 1
                continue

            rows_out.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], row.source])
        except ValidationException:
            badcount += 1
        except Exception:
            print("W: Couldn't ingest address:")
            print(row)
            traceback.print_exc()
            badcount += 1
    out_columns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    out = pd.DataFrame(data=rows_out, columns=out_columns)
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=out_columns)
    gc.collect()
||||||
|
|
||||||
|
|
||||||
|
def importOwnFile(filename, outfilename, ignorestates, keeponlystates):
    """Re-import a CSV already in our own output format, re-normalizing rows.

    Streams the file in chunks and fans work out to a process pool; each
    chunk is appended to `outfilename` by processOwnChunk() under the
    shared write lock.
    """
    global badcount, skippedcount, writelock
    # FIX: fetch the active config for the worker initializer. Previously
    # `cfg` was referenced in initargs without being defined here (sibling
    # importOpenAddressFile() does fetch it), risking a NameError.
    cfg = src.config.get_config()
    print("Processing addresses from " + filename)
    columns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    file = filename
    chunkcount = 0
    badcount = 0
    skippedcount = 0
    chunksize = 1000
    in_flight = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, max_tasks_per_child=100, initializer=init_worker, initargs=(cfg,)) as executor:
        for chunk in pd.read_csv(file, chunksize=chunksize, usecols=columns, keep_default_na=False, dtype={
                "number":"string","street":"string",
                "street2":"string","city":"string",
                "state":"category", "zip":"string",
                "plus4": "string",
                "latitude":"float32", "longitude":"float32",
                "source":"category"}, dtype_backend="pyarrow"):

            # Backpressure: cap queued chunks so memory stays bounded.
            while len(in_flight) >= MAX_IN_FLIGHT:
                done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                for fut in done:
                    fut.result()
            fut = executor.submit(processOwnChunk, chunk, chunkcount * chunksize, outfilename, ignorestates, keeponlystates)
            in_flight.add(fut)
            chunkcount = chunkcount + 1

        # Drain the remaining futures, surfacing any worker exceptions.
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()

    print("\nDone processing! Parsed " + str(chunkcount) + " chunks.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
||||||
|
|
||||||
|
def processNadChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
    """Worker: normalize one chunk of National Address Database rows and
    append the results to `outfilename` under the shared write lock."""
    global badcount, skippedcount, writelock
    print("   " + str(chunkcount) + "   ", end="\r", flush=True)
    data = []
    for index, row in chunk.iterrows():
        if row.State.upper() in ignorestates:
            skippedcount = skippedcount + 1
            continue
        if keeponlystates != [] and row.State.upper() not in keeponlystates:
            skippedcount = skippedcount + 1
            continue
        try:
            # Prefer the incorporated municipality; fall back to the postal
            # city, then the unincorporated community name.
            town = row.Inc_Muni
            if town == "Unincorporated":
                town = ""
            if not town:
                town = row.Post_City
            if not town:
                town = row.Uninc_Comm

            # BUG FIX: pass the computed `town`; previously row.Inc_Muni was
            # passed directly, making the fallback logic above dead code.
            addr = normalize(row.AddNo_Full, row.StNam_Full, row.SubAddress, town, row.State, row.Zip_Code, round(float(row.Latitude),7), round(float(row.Longitude), 7))

            if addr["state"] in ignorestates: # For example, AR's data claims to have MO addresses but the ZIP says they're in AR, so the first pass of this won't catch those
                skippedcount = skippedcount + 1
                continue

            source = row.NAD_Source
            source = source.replace("State of ", "")
            source = "NAD " + source
            data.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
        except ValidationException:
            badcount = badcount + 1
        except Exception:
            print("W: Couldn't ingest address:")
            print(row)
            traceback.print_exc()
            badcount = badcount + 1
    out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
||||||
|
|
||||||
|
def importNadFile(filename, outfilename, ignorestates, keeponlystates, startatline):
    """Import National Address Database records from a .txt or .zip file.

    Streams the CSV in chunks (optionally skipping the first `startatline`
    data rows) and fans work out to a process pool; each chunk is appended
    to `outfilename` by processNadChunk().
    """
    global skippedcount, badcount
    # FIX: fetch the active config for the worker initializer. Previously
    # `cfg` was referenced in initargs without being defined here.
    cfg = src.config.get_config()
    print("Importing National Address Database addresses from " + filename)
    if startatline > 0:
        print("Skipping to line number " + str(startatline))

    columns = [
        "AddNo_Full",
        "StNam_Full",
        "St_PreMod",
        "St_PreDir",
        "St_Name",
        "SubAddress",
        "Inc_Muni",
        "Post_City",
        "Uninc_Comm",
        "Urbnztn_PR",
        "State",
        "Zip_Code",
        "UUID",
        "Longitude",
        "Latitude",
        "DateUpdate",
        "NAD_Source",
    ]
    file = filename
    if filename.endswith(".zip"):
        # NAD distributions ship the data as TXT/NAD*.txt inside the zip.
        zf = zipfile.ZipFile(filename, mode="r")
        zipFiles = zf.namelist()
        for fname in zipFiles:
            if fname.upper().startswith("TXT/NAD") and fname.upper().endswith(".TXT"):
                file = zf.open(fname, mode="r", force_zip64=True)
                break
    chunkcount = 0
    chunksize = 1000
    in_flight = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, mp_context=get_context("spawn"), max_tasks_per_child=100, initializer=init_worker, initargs=(cfg,)) as executor:
        # skiprows keeps the header row (i == 0) and drops data rows up to
        # startatline, enabling resumable imports.
        for chunk in pd.read_csv(file, chunksize=chunksize, header=0, skiprows=lambda i: 1 <= i <= startatline, usecols=columns, keep_default_na=False, dtype={
                "State":"category","NAD_Source":"category",
                "Zip_Code":"string","UUID":"string",
                "AddNo_Full":"string","StNam_Full":"string","St_PreMod":"string",
                "St_PreDir":"string","St_Name":"string","SubAddress":"string",
                "Inc_Muni":"string","Post_City":"string","Uninc_Comm":"string",
                "Urbnztn_PR":"string","Longitude":"float32","Latitude":"float32",
                "DateUpdate":"string"}, dtype_backend="pyarrow"):
            # Backpressure: cap queued chunks so memory stays bounded.
            while len(in_flight) >= MAX_IN_FLIGHT:
                done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                for fut in done:
                    fut.result()
            fut = executor.submit(processNadChunk, chunk, chunkcount * chunksize, outfilename, ignorestates, keeponlystates)
            in_flight.add(fut)
            chunkcount = chunkcount + 1
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    print("\nDone importing NAD! Processed " + str(chunkcount) + " chunks of " + str(chunksize) + " rows.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
||||||
|
|
||||||
|
def processOpenAddressRows(rows, startindex, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county = False):
    """Worker: normalize a batch of OpenAddresses GeoJSON lines and append
    the results to `outfilename` under the shared write lock.

    `stateOverride` replaces the per-row region; `citySuggestion` fills a
    blank city; `county` feeds ZIP-code hinting downstream.
    """
    global badcount, skippedcount, writelock
    print("   " + str(startindex) + "   ", end="\r", flush=True)
    row_total = 0
    records = []
    blank_rows = 0
    for line in rows:
        row_total = row_total + 1
        try:
            data = json.loads(line)
            props = data["properties"]
            if not props["number"] and not props["street"]:
                blank_rows = blank_rows + 1
            if not data["geometry"] or not data["geometry"]["coordinates"][0] or not data["geometry"]["coordinates"][1]:
                blank_rows = blank_rows + 1
            state = props["region"].upper()
            city = props["city"].upper().strip()
            if stateOverride:
                state = stateOverride
            if state in ignorestates:
                skippedcount = skippedcount + 1
                continue
            if data["geometry"] is None:
                badcount = badcount + 1
                continue
            if not props["number"] or not props["street"] or props["number"] == "0":
                badcount = badcount + 1
                continue
            if citySuggestion and not city:
                city = citySuggestion
            if source == "OA/hawaii" and re.match(r"^[1-9][1-9][0-9]{4}", props["number"]):
                # Source is broken/missing, and the last good version has the house numbers without dashes
                # Hawaii has a specific and unique address numbering system
                props["number"] = props["number"][:2] + "-" + props["number"][2:]

            addr = normalize(props["number"], props["street"], props["unit"], city, state, props["postcode"], data["geometry"]["coordinates"][1], data["geometry"]["coordinates"][0], zipprefix, "", county)

            # Normalization may have corrected the state; re-apply the filter.
            if addr["state"] in ignorestates:
                skippedcount = skippedcount + 1
                continue
            if addr["street"] == "":
                badcount = badcount + 1
                continue
            if not source:
                # Derive a source label from the normalized state once known.
                source = "OA/"+addr["state"]
            records.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
        except ValidationException:
            badcount = badcount + 1
        except Exception:
            traceback.print_exc()
            print("Error encountered while processing", line)
            badcount = badcount + 1
    if row_total > 0 and blank_rows / row_total > .95:
        print("\nWarning: Empty chunk! " + str(blank_rows) + " of " + str(row_total) + " rows had no address.")
    out_columns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    out = pd.DataFrame(data=records, columns=out_columns)
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=out_columns)
    gc.collect()
||||||
|
|
||||||
|
def importOpenAddressFile(filepath, outfilename, ignorestates, source, stateOverride, zipprefix):
    """Import one OpenAddresses GeoJSON (optionally gzipped) file.

    Derives source label, city suggestion, and county hints from the
    filename, then streams the file in batches to a process pool; each
    batch is handled by processOpenAddressRows().
    """
    global badcount, skippedcount
    cfg = src.config.get_config()
    print("Importing OpenAddresses data from " + filepath)
    chunksize = 1000
    linecount = 0

    if stateOverride:
        stateOverride = stateOverride.strip().upper()

    # Open transparently whether the file is gzipped or plain text.
    if filepath.endswith(".gz"):
        file = gzip.open(filepath, 'rb')
    else:
        file = open(filepath, 'r')

    county = False

    basename = filepath.split("/")[-1]
    if not source or source == "":
        # Derive the source label from the filename prefix.
        source = "OA/"+basename.split("-")[0]
        if source.startswith("OA/statewide"):
            if stateOverride:
                source = source.replace("statewide", stateOverride)
            else:
                # Defer: the worker derives it from the normalized state.
                source = False
    citySuggestion = False
    if not cfg.citySuggestion and basename.startswith("city_of_"):
        # Set city suggestion using filename
        citySuggestion = re.sub(r'\d+', '', basename.split("-")[0].replace("city_of_", "").replace("_", " ").upper().strip())

    if basename.endswith("-addresses-county.geojson"):
        county = basename.split("-")[0].replace("_", " ").upper().strip()
        print("Detected county from filename: " + county + ", will use for ZIP Code hinting")

    pending = []
    in_flight = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, mp_context=get_context("spawn"), max_tasks_per_child=1000, initializer=init_worker, initargs=(cfg,)) as executor:
        for line in file:
            pending.append(line)
            linecount = linecount + 1
            if len(pending) >= chunksize:
                # Backpressure: cap queued batches so memory stays bounded.
                while len(in_flight) >= MAX_IN_FLIGHT:
                    done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                    for fut in done:
                        fut.result()
                fut = executor.submit(processOpenAddressRows, pending, linecount, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county)
                in_flight.add(fut)
                pending = []

        # Flush the final partial batch.
        fut = executor.submit(processOpenAddressRows, pending, linecount, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county)
        in_flight.add(fut)

        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()

    file.close()

    print("\nDone importing OpenAddresses! Processed " + str(linecount) + " entries.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
    return
||||||
|
|
||||||
|
def importOSMFile(filename, outfilename):
    """Import OpenStreetMap address nodes exported via an Overpass CSV query.

    Overpass API query for data input (replace name=Montana with the region you want):
    [out:csv(::"lat", ::"lon", "addr:housenumber", "addr:street", "addr:city", "addr:state", "addr:postcode")][timeout:120];
    area["name"="Montana"]->.boundaryarea;
    node["addr:housenumber"]["addr:street"](area.boundaryarea);
    out;
    way["addr:housenumber"]["addr:street"](area.boundaryarea);
    out center;
    relation["addr:housenumber"]["addr:street"](area.boundaryarea);
    out center;
    """
    print("Importing OSM Overpass data from " + filename)
    columns = [
        "@lat",
        "@lon",
        "addr:housenumber",
        "addr:street",
        "addr:city",
        "addr:state",
        "addr:postcode"
    ]
    chunks_done = 0
    unparsed = 0
    source = "OpenStreetMap.org. License: ODbL"
    out_columns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    # Overpass CSV output is tab-separated; process it in small chunks.
    for chunk in pd.read_csv(filename, sep='\t', chunksize=100, usecols=columns, keep_default_na=False, dtype="str"):
        print("   " + str(chunks_done * 100) + "   ", end="\r", flush=True)
        batch = []
        for _, row in chunk.iterrows():
            try:
                addr = normalize(row["addr:housenumber"], row["addr:street"], "", row["addr:city"], row["addr:state"], row["addr:postcode"], row["@lat"], row["@lon"])
                batch.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
            except ValidationException:
                unparsed = unparsed + 1
            except Exception:
                print("W: Couldn't ingest address:")
                print(row)
                traceback.print_exc()
                unparsed = unparsed + 1
        out = pd.DataFrame(data=batch, columns=out_columns)
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=out_columns)
        chunks_done = chunks_done + 1
    print("\nDone importing OSM! Processed " + str(chunks_done) + " chunks.")
    print("There were " + str(unparsed) + " unprocessable addresses.")
    print("Saved to output file " + outfilename)
||||||
|
|
||||||
|
def importNARFile(filename, outfilename):
    """Import a Statistics Canada National Address Register (NAR) zip archive.

    The archive contains per-province CSVs under ``Addresses/Address_<code>_*.csv``
    and ``Locations/Location_<code>_*.csv``.  For each province, the address and
    location tables are merged on ``LOC_GUID`` (using ``dd`` — presumably
    dask.dataframe, imported at the top of the file; confirm), then rows are
    normalized and appended to ``outfilename`` as CSV in this script's standard
    column layout.

    Args:
        filename: Path to the NAR zip archive.
        outfilename: CSV file to append normalized addresses to.
    """
    print("Importing Statistics Canada data from " + filename)
    zf = zipfile.ZipFile(filename, mode="r")
    zipFiles = zf.namelist()
    locationFileList = {}
    addressFileList = {}
    # Statistics Canada two-digit province/territory codes used in NAR filenames.
    provinceCodes = [10,11,12,13,24,35,46,47,48,59,60,61,62]
    for c in provinceCodes:
        addressFileList[str(c)] = []
        locationFileList[str(c)] = []
    # Bucket every CSV member by its province code (first "_"-separated token
    # after the fixed prefix).
    for fname in zipFiles:
        if fname.startswith("Addresses/Address_") and fname.endswith(".csv"):
            number = fname.replace("Addresses/Address_", "").replace(".csv", "").split("_")[0]
            addressFileList[number].append(fname)
        elif fname.startswith("Locations/Location_") and fname.endswith(".csv"):
            number = fname.replace("Locations/Location_", "").replace(".csv", "").split("_")[0]
            locationFileList[number].append(fname)

    print("\nMerging address and location tables...")
    mergecount = 0
    dataframes = []
    addrcols = ["LOC_GUID","APT_NO_LABEL","CIVIC_NO","CIVIC_NO_SUFFIX","MAIL_STREET_NAME","MAIL_STREET_TYPE","MAIL_STREET_DIR","MAIL_MUN_NAME","MAIL_PROV_ABVN","MAIL_POSTAL_CODE","BU_N_CIVIC_ADD"]
    loccols = ["LOC_GUID","BG_LATITUDE","BG_LONGITUDE"]
    for provinceId in provinceCodes:
        print(" " + str(mergecount+1) + "   ", end="\r", flush=True)
        # Read the CSVs straight out of the zip ("zip://" + fsspec storage_options)
        # as string dtype, then join addresses to their coordinates on LOC_GUID.
        readaf = map(lambda addrFilename: dd.read_csv("zip://"+addrFilename, storage_options={'fo': filename}, usecols=addrcols, keep_default_na=False, dtype="str"), addressFileList[str(provinceId)])
        readlf = map(lambda locationFilename: dd.read_csv("zip://"+locationFilename, storage_options={'fo': filename}, usecols=loccols, keep_default_na=False, dtype="str"), locationFileList[str(provinceId)])
        addressFrame = dd.concat(list(readaf), ignore_index=False)
        locationFrame = dd.concat(list(readlf), ignore_index=False)
        dataframes.append(dd.merge(addressFrame, locationFrame, on=["LOC_GUID"]))
        mergecount = mergecount + 1

    print("\nProcessing addresses...")
    file = filename
    alladdrcount = 0
    skippedcount = 0
    source = "StatsCan NAR"
    provinceIndex = 0
    for df in dataframes:
        print("\nProcessing province ID " + str(provinceCodes[provinceIndex]))
        data = []
        addrcount = 0
        for index, row in df.iterrows():
            if (addrcount % 100 == 0):
                print(" " + str(addrcount) + "   ", end="\r", flush=True)
            # Civic number + optional suffix, e.g. "123" + "A" -> "123A".
            number = ("".join(filter(None, [row["CIVIC_NO"], row["CIVIC_NO_SUFFIX"]]))).strip().upper()
            # Street name + type + directional joined with spaces, blanks dropped.
            street = (" ".join(filter(None, [row["MAIL_STREET_NAME"], row["MAIL_STREET_TYPE"], row["MAIL_STREET_DIR"]]))).strip().upper()
            apt = row["APT_NO_LABEL"].strip().upper()
            if street == "":
                # PO BOX probably
                if row["BU_N_CIVIC_ADD"].startswith("PO BOX "):
                    data.append([row["BU_N_CIVIC_ADD"].replace("PO BOX ", "").strip(), "PO BOX", "", row["MAIL_MUN_NAME"], row["MAIL_PROV_ABVN"], row["MAIL_POSTAL_CODE"], "", row["BG_LATITUDE"], row["BG_LONGITUDE"], source])
                else:
                    skippedcount = skippedcount + 1
            else:
                data.append([number, street, apt, row["MAIL_MUN_NAME"], row["MAIL_PROV_ABVN"], row["MAIL_POSTAL_CODE"], "", row["BG_LATITUDE"], row["BG_LONGITUDE"], source])
            addrcount = addrcount + 1
            if len(data) >= 1000: # Dump to file so we don't use tons of RAM
                out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
                out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
                data = []
        # Flush whatever is left for this province (header only if file is new).
        out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        alladdrcount = alladdrcount + addrcount
        provinceIndex = provinceIndex + 1
    print("\nDone importing NAR! Processed " + str(alladdrcount) + " addresses.")
    print("Skipped " + str(skippedcount) + " invalid mailing addresses.")
    print("Saved to output file " + outfilename)
|
||||||
|
|
||||||
|
def removeDupes(filepath):
    """Remove duplicate and incomplete addresses from an already-ingested CSV.

    Reads ``filepath`` in very large chunks, drops rows missing any required
    field, prefers the duplicate that carries a ZIP+4, and appends survivors
    to ``filepath + ".dedup.csv"``.  Only duplicates within the same chunk
    ("nearby" duplicates) are caught.

    Args:
        filepath: Path to a CSV produced by this script.
    """
    print("Removing duplicate and incomplete addresses from " + filepath)
    chunkcount = 0
    chunksize = 20000000
    # BUGFIX: "plus4" must be included in usecols — the sort below reads it,
    # and without it pd raised KeyError on sort_values(by="plus4").
    for chunk in pd.read_csv(filepath, chunksize=chunksize, keep_default_na=False, dtype="str", usecols=["number", "street", "street2", "city", "state", "zip", "plus4", "latitude", "longitude", "source"]):
        print(".", end="", flush=True)
        # Empty strings become NULL so dropna can see them.
        chunk.replace('', None, inplace=True)
        chunk.dropna(subset=['zip','number','street','city','state','latitude','longitude'], inplace=True)
        chunk.sort_values(by="plus4", ascending=False, inplace=True, na_position="last") # Make sure the address duplicate with a +4 is kept
        chunk.drop_duplicates(subset=["number", "street", "street2", "city", "state", "zip"], keep="first", inplace=True)
        # Output schema intentionally matches the original (no plus4 column).
        chunk.to_csv(filepath + ".dedup.csv", mode='a', index=False, header=not os.path.exists(filepath + ".dedup.csv"), columns=["number","street","street2","city","state","zip","latitude","longitude", "source"])
        chunkcount = chunkcount + 1
    print("\nDone removing duplicates from " + filepath + "! Processed " + str(chunkcount) + " chunks of " + str(chunksize) + " records.")
|
||||||
|
|
||||||
|
def tosqlite(addressfile, dbfile):
    """Load a CSV (optionally gzipped) produced by this script into SQLite.

    Rows are staged in ``addresses_temp`` chunk by chunk, then copied into
    ``addresses`` with INSERT OR IGNORE so the UNIQUE constraint dedupes.
    Returns the number of rows actually inserted (plus rows deleted by the
    final number="0" cleanup — both go through cursor.rowcount).

    Args:
        addressfile: Input CSV path; ".gz" suffix triggers gzip decompression.
        dbfile: SQLite database path to create/append to.

    Returns:
        int: Total rows changed.
    """
    # countrycode is a module-level global — assumed set before this runs
    # (the CLI sets it for --country / nar imports); TODO confirm default.
    global countrycode
    cfg = src.config.get_config()
    print("\nReading addresses from " + addressfile)
    file = addressfile
    if addressfile.endswith(".gz"):
        file = gzip.open(addressfile, 'rb')
    else:
        file = open(addressfile, 'r')

    connection = sqlite3.connect(dbfile)

    cursor = connection.cursor()
    # Destination table; UNIQUE key drives INSERT OR IGNORE deduplication.
    cursor.execute("""CREATE TABLE IF NOT EXISTS `addresses` (
        `zipcode` VARCHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        `latitude` DECIMAL ( 8 , 6 ) NOT NULL,
        `longitude` DECIMAL( 9 , 6 ) NOT NULL,
        `source` VARCHAR( 40 ),
        UNIQUE (zipcode, number, street, street2, country)
    )""")
    # Staging table: same columns, no constraints, recreated each run.
    cursor.execute("DROP TABLE IF EXISTS `addresses_temp`")
    cursor.execute("""CREATE TABLE IF NOT EXISTS `addresses_temp` (
        `zipcode` CHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        `latitude` DECIMAL ( 8 , 6 ) NOT NULL,
        `longitude` DECIMAL( 9 , 6 ) NOT NULL,
        `source` VARCHAR( 40 )
    )""")
    # Lookup indexes for the common query shapes.
    cursor.execute("""CREATE INDEX IF NOT EXISTS `latitude_longitude` ON `addresses` (
        `latitude`,
        `longitude`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `number_street` ON `addresses` (
        `number`,
        `street`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `state_city` ON `addresses` (
        `state`,
        `city`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `zipcode_number` ON `addresses` (
        `zipcode`,
        `number`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `country` ON `addresses` (
        `country`
    )""")

    chunksize = 5000
    chunkcount = 0
    rowschanged = 0
    columns = ["number","street","street2","city","state","zip","latitude","longitude","source"]
    if cfg.appendPlus4:
        columns.append("plus4")
    for chunk in pd.read_csv(file, chunksize=chunksize, usecols=columns, keep_default_na=False, dtype="str"):
        chunk = chunk.rename(columns={'zip': 'zipcode'})
        chunk.insert(7, "country", countrycode)
        # Replace empty values with NULL
        chunk.replace('', None, inplace=True)
        # Replace null street2 with empty string so the SQLite UNIQUE clause will work
        chunk.fillna({"street2": ""}, inplace=True)
        # Remove null values that aren't allowed
        chunk.dropna(subset=['zipcode','number','street','city','state','latitude','longitude'], inplace=True)
        print(" " + str(chunkcount * chunksize) + "   ", end="\r", flush=True)
        # Write chunk to SQLite via the staging table, then merge.
        cursor.execute("DELETE FROM addresses_temp")
        chunk.to_sql("addresses_temp", connection, if_exists='append', index=False, dtype={
            "zipcode": "CHAR(6)",
            "number": "VARCHAR(30)",
            "street": "VARCHAR(200)",
            "street2": "VARCHAR(20)",
            "city": "VARCHAR(50)",
            "state": "CHAR(2)",
            "plus4": "CHAR(4)",
            "country": "CHAR(2)",
            "latitude": "DECIMAL(8,6)",
            "longitude": "DECIMAL(9,6)",
            "source": "VARCHAR(40)"
        })
        chunkcount = chunkcount + 1
        cursor.execute("INSERT OR IGNORE INTO addresses SELECT * FROM addresses_temp")
        rowschanged = rowschanged + cursor.rowcount
        # VACUUM every 5000 chunks (= 25 million rows at chunksize 5000).
        if chunkcount % 5000 == 0:
            print(" Optimizing database...", end="\r", flush=True)
            connection.executescript("VACUUM")
            print("                      ", end="\r", flush=True)
    connection.executescript("DROP TABLE addresses_temp")

    # Addresses with house number "0" are considered invalid and removed.
    cursor.execute("DELETE FROM addresses WHERE number=\"0\"")
    rowschanged = rowschanged + cursor.rowcount
    if rowschanged > 10000000:
        print("\nOptimizing database...")
        connection.executescript("VACUUM; ANALYZE; PRAGMA optimize;")
    print("Done converting to SQLite! Processed " + str(chunkcount) + " chunks (" + str(chunksize) + " records per chunk).")
    print(str(rowschanged) + " records inserted.")
    connection.close()
    print("Saved to output file " + dbfile)
    return rowschanged
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse options, build the run configuration,
    # then dispatch to the importer/converter matching the flags given.
    parser = argparse.ArgumentParser(
        description="Tools to build a standardized U.S. address database from free source data."
    )
    parser.add_argument("file", help="Address file(s) to process.", nargs='+')
    parser.add_argument("--outputfile", help="Filename to output address data to. If unspecified, set to \"./data/out.csv\" or \"./data/out.sqlite\", depending on options set.")
    parser.add_argument(
        "--filetype",
        help="Type of address file to ingest. nad=National Address Database, oa=OpenAddresses, adb=CSV created by this script, osm=OpenStreetMap Overpass API (see main.py source for query to use), nar=Statistics Canada National Address Register",
        choices=["nad", "oa", "adb", "osm", "nar"],
    )
    parser.add_argument("--state", help="Some OpenAddresses files don't have the state field set. Do it manually here.")
    parser.add_argument("--ignorestates", help="Comma-separated two-letter state names. Addresses in these states will be skipped over.")
    parser.add_argument("--onlystates", help="Comma-separated two-letter state names. Addresses NOT in these states will be skipped over.")
    parser.add_argument("--source", help="Set the data source name (OpenAddresses only). Autodetected based on filename if not set.")
    parser.add_argument("--dedup", help="Remove duplicate records in an already-ingested address file, and saves it to folder/file.dedup.csv. Only catches \"nearby\" duplicates; processes 20,000,000 records at a time.", action='store_true')
    parser.add_argument("--fixlatlon", help="Detect and repair flipped latitude/longitude pairs in an already-ingested address file, and saves it to [filename].coordfix.csv.", action='store_true')
    parser.add_argument("--tosqlite", help="Output to a SQLite3 database. Only works on output CSV data from this script.", action='store_true')
    parser.add_argument("--appendplus4", help="Append ZIP+4 data to all records. Fairly slow.", action='store_true')
    parser.add_argument("--appendunitlabel", help="Append unit label (APT, STE, etc) to unit numbers using ZIP+4 data.", action='store_true')
    parser.add_argument("--zipprefix", help="When searching for a ZIP, assume it starts with the digits provided for faster lookups.")
    parser.add_argument("-a", help="Allow appending to existing output file.", action='store_true')
    parser.add_argument("--cpu", help="Number of CPU cores to use for parallel processing.")
    parser.add_argument("--country", help="Two-letter country code. Default is US.")
    parser.add_argument("--city", help="City name to assume when there's no city or postal code in the source data. Useful for OpenAddresses city_of_ data files.")
    parser.add_argument("--startat", help="Skip to this line number in the input file (NAD)")
    parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
    parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
    parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")

    args = parser.parse_args()

    startAtLine = 0

    # Defaults for the run configuration, overridden by flags below.
    appendPlus4 = False
    appendUnitLabel = False
    useCensusToFillEmptyZIPs = False
    countryCode = "US"
    citySuggestion = False
    advancedMode = False
    noSkip4 = False

    if args.libpostal:
        advancedMode = True
        appendPlus4 = True
    if advancedMode:
        # Imported lazily so libpostal is only required when requested.
        from src.advancedparsing import advancedNormalize
        print("Using libpostal to work harder on bad addresses.")

    if args.appendplus4:
        appendPlus4 = True
    if appendPlus4:
        print("Trying to match to ZIP+4 codes for every address!")

    if args.noskip4:
        noSkip4 = True
    if noSkip4:
        print("Also normalizing records that have a +4 in the input data.")

    if args.appendunitlabel:
        appendUnitLabel = True

    useCensusToFillEmptyZIPs = bool(args.census)
    if useCensusToFillEmptyZIPs:
        print("Census geocoder enabled! RIP your network maybe")

    statesToIgnore = []
    if args.ignorestates:
        statesToIgnore = re.sub(r"[^a-zA-Z,]+", "", args.ignorestates.upper()).split(",")
    statesToKeep = []
    if args.onlystates:
        statesToKeep = re.sub(r"[^a-zA-Z,]+", "", args.onlystates.upper()).split(",")

    zipprefix = False
    if args.zipprefix:
        zipprefix = args.zipprefix

    if args.cpu:
        # NOTE(review): when --cpu is absent, maxthreads is presumably defined
        # earlier in this file at module level — confirm.
        maxthreads = int(args.cpu)

    if args.country:
        if len(args.country) != 2:
            print("Invalid country code " + args.country + ", exiting.")
            sys.exit(1)
        countrycode = args.country.upper()
        countryCode = countrycode
    if args.startat and args.startat.isdigit():
        startAtLine = int(args.startat)

    if args.city:
        # BUGFIX: was .toUpper(), which is not a Python str method and raised
        # AttributeError whenever --city was supplied.
        citySuggestion = args.city.strip().upper()

    cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4)

    src.config.set_config(cfg)

    if args.dedup:
        with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads) as executor:
            for file in args.file:
                executor.submit(removeDupes, file)
    elif args.fixlatlon:
        with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads) as executor:
            for file in args.file:
                executor.submit(fixLatLon, file)
    elif args.tosqlite:
        outputfile = "./data/out.sqlite"
        if args.outputfile:
            outputfile = args.outputfile
        # BUGFIX: check the resolved output path, not args.outputfile — the
        # latter is None when --outputfile isn't given.
        if args.a != True and os.path.exists(outputfile):
            print("Output file already exists, exiting!")
            sys.exit()
        rowschanged = 0
        filesimported = 0
        for file in args.file:
            rowschanged = rowschanged + tosqlite(file, outputfile)
            filesimported = filesimported + 1
        print("\nDone importing " + str(filesimported) + " files. " + str(rowschanged) + " records inserted.")
    elif args.file:
        outputfile = "./data/out.csv"
        if args.outputfile:
            outputfile = args.outputfile
        # BUGFIX: same resolved-path check as above.
        if args.a != True and os.path.exists(outputfile):
            print("Output file already exists, exiting!")
            sys.exit()
        if args.filetype == "nad":
            for file in args.file:
                importNadFile(file, outputfile, statesToIgnore, statesToKeep, startAtLine)
        elif args.filetype == "adb":
            for file in args.file:
                importOwnFile(file, outputfile, statesToIgnore, statesToKeep)
        elif args.filetype == "osm":
            for file in args.file:
                importOSMFile(file, outputfile)
        elif args.filetype == "nar":
            countrycode = "CA"
            for file in args.file:
                importNARFile(file, outputfile)
        elif args.filetype == "oa":
            source = ""
            if args.source:
                source = args.source
            for file in args.file:
                importOpenAddressFile(file, outputfile, statesToIgnore, source, args.state, zipprefix)
|
||||||
61
rendermap.py
Executable file
61
rendermap.py
Executable file
@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
Image.MAX_IMAGE_PIXELS = 648000000 # 100 pixels per degree
|
||||||
|
|
||||||
|
def render(filename, outfile, ppd):
    """Draw every address point from a SQLite database onto a world-map PNG.

    Produces two files: ``outfile`` (transparent overlay of green points) and
    ``outfile + ".map.png"`` (overlay composited onto a basemap image).

    Args:
        filename: SQLite database with an ``addresses`` table holding
            ``latitude`` and ``longitude`` columns.
        outfile: Output PNG path for the overlay.
        ppd: Pixels per degree of latitude/longitude.
    """
    print("Creating map overlay")
    pixelsperdegree = ppd
    width = 360 * pixelsperdegree
    height = 180 * pixelsperdegree
    img = Image.new('RGBA', (width, height), (255, 255, 255, 0))
    draw = ImageDraw.Draw(img)

    print("Connecting to database")
    connection = sqlite3.connect(filename)
    c = connection.cursor()

    print("Drawing map overlay")
    c.execute('SELECT longitude, latitude FROM addresses')
    count = 0
    try:
        for (x, y) in c:
            try:
                # BUGFIX: coordinates may come back as TEXT; the original did
                # math on them directly, and the resulting TypeError was
                # silently swallowed, producing a blank map. Convert first.
                x = float(x)
                y = float(y)
                # Latitude outside +/-90 means the pair was stored flipped.
                if y < -90.0 or y > 90.0:
                    x, y = y, x
                px = round((x + 180) * pixelsperdegree)
                py = height - round((y + 90) * pixelsperdegree)
                draw.point((px, py), fill=(0, 255, 0))
            except (TypeError, ValueError):
                # Unparseable coordinate pair — skip this point only.
                pass
            count = count + 1
            if count % 1000 == 0:
                print(" " + str(count) + "   ", end="\r", flush=True)
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt: Stopping draw and saving image early")
    print("\nSaving overlay image")
    img.save(outfile, format="PNG")
    print("Rendering map image")
    # Pick the basemap resolution closest to the requested detail level.
    if (pixelsperdegree > 50):
        basemap = Image.open("basemap-100.png")
    else:
        basemap = Image.open("basemap-50.png")
    Image.alpha_composite(basemap.resize((width, height)), img).save(outfile + ".map.png", format="PNG")
    img.close()
    basemap.close()
    print("Done! Saved map to " + outfile)
|
||||||
|
|
||||||
|
# CLI definition for the map renderer.
parser = ArgumentParser(description='Draw a map of a database\'s address points.')

parser.add_argument('src_db', help='Input SQLite database with "addresses" table containing "latitude" and "longitude" columns')
parser.add_argument('png_filename', help='Output PNG filename.')
# BUGFIX: 'ppd' was declared as a required positional while its default was
# set via set_defaults(), which argparse ignores for required positionals —
# the script always demanded a third argument. nargs='?' makes the default
# of 50 actually apply.
parser.add_argument('ppd', nargs='?', default=50, help='Pixels per degree of latitude/longitude.', type=int)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Parse the CLI and hand everything off to the renderer.
    cli_args = parser.parse_args()
    render(cli_args.src_db, cli_args.png_filename, cli_args.ppd)
|
||||||
51
sqlite-from-sqfull.py
Executable file
51
sqlite-from-sqfull.py
Executable file
@ -0,0 +1,51 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
def process(filename, outfile):
    """Copy core address columns from a full database into a slim "lite" one.

    Rebuilds the ``addresses`` table in ``outfile`` (dropping latitude,
    longitude, and source columns) and copies rows over with INSERT OR IGNORE,
    so the UNIQUE (zipcode, number, street, street2, country) key dedupes.

    Args:
        filename: Path to the "full" source SQLite database.
        outfile: Path to the destination SQLite database (table is recreated).
    """
    print("Connecting to databases")
    connection = sqlite3.connect(filename)
    c = connection.cursor()

    connection2 = sqlite3.connect(outfile)
    c2 = connection2.cursor()

    print("Creating lite database")
    c2.execute("DROP TABLE IF EXISTS `addresses`")
    c2.execute("""CREATE TABLE `addresses` (
        `zipcode` VARCHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        UNIQUE (zipcode, number, street, street2, country)
    )""")
    c2.execute("CREATE INDEX `zipcode_number` ON `addresses` (`zipcode`,`number`)")
    c2.execute("CREATE INDEX `number_street_state` ON `addresses` (`number`,`street`,`state`)")

    print("Copying records")
    c.execute('SELECT zipcode, number, street, street2, city, state, plus4, country FROM addresses')

    count = 0
    for (zipcode, number, street, street2, city, state, plus4, country) in c:
        c2.execute("INSERT OR IGNORE INTO addresses(zipcode, number, street, street2, city, state, plus4, country) VALUES (?,?,?,?,?,?,?,?)", (zipcode, number, street, street2, city, state, plus4, country))
        count = count + 1
        if count % 10000 == 0:
            print(" " + str(count) + "   ", end="\r", flush=True)

    print("\nVacuuming...")
    # executescript() commits the pending insert transaction before VACUUM.
    connection2.executescript("VACUUM")
    # FIX: explicitly commit and close both handles so all writes are durable
    # and file descriptors are released (the original leaked both connections).
    connection2.commit()
    connection2.close()
    connection.close()
    print("Done! Copied " + str(count) + " rows to " + outfile + ".")
|
||||||
|
|
||||||
|
# CLI definition for the full-to-lite database copier.
# BUGFIX: the description was copy-pasted from rendermap.py ("Draw a map of a
# database's address points.") and did not describe this script at all.
parser = ArgumentParser(description='Copy an address database into a smaller "lite" database with fewer columns and indexes.')

parser.add_argument('src_db', help='"Full" SQLite database')
parser.add_argument('dest_db', help='Output database with some columns and indexes removed')
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Parse the CLI and run the copy.
    cli_args = parser.parse_args()
    process(cli_args.src_db, cli_args.dest_db)
|
||||||
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
333
src/addressfunctions.py
Normal file
333
src/addressfunctions.py
Normal file
@ -0,0 +1,333 @@
|
|||||||
|
# Created on : Aug 29, 2024, 12:57:40 AM
|
||||||
|
# Author : Skylar Ittner
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from pythonnet import load
|
||||||
|
from scourgify import NormalizeAddress, normalize_address_record
|
||||||
|
from src.zipfunctions import checkZIPCode, getZIP, getZIP4, getCityStateForZIP
|
||||||
|
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, ValidationException
|
||||||
|
import src.config
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Boot the .NET runtime (Mono) via pythonnet so the Kellerman USPS
# standardization library can be loaded.
load("mono")
import clr

clr.AddReference("KellermanSoftware.USPSStandardization")
from KellermanSoftware.USPSStandardization import StandardizationLogic

# The Kellerman standardizer is optional: if it can't initialize (missing
# assembly, bad license, no runtime), leave it as False and carry on.
standardization = False
try:
    standardization = StandardizationLogic("Netsyms Technologies LLC 203206", "2;770AE30D7A5F217E77C857B29618A6E8DD")
except Exception:
    # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit. NOTE(review): assumes pythonnet surfaces .NET failures as
    # Exception subclasses — confirm with the installed pythonnet version.
    print("Kellerman USPSStandardization failed to initialize, skipping.")


# ZIP code reference table, loaded once at import time as string dtype.
zipcodes = pd.read_csv("zip_code_database.csv", keep_default_na=False, dtype="str")
|
||||||
|
|
||||||
|
# Regex fixups applied to the raw street string BEFORE USPS standardization.
# Keys are regex patterns, values are replacements; applied in insertion order.
# FIX: the dict literal contained " VLY " twice (duplicate key, same value) —
# the duplicate has been removed; the resulting dict is unchanged.
PRE_STANDARDIZATION_STREET_REGEXES = {
    " POINTADDRESS$": "",
    "S U S HWY ": "S US HIGHWAY ",
    "^U S ([0-9]+) HWY": r"US HIGHWAY \1",
    "^U S HWY ": "US HIGHWAY ",
    "[–—−]": "-",  # normalize unicode en/em/minus dashes to ASCII hyphen
    " PW$": " PKWY",
    " VIS ": " VISTA ",
    " VLY ": " VALLEY ",
    " MTN ": " MOUNTAIN ",
    " CTR ": " CENTER ",
    " CLB ": " CLUB ",
    "HBR ": "HARBOR ",
    "^PNE ": "PINE ",
    "^SPG ": "SPRING ",
    "^M L KING JR ": "MARTIN LUTHER KING JR ",
    "^NONE$": "",
    "^VLY ": "VALLEY ",
    "BEN-DIER": "BEN DIER",
    " ROCK RIV$": ""  # Albany county WY misspelled their own city name and put it in the street field
}
|
||||||
|
|
||||||
|
# Regex fixups applied AFTER USPS standardization, in insertion order.
# FIXES: removed three duplicate keys ("UNITED STATES HWY ([0-9]+)",
# "^US ([0-9]+)", " US HWY ([0-9]+)" each appeared twice with identical
# values — the dict is unchanged); patterns containing backslash escapes
# (\s) are now raw strings to avoid SyntaxWarning on Python 3.12+.
POST_STANDARDIZATION_STREET_REGEXES = {
    ", BASE$": "",
    ", BASE CP$": "",
    "UNITED STATES HWY ([0-9]+)": r"US HIGHWAY \1",
    "^U.S. HWY ": "US HIGHWAY ",
    "^U.S. HIGHWAY ": "US HIGHWAY ",
    "^U S ([0-9]+) HWY": r"US HIGHWAY \1",
    "^US ([0-9]+)": r"US HIGHWAY \1",
    " US HWY ([0-9]+)": r" US HIGHWAY \1",
    "UNITED STATES FOREST SERVICE ROAD ([0-9]+) RD$": r"FOREST SERVICE ROAD \1",
    r"^IH?([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    r"^INTERSTATE HWY ([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    r"^I ?([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^I-([0-9]{1,3})$": r"INTERSTATE \1",
    "^([EW]) I-([0-9]{1,3})$": r"\1 INTERSTATE \2",
    "^HWY FM ([0-9]+)": r"FM \1",
    "^FARM TO MARKET ([0-9]+)": r"FM \1",
    " (HIWAY) ([0-9]+)$": r" HWY \2",
    " (RTE|RT) ([0-9]+)$": r" ROUTE \2",
    " RD ([0-9]+)$": r" ROAD \1",
    "^ST (HIGHWAY|ROUTE|ROAD|HWY) ([0-9]+)$": r"STATE \1 \2",
    "^CNTY (HIGHWAY|ROUTE|ROAD|HWY) ([0-9]+)$": r"COUNTY \1 \2",
    "^CR ([0-9]+)": r"COUNTY ROAD \1",
    "^COUNTY RD ([0-9]+) ([NSEW]{1,2})": r"COUNTY ROAD \1 \2",
    "^(SR|ST RD) ([0-9]+)": r"STATE ROAD \2",
    "^(ST RT|ST RTE) ([0-9]+)": r"STATE ROUTE \2",
    "^(HWY|HIWAY) ([0-9]+)": r"HIGHWAY \2",
    "^(RTE|RT) ([0-9]+)": r"ROUTE \2",
    "^RD ([0-9]+)": r"ROAD \1",
    "^TSR ([0-9]+)": r"TOWNSHIP ROAD \1",
    "([0-9]+) BYP RD": r"\1 BYPASS RD",
    "([0-9]+) BYPASS": r"\1 BYP",
    " HIGHWAY ([0-9]+)": r" HWY \1",
    "^(STATE|COUNTY) HWY ": r"\1 HIGHWAY ",
    "UNITED STATES HWY ": "US HIGHWAY ",
    "^US HWY ": r"US HIGHWAY ",
    "^FIRST ": "1ST ",
    "^SECOND ": "2ND ",
    "^THIRD ": "3RD ",
    "^FOURTH ": "4TH ",
    "^FIFTH ": "5TH ",
    "^SIXTH ": "6TH ",
    "^SEVENTH ": "7TH ",
    "^EIGHTH ": "8TH ",
    "^NINTH ": "9TH ",
    "^TENTH ": "10TH ",
    " STREET ST$": " ST",
    " AVENUE AVE$": " AVE",
    " DRIVE DR$": " DR",
    " DR DRIVE$": " DR",
    " PARKS PARK$": " PARK",
    " ROAD RD$": " RD",
    " LK ": " LAKE ",
    " ST ST$": " ST",
    "^(N|S|E|W) COUNTY (ROAD|RD) ([0-9]{3,}) (N|S|E|W)$": r"\1 \3 \4",  # Indiana has "County Road" in NAD as the street name for some reason
    "^COUNTY RD COUNTY ROAD ": "COUNTY ROAD ",
    " CI$": " CIR",
    " CM$": " CMN",
    " BL$": " BLVD",
    " TE$": " TER",
    " LP$": " LOOP",
    "^CRK ([0-9]+)$": r"COUNTY ROAD \1",  # Athens TX does this for some reason
    "^PR ([0-9]+)$": r"PRIVATE ROAD \1",  # Athens TX does this for some reason
    "^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2"  # Athens TX does this too for some reason
}
|
||||||
|
|
||||||
|
# Regex fixups for house numbers, applied in insertion order.
# BUGFIX: the first key was a NON-raw string, so "\1" was parsed as the octal
# escape chr(1) instead of a regex backreference — the pattern could never
# match (the original comment even noted it "doesn't always work for some
# reason"). Raw strings fix that and silence the invalid-escape warning on
# "^\.$" under Python 3.12+.
STANDARDIZATION_NUMBER_REGEXES = {
    r"^([0-9]+) \1$": r"\1",  # Fix address numbers that repeat with a space between
    "^([0-9]+) ([A-Z])$": r"\1\2",  # "1234 A ROAD ST" to "1234A ROAD ST"
    "^0$": "",  # Blank out 0 as a house number
    r"^\.$": "",  # Blank out .
}
|
||||||
|
|
||||||
|
# Alternation of every longhand-street-type abbreviation key, used to spot
# an abbreviation that appears where a full word belongs.
# (Replaces a manual append-loop over .items() that ignored the values;
# ABBREV_PATTERN_LIST is kept for any external users of the name.)
ABBREV_PATTERN_LIST = list(LONGHAND_STREET_TYPES)
ABBREV_PATTERN = "|".join(ABBREV_PATTERN_LIST)
# Matches "<abbrev> <abbrev>" at the end of a street (optionally followed by
# a 1-2 letter directional), e.g. "S CRK RD" — used by postStandardizeStreet
# to unshorten the inner abbreviation. Raw strings avoid the \s escape
# warning on Python 3.12+.
STREET_INNER_ABBREV_FIND_REGEX = re.compile(r"(^|\s)(" + ABBREV_PATTERN + ") (" + ABBREV_PATTERN + r")( [NSEW]{0,2})?$")
|
||||||
|
|
||||||
|
def postStandardizeStreet(street):
    """Apply post-USPS-standardization cleanups to an uppercase street string.

    Runs the POST_STANDARDIZATION_STREET_REGEXES cascade in order (order
    matters — later patterns act on earlier replacements), unshortens an
    inner street-type abbreviation, and applies two USPS Pub 28 state-highway
    rewrites.  Returns the cleaned, stripped street string.
    """
    # Catch edge cases with USPS formatting
    for (find, replace) in POST_STANDARDIZATION_STREET_REGEXES.items():
        street = re.sub(find, replace, street)

    # Unshorten things like "S CRK RD", correcting to "S CREEK RD"
    # (only the first/inner abbreviation is replaced, via count=1).
    matches = STREET_INNER_ABBREV_FIND_REGEX.search(street)
    if matches:
        street = street.replace(matches.group(2), LONGHAND_STREET_TYPES[matches.group(2)], 1)

    # "KY 1234" to "KY HIGHWAY 1234" per Pub 28
    if re.match(r"^[A-Z]{2} [0-9]+$", street):
        for (full, abbr) in STATES.items():
            if street.startswith(abbr + " "):
                street = street.replace(abbr, abbr + " HIGHWAY", 1)
                break
    # "KENTUCKY STATE HIGHWAY 625" to "KY STATE HIGHWAY 625" per Pub 28
    if re.match(r"^[A-Z]{2,} STATE HIGHWAY [0-9]+", street):
        for (full, abbr) in STATES.items():
            if street.startswith(full + " "):
                street = street.replace(full, abbr, 1)
                break

    return street.strip()
|
||||||
|
|
||||||
|
def preStandardizeStreet(street):
    """Apply pre-standardization cleanup regexes and drop a trailing "#unit".

    Runs the PRE_STANDARDIZATION_STREET_REGEXES cascade in insertion order,
    then cuts off everything from a "#" onward — unless the "#" is the very
    first character, in which case the street is left alone.
    """
    for pattern, repl in PRE_STANDARDIZATION_STREET_REGEXES.items():
        street = re.sub(pattern, repl, street)
    # Remove a unit tacked onto the end of the street ("123 MAIN ST #4").
    head, sep, _tail = street.partition("#")
    if sep and head:
        street = head.strip()
    return street
|
||||||
|
|
||||||
|
def standardizeNumber(number):
    """Clean a house number via the standardization regexes and collapse a
    duplicated number ("1234 1234" -> "1234")."""
    for pattern, repl in STANDARDIZATION_NUMBER_REGEXES.items():
        number = re.sub(pattern, repl, number)

    # Detect "1234 1234" which some sources have sometimes (like Kentucky NAD v20)
    pieces = number.split(" ")
    if len(pieces) == 2 and pieces[0] == pieces[1]:
        return pieces[0].strip()

    return number.strip()
|
||||||
|
|
||||||
|
def splitNumberAndUnit(number):
    """Split a primary address like "123 APT 4" into a standardized number
    and a cleaned unit string.

    Some places have unit numbers in the primary address; the first unit
    designator found (after position 0) marks where the unit begins.
    """
    num = number
    unit = ""
    for designator in UNITS:
        idx = number.find(designator)
        if idx <= 0:
            continue
        num, unit = number[:idx], number[idx:]
        break
    return standardizeNumber(num), removeUnitText(unit)
|
||||||
|
|
||||||
|
def removeUnitText(subaddr):
    """Strip unit designators ("APT", "STE", "#", ...) from a secondary
    address line, leaving just the unit identifier.

    None is passed through unchanged so callers can forward missing values.
    """
    if subaddr is None:  # identity check; "== None" is unidiomatic
        return subaddr
    subaddr = subaddr.upper()
    for label in UNITS:
        subaddr = subaddr.replace(label, "")
    # Collapse any whitespace runs left behind by the removals.  The old
    # single replace("  ", " ") only shrank a run by one space, so three or
    # more consecutive spaces survived.
    return " ".join(subaddr.split())
|
||||||
|
|
||||||
|
def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4="", county=False):
    """Standardize one address record into a dict of clean USPS-style fields.

    Returns a dict with keys number/street/unit/city/state/zip/plus4/
    latitude/longitude.

    Raises:
        ValidationException: when the record has no house number or no street.
    """
    cfg = src.config.get_config()

    if not number:
        #print("W: No address number for address, skipping "+street)
        raise ValidationException("No address number")

    if not street:
        #print("W: No street for address, skipping")
        # Bug fix: this branch previously raised "No address number" (copy-paste
        # error); the message now matches the actual failure.
        raise ValidationException("No street")

    # Detect flipped coordinates: a latitude outside [-90, 90] means the
    # source swapped lat/lon.
    if lat < -90 or lat > 90:
        lon, lat = lat, lon

    number = standardizeNumber(str(number).upper().strip())
    street = preStandardizeStreet(street.strip().upper())
    unit = unit.strip().upper()
    city = city.strip().upper()
    state = state.strip().upper()
    zipcode = (zipcode or "").strip()
    plus4 = plus4.strip()

    if (not city or city == "") and (not zipcode or zipcode == "") and cfg.citySuggestion:
        # Use the city specified on the CLI, hopefully it'll help
        city = cfg.citySuggestion

    if unit == "":
        number, unit = splitNumberAndUnit(number)

    # Drop municipal-prefix noise from the city name.
    city = city.replace("CITY OF ", "").replace("TOWN OF ", "").replace("VILLAGE OF ", "")
    city = city.replace("UNINCORPORATED", "")

    #
    # Standardize address
    #
    try:
        # Python library.  The street is prefixed with a sentinel house number
        # ("999999999") so the parser sees a complete address line; the real
        # number is swapped back in below.
        addr = normalize_address_record(
            {
                "address_line_1": "".join(["999999999", " ", street]),
                "address_line_2": unit,
                "city": city,
                "state": state,
                "postal_code": zipcode
            }
        )
    except Exception as e:
        try:
            # Proprietary Mono library fallback, when available.
            if standardization == False:
                raise e
            addr = {
                "address_line_1": "999999999 " + standardization.StandardizeStreetAddress(street),
                "address_line_2": unit,
                "city": city,
                "state": state,
                "postal_code": zipcode
            }
        except Exception as ex:
            # This basically never happens
            print("W: Couldn't parse address:")
            print(ex)
            raise ex

    #
    # Remove number from street address field
    #
    addr['address_line_1'] = addr['address_line_1'].replace("999999999", number)
    streetonly = addr['address_line_1']
    if streetonly.startswith(str(number) + " "):
        streetonly = streetonly[len(str(number) + " "):]

    #
    # Run extra regexes on street to fix standardization problems
    #
    streetonly = postStandardizeStreet(streetonly)

    #
    # Special conditional rules
    #
    if addr["state"] == "PR":
        # Puerto Rico special rules: alphanumeric house numbers drop hyphens.
        if re.match("[A-Z]", number):
            number = number.replace("-", "")

    #
    # Clean second line
    #
    addr['address_line_2'] = removeUnitText(addr['address_line_2'])

    #
    # Standardize and validate and/or append ZIP Code
    #
    zipcode = addr["postal_code"]
    unitprefix = ""
    unit = addr['address_line_2']

    if zipcode is not None:
        zipcode = addr["postal_code"][0:5]

    # Skip these if we already have a ZIP+4 code, assume it's accurate
    if zipcode is not None and len(zipcode) == 5 and not plus4:
        zipinfo = getCityStateForZIP(zipcode)
        if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
            zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county)
            zipinfo = getCityStateForZIP(zipcode)
            if zipinfo != False:
                addr["city"] = zipinfo["city"]
                addr["state"] = zipinfo["state"]
        else:
            # zipinfo is known-good here (zipinfo == False would have taken
            # the branch above), so trust its city/state.
            addr["city"] = zipinfo["city"]
            addr["state"] = zipinfo["state"]
    elif not plus4:
        zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county)
        zipinfo = getCityStateForZIP(zipcode)
        if zipinfo != False:
            addr["city"] = zipinfo["city"]
            addr["state"] = zipinfo["state"]

    # "UNITED STATES HWY" with the route number mis-parsed into the unit field
    # gets reassembled into the Pub 28 form.
    if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
        streetonly = f"US HIGHWAY {unit}"
        unit = ""

    #if not src.config.appendPlus4:
    #    plus4 = ""

    return {
        "number": number,
        "street": streetonly,
        "unit": ' '.join(filter(None, (unitprefix, unit))),
        "city": addr["city"],
        "state": addr["state"],
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    }
|
||||||
109
src/advancedparsing.py
Normal file
109
src/advancedparsing.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
# Use "AI" to parse problem addresses and find more matches
|
||||||
|
# First expand the address to possible forms, then normalize each one, and keep the one that has a ZIP+4
|
||||||
|
|
||||||
|
from postal.parser import parse_address
|
||||||
|
from postal.expand import expand_address
|
||||||
|
from src.addressfunctions import normalizeAddress
|
||||||
|
import re
|
||||||
|
|
||||||
|
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""):
    """Use libpostal parsing/expansion to generate candidate normalizations
    and return the best-scoring one.

    First expand the address to possible forms, then normalize each one, and
    keep the candidate that fills the most fields (a ZIP+4 hit weighs most).
    """
    if len(plus4 or "") == 4:
        # Return as-is, it's got a +4 match already
        return {
            "number": number,
            "street": street,
            "unit": unit,
            "city": city,
            "state": state,
            "zip": zipcode,
            "plus4": plus4,
            "latitude": lat,
            "longitude": lon
        }

    # Merge and re-split the address to catch odd things like the street
    # having the city and zip too
    parsed = parse_address(f"{number} {street}, {city} {state} {zipcode}")
    pNumber = number
    pStreet = street
    pUnit = unit
    pCity = city
    pState = state
    pZip = zipcode
    for part in parsed:
        if part[1] == "house_number" and (pNumber == "" or pNumber == number): # Don't overwrite it with values found later, which might be a zip code or something
            pNumber = part[0].upper()
        elif part[1] == "road":
            pStreet = part[0].upper()
        elif part[1] == "unit":
            pUnit = part[0].upper()
        elif part[1] == "city":
            pCity = part[0].upper()
        elif part[1] == "state":
            pState = part[0].upper()
        elif part[1] == "postcode":
            pZip = part[0].upper()

    # Expand the number/street to all possible forms
    expanded = expand_address(f"{pNumber} {pStreet}")

    normalizedMatches = []
    # Add the original address as a candidate so if no better matches come up,
    # it'll probably just use it as-is
    normalizedMatches.append({
        "number": number,
        "street": street,
        "unit": unit,
        "city": city,
        "state": state,
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    })

    # Also add candidates with non-numeric data stripped from the number and
    # unit fields.  Bug fix: these calls were unguarded — normalizeAddress()
    # raises ValidationException when the stripped number comes out empty
    # (e.g. a purely alphabetic house number), which aborted the whole
    # advanced pass even though the as-is candidate above exists.  Guard them
    # like the expansion loop below.
    try:
        normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
    except Exception:
        pass
    if number != pNumber or unit != pUnit or street != pStreet:
        try:
            normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
        except Exception:
            pass

    for exp in expanded:
        parsed = parse_address(exp)
        pN = ""
        pS = ""
        pU = ""
        for part in parsed:
            if part[1] == "house_number" and pN == "":
                pN = part[0]
            elif part[1] == "road":
                pS = part[0]
            elif part[1] == "unit":
                pU = part[0]
        try:
            normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4))
        except Exception:
            # A candidate that fails validation is simply discarded.
            pass

    if len(normalizedMatches) > 1:
        # Rank candidates by which fields they managed to populate; a ZIP+4
        # match is the strongest signal of a good normalization.
        weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
        sortedMatches = sorted(
            normalizedMatches,
            key=lambda item: sum(weights[k] for k, v in item.items() if v),
            reverse=True
        )
        return sortedMatches[0]
    elif len(normalizedMatches) == 1:
        return normalizedMatches[0]

    # No matches, give up on the whole thing.  (Unreachable in practice since
    # the original record is always appended above; kept as a safety net.)
    return {
        "number": number,
        "street": street,
        "unit": unit,
        "city": city,
        "state": state,
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    }
|
||||||
|
|
||||||
|
|
||||||
23
src/config.py
Normal file
23
src/config.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class AppConfig:
    """Immutable run-time configuration shared across the application.

    Set once at startup via set_config() and read via get_config().
    """
    # Look up and append ZIP+4 codes during normalization
    appendPlus4: bool
    # Re-query ZIP data when a unit is present so its label can be appended
    appendUnitLabel: bool
    countryCode: str
    # NOTE(review): declared bool but assigned directly into the city field
    # by normalizeAddress() when city/zip are empty — presumably a str (or
    # False when unset); confirm the intended type.
    citySuggestion: bool
    useCensusToFillEmptyZIPs: bool
    advancedMode: bool
    noSkip4: bool
|
||||||
|
|
||||||
|
_CFG: Optional[AppConfig] = None
|
||||||
|
|
||||||
|
def set_config(cfg: AppConfig) -> None:
    """Install the process-wide configuration object."""
    global _CFG
    _CFG = cfg # set once at start (or in child initializer)
|
||||||
|
|
||||||
|
def get_config() -> AppConfig:
    """Return the installed AppConfig, raising if set_config() was never called."""
    cfg = _CFG
    if cfg is None:
        raise RuntimeError("Config not initialized yet")
    return cfg
|
||||||
291
src/constants.py
Normal file
291
src/constants.py
Normal file
@ -0,0 +1,291 @@
|
|||||||
|
|
||||||
|
class ValidationException(Exception):
    """Raised when an address record fails basic validation."""
|
||||||
|
|
||||||
|
LONGHAND_STREET_TYPES = {
|
||||||
|
'ALY': 'ALLEY',
|
||||||
|
'ANX': 'ANNEX',
|
||||||
|
'ARC': 'ARCADE',
|
||||||
|
'AV': 'AVENUE',
|
||||||
|
'AVE': 'AVENUE',
|
||||||
|
'BYU': 'BAYOU',
|
||||||
|
'BCH': 'BEACH',
|
||||||
|
'BND': 'BEND',
|
||||||
|
'BLF': 'BLUFF',
|
||||||
|
'BLFS': 'BLUFFS',
|
||||||
|
'BTM': 'BOTTOM',
|
||||||
|
'BLVD': 'BOULEVARD',
|
||||||
|
'BL': 'BOULEVARD',
|
||||||
|
'BR': 'BRANCH',
|
||||||
|
'BRG': 'BRIDGE',
|
||||||
|
'BRK': 'BROOK',
|
||||||
|
'BRKS': 'BROOKS',
|
||||||
|
'BGS': 'BURGS',
|
||||||
|
'BYP': 'BYPASS',
|
||||||
|
'CP': 'CAMP',
|
||||||
|
'CYN': 'CANYON',
|
||||||
|
'CPE': 'CAPE',
|
||||||
|
'CSWY': 'CAUSEWAY',
|
||||||
|
'CTR': 'CENTER',
|
||||||
|
'CTRS': 'CENTERS',
|
||||||
|
'CI': 'CIRCLE',
|
||||||
|
'CIR': 'CIRCLE',
|
||||||
|
'CIRS': 'CIRCLES',
|
||||||
|
'CLF': 'CLIFF',
|
||||||
|
'CLFS': 'CLIFFS',
|
||||||
|
'CMN': 'COMMON',
|
||||||
|
'CM': 'COMMON',
|
||||||
|
'COR': 'CORNER',
|
||||||
|
'CORS': 'CORNERS',
|
||||||
|
'CRSE': 'COURSE',
|
||||||
|
'CT': 'COURT',
|
||||||
|
'CTS': 'COURTS',
|
||||||
|
'CVS': 'COVES',
|
||||||
|
'CRK': 'CREEK',
|
||||||
|
'CRES': 'CRESCENT',
|
||||||
|
'CRST': 'CREST',
|
||||||
|
'XING': 'CROSSING',
|
||||||
|
'XRD': 'CROSSROAD',
|
||||||
|
'CURV': 'CURVE',
|
||||||
|
'DL': 'DALE',
|
||||||
|
'DM': 'DAM',
|
||||||
|
'DV': 'DIVIDE',
|
||||||
|
'DR': 'DRIVE',
|
||||||
|
'DRS': 'DRIVES',
|
||||||
|
'EST': 'ESTATE',
|
||||||
|
'ESTS': 'ESTATES',
|
||||||
|
'EXPY': 'EXPRESSWAY',
|
||||||
|
'EXT': 'EXTENSION',
|
||||||
|
'EXTS': 'EXTENSIONS',
|
||||||
|
'FALL': 'FALL',
|
||||||
|
'FLS': 'FALLS',
|
||||||
|
'FRY': 'FERRY',
|
||||||
|
'FLD': 'FIELD',
|
||||||
|
'FLDS': 'FIELDS',
|
||||||
|
'FLT': 'FLAT',
|
||||||
|
'FLTS': 'FLATS',
|
||||||
|
'FRD': 'FORD',
|
||||||
|
'FRDS': 'FORDS',
|
||||||
|
'FRST': 'FORESTS',
|
||||||
|
'FRG': 'FORGE',
|
||||||
|
'FRGS': 'FORGES',
|
||||||
|
'FRK': 'FORK',
|
||||||
|
'FRKS': 'FORKS',
|
||||||
|
'FT': 'FORT',
|
||||||
|
'FWY': 'FREEWAY',
|
||||||
|
'GDN': 'GARDEN',
|
||||||
|
'GDNS': 'GARDENS',
|
||||||
|
'GTWY': 'GATEWAY',
|
||||||
|
'GLN': 'GLEN',
|
||||||
|
'GLNS': 'GLENS',
|
||||||
|
'GRNS': 'GREENS',
|
||||||
|
'GRV': 'GROVE',
|
||||||
|
'GRVS': 'GROVES',
|
||||||
|
'HBR': 'HARBOR',
|
||||||
|
'HBRS': 'HARBORS',
|
||||||
|
'HVN': 'HAVEN',
|
||||||
|
'HTS': 'HEIGHTS',
|
||||||
|
'HWY': 'HIGHWAY',
|
||||||
|
'HL': 'HILL',
|
||||||
|
'HLS': 'HILLS',
|
||||||
|
'HOLW': 'HOLLOW',
|
||||||
|
'INLT': 'INLET',
|
||||||
|
'IS': 'ISLAND',
|
||||||
|
'ISS': 'ISLANDS',
|
||||||
|
'ISLE': 'ISLE',
|
||||||
|
'JCT': 'JUNCTION',
|
||||||
|
'JCTS': 'JUNCTIONS',
|
||||||
|
'KY': 'KEY',
|
||||||
|
'KYS': 'KEYS',
|
||||||
|
'KNL': 'KNOLL',
|
||||||
|
'KNLS': 'KNOLLS',
|
||||||
|
'LK': 'LAKE',
|
||||||
|
'LKS': 'LAKES',
|
||||||
|
'LAND': 'LAND',
|
||||||
|
'LNDG': 'LANDING',
|
||||||
|
'LN': 'LANE',
|
||||||
|
'LGT': 'LIGHT',
|
||||||
|
'LGTS': 'LIGHTS',
|
||||||
|
'LF': 'LOAF',
|
||||||
|
'LCK': 'LOCK',
|
||||||
|
'LCKS': 'LOCKS',
|
||||||
|
'LDG': 'LODGE',
|
||||||
|
'LOOP': 'LOOP',
|
||||||
|
'LP': 'LOOP',
|
||||||
|
'MALL': 'MALL',
|
||||||
|
'MNR': 'MANOR',
|
||||||
|
'MNRS': 'MANORS',
|
||||||
|
'MDW': 'MEADOW',
|
||||||
|
'MDWS': 'MEADOWS',
|
||||||
|
'MEWS': 'MEWS',
|
||||||
|
'ML': 'MILL',
|
||||||
|
'MLS': 'MILLS',
|
||||||
|
'MSN': 'MISSION',
|
||||||
|
'MTWY': 'MOTORWAY',
|
||||||
|
'MT': 'MOUNT',
|
||||||
|
'MTN': 'MOUNTAIN',
|
||||||
|
'MTNS': 'MOUNTAINS',
|
||||||
|
'NCK': 'NECK',
|
||||||
|
'ORCH': 'ORCHARD',
|
||||||
|
'OVAL': 'OVAL',
|
||||||
|
'OPAS': 'OVERPASS',
|
||||||
|
'PARK': 'PARKS',
|
||||||
|
'PKWY': 'PARKWAY',
|
||||||
|
'PASS': 'PASS',
|
||||||
|
'PSGE': 'PASSAGE',
|
||||||
|
'PATH': 'PATHS',
|
||||||
|
'PIKE': 'PIKES',
|
||||||
|
'PNE': 'PINE',
|
||||||
|
'PNES': 'PINES',
|
||||||
|
'PL': 'PLACE',
|
||||||
|
'PLN': 'PLAIN',
|
||||||
|
'PLNS': 'PLAINS',
|
||||||
|
'PLZ': 'PLAZA',
|
||||||
|
'PT': 'POINT',
|
||||||
|
'PTS': 'POINTS',
|
||||||
|
'PRT': 'PORT',
|
||||||
|
'PRTS': 'PORTS',
|
||||||
|
'PR': 'PRAIRIE',
|
||||||
|
'PW': 'PARKWAY',
|
||||||
|
'RADL': 'RADIAL',
|
||||||
|
'RAMP': 'RAMP',
|
||||||
|
'RNCH': 'RANCH',
|
||||||
|
'RPD': 'RAPID',
|
||||||
|
'RPDS': 'RAPIDS',
|
||||||
|
'RST': 'REST',
|
||||||
|
'RDG': 'RIDGE',
|
||||||
|
'RDGS': 'RIDGES',
|
||||||
|
'RIV': 'RIVER',
|
||||||
|
'RD': 'ROAD',
|
||||||
|
'RDS': 'ROADS',
|
||||||
|
'RTE': 'ROUTE',
|
||||||
|
'ROW': 'ROW',
|
||||||
|
'RUE': 'RUE',
|
||||||
|
'RUN': 'RUN',
|
||||||
|
'SHL': 'SHOAL',
|
||||||
|
'SHLS': 'SHOALS',
|
||||||
|
'SHR': 'SHORE',
|
||||||
|
'SHRS': 'SHORES',
|
||||||
|
'SKWY': 'SKYWAY',
|
||||||
|
'SPG': 'SPRING',
|
||||||
|
'SPGS': 'SPRINGS',
|
||||||
|
'SPUR': 'SPURS',
|
||||||
|
'SQ': 'SQUARE',
|
||||||
|
'SQS': 'SQUARES',
|
||||||
|
'STA': 'STATION',
|
||||||
|
'STRA': 'STRAVENUE',
|
||||||
|
'STRM': 'STREAM',
|
||||||
|
'ST': 'STREET',
|
||||||
|
'STS': 'STREETS',
|
||||||
|
'SMT': 'SUMMIT',
|
||||||
|
'TER': 'TERRACE',
|
||||||
|
'TRWY': 'THROUGHWAY',
|
||||||
|
'TRCE': 'TRACE',
|
||||||
|
'TRAK': 'TRACK',
|
||||||
|
'TRFY': 'TRAFFICWAY',
|
||||||
|
'TRL': 'TRAIL',
|
||||||
|
'TUNL': 'TUNNEL',
|
||||||
|
'TPKE': 'TURNPIKE',
|
||||||
|
'UPAS': 'UNDERPASS',
|
||||||
|
'UN': 'UNION',
|
||||||
|
'UNS': 'UNIONS',
|
||||||
|
'VLY': 'VALLEY',
|
||||||
|
'VLYS': 'VALLEYS',
|
||||||
|
'VIA': 'VIADUCT',
|
||||||
|
'VW': 'VIEW',
|
||||||
|
'VWS': 'VIEWS',
|
||||||
|
'VLG': 'VILLAGE',
|
||||||
|
'VLGS': 'VILLAGES',
|
||||||
|
'VL': 'VILLE',
|
||||||
|
'VIS': 'VISTA',
|
||||||
|
'WALK': 'WALK',
|
||||||
|
'WALL': 'WALL',
|
||||||
|
'WAY': 'WAY',
|
||||||
|
'WL': 'WELL',
|
||||||
|
'WLS': 'WELLS'
|
||||||
|
}
|
||||||
|
|
||||||
|
UNITS = [
|
||||||
|
'APT',
|
||||||
|
'BLDG',
|
||||||
|
'BUILDING',
|
||||||
|
'BSMT',
|
||||||
|
'DEPT',
|
||||||
|
'FL',
|
||||||
|
'FRNT',
|
||||||
|
'HNGR',
|
||||||
|
'KEY',
|
||||||
|
'LBBY',
|
||||||
|
'LOT',
|
||||||
|
'LOWR',
|
||||||
|
'OFC',
|
||||||
|
'PH',
|
||||||
|
'PIER',
|
||||||
|
'REAR',
|
||||||
|
'RM',
|
||||||
|
'SIDE',
|
||||||
|
'SLIP',
|
||||||
|
'SPC',
|
||||||
|
'STOP',
|
||||||
|
'STE',
|
||||||
|
'TRLR',
|
||||||
|
'UNIT',
|
||||||
|
'UPPER',
|
||||||
|
'#',
|
||||||
|
'BASE', # Not a real unit designator but appears in some NAD AZ data for some reason
|
||||||
|
'(VACANT)' # One dataset does this...
|
||||||
|
]
|
||||||
|
|
||||||
|
STATES = {
|
||||||
|
"ALABAMA": "AL",
|
||||||
|
"ALASKA": "AK",
|
||||||
|
"ARIZONA": "AZ",
|
||||||
|
"ARKANSAS": "AR",
|
||||||
|
"CALIFORNIA": "CA",
|
||||||
|
"COLORADO": "CO",
|
||||||
|
"CONNECTICUT": "CT",
|
||||||
|
"DELAWARE": "DE",
|
||||||
|
"DISTRICT OF COLUMBIA": "DC",
|
||||||
|
"FLORIDA": "FL",
|
||||||
|
"GEORGIA": "GA",
|
||||||
|
"HAWAII": "HI",
|
||||||
|
"IDAHO": "ID",
|
||||||
|
"ILLINOIS": "IL",
|
||||||
|
"INDIANA": "IN",
|
||||||
|
"IOWA": "IA",
|
||||||
|
"KANSAS": "KS",
|
||||||
|
"KENTUCKY": "KY",
|
||||||
|
"LOUISIANA": "LA",
|
||||||
|
"MAINE": "ME",
|
||||||
|
"MONTANA": "MT",
|
||||||
|
"NEBRASKA": "NE",
|
||||||
|
"NEVADA": "NV",
|
||||||
|
"NEW HAMPSHIRE": "NH",
|
||||||
|
"NEW JERSEY": "NJ",
|
||||||
|
"NEW MEXICO": "NM",
|
||||||
|
"NEW YORK": "NY",
|
||||||
|
"NORTH CAROLINA": "NC",
|
||||||
|
"NORTH DAKOTA": "ND",
|
||||||
|
"OHIO": "OH",
|
||||||
|
"OKLAHOMA": "OK",
|
||||||
|
"OREGON": "OR",
|
||||||
|
"MARYLAND": "MD",
|
||||||
|
"MASSACHUSETTS": "MA",
|
||||||
|
"MICHIGAN": "MI",
|
||||||
|
"MINNESOTA": "MN",
|
||||||
|
"MISSISSIPPI": "MS",
|
||||||
|
"MISSOURI": "MO",
|
||||||
|
"PENNSYLVANIA": "PA",
|
||||||
|
"RHODE ISLAND": "RI",
|
||||||
|
"SOUTH CAROLINA": "SC",
|
||||||
|
"SOUTH DAKOTA": "SD",
|
||||||
|
"TENNESSEE": "TN",
|
||||||
|
"TEXAS": "TX",
|
||||||
|
"UTAH": "UT",
|
||||||
|
"VERMONT": "VT",
|
||||||
|
"VIRGINIA": "VA",
|
||||||
|
"WASHINGTON": "WA",
|
||||||
|
"WEST VIRGINIA": "WV",
|
||||||
|
"WISCONSIN": "WI",
|
||||||
|
"WYOMING": "WY"
|
||||||
|
}
|
||||||
614
src/streetcleaner.py
Normal file
614
src/streetcleaner.py
Normal file
@ -0,0 +1,614 @@
|
|||||||
|
# Created on : Aug 28, 2024, 11:58:00 PM
|
||||||
|
# Author : Skylar Ittner
|
||||||
|
|
||||||
|
# Spelled-out directionals (English and Spanish) mapped to their USPS
# one/two-letter abbreviations.
# NOTE(review): insertion order lists "EAST" before "NORTHEAST" etc.; a
# consumer doing plain substring replacement in this order would clip the
# compound forms ("NORTHEAST" -> "NORTHE") — confirm callers replace whole
# tokens only.
DIRECTIONAL_REPLACEMENTS = {
    'EAST': 'E',
    'WEST': 'W',
    'NORTH': 'N',
    'SOUTH': 'S',
    'NORTHEAST': 'NE',
    'NORTHWEST': 'NW',
    'SOUTHEAST': 'SE',
    'SOUTHWEST': 'SW',
    'ESTE': 'E',
    'OESTE': 'W',
    'NORTE': 'N',
    'SUR': 'S',
    'NORESTE': 'NE',
    'NOROESTE': 'NW',
    'SURESTE': 'SE',
    'SUROESTE': 'SW'
}
|
||||||
|
|
||||||
|
STREET_TYPE_ABBREVIATIONS = {
|
||||||
|
'ALLEE': 'ALY',
|
||||||
|
'ALLEY': 'ALY',
|
||||||
|
'ALLY': 'ALY',
|
||||||
|
'ALY': 'ALY',
|
||||||
|
'ANEX': 'ANX',
|
||||||
|
'ANNEX': 'ANX',
|
||||||
|
'ANNX': 'ANX',
|
||||||
|
'ANX': 'ANX',
|
||||||
|
'ARC': 'ARC',
|
||||||
|
'ARCADE': 'ARC',
|
||||||
|
'AV': 'AVE',
|
||||||
|
'AVE': 'AVE',
|
||||||
|
'AVEN': 'AVE',
|
||||||
|
'AVENU': 'AVE',
|
||||||
|
'AVENUE': 'AVE',
|
||||||
|
'AVN': 'AVE',
|
||||||
|
'AVNUE': 'AVE',
|
||||||
|
'BAYOO': 'BYU',
|
||||||
|
'BAYOU': 'BYU',
|
||||||
|
'BCH': 'BCH',
|
||||||
|
'BEACH': 'BCH',
|
||||||
|
'BEND': 'BND',
|
||||||
|
'BND': 'BND',
|
||||||
|
'BLF': 'BLF',
|
||||||
|
'BLUF': 'BLF',
|
||||||
|
'BLUFF': 'BLF',
|
||||||
|
'BLUFFS': 'BLFS',
|
||||||
|
'BOT': 'BTM',
|
||||||
|
'BOTTM': 'BTM',
|
||||||
|
'BOTTOM': 'BTM',
|
||||||
|
'BTM': 'BTM',
|
||||||
|
'BLVD': 'BLVD',
|
||||||
|
'BOUL': 'BLVD',
|
||||||
|
'BOULEVARD': 'BLVD',
|
||||||
|
'BOULV': 'BLVD',
|
||||||
|
'BR': 'BR',
|
||||||
|
'BRANCH': 'BR',
|
||||||
|
'BRNCH': 'BR',
|
||||||
|
'BRDGE': 'BRG',
|
||||||
|
'BRG': 'BRG',
|
||||||
|
'BRIDGE': 'BRG',
|
||||||
|
'BRK': 'BRK',
|
||||||
|
'BROOK': 'BRK',
|
||||||
|
'BROOKS': 'BRKS',
|
||||||
|
'BURG': 'BG',
|
||||||
|
'BURGS': 'BGS',
|
||||||
|
'BYP': 'BYP',
|
||||||
|
'BYPA': 'BYP',
|
||||||
|
'BYPAS': 'BYP',
|
||||||
|
'BYPASS': 'BYP',
|
||||||
|
'BYPS': 'BYP',
|
||||||
|
'CAMP': 'CP',
|
||||||
|
'CMP': 'CP',
|
||||||
|
'CP': 'CP',
|
||||||
|
'CANYN': 'CYN',
|
||||||
|
'CANYON': 'CYN',
|
||||||
|
'CNYN': 'CYN',
|
||||||
|
'CYN': 'CYN',
|
||||||
|
'CAPE': 'CPE',
|
||||||
|
'CPE': 'CPE',
|
||||||
|
'CAUSEWAY': 'CSWY',
|
||||||
|
'CAUSWAY': 'CSWY',
|
||||||
|
'CSWY': 'CSWY',
|
||||||
|
'CEN': 'CTR',
|
||||||
|
'CENT': 'CTR',
|
||||||
|
'CENTER': 'CTR',
|
||||||
|
'CENTR': 'CTR',
|
||||||
|
'CENTRE': 'CTR',
|
||||||
|
'CNTER': 'CTR',
|
||||||
|
'CNTR': 'CTR',
|
||||||
|
'CTR': 'CTR',
|
||||||
|
'CENTERS': 'CTRS',
|
||||||
|
'CIR': 'CIR',
|
||||||
|
'CIRC': 'CIR',
|
||||||
|
'CIRCL': 'CIR',
|
||||||
|
'CIRCLE': 'CIR',
|
||||||
|
'CRCL': 'CIR',
|
||||||
|
'CRCLE': 'CIR',
|
||||||
|
'CIRCLES': 'CIRS',
|
||||||
|
'CLF': 'CLF',
|
||||||
|
'CLIFF': 'CLF',
|
||||||
|
'CLFS': 'CLFS',
|
||||||
|
'CLIFFS': 'CLFS',
|
||||||
|
'CLB': 'CLB',
|
||||||
|
'CLUB': 'CLB',
|
||||||
|
'COMMON': 'CMN',
|
||||||
|
'COR': 'COR',
|
||||||
|
'CORNER': 'COR',
|
||||||
|
'CORNERS': 'CORS',
|
||||||
|
'CORS': 'CORS',
|
||||||
|
'COURSE': 'CRSE',
|
||||||
|
'CRSE': 'CRSE',
|
||||||
|
'COURT': 'CT',
|
||||||
|
'CRT': 'CT',
|
||||||
|
'CT': 'CT',
|
||||||
|
'COURTS': 'CTS',
|
||||||
|
'COVE': 'CV',
|
||||||
|
'CV': 'CV',
|
||||||
|
'COVES': 'CVS',
|
||||||
|
'CK': 'CRK',
|
||||||
|
'CR': 'CRK',
|
||||||
|
'CREEK': 'CRK',
|
||||||
|
'CRK': 'CRK',
|
||||||
|
'CRECENT': 'CRES',
|
||||||
|
'CRES': 'CRES',
|
||||||
|
'CRESCENT': 'CRES',
|
||||||
|
'CRESENT': 'CRES',
|
||||||
|
'CRSCNT': 'CRES',
|
||||||
|
'CRSENT': 'CRES',
|
||||||
|
'CRSNT': 'CRES',
|
||||||
|
'CREST': 'CRST',
|
||||||
|
'CROSSING': 'XING',
|
||||||
|
'CRSSING': 'XING',
|
||||||
|
'CRSSNG': 'XING',
|
||||||
|
'XING': 'XING',
|
||||||
|
'CROSSROAD': 'XRD',
|
||||||
|
'CURVE': 'CURV',
|
||||||
|
'DALE': 'DL',
|
||||||
|
'DL': 'DL',
|
||||||
|
'DAM': 'DM',
|
||||||
|
'DM': 'DM',
|
||||||
|
'DIV': 'DV',
|
||||||
|
'DIVIDE': 'DV',
|
||||||
|
'DV': 'DV',
|
||||||
|
'DVD': 'DV',
|
||||||
|
'DR': 'DR',
|
||||||
|
'DRIV': 'DR',
|
||||||
|
'DRIVE': 'DR',
|
||||||
|
'DRV': 'DR',
|
||||||
|
'DRIVES': 'DRS',
|
||||||
|
'EST': 'EST',
|
||||||
|
'ESTATE': 'EST',
|
||||||
|
'ESTATES': 'ESTS',
|
||||||
|
'ESTS': 'ESTS',
|
||||||
|
'EXP': 'EXPY',
|
||||||
|
'EXPR': 'EXPY',
|
||||||
|
'EXPRESS': 'EXPY',
|
||||||
|
'EXPRESSWAY': 'EXPY',
|
||||||
|
'EXPW': 'EXPY',
|
||||||
|
'EXPY': 'EXPY',
|
||||||
|
'EXT': 'EXT',
|
||||||
|
'EXTENSION': 'EXT',
|
||||||
|
'EXTN': 'EXT',
|
||||||
|
'EXTNSN': 'EXT',
|
||||||
|
'EXTENSIONS': 'EXTS',
|
||||||
|
'EXTS': 'EXTS',
|
||||||
|
'FALL': 'FALL',
|
||||||
|
'FALLS': 'FLS',
|
||||||
|
'FLS': 'FLS',
|
||||||
|
'FERRY': 'FRY',
|
||||||
|
'FRRY': 'FRY',
|
||||||
|
'FRY': 'FRY',
|
||||||
|
'FIELD': 'FLD',
|
||||||
|
'FLD': 'FLD',
|
||||||
|
'FIELDS': 'FLDS',
|
||||||
|
'FLDS': 'FLDS',
|
||||||
|
'FLAT': 'FLT',
|
||||||
|
'FLT': 'FLT',
|
||||||
|
'FLATS': 'FLTS',
|
||||||
|
'FLTS': 'FLTS',
|
||||||
|
'FORD': 'FRD',
|
||||||
|
'FRD': 'FRD',
|
||||||
|
'FORDS': 'FRDS',
|
||||||
|
'FOREST': 'FRST',
|
||||||
|
'FORESTS': 'FRST',
|
||||||
|
'FRST': 'FRST',
|
||||||
|
'FORG': 'FRG',
|
||||||
|
'FORGE': 'FRG',
|
||||||
|
'FRG': 'FRG',
|
||||||
|
'FORGES': 'FRGS',
|
||||||
|
'FORK': 'FRK',
|
||||||
|
'FRK': 'FRK',
|
||||||
|
'FORKS': 'FRKS',
|
||||||
|
'FRKS': 'FRKS',
|
||||||
|
'FORT': 'FT',
|
||||||
|
'FRT': 'FT',
|
||||||
|
'FT': 'FT',
|
||||||
|
'FREEWAY': 'FWY',
|
||||||
|
'FREEWY': 'FWY',
|
||||||
|
'FRWAY': 'FWY',
|
||||||
|
'FRWY': 'FWY',
|
||||||
|
'FWY': 'FWY',
|
||||||
|
'GARDEN': 'GDN',
|
||||||
|
'GARDN': 'GDN',
|
||||||
|
'GDN': 'GDN',
|
||||||
|
'GRDEN': 'GDN',
|
||||||
|
'GRDN': 'GDN',
|
||||||
|
'GARDENS': 'GDNS',
|
||||||
|
'GDNS': 'GDNS',
|
||||||
|
'GRDNS': 'GDNS',
|
||||||
|
'GATEWAY': 'GTWY',
|
||||||
|
'GATEWY': 'GTWY',
|
||||||
|
'GATWAY': 'GTWY',
|
||||||
|
'GTWAY': 'GTWY',
|
||||||
|
'GTWY': 'GTWY',
|
||||||
|
'GLEN': 'GLN',
|
||||||
|
'GLN': 'GLN',
|
||||||
|
'GLENS': 'GLNS',
|
||||||
|
'GREEN': 'GRN',
|
||||||
|
'GRN': 'GRN',
|
||||||
|
'GREENS': 'GRNS',
|
||||||
|
'GROV': 'GRV',
|
||||||
|
'GROVE': 'GRV',
|
||||||
|
'GRV': 'GRV',
|
||||||
|
'GROVES': 'GRVS',
|
||||||
|
'HARB': 'HBR',
|
||||||
|
'HARBOR': 'HBR',
|
||||||
|
'HARBR': 'HBR',
|
||||||
|
'HBR': 'HBR',
|
||||||
|
'HRBOR': 'HBR',
|
||||||
|
'HARBORS': 'HBRS',
|
||||||
|
'HAVEN': 'HVN',
|
||||||
|
'HAVN': 'HVN',
|
||||||
|
'HVN': 'HVN',
|
||||||
|
'HEIGHT': 'HTS',
|
||||||
|
'HEIGHTS': 'HTS',
|
||||||
|
'HGTS': 'HTS',
|
||||||
|
'HT': 'HTS',
|
||||||
|
'HTS': 'HTS',
|
||||||
|
'HIGHWAY': 'HWY',
|
||||||
|
'HIGHWY': 'HWY',
|
||||||
|
'HIWAY': 'HWY',
|
||||||
|
'HIWY': 'HWY',
|
||||||
|
'HWAY': 'HWY',
|
||||||
|
'HWY': 'HWY',
|
||||||
|
'HILL': 'HL',
|
||||||
|
'HL': 'HL',
|
||||||
|
'HILLS': 'HLS',
|
||||||
|
'HLS': 'HLS',
|
||||||
|
'HLLW': 'HOLW',
|
||||||
|
'HOLLOW': 'HOLW',
|
||||||
|
'HOLLOWS': 'HOLW',
|
||||||
|
'HOLW': 'HOLW',
|
||||||
|
'HOLWS': 'HOLW',
|
||||||
|
'INLET': 'INLT',
|
||||||
|
'INLT': 'INLT',
|
||||||
|
'IS': 'IS',
|
||||||
|
'ISLAND': 'IS',
|
||||||
|
'ISLND': 'IS',
|
||||||
|
'ISLANDS': 'ISS',
|
||||||
|
'ISLNDS': 'ISS',
|
||||||
|
'ISS': 'ISS',
|
||||||
|
'ISLE': 'ISLE',
|
||||||
|
'ISLES': 'ISLE',
|
||||||
|
'JCT': 'JCT',
|
||||||
|
'JCTION': 'JCT',
|
||||||
|
'JCTN': 'JCT',
|
||||||
|
'JUNCTION': 'JCT',
|
||||||
|
'JUNCTN': 'JCT',
|
||||||
|
'JUNCTON': 'JCT',
|
||||||
|
'JCTNS': 'JCTS',
|
||||||
|
'JCTS': 'JCTS',
|
||||||
|
'JUNCTIONS': 'JCTS',
|
||||||
|
'KEY': 'KY',
|
||||||
|
'KY': 'KY',
|
||||||
|
'KEYS': 'KYS',
|
||||||
|
'KYS': 'KYS',
|
||||||
|
'KNL': 'KNL',
|
||||||
|
'KNOL': 'KNL',
|
||||||
|
'KNOLL': 'KNL',
|
||||||
|
'KNLS': 'KNLS',
|
||||||
|
'KNOLLS': 'KNLS',
|
||||||
|
'LAKE': 'LK',
|
||||||
|
'LK': 'LK',
|
||||||
|
'LAKES': 'LKS',
|
||||||
|
'LKS': 'LKS',
|
||||||
|
'LAND': 'LAND',
|
||||||
|
'LANDING': 'LNDG',
|
||||||
|
'LNDG': 'LNDG',
|
||||||
|
'LNDNG': 'LNDG',
|
||||||
|
'LA': 'LN',
|
||||||
|
'LANE': 'LN',
|
||||||
|
'LANES': 'LN',
|
||||||
|
'LN': 'LN',
|
||||||
|
'LGT': 'LGT',
|
||||||
|
'LIGHT': 'LGT',
|
||||||
|
'LIGHTS': 'LGTS',
|
||||||
|
'LF': 'LF',
|
||||||
|
'LOAF': 'LF',
|
||||||
|
'LCK': 'LCK',
|
||||||
|
'LOCK': 'LCK',
|
||||||
|
'LCKS': 'LCKS',
|
||||||
|
'LOCKS': 'LCKS',
|
||||||
|
'LDG': 'LDG',
|
||||||
|
'LDGE': 'LDG',
|
||||||
|
'LODG': 'LDG',
|
||||||
|
'LODGE': 'LDG',
|
||||||
|
'LOOP': 'LOOP',
|
||||||
|
'LOOPS': 'LOOP',
|
||||||
|
'MALL': 'MALL',
|
||||||
|
'MANOR': 'MNR',
|
||||||
|
'MNR': 'MNR',
|
||||||
|
'MANORS': 'MNRS',
|
||||||
|
'MNRS': 'MNRS',
|
||||||
|
'MDW': 'MDW',
|
||||||
|
'MEADOW': 'MDW',
|
||||||
|
'MDWS': 'MDWS',
|
||||||
|
'MEADOWS': 'MDWS',
|
||||||
|
'MEDOWS': 'MDWS',
|
||||||
|
'MEWS': 'MEWS',
|
||||||
|
'MILL': 'ML',
|
||||||
|
'ML': 'ML',
|
||||||
|
'MILLS': 'MLS',
|
||||||
|
'MLS': 'MLS',
|
||||||
|
'MISSION': 'MSN',
|
||||||
|
'MISSN': 'MSN',
|
||||||
|
'MSN': 'MSN',
|
||||||
|
'MSSN': 'MSN',
|
||||||
|
'MOTORWAY': 'MTWY',
|
||||||
|
'MNT': 'MT',
|
||||||
|
'MOUNT': 'MT',
|
||||||
|
'MT': 'MT',
|
||||||
|
'MNTAIN': 'MTN',
|
||||||
|
'MNTN': 'MTN',
|
||||||
|
'MOUNTAIN': 'MTN',
|
||||||
|
'MOUNTIN': 'MTN',
|
||||||
|
'MTIN': 'MTN',
|
||||||
|
'MTN': 'MTN',
|
||||||
|
'MNTNS': 'MTNS',
|
||||||
|
'MOUNTAINS': 'MTNS',
|
||||||
|
'NCK': 'NCK',
|
||||||
|
'NECK': 'NCK',
|
||||||
|
'ORCH': 'ORCH',
|
||||||
|
'ORCHARD': 'ORCH',
|
||||||
|
'ORCHRD': 'ORCH',
|
||||||
|
'OVAL': 'OVAL',
|
||||||
|
'OVL': 'OVAL',
|
||||||
|
'OVERPASS': 'OPAS',
|
||||||
|
'PARK': 'PARK',
|
||||||
|
'PK': 'PARK',
|
||||||
|
'PRK': 'PARK',
|
||||||
|
'PARKS': 'PARK',
|
||||||
|
'PARKWAY': 'PKWY',
|
||||||
|
'PARKWY': 'PKWY',
|
||||||
|
'PKWAY': 'PKWY',
|
||||||
|
'PKWY': 'PKWY',
|
||||||
|
'PKY': 'PKWY',
|
||||||
|
'PW': 'PKWY',
|
||||||
|
'PARKWAYS': 'PKWY',
|
||||||
|
'PKWYS': 'PKWY',
|
||||||
|
'PASS': 'PASS',
|
||||||
|
'PASSAGE': 'PSGE',
|
||||||
|
'PATH': 'PATH',
|
||||||
|
'PATHS': 'PATH',
|
||||||
|
'PIKE': 'PIKE',
|
||||||
|
'PIKES': 'PIKE',
|
||||||
|
'PINE': 'PNE',
|
||||||
|
'PINES': 'PNES',
|
||||||
|
'PNES': 'PNES',
|
||||||
|
'PL': 'PL',
|
||||||
|
'PLACE': 'PL',
|
||||||
|
'PLAIN': 'PLN',
|
||||||
|
'PLN': 'PLN',
|
||||||
|
'PLAINES': 'PLNS',
|
||||||
|
'PLAINS': 'PLNS',
|
||||||
|
'PLNS': 'PLNS',
|
||||||
|
'PLAZA': 'PLZ',
|
||||||
|
'PLZ': 'PLZ',
|
||||||
|
'PLZA': 'PLZ',
|
||||||
|
'POINT': 'PT',
|
||||||
|
'PT': 'PT',
|
||||||
|
'POINTS': 'PTS',
|
||||||
|
'PTS': 'PTS',
|
||||||
|
'PORT': 'PRT',
|
||||||
|
'PRT': 'PRT',
|
||||||
|
'PORTS': 'PRTS',
|
||||||
|
'PRTS': 'PRTS',
|
||||||
|
'PR': 'PR',
|
||||||
|
'PRAIRIE': 'PR',
|
||||||
|
'PRARIE': 'PR',
|
||||||
|
'PRR': 'PR',
|
||||||
|
'RAD': 'RADL',
|
||||||
|
'RADIAL': 'RADL',
|
||||||
|
'RADIEL': 'RADL',
|
||||||
|
'RADL': 'RADL',
|
||||||
|
'RAMP': 'RAMP',
|
||||||
|
'RANCH': 'RNCH',
|
||||||
|
'RANCHES': 'RNCH',
|
||||||
|
'RNCH': 'RNCH',
|
||||||
|
'RNCHS': 'RNCH',
|
||||||
|
'RAPID': 'RPD',
|
||||||
|
'RPD': 'RPD',
|
||||||
|
'RAPIDS': 'RPDS',
|
||||||
|
'RPDS': 'RPDS',
|
||||||
|
'REST': 'RST',
|
||||||
|
'RST': 'RST',
|
||||||
|
'RDG': 'RDG',
|
||||||
|
'RDGE': 'RDG',
|
||||||
|
'RIDGE': 'RDG',
|
||||||
|
'RDGS': 'RDGS',
|
||||||
|
'RIDGES': 'RDGS',
|
||||||
|
'RIV': 'RIV',
|
||||||
|
'RIVER': 'RIV',
|
||||||
|
'RIVR': 'RIV',
|
||||||
|
'RVR': 'RIV',
|
||||||
|
'RD': 'RD',
|
||||||
|
'ROAD': 'RD',
|
||||||
|
'RDS': 'RDS',
|
||||||
|
'ROADS': 'RDS',
|
||||||
|
'ROUTE': 'RTE',
|
||||||
|
'ROW': 'ROW',
|
||||||
|
'RUE': 'RUE',
|
||||||
|
'RUN': 'RUN',
|
||||||
|
'SHL': 'SHL',
|
||||||
|
'SHOAL': 'SHL',
|
||||||
|
'SHLS': 'SHLS',
|
||||||
|
'SHOALS': 'SHLS',
|
||||||
|
'SHOAR': 'SHR',
|
||||||
|
'SHORE': 'SHR',
|
||||||
|
'SHR': 'SHR',
|
||||||
|
'SHOARS': 'SHRS',
|
||||||
|
'SHORES': 'SHRS',
|
||||||
|
'SHRS': 'SHRS',
|
||||||
|
'SKYWAY': 'SKWY',
|
||||||
|
'SPG': 'SPG',
|
||||||
|
'SPNG': 'SPG',
|
||||||
|
'SPRING': 'SPG',
|
||||||
|
'SPRNG': 'SPG',
|
||||||
|
'SPGS': 'SPGS',
|
||||||
|
'SPNGS': 'SPGS',
|
||||||
|
'SPRINGS': 'SPGS',
|
||||||
|
'SPRNGS': 'SPGS',
|
||||||
|
'SPUR': 'SPUR',
|
||||||
|
'SPURS': 'SPUR',
|
||||||
|
'SQ': 'SQ',
|
||||||
|
'SQR': 'SQ',
|
||||||
|
'SQRE': 'SQ',
|
||||||
|
'SQU': 'SQ',
|
||||||
|
'SQUARE': 'SQ',
|
||||||
|
'SQRS': 'SQS',
|
||||||
|
'SQUARES': 'SQS',
|
||||||
|
'STA': 'STA',
|
||||||
|
'STATION': 'STA',
|
||||||
|
'STATN': 'STA',
|
||||||
|
'STN': 'STA',
|
||||||
|
'STRA': 'STRA',
|
||||||
|
'STRAV': 'STRA',
|
||||||
|
'STRAVE': 'STRA',
|
||||||
|
'STRAVEN': 'STRA',
|
||||||
|
'STRAVENUE': 'STRA',
|
||||||
|
'STRAVN': 'STRA',
|
||||||
|
'STRVN': 'STRA',
|
||||||
|
'STRVNUE': 'STRA',
|
||||||
|
'STREAM': 'STRM',
|
||||||
|
'STREME': 'STRM',
|
||||||
|
'STRM': 'STRM',
|
||||||
|
'ST': 'ST',
|
||||||
|
'STR': 'ST',
|
||||||
|
'STREET': 'ST',
|
||||||
|
'STRT': 'ST',
|
||||||
|
'STREETS': 'STS',
|
||||||
|
'SMT': 'SMT',
|
||||||
|
'SUMIT': 'SMT',
|
||||||
|
'SUMITT': 'SMT',
|
||||||
|
'SUMMIT': 'SMT',
|
||||||
|
'TER': 'TER',
|
||||||
|
'TERR': 'TER',
|
||||||
|
'TERRACE': 'TER',
|
||||||
|
'THROUGHWAY': 'TRWY',
|
||||||
|
'TRACE': 'TRCE',
|
||||||
|
'TRACES': 'TRCE',
|
||||||
|
'TRCE': 'TRCE',
|
||||||
|
'TRACK': 'TRAK',
|
||||||
|
'TRACKS': 'TRAK',
|
||||||
|
'TRAK': 'TRAK',
|
||||||
|
'TRK': 'TRAK',
|
||||||
|
'TRKS': 'TRAK',
|
||||||
|
'TRAFFICWAY': 'TRFY',
|
||||||
|
'TRFY': 'TRFY',
|
||||||
|
'TR': 'TRL',
|
||||||
|
'TRAIL': 'TRL',
|
||||||
|
'TRAILS': 'TRL',
|
||||||
|
'TRL': 'TRL',
|
||||||
|
'TRLS': 'TRL',
|
||||||
|
'TUNEL': 'TUNL',
|
||||||
|
'TUNL': 'TUNL',
|
||||||
|
'TUNLS': 'TUNL',
|
||||||
|
'TUNNEL': 'TUNL',
|
||||||
|
'TUNNELS': 'TUNL',
|
||||||
|
'TUNNL': 'TUNL',
|
||||||
|
'TPK': 'TPKE',
|
||||||
|
'TPKE': 'TPKE',
|
||||||
|
'TRNPK': 'TPKE',
|
||||||
|
'TRPK': 'TPKE',
|
||||||
|
'TURNPIKE': 'TPKE',
|
||||||
|
'TURNPK': 'TPKE',
|
||||||
|
'UNDERPASS': 'UPAS',
|
||||||
|
'UN': 'UN',
|
||||||
|
'UNION': 'UN',
|
||||||
|
'UNIONS': 'UNS',
|
||||||
|
'VALLEY': 'VLY',
|
||||||
|
'VALLY': 'VLY',
|
||||||
|
'VLLY': 'VLY',
|
||||||
|
'VLY': 'VLY',
|
||||||
|
'VALLEYS': 'VLYS',
|
||||||
|
'VLYS': 'VLYS',
|
||||||
|
'VDCT': 'VIA',
|
||||||
|
'VIA': 'VIA',
|
||||||
|
'VIADCT': 'VIA',
|
||||||
|
'VIADUCT': 'VIA',
|
||||||
|
'VIEW': 'VW',
|
||||||
|
'VW': 'VW',
|
||||||
|
'VIEWS': 'VWS',
|
||||||
|
'VWS': 'VWS',
|
||||||
|
'VILL': 'VLG',
|
||||||
|
'VILLAG': 'VLG',
|
||||||
|
'VILLAGE': 'VLG',
|
||||||
|
'VILLG': 'VLG',
|
||||||
|
'VILLIAGE': 'VLG',
|
||||||
|
'VLG': 'VLG',
|
||||||
|
'VILLAGES': 'VLGS',
|
||||||
|
'VLGS': 'VLGS',
|
||||||
|
'VILLE': 'VL',
|
||||||
|
'VL': 'VL',
|
||||||
|
'VIS': 'VIS',
|
||||||
|
'VIST': 'VIS',
|
||||||
|
'VISTA': 'VIS',
|
||||||
|
'VST': 'VIS',
|
||||||
|
'VSTA': 'VIS',
|
||||||
|
'WALK': 'WALK',
|
||||||
|
'WALKS': 'WALK',
|
||||||
|
'WALL': 'WALL',
|
||||||
|
'WAY': 'WAY',
|
||||||
|
'WY': 'WAY',
|
||||||
|
'WAYS': 'WAYS',
|
||||||
|
'WELL': 'WL',
|
||||||
|
'WELLS': 'WLS',
|
||||||
|
'WLS': 'WLS'
|
||||||
|
}
|
||||||
|
|
||||||
|
OCCUPANCY_TYPE_ABBREVIATIONS = {
|
||||||
|
'APARTMENT': 'APT',
|
||||||
|
'BUILDING': 'BLDG',
|
||||||
|
'BASEMENT': 'BSMT',
|
||||||
|
'DEPARTMENT': 'DEPT',
|
||||||
|
'FLOOR': 'FL',
|
||||||
|
'FRONT': 'FRNT',
|
||||||
|
'HANGER': 'HNGR',
|
||||||
|
'KEY': 'KEY',
|
||||||
|
'LOBBY': 'LBBY',
|
||||||
|
'LOT': 'LOT',
|
||||||
|
'LOWER': 'LOWR',
|
||||||
|
'OFFICE': 'OFC',
|
||||||
|
'PENTHOUSE': 'PH',
|
||||||
|
'PIER': 'PIER',
|
||||||
|
'REAR': 'REAR',
|
||||||
|
'ROOM': 'RM',
|
||||||
|
'SIDE': 'SIDE',
|
||||||
|
'SLIP': 'SLIP',
|
||||||
|
'SPACE': 'SPC',
|
||||||
|
'STOP': 'STOP',
|
||||||
|
'SUITE': 'STE',
|
||||||
|
'TRAILER': 'TRLR',
|
||||||
|
'UNIT': 'UNIT',
|
||||||
|
'UPPER': 'UPPER',
|
||||||
|
'#': '#'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Miscellaneous one-off textual substitutions applied by oddHandling().
ODD_REPLACEMENTS = {
    "UNITED STATES HIGHWAY": "US HIGHWAY"
}
|
||||||
|
|
||||||
|
# Highway-name substitutions.
# NOTE(review): duplicates the ODD_REPLACEMENTS entry above and no consumer
# is visible in this chunk — confirm whether this table is still used.
HIGHWAY_REPLACEMENTS = {
    "UNITED STATES HIGHWAY": "US HIGHWAY",

}
|
||||||
|
|
||||||
|
# Replace directionals with abbreviated versions.
def abbrevDirectionals(string):
    """Upper-case *string* and substring-replace each directional word
    with its abbreviation from DIRECTIONAL_REPLACEMENTS.

    NOTE(review): this unpacks each element of DIRECTIONAL_REPLACEMENTS into
    (find, replace). If that constant is a dict like the other *_REPLACEMENTS
    tables in this module, iterating it yields only the KEYS and this loop
    raises ValueError (or silently unpacks 2-character keys) — it would need
    `.items()` like the LONGHAND_STREET_TYPES loop in getZIP4. Confirm the
    constant's type before relying on this function.
    """
    string = string.upper()
    for (find, replace) in DIRECTIONAL_REPLACEMENTS:
        string = string.replace(find, replace)
    return string
|
||||||
|
|
||||||
|
def abbrevStreetTypes(string):
    """Upper-case *string* and replace every spelled-out street type with
    its USPS abbreviation (e.g. 'VIADUCT' -> 'VIA').

    BUG FIX: the original iterated `STREET_TYPE_ABBREVIATIONS` directly,
    which yields only the dict KEYS; unpacking a key such as 'VLY' into
    (find, replace) raises ValueError, and 2-character keys like 'VW' were
    silently unpacked into single letters. Iterate `.items()` instead, as
    the LONGHAND_STREET_TYPES loop elsewhere in this project already does.
    """
    string = string.upper()
    for (find, replace) in STREET_TYPE_ABBREVIATIONS.items():
        string = string.replace(find, replace)
    return string
|
||||||
|
|
||||||
|
# Odd/unusual string replacement
def oddHandling(string):
    """Upper-case *string* and apply the miscellaneous rewrites in
    ODD_REPLACEMENTS (e.g. 'UNITED STATES HIGHWAY' -> 'US HIGHWAY').

    BUG FIX: the original iterated the ODD_REPLACEMENTS dict directly,
    yielding only its keys; unpacking the key string into (find, replace)
    raised ValueError on the first iteration. Use `.items()` to get the
    (find, replace) pairs.
    """
    string = string.upper()
    for (find, replace) in ODD_REPLACEMENTS.items():
        string = string.replace(find, replace)
    return string
|
||||||
|
|
||||||
|
def highwayStandardize(street):
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(number, streetPreMod, streetPreDir, streetPreType, streetPreSep, streetName, streetPostType, streetPostDir, streetPostMod):
|
||||||
429
src/zipfunctions.py
Normal file
429
src/zipfunctions.py
Normal file
@ -0,0 +1,429 @@
|
|||||||
|
# Created on : Aug 29, 2024, 12:57:40 AM
|
||||||
|
# Author : Skylar Ittner
|
||||||
|
|
||||||
|
import re, sys
|
||||||
|
#import pandas as pd
|
||||||
|
from uszipcode import SearchEngine, ZipcodeTypeEnum
|
||||||
|
from shapely.geometry import Point
|
||||||
|
from shapely.geometry.polygon import Polygon
|
||||||
|
import sqlite3
|
||||||
|
from src.constants import LONGHAND_STREET_TYPES
|
||||||
|
import src.config
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
from urllib.parse import quote
|
||||||
|
import json
|
||||||
|
|
||||||
|
#zipcodes = pd.read_csv("zip_code_database.csv", keep_default_na=False, dtype="str")

# Two uszipcode search engines: the "simple" database is smaller and faster
# (used for quick city+state lookups); the "comprehensive" one carries the
# coordinate/polygon data needed for geographic queries.
fastsearch = SearchEngine(db_file_path="zipcode_db_simple.sqlite", simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.simple)
search = SearchEngine(db_file_path="zipcode_db.sqlite", simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive)

# Read-only, immutable, cross-thread connection to the ZIP+4 database
# (built by zipdbgen.py).
# NOTE(review): hard-coded absolute home-directory path — should come from
# config so the module works on other machines.
# NOTE(review): the mmap_size pragma below sets 4294967296 (4 GiB) although
# its inline SQL comment says "1 GiB"; the value, not the comment, wins.
zip4 = sqlite3.connect("file:/home/skylar/AddressDatabase/zip4.sqlite?mode=ro&immutable=1", uri=True, check_same_thread=False)
zip4.executescript("""
PRAGMA query_only=ON; -- belt-and-suspenders (can’t write)
PRAGMA temp_store=MEMORY; -- sorts/temps in RAM
PRAGMA cache_size=-800000; -- ~800 MB page cache (negative = KB units)
PRAGMA mmap_size=4294967296; -- 1 GiB memory-mapped I/O (bump if you have RAM)
PRAGMA automatic_index=ON; -- leave enabled (default), can help odd joins
PRAGMA threads=4; -- allow parallel ops for sorts/expr eval (if available)
""")
zip4.row_factory = sqlite3.Row  # rows addressable by column name
cur = zip4.cursor()

# SQL query cache when finding ZIP+4, prevents running duplicate queries for nearby addresses on the same road
querycache = {}
querycachelimit = 3000  # oldest (first-inserted) entry is evicted past this size
|
||||||
|
|
||||||
|
def checkZIPCode(lat, lon, state, zip):
    """Validate a candidate ZIP Code against the expected state, falling
    back to a lat/lon lookup when it doesn't check out.

    Parameters: lat/lon of the address, the expected two-letter state, and
    a ZIP that may arrive as a str, int, or float (possibly NaN from a
    dataframe, possibly missing leading zeros).

    Returns a dict {"zip", "city", "state"} (empty strings when nothing
    could be determined), or whatever getCityStateForZIP returns for the
    geo-located ZIP (which may be False for unknown ZIPs, matching the
    original behavior).
    """
    # Normalize whatever we were handed into a 5-character ZIP string.
    zipok = False
    if not zip or zip != zip:  # falsy/None, or NaN (NaN != NaN)
        zipok = False
    elif isinstance(zip, str):
        # Restore leading zeros lost by spreadsheet/numeric handling.
        zip = zip.rjust(5, '0')
        # BUG FIX: the original padded short strings but left zipok False,
        # discarding the repaired ZIP and always falling back to geosearch.
        zipok = len(zip) == 5
    elif isinstance(zip, (int, float)):
        # BUG FIX: the original left large ints/floats unconverted, so the
        # later len() check in getCityStateForZIP raised TypeError.
        zip = str(int(zip)).rjust(5, '0')
        zipok = len(zip) == 5
    else:
        zip = str(int(zip)).rjust(5, '0')
        zipok = True

    zipInfo = False
    if zipok:
        zipInfo = getCityStateForZIP(zip)
        if not zipInfo:
            zipok = False
        # BUG FIX: getCityStateForZIP returns a dict, so the original
        # attribute access `zipInfo.state` raised AttributeError.
        elif zipInfo["state"] != state:
            zipok = False

    if zipok:
        return zipInfo

    # ZIP missing or inconsistent with the state: locate one geographically.
    result = search.by_coordinates(lat=lat, lng=lon, returns=1)
    if len(result) >= 1:
        return getCityStateForZIP(result[0].zipcode)
    return {"zip": "", "city": "", "state": ""}
|
||||||
|
|
||||||
|
def getCityStateForZIP(zipcode):
    """Return the USPS-preferred last line for a 5-digit ZIP as a dict
    {"zip", "city", "state"}, or False when the ZIP is absent/invalid.

    SECURITY FIX: the original interpolated *zipcode* directly into the SQL
    string; use a parameterized query instead (also required now that
    callers may pass arbitrary user-supplied strings).
    """
    # Reject anything that is not exactly a 5-character string (the
    # isinstance guard also prevents len() blowing up on stray ints).
    if not zipcode or not isinstance(zipcode, str) or len(zipcode) != 5:
        return False
    cur.execute(
        "SELECT ZipCode,City,State FROM ZIPCodes "
        "WHERE ZipCode=? AND CityStateKey=PreferredLastLineKey LIMIT 1",
        (zipcode,),
    )
    row = cur.fetchone()
    if row is None:
        return False
    return {
        "zip": row["ZipCode"],
        "city": row["City"],
        "state": row["State"]
    }
|
||||||
|
|
||||||
|
def getZIPFromGeo(lat, lon, prefix=False, state=False):
    """Resolve a single ZIP Code from coordinates.

    When a ZIP prefix and/or state is supplied, a radius-limited filtered
    query is used (much faster); otherwise a plain nearest-ZIP lookup.
    Returns the best zipcode string, or None when nothing is nearby.
    """
    kwargs = {"lat": lat, "lng": lon, "returns": 1}
    if prefix or state:
        kwargs["radius"] = 20
        if prefix:
            kwargs["prefix"] = str(prefix)
        if state:
            kwargs["state"] = state
        matches = search.query(**kwargs)
    else:
        matches = search.by_coordinates(**kwargs)

    if matches:
        # returns=1 caps the list at one entry; take the closest match.
        return matches[0].zipcode
    return None
|
||||||
|
|
||||||
|
def subaddrMatchRows(rows, unit):
    """Filter ZIP+4 rows to those whose secondary-address (unit) range
    contains *unit*.

    Purely numeric units are zero-padded to 8 characters to match the
    fixed-width encoding of AddressSecLowNumber/AddressSecHighNumber, so a
    plain string comparison orders correctly. Rows with no unit range at
    all are collected separately and returned only when no ranged row
    matched (the address may be more specific than the ZIP+4 data).
    """
    if re.match(r"^[0-9]+$", unit):
        unit = unit.zfill(8)

    ranged_matches = []
    rangeless = []
    for candidate in rows:
        low = candidate["AddressSecLowNumber"]
        if low == "":
            rangeless.append(candidate)
        elif low <= unit <= candidate["AddressSecHighNumber"]:
            ranged_matches.append(candidate)

    return ranged_matches if ranged_matches else rangeless
|
||||||
|
|
||||||
|
def getZIPsForCityState(city, state):
    """Find the ZIPCodes rows (and their 5-digit ZIPs) whose preferred city
    name, alias, or alias abbreviation matches *city* in *state*.

    Returns (resultlist, ziplist): the matching sqlite3.Row objects and
    their ZipCode values, keeping only rows where the matched city name is
    the USPS-preferred last line for that ZIP.

    SECURITY FIX: the original built the SQL by concatenation and only
    doubled apostrophes in *city* (leaving *state* unescaped entirely);
    use bound parameters instead — equivalent matching, no injection.
    """
    city = city.upper().strip()
    cur.execute(
        "SELECT * FROM ZIPCodes WHERE State = ? AND "
        "(CityAliasName = ? OR City = ? OR CityAliasAbbreviation = ?)",
        (state, city, city, city),
    )
    citylist = cur.fetchall()

    resultlist = []
    ziplist = []
    # Remove entries that aren't the preferred city name for the ZIP Code
    for cityrow in citylist:
        if cityrow["CityStateKey"] == cityrow["PreferredLastLineKey"]:
            resultlist.append(cityrow)
            ziplist.append(cityrow["ZipCode"])
    return resultlist, ziplist
|
||||||
|
|
||||||
|
def getZIPsForCounty(county, state):
    """Return the de-duplicated list of 5-digit ZIPs located (wholly or
    partly) in *county*, *state*.

    Combines the primary ZIPCodes table with ZIPCodesMultiCounty, which
    lists ZIPs that are mainly in another county but overlap this one.

    SECURITY FIX: parameterized queries replace the original concatenated
    SQL (which only escaped apostrophes in *county* and left *state* raw).
    A seen-set replaces the original O(n^2) `not in list` de-duplication
    while preserving first-seen order.
    """
    county = county.upper().strip()

    cur.execute(
        "SELECT ZipCode FROM ZIPCodes WHERE State = ? AND County = ?",
        (state, county),
    )
    countylist = cur.fetchall()

    # Also get records where the ZIP isn't mainly in the county but some of it is
    cur.execute(
        "SELECT ZipCode FROM ZIPCodesMultiCounty WHERE State = ? AND County = ?",
        (state, county),
    )
    multicountylist = cur.fetchall()

    ziplist = []
    seen = set()
    for row in countylist + multicountylist:
        zipcode = row["ZipCode"]
        if zipcode not in seen:
            seen.add(zipcode)
            ziplist.append(zipcode)
    return ziplist
|
||||||
|
|
||||||
|
def addressRangeContainsNumber(low, high, evenodd, number):
    """Decide whether *number* falls inside the ZIP+4 primary address range
    [low, high] with the given parity code.

    *evenodd* is the range's side-of-street code: "E" (even), "O" (odd) or
    "B" (both). Numeric house numbers are zero-padded to 10 characters so
    lexicographic comparison matches the table's fixed-width encoding;
    non-numeric ("hyphenated"/alphanumeric) numbers match only by direct
    equality with an endpoint or by raw string ordering.
    """
    if re.match(r"^[0-9]+$", number):
        parity = "E" if int(number) % 2 == 0 else "O"
        number = number.zfill(10)
    else:
        parity = "B"
        # Non-numeric: an exact endpoint hit is always a match.
        if number in (low, high):
            return True

    # This logic is bad and should be rewritten
    return evenodd in ("B", parity) and low <= number <= high
|
||||||
|
|
||||||
|
# Check if the address number range is actually just a single address that matches the number provided.
def addressRangeIsExactNumber(low, high, number):
    """True when [low, high] collapses to a single address equal to
    *number* — compared both as-is and zero-padded to the table's
    fixed 10-character width."""
    if low != high:
        return False
    return low == number or low == number.zfill(10)
|
||||||
|
|
||||||
|
def getZIP4(number, street, unit, state, lat, lon, city=False, zip=False, county=False):
    """Look up the ZIP+4 record for one address.

    Builds a cascade of SQL queries against the ZIP4 table — from most
    specific (exact StreetFull + known 5-digit ZIP) to most desperate
    (basename LIKE match) — and stops at the first query that yields a
    single unambiguous row. Falls back to the US Census geocoder (when
    enabled in config) and finally to a plain lat/lon ZIP lookup.

    Returns a 5-tuple:
        (zip5, plus4, standardized street, unit designator abbr, unit)
    with "" for any field that could not be determined.

    NOTE(review): every query here is built by string concatenation with
    no escaping; an apostrophe in street/city breaks the SQL and this is
    injectable. Should be converted to parameterized queries like the
    city/county helpers.
    """
    number = number.strip()
    street = street.strip()
    if not unit:
        unit = ""

    # Get list of 5-digit ZIP Codes matching the city and state
    citystateresults = False
    zipfilter = False
    if city:
        citystateresults, zipfilter = getZIPsForCityState(city, state)
        if len(zipfilter) == 0:
            zipfilter = False
    elif county:
        zipfilter = getZIPsForCounty(county, state)
        if len(zipfilter) == 0:
            zipfilter = False

    queries = []
    basenamequeries = [] # Queries that only match on street basename, try after "main" queries don't return a match

    # Get street base name for broader matching in case suffix or directional differs
    typelessStreet = street
    for (short, long) in LONGHAND_STREET_TYPES.items():
        typelessStreet = re.sub(r"\s" + re.escape(short) + r"\b", "", typelessStreet)
    streetBasename = re.sub("^[NSEW]{1,2} ", "", typelessStreet)
    streetBasename = re.sub(" [NSEW]{1,2}$", "", streetBasename)

    #print(street, typelessStreet, streetBasename)

    # Build a list of queries to run, starting with the most specific and getting more desperate until a match is found
    if zip:
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
    if zipfilter:
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")

    if not unit and re.match(".* ([0-9]{1,5}|[A-Z]{1})$", street):
        # Maybe the street has the apartment number in it for some reason:
        # strip a trailing short number/letter and retry with it as the unit.
        newStreet = re.sub(" ([0-9]{1,5}|[A-Z]{1})$", "", street)
        newUnit = street[len(newStreet):].strip()
        typelessStreet = newStreet
        for (short, long) in LONGHAND_STREET_TYPES.items():
            typelessStreet = re.sub(r"\s" + re.escape(short) + r"\b", "", typelessStreet)
        newstreetBasename = re.sub("^[NSEW]{1,2} ", "", typelessStreet)
        newstreetBasename = re.sub(" [NSEW]{1,2}$", "", newstreetBasename)
        if zip:
            queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + newStreet + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
            basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + newstreetBasename + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
        if zipfilter:
            queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + newStreet + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
            basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + newstreetBasename + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")

    if not zip and not zipfilter:
        # Who needs ZIP Codes and city names anyways
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'")
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + street + "' AND State = '" + state + "'")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'")
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull LIKE '" + street + "%' AND State = '" + state + "'")
        #queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName LIKE '" + streetBasename + "%' AND State = '" + state + "'")

    resultrows = []

    # Suggested 5-digit ZIP / street to return when the +4 stays ambiguous.
    suggestZip = ""
    suggestStreet = ""

    # Basename-only queries run after all of the specific ones.
    queries = queries + basenamequeries

    for query in queries:
        #print(query)
        if query in querycache:
            rows = querycache[query]
            #print("CACHED: " + query)
        else:
            cur.execute(query)
            #print("NOCACHE: " +query)
            rows = cur.fetchall()

            # Add to query cache (FIFO eviction: drop the oldest insertion).
            querycache[query] = rows
            if len(querycache) > querycachelimit:
                querycache.pop(next(iter(querycache)))

        unitfilterrows = rows
        if unit:
            # Filter to rows that match the unit number
            unitfilterrows = subaddrMatchRows(rows, unit)

        # Try matching range against unit-filtered rows, if that doesn't work, try the non-filtered ones (address might be more specific than ZIP4 file)
        for row in unitfilterrows:
            if addressRangeContainsNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], row["AddressPrimaryEvenOdd"], number):
                resultrows.append(row)
        if len(resultrows) == 0 and len(unitfilterrows) < len(rows):
            for row in rows:
                if addressRangeContainsNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], row["AddressPrimaryEvenOdd"], number):
                    resultrows.append(row)

        if len(resultrows) == 1:
            # One match found, it's probably the right one!
            return resultrows[0]["ZipCode"], resultrows[0]["Plus4Low"], resultrows[0]["StreetFull"], resultrows[0]["AddressSecAbbr"], unit

        if len(resultrows) > 1:
            # First check if our source address has a unit, and if not, remove all match rows that DO have a unit.
            if not unit:
                base_rows = [
                    r for r in resultrows
                    if not (
                        r["AddressSecAbbr"] or
                        r["AddressSecLowNumber"] or
                        r["AddressSecHighNumber"]
                    )
                ]

                # If we found at least one base-address row, narrow resultrows to those
                if base_rows:
                    # Narrow further by looking for exact number matches (low and high are the same and what we're looking for)
                    exact_rows = []
                    for row in base_rows:
                        if addressRangeIsExactNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], number):
                            exact_rows.append(row)

                    if len(exact_rows) > 0:
                        resultrows = exact_rows
                    else:
                        resultrows = base_rows

                    # If that left us with exactly one, we can return it immediately
                    if len(resultrows) == 1:
                        row = resultrows[0]
                        return (
                            row["ZipCode"],
                            row["Plus4Low"],
                            row["StreetFull"],
                            "", # no unit designator when no unit was given
                            unit, # still the original (empty) unit
                        )
            suggestZip = resultrows[0]["ZipCode"]
            suggestStreet = resultrows[0]["StreetFull"]
            for row in resultrows:
                # Check if the 5-digit ZIP and/or street are the same for all results, clear it if not
                if suggestZip != row["ZipCode"]:
                    suggestZip = ""
                if suggestStreet != row["StreetFull"]:
                    suggestStreet = ""
                # Return an address-specific row if it exists
                if row["AddressPrimaryLowNumber"] == number and row["AddressPrimaryHighNumber"] == number:
                    return row["ZipCode"], row["Plus4Low"], row["StreetFull"], row["AddressSecAbbr"], unit
            #print("Multiple possible ZIP+4 matches for", number, street, "#"+unit, city, state, zip)
            #for row in resultrows:
            #    print(row["ZipCode"],row["AddressPrimaryLowNumber"],row["AddressPrimaryHighNumber"], row["StreetFull"], row["AddressPrimaryEvenOdd"], row["Plus4Low"], row["AddressSecAbbr"])

    # No match found
    cfg = src.config.get_config()
    if cfg.useCensusToFillEmptyZIPs and number != "" and street != "" and city != "" and city != False and state != "":
        # Query the Census Geocoder, because this address probably exists
        print("US Census Geo:" + number + " " + street + ", " + city + " " + state + "                          ", end="\r", flush=True)
        try:
            result = urllib.request.urlopen("https://geocoding.geo.census.gov/geocoder/locations/address?street="+quote(number + " " + street)+"&city="+quote(city)+"&state="+state+"&zip=&benchmark=4&format=json").read()
            jsonresult = json.loads(result)
            if len(jsonresult["result"]["addressMatches"]) == 1:
                comps = jsonresult["result"]["addressMatches"][0]["addressComponents"]
                streetparts = [comps["preDirection"], comps["preType"], comps["streetName"], comps["suffixType"], comps["suffixDirection"]]
                street = " ".join(x for x in streetparts if x)
                return jsonresult["result"]["addressMatches"][0]["addressComponents"]["zip"], "", street, "", unit
        except:
            # NOTE(review): bare except deliberately treats any geocoder/
            # network failure as "no answer" and falls through to geo lookup.
            pass

    # Last resort: suggest a plain 5-digit ZIP from coordinates.
    if suggestZip == "":
        suggestZip = getZIP(lat, lon, False, state, city)
    if suggestStreet == "":
        suggestStreet = street
    return suggestZip, "", suggestStreet, "", unit
|
||||||
|
|
||||||
|
|
||||||
|
def getZIP(lat, lon, prefix=False, state=False, city=False):
    """Find the most plausible 5-digit ZIP Code for a coordinate, using
    city/state and a ZIP prefix as hints when available.

    Strategy: (1) if city+state identify exactly one standard ZIP, use it;
    (2) query ZIPs near the coordinate (filtered by prefix/state when
    given); (3) on multiple candidates, intersect with the city/state set,
    then test which candidate's polygon actually contains the point.
    Returns the zipcode string or None.
    """
    # BUG FIX: the original tested `city == ""`, so the default city=False
    # (and None) fell through to city.upper() and raised AttributeError.
    # Treat any falsy city/state as "not supplied".
    if not city:
        city = False
    else:
        city = city.upper()
    if not state:
        state = False
    lat = float(lat)
    lon = float(lon)

    citystateresult = False
    if city and state: # Check if city and state combo only has one standard ZIP Code
        try:
            citystateresult = fastsearch.by_city_and_state(city, state, returns=30, zipcode_type=ZipcodeTypeEnum.Standard) # Use simple database because it's like 2x faster
            if len(citystateresult) == 1:
                #print("Exact city match found: "+city+ " "+state+" "+citystateresult[0].zipcode)
                return citystateresult[0].zipcode
        except ValueError:
            # Sometimes it objects to a city name and says it isn't valid
            pass

    if prefix and state: # Get ZIPs by lat/lon that start with prefix and are in state for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, prefix=str(prefix), state=state, zipcode_type=ZipcodeTypeEnum.Standard)
    elif state: # Get ZIPs filtered by state for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, state=state, zipcode_type=ZipcodeTypeEnum.Standard)
    elif prefix: # Get ZIPs by lat/lon that start with prefix for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, prefix=str(prefix), zipcode_type=ZipcodeTypeEnum.Standard)
    else: # Get ZIPs by lat/lon
        result = search.by_coordinates(lat = lat, lng = lon, returns = 20, zipcode_type=ZipcodeTypeEnum.Standard)

    if len(result) == 1:
        return result[0].zipcode
    elif len(result) > 1:
        matchzips = []
        if citystateresult:
            # Find zip codes that both queries have in common, maybe there's only one that overlaps with both!
            for val in citystateresult:
                for res in result:
                    if res.zipcode == val.zipcode:
                        matchzips.append(res)
            if len(matchzips) == 1:
                return matchzips[0].zipcode
        else:
            matchzips = result

        # Ambiguous: test which candidate's boundary polygon contains the
        # point. NOTE(review): if the city/state intersection above came
        # back empty, matchzips is [] here and we return None without
        # falling back to the raw coordinate results — possibly too strict.
        addrpoint = Point(lon, lat)
        for candidate in matchzips:
            zippolys = candidate.polygon
            if zippolys is None:
                continue
            # A single-ring polygon (2-D coordinate list) is wrapped so the
            # loop below can treat it like a multipolygon.
            if dimensionality(zippolys) == 2:
                zippolys = [zippolys]
            for poly in zippolys:
                zipborder = Polygon(poly)
                if zipborder.contains(addrpoint):
                    return candidate.zipcode
        return None
    else:
        return None
|
||||||
|
|
||||||
|
def dimensionality(matrix):
    """Return the nesting depth of a list-of-lists, probing only the first
    element at each level (0 for a non-list, 1 for a flat list, 2 for a
    list of lists, ...).

    BUG FIX: an empty list at any level previously raised IndexError on
    `matrix[0]`; it now counts that level and stops. Also dropped the
    redundant `matrix is not None` test (isinstance already excludes None).
    """
    dims = []
    while isinstance(matrix, list):
        dims.append(len(matrix))
        if not matrix:  # empty: nothing to descend into
            break
        matrix = matrix[0]
    return len(dims)
|
||||||
42736
zip_code_database.csv
Normal file
42736
zip_code_database.csv
Normal file
File diff suppressed because it is too large
Load Diff
272
zipdbgen.py
Executable file
272
zipdbgen.py
Executable file
@ -0,0 +1,272 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
# Generate a ZIP+4 database from the data at https://www.zip-codes.com/zip-plus-4-database.asp
|
||||||
|
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
import sqlite3, zipfile, re
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def process(infile, outfile):
    """Import the zip-codes.com data archive *infile* into the SQLite
    database *outfile*.

    The archive may contain per-state ZIP+4 zips (ZIP4-XX.zip), the
    5-digit standard CSV, and/or multi-county CSVs; each kind found is
    loaded into its own table via pandas in chunks of 5000 rows.
    """
    print("Reading " + infile)
    zf = zipfile.ZipFile(infile, mode="r")
    zipFiles = zf.namelist()
    ziplist = []
    zip5list = []
    zipcountylist = [] # List of ZIPs in multiple counties
    # Sort the archive members into the three recognized inputs.
    for fname in zipFiles:
        if re.match("ZIP4-[A-Z]{2}.zip", fname):
            ziplist.append(fname)
        elif fname == "zip-codes-database-STANDARD.csv":
            zip5list.append(fname)
        elif "MULTI-COUNTY" in fname and fname.endswith(".csv"):
            zipcountylist.append(fname)

    filesprocessed = 0
    chunksprocessed = 0
    chunksize = 5000  # rows per pandas chunk / to_sql batch

    if len(ziplist) > 0:
        print("Creating ZIP+4 database")
        connection = sqlite3.connect(outfile)
        connection.executescript("PRAGMA foreign_keys=OFF;")
        c = connection.cursor()
        # Bulk-load pragmas: trade crash-safety for import speed.
        c.execute("PRAGMA journal_mode=OFF;") # or MEMORY; fastest is OFF (risk if crash)
        c.execute("PRAGMA synchronous=OFF;") # biggest win: no fsync on each commit
        c.execute("PRAGMA temp_store=MEMORY;") # keep temp B-trees in RAM
        c.execute("PRAGMA cache_size=-1600000;") # ~1600MB page cache (negative = KB)
        c.execute("PRAGMA locking_mode=EXCLUSIVE;") # avoid lock thrash
        c.execute("PRAGMA mmap_size=1073741824;") # 1GB mmap; helps reads, slight write help
        c.execute("PRAGMA page_size=65536;")
        createZIP4DB(c)

        # Combine the four street-name component columns into one
        # space-joined StreetFull value, skipping empty components.
        def mergeStreet(row):
            return ' '.join(filter(None, [row["StPreDirAbbr"], row["StName"], row["StSuffixAbbr"], row["StPostDirAbbr"]]))

        for file in ziplist:
            # Each ZIP4-XX.zip is a nested archive containing one CSV.
            with zf.open(file, mode="r", force_zip64=True) as innerfile:
                with zipfile.ZipFile(innerfile, mode="r") as innerzip:
                    with innerzip.open(innerzip.namelist()[0], mode="r") as csvfile:
                        print("\nImporting " + file + " ..." + "                 ", end="\r", flush=True)
                        for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                            chunk["StreetFull"] = chunk.apply(mergeStreet, axis=1)
                            chunk.to_sql("ZIP4", connection, if_exists='append', index=False, method='multi')
                            chunksprocessed = chunksprocessed + 1
                            print("Importing " + file + " ... " + str(chunksprocessed * chunksize) +"          ", end="\r", flush=True)
            #print("\nVacuuming database...")
            #connection.executescript("VACUUM")
            filesprocessed = filesprocessed + 1
        # NOTE(review): zf is closed here; an archive containing BOTH ZIP+4
        # zips and the CSVs below would fail — in practice each input
        # archive seems to hold only one kind. Confirm before reusing.
        zf.close()

    if len(zip5list) > 0:
        print("Creating 5-digit ZIP database")
        connection = sqlite3.connect(outfile)
        c = connection.cursor()
        createZIP5DB(c)
        filesprocessed = 1
        with zf.open(zip5list[0], mode="r", force_zip64=True) as csvfile:
            print("\nImporting " + zip5list[0] + " ..." + "                 ", end="\r", flush=True)
            for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                chunk.to_sql("ZIPCodes", connection, if_exists='append', index=False)
                chunksprocessed = chunksprocessed + 1
                print("Importing " + zip5list[0] + " ... " + str(chunksprocessed * chunksize) +"          ", end="\r", flush=True)

    if len(zipcountylist) > 0:
        print("Creating Multi-county ZIP database")
        connection = sqlite3.connect(outfile)
        c = connection.cursor()
        createZIPMultiCountyDB(c)
        filesprocessed = 1
        with zf.open(zipcountylist[0], mode="r", force_zip64=True) as csvfile:
            print("\nImporting " + zipcountylist[0] + " ..." + "                 ", end="\r", flush=True)
            for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                chunk.to_sql("ZIPCodesMultiCounty", connection, if_exists='append', index=False)
                chunksprocessed = chunksprocessed + 1
                print("Importing " + zipcountylist[0] + " ... " + str(chunksprocessed * chunksize) +"          ", end="\r", flush=True)

    print("\nFiles processed: " + str(filesprocessed))
    # NOTE(review): this over-reports — the final partial chunk counts as a
    # full chunksize; and if the archive matched nothing, `connection` below
    # is unbound (NameError).
    print("Records processed: " + str(chunksprocessed * chunksize))
    print("Done! Saved to " + outfile)
    print("\nOne last thing: optimizing output database (this might take a few minutes)...")
    connection.executescript("VACUUM; ANALYZE; PRAGMA optimize;")
|
||||||
|
|
||||||
|
|
||||||
|
def createZIP5DB(c):
|
||||||
|
c.execute("DROP TABLE IF EXISTS ZIPCodes")
|
||||||
|
c.execute('''CREATE TABLE ZIPCodes (
|
||||||
|
ZipCode char(5) NOT NULL,
|
||||||
|
City varchar(35) NULL,
|
||||||
|
State char(2),
|
||||||
|
County varchar(45) NULL,
|
||||||
|
AreaCode varchar(55) NULL,
|
||||||
|
CityType char(1) NULL,
|
||||||
|
CityAliasAbbreviation varchar(13) NULL,
|
||||||
|
CityAliasName varchar(35) NULL,
|
||||||
|
Latitude decimal(12, 6),
|
||||||
|
Longitude decimal(12, 6),
|
||||||
|
TimeZone char(2) NULL,
|
||||||
|
Elevation int,
|
||||||
|
CountyFIPS char(5) NULL,
|
||||||
|
DayLightSaving char(1) NULL,
|
||||||
|
PreferredLastLineKey varchar(10) NULL,
|
||||||
|
ClassificationCode char(1) NULL,
|
||||||
|
MultiCounty char(1) NULL,
|
||||||
|
StateFIPS char(2) NULL,
|
||||||
|
CityStateKey char(6) NULL,
|
||||||
|
CityAliasCode varchar(5) NULL,
|
||||||
|
PrimaryRecord char(1),
|
||||||
|
CityMixedCase varchar(35) NULL,
|
||||||
|
CityAliasMixedCase varchar(35) NULL,
|
||||||
|
StateANSI varchar(2) NULL,
|
||||||
|
CountyANSI varchar(3) NULL,
|
||||||
|
FacilityCode varchar(1) NULL,
|
||||||
|
CityDeliveryIndicator varchar(1) NULL,
|
||||||
|
CarrierRouteRateSortation varchar(1) NULL,
|
||||||
|
FinanceNumber varchar(6) NULL,
|
||||||
|
UniqueZIPName varchar(1) NULL,
|
||||||
|
CountyMixedCase varchar(45) NULL
|
||||||
|
);''')
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_ZipCode ON ZIPCodes (ZipCode)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_State ON ZIPCodes (State)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_County ON ZIPCodes (County)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_AreaCode ON ZIPCodes (AreaCode)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_City ON ZIPCodes (City)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_Latitude ON ZIPCodes (Latitude)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_Longitude ON ZIPCodes (Longitude)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_CityAliasName ON ZIPCodes (CityAliasName)")
|
||||||
|
c.execute("CREATE INDEX Index_ZIPCodes_CityStateKey ON ZIPCodes (CityStateKey)")
|
||||||
|
|
||||||
|
c.execute("DROP TABLE IF EXISTS States")
|
||||||
|
c.execute("CREATE TABLE States (code TEXT, name TEXT)")
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AE", "Armed Forces Europe, the Middle East, and Canada")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AP", "Armed Forces Pacific")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AA", "Armed Forces Americas")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AL", "Alabama")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AK", "Alaska")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AS", "American Samoa")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AZ", "Arizona")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("AR", "Arkansas")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("CA", "California")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("CO", "Colorado")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("CT", "Connecticut")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("DE", "Delaware")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("DC", "District of Columbia")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("FM", "Federated States of Micronesia")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("FL", "Florida")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("GA", "Georgia")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("GU", "Guam")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("HI", "Hawaii")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("ID", "Idaho")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("IL", "Illinois")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("IN", "Indiana")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("IA", "Iowa")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("KS", "Kansas")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("KY", "Kentucky")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("LA", "Louisiana")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("ME", "Maine")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MH", "Marshall Islands")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MD", "Maryland")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MA", "Massachusetts")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MI", "Michigan")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MN", "Minnesota")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MS", "Mississippi")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MO", "Missouri")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MT", "Montana")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NE", "Nebraska")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NV", "Nevada")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NH", "New Hampshire")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NJ", "New Jersey")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NM", "New Mexico")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NY", "New York")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("NC", "North Carolina")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("ND", "North Dakota")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("MP", "Northern Mariana Islands")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("OH", "Ohio")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("OK", "Oklahoma")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("OR", "Oregon")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("PW", "Palau")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("PA", "Pennsylvania")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("PR", "Puerto Rico")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("RI", "Rhode Island")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("SC", "South Carolina")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("SD", "South Dakota")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("TN", "Tennessee")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("TX", "Texas")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("UT", "Utah")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("VT", "Vermont")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("VI", "Virgin Islands")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("VA", "Virginia")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("WA", "Washington")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("WV", "West Virginia")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("WI", "Wisconsin")')
|
||||||
|
c.execute('INSERT INTO "States" ("code", "name") VALUES ("WY", "Wyoming")')
|
||||||
|
|
||||||
|
def createZIPMultiCountyDB(c):
    """Drop and rebuild the ZIPCodesMultiCounty table and its indexes.

    Holds one row per ZIP/county pairing, for ZIP Codes whose territory
    spans more than one county. Any existing table is discarded first.

    Args:
        c: an open sqlite3 cursor on the destination database.
    """
    # Run the DDL as a fixed sequence: drop, recreate, then one
    # single-column index per commonly-queried field.
    statements = (
        "DROP TABLE IF EXISTS ZIPCodesMultiCounty",
        "CREATE TABLE ZIPCodesMultiCounty ( ZipCode char(5) NOT NULL, StateFIPS char(2), State char(2), CountyFIPS char(5) NULL, County varchar(45), CountyMixedCase varchar(45) )",
        "CREATE INDEX Index_ZIPCodesMultiCounty_ZipCode ON ZIPCodesMultiCounty (ZipCode)",
        "CREATE INDEX Index_ZIPCodesMultiCounty_State ON ZIPCodesMultiCounty (State)",
        "CREATE INDEX Index_ZIPCodesMultiCounty_County ON ZIPCodesMultiCounty (County)",
    )
    for sql in statements:
        c.execute(sql)
|
||||||
|
|
||||||
|
def createZIP4DB(c):
    """Drop and recreate the ZIP4 table plus its query indexes.

    The columns mirror the ZIP+4 CSV product layout (one row per
    address range within a carrier route). Any existing table is
    dropped first; its indexes go with it, so the function is safe to
    call repeatedly.

    Args:
        c: an open sqlite3 cursor on the destination database.

    Raises:
        sqlite3.Error: propagated from any failed DDL statement.
    """
    c.execute("DROP TABLE IF EXISTS `ZIP4`")
    c.execute('''
        CREATE TABLE "ZIP4" (
            "ZipCode"char(5),
            "UpdateKey"varchar(10),
            "Action"char(1),
            "RecordType"varchar(1),
            "CarrierRoute"varchar(4),
            "StPreDirAbbr"varchar(2),
            "StName"varchar(28),
            "StSuffixAbbr"varchar(4),
            "StPostDirAbbr"varchar(2),
            "AddressPrimaryLowNumber"varchar(10),
            "AddressPrimaryHighNumber"varchar(10),
            "AddressPrimaryEvenOdd"varchar(1),
            "BuildingName"varchar(40),
            "AddressSecAbbr"varchar(4),
            "AddressSecLowNumber"varchar(10),
            "AddressSecHighNumber"varchar(10),
            "AddressSecOddEven"varchar(1),
            "Plus4Low"varchar(4),
            "Plus4High"varchar(4),
            "BaseAlternateCode"varchar(1),
            "LACSStatus"varchar(1),
            "GovernmentBuilding"varchar(1),
            "FinanceNumber"varchar(6),
            "State"varchar(2),
            "CountyFIPS"varchar(3),
            "CongressionalDistrict"varchar(2),
            "MunicipalityKey"varchar(6),
            "UrbanizationKey"varchar(6),
            "PreferredLastLineKey"varchar(6),
            "ToLatitude"decimal(18, 10),
            "FromLatitude"decimal(18, 10),
            "ToLongitude"decimal(18, 10),
            "FromLongitude"decimal(18, 10),
            "CensusTract"varchar(15),
            "CensusBlock"varchar(15),
            "TLID"varchar(15),
            "LatLonMultiMatch"varchar(1),
            "StreetFull" varchar(36)
        )
    ''')
    # BUG FIX: this index previously named "AddressPrimaryOddEven",
    # which does not exist — the table declares "AddressPrimaryEvenOdd".
    # SQLite raised "no such column" here, so table creation never
    # finished. Use the declared column name.
    c.execute('''CREATE INDEX "addressnumber" ON "ZIP4" ("AddressPrimaryLowNumber","AddressPrimaryHighNumber","AddressPrimaryEvenOdd")''')
    c.execute('''CREATE INDEX "key" ON "ZIP4" ("PreferredLastLineKey")''')
    c.execute('''CREATE INDEX "zipcode_route" ON "ZIP4" ("ZipCode", "CarrierRoute")''')
    c.execute('''CREATE INDEX "state" ON "ZIP4" ("State")''')
    c.execute('''CREATE INDEX "streetfull_state" ON "ZIP4" ("StreetFull", "State")''')
    c.execute('''CREATE INDEX "stname_state" ON "ZIP4" ("StName", "State")''')
    c.execute('''CREATE INDEX "zip" ON "ZIP4" ("ZipCode")''')
    c.execute('''CREATE INDEX "streetfull_state_zip" ON "ZIP4" ("StreetFull", "State", "ZipCode")''')
    c.execute('''CREATE INDEX "stname_state_zip" ON "ZIP4" ("StName", "State", "ZipCode")''')
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line interface: two positional arguments, the source archive
# and the destination database path.
parser = ArgumentParser(description='Create a SQLite ZIP Code database from CSV data from https://www.zip-codes.com/zip-plus-4-database.asp. Supports both 5-digit ZIP and ZIP+4 products.')
parser.add_argument('src', help='Input .zip archive')
parser.add_argument('dest', help='Output SQLite3 database file')

if __name__ == "__main__":
    # Run the conversion only when invoked as a script, not on import.
    options = parser.parse_args()
    process(options.src, options.dest)
|
||||||
Loading…
x
Reference in New Issue
Block a user