Add gitignore

This commit is contained in:
Skylar Ittner 2025-11-15 19:51:14 -07:00
commit b2de3304f3
15 changed files with 45862 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__/*

99
checkoa.py Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/python3
import os, json, traceback
from argparse import ArgumentParser
rowstocheck = 15000 # Stop reading a file after this many rows, speeds up analysis of many/large address files

# Module-level report buckets: checkGeojson() files each input into the
# matching list(s); the report code under __main__ prints them.
oklist = []
emptygeometrylist = []
emptyaddresslist = []
nocitylist = []
noziplist = []
totallyemptylist = []

def checkGeojson(filename):
    """Scan a line-delimited OpenAddresses GeoJSON file and record problems.

    Reads up to `rowstocheck` lines, counting rows with a missing street
    address, geometry, city, or postal code, then appends `filename` to the
    matching module-level report list(s) based on per-file thresholds.
    Files where (almost) every row lacks an address, and empty files, go to
    `totallyemptylist`; problem-free files go to `oklist`.
    """
    linecount = 0
    okcount = 0
    emptygeometrycount = 0
    emptyaddresscount = 0
    emptycitycount = 0
    emptyzipcount = 0
    # `with` guarantees the file is closed even if a line raises.
    with open(filename, 'r') as filedata:
        for line in filedata:
            linecount += 1
            if linecount > rowstocheck:
                break
            try:
                data = json.loads(line)
                bad = False
                if not data["properties"]["number"] or not data["properties"]["street"]:
                    emptyaddresscount += 1
                    bad = True
                if not data["geometry"] or not data["geometry"]["coordinates"][0] or not data["geometry"]["coordinates"][1]:
                    emptygeometrycount += 1
                    bad = True
                if not data["properties"]["city"] and not data["properties"]["postcode"]: # Flag missing city unless postal code exists, because city can probably be filled from that
                    emptycitycount += 1
                    bad = True
                if not data["properties"]["postcode"]:
                    emptyzipcount += 1
                    bad = True
                if not bad:
                    okcount += 1
            except Exception:
                traceback.print_exc()
                print("Error encountered while processing", filename, "at", line)
            # Live progress line, overwritten in place via \r.
            print(filename, ": OK:", str(okcount), "Bad: geometry:", str(emptygeometrycount), "address:", str(emptyaddresscount), "city:", str(emptycitycount), "zip:", str(emptyzipcount), " ", end="\r", flush=True)
    bad = False
    # BUG FIX: an empty file used to raise ZeroDivisionError here; the rate
    # checks only make sense when at least one line was read.
    if linecount > 0:
        if emptygeometrycount / linecount > .25:
            emptygeometrylist.append(filename)
            bad = True
        if emptyaddresscount / linecount > .67:
            emptyaddresslist.append(filename)
            bad = True
        if emptycitycount / linecount > .67:
            nocitylist.append(filename)
            bad = True
        if emptyzipcount / linecount > .75:
            noziplist.append(filename)
            bad = True
    if emptyaddresscount >= (linecount - 10): # Allow a couple not-fully-empty addresses, otherwise some broken ones won't be reported
        totallyemptylist.append(filename)
        bad = True
    if not bad:
        oklist.append(filename)
# Command-line interface: take one or more GeoJSON files, check each, then
# print a grouped problem report.
parser = ArgumentParser(
    description="Check OpenAddresses GeoJSON files and report on any problems found."
)
parser.add_argument(
    "source",
    help="File(s) to check.",
    nargs='+'
)

def _printReportSection(label, filenames):
    """Print one report heading followed by its (possibly empty) file list."""
    print(" " + label + ":")
    for entry in filenames:
        print("    ", entry)

if __name__ == "__main__":
    args = parser.parse_args()
    print("Checking " + str(len(args.source)) + " OpenAddresses data files.")
    for target in args.source:
        checkGeojson(target)
        print(" ")
    print()
    print("== Report ==")
    _printReportSection("Files missing geometry", emptygeometrylist)
    _printReportSection("Files missing street address", emptyaddresslist)
    _printReportSection("Files missing city", nocitylist)
    _printReportSection("Files missing postal code", noziplist)
    _printReportSection("Files missing all street addresses", totallyemptylist)

63
downloadoa.py Executable file
View File

@ -0,0 +1,63 @@
#!/usr/bin/python3
import gzip
import shutil
from argparse import ArgumentParser
import requests, tempfile, os, pathlib
# Cache of the OpenAddresses source catalog, keyed by source ID; filled on
# first use by getSourceList().
sourceList = {}

def getSourceList():
    """Fetch the OpenAddresses batch API catalog (once) and cache it.

    Only "addresses"-layer entries are kept; when a source ID appears more
    than once, the entry with the newest "updated" timestamp wins.
    Returns the cached {source_id: entry} dict.
    """
    global sourceList
    if not sourceList:
        print("Fetching sources list")
        catalog = requests.get(
            "https://batch.openaddresses.io/api/data"
        ).json()
        for entry in catalog:
            if entry["layer"] != "addresses":
                continue
            previous = sourceList.get(entry["source"])
            if previous is None or entry["updated"] > previous["updated"]:
                sourceList[entry["source"]] = entry
    return sourceList
def downloadSources(id, outfolder):
    """Download every OpenAddresses source whose ID starts with `id`.

    Each matching source's gzipped GeoJSON is streamed to a temp file, then
    decompressed into `outfolder/<source>-addresses-<name>.geojson`.
    Files already on disk are skipped.

    Raises requests.HTTPError if a download URL returns an error status.
    """
    # Hoisted: the catalog is built once instead of being re-fetched from the
    # cache twice per loop iteration.
    sources = getSourceList()
    for sourceName, s in sources.items():
        if s["source"].startswith(id):
            outfilename = outfolder + "/" + s["source"] + "-addresses-" + s["name"] + ".geojson"
            outfoldername = os.path.dirname(outfilename)
            if os.path.isfile(outfilename):
                print("Skipping " + s["source"] + ", already on disk.")
                continue
            print("Downloading " + s["source"])
            gzdl = requests.get("https://v2.openaddresses.io/batch-prod/job/" + str(s["job"]) + "/source.geojson.gz", stream=True)
            # BUG FIX: a failed download (e.g. 404) used to be written out and
            # only exploded later inside gzip with a confusing error.
            gzdl.raise_for_status()
            tmp = tempfile.NamedTemporaryFile()
            with open(tmp.name, 'wb') as tf:
                for chunk in gzdl.iter_content(chunk_size=16*1024):
                    tf.write(chunk)
            pathlib.Path(outfoldername).mkdir(parents=True, exist_ok=True)
            with gzip.open(tmp.name) as gzf, open(outfilename, 'wb') as outf:
                shutil.copyfileobj(gzf, outf)
# Command-line interface: a source ID (or ID prefix) plus an output folder.
parser = ArgumentParser(
    description="Download address data from OpenAddresses.io"
)
parser.add_argument(
    "source",
    help="Source dataset ID, or partial ID. For example: us/al/ will download all Alabama datasets, us/mt/statewide will download the Montana statewide dataset.",
)
parser.add_argument(
    "outfolder",
    help="Output folder",
)

if __name__ == "__main__":
    cli = parser.parse_args()
    downloadSources(cli.source, cli.outfolder)

780
main.py Executable file
View File

@ -0,0 +1,780 @@
#!/usr/bin/python3
# Print a startup banner before the heavy imports below run (pandas/dask can
# take several seconds to load), so the user sees immediate feedback.
if __name__ == "__main__":
    print("Address Database Builder 2025")
    print("Starting up...")
import argparse, csv, zipfile, gzip, os, re, json, traceback, sys, multiprocessing
import concurrent.futures
from collections import deque
import pandas as pd
import dask.dataframe as dd
import gc
from multiprocessing import get_context
import sqlite3
from src.addressfunctions import normalizeAddress
from src.constants import ValidationException
import src.config
# Worker-pool sizing: one process per CPU core (overridable via --cpu), with
# at most twice that many chunks queued at once to bound memory use.
maxthreads = multiprocessing.cpu_count()
MAX_IN_FLIGHT = maxthreads * 2
# NOTE(review): presumably set to keep OpenBLAS from pinning/spawning its own
# threads inside worker processes — confirm against OpenBLAS docs.
os.environ["OPENBLAS_MAIN_FREE"] = "1"
writelock = multiprocessing.Lock()  # serializes appends to the shared output CSV
badcount = 0  # addresses that failed to parse or normalize
skippedcount = 0  # addresses skipped by --ignorestates / --onlystates
countrycode = "US"  # two-letter country code, overridden by --country
def init_worker(cfg: src.config.AppConfig):
    """ProcessPoolExecutor initializer: install the shared AppConfig in each worker process."""
    src.config.set_config(cfg)
def fixLatLon(filepath):
    """Detect and repair flipped latitude/longitude pairs in an ingested CSV.

    Reads `filepath`, swaps latitude/longitude for rows where latitude is out
    of physical range or (for US data, excluding VI/AK/HI/PR) outside what
    appear to be contiguous-US bounds, and appends the corrected frame to
    `filepath + ".coordfix.csv"`.
    """
    cfg = src.config.get_config()
    print("Repairing flipped latitude/longitude pairs in " + filepath)
    fixedcount = 0
    df = pd.read_csv(filepath, keep_default_na=False, dtype="str")
    # States/territories exempt from the bounding-box checks below.
    skipstates = ("VI", "AK", "HI", "PR")
    for index, row in df.iterrows():
        # `row` is a copy: these conversions only feed the comparisons; actual
        # fixes are written back through df.at[...].
        row.latitude = float(row.latitude)
        row.longitude = float(row.longitude)
        if row.latitude < -90 or row.latitude > 90:
            # Latitude outside the physically possible range: definitely swapped.
            df.at[index, "latitude"], df.at[index, "longitude"] = row.longitude, row.latitude
            fixedcount = fixedcount + 1
        elif cfg.countryCode == "US" and row.state not in skipstates and (row.longitude < -171.791110603 or row.longitude > -66.96466):
            # NOTE(review): constants look like US longitude extremes — confirm source.
            df.at[index, "latitude"], df.at[index, "longitude"] = row.longitude, row.latitude
            fixedcount = fixedcount + 1
        elif cfg.countryCode == "US" and row.state not in skipstates and (row.latitude < 18.91619 or row.latitude > 71.3577635769):
            # NOTE(review): constants look like US latitude extremes — confirm source.
            df.at[index, "latitude"], df.at[index, "longitude"] = row.longitude, row.latitude
            fixedcount = fixedcount + 1
    # Appends; header only written if the .coordfix.csv doesn't exist yet.
    df.to_csv(filepath + ".coordfix.csv", mode='a', index=False, header=not os.path.exists(filepath + ".coordfix.csv"))
    print("\nDone flipping " + filepath + "! Fixed " + str(fixedcount) + " records.")
def normalize(number, street, street2, city, state, zipcode, latitude, longitude, zipprefix = False, plus4="", county = False):
    """Normalize one address, retrying with progressively looser inputs.

    Calls normalizeAddress(); when the result still lacks a 5-digit ZIP or a
    4-digit +4, retries with a digits-only house number, then without the
    city, and finally — only when libpostal mode is enabled — via
    advancedNormalize(). Returns the best normalized address dict obtained.
    """
    cfg = src.config.get_config()
    street1 = street
    if len(city) > 4 and street1.endswith(" " + city):
        # City name leaked into street field (Albany County Wyoming, for one)
        street1 = street1.removesuffix(" " + city)
    addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
    if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
        # Try removing letters from address numbers, and ignore city field
        addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        # If that didn't work, try instead stripping the city name because it might be wrong
        if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
            addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        # Use libpostal to analyze address deeper.
        # BUG FIX: this condition was `cfg.advancedMode and A or B`; because
        # `and` binds tighter than `or`, a missing +4 entered this branch even
        # with libpostal disabled, discarding the addrstrip fallback above.
        if cfg.advancedMode and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
            try:
                addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
            except Exception as e:
                # Best effort: fall through to the plain re-normalization below.
                pass
            # Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
            if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
                addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        else:
            addr = addrstrip
    return addr
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
    """Normalize one chunk of this script's own CSV format and append it to the output CSV.

    Runs inside a worker process. Rows in `ignorestates` (or outside
    `keeponlystates` when that list is non-empty) are skipped; the rest are
    normalized and appended to `outfilename` under `writelock`.

    NOTE(review): badcount/skippedcount are per-process globals, so totals
    printed by the parent only reflect the parent's own counters — confirm
    whether cross-process totals were intended.
    """
    global badcount, skippedcount, writelock
    cfg = src.config.get_config()
    data = []
    print(" " + str(chunkcount) + " ", end="\r", flush=True)  # progress: row offset of this chunk
    for index, row in chunk.iterrows():
        if row.state in ignorestates:
            skippedcount = skippedcount + 1
            continue
        if keeponlystates != [] and row.state not in keeponlystates:
            skippedcount = skippedcount + 1
            continue
        try:
            # Rows that already carry a 4-digit +4 are assumed normalized and
            # passed through unchanged, unless --noskip4 was given.
            if not cfg.noSkip4 and len(row.plus4 or "") == 4:
                addr = {
                    "number": row.number,
                    "street": row.street,
                    "unit": row.street2,
                    "city": row.city,
                    "state": row.state,
                    "zip": row.zip,
                    "plus4": row.plus4,
                    "latitude": round(float(row.latitude),7),
                    "longitude": round(float(row.longitude), 7)
                }
            else:
                addr = normalize(row.number, row.street, row.street2, row.city, row.state, row.zip, round(float(row.latitude),7), round(float(row.longitude), 7), False, row.plus4)
            # Normalization may reassign the state, so re-check the ignore list.
            if addr["state"] in ignorestates:
                skippedcount = skippedcount + 1
                continue
            data.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], row.source])
        except ValidationException as e:
            # Failed validation: counted, not logged.
            badcount = badcount + 1
        except Exception as e:
            print("W: Couldn't ingest address:")
            print(row)
            traceback.print_exc()
            badcount = badcount + 1
    out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    # Serialize appends across workers; header only written on first creation.
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
def importOwnFile(filename, outfilename, ignorestates, keeponlystates):
    """Re-process a CSV previously produced by this script, in parallel chunks.

    Streams `filename` in 1000-row chunks, fanning each chunk out to a
    process pool (processOwnChunk) with MAX_IN_FLIGHT backpressure, then
    prints a summary.

    NOTE(review): `cfg` below is the module-level config created under
    __main__, not a local — this function only works from the CLI path.
    """
    global badcount, skippedcount, writelock
    print("Processing addresses from " + filename)
    columns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    file = filename
    chunkcount = 0
    badcount = 0
    skippedcount = 0
    chunksize = 1000
    in_flight = set()  # submitted futures not yet known to be finished
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, max_tasks_per_child=100, initializer=init_worker, initargs=(cfg,)) as executor:
        for chunk in pd.read_csv(file, chunksize=chunksize, usecols=columns, keep_default_na=False, dtype={
                "number":"string","street":"string",
                "street2":"string","city":"string",
                "state":"category", "zip":"string",
                "plus4": "string",
                "latitude":"float32", "longitude":"float32",
                "source":"category"}, dtype_backend="pyarrow"):
            # Backpressure: cap queued chunks to bound memory use.
            while len(in_flight) >= MAX_IN_FLIGHT:
                done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                for fut in done:
                    fut.result()  # re-raise any worker exception
            fut = executor.submit(processOwnChunk, chunk, chunkcount * chunksize, outfilename, ignorestates, keeponlystates)
            in_flight.add(fut)
            chunkcount = chunkcount + 1
        # Drain the remaining futures before the pool shuts down.
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    print("\nDone processing! Parsed " + str(chunkcount) + " chunks.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
def processNadChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
    """Normalize one chunk of National Address Database rows and append them to the output CSV.

    Runs inside a worker process. State filters are applied both on the raw
    row and again after normalization (some sources mislabel the state but
    the ZIP corrects it). Output rows are appended under `writelock`.
    """
    global badcount, skippedcount, writelock
    print(" " + str(chunkcount) + " ", end="\r", flush=True)  # progress: row offset of this chunk
    data = []
    for index, row in chunk.iterrows():
        if row.State.upper() in ignorestates:
            skippedcount = skippedcount + 1
            continue
        if keeponlystates != [] and row.State.upper() not in keeponlystates:
            skippedcount = skippedcount + 1
            continue
        try:
            # Pick the best available locality name: incorporated municipality,
            # then postal city, then unincorporated community.
            town = row.Inc_Muni
            if town == "Unincorporated":
                town = ""
            if not town:
                town = row.Post_City
            if not town:
                town = row.Uninc_Comm
            # BUG FIX: previously passed row.Inc_Muni here, which discarded
            # the town fallback chain computed above.
            addr = normalize(row.AddNo_Full, row.StNam_Full, row.SubAddress, town, row.State, row.Zip_Code, round(float(row.Latitude),7), round(float(row.Longitude), 7))
            if addr["state"] in ignorestates: # For example, AR's data claims to have MO addresses but the ZIP says they're in AR, so the first pass of this won't catch those
                skippedcount = skippedcount + 1
                continue
            # Label output rows as e.g. "NAD Montana" rather than "State of Montana".
            source = row.NAD_Source
            source = source.replace("State of ", "")
            source = "NAD " + source
            data.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
        except ValidationException as e:
            badcount = badcount + 1
        except Exception as e:
            print("W: Couldn't ingest address:")
            print(row)
            traceback.print_exc()
            badcount = badcount + 1
    out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
def importNadFile(filename, outfilename, ignorestates, keeponlystates, startatline):
    """Import a National Address Database CSV (or NAD zip archive) in parallel chunks.

    Parameters:
        filename: the NAD .txt/.csv, or the distribution .zip (the first
            TXT/NAD*.TXT member inside the archive is read).
        outfilename: CSV file results are appended to.
        ignorestates / keeponlystates: per-row state filters.
        startatline: number of data rows to skip for resuming a run.
    """
    global skippedcount, badcount
    print("Importing National Address Database addresses from " + filename)
    if startatline > 0:
        print("Skipping to line number " + str(startatline))
    # Columns consumed from the NAD schema.
    columns = [
        "AddNo_Full",
        "StNam_Full",
        "St_PreMod",
        "St_PreDir",
        "St_Name",
        "SubAddress",
        "Inc_Muni",
        "Post_City",
        "Uninc_Comm",
        "Urbnztn_PR",
        "State",
        "Zip_Code",
        "UUID",
        "Longitude",
        "Latitude",
        "DateUpdate",
        "NAD_Source",
    ]
    file = filename
    if filename.endswith(".zip"):
        # Stream the data member straight out of the archive without extracting.
        zf = zipfile.ZipFile(filename, mode="r")
        zipFiles = zf.namelist()
        for fname in zipFiles:
            if fname.upper().startswith("TXT/NAD") and fname.upper().endswith(".TXT"):
                file = zf.open(fname, mode="r", force_zip64=True)
                break
    chunkcount = 0
    chunksize = 1000
    in_flight = set()  # submitted futures not yet known to be finished
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, mp_context=get_context("spawn"), max_tasks_per_child=100, initializer=init_worker, initargs=(cfg,)) as executor:
        # skiprows keeps row 0 (the header) and skips data rows 1..startatline.
        for chunk in pd.read_csv(file, chunksize=chunksize, header=0, skiprows=lambda i: 1 <= i <= startatline, usecols=columns, keep_default_na=False, dtype={
                "State":"category","NAD_Source":"category",
                "Zip_Code":"string","UUID":"string",
                "AddNo_Full":"string","StNam_Full":"string","St_PreMod":"string",
                "St_PreDir":"string","St_Name":"string","SubAddress":"string",
                "Inc_Muni":"string","Post_City":"string","Uninc_Comm":"string",
                "Urbnztn_PR":"string","Longitude":"float32","Latitude":"float32",
                "DateUpdate":"string"}, dtype_backend="pyarrow"):
            # Backpressure: cap queued chunks to bound memory use.
            while len(in_flight) >= MAX_IN_FLIGHT:
                done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                for fut in done:
                    fut.result()  # re-raise any worker exception
            fut = executor.submit(processNadChunk, chunk, chunkcount * chunksize, outfilename, ignorestates, keeponlystates)
            in_flight.add(fut)
            chunkcount = chunkcount + 1
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    print("\nDone importing NAD! Processed " + str(chunkcount) + " chunks of " + str(chunksize) + " rows.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
def processOpenAddressRows(rows, startindex, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county = False):
    """Normalize a batch of OpenAddresses GeoJSON lines and append them to the output CSV.

    Runs inside a worker process.

    Parameters:
        rows: raw line-delimited GeoJSON features (one JSON object per line).
        startindex: line offset of this batch (progress display only).
        outfilename: CSV file appended to under `writelock`.
        ignorestates: states to drop, checked before and after normalization.
        source: source label for output rows; if falsy, derived as "OA/<state>".
        stateOverride: forces the state field when set.
        zipprefix: passed through to normalize() for faster ZIP lookups.
        citySuggestion: city to assume when a row has no city, or False.
        county: county name hint for ZIP lookup, or False.
    """
    global badcount, skippedcount, writelock
    print(" " + str(startindex) + " ", end="\r", flush=True)
    linecount = 0
    outdata = []
    emptylinecount = 0
    for line in rows:
        linecount = linecount + 1
        try:
            data = json.loads(line)
            # Tally empty rows first so the empty-chunk warning below counts
            # rows that are skipped later too.
            if not data["properties"]["number"] and not data["properties"]["street"]:
                emptylinecount = emptylinecount + 1
            if not data["geometry"] or not data["geometry"]["coordinates"][0] or not data["geometry"]["coordinates"][1]:
                emptylinecount = emptylinecount + 1
            state = data["properties"]["region"].upper()
            city = data["properties"]["city"].upper().strip()
            if stateOverride:
                state = stateOverride
            if state in ignorestates:
                skippedcount = skippedcount + 1
                continue
            if data["geometry"] is None:
                badcount = badcount + 1
                continue
            if not data["properties"]["number"] or not data["properties"]["street"] or data["properties"]["number"] == "0":
                badcount = badcount + 1
                continue
            if citySuggestion and not city:
                city = citySuggestion
            if source == "OA/hawaii" and re.match(r"^[1-9][1-9][0-9]{4}", data["properties"]["number"]):
                # Source is broken/missing, and the last good version has the house numbers without dashes
                # Hawaii has a specific and unique address numbering system
                data["properties"]["number"] = data["properties"]["number"][:2] + "-" + data["properties"]["number"][2:]
            addr = normalize(data["properties"]["number"], data["properties"]["street"], data["properties"]["unit"], city, state, data["properties"]["postcode"], data["geometry"]["coordinates"][1], data["geometry"]["coordinates"][0], zipprefix, "", county)
            if addr["state"] in ignorestates:
                skippedcount = skippedcount + 1
                continue
            if addr["street"] == "":
                badcount = badcount + 1
                continue
            if not source:
                # NOTE(review): once set, this sticks for the rest of the batch,
                # so later rows inherit the first normalized row's state label —
                # confirm that's intended.
                source = "OA/"+addr["state"]
            outdata.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
        except ValidationException as e:
            badcount = badcount + 1
        except Exception as e:
            traceback.print_exc()
            print("Error encountered while processing", line)
            badcount = badcount + 1
    if linecount > 0 and emptylinecount / linecount > .95:
        print("\nWarning: Empty chunk! " + str(emptylinecount) + " of " + str(linecount) + " rows had no address.")
    out = pd.DataFrame(data=outdata, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
def importOpenAddressFile(filepath, outfilename, ignorestates, source, stateOverride, zipprefix):
    """Import an OpenAddresses GeoJSON (optionally gzipped) file in parallel batches.

    Derives the source label, an optional city suggestion, and an optional
    county hint from the filename, then streams 1000-line batches to
    processOpenAddressRows() via a process pool with MAX_IN_FLIGHT backpressure.
    """
    global badcount, skippedcount
    cfg = src.config.get_config()
    print("Importing OpenAddresses data from " + filepath)
    chunksize = 1000
    linecount = 0
    if stateOverride:
        stateOverride = stateOverride.strip().upper()
    file = filepath
    if filepath.endswith(".gz"):
        file = gzip.open(filepath, 'rb')
    else:
        file = open(file, 'r')
    county = False
    if not source or source == "":
        # Filename convention: <source>-addresses-<name>.geojson
        source = "OA/"+filepath.split("/")[-1].split("-")[0]
    if source.startswith("OA/statewide"):
        # "statewide" files don't name their state; use the override or defer
        # to per-row detection (source=False).
        if stateOverride:
            source = source.replace("statewide", stateOverride)
        else:
            source = False
    citySuggestion = False
    # NOTE(review): this only derives a suggestion when cfg.citySuggestion is
    # falsy, and a user-supplied --city never reaches the workers here —
    # confirm whether the condition is inverted.
    if not cfg.citySuggestion and filepath.split("/")[-1].startswith("city_of_"):
        # Set city suggestion using filename
        citySuggestion = re.sub(r'\d+', '', filepath.split("/")[-1].split("-")[0].replace("city_of_", "").replace("_", " ").upper().strip())
    if filepath.split("/")[-1].endswith("-addresses-county.geojson"):
        county = filepath.split("/")[-1].split("-")[0].replace("_", " ").upper().strip()
        print("Detected county from filename: " + county + ", will use for ZIP Code hinting")
    lines = []
    in_flight = set()  # submitted futures not yet known to be finished
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, mp_context=get_context("spawn"), max_tasks_per_child=1000, initializer=init_worker, initargs=(cfg,)) as executor:
        for line in file:
            lines.append(line)
            linecount = linecount + 1
            if len(lines) >= chunksize:
                # Backpressure: cap queued batches to bound memory use.
                while len(in_flight) >= MAX_IN_FLIGHT:
                    done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                    for fut in done:
                        fut.result()  # re-raise any worker exception
                fut = executor.submit(processOpenAddressRows, lines, linecount, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county)
                in_flight.add(fut)
                lines = []
        # Submit the final partial batch.
        fut = executor.submit(processOpenAddressRows, lines, linecount, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county)
        in_flight.add(fut)
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    file.close()
    print("\nDone importing OpenAddresses! Processed " + str(linecount) + " entries.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
    return
def importOSMFile(filename, outfilename):
    """
    Import addresses from an OSM Overpass API CSV export (tab-separated).

    Overpass API query for data input (replace name=Montana with the region you want):
    [out:csv(::"lat", ::"lon", "addr:housenumber", "addr:street", "addr:city", "addr:state", "addr:postcode")][timeout:120];
    area["name"="Montana"]->.boundaryarea;
    node["addr:housenumber"]["addr:street"](area.boundaryarea);
    out;
    way["addr:housenumber"]["addr:street"](area.boundaryarea);
    out center;
    relation["addr:housenumber"]["addr:street"](area.boundaryarea);
    out center;
    """
    print("Importing OSM Overpass data from " + filename)
    tsvcolumns = [
        "@lat",
        "@lon",
        "addr:housenumber",
        "addr:street",
        "addr:city",
        "addr:state",
        "addr:postcode"
    ]
    outcolumns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    chunkcount = 0
    badcount = 0
    skippedcount = 0
    source = "OpenStreetMap.org. License: ODbL"
    reader = pd.read_csv(filename, sep='\t', chunksize=100, usecols=tsvcolumns, keep_default_na=False, dtype="str")
    for chunkindex, frame in enumerate(reader):
        print(" " + str(chunkindex * 100) + " ", end="\r", flush=True)
        rows = []
        for _, record in frame.iterrows():
            try:
                parsed = normalize(record["addr:housenumber"], record["addr:street"], "", record["addr:city"], record["addr:state"], record["addr:postcode"], record["@lat"], record["@lon"])
                rows.append([parsed['number'], parsed['street'], parsed['unit'], parsed['city'], parsed['state'], parsed['zip'], parsed['plus4'], parsed['latitude'], parsed['longitude'], source])
            except ValidationException:
                badcount += 1
            except Exception:
                print("W: Couldn't ingest address:")
                print(record)
                traceback.print_exc()
                badcount += 1
        # Append this chunk; the header is only written on file creation.
        pd.DataFrame(data=rows, columns=outcolumns).to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=outcolumns)
        chunkcount = chunkindex + 1
    print("\nDone importing OSM! Processed " + str(chunkcount) + " chunks.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    print("Saved to output file " + outfilename)
def importNARFile(filename, outfilename):
    """Import the Statistics Canada National Address Register zip into the output CSV.

    Lazily joins each province's Addresses/Address_*.csv members to their
    Locations/Location_*.csv members on LOC_GUID (via dask, read straight out
    of the zip), then streams the merged rows to `outfilename`. Rows with no
    street are emitted as "PO BOX" records when possible, otherwise skipped.
    """
    print("Importing Statistics Canada data from " + filename)
    zf = zipfile.ZipFile(filename, mode="r")
    zipFiles = zf.namelist()
    locationFileList = {}
    addressFileList = {}
    # Statistics Canada two-digit province/territory codes.
    provinceCodes = [10,11,12,13,24,35,46,47,48,59,60,61,62]
    for c in provinceCodes:
        addressFileList[str(c)] = []
        locationFileList[str(c)] = []
    # Bucket archive members by the province code embedded in the filename.
    for fname in zipFiles:
        if fname.startswith("Addresses/Address_") and fname.endswith(".csv"):
            number = fname.replace("Addresses/Address_", "").replace(".csv", "").split("_")[0]
            addressFileList[number].append(fname)
        elif fname.startswith("Locations/Location_") and fname.endswith(".csv"):
            number = fname.replace("Locations/Location_", "").replace(".csv", "").split("_")[0]
            locationFileList[number].append(fname)
    print("\nMerging address and location tables...")
    mergecount = 0
    dataframes = []
    addrcols = ["LOC_GUID","APT_NO_LABEL","CIVIC_NO","CIVIC_NO_SUFFIX","MAIL_STREET_NAME","MAIL_STREET_TYPE","MAIL_STREET_DIR","MAIL_MUN_NAME","MAIL_PROV_ABVN","MAIL_POSTAL_CODE","BU_N_CIVIC_ADD"]
    loccols = ["LOC_GUID","BG_LATITUDE","BG_LONGITUDE"]
    for provinceId in provinceCodes:
        print(" " + str(mergecount+1) + " ", end="\r", flush=True)
        # Lazy dask reads directly from the zip; nothing is loaded yet.
        readaf = map(lambda addrFilename: dd.read_csv("zip://"+addrFilename, storage_options={'fo': filename}, usecols=addrcols, keep_default_na=False, dtype="str"), addressFileList[str(provinceId)])
        readlf = map(lambda locationFilename: dd.read_csv("zip://"+locationFilename, storage_options={'fo': filename}, usecols=loccols, keep_default_na=False, dtype="str"), locationFileList[str(provinceId)])
        addressFrame = dd.concat(list(readaf), ignore_index=False)
        locationFrame = dd.concat(list(readlf), ignore_index=False)
        # Join addresses to their coordinates; evaluated during iterrows below.
        dataframes.append(dd.merge(addressFrame, locationFrame, on=["LOC_GUID"]))
        mergecount = mergecount + 1
    print("\nProcessing addresses...")
    file = filename
    alladdrcount = 0
    skippedcount = 0
    source = "StatsCan NAR"
    provinceIndex = 0
    for df in dataframes:
        print("\nProcessing province ID " + str(provinceCodes[provinceIndex]))
        data = []
        addrcount = 0
        for index, row in df.iterrows():
            if (addrcount % 100 == 0):
                print(" " + str(addrcount) + " ", end="\r", flush=True)
            # Civic number plus optional suffix (e.g. "123A"); street built
            # from name + type + direction.
            number = ("".join(filter(None, [row["CIVIC_NO"], row["CIVIC_NO_SUFFIX"]]))).strip().upper()
            street = (" ".join(filter(None, [row["MAIL_STREET_NAME"], row["MAIL_STREET_TYPE"], row["MAIL_STREET_DIR"]]))).strip().upper()
            apt = row["APT_NO_LABEL"].strip().upper()
            if street == "":
                # PO BOX probably
                if row["BU_N_CIVIC_ADD"].startswith("PO BOX "):
                    data.append([row["BU_N_CIVIC_ADD"].replace("PO BOX ", "").strip(), "PO BOX", "", row["MAIL_MUN_NAME"], row["MAIL_PROV_ABVN"], row["MAIL_POSTAL_CODE"], "", row["BG_LATITUDE"], row["BG_LONGITUDE"], source])
                else:
                    skippedcount = skippedcount + 1
            else:
                data.append([number, street, apt, row["MAIL_MUN_NAME"], row["MAIL_PROV_ABVN"], row["MAIL_POSTAL_CODE"], "", row["BG_LATITUDE"], row["BG_LONGITUDE"], source])
            addrcount = addrcount + 1
            if len(data) >= 1000: # Dump to file so we don't use tons of RAM
                out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
                out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
                data = []
        # Flush the remainder for this province.
        out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        alladdrcount = alladdrcount + addrcount
        provinceIndex = provinceIndex + 1
    print("\nDone importing NAR! Processed " + str(alladdrcount) + " addresses.")
    print("Skipped " + str(skippedcount) + " invalid mailing addresses.")
    print("Saved to output file " + outfilename)
def removeDupes(filepath):
    """Remove duplicate and incomplete addresses from an ingested CSV.

    Streams `filepath` in 20M-row chunks, drops rows missing any required
    field, keeps the +4-bearing copy of each duplicate, and appends results
    to `filepath + ".dedup.csv"`. Only catches duplicates within a chunk.
    """
    print("Removing duplicate and incomplete addresses from " + filepath)
    chunkcount = 0
    chunksize = 20000000
    # BUG FIX: "plus4" was missing from usecols, so sort_values(by="plus4")
    # below raised KeyError; it's now read and written through, so the kept
    # +4 actually survives deduplication.
    for chunk in pd.read_csv(filepath, chunksize=chunksize, keep_default_na=False, dtype="str", usecols=["number", "street", "street2", "city", "state", "zip", "plus4", "latitude", "longitude", "source"]):
        print(".", end="", flush=True)
        chunk.replace('', None, inplace=True)
        # Drop rows missing any required field (street2/plus4 may be empty).
        chunk.dropna(subset=['zip','number','street','city','state','latitude','longitude'], inplace=True)
        chunk.sort_values(by="plus4", ascending=False, inplace=True, na_position="last") # Make sure the address duplicate with a +4 is kept
        chunk.drop_duplicates(subset=["number", "street", "street2", "city", "state", "zip"], keep="first", inplace=True)
        chunk.to_csv(filepath + ".dedup.csv", mode='a', index=False, header=not os.path.exists(filepath + ".dedup.csv"), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        chunkcount = chunkcount + 1
    print("\nDone removing duplicates from " + filepath + "! Processed " + str(chunkcount) + " chunks of " + str(chunksize) + " records.")
def tosqlite(addressfile, dbfile):
    """Load an ingested address CSV (optionally gzipped) into a SQLite database.

    Creates the `addresses` table and its indexes if needed, then streams the
    CSV in 5000-row chunks through a staging table (`addresses_temp`) so that
    `INSERT OR IGNORE` enforces the UNIQUE constraint without aborting the
    whole batch on a duplicate.

    Returns the number of rows inserted (plus rows deleted by the final
    number="0" cleanup).
    """
    global countrycode
    cfg = src.config.get_config()
    print("\nReading addresses from " + addressfile)
    file = addressfile
    if addressfile.endswith(".gz"):
        file = gzip.open(addressfile, 'rb')
    else:
        file = open(addressfile, 'r')
    connection = sqlite3.connect(dbfile)
    cursor = connection.cursor()
    # Destination table; the UNIQUE clause is what deduplicates on insert.
    cursor.execute("""CREATE TABLE IF NOT EXISTS `addresses` (
        `zipcode` VARCHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        `latitude` DECIMAL ( 8 , 6 ) NOT NULL,
        `longitude` DECIMAL( 9 , 6 ) NOT NULL,
        `source` VARCHAR( 40 ),
        UNIQUE (zipcode, number, street, street2, country)
    )""")
    # Staging table is rebuilt fresh each run (no UNIQUE constraint here).
    cursor.execute("DROP TABLE IF EXISTS `addresses_temp`")
    cursor.execute("""CREATE TABLE IF NOT EXISTS `addresses_temp` (
        `zipcode` CHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        `latitude` DECIMAL ( 8 , 6 ) NOT NULL,
        `longitude` DECIMAL( 9 , 6 ) NOT NULL,
        `source` VARCHAR( 40 )
    )""")
    # Lookup indexes for the common query patterns.
    cursor.execute("""CREATE INDEX IF NOT EXISTS `latitude_longitude` ON `addresses` (
        `latitude`,
        `longitude`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `number_street` ON `addresses` (
        `number`,
        `street`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `state_city` ON `addresses` (
        `state`,
        `city`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `zipcode_number` ON `addresses` (
        `zipcode`,
        `number`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `country` ON `addresses` (
        `country`
    )""")
    chunksize = 5000
    chunkcount = 0
    rowschanged = 0
    columns = ["number","street","street2","city","state","zip","latitude","longitude","source"]
    if cfg.appendPlus4:
        columns.append("plus4")
    for chunk in pd.read_csv(file, chunksize=chunksize, usecols=columns, keep_default_na=False, dtype="str"):
        chunk = chunk.rename(columns={'zip': 'zipcode'})
        chunk.insert(7, "country", countrycode)
        # Replace empty values with NULL
        chunk.replace('', None, inplace=True)
        # Replace null street2 with empty string so the SQLite UNIQUE clause will work
        chunk.fillna({"street2": ""}, inplace=True)
        # Remove null values that aren't allowed
        chunk.dropna(subset=['zipcode','number','street','city','state','latitude','longitude'], inplace=True)
        print(" " + str(chunkcount * chunksize) + " ", end="\r", flush=True)
        # Write chunk to SQLite via the staging table, then merge with
        # INSERT OR IGNORE so UNIQUE violations drop only the duplicate rows.
        cursor.execute("DELETE FROM addresses_temp")
        chunk.to_sql("addresses_temp", connection, if_exists='append', index=False, dtype={
            "zipcode": "CHAR(6)",
            "number": "VARCHAR(30)",
            "street": "VARCHAR(200)",
            "street2": "VARCHAR(20)",
            "city": "VARCHAR(50)",
            "state": "CHAR(2)",
            "plus4": "CHAR(4)",
            "country": "CHAR(2)",
            "latitude": "DECIMAL(8,6)",
            "longitude": "DECIMAL(9,6)",
            "source": "VARCHAR(40)"
        })
        chunkcount = chunkcount + 1
        cursor.execute("INSERT OR IGNORE INTO addresses SELECT * FROM addresses_temp")
        rowschanged = rowschanged + cursor.rowcount
        if chunkcount % 5000 == 0: # VACUUM every 5000 chunks (= 25 million rows read)
            print(" Optimizing database...", end="\r", flush=True)
            connection.executescript("VACUUM")
            print(" ", end="\r", flush=True)
    connection.executescript("DROP TABLE addresses_temp")
    # Cleanup: house number "0" is a placeholder, not a real address.
    cursor.execute("DELETE FROM addresses WHERE number=\"0\"")
    rowschanged = rowschanged + cursor.rowcount
    if rowschanged > 10000000:
        print("\nOptimizing database...")
        connection.executescript("VACUUM; ANALYZE; PRAGMA optimize;")
    print("Done converting to SQLite! Processed " + str(chunkcount) + " chunks (" + str(chunksize) + " records per chunk).")
    print(str(rowschanged) + " records inserted.")
    connection.close()
    print("Saved to output file " + dbfile)
    return rowschanged
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Tools to build a standardized U.S. address database from free source data."
)
parser.add_argument("file", help="Address file(s) to process.", nargs='+')
parser.add_argument("--outputfile", help="Filename to output address data to. If unspecified, set to \"./data/out.csv\" or \"./data/out.sqlite\", depending on options set.")
parser.add_argument(
"--filetype",
help="Type of address file to ingest. nad=National Address Database, oa=OpenAddresses, adb=CSV created by this script, osm=OpenStreetMap Overpass API (see main.py source for query to use), nar=Statistics Canada National Address Register",
choices=["nad", "oa", "adb", "osm", "nar"],
)
parser.add_argument("--state", help="Some OpenAddresses files don't have the state field set. Do it manually here.")
parser.add_argument("--ignorestates", help="Comma-separated two-letter state names. Addresses in these states will be skipped over.")
parser.add_argument("--onlystates", help="Comma-separated two-letter state names. Addresses NOT in these states will be skipped over.")
parser.add_argument("--source", help="Set the data source name (OpenAddresses only). Autodetected based on filename if not set.")
parser.add_argument("--dedup", help="Remove duplicate records in an already-ingested address file, and saves it to folder/file.dedup.csv. Only catches \"nearby\" duplicates; processes 20,000,000 records at a time.", action='store_true')
parser.add_argument("--fixlatlon", help="Detect and repair flipped latitude/longitude pairs in an already-ingested address file, and saves it to [filename].coordfix.csv.", action='store_true')
parser.add_argument("--tosqlite", help="Output to a SQLite3 database. Only works on output CSV data from this script.", action='store_true')
parser.add_argument("--appendplus4", help="Append ZIP+4 data to all records. Fairly slow.", action='store_true')
parser.add_argument("--appendunitlabel", help="Append unit label (APT, STE, etc) to unit numbers using ZIP+4 data.", action='store_true')
parser.add_argument("--zipprefix", help="When searching for a ZIP, assume it starts with the digits provided for faster lookups.")
parser.add_argument("-a", help="Allow appending to existing output file.", action='store_true')
parser.add_argument("--cpu", help="Number of CPU cores to use for parallel processing.")
parser.add_argument("--country", help="Two-letter country code. Default is US.")
parser.add_argument("--city", help="City name to assume when there's no city or postal code in the source data. Useful for OpenAddresses city_of_ data files.")
parser.add_argument("--startat", help="Skip to this line number in the input file (NAD)")
parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")
args = parser.parse_args()
# Defaults for run-wide settings; overridden by the CLI flags below.
startAtLine = 0
appendPlus4 = False
appendUnitLabel = False
useCensusToFillEmptyZIPs = False
countryCode = "US"
citySuggestion = False
advancedMode = False
noSkip4 = False
# --libpostal implies --appendplus4.
if args.libpostal:
    advancedMode = True
    appendPlus4 = True
if advancedMode:
    from src.advancedparsing import advancedNormalize
    print("Using libpostal to work harder on bad addresses.")
if args.appendplus4:
    appendPlus4 = True
if appendPlus4:
    print("Trying to match to ZIP+4 codes for every address!")
if args.noskip4:
    noSkip4 = True
if noSkip4:
    print("Also normalizing records that have a +4 in the input data.")
if args.appendunitlabel:
    appendUnitLabel = True
# store_true flags are already booleans; the old if/else assignment was redundant.
useCensusToFillEmptyZIPs = args.census
if useCensusToFillEmptyZIPs:
    print("Census geocoder enabled! RIP your network maybe")
# State filters: strip anything that isn't a letter or comma, then split on commas.
statesToIgnore = []
if args.ignorestates:
    statesToIgnore = re.sub(r"[^a-zA-Z,]+", "", args.ignorestates.upper()).split(",")
statesToKeep = []
if args.onlystates:
    statesToKeep = re.sub(r"[^a-zA-Z,]+", "", args.onlystates.upper()).split(",")
zipprefix = False
if args.zipprefix:
    zipprefix = args.zipprefix
if args.cpu:
    maxthreads = int(args.cpu)
if args.country:
    if len(args.country) != 2:
        print("Invalid country code " + args.country + ", exiting.")
        sys.exit(1)
    countrycode = args.country.upper()
    countryCode = countrycode
if args.startat and args.startat.isdigit():
    startAtLine = int(args.startat)
if args.city:
    # FIX: str has no toUpper() in Python; this branch raised AttributeError
    # whenever --city was passed. Use upper().
    citySuggestion = args.city.strip().upper()
# Freeze the settings into the shared, immutable AppConfig before any work starts.
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4)
src.config.set_config(cfg)
# Maintenance modes: fan each input file out to its own worker process.
if args.dedup:
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads) as executor:
        for file in args.file:
            executor.submit(removeDupes, file)
elif args.fixlatlon:
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads) as executor:
        for file in args.file:
            executor.submit(fixLatLon, file)
elif args.tosqlite:
outputfile = "./data/out.sqlite"
if args.outputfile:
outputfile = args.outputfile
if args.a != True and os.path.exists(args.outputfile):
print("Output file already exists, exiting!")
sys.exit()
rowschanged = 0
filesimported = 0
for file in args.file:
rowschanged = rowschanged + tosqlite(file, outputfile)
filesimported = filesimported + 1
print("\nDone importing " + str(filesimported) + " files. " + str(rowschanged) + " records inserted.")
elif args.file:
outputfile = "./data/out.csv"
if args.outputfile:
outputfile = args.outputfile
if args.a != True and os.path.exists(args.outputfile):
print("Output file already exists, exiting!")
sys.exit()
if args.filetype == "nad":
for file in args.file:
importNadFile(file, outputfile, statesToIgnore, statesToKeep, startAtLine)
elif args.filetype == "adb":
for file in args.file:
importOwnFile(file, outputfile, statesToIgnore, statesToKeep)
elif args.filetype == "osm":
for file in args.file:
importOSMFile(file, outputfile)
elif args.filetype == "nar":
countrycode = "CA"
for file in args.file:
importNARFile(file, outputfile)
elif args.filetype == "oa":
source = ""
if args.source:
source = args.source
for file in args.file:
importOpenAddressFile(file, outputfile, statesToIgnore, source, args.state, zipprefix)

61
rendermap.py Executable file
View File

@ -0,0 +1,61 @@
#!/usr/bin/python3
from PIL import Image, ImageDraw
from argparse import ArgumentParser
import sqlite3
# Raise PIL's decompression-bomb guard so the full 36000x18000 world canvas
# (360*100 x 180*100) can be created without a DecompressionBombError.
Image.MAX_IMAGE_PIXELS = 648000000 # 100 pixels per degree
def render(filename, outfile, ppd):
    """Plot every address point from the database onto a transparent world
    overlay, save it, then composite it over a basemap image.

    filename -- SQLite database containing an `addresses` table with
                `longitude` and `latitude` columns
    outfile  -- path for the overlay PNG; the composite map is written to
                outfile + ".map.png"
    ppd      -- pixels per degree (canvas is 360*ppd wide, 180*ppd tall)
    """
    print("Creating map overlay")
    pixelsperdegree = ppd
    width = 360 * pixelsperdegree
    height = 180 * pixelsperdegree
    img = Image.new('RGBA', (width, height), (255, 255, 255, 0))
    draw = ImageDraw.Draw(img)
    print("Connecting to database")
    connection = sqlite3.connect(filename)
    c = connection.cursor()
    print("Drawing map overlay")
    c.execute('SELECT longitude, latitude FROM addresses')
    count = 0
    try:
        for (x, y) in c:
            try:
                # FIX: coerce both coordinates up front; previously only the
                # range check converted, so NULL/garbage values blew up in the
                # arithmetic below and were silently swallowed.
                x = float(x)
                y = float(y)
                if y < -90.0 or y > 90.0:
                    # Latitude out of range: the pair is almost certainly flipped.
                    x, y = y, x
                px = round((x + 180) * pixelsperdegree)
                py = height - round((y + 90) * pixelsperdegree)
                draw.point((px, py), fill=(0, 255, 0))
            except (TypeError, ValueError):
                # FIX: narrowed from a bare except — skip unparseable rows but
                # let real errors (and Ctrl-C) propagate to the outer handler.
                pass
            count = count + 1
            if count % 1000 == 0:
                print(" " + str(count) + " ", end="\r", flush=True)
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt: Stopping draw and saving image early")
    connection.close()  # FIX: release the database handle before the slow image work
    print("\nSaving overlay image")
    img.save(outfile, format="PNG")
    print("Rendering map image")
    # Pick the basemap resolution closest to the requested detail level.
    if (pixelsperdegree > 50):
        basemap = Image.open("basemap-100.png")
    else:
        basemap = Image.open("basemap-50.png")
    Image.alpha_composite(basemap.resize((width, height)), img).save(outfile + ".map.png", format="PNG")
    img.close()
    basemap.close()
    print("Done! Saved map to " + outfile)
# Command-line interface for the renderer.
parser = ArgumentParser(description='Draw a map of a database\'s address points.')
parser.add_argument('src_db', help='Input SQLite database with "addresses" table containing "latitude" and "longitude" columns')
parser.add_argument('png_filename', help='Output PNG filename.')
# FIX: a bare positional argument is always required, so the old
# parser.set_defaults(ppd=50) never applied; nargs='?' with a default makes
# ppd genuinely optional while keeping explicit values working.
parser.add_argument('ppd', nargs='?', default=50, help='Pixels per degree of latitude/longitude.', type=int)
if __name__ == "__main__":
    args = parser.parse_args()
    render(args.src_db, args.png_filename, args.ppd)

51
sqlite-from-sqfull.py Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/python3
from argparse import ArgumentParser
import sqlite3
def process(filename, outfile):
    """Copy the `addresses` table from a "full" database into a slimmed-down
    SQLite file, rebuilding the table with fewer columns/indexes and
    de-duplicating rows via the UNIQUE constraint + INSERT OR IGNORE.

    filename -- path of the source SQLite database
    outfile  -- path of the destination SQLite database (table is recreated)
    """
    print("Connecting to databases")
    connection = sqlite3.connect(filename)
    c = connection.cursor()
    connection2 = sqlite3.connect(outfile)
    c2 = connection2.cursor()
    print("Creating lite database")
    c2.execute("DROP TABLE IF EXISTS `addresses`")
    c2.execute("""CREATE TABLE `addresses` (
    `zipcode` VARCHAR ( 6 ) NOT NULL,
    `number` VARCHAR ( 30 ) NOT NULL,
    `street` VARCHAR ( 200 ) NOT NULL,
    `street2` VARCHAR ( 20 ),
    `city` VARCHAR ( 50 ) NOT NULL,
    `state` CHAR ( 2 ) NOT NULL,
    `plus4` CHAR ( 4 ),
    `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
    UNIQUE (zipcode, number, street, street2, country)
    )""")
    c2.execute("CREATE INDEX `zipcode_number` ON `addresses` (`zipcode`,`number`)")
    c2.execute("CREATE INDEX `number_street_state` ON `addresses` (`number`,`street`,`state`)")
    print("Copying records")
    c.execute('SELECT zipcode, number, street, street2, city, state, plus4, country FROM addresses')
    count = 0
    for row in c:
        c2.execute("INSERT OR IGNORE INTO addresses(zipcode, number, street, street2, city, state, plus4, country) VALUES (?,?,?,?,?,?,?,?)", row)
        count = count + 1
        if count % 10000 == 0:
            print(" " + str(count) + " ", end="\r", flush=True)
    # FIX: commit the inserts explicitly. Previously persistence relied on the
    # implicit commit that executescript() performs before VACUUM.
    connection2.commit()
    print("\nVacuuming...")
    connection2.executescript("VACUUM")
    # FIX: close both connections so handles are released and the output file
    # is fully flushed.
    connection.close()
    connection2.close()
    print("Done! Copied " + str(count) + " rows to " + outfile + ".")
# FIX: the description previously read "Draw a map of a database's address
# points." — copy-pasted from rendermap.py; it now describes this tool.
parser = ArgumentParser(description='Copy a "full" address database into a slimmed-down SQLite file with fewer columns and indexes.')
parser.add_argument('src_db', help='"Full" SQLite database')
parser.add_argument('dest_db', help='Output database with some columns and indexes removed')
if __name__ == "__main__":
    args = parser.parse_args()
    process(args.src_db, args.dest_db)

0
src/__init__.py Normal file
View File

333
src/addressfunctions.py Normal file
View File

@ -0,0 +1,333 @@
# Created on : Aug 29, 2024, 12:57:40AM
# Author : Skylar Ittner
import re
import pandas as pd
from pythonnet import load
from scourgify import NormalizeAddress, normalize_address_record
from src.zipfunctions import checkZIPCode, getZIP, getZIP4, getCityStateForZIP
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, ValidationException
import src.config
import json
import sys
load("mono")
import clr
clr.AddReference("KellermanSoftware.USPSStandardization")
from KellermanSoftware.USPSStandardization import StandardizationLogic
# Optional proprietary Kellerman standardizer; normalizeAddress() falls back
# to it only when scourgify fails, and skips it entirely when init failed.
standardization = False
try:
    standardization = StandardizationLogic("Netsyms Technologies LLC 203206", "2;770AE30D7A5F217E77C857B29618A6E8DD")
except Exception:
    # FIX: narrowed from a bare except so KeyboardInterrupt/SystemExit still
    # propagate during startup; any library failure just disables the fallback.
    print("Kellerman USPSStandardization failed to initialize, skipping.")
# ZIP code reference table, loaded as strings so leading zeros survive.
zipcodes = pd.read_csv("zip_code_database.csv", keep_default_na=False, dtype="str")
# Regexes applied to the raw street name BEFORE USPS-style standardization,
# in insertion order (see preStandardizeStreet).
# FIX: the key " VLY " appeared twice with the same value; the duplicate was
# removed. Dict content and iteration order are unchanged.
PRE_STANDARDIZATION_STREET_REGEXES = {
    " POINTADDRESS$": "",
    "S U S HWY ": "S US HIGHWAY ",
    "^U S ([0-9]+) HWY": r"US HIGHWAY \1",
    "^U S HWY ": "US HIGHWAY ",
    "[–—−]": "-",  # normalize unicode dashes to the ASCII hyphen
    " PW$": " PKWY",
    " VIS ": " VISTA ",
    " VLY ": " VALLEY ",
    " MTN ": " MOUNTAIN ",
    " CTR ": " CENTER ",
    " CLB ": " CLUB ",
    "HBR ": "HARBOR ",
    "^PNE ": "PINE ",
    "^SPG ": "SPRING ",
    "^M L KING JR ": "MARTIN LUTHER KING JR ",
    "^NONE$": "",
    "^VLY ": "VALLEY ",
    "BEN-DIER": "BEN DIER",
    " ROCK RIV$": ""  # Albany county WY misspelled their own city name and put it in the street field
}
# Regexes applied AFTER USPS-style standardization, in insertion order, to
# catch highway/route spellings the standardizer mangles (see
# postStandardizeStreet).
# FIX: three keys appeared twice with identical values —
# "UNITED STATES HWY ([0-9]+)", "^US ([0-9]+)" and " US HWY ([0-9]+)" —
# the later duplicates were removed. Because a duplicate dict key keeps its
# first insertion position, content and iteration order are unchanged.
POST_STANDARDIZATION_STREET_REGEXES = {
    ", BASE$": "",
    ", BASE CP$": "",
    "UNITED STATES HWY ([0-9]+)": r"US HIGHWAY \1",
    "^U.S. HWY ": "US HIGHWAY ",
    "^U.S. HIGHWAY ": "US HIGHWAY ",
    "^U S ([0-9]+) HWY": r"US HIGHWAY \1",
    "^US ([0-9]+)": r"US HIGHWAY \1",
    " US HWY ([0-9]+)": r" US HIGHWAY \1",
    "UNITED STATES FOREST SERVICE ROAD ([0-9]+) RD$": r"FOREST SERVICE ROAD \1",
    "^IH?([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^INTERSTATE HWY ([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^I ?([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^I-([0-9]{1,3})$": r"INTERSTATE \1",
    "^([EW]) I-([0-9]{1,3})$": r"\1 INTERSTATE \2",
    "^HWY FM ([0-9]+)": r"FM \1",
    "^FARM TO MARKET ([0-9]+)": r"FM \1",
    " (HIWAY) ([0-9]+)$": r" HWY \2",
    " (RTE|RT) ([0-9]+)$": r" ROUTE \2",
    " RD ([0-9]+)$": r" ROAD \1",
    "^ST (HIGHWAY|ROUTE|ROAD|HWY) ([0-9]+)$": r"STATE \1 \2",
    "^CNTY (HIGHWAY|ROUTE|ROAD|HWY) ([0-9]+)$": r"COUNTY \1 \2",
    "^CR ([0-9]+)": r"COUNTY ROAD \1",
    "^COUNTY RD ([0-9]+) ([NSEW]{1,2})": r"COUNTY ROAD \1 \2",
    "^(SR|ST RD) ([0-9]+)": r"STATE ROAD \2",
    "^(ST RT|ST RTE) ([0-9]+)": r"STATE ROUTE \2",
    "^(HWY|HIWAY) ([0-9]+)": r"HIGHWAY \2",
    "^(RTE|RT) ([0-9]+)": r"ROUTE \2",
    "^RD ([0-9]+)": r"ROAD \1",
    "^TSR ([0-9]+)": r"TOWNSHIP ROAD \1",
    "([0-9]+) BYP RD": r"\1 BYPASS RD",
    "([0-9]+) BYPASS": r"\1 BYP",
    " HIGHWAY ([0-9]+)": r" HWY \1",
    "^(STATE|COUNTY) HWY ": r"\1 HIGHWAY ",
    "UNITED STATES HWY ": "US HIGHWAY ",
    "^US HWY ": r"US HIGHWAY ",
    "^FIRST ": "1ST ",
    "^SECOND ": "2ND ",
    "^THIRD ": "3RD ",
    "^FOURTH ": "4TH ",
    "^FIFTH ": "5TH ",
    "^SIXTH ": "6TH ",
    "^SEVENTH ": "7TH ",
    "^EIGHTH ": "8TH ",
    "^NINTH ": "9TH ",
    "^TENTH ": "10TH ",
    " STREET ST$": " ST",
    " AVENUE AVE$": " AVE",
    " DRIVE DR$": " DR",
    " DR DRIVE$": " DR",
    " PARKS PARK$": " PARK",
    " ROAD RD$": " RD",
    " LK ": " LAKE ",
    " ST ST$": " ST",
    "^(N|S|E|W) COUNTY (ROAD|RD) ([0-9]{3,}) (N|S|E|W)$": r"\1 \3 \4", # Indiana has "County Road" in NAD as the street name for some reason
    "^COUNTY RD COUNTY ROAD ": "COUNTY ROAD ",
    " CI$": " CIR",
    " CM$": " CMN",
    " BL$": " BLVD",
    " TE$": " TER",
    " LP$": " LOOP",
    "^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason
    "^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason
    "^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2" # Athens TX does this too for some reason
}
# Regexes applied to the house-number field (see standardizeNumber).
# FIX: keys are now raw strings. In the old non-raw "^([0-9]+) \1$" the "\1"
# was a Python octal escape (chr(1)), NOT a regex backreference, so the
# repeated-number rule could never match — which is why the original comment
# said it "doesn't always work for some reason". r"..." makes \1 a real
# backreference. r"^\.$" also silences the invalid-escape warning.
STANDARDIZATION_NUMBER_REGEXES = {
    r"^([0-9]+) \1$": r"\1",  # collapse address numbers that repeat with a space between ("1234 1234" -> "1234")
    r"^([0-9]+) ([A-Z])$": r"\1\2",  # "1234 A ROAD ST" to "1234A ROAD ST"
    "^0$": "",  # Blank out 0 as a house number
    r"^\.$": "",  # Blank out .
}
ABBREV_PATTERN = ""
ABBREV_PATTERN_LIST = []
for (a, b) in LONGHAND_STREET_TYPES.items():
ABBREV_PATTERN_LIST.append(a)
ABBREV_PATTERN = "|".join(ABBREV_PATTERN_LIST)
STREET_INNER_ABBREV_FIND_REGEX = re.compile("(^|\s)("+ABBREV_PATTERN+") ("+ABBREV_PATTERN+")( [NSEW]{0,2})?$")
def postStandardizeStreet(street):
    """Apply post-standardization fixups to an already-USPS-normalized street:
    ordered regex corrections, doubled-abbreviation expansion, and Pub 28
    state-highway rewrites. Returns the cleaned, stripped street string.
    """
    # Catch edge cases with USPS formatting
    for (find, replace) in POST_STANDARDIZATION_STREET_REGEXES.items():
        street = re.sub(find, replace, street)
    # Unshorten things like "S CRK RD", correcting to "S CREEK RD"
    matches = STREET_INNER_ABBREV_FIND_REGEX.search(street)
    if matches:
        street = street.replace(matches.group(2), LONGHAND_STREET_TYPES[matches.group(2)], 1)
    # "KY 1234" to "KY HIGHWAY 1234" per Pub 28
    if re.match(r"^[A-Z]{2} [0-9]+$", street):
        for (full, abbr) in STATES.items():
            if street.startswith(abbr + " "):
                street = street.replace(abbr, abbr + " HIGHWAY", 1)
                break
    # "KENTUCKY STATE HIGHWAY 625" to "KY STATE HIGHWAY 625" per Pub 28
    if re.match(r"^[A-Z]{2,} STATE HIGHWAY [0-9]+", street):
        for (full, abbr) in STATES.items():
            if street.startswith(full + " "):
                street = street.replace(full, abbr, 1)
                break
    return street.strip()
def preStandardizeStreet(street):
    """Clean a raw street name before USPS-style standardization: apply the
    pre-standardization regexes, then drop any trailing "#unit" fragment.
    """
    for (find, replace) in PRE_STANDARDIZATION_STREET_REGEXES.items():
        street = re.sub(find, replace, street)
    # Remove unit from end of street
    # NOTE(review): find() > 0 means a street that STARTS with '#' is kept
    # untouched — presumably intentional; confirm.
    hashtag = street.find("#")
    if hashtag > 0:
        street = street[:hashtag].strip()
    return street
def standardizeNumber(number):
    """Normalize a house-number string via STANDARDIZATION_NUMBER_REGEXES,
    then collapse "1234 1234"-style repeats as an explicit fallback, and
    return the stripped result.
    """
    for (find, replace) in STANDARDIZATION_NUMBER_REGEXES.items():
        number = re.sub(find, replace, number)
    # Detect "1234 1234" which some sources have sometimes (like Kentucky NAD v20)
    if (parts := number.split(" ")) and len(parts) == 2 and parts[0] == parts[1]:
        number = parts[0]
    return number.strip()
def splitNumberAndUnit(number):
    """Split a house-number field that also carries a unit designator.

    Returns (number, unit) where the number has been re-standardized and the
    unit has had its designator label (APT, STE, ...) stripped off.
    """
    # Some places have unit numbers in the primary address, remove them
    num = number
    unit = ""
    for label in UNITS:
        pos = number.find(label)
        if pos > 0:  # pos 0 would mean the whole field is the label, not a suffix
            num = number[:pos]
            unit = number[pos:]
            break
    num = standardizeNumber(num)
    unit = removeUnitText(unit)
    return num, unit
def removeUnitText(subaddr):
    """Strip unit-designator labels (APT, STE, ...) from a secondary-address
    string, returning just the unit value, uppercased with whitespace
    collapsed. None is passed through unchanged so callers can forward
    missing data.
    """
    if subaddr is None:  # FIX: identity comparison; was "== None"
        return subaddr
    subaddr = subaddr.upper()
    for label in UNITS:
        subaddr = subaddr.replace(label, "")
    # FIX: collapse ALL runs of whitespace and strip the ends; the old single
    # replace("  ", " ") left residue when three or more spaces were adjacent.
    return " ".join(subaddr.split())
def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4="", county=False):
    """Normalize one address record into USPS-style components.

    Runs the street through pre-standardization regexes, the scourgify
    normalizer (with the Kellerman library as fallback), post-standardization
    regexes, and ZIP/ZIP+4 lookups.

    Returns a dict with keys: number, street, unit, city, state, zip, plus4,
    latitude, longitude.

    Raises ValidationException when the record has no house number or no
    street name; re-raises parser errors when both standardizers fail.
    """
    cfg = src.config.get_config()
    if not number:
        raise ValidationException("No address number")
    if not street:
        # FIX: this message previously read "No address number" — copied from
        # the check above; it now names the actual problem.
        raise ValidationException("No street")
    # Detect flipped coordinates: |lat| > 90 means the pair is swapped.
    if lat < -90 or lat > 90:
        lon, lat = lat, lon
    number = standardizeNumber(str(number).upper().strip())
    street = preStandardizeStreet(street.strip().upper())
    unit = unit.strip().upper()
    city = city.strip().upper()
    state = state.strip().upper()
    zipcode = (zipcode or "").strip()
    plus4 = plus4.strip()
    if (not city or city == "") and (not zipcode or zipcode == "") and cfg.citySuggestion:
        # Use the city specified on the CLI, hopefully it'll help
        city = cfg.citySuggestion
    if unit == "":
        number, unit = splitNumberAndUnit(number)
    city = city.replace("CITY OF ", "").replace("TOWN OF ", "").replace("VILLAGE OF ", "")
    city = city.replace("UNINCORPORATED", "")
    #
    # Standardize address
    #
    try:
        # Python library. The street is prefixed with the placeholder number
        # "999999999" so the parser treats the entire field as the street;
        # the real number is substituted back in afterwards.
        addr = normalize_address_record(
            {
                "address_line_1": "".join(["999999999", " ", street]),
                "address_line_2": unit,
                "city": city,
                "state": state,
                "postal_code": zipcode
            }
        )
    except Exception as e:
        try:
            # Proprietary Mono library (optional: may have failed to init)
            if standardization == False:
                raise e
            addr = {
                "address_line_1": "999999999 " + standardization.StandardizeStreetAddress(street),
                "address_line_2": unit,
                "city": city,
                "state": state,
                "postal_code": zipcode
            }
        except Exception as ex:
            # This basically never happens
            print("W: Couldn't parse address:")
            print(ex)
            raise ex
    #
    # Remove number from street address field
    #
    addr['address_line_1'] = addr['address_line_1'].replace("999999999", number)
    streetonly = addr['address_line_1']
    if streetonly.startswith(str(number) + " "):
        streetonly = streetonly[len(str(number) + " "):]
    #
    # Run extra regexes on street to fix standardization problems
    #
    streetonly = postStandardizeStreet(streetonly)
    #
    # Special conditional rules
    #
    if addr["state"] == "PR":
        # Puerto Rico special rules: letter-prefixed numbers drop their hyphens
        if re.match("[A-Z]", number):
            number = number.replace("-", "")
    #
    # Clean second line
    #
    addr['address_line_2'] = removeUnitText(addr['address_line_2'])
    #
    # Standardize and validate and/or append ZIP Code
    #
    zipcode = addr["postal_code"]
    unitprefix = ""
    unit = addr['address_line_2']
    if zipcode is not None:
        zipcode = addr["postal_code"][0:5]
    # Skip these if we already have a ZIP+4 code, assume it's accurate
    if zipcode is not None and len(zipcode) == 5 and not plus4:
        zipinfo = getCityStateForZIP(zipcode)
        if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
            zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county)
            zipinfo = getCityStateForZIP(zipcode)
            if zipinfo != False:
                addr["city"] = zipinfo["city"]
                addr["state"] = zipinfo["state"]
        else:
            # ZIP looks plausible: adopt its canonical city/state.
            addr["city"] = zipinfo["city"]
            addr["state"] = zipinfo["state"]
    elif not plus4:
        # No usable ZIP at all: ask the ZIP+4 lookup to find one from scratch.
        zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county)
        zipinfo = getCityStateForZIP(zipcode)
        if zipinfo != False:
            addr["city"] = zipinfo["city"]
            addr["state"] = zipinfo["state"]
    if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
        # Standardization moved the route number into the unit; rebuild the street.
        streetonly = f"US HIGHWAY {unit}"
        unit = ""
    return {
        "number": number,
        "street": streetonly,
        "unit": ' '.join(filter(None, (unitprefix, unit))),
        "city": addr["city"],
        "state": addr["state"],
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    }

109
src/advancedparsing.py Normal file
View File

@ -0,0 +1,109 @@
# Use "AI" to parse problem addresses and find more matches
# First expand the address to possible forms, then normalize each one, and keep the one that has a ZIP+4
from postal.parser import parse_address
from postal.expand import expand_address
from src.addressfunctions import normalizeAddress
import re
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""):
    """Use libpostal parsing/expansion to find a better normalization.

    Re-parses the merged address string, generates candidate expansions of
    the number/street, normalizes each candidate, and returns the candidate
    whose populated fields score highest (a ZIP+4 match weighs most).
    Records that already carry a 4-digit plus4 are returned unchanged.
    """
    if len(plus4 or "") == 4:
        # Return as-is, it's got a +4 match already
        return {
            "number": number,
            "street": street,
            "unit": unit,
            "city": city,
            "state": state,
            "zip": zipcode,
            "plus4": plus4,
            "latitude": lat,
            "longitude": lon
        }
    # Merge and re-split the address to catch odd things like the street having the city and zip too
    parsed = parse_address(f"{number} {street}, {city} {state} {zipcode}")
    pNumber = number
    pStreet = street
    pUnit = unit
    pCity = city
    pState = state
    pZip = zipcode
    for part in parsed:
        if part[1] == "house_number" and (pNumber == "" or pNumber == number): # Don't overwrite it with values found later, which might be a zip code or something
            pNumber = part[0].upper()
        elif part[1] == "road":
            pStreet = part[0].upper()
        elif part[1] == "unit":
            pUnit = part[0].upper()
        elif part[1] == "city":
            pCity = part[0].upper()
        elif part[1] == "state":
            pState = part[0].upper()
        elif part[1] == "postcode":
            pZip = part[0].upper()
    # Expand the number/street to all possible forms
    expanded = expand_address(f"{pNumber} {pStreet}")
    normalizedMatches = []
    # Add the original address as a candidate so if no better matches come up, it'll probably just use it as-is
    normalizedMatches.append({
        "number": number,
        "street": street,
        "unit": unit,
        "city": city,
        "state": state,
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    })
    # Also add one where we remove any non-numeric data from the number and unit fields
    normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
    if number != pNumber or unit != pUnit or street != pStreet:
        # The re-parse changed something: add the re-parsed form as a candidate too.
        normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
    for exp in expanded:
        # Normalize every libpostal expansion; unparseable ones are skipped.
        parsed = parse_address(exp)
        pN = ""
        pS = ""
        pU = ""
        for part in parsed:
            if part[1] == "house_number" and pN == "":
                pN = part[0]
            elif part[1] == "road":
                pS = part[0]
            elif part[1] == "unit":
                pU = part[0]
        try:
            normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4))
        except Exception as e:
            pass
    if len(normalizedMatches) > 1:
        # Score each candidate by summing weights of its truthy fields; a
        # ZIP+4 (weight 8) dominates, coordinates never influence the score.
        weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
        sortedMatches = sorted(
            normalizedMatches,
            key=lambda item: sum(weights[k] for k, v in item.items() if v),
            reverse=True
        )
        return sortedMatches[0]
    elif len(normalizedMatches) == 1:
        return normalizedMatches[0]
    # No matches, give up on the whole thing
    return {
        "number": number,
        "street": street,
        "unit": unit,
        "city": city,
        "state": state,
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    }

23
src/config.py Normal file
View File

@ -0,0 +1,23 @@
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class AppConfig:
    """Immutable, run-wide settings derived from the CLI flags."""
    appendPlus4: bool  # try to attach a ZIP+4 code to every record
    appendUnitLabel: bool  # prepend unit labels (APT, STE, ...) using ZIP+4 data
    countryCode: str  # two-letter country code; "US" by default
    citySuggestion: bool  # NOTE(review): actually holds a city-name str, or False when unset — the bool annotation looks wrong; confirm
    useCensusToFillEmptyZIPs: bool  # allow Census geocoder lookups for missing ZIPs
    advancedMode: bool  # use libpostal-based advanced parsing
    noSkip4: bool  # re-normalize records that already carry a +4
# Process-global config singleton; installed once via set_config().
_CFG: Optional[AppConfig] = None
def set_config(cfg: AppConfig) -> None:
    """Install the process-wide AppConfig. Call once at startup."""
    global _CFG
    _CFG = cfg # set once at start (or in child initializer)
def get_config() -> AppConfig:
    """Return the process-wide AppConfig, raising if set_config was never called."""
    cfg = _CFG
    if cfg is None:
        raise RuntimeError("Config not initialized yet")
    return cfg

291
src/constants.py Normal file
View File

@ -0,0 +1,291 @@
class ValidationException(Exception):
    """Raised when an address record is missing required fields (e.g. house number or street)."""
    pass
# Map of USPS street-type abbreviations to a longhand spelling, used to
# re-expand doubled abbreviations like "S CRK RD" -> "S CREEK RD".
# NOTE(review): a few entries expand to a plural ('FRST': 'FORESTS',
# 'PARK': 'PARKS', 'PATH': 'PATHS', 'PIKE': 'PIKES', 'SPUR': 'SPURS') —
# possibly intentional, but confirm against USPS Pub 28 Appendix C1.
LONGHAND_STREET_TYPES = {
    'ALY': 'ALLEY',
    'ANX': 'ANNEX',
    'ARC': 'ARCADE',
    'AV': 'AVENUE',
    'AVE': 'AVENUE',
    'BYU': 'BAYOU',
    'BCH': 'BEACH',
    'BND': 'BEND',
    'BLF': 'BLUFF',
    'BLFS': 'BLUFFS',
    'BTM': 'BOTTOM',
    'BLVD': 'BOULEVARD',
    'BL': 'BOULEVARD',
    'BR': 'BRANCH',
    'BRG': 'BRIDGE',
    'BRK': 'BROOK',
    'BRKS': 'BROOKS',
    'BGS': 'BURGS',
    'BYP': 'BYPASS',
    'CP': 'CAMP',
    'CYN': 'CANYON',
    'CPE': 'CAPE',
    'CSWY': 'CAUSEWAY',
    'CTR': 'CENTER',
    'CTRS': 'CENTERS',
    'CI': 'CIRCLE',
    'CIR': 'CIRCLE',
    'CIRS': 'CIRCLES',
    'CLF': 'CLIFF',
    'CLFS': 'CLIFFS',
    'CMN': 'COMMON',
    'CM': 'COMMON',
    'COR': 'CORNER',
    'CORS': 'CORNERS',
    'CRSE': 'COURSE',
    'CT': 'COURT',
    'CTS': 'COURTS',
    'CVS': 'COVES',
    'CRK': 'CREEK',
    'CRES': 'CRESCENT',
    'CRST': 'CREST',
    'XING': 'CROSSING',
    'XRD': 'CROSSROAD',
    'CURV': 'CURVE',
    'DL': 'DALE',
    'DM': 'DAM',
    'DV': 'DIVIDE',
    'DR': 'DRIVE',
    'DRS': 'DRIVES',
    'EST': 'ESTATE',
    'ESTS': 'ESTATES',
    'EXPY': 'EXPRESSWAY',
    'EXT': 'EXTENSION',
    'EXTS': 'EXTENSIONS',
    'FALL': 'FALL',
    'FLS': 'FALLS',
    'FRY': 'FERRY',
    'FLD': 'FIELD',
    'FLDS': 'FIELDS',
    'FLT': 'FLAT',
    'FLTS': 'FLATS',
    'FRD': 'FORD',
    'FRDS': 'FORDS',
    'FRST': 'FORESTS',
    'FRG': 'FORGE',
    'FRGS': 'FORGES',
    'FRK': 'FORK',
    'FRKS': 'FORKS',
    'FT': 'FORT',
    'FWY': 'FREEWAY',
    'GDN': 'GARDEN',
    'GDNS': 'GARDENS',
    'GTWY': 'GATEWAY',
    'GLN': 'GLEN',
    'GLNS': 'GLENS',
    'GRNS': 'GREENS',
    'GRV': 'GROVE',
    'GRVS': 'GROVES',
    'HBR': 'HARBOR',
    'HBRS': 'HARBORS',
    'HVN': 'HAVEN',
    'HTS': 'HEIGHTS',
    'HWY': 'HIGHWAY',
    'HL': 'HILL',
    'HLS': 'HILLS',
    'HOLW': 'HOLLOW',
    'INLT': 'INLET',
    'IS': 'ISLAND',
    'ISS': 'ISLANDS',
    'ISLE': 'ISLE',
    'JCT': 'JUNCTION',
    'JCTS': 'JUNCTIONS',
    'KY': 'KEY',
    'KYS': 'KEYS',
    'KNL': 'KNOLL',
    'KNLS': 'KNOLLS',
    'LK': 'LAKE',
    'LKS': 'LAKES',
    'LAND': 'LAND',
    'LNDG': 'LANDING',
    'LN': 'LANE',
    'LGT': 'LIGHT',
    'LGTS': 'LIGHTS',
    'LF': 'LOAF',
    'LCK': 'LOCK',
    'LCKS': 'LOCKS',
    'LDG': 'LODGE',
    'LOOP': 'LOOP',
    'LP': 'LOOP',
    'MALL': 'MALL',
    'MNR': 'MANOR',
    'MNRS': 'MANORS',
    'MDW': 'MEADOW',
    'MDWS': 'MEADOWS',
    'MEWS': 'MEWS',
    'ML': 'MILL',
    'MLS': 'MILLS',
    'MSN': 'MISSION',
    'MTWY': 'MOTORWAY',
    'MT': 'MOUNT',
    'MTN': 'MOUNTAIN',
    'MTNS': 'MOUNTAINS',
    'NCK': 'NECK',
    'ORCH': 'ORCHARD',
    'OVAL': 'OVAL',
    'OPAS': 'OVERPASS',
    'PARK': 'PARKS',
    'PKWY': 'PARKWAY',
    'PASS': 'PASS',
    'PSGE': 'PASSAGE',
    'PATH': 'PATHS',
    'PIKE': 'PIKES',
    'PNE': 'PINE',
    'PNES': 'PINES',
    'PL': 'PLACE',
    'PLN': 'PLAIN',
    'PLNS': 'PLAINS',
    'PLZ': 'PLAZA',
    'PT': 'POINT',
    'PTS': 'POINTS',
    'PRT': 'PORT',
    'PRTS': 'PORTS',
    'PR': 'PRAIRIE',
    'PW': 'PARKWAY',
    'RADL': 'RADIAL',
    'RAMP': 'RAMP',
    'RNCH': 'RANCH',
    'RPD': 'RAPID',
    'RPDS': 'RAPIDS',
    'RST': 'REST',
    'RDG': 'RIDGE',
    'RDGS': 'RIDGES',
    'RIV': 'RIVER',
    'RD': 'ROAD',
    'RDS': 'ROADS',
    'RTE': 'ROUTE',
    'ROW': 'ROW',
    'RUE': 'RUE',
    'RUN': 'RUN',
    'SHL': 'SHOAL',
    'SHLS': 'SHOALS',
    'SHR': 'SHORE',
    'SHRS': 'SHORES',
    'SKWY': 'SKYWAY',
    'SPG': 'SPRING',
    'SPGS': 'SPRINGS',
    'SPUR': 'SPURS',
    'SQ': 'SQUARE',
    'SQS': 'SQUARES',
    'STA': 'STATION',
    'STRA': 'STRAVENUE',
    'STRM': 'STREAM',
    'ST': 'STREET',
    'STS': 'STREETS',
    'SMT': 'SUMMIT',
    'TER': 'TERRACE',
    'TRWY': 'THROUGHWAY',
    'TRCE': 'TRACE',
    'TRAK': 'TRACK',
    'TRFY': 'TRAFFICWAY',
    'TRL': 'TRAIL',
    'TUNL': 'TUNNEL',
    'TPKE': 'TURNPIKE',
    'UPAS': 'UNDERPASS',
    'UN': 'UNION',
    'UNS': 'UNIONS',
    'VLY': 'VALLEY',
    'VLYS': 'VALLEYS',
    'VIA': 'VIADUCT',
    'VW': 'VIEW',
    'VWS': 'VIEWS',
    'VLG': 'VILLAGE',
    'VLGS': 'VILLAGES',
    'VL': 'VILLE',
    'VIS': 'VISTA',
    'WALK': 'WALK',
    'WALL': 'WALL',
    'WAY': 'WAY',
    'WL': 'WELL',
    'WLS': 'WELLS'
}
# USPS secondary-unit designator labels, matched as substrings by
# splitNumberAndUnit/removeUnitText.
# NOTE(review): short labels like 'FL', 'PH', 'KEY' are also ordinary words
# or abbreviations — substring matching could clip legitimate text; confirm
# callers only apply these to number/unit fields.
UNITS = [
    'APT',
    'BLDG',
    'BUILDING',
    'BSMT',
    'DEPT',
    'FL',
    'FRNT',
    'HNGR',
    'KEY',
    'LBBY',
    'LOT',
    'LOWR',
    'OFC',
    'PH',
    'PIER',
    'REAR',
    'RM',
    'SIDE',
    'SLIP',
    'SPC',
    'STOP',
    'STE',
    'TRLR',
    'UNIT',
    'UPPER',
    '#',
    'BASE', # Not a real unit designator but appears in some NAD AZ data for some reason
    '(VACANT)' # One dataset does this...
]
# Full US state names (plus DC) mapped to their two-letter abbreviations,
# used for Pub 28-style highway-name rewrites in postStandardizeStreet.
# NOTE: ordering is not fully alphabetical (MD-MO appear after OR) — iteration
# order does not affect the prefix matches performed on this dict.
STATES = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "DISTRICT OF COLUMBIA": "DC",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY"
}

614
src/streetcleaner.py Normal file
View File

@ -0,0 +1,614 @@
# Created on : Aug 28, 2024, 11:58:00PM
# Author : Skylar Ittner
# Directional words mapped to their USPS abbreviations; covers both English
# and Spanish spellings (the Spanish forms appear in Puerto Rico data).
DIRECTIONAL_REPLACEMENTS = {
    'EAST': 'E',
    'WEST': 'W',
    'NORTH': 'N',
    'SOUTH': 'S',
    'NORTHEAST': 'NE',
    'NORTHWEST': 'NW',
    'SOUTHEAST': 'SE',
    'SOUTHWEST': 'SW',
    'ESTE': 'E',
    'OESTE': 'W',
    'NORTE': 'N',
    'SUR': 'S',
    'NORESTE': 'NE',
    'NOROESTE': 'NW',
    'SURESTE': 'SE',
    'SUROESTE': 'SW'
}
STREET_TYPE_ABBREVIATIONS = {
'ALLEE': 'ALY',
'ALLEY': 'ALY',
'ALLY': 'ALY',
'ALY': 'ALY',
'ANEX': 'ANX',
'ANNEX': 'ANX',
'ANNX': 'ANX',
'ANX': 'ANX',
'ARC': 'ARC',
'ARCADE': 'ARC',
'AV': 'AVE',
'AVE': 'AVE',
'AVEN': 'AVE',
'AVENU': 'AVE',
'AVENUE': 'AVE',
'AVN': 'AVE',
'AVNUE': 'AVE',
'BAYOO': 'BYU',
'BAYOU': 'BYU',
'BCH': 'BCH',
'BEACH': 'BCH',
'BEND': 'BND',
'BND': 'BND',
'BLF': 'BLF',
'BLUF': 'BLF',
'BLUFF': 'BLF',
'BLUFFS': 'BLFS',
'BOT': 'BTM',
'BOTTM': 'BTM',
'BOTTOM': 'BTM',
'BTM': 'BTM',
'BLVD': 'BLVD',
'BOUL': 'BLVD',
'BOULEVARD': 'BLVD',
'BOULV': 'BLVD',
'BR': 'BR',
'BRANCH': 'BR',
'BRNCH': 'BR',
'BRDGE': 'BRG',
'BRG': 'BRG',
'BRIDGE': 'BRG',
'BRK': 'BRK',
'BROOK': 'BRK',
'BROOKS': 'BRKS',
'BURG': 'BG',
'BURGS': 'BGS',
'BYP': 'BYP',
'BYPA': 'BYP',
'BYPAS': 'BYP',
'BYPASS': 'BYP',
'BYPS': 'BYP',
'CAMP': 'CP',
'CMP': 'CP',
'CP': 'CP',
'CANYN': 'CYN',
'CANYON': 'CYN',
'CNYN': 'CYN',
'CYN': 'CYN',
'CAPE': 'CPE',
'CPE': 'CPE',
'CAUSEWAY': 'CSWY',
'CAUSWAY': 'CSWY',
'CSWY': 'CSWY',
'CEN': 'CTR',
'CENT': 'CTR',
'CENTER': 'CTR',
'CENTR': 'CTR',
'CENTRE': 'CTR',
'CNTER': 'CTR',
'CNTR': 'CTR',
'CTR': 'CTR',
'CENTERS': 'CTRS',
'CIR': 'CIR',
'CIRC': 'CIR',
'CIRCL': 'CIR',
'CIRCLE': 'CIR',
'CRCL': 'CIR',
'CRCLE': 'CIR',
'CIRCLES': 'CIRS',
'CLF': 'CLF',
'CLIFF': 'CLF',
'CLFS': 'CLFS',
'CLIFFS': 'CLFS',
'CLB': 'CLB',
'CLUB': 'CLB',
'COMMON': 'CMN',
'COR': 'COR',
'CORNER': 'COR',
'CORNERS': 'CORS',
'CORS': 'CORS',
'COURSE': 'CRSE',
'CRSE': 'CRSE',
'COURT': 'CT',
'CRT': 'CT',
'CT': 'CT',
'COURTS': 'CTS',
'COVE': 'CV',
'CV': 'CV',
'COVES': 'CVS',
'CK': 'CRK',
'CR': 'CRK',
'CREEK': 'CRK',
'CRK': 'CRK',
'CRECENT': 'CRES',
'CRES': 'CRES',
'CRESCENT': 'CRES',
'CRESENT': 'CRES',
'CRSCNT': 'CRES',
'CRSENT': 'CRES',
'CRSNT': 'CRES',
'CREST': 'CRST',
'CROSSING': 'XING',
'CRSSING': 'XING',
'CRSSNG': 'XING',
'XING': 'XING',
'CROSSROAD': 'XRD',
'CURVE': 'CURV',
'DALE': 'DL',
'DL': 'DL',
'DAM': 'DM',
'DM': 'DM',
'DIV': 'DV',
'DIVIDE': 'DV',
'DV': 'DV',
'DVD': 'DV',
'DR': 'DR',
'DRIV': 'DR',
'DRIVE': 'DR',
'DRV': 'DR',
'DRIVES': 'DRS',
'EST': 'EST',
'ESTATE': 'EST',
'ESTATES': 'ESTS',
'ESTS': 'ESTS',
'EXP': 'EXPY',
'EXPR': 'EXPY',
'EXPRESS': 'EXPY',
'EXPRESSWAY': 'EXPY',
'EXPW': 'EXPY',
'EXPY': 'EXPY',
'EXT': 'EXT',
'EXTENSION': 'EXT',
'EXTN': 'EXT',
'EXTNSN': 'EXT',
'EXTENSIONS': 'EXTS',
'EXTS': 'EXTS',
'FALL': 'FALL',
'FALLS': 'FLS',
'FLS': 'FLS',
'FERRY': 'FRY',
'FRRY': 'FRY',
'FRY': 'FRY',
'FIELD': 'FLD',
'FLD': 'FLD',
'FIELDS': 'FLDS',
'FLDS': 'FLDS',
'FLAT': 'FLT',
'FLT': 'FLT',
'FLATS': 'FLTS',
'FLTS': 'FLTS',
'FORD': 'FRD',
'FRD': 'FRD',
'FORDS': 'FRDS',
'FOREST': 'FRST',
'FORESTS': 'FRST',
'FRST': 'FRST',
'FORG': 'FRG',
'FORGE': 'FRG',
'FRG': 'FRG',
'FORGES': 'FRGS',
'FORK': 'FRK',
'FRK': 'FRK',
'FORKS': 'FRKS',
'FRKS': 'FRKS',
'FORT': 'FT',
'FRT': 'FT',
'FT': 'FT',
'FREEWAY': 'FWY',
'FREEWY': 'FWY',
'FRWAY': 'FWY',
'FRWY': 'FWY',
'FWY': 'FWY',
'GARDEN': 'GDN',
'GARDN': 'GDN',
'GDN': 'GDN',
'GRDEN': 'GDN',
'GRDN': 'GDN',
'GARDENS': 'GDNS',
'GDNS': 'GDNS',
'GRDNS': 'GDNS',
'GATEWAY': 'GTWY',
'GATEWY': 'GTWY',
'GATWAY': 'GTWY',
'GTWAY': 'GTWY',
'GTWY': 'GTWY',
'GLEN': 'GLN',
'GLN': 'GLN',
'GLENS': 'GLNS',
'GREEN': 'GRN',
'GRN': 'GRN',
'GREENS': 'GRNS',
'GROV': 'GRV',
'GROVE': 'GRV',
'GRV': 'GRV',
'GROVES': 'GRVS',
'HARB': 'HBR',
'HARBOR': 'HBR',
'HARBR': 'HBR',
'HBR': 'HBR',
'HRBOR': 'HBR',
'HARBORS': 'HBRS',
'HAVEN': 'HVN',
'HAVN': 'HVN',
'HVN': 'HVN',
'HEIGHT': 'HTS',
'HEIGHTS': 'HTS',
'HGTS': 'HTS',
'HT': 'HTS',
'HTS': 'HTS',
'HIGHWAY': 'HWY',
'HIGHWY': 'HWY',
'HIWAY': 'HWY',
'HIWY': 'HWY',
'HWAY': 'HWY',
'HWY': 'HWY',
'HILL': 'HL',
'HL': 'HL',
'HILLS': 'HLS',
'HLS': 'HLS',
'HLLW': 'HOLW',
'HOLLOW': 'HOLW',
'HOLLOWS': 'HOLW',
'HOLW': 'HOLW',
'HOLWS': 'HOLW',
'INLET': 'INLT',
'INLT': 'INLT',
'IS': 'IS',
'ISLAND': 'IS',
'ISLND': 'IS',
'ISLANDS': 'ISS',
'ISLNDS': 'ISS',
'ISS': 'ISS',
'ISLE': 'ISLE',
'ISLES': 'ISLE',
'JCT': 'JCT',
'JCTION': 'JCT',
'JCTN': 'JCT',
'JUNCTION': 'JCT',
'JUNCTN': 'JCT',
'JUNCTON': 'JCT',
'JCTNS': 'JCTS',
'JCTS': 'JCTS',
'JUNCTIONS': 'JCTS',
'KEY': 'KY',
'KY': 'KY',
'KEYS': 'KYS',
'KYS': 'KYS',
'KNL': 'KNL',
'KNOL': 'KNL',
'KNOLL': 'KNL',
'KNLS': 'KNLS',
'KNOLLS': 'KNLS',
'LAKE': 'LK',
'LK': 'LK',
'LAKES': 'LKS',
'LKS': 'LKS',
'LAND': 'LAND',
'LANDING': 'LNDG',
'LNDG': 'LNDG',
'LNDNG': 'LNDG',
'LA': 'LN',
'LANE': 'LN',
'LANES': 'LN',
'LN': 'LN',
'LGT': 'LGT',
'LIGHT': 'LGT',
'LIGHTS': 'LGTS',
'LF': 'LF',
'LOAF': 'LF',
'LCK': 'LCK',
'LOCK': 'LCK',
'LCKS': 'LCKS',
'LOCKS': 'LCKS',
'LDG': 'LDG',
'LDGE': 'LDG',
'LODG': 'LDG',
'LODGE': 'LDG',
'LOOP': 'LOOP',
'LOOPS': 'LOOP',
'MALL': 'MALL',
'MANOR': 'MNR',
'MNR': 'MNR',
'MANORS': 'MNRS',
'MNRS': 'MNRS',
'MDW': 'MDW',
'MEADOW': 'MDW',
'MDWS': 'MDWS',
'MEADOWS': 'MDWS',
'MEDOWS': 'MDWS',
'MEWS': 'MEWS',
'MILL': 'ML',
'ML': 'ML',
'MILLS': 'MLS',
'MLS': 'MLS',
'MISSION': 'MSN',
'MISSN': 'MSN',
'MSN': 'MSN',
'MSSN': 'MSN',
'MOTORWAY': 'MTWY',
'MNT': 'MT',
'MOUNT': 'MT',
'MT': 'MT',
'MNTAIN': 'MTN',
'MNTN': 'MTN',
'MOUNTAIN': 'MTN',
'MOUNTIN': 'MTN',
'MTIN': 'MTN',
'MTN': 'MTN',
'MNTNS': 'MTNS',
'MOUNTAINS': 'MTNS',
'NCK': 'NCK',
'NECK': 'NCK',
'ORCH': 'ORCH',
'ORCHARD': 'ORCH',
'ORCHRD': 'ORCH',
'OVAL': 'OVAL',
'OVL': 'OVAL',
'OVERPASS': 'OPAS',
'PARK': 'PARK',
'PK': 'PARK',
'PRK': 'PARK',
'PARKS': 'PARK',
'PARKWAY': 'PKWY',
'PARKWY': 'PKWY',
'PKWAY': 'PKWY',
'PKWY': 'PKWY',
'PKY': 'PKWY',
'PW': 'PKWY',
'PARKWAYS': 'PKWY',
'PKWYS': 'PKWY',
'PASS': 'PASS',
'PASSAGE': 'PSGE',
'PATH': 'PATH',
'PATHS': 'PATH',
'PIKE': 'PIKE',
'PIKES': 'PIKE',
'PINE': 'PNE',
'PINES': 'PNES',
'PNES': 'PNES',
'PL': 'PL',
'PLACE': 'PL',
'PLAIN': 'PLN',
'PLN': 'PLN',
'PLAINES': 'PLNS',
'PLAINS': 'PLNS',
'PLNS': 'PLNS',
'PLAZA': 'PLZ',
'PLZ': 'PLZ',
'PLZA': 'PLZ',
'POINT': 'PT',
'PT': 'PT',
'POINTS': 'PTS',
'PTS': 'PTS',
'PORT': 'PRT',
'PRT': 'PRT',
'PORTS': 'PRTS',
'PRTS': 'PRTS',
'PR': 'PR',
'PRAIRIE': 'PR',
'PRARIE': 'PR',
'PRR': 'PR',
'RAD': 'RADL',
'RADIAL': 'RADL',
'RADIEL': 'RADL',
'RADL': 'RADL',
'RAMP': 'RAMP',
'RANCH': 'RNCH',
'RANCHES': 'RNCH',
'RNCH': 'RNCH',
'RNCHS': 'RNCH',
'RAPID': 'RPD',
'RPD': 'RPD',
'RAPIDS': 'RPDS',
'RPDS': 'RPDS',
'REST': 'RST',
'RST': 'RST',
'RDG': 'RDG',
'RDGE': 'RDG',
'RIDGE': 'RDG',
'RDGS': 'RDGS',
'RIDGES': 'RDGS',
'RIV': 'RIV',
'RIVER': 'RIV',
'RIVR': 'RIV',
'RVR': 'RIV',
'RD': 'RD',
'ROAD': 'RD',
'RDS': 'RDS',
'ROADS': 'RDS',
'ROUTE': 'RTE',
'ROW': 'ROW',
'RUE': 'RUE',
'RUN': 'RUN',
'SHL': 'SHL',
'SHOAL': 'SHL',
'SHLS': 'SHLS',
'SHOALS': 'SHLS',
'SHOAR': 'SHR',
'SHORE': 'SHR',
'SHR': 'SHR',
'SHOARS': 'SHRS',
'SHORES': 'SHRS',
'SHRS': 'SHRS',
'SKYWAY': 'SKWY',
'SPG': 'SPG',
'SPNG': 'SPG',
'SPRING': 'SPG',
'SPRNG': 'SPG',
'SPGS': 'SPGS',
'SPNGS': 'SPGS',
'SPRINGS': 'SPGS',
'SPRNGS': 'SPGS',
'SPUR': 'SPUR',
'SPURS': 'SPUR',
'SQ': 'SQ',
'SQR': 'SQ',
'SQRE': 'SQ',
'SQU': 'SQ',
'SQUARE': 'SQ',
'SQRS': 'SQS',
'SQUARES': 'SQS',
'STA': 'STA',
'STATION': 'STA',
'STATN': 'STA',
'STN': 'STA',
'STRA': 'STRA',
'STRAV': 'STRA',
'STRAVE': 'STRA',
'STRAVEN': 'STRA',
'STRAVENUE': 'STRA',
'STRAVN': 'STRA',
'STRVN': 'STRA',
'STRVNUE': 'STRA',
'STREAM': 'STRM',
'STREME': 'STRM',
'STRM': 'STRM',
'ST': 'ST',
'STR': 'ST',
'STREET': 'ST',
'STRT': 'ST',
'STREETS': 'STS',
'SMT': 'SMT',
'SUMIT': 'SMT',
'SUMITT': 'SMT',
'SUMMIT': 'SMT',
'TER': 'TER',
'TERR': 'TER',
'TERRACE': 'TER',
'THROUGHWAY': 'TRWY',
'TRACE': 'TRCE',
'TRACES': 'TRCE',
'TRCE': 'TRCE',
'TRACK': 'TRAK',
'TRACKS': 'TRAK',
'TRAK': 'TRAK',
'TRK': 'TRAK',
'TRKS': 'TRAK',
'TRAFFICWAY': 'TRFY',
'TRFY': 'TRFY',
'TR': 'TRL',
'TRAIL': 'TRL',
'TRAILS': 'TRL',
'TRL': 'TRL',
'TRLS': 'TRL',
'TUNEL': 'TUNL',
'TUNL': 'TUNL',
'TUNLS': 'TUNL',
'TUNNEL': 'TUNL',
'TUNNELS': 'TUNL',
'TUNNL': 'TUNL',
'TPK': 'TPKE',
'TPKE': 'TPKE',
'TRNPK': 'TPKE',
'TRPK': 'TPKE',
'TURNPIKE': 'TPKE',
'TURNPK': 'TPKE',
'UNDERPASS': 'UPAS',
'UN': 'UN',
'UNION': 'UN',
'UNIONS': 'UNS',
'VALLEY': 'VLY',
'VALLY': 'VLY',
'VLLY': 'VLY',
'VLY': 'VLY',
'VALLEYS': 'VLYS',
'VLYS': 'VLYS',
'VDCT': 'VIA',
'VIA': 'VIA',
'VIADCT': 'VIA',
'VIADUCT': 'VIA',
'VIEW': 'VW',
'VW': 'VW',
'VIEWS': 'VWS',
'VWS': 'VWS',
'VILL': 'VLG',
'VILLAG': 'VLG',
'VILLAGE': 'VLG',
'VILLG': 'VLG',
'VILLIAGE': 'VLG',
'VLG': 'VLG',
'VILLAGES': 'VLGS',
'VLGS': 'VLGS',
'VILLE': 'VL',
'VL': 'VL',
'VIS': 'VIS',
'VIST': 'VIS',
'VISTA': 'VIS',
'VST': 'VIS',
'VSTA': 'VIS',
'WALK': 'WALK',
'WALKS': 'WALK',
'WALL': 'WALL',
'WAY': 'WAY',
'WY': 'WAY',
'WAYS': 'WAYS',
'WELL': 'WL',
'WELLS': 'WLS',
'WLS': 'WLS'
}
# USPS secondary-unit ("occupancy") designators, keyed by the spelled-out
# form, mapping to the standard abbreviation (cf. USPS Publication 28,
# Appendix C2).  Identity entries (KEY, LOT, PIER, ...) keep already-standard
# input unchanged.
# NOTE(review): Pub 28 abbreviates UPPER as "UPPR" — confirm the
# 'UPPER': 'UPPER' mapping is intentional.
OCCUPANCY_TYPE_ABBREVIATIONS = {
    'APARTMENT': 'APT',
    'BUILDING': 'BLDG',
    'BASEMENT': 'BSMT',
    'DEPARTMENT': 'DEPT',
    'FLOOR': 'FL',
    'FRONT': 'FRNT',
    'HANGER': 'HNGR',
    'KEY': 'KEY',
    'LOBBY': 'LBBY',
    'LOT': 'LOT',
    'LOWER': 'LOWR',
    'OFFICE': 'OFC',
    'PENTHOUSE': 'PH',
    'PIER': 'PIER',
    'REAR': 'REAR',
    'ROOM': 'RM',
    'SIDE': 'SIDE',
    'SLIP': 'SLIP',
    'SPACE': 'SPC',
    'STOP': 'STOP',
    'SUITE': 'STE',
    'TRAILER': 'TRLR',
    'UNIT': 'UNIT',
    'UPPER': 'UPPER',
    '#': '#'
}
# Unusual/one-off phrase replacements applied by oddHandling() below.
ODD_REPLACEMENTS = {
    "UNITED STATES HIGHWAY": "US HIGHWAY"
}
# Highway-name normalizations.
# NOTE(review): currently identical to ODD_REPLACEMENTS — confirm whether one
# of the two tables is redundant or whether more entries are planned here.
HIGHWAY_REPLACEMENTS = {
    "UNITED STATES HIGHWAY": "US HIGHWAY",
}
# Replace directionals with abbreviated versions.
def abbrevDirectionals(string):
    """Uppercase *string* and substitute each directional phrase with its
    abbreviation from DIRECTIONAL_REPLACEMENTS.

    Bug fix: the loop previously iterated the mapping directly
    (``for (find, replace) in DIRECTIONAL_REPLACEMENTS``), which yields bare
    keys; unpacking a key string into two names raises ValueError for any key
    whose length is not exactly 2.  Iterate ``.items()`` to get (key, value)
    pairs, consistent with the other *_REPLACEMENTS tables in this module.
    """
    string = string.upper()
    for find, replace in DIRECTIONAL_REPLACEMENTS.items():
        # Plain substring replacement — callers are expected to pass
        # whole street strings in USPS-style uppercase.
        string = string.replace(find, replace)
    return string
def abbrevStreetTypes(string):
    """Uppercase *string* and substitute street-type words (STREET, AVENUE,
    ...) with their USPS abbreviations from STREET_TYPE_ABBREVIATIONS.

    Bug fix: STREET_TYPE_ABBREVIATIONS is a dict, so the previous
    ``for (find, replace) in STREET_TYPE_ABBREVIATIONS`` iterated bare key
    strings and raised ValueError when unpacking (e.g. key 'JCT' has 3
    characters).  Iterate ``.items()`` to get (longhand, abbreviation) pairs.
    """
    string = string.upper()
    for find, replace in STREET_TYPE_ABBREVIATIONS.items():
        # NOTE(review): substring replacement can also hit matches inside
        # words (e.g. 'ST' within a name) — confirm callers pre-tokenize.
        string = string.replace(find, replace)
    return string
# Odd/unusual string replacement
def oddHandling(string):
    """Uppercase *string* and apply the one-off phrase substitutions from
    ODD_REPLACEMENTS (e.g. 'UNITED STATES HIGHWAY' -> 'US HIGHWAY').

    Bug fix: ODD_REPLACEMENTS is a dict, so the previous
    ``for (find, replace) in ODD_REPLACEMENTS`` iterated bare keys and raised
    ValueError trying to unpack the multi-word key string.  Iterate
    ``.items()`` to get (find, replace) pairs.
    """
    string = string.upper()
    for find, replace in ODD_REPLACEMENTS.items():
        string = string.replace(find, replace)
    return string
def highwayStandardize(street):
def normalize(number, streetPreMod, streetPreDir, streetPreType, streetPreSep, streetName, streetPostType, streetPostDir, streetPostMod):

429
src/zipfunctions.py Normal file
View File

@ -0,0 +1,429 @@
# Created on : Aug 29, 2024, 12:57:40AM
# Author : Skylar Ittner
import re, sys
#import pandas as pd
from uszipcode import SearchEngine, ZipcodeTypeEnum
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import sqlite3
from src.constants import LONGHAND_STREET_TYPES
import src.config
import sys
import urllib.request
from urllib.parse import quote
import json
#zipcodes = pd.read_csv("zip_code_database.csv", keep_default_na=False, dtype="str")
# Two uszipcode search engines: the "simple" database is smaller and faster
# (used for quick city+state lookups); the "comprehensive" one supports the
# radius/prefix queries and carries ZIP polygons used elsewhere in this file.
fastsearch = SearchEngine(db_file_path="zipcode_db_simple.sqlite", simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.simple)
search = SearchEngine(db_file_path="zipcode_db.sqlite", simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive)
# Read-only, immutable connection to the ZIP+4 database.
# check_same_thread=False permits use from multiple threads; the DB is opened
# immutable so concurrent reads are safe.
# NOTE(review): hard-coded absolute path — consider making it configurable.
zip4 = sqlite3.connect("file:/home/skylar/AddressDatabase/zip4.sqlite?mode=ro&immutable=1", uri=True, check_same_thread=False)
# Performance pragmas for the read-only ZIP+4 connection.
# NOTE(review): mmap_size=4294967296 is 4 GiB, not 1 GiB as the inline SQL
# comment claims — the comment, not the value, appears stale.
zip4.executescript("""
PRAGMA query_only=ON; -- belt-and-suspenders (cant write)
PRAGMA temp_store=MEMORY; -- sorts/temps in RAM
PRAGMA cache_size=-800000; -- ~800 MB page cache (negative = KB units)
PRAGMA mmap_size=4294967296; -- 1 GiB memory-mapped I/O (bump if you have RAM)
PRAGMA automatic_index=ON; -- leave enabled (default), can help odd joins
PRAGMA threads=4; -- allow parallel ops for sorts/expr eval (if available)
""")
# Rows come back as sqlite3.Row so columns are addressable by name.
zip4.row_factory = sqlite3.Row
cur = zip4.cursor()
# SQL query cache when finding ZIP+4, prevents running duplicate queries for nearby addresses on the same road
querycache = {}
querycachelimit = 3000  # max cached queries before oldest entries are evicted
def checkZIPCode(lat, lon, state, zip):
    """Validate *zip* against *state*; fall back to a coordinate lookup.

    *zip* may arrive as str, int, or float (e.g. straight out of a CSV/pandas
    parse); it is normalized to a zero-padded 5-character string first.

    Returns a {"zip", "city", "state"} dict from getCityStateForZIP(), or a
    dict of empty strings when nothing can be resolved.

    Bug fixes versus the previous version:
    - numeric ZIPs were passed through as int/float, which crashed len()
      inside getCityStateForZIP(); they are now always stringified;
    - a short string ZIP was zero-padded but then discarded (zipok was left
      False); the padded value is now used;
    - the state comparison used attribute access (``zipInfo.state``) on the
      dict returned by getCityStateForZIP(), raising AttributeError.
    """
    zipok = False
    if not zip or zip != zip:  # falsy/None, or float NaN (NaN != NaN)
        zipok = False
    elif isinstance(zip, str):
        if len(zip) != 5:
            zip = zip.rjust(5, '0')  # pad short ZIPs like "901" -> "00901"
        zipok = True
    else:
        # int, float, or anything else int()-convertible: normalize to a
        # zero-padded 5-character string.
        zip = str(int(zip)).rjust(5, '0')
        zipok = True
    zipInfo = False
    if zipok:
        zipInfo = getCityStateForZIP(zip)
        if not zipInfo:
            zipok = False
        elif zipInfo["state"] != state:
            # ZIP exists but belongs to a different state — distrust it.
            zipok = False
    if not zipok:
        # Fall back to the nearest ZIP by coordinates.
        result = search.by_coordinates(lat = lat, lng = lon, returns = 1)
        if len(result) == 1:
            return getCityStateForZIP(result[0].zipcode)
        elif len(result) > 1:
            print(result[0])
            print(result[1])
            return getCityStateForZIP(result[0].zipcode)
        else:
            return {"zip": "", "city": "", "state": ""}
    else:
        return zipInfo
def getCityStateForZIP(zipcode):
    """Look up the USPS-preferred city/state for a 5-digit ZIP Code string.

    Returns {"zip", "city", "state"} or False when *zipcode* is falsy,
    malformed, or unknown.
    """
    if not zipcode or len(zipcode) != 5:
        return False
    # Parameterized query; the previous version interpolated the value
    # directly into the SQL string.
    cur.execute(
        "SELECT ZipCode,City,State FROM ZIPCodes "
        "WHERE ZipCode=? AND CityStateKey=PreferredLastLineKey LIMIT 1",
        (zipcode,))
    row = cur.fetchone()
    if row is None:
        return False
    return {
        "zip": row["ZipCode"],
        "city": row["City"],
        "state": row["State"]
    }
def getZIPFromGeo(lat, lon, prefix=False, state=False):
    """Resolve a ZIP Code from coordinates.

    When *prefix* and/or *state* are given, a narrowed 20-mile radius query
    is used (faster); otherwise a plain nearest-ZIP lookup.  Returns the
    best-match ZIP Code string, or None when nothing is found.
    """
    if prefix or state:
        params = {"lat": lat, "lng": lon, "returns": 1, "radius": 20}
        if prefix:
            params["prefix"] = str(prefix)
        if state:
            params["state"] = state
        matches = search.query(**params)
    else:
        matches = search.by_coordinates(lat=lat, lng=lon, returns=1)
    # returns=1 yields at most one hit, but keep the guard for safety.
    return matches[0].zipcode if matches else None
def subaddrMatchRows(rows, unit):
    """Filter ZIP+4 *rows* down to those whose secondary-address (unit)
    range contains *unit*.

    Purely numeric units are zero-padded to 8 characters to match the
    database's fixed-width encoding.  Rows with no unit range at all are
    kept aside and returned only when no row's range matches.
    """
    if re.match(r"^[0-9]+$", unit):
        unit = unit.zfill(8)
    matched = []
    unitless = []
    for candidate in rows:
        low = candidate["AddressSecLowNumber"]
        if low == "":
            unitless.append(candidate)
        elif low <= unit and candidate["AddressSecHighNumber"] >= unit:
            matched.append(candidate)
    return matched if matched else unitless
def getZIPsForCityState(city, state):
    """Return (rows, ziplist) for every ZIP Code record matching *city* (by
    name, alias, or alias abbreviation) in *state*.

    Records whose CityStateKey is not the PreferredLastLineKey (i.e. not the
    USPS-preferred city name for that ZIP) are filtered out.
    """
    city = city.upper().strip()
    # Parameterized query; the previous version concatenated values into the
    # SQL and only escaped single quotes in the city name (not the state).
    cur.execute(
        "SELECT * FROM ZIPCodes WHERE State = ? "
        "AND (CityAliasName = ? OR City = ? OR CityAliasAbbreviation = ?)",
        (state, city, city, city))
    citylist = cur.fetchall()
    resultlist = []
    ziplist = []
    # Remove entries that aren't the preferred city name for the ZIP Code
    for cityrow in citylist:
        if cityrow["CityStateKey"] == cityrow["PreferredLastLineKey"]:
            resultlist.append(cityrow)
            ziplist.append(cityrow["ZipCode"])
    return resultlist, ziplist
def getZIPsForCounty(county, state):
    """Return a de-duplicated, order-preserving list of ZIP Codes located in
    *county*, *state* — including ZIPs that are only partially inside the
    county (from the ZIPCodesMultiCounty table)."""
    county = county.upper().strip()
    # Parameterized queries; the previous version concatenated values into
    # the SQL and only escaped single quotes in the county name.
    cur.execute(
        "SELECT ZipCode FROM ZIPCodes WHERE State = ? AND County = ?",
        (state, county))
    countylist = cur.fetchall()
    # Also get records where the ZIP isn't mainly in the county but some of it is
    cur.execute(
        "SELECT ZipCode FROM ZIPCodesMultiCounty WHERE State = ? AND County = ?",
        (state, county))
    multicountylist = cur.fetchall()
    # dict keys preserve first-seen order while de-duplicating.
    seen = dict.fromkeys(row["ZipCode"] for row in countylist)
    for row in multicountylist:
        seen.setdefault(row["ZipCode"], None)
    return list(seen)
def addressRangeContainsNumber(low, high, evenodd, number):
    """Report whether *number* falls inside the primary-address range
    [*low*, *high*] with even/odd flag *evenodd* ("E", "O", or "B" = both).

    Purely numeric numbers get their parity computed and are zero-padded to
    10 characters (the ZIP+4 file's fixed width).  Non-numeric numbers match
    immediately when equal to either bound.  The final range test is a
    lexicographic string comparison on the padded values.
    """
    parity = "B"
    if re.match(r"^[0-9]+$", number):
        parity = "E" if int(number) % 2 == 0 else "O"
        number = number.zfill(10)
    elif number in (low, high):
        # Non-numeric identifier exactly equal to a bound.
        return True
    if evenodd != "B" and evenodd != parity:
        return False
    # NOTE: lexicographic comparison of zero-padded strings.
    return low <= number <= high
# Check if the address number range is actually just a single address that matches the number provided.
def addressRangeIsExactNumber(low, high, number):
    """True when [*low*, *high*] is a single address equal to *number*,
    compared both raw and zero-padded to the ZIP+4 file's 10-char width."""
    if low != high:
        return False
    return low in (number, number.zfill(10))
def getZIP4(number, street, unit, state, lat, lon, city=False, zip=False, county=False):
    """Find the ZIP+4 record for an address.

    Builds progressively less specific SQL queries against the ZIP4 table
    (exact full street, then street basename, with ZIP / city-derived ZIP
    list / county-derived ZIP list filters) and returns on the first
    confident match.  Falls back to the US Census Geocoder (if enabled in
    config) and finally to a coordinate-based 5-digit ZIP guess.

    Returns a 5-tuple (zip5, plus4, street, unit_designator, unit); plus4
    and unit_designator may be "" for partial matches.

    NOTE(review): all queries below interpolate street/state/zip values
    directly into the SQL text; unlike the sibling helpers, nothing escapes
    single quotes here, so a street like O'BRIEN produces a malformed query.
    Parameterizing would also require reworking querycache keys (currently
    keyed on the raw SQL string).
    """
    number = number.strip()
    street = street.strip()
    if not unit:
        unit = ""
    # Get list of 5-digit ZIP Codes matching the city and state
    citystateresults = False  # NOTE(review): assigned but never read below
    zipfilter = False
    if city:
        citystateresults, zipfilter = getZIPsForCityState(city, state)
        if len(zipfilter) == 0:
            zipfilter = False
    elif county:
        zipfilter = getZIPsForCounty(county, state)
        if len(zipfilter) == 0:
            zipfilter = False
    queries = []
    basenamequeries = [] # Queries that only match on street basename, try after "main" queries don't return a match
    # Get street base name for broader matching in case suffix or directional differs
    typelessStreet = street
    for (short, long) in LONGHAND_STREET_TYPES.items():
        typelessStreet = re.sub(r"\s" + re.escape(short) + r"\b", "", typelessStreet)
    # Strip a leading and a trailing directional (N, SW, ...) to get the bare name.
    streetBasename = re.sub("^[NSEW]{1,2} ", "", typelessStreet)
    streetBasename = re.sub(" [NSEW]{1,2}$", "", streetBasename)
    #print(street, typelessStreet, streetBasename)
    # Build a list of queries to run, starting with the most specific and getting more desperate until a match is found
    if zip:
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
    if zipfilter:
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
    if not unit and re.match(".* ([0-9]{1,5}|[A-Z]{1})$", street):
        # Maybe the street has the apartment number in it for some reason
        newStreet = re.sub(" ([0-9]{1,5}|[A-Z]{1})$", "", street)
        newUnit = street[len(newStreet):].strip()  # NOTE(review): never used afterwards
        typelessStreet = newStreet
        for (short, long) in LONGHAND_STREET_TYPES.items():
            typelessStreet = re.sub(r"\s" + re.escape(short) + r"\b", "", typelessStreet)
        newstreetBasename = re.sub("^[NSEW]{1,2} ", "", typelessStreet)
        newstreetBasename = re.sub(" [NSEW]{1,2}$", "", newstreetBasename)
        if zip:
            queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + newStreet + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
            basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + newstreetBasename + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
        if zipfilter:
            queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + newStreet + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
            basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + newstreetBasename + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
    if not zip and not zipfilter:
        # Who needs ZIP Codes and city names anyways
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'")
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + street + "' AND State = '" + state + "'")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'")
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull LIKE '" + street + "%' AND State = '" + state + "'")
        #queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName LIKE '" + streetBasename + "%' AND State = '" + state + "'")
    resultrows = []  # accumulates across queries; not reset per query
    suggestZip = ""
    suggestStreet = ""
    # Basename-only queries are tried after all the exact-street ones.
    queries = queries + basenamequeries
    for query in queries:
        #print(query)
        if query in querycache:
            rows = querycache[query]
            #print("CACHED: " + query)
        else:
            cur.execute(query)
            #print("NOCACHE: " +query)
            rows = cur.fetchall()
            # Add to query cache
            querycache[query] = rows
            if len(querycache) > querycachelimit:
                # Evict the oldest entry (dicts preserve insertion order).
                querycache.pop(next(iter(querycache)))
        unitfilterrows = rows
        if unit:
            # Filter to rows that match the unit number
            unitfilterrows = subaddrMatchRows(rows, unit)
        # Try matching range against unit-filtered rows, if that doesn't work, try the non-filtered ones (address might be more specific than ZIP4 file)
        for row in unitfilterrows:
            if addressRangeContainsNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], row["AddressPrimaryEvenOdd"], number):
                resultrows.append(row)
        if len(resultrows) == 0 and len(unitfilterrows) < len(rows):
            for row in rows:
                if addressRangeContainsNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], row["AddressPrimaryEvenOdd"], number):
                    resultrows.append(row)
        if len(resultrows) == 1:
            # One match found, it's probably the right one!
            return resultrows[0]["ZipCode"], resultrows[0]["Plus4Low"], resultrows[0]["StreetFull"], resultrows[0]["AddressSecAbbr"], unit
        if len(resultrows) > 1:
            # First check if our source address has a unit, and if not, remove all match rows that DO have a unit.
            if not unit:
                base_rows = [
                    r for r in resultrows
                    if not (
                        r["AddressSecAbbr"] or
                        r["AddressSecLowNumber"] or
                        r["AddressSecHighNumber"]
                    )
                ]
                # If we found at least one base-address row, narrow resultrows to those
                if base_rows:
                    # Narrow further by looking for exact number matches (low and high are the same and what we're looking for)
                    exact_rows = []
                    for row in base_rows:
                        if addressRangeIsExactNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], number):
                            exact_rows.append(row)
                    if len(exact_rows) > 0:
                        resultrows = exact_rows
                    else:
                        resultrows = base_rows
                    # If that left us with exactly one, we can return it immediately
                    if len(resultrows) == 1:
                        row = resultrows[0]
                        return (
                            row["ZipCode"],
                            row["Plus4Low"],
                            row["StreetFull"],
                            "", # no unit designator when no unit was given
                            unit, # still the original (empty) unit
                        )
            suggestZip = resultrows[0]["ZipCode"]
            suggestStreet = resultrows[0]["StreetFull"]
            for row in resultrows:
                # Check if the 5-digit ZIP and/or street are the same for all results, clear it if not
                if suggestZip != row["ZipCode"]:
                    suggestZip = ""
                if suggestStreet != row["StreetFull"]:
                    suggestStreet = ""
                # Return an address-specific row if it exists
                if row["AddressPrimaryLowNumber"] == number and row["AddressPrimaryHighNumber"] == number:
                    return row["ZipCode"], row["Plus4Low"], row["StreetFull"], row["AddressSecAbbr"], unit
            #print("Multiple possible ZIP+4 matches for", number, street, "#"+unit, city, state, zip)
            #for row in resultrows:
            #    print(row["ZipCode"],row["AddressPrimaryLowNumber"],row["AddressPrimaryHighNumber"], row["StreetFull"], row["AddressPrimaryEvenOdd"], row["Plus4Low"], row["AddressSecAbbr"])
    # No match found
    cfg = src.config.get_config()
    if cfg.useCensusToFillEmptyZIPs and number != "" and street != "" and city != "" and city != False and state != "":
        # Query the Census Geocoder, because this address probably exists
        print("US Census Geo:" + number + " " + street + ", " + city + " " + state + " ", end="\r", flush=True)
        try:
            result = urllib.request.urlopen("https://geocoding.geo.census.gov/geocoder/locations/address?street="+quote(number + " " + street)+"&city="+quote(city)+"&state="+state+"&zip=&benchmark=4&format=json").read()
            jsonresult = json.loads(result)
            if len(jsonresult["result"]["addressMatches"]) == 1:
                comps = jsonresult["result"]["addressMatches"][0]["addressComponents"]
                streetparts = [comps["preDirection"], comps["preType"], comps["streetName"], comps["suffixType"], comps["suffixDirection"]]
                street = " ".join(x for x in streetparts if x)
                return jsonresult["result"]["addressMatches"][0]["addressComponents"]["zip"], "", street, "", unit
        except:
            # NOTE(review): bare except deliberately treats any geocoder
            # failure (network, JSON shape) as best-effort — consider
            # narrowing to (urllib.error.URLError, KeyError, ValueError).
            pass
    if suggestZip == "":
        # Last resort: derive a 5-digit ZIP from the coordinates.
        suggestZip = getZIP(lat, lon, False, state, city)
    if suggestStreet == "":
        suggestStreet = street
    return suggestZip, "", suggestStreet, "", unit
def getZIP(lat, lon, prefix=False, state=False, city=False):
    """Best-effort 5-digit ZIP lookup from coordinates, optionally narrowed
    by a ZIP prefix, state, and/or city.

    Strategy: (1) if city+state map to exactly one standard ZIP, use it;
    (2) otherwise query by coordinates (narrowed by prefix/state when given);
    (3) on multiple candidates, intersect with the city+state result set and
    finally test which candidate ZIP's polygon actually contains the point.
    Returns a ZIP Code string or None.

    Bug fix: the city guard previously only handled city == "" and called
    ``city.upper()`` on the default ``city=False`` (as passed by getZIP4),
    raising AttributeError; any falsy city is now normalized to False.
    """
    if not city:
        city = False
    else:
        city = city.upper()
    if state == "":
        state = False
    lat = float(lat)
    lon = float(lon)
    citystateresult = False
    if city and state: # Check if city and state combo only has one standard ZIP Code
        try:
            citystateresult = fastsearch.by_city_and_state(city, state, returns=30, zipcode_type=ZipcodeTypeEnum.Standard) # Use simple database because it's like 2x faster
            if len(citystateresult) == 1:
                #print("Exact city match found: "+city+ " "+state+" "+citystateresult[0].zipcode)
                return citystateresult[0].zipcode
        except ValueError:
            # Sometimes it objects to a city name and says it isn't valid
            pass
    if prefix and state: # Get ZIPs by lat/lon that start with prefix and are in state for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, prefix=str(prefix), state=state, zipcode_type=ZipcodeTypeEnum.Standard)
    elif state: # Get ZIPs filtered by state for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, state=state, zipcode_type=ZipcodeTypeEnum.Standard)
    elif prefix: # Get ZIPs by lat/lon that start with prefix for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, prefix=str(prefix), zipcode_type=ZipcodeTypeEnum.Standard)
    else: # Get ZIPs by lat/lon
        result = search.by_coordinates(lat = lat, lng = lon, returns = 20, zipcode_type=ZipcodeTypeEnum.Standard)
    if len(result) == 1:
        return result[0].zipcode
    elif len(result) > 1:
        matchzips = []
        if citystateresult:
            # Find zip codes that both queries have in common, maybe there's only one that overlaps with both!
            for val in citystateresult:
                for res in result:
                    if res.zipcode == val.zipcode:
                        matchzips.append(res)
            if len(matchzips) == 1:
                #print("Exact match found between lat/lon and city/state queries: "+matchzips[0])
                return matchzips[0].zipcode
        else:
            matchzips = result
        #print("W: Multiple equally-valid ZIP matches for "+str(lat)+", "+str(lon)+" "+str(city)+", "+str(state)+": ")
        # Tie-break: pick the candidate whose boundary polygon contains the point.
        addrpoint = Point(lon, lat)
        for zip in matchzips:
            #print("    "+zip.zipcode)
            zippolys = zip.polygon
            if zippolys == None:
                continue
            # A single polygon (2-D coordinate list) is wrapped so the loop
            # below can treat single- and multi-polygon ZIPs uniformly.
            if dimensionality(zippolys) == 2:
                zippolys = [zippolys]
            for poly in zippolys:
                zipborder = Polygon(poly)
                if zipborder.contains(addrpoint):
                    #print("Found probable ZIP based on border: " + zip.zipcode)
                    return zip.zipcode
        return None
    else:
        return None
def dimensionality(matrix):
    """Return the nesting depth of *matrix*, probing along first elements.

    A non-list scalar yields 0, a flat list 1, a list of lists 2, and so on.

    Fixes versus the previous version: an empty list at any level no longer
    raises IndexError from ``matrix[0]`` (it simply stops, still counting
    that level), and the redundant ``matrix is not None`` test (a list is
    never None) is dropped.
    """
    depth = 0
    while isinstance(matrix, list):
        depth += 1
        if not matrix:
            break
        matrix = matrix[0]
    return depth

42736
zip_code_database.csv Normal file

File diff suppressed because it is too large Load Diff

272
zipdbgen.py Executable file
View File

@ -0,0 +1,272 @@
#!/usr/bin/python3
# Generate a ZIP+4 database from the data at https://www.zip-codes.com/zip-plus-4-database.asp
from argparse import ArgumentParser
import sqlite3, zipfile, re
import pandas as pd
def process(infile, outfile):
    """Import a zip-codes.com archive *infile* into the SQLite DB *outfile*.

    The archive may contain: per-state ZIP4-XX.zip members (ZIP+4 detail),
    zip-codes-database-STANDARD.csv (5-digit ZIPs), and a MULTI-COUNTY CSV.
    Each part found is loaded into its own table via pandas in chunks.
    """
    print("Reading " + infile)
    zf = zipfile.ZipFile(infile, mode="r")
    zipFiles = zf.namelist()
    ziplist = []       # inner ZIP4-<STATE>.zip members
    zip5list = []      # the 5-digit standard CSV
    zipcountylist = [] # List of ZIPs in multiple counties
    for fname in zipFiles:
        if re.match("ZIP4-[A-Z]{2}.zip", fname):
            ziplist.append(fname)
        elif fname == "zip-codes-database-STANDARD.csv":
            zip5list.append(fname)
        elif "MULTI-COUNTY" in fname and fname.endswith(".csv"):
            zipcountylist.append(fname)
    filesprocessed = 0
    chunksprocessed = 0
    chunksize = 5000
    if len(ziplist) > 0:
        print("Creating ZIP+4 database")
        connection = sqlite3.connect(outfile)
        connection.executescript("PRAGMA foreign_keys=OFF;")
        c = connection.cursor()
        # Speed-over-durability pragmas: the DB is regenerated from source
        # data, so losing it on a crash mid-import is acceptable.
        c.execute("PRAGMA journal_mode=OFF;") # or MEMORY; fastest is OFF (risk if crash)
        c.execute("PRAGMA synchronous=OFF;") # biggest win: no fsync on each commit
        c.execute("PRAGMA temp_store=MEMORY;") # keep temp B-trees in RAM
        c.execute("PRAGMA cache_size=-1600000;") # ~1600MB page cache (negative = KB)
        c.execute("PRAGMA locking_mode=EXCLUSIVE;") # avoid lock thrash
        c.execute("PRAGMA mmap_size=1073741824;") # 1GB mmap; helps reads, slight write help
        c.execute("PRAGMA page_size=65536;")
        createZIP4DB(c)
        # Join the street-part columns into one full street string per row.
        def mergeStreet(row):
            return ' '.join(filter(None, [row["StPreDirAbbr"], row["StName"], row["StSuffixAbbr"], row["StPostDirAbbr"]]))
        for file in ziplist:
            # Each member is itself a zip archive holding a single CSV.
            with zf.open(file, mode="r", force_zip64=True) as innerfile:
                with zipfile.ZipFile(innerfile, mode="r") as innerzip:
                    with innerzip.open(innerzip.namelist()[0], mode="r") as csvfile:
                        print("\nImporting " + file + " ..." + " ", end="\r", flush=True)
                        for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                            chunk["StreetFull"] = chunk.apply(mergeStreet, axis=1)
                            chunk.to_sql("ZIP4", connection, if_exists='append', index=False, method='multi')
                            chunksprocessed = chunksprocessed + 1
                            print("Importing " + file + " ... " + str(chunksprocessed * chunksize) +" ", end="\r", flush=True)
            #print("\nVacuuming database...")
            #connection.executescript("VACUUM")
            filesprocessed = filesprocessed + 1
    # NOTE(review): zf is closed here, but the STANDARD/MULTI-COUNTY sections
    # below call zf.open() again — if those members are present this raises
    # ValueError on a closed ZipFile.  Confirm intended placement (likely it
    # belongs at the end of the function).
    zf.close()
    if len(zip5list) > 0:
        print("Creating 5-digit ZIP database")
        connection = sqlite3.connect(outfile)
        c = connection.cursor()
        createZIP5DB(c)
        filesprocessed = 1  # NOTE(review): resets rather than increments the counter
        with zf.open(zip5list[0], mode="r", force_zip64=True) as csvfile:
            print("\nImporting " + zip5list[0] + " ..." + " ", end="\r", flush=True)
            for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                chunk.to_sql("ZIPCodes", connection, if_exists='append', index=False)
                chunksprocessed = chunksprocessed + 1
                print("Importing " + zip5list[0] + " ... " + str(chunksprocessed * chunksize) +" ", end="\r", flush=True)
    if len(zipcountylist) > 0:
        print("Creating Multi-county ZIP database")
        connection = sqlite3.connect(outfile)
        c = connection.cursor()
        createZIPMultiCountyDB(c)
        filesprocessed = 1  # NOTE(review): resets rather than increments the counter
        with zf.open(zipcountylist[0], mode="r", force_zip64=True) as csvfile:
            print("\nImporting " + zipcountylist[0] + " ..." + " ", end="\r", flush=True)
            for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                chunk.to_sql("ZIPCodesMultiCounty", connection, if_exists='append', index=False)
                chunksprocessed = chunksprocessed + 1
                print("Importing " + zipcountylist[0] + " ... " + str(chunksprocessed * chunksize) +" ", end="\r", flush=True)
    print("\nFiles processed: " + str(filesprocessed))
    print("Records processed: " + str(chunksprocessed * chunksize))
    print("Done! Saved to " + outfile)
    print("\nOne last thing: optimizing output database (this might take a few minutes)...")
    # NOTE(review): 'connection' is unbound if the archive contained none of
    # the expected members — this line would raise NameError in that case.
    connection.executescript("VACUUM; ANALYZE; PRAGMA optimize;")
def createZIP5DB(c):
    """(Re)create the 5-digit ZIPCodes table (with its indexes) and the
    States reference table on cursor *c*.

    Any existing ZIPCodes/States tables are dropped first; the States table
    is repopulated with all state/territory code -> name pairs.

    Improvement: the 62 individual INSERT statements were replaced with a
    single parameterized ``executemany`` over a data list.
    """
    c.execute("DROP TABLE IF EXISTS ZIPCodes")
    c.execute('''CREATE TABLE ZIPCodes (
        ZipCode char(5) NOT NULL,
        City varchar(35) NULL,
        State char(2),
        County varchar(45) NULL,
        AreaCode varchar(55) NULL,
        CityType char(1) NULL,
        CityAliasAbbreviation varchar(13) NULL,
        CityAliasName varchar(35) NULL,
        Latitude decimal(12, 6),
        Longitude decimal(12, 6),
        TimeZone char(2) NULL,
        Elevation int,
        CountyFIPS char(5) NULL,
        DayLightSaving char(1) NULL,
        PreferredLastLineKey varchar(10) NULL,
        ClassificationCode char(1) NULL,
        MultiCounty char(1) NULL,
        StateFIPS char(2) NULL,
        CityStateKey char(6) NULL,
        CityAliasCode varchar(5) NULL,
        PrimaryRecord char(1),
        CityMixedCase varchar(35) NULL,
        CityAliasMixedCase varchar(35) NULL,
        StateANSI varchar(2) NULL,
        CountyANSI varchar(3) NULL,
        FacilityCode varchar(1) NULL,
        CityDeliveryIndicator varchar(1) NULL,
        CarrierRouteRateSortation varchar(1) NULL,
        FinanceNumber varchar(6) NULL,
        UniqueZIPName varchar(1) NULL,
        CountyMixedCase varchar(45) NULL
    );''')
    # Indexes covering the lookup columns used by zipfunctions.py.
    c.execute("CREATE INDEX Index_ZIPCodes_ZipCode ON ZIPCodes (ZipCode)")
    c.execute("CREATE INDEX Index_ZIPCodes_State ON ZIPCodes (State)")
    c.execute("CREATE INDEX Index_ZIPCodes_County ON ZIPCodes (County)")
    c.execute("CREATE INDEX Index_ZIPCodes_AreaCode ON ZIPCodes (AreaCode)")
    c.execute("CREATE INDEX Index_ZIPCodes_City ON ZIPCodes (City)")
    c.execute("CREATE INDEX Index_ZIPCodes_Latitude ON ZIPCodes (Latitude)")
    c.execute("CREATE INDEX Index_ZIPCodes_Longitude ON ZIPCodes (Longitude)")
    c.execute("CREATE INDEX Index_ZIPCodes_CityAliasName ON ZIPCodes (CityAliasName)")
    c.execute("CREATE INDEX Index_ZIPCodes_CityStateKey ON ZIPCodes (CityStateKey)")
    c.execute("DROP TABLE IF EXISTS States")
    c.execute("CREATE TABLE States (code TEXT, name TEXT)")
    # State/territory code -> full name reference data.
    states = [
        ("AE", "Armed Forces Europe, the Middle East, and Canada"),
        ("AP", "Armed Forces Pacific"),
        ("AA", "Armed Forces Americas"),
        ("AL", "Alabama"),
        ("AK", "Alaska"),
        ("AS", "American Samoa"),
        ("AZ", "Arizona"),
        ("AR", "Arkansas"),
        ("CA", "California"),
        ("CO", "Colorado"),
        ("CT", "Connecticut"),
        ("DE", "Delaware"),
        ("DC", "District of Columbia"),
        ("FM", "Federated States of Micronesia"),
        ("FL", "Florida"),
        ("GA", "Georgia"),
        ("GU", "Guam"),
        ("HI", "Hawaii"),
        ("ID", "Idaho"),
        ("IL", "Illinois"),
        ("IN", "Indiana"),
        ("IA", "Iowa"),
        ("KS", "Kansas"),
        ("KY", "Kentucky"),
        ("LA", "Louisiana"),
        ("ME", "Maine"),
        ("MH", "Marshall Islands"),
        ("MD", "Maryland"),
        ("MA", "Massachusetts"),
        ("MI", "Michigan"),
        ("MN", "Minnesota"),
        ("MS", "Mississippi"),
        ("MO", "Missouri"),
        ("MT", "Montana"),
        ("NE", "Nebraska"),
        ("NV", "Nevada"),
        ("NH", "New Hampshire"),
        ("NJ", "New Jersey"),
        ("NM", "New Mexico"),
        ("NY", "New York"),
        ("NC", "North Carolina"),
        ("ND", "North Dakota"),
        ("MP", "Northern Mariana Islands"),
        ("OH", "Ohio"),
        ("OK", "Oklahoma"),
        ("OR", "Oregon"),
        ("PW", "Palau"),
        ("PA", "Pennsylvania"),
        ("PR", "Puerto Rico"),
        ("RI", "Rhode Island"),
        ("SC", "South Carolina"),
        ("SD", "South Dakota"),
        ("TN", "Tennessee"),
        ("TX", "Texas"),
        ("UT", "Utah"),
        ("VT", "Vermont"),
        ("VI", "Virgin Islands"),
        ("VA", "Virginia"),
        ("WA", "Washington"),
        ("WV", "West Virginia"),
        ("WI", "Wisconsin"),
        ("WY", "Wyoming"),
    ]
    c.executemany('INSERT INTO "States" ("code", "name") VALUES (?, ?)', states)
def createZIPMultiCountyDB(c):
    """Drop and recreate the ZIPCodesMultiCounty table plus its lookup indexes.

    c: a DB-API cursor (sqlite3) on which the DDL statements are executed.

    Each row maps a ZIP code to one county it overlaps; a multi-county ZIP
    appears once per county.
    """
    c.execute("DROP TABLE IF EXISTS ZIPCodesMultiCounty")
    c.execute(
        "CREATE TABLE ZIPCodesMultiCounty ( "
        "ZipCode char(5) NOT NULL, "
        "StateFIPS char(2), "
        "State char(2), "
        "CountyFIPS char(5) NULL, "
        "County varchar(45), "
        "CountyMixedCase varchar(45) )"
    )
    # One index per commonly-filtered column.
    for column in ("ZipCode", "State", "County"):
        c.execute(
            "CREATE INDEX Index_ZIPCodesMultiCounty_%s "
            "ON ZIPCodesMultiCounty (%s)" % (column, column)
        )
def createZIP4DB(c):
    """Drop and recreate the ZIP4 table and its query indexes.

    c: a DB-API cursor (sqlite3) on which the DDL statements are executed.

    The table holds one USPS ZIP+4 range record per row; the indexes cover
    lookups by primary address-number range, preferred last-line key,
    ZIP code / carrier route, state, and street name (raw and full form).
    """
    c.execute("DROP TABLE IF EXISTS `ZIP4`")
    c.execute('''
    CREATE TABLE "ZIP4" (
    "ZipCode"char(5),
    "UpdateKey"varchar(10),
    "Action"char(1),
    "RecordType"varchar(1),
    "CarrierRoute"varchar(4),
    "StPreDirAbbr"varchar(2),
    "StName"varchar(28),
    "StSuffixAbbr"varchar(4),
    "StPostDirAbbr"varchar(2),
    "AddressPrimaryLowNumber"varchar(10),
    "AddressPrimaryHighNumber"varchar(10),
    "AddressPrimaryEvenOdd"varchar(1),
    "BuildingName"varchar(40),
    "AddressSecAbbr"varchar(4),
    "AddressSecLowNumber"varchar(10),
    "AddressSecHighNumber"varchar(10),
    "AddressSecOddEven"varchar(1),
    "Plus4Low"varchar(4),
    "Plus4High"varchar(4),
    "BaseAlternateCode"varchar(1),
    "LACSStatus"varchar(1),
    "GovernmentBuilding"varchar(1),
    "FinanceNumber"varchar(6),
    "State"varchar(2),
    "CountyFIPS"varchar(3),
    "CongressionalDistrict"varchar(2),
    "MunicipalityKey"varchar(6),
    "UrbanizationKey"varchar(6),
    "PreferredLastLineKey"varchar(6),
    "ToLatitude"decimal(18, 10),
    "FromLatitude"decimal(18, 10),
    "ToLongitude"decimal(18, 10),
    "FromLongitude"decimal(18, 10),
    "CensusTract"varchar(15),
    "CensusBlock"varchar(15),
    "TLID"varchar(15),
    "LatLonMultiMatch"varchar(1),
    "StreetFull" varchar(36)
    )
    ''')
    # Bug fix: this index previously referenced "AddressPrimaryOddEven", which
    # does not exist (the column is declared "AddressPrimaryEvenOdd" above),
    # so SQLite raised "no such column" and the function never finished.
    c.execute('''CREATE INDEX "addressnumber" ON "ZIP4" ("AddressPrimaryLowNumber","AddressPrimaryHighNumber","AddressPrimaryEvenOdd")''')
    c.execute('''CREATE INDEX "key" ON "ZIP4" ("PreferredLastLineKey")''')
    c.execute('''CREATE INDEX "zipcode_route" ON "ZIP4" ("ZipCode", "CarrierRoute")''')
    c.execute('''CREATE INDEX "state" ON "ZIP4" ("State")''')
    c.execute('''CREATE INDEX "streetfull_state" ON "ZIP4" ("StreetFull", "State")''')
    c.execute('''CREATE INDEX "stname_state" ON "ZIP4" ("StName", "State")''')
    c.execute('''CREATE INDEX "zip" ON "ZIP4" ("ZipCode")''')
    c.execute('''CREATE INDEX "streetfull_state_zip" ON "ZIP4" ("StreetFull", "State", "ZipCode")''')
    c.execute('''CREATE INDEX "stname_state_zip" ON "ZIP4" ("StName", "State", "ZipCode")''')
# Command-line interface: a source .zip archive and a destination SQLite file.
parser = ArgumentParser(
    description='Create a SQLite ZIP Code database from CSV data from '
                'https://www.zip-codes.com/zip-plus-4-database.asp. '
                'Supports both 5-digit ZIP and ZIP+4 products.'
)
parser.add_argument('src', help='Input .zip archive')
parser.add_argument('dest', help='Output SQLite3 database file')
if __name__ == "__main__":
args = parser.parse_args()
process(args.src, args.dest)