Add gitignore

This commit is contained in:
Skylar Ittner 2025-11-15 19:51:14 -07:00
commit b2de3304f3
15 changed files with 45862 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__/*

99
checkoa.py Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/python3
import os, json, traceback
from argparse import ArgumentParser
rowstocheck = 15000 # Stop reading a file after this many rows, speeds up analysis of many/large address files

# Module-level report buckets: checkGeojson() files each input into the
# matching list(s); the report code under __main__ prints them.
oklist = []
emptygeometrylist = []
emptyaddresslist = []
nocitylist = []
noziplist = []
totallyemptylist = []

def checkGeojson(filename):
    """Scan a line-delimited OpenAddresses GeoJSON file and record problems.

    Reads up to `rowstocheck` lines, counting rows with a missing street
    address, geometry, city, or postal code, then appends `filename` to the
    matching module-level report list(s) based on per-file thresholds.
    Files where (almost) every row lacks an address, and empty files, go to
    `totallyemptylist`; problem-free files go to `oklist`.
    """
    linecount = 0
    okcount = 0
    emptygeometrycount = 0
    emptyaddresscount = 0
    emptycitycount = 0
    emptyzipcount = 0
    # `with` guarantees the file is closed even if a line raises.
    with open(filename, 'r') as filedata:
        for line in filedata:
            linecount += 1
            if linecount > rowstocheck:
                break
            try:
                data = json.loads(line)
                bad = False
                if not data["properties"]["number"] or not data["properties"]["street"]:
                    emptyaddresscount += 1
                    bad = True
                if not data["geometry"] or not data["geometry"]["coordinates"][0] or not data["geometry"]["coordinates"][1]:
                    emptygeometrycount += 1
                    bad = True
                if not data["properties"]["city"] and not data["properties"]["postcode"]: # Flag missing city unless postal code exists, because city can probably be filled from that
                    emptycitycount += 1
                    bad = True
                if not data["properties"]["postcode"]:
                    emptyzipcount += 1
                    bad = True
                if not bad:
                    okcount += 1
            except Exception:
                traceback.print_exc()
                print("Error encountered while processing", filename, "at", line)
            # Live progress line, overwritten in place via \r.
            print(filename, ": OK:", str(okcount), "Bad: geometry:", str(emptygeometrycount), "address:", str(emptyaddresscount), "city:", str(emptycitycount), "zip:", str(emptyzipcount), " ", end="\r", flush=True)
    bad = False
    # BUG FIX: an empty file used to raise ZeroDivisionError here; the rate
    # checks only make sense when at least one line was read.
    if linecount > 0:
        if emptygeometrycount / linecount > .25:
            emptygeometrylist.append(filename)
            bad = True
        if emptyaddresscount / linecount > .67:
            emptyaddresslist.append(filename)
            bad = True
        if emptycitycount / linecount > .67:
            nocitylist.append(filename)
            bad = True
        if emptyzipcount / linecount > .75:
            noziplist.append(filename)
            bad = True
    if emptyaddresscount >= (linecount - 10): # Allow a couple not-fully-empty addresses, otherwise some broken ones won't be reported
        totallyemptylist.append(filename)
        bad = True
    if not bad:
        oklist.append(filename)
# Command-line interface: take one or more GeoJSON files, check each, then
# print a grouped problem report.
parser = ArgumentParser(
    description="Check OpenAddresses GeoJSON files and report on any problems found."
)
parser.add_argument(
    "source",
    help="File(s) to check.",
    nargs='+'
)

def _printReportSection(label, filenames):
    """Print one report heading followed by its (possibly empty) file list."""
    print(" " + label + ":")
    for entry in filenames:
        print("    ", entry)

if __name__ == "__main__":
    args = parser.parse_args()
    print("Checking " + str(len(args.source)) + " OpenAddresses data files.")
    for target in args.source:
        checkGeojson(target)
        print(" ")
    print()
    print("== Report ==")
    _printReportSection("Files missing geometry", emptygeometrylist)
    _printReportSection("Files missing street address", emptyaddresslist)
    _printReportSection("Files missing city", nocitylist)
    _printReportSection("Files missing postal code", noziplist)
    _printReportSection("Files missing all street addresses", totallyemptylist)

63
downloadoa.py Executable file
View File

@ -0,0 +1,63 @@
#!/usr/bin/python3
import gzip
import shutil
from argparse import ArgumentParser
import requests, tempfile, os, pathlib
# Cache of the OpenAddresses source catalog, keyed by source ID; filled on
# first use by getSourceList().
sourceList = {}

def getSourceList():
    """Fetch the OpenAddresses batch API catalog (once) and cache it.

    Only "addresses"-layer entries are kept; when a source ID appears more
    than once, the entry with the newest "updated" timestamp wins.
    Returns the cached {source_id: entry} dict.
    """
    global sourceList
    if not sourceList:
        print("Fetching sources list")
        catalog = requests.get(
            "https://batch.openaddresses.io/api/data"
        ).json()
        for entry in catalog:
            if entry["layer"] != "addresses":
                continue
            previous = sourceList.get(entry["source"])
            if previous is None or entry["updated"] > previous["updated"]:
                sourceList[entry["source"]] = entry
    return sourceList
def downloadSources(id, outfolder):
    """Download every OpenAddresses source whose ID starts with `id`.

    Each matching source's gzipped GeoJSON is streamed to a temp file, then
    decompressed into `outfolder/<source>-addresses-<name>.geojson`.
    Files already on disk are skipped.

    Raises requests.HTTPError if a download URL returns an error status.
    """
    # Hoisted: the catalog is built once instead of being re-fetched from the
    # cache twice per loop iteration.
    sources = getSourceList()
    for sourceName, s in sources.items():
        if s["source"].startswith(id):
            outfilename = outfolder + "/" + s["source"] + "-addresses-" + s["name"] + ".geojson"
            outfoldername = os.path.dirname(outfilename)
            if os.path.isfile(outfilename):
                print("Skipping " + s["source"] + ", already on disk.")
                continue
            print("Downloading " + s["source"])
            gzdl = requests.get("https://v2.openaddresses.io/batch-prod/job/" + str(s["job"]) + "/source.geojson.gz", stream=True)
            # BUG FIX: a failed download (e.g. 404) used to be written out and
            # only exploded later inside gzip with a confusing error.
            gzdl.raise_for_status()
            tmp = tempfile.NamedTemporaryFile()
            with open(tmp.name, 'wb') as tf:
                for chunk in gzdl.iter_content(chunk_size=16*1024):
                    tf.write(chunk)
            pathlib.Path(outfoldername).mkdir(parents=True, exist_ok=True)
            with gzip.open(tmp.name) as gzf, open(outfilename, 'wb') as outf:
                shutil.copyfileobj(gzf, outf)
# Command-line interface: a source ID (or ID prefix) plus an output folder.
parser = ArgumentParser(
    description="Download address data from OpenAddresses.io"
)
parser.add_argument(
    "source",
    help="Source dataset ID, or partial ID. For example: us/al/ will download all Alabama datasets, us/mt/statewide will download the Montana statewide dataset.",
)
parser.add_argument(
    "outfolder",
    help="Output folder",
)

if __name__ == "__main__":
    cli = parser.parse_args()
    downloadSources(cli.source, cli.outfolder)

780
main.py Executable file
View File

@ -0,0 +1,780 @@
#!/usr/bin/python3
# Print a startup banner before the heavy imports below run (pandas/dask can
# take several seconds to load), so the user sees immediate feedback.
if __name__ == "__main__":
    print("Address Database Builder 2025")
    print("Starting up...")
import argparse, csv, zipfile, gzip, os, re, json, traceback, sys, multiprocessing
import concurrent.futures
from collections import deque
import pandas as pd
import dask.dataframe as dd
import gc
from multiprocessing import get_context
import sqlite3
from src.addressfunctions import normalizeAddress
from src.constants import ValidationException
import src.config
# Worker-pool sizing: one process per CPU core (overridable via --cpu), with
# at most twice that many chunks queued at once to bound memory use.
maxthreads = multiprocessing.cpu_count()
MAX_IN_FLIGHT = maxthreads * 2
# NOTE(review): presumably set to keep OpenBLAS from pinning/spawning its own
# threads inside worker processes — confirm against OpenBLAS docs.
os.environ["OPENBLAS_MAIN_FREE"] = "1"
writelock = multiprocessing.Lock()  # serializes appends to the shared output CSV
badcount = 0  # addresses that failed to parse or normalize
skippedcount = 0  # addresses skipped by --ignorestates / --onlystates
countrycode = "US"  # two-letter country code, overridden by --country
def init_worker(cfg: src.config.AppConfig):
    """ProcessPoolExecutor initializer: install the shared AppConfig in each worker process."""
    src.config.set_config(cfg)
def fixLatLon(filepath):
    """Detect and repair flipped latitude/longitude pairs in an ingested CSV.

    Reads `filepath`, swaps latitude/longitude for rows where latitude is out
    of physical range or (for US data, excluding VI/AK/HI/PR) outside what
    appear to be contiguous-US bounds, and appends the corrected frame to
    `filepath + ".coordfix.csv"`.
    """
    cfg = src.config.get_config()
    print("Repairing flipped latitude/longitude pairs in " + filepath)
    fixedcount = 0
    df = pd.read_csv(filepath, keep_default_na=False, dtype="str")
    # States/territories exempt from the bounding-box checks below.
    skipstates = ("VI", "AK", "HI", "PR")
    for index, row in df.iterrows():
        # `row` is a copy: these conversions only feed the comparisons; actual
        # fixes are written back through df.at[...].
        row.latitude = float(row.latitude)
        row.longitude = float(row.longitude)
        if row.latitude < -90 or row.latitude > 90:
            # Latitude outside the physically possible range: definitely swapped.
            df.at[index, "latitude"], df.at[index, "longitude"] = row.longitude, row.latitude
            fixedcount = fixedcount + 1
        elif cfg.countryCode == "US" and row.state not in skipstates and (row.longitude < -171.791110603 or row.longitude > -66.96466):
            # NOTE(review): constants look like US longitude extremes — confirm source.
            df.at[index, "latitude"], df.at[index, "longitude"] = row.longitude, row.latitude
            fixedcount = fixedcount + 1
        elif cfg.countryCode == "US" and row.state not in skipstates and (row.latitude < 18.91619 or row.latitude > 71.3577635769):
            # NOTE(review): constants look like US latitude extremes — confirm source.
            df.at[index, "latitude"], df.at[index, "longitude"] = row.longitude, row.latitude
            fixedcount = fixedcount + 1
    # Appends; header only written if the .coordfix.csv doesn't exist yet.
    df.to_csv(filepath + ".coordfix.csv", mode='a', index=False, header=not os.path.exists(filepath + ".coordfix.csv"))
    print("\nDone flipping " + filepath + "! Fixed " + str(fixedcount) + " records.")
def normalize(number, street, street2, city, state, zipcode, latitude, longitude, zipprefix = False, plus4="", county = False):
    """Normalize one address, retrying with progressively looser inputs.

    Calls normalizeAddress(); when the result still lacks a 5-digit ZIP or a
    4-digit +4, retries with a digits-only house number, then without the
    city, and finally — only when libpostal mode is enabled — via
    advancedNormalize(). Returns the best normalized address dict obtained.
    """
    cfg = src.config.get_config()
    street1 = street
    if len(city) > 4 and street1.endswith(" " + city):
        # City name leaked into street field (Albany County Wyoming, for one)
        street1 = street1.removesuffix(" " + city)
    addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
    if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
        # Try removing letters from address numbers, and ignore city field
        addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        # If that didn't work, try instead stripping the city name because it might be wrong
        if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
            addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        # Use libpostal to analyze address deeper.
        # BUG FIX: this condition was `cfg.advancedMode and A or B`; because
        # `and` binds tighter than `or`, a missing +4 entered this branch even
        # with libpostal disabled, discarding the addrstrip fallback above.
        if cfg.advancedMode and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
            try:
                addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
            except Exception as e:
                # Best effort: fall through to the plain re-normalization below.
                pass
            # Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
            if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
                addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
        else:
            addr = addrstrip
    return addr
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
    """Normalize one chunk of this script's own CSV format and append it to the output CSV.

    Runs inside a worker process. Rows in `ignorestates` (or outside
    `keeponlystates` when that list is non-empty) are skipped; the rest are
    normalized and appended to `outfilename` under `writelock`.

    NOTE(review): badcount/skippedcount are per-process globals, so totals
    printed by the parent only reflect the parent's own counters — confirm
    whether cross-process totals were intended.
    """
    global badcount, skippedcount, writelock
    cfg = src.config.get_config()
    data = []
    print(" " + str(chunkcount) + " ", end="\r", flush=True)  # progress: row offset of this chunk
    for index, row in chunk.iterrows():
        if row.state in ignorestates:
            skippedcount = skippedcount + 1
            continue
        if keeponlystates != [] and row.state not in keeponlystates:
            skippedcount = skippedcount + 1
            continue
        try:
            # Rows that already carry a 4-digit +4 are assumed normalized and
            # passed through unchanged, unless --noskip4 was given.
            if not cfg.noSkip4 and len(row.plus4 or "") == 4:
                addr = {
                    "number": row.number,
                    "street": row.street,
                    "unit": row.street2,
                    "city": row.city,
                    "state": row.state,
                    "zip": row.zip,
                    "plus4": row.plus4,
                    "latitude": round(float(row.latitude),7),
                    "longitude": round(float(row.longitude), 7)
                }
            else:
                addr = normalize(row.number, row.street, row.street2, row.city, row.state, row.zip, round(float(row.latitude),7), round(float(row.longitude), 7), False, row.plus4)
            # Normalization may reassign the state, so re-check the ignore list.
            if addr["state"] in ignorestates:
                skippedcount = skippedcount + 1
                continue
            data.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], row.source])
        except ValidationException as e:
            # Failed validation: counted, not logged.
            badcount = badcount + 1
        except Exception as e:
            print("W: Couldn't ingest address:")
            print(row)
            traceback.print_exc()
            badcount = badcount + 1
    out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    # Serialize appends across workers; header only written on first creation.
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
def importOwnFile(filename, outfilename, ignorestates, keeponlystates):
    """Re-process a CSV previously produced by this script, in parallel chunks.

    Streams `filename` in 1000-row chunks, fanning each chunk out to a
    process pool (processOwnChunk) with MAX_IN_FLIGHT backpressure, then
    prints a summary.

    NOTE(review): `cfg` below is the module-level config created under
    __main__, not a local — this function only works from the CLI path.
    """
    global badcount, skippedcount, writelock
    print("Processing addresses from " + filename)
    columns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    file = filename
    chunkcount = 0
    badcount = 0
    skippedcount = 0
    chunksize = 1000
    in_flight = set()  # submitted futures not yet known to be finished
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, max_tasks_per_child=100, initializer=init_worker, initargs=(cfg,)) as executor:
        for chunk in pd.read_csv(file, chunksize=chunksize, usecols=columns, keep_default_na=False, dtype={
                "number":"string","street":"string",
                "street2":"string","city":"string",
                "state":"category", "zip":"string",
                "plus4": "string",
                "latitude":"float32", "longitude":"float32",
                "source":"category"}, dtype_backend="pyarrow"):
            # Backpressure: cap queued chunks to bound memory use.
            while len(in_flight) >= MAX_IN_FLIGHT:
                done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                for fut in done:
                    fut.result()  # re-raise any worker exception
            fut = executor.submit(processOwnChunk, chunk, chunkcount * chunksize, outfilename, ignorestates, keeponlystates)
            in_flight.add(fut)
            chunkcount = chunkcount + 1
        # Drain the remaining futures before the pool shuts down.
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    print("\nDone processing! Parsed " + str(chunkcount) + " chunks.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
def processNadChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
    """Normalize one chunk of National Address Database rows and append them to the output CSV.

    Runs inside a worker process. State filters are applied both on the raw
    row and again after normalization (some sources mislabel the state but
    the ZIP corrects it). Output rows are appended under `writelock`.
    """
    global badcount, skippedcount, writelock
    print(" " + str(chunkcount) + " ", end="\r", flush=True)  # progress: row offset of this chunk
    data = []
    for index, row in chunk.iterrows():
        if row.State.upper() in ignorestates:
            skippedcount = skippedcount + 1
            continue
        if keeponlystates != [] and row.State.upper() not in keeponlystates:
            skippedcount = skippedcount + 1
            continue
        try:
            # Pick the best available locality name: incorporated municipality,
            # then postal city, then unincorporated community.
            town = row.Inc_Muni
            if town == "Unincorporated":
                town = ""
            if not town:
                town = row.Post_City
            if not town:
                town = row.Uninc_Comm
            # BUG FIX: previously passed row.Inc_Muni here, which discarded
            # the town fallback chain computed above.
            addr = normalize(row.AddNo_Full, row.StNam_Full, row.SubAddress, town, row.State, row.Zip_Code, round(float(row.Latitude),7), round(float(row.Longitude), 7))
            if addr["state"] in ignorestates: # For example, AR's data claims to have MO addresses but the ZIP says they're in AR, so the first pass of this won't catch those
                skippedcount = skippedcount + 1
                continue
            # Label output rows as e.g. "NAD Montana" rather than "State of Montana".
            source = row.NAD_Source
            source = source.replace("State of ", "")
            source = "NAD " + source
            data.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
        except ValidationException as e:
            badcount = badcount + 1
        except Exception as e:
            print("W: Couldn't ingest address:")
            print(row)
            traceback.print_exc()
            badcount = badcount + 1
    out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
def importNadFile(filename, outfilename, ignorestates, keeponlystates, startatline):
    """Import a National Address Database CSV (or NAD zip archive) in parallel chunks.

    Parameters:
        filename: the NAD .txt/.csv, or the distribution .zip (the first
            TXT/NAD*.TXT member inside the archive is read).
        outfilename: CSV file results are appended to.
        ignorestates / keeponlystates: per-row state filters.
        startatline: number of data rows to skip for resuming a run.
    """
    global skippedcount, badcount
    print("Importing National Address Database addresses from " + filename)
    if startatline > 0:
        print("Skipping to line number " + str(startatline))
    # Columns consumed from the NAD schema.
    columns = [
        "AddNo_Full",
        "StNam_Full",
        "St_PreMod",
        "St_PreDir",
        "St_Name",
        "SubAddress",
        "Inc_Muni",
        "Post_City",
        "Uninc_Comm",
        "Urbnztn_PR",
        "State",
        "Zip_Code",
        "UUID",
        "Longitude",
        "Latitude",
        "DateUpdate",
        "NAD_Source",
    ]
    file = filename
    if filename.endswith(".zip"):
        # Stream the data member straight out of the archive without extracting.
        zf = zipfile.ZipFile(filename, mode="r")
        zipFiles = zf.namelist()
        for fname in zipFiles:
            if fname.upper().startswith("TXT/NAD") and fname.upper().endswith(".TXT"):
                file = zf.open(fname, mode="r", force_zip64=True)
                break
    chunkcount = 0
    chunksize = 1000
    in_flight = set()  # submitted futures not yet known to be finished
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, mp_context=get_context("spawn"), max_tasks_per_child=100, initializer=init_worker, initargs=(cfg,)) as executor:
        # skiprows keeps row 0 (the header) and skips data rows 1..startatline.
        for chunk in pd.read_csv(file, chunksize=chunksize, header=0, skiprows=lambda i: 1 <= i <= startatline, usecols=columns, keep_default_na=False, dtype={
                "State":"category","NAD_Source":"category",
                "Zip_Code":"string","UUID":"string",
                "AddNo_Full":"string","StNam_Full":"string","St_PreMod":"string",
                "St_PreDir":"string","St_Name":"string","SubAddress":"string",
                "Inc_Muni":"string","Post_City":"string","Uninc_Comm":"string",
                "Urbnztn_PR":"string","Longitude":"float32","Latitude":"float32",
                "DateUpdate":"string"}, dtype_backend="pyarrow"):
            # Backpressure: cap queued chunks to bound memory use.
            while len(in_flight) >= MAX_IN_FLIGHT:
                done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                for fut in done:
                    fut.result()  # re-raise any worker exception
            fut = executor.submit(processNadChunk, chunk, chunkcount * chunksize, outfilename, ignorestates, keeponlystates)
            in_flight.add(fut)
            chunkcount = chunkcount + 1
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    print("\nDone importing NAD! Processed " + str(chunkcount) + " chunks of " + str(chunksize) + " rows.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
def processOpenAddressRows(rows, startindex, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county = False):
    """Normalize a batch of OpenAddresses GeoJSON lines and append them to the output CSV.

    Runs inside a worker process.

    Parameters:
        rows: raw line-delimited GeoJSON features (one JSON object per line).
        startindex: line offset of this batch (progress display only).
        outfilename: CSV file appended to under `writelock`.
        ignorestates: states to drop, checked before and after normalization.
        source: source label for output rows; if falsy, derived as "OA/<state>".
        stateOverride: forces the state field when set.
        zipprefix: passed through to normalize() for faster ZIP lookups.
        citySuggestion: city to assume when a row has no city, or False.
        county: county name hint for ZIP lookup, or False.
    """
    global badcount, skippedcount, writelock
    print(" " + str(startindex) + " ", end="\r", flush=True)
    linecount = 0
    outdata = []
    emptylinecount = 0
    for line in rows:
        linecount = linecount + 1
        try:
            data = json.loads(line)
            # Tally empty rows first so the empty-chunk warning below counts
            # rows that are skipped later too.
            if not data["properties"]["number"] and not data["properties"]["street"]:
                emptylinecount = emptylinecount + 1
            if not data["geometry"] or not data["geometry"]["coordinates"][0] or not data["geometry"]["coordinates"][1]:
                emptylinecount = emptylinecount + 1
            state = data["properties"]["region"].upper()
            city = data["properties"]["city"].upper().strip()
            if stateOverride:
                state = stateOverride
            if state in ignorestates:
                skippedcount = skippedcount + 1
                continue
            if data["geometry"] is None:
                badcount = badcount + 1
                continue
            if not data["properties"]["number"] or not data["properties"]["street"] or data["properties"]["number"] == "0":
                badcount = badcount + 1
                continue
            if citySuggestion and not city:
                city = citySuggestion
            if source == "OA/hawaii" and re.match(r"^[1-9][1-9][0-9]{4}", data["properties"]["number"]):
                # Source is broken/missing, and the last good version has the house numbers without dashes
                # Hawaii has a specific and unique address numbering system
                data["properties"]["number"] = data["properties"]["number"][:2] + "-" + data["properties"]["number"][2:]
            addr = normalize(data["properties"]["number"], data["properties"]["street"], data["properties"]["unit"], city, state, data["properties"]["postcode"], data["geometry"]["coordinates"][1], data["geometry"]["coordinates"][0], zipprefix, "", county)
            if addr["state"] in ignorestates:
                skippedcount = skippedcount + 1
                continue
            if addr["street"] == "":
                badcount = badcount + 1
                continue
            if not source:
                # NOTE(review): once set, this sticks for the rest of the batch,
                # so later rows inherit the first normalized row's state label —
                # confirm that's intended.
                source = "OA/"+addr["state"]
            outdata.append([addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['plus4'], addr['latitude'], addr['longitude'], source])
        except ValidationException as e:
            badcount = badcount + 1
        except Exception as e:
            traceback.print_exc()
            print("Error encountered while processing", line)
            badcount = badcount + 1
    if linecount > 0 and emptylinecount / linecount > .95:
        print("\nWarning: Empty chunk! " + str(emptylinecount) + " of " + str(linecount) + " rows had no address.")
    out = pd.DataFrame(data=outdata, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    with writelock:
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
    gc.collect()
def importOpenAddressFile(filepath, outfilename, ignorestates, source, stateOverride, zipprefix):
    """Import an OpenAddresses GeoJSON (optionally gzipped) file in parallel batches.

    Derives the source label, an optional city suggestion, and an optional
    county hint from the filename, then streams 1000-line batches to
    processOpenAddressRows() via a process pool with MAX_IN_FLIGHT backpressure.
    """
    global badcount, skippedcount
    cfg = src.config.get_config()
    print("Importing OpenAddresses data from " + filepath)
    chunksize = 1000
    linecount = 0
    if stateOverride:
        stateOverride = stateOverride.strip().upper()
    file = filepath
    if filepath.endswith(".gz"):
        file = gzip.open(filepath, 'rb')
    else:
        file = open(file, 'r')
    county = False
    if not source or source == "":
        # Filename convention: <source>-addresses-<name>.geojson
        source = "OA/"+filepath.split("/")[-1].split("-")[0]
    if source.startswith("OA/statewide"):
        # "statewide" files don't name their state; use the override or defer
        # to per-row detection (source=False).
        if stateOverride:
            source = source.replace("statewide", stateOverride)
        else:
            source = False
    citySuggestion = False
    # NOTE(review): this only derives a suggestion when cfg.citySuggestion is
    # falsy, and a user-supplied --city never reaches the workers here —
    # confirm whether the condition is inverted.
    if not cfg.citySuggestion and filepath.split("/")[-1].startswith("city_of_"):
        # Set city suggestion using filename
        citySuggestion = re.sub(r'\d+', '', filepath.split("/")[-1].split("-")[0].replace("city_of_", "").replace("_", " ").upper().strip())
    if filepath.split("/")[-1].endswith("-addresses-county.geojson"):
        county = filepath.split("/")[-1].split("-")[0].replace("_", " ").upper().strip()
        print("Detected county from filename: " + county + ", will use for ZIP Code hinting")
    lines = []
    in_flight = set()  # submitted futures not yet known to be finished
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads, mp_context=get_context("spawn"), max_tasks_per_child=1000, initializer=init_worker, initargs=(cfg,)) as executor:
        for line in file:
            lines.append(line)
            linecount = linecount + 1
            if len(lines) >= chunksize:
                # Backpressure: cap queued batches to bound memory use.
                while len(in_flight) >= MAX_IN_FLIGHT:
                    done, in_flight = concurrent.futures.wait(in_flight, return_when=concurrent.futures.FIRST_COMPLETED)
                    for fut in done:
                        fut.result()  # re-raise any worker exception
                fut = executor.submit(processOpenAddressRows, lines, linecount, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county)
                in_flight.add(fut)
                lines = []
        # Submit the final partial batch.
        fut = executor.submit(processOpenAddressRows, lines, linecount, outfilename, ignorestates, source, stateOverride, zipprefix, citySuggestion, county)
        in_flight.add(fut)
        for fut in concurrent.futures.as_completed(in_flight):
            fut.result()
    file.close()
    print("\nDone importing OpenAddresses! Processed " + str(linecount) + " entries.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    if ignorestates:
        print("There were " + str(skippedcount) + " addresses ignored due to your --ignorestates setting.")
    print("Saved to output file " + outfilename)
    return
def importOSMFile(filename, outfilename):
    """
    Import addresses from an OSM Overpass API CSV export (tab-separated).

    Overpass API query for data input (replace name=Montana with the region you want):
    [out:csv(::"lat", ::"lon", "addr:housenumber", "addr:street", "addr:city", "addr:state", "addr:postcode")][timeout:120];
    area["name"="Montana"]->.boundaryarea;
    node["addr:housenumber"]["addr:street"](area.boundaryarea);
    out;
    way["addr:housenumber"]["addr:street"](area.boundaryarea);
    out center;
    relation["addr:housenumber"]["addr:street"](area.boundaryarea);
    out center;
    """
    print("Importing OSM Overpass data from " + filename)
    tsvcolumns = [
        "@lat",
        "@lon",
        "addr:housenumber",
        "addr:street",
        "addr:city",
        "addr:state",
        "addr:postcode"
    ]
    outcolumns = ["number","street","street2","city","state","zip","plus4","latitude","longitude","source"]
    chunkcount = 0
    badcount = 0
    skippedcount = 0
    source = "OpenStreetMap.org. License: ODbL"
    reader = pd.read_csv(filename, sep='\t', chunksize=100, usecols=tsvcolumns, keep_default_na=False, dtype="str")
    for chunkindex, frame in enumerate(reader):
        print(" " + str(chunkindex * 100) + " ", end="\r", flush=True)
        rows = []
        for _, record in frame.iterrows():
            try:
                parsed = normalize(record["addr:housenumber"], record["addr:street"], "", record["addr:city"], record["addr:state"], record["addr:postcode"], record["@lat"], record["@lon"])
                rows.append([parsed['number'], parsed['street'], parsed['unit'], parsed['city'], parsed['state'], parsed['zip'], parsed['plus4'], parsed['latitude'], parsed['longitude'], source])
            except ValidationException:
                badcount += 1
            except Exception:
                print("W: Couldn't ingest address:")
                print(record)
                traceback.print_exc()
                badcount += 1
        # Append this chunk; the header is only written on file creation.
        pd.DataFrame(data=rows, columns=outcolumns).to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=outcolumns)
        chunkcount = chunkindex + 1
    print("\nDone importing OSM! Processed " + str(chunkcount) + " chunks.")
    print("There were " + str(badcount) + " unprocessable addresses.")
    print("Saved to output file " + outfilename)
def importNARFile(filename, outfilename):
    """Import the Statistics Canada National Address Register zip into the output CSV.

    Lazily joins each province's Addresses/Address_*.csv members to their
    Locations/Location_*.csv members on LOC_GUID (via dask, read straight out
    of the zip), then streams the merged rows to `outfilename`. Rows with no
    street are emitted as "PO BOX" records when possible, otherwise skipped.
    """
    print("Importing Statistics Canada data from " + filename)
    zf = zipfile.ZipFile(filename, mode="r")
    zipFiles = zf.namelist()
    locationFileList = {}
    addressFileList = {}
    # Statistics Canada two-digit province/territory codes.
    provinceCodes = [10,11,12,13,24,35,46,47,48,59,60,61,62]
    for c in provinceCodes:
        addressFileList[str(c)] = []
        locationFileList[str(c)] = []
    # Bucket archive members by the province code embedded in the filename.
    for fname in zipFiles:
        if fname.startswith("Addresses/Address_") and fname.endswith(".csv"):
            number = fname.replace("Addresses/Address_", "").replace(".csv", "").split("_")[0]
            addressFileList[number].append(fname)
        elif fname.startswith("Locations/Location_") and fname.endswith(".csv"):
            number = fname.replace("Locations/Location_", "").replace(".csv", "").split("_")[0]
            locationFileList[number].append(fname)
    print("\nMerging address and location tables...")
    mergecount = 0
    dataframes = []
    addrcols = ["LOC_GUID","APT_NO_LABEL","CIVIC_NO","CIVIC_NO_SUFFIX","MAIL_STREET_NAME","MAIL_STREET_TYPE","MAIL_STREET_DIR","MAIL_MUN_NAME","MAIL_PROV_ABVN","MAIL_POSTAL_CODE","BU_N_CIVIC_ADD"]
    loccols = ["LOC_GUID","BG_LATITUDE","BG_LONGITUDE"]
    for provinceId in provinceCodes:
        print(" " + str(mergecount+1) + " ", end="\r", flush=True)
        # Lazy dask reads directly from the zip; nothing is loaded yet.
        readaf = map(lambda addrFilename: dd.read_csv("zip://"+addrFilename, storage_options={'fo': filename}, usecols=addrcols, keep_default_na=False, dtype="str"), addressFileList[str(provinceId)])
        readlf = map(lambda locationFilename: dd.read_csv("zip://"+locationFilename, storage_options={'fo': filename}, usecols=loccols, keep_default_na=False, dtype="str"), locationFileList[str(provinceId)])
        addressFrame = dd.concat(list(readaf), ignore_index=False)
        locationFrame = dd.concat(list(readlf), ignore_index=False)
        # Join addresses to their coordinates; evaluated during iterrows below.
        dataframes.append(dd.merge(addressFrame, locationFrame, on=["LOC_GUID"]))
        mergecount = mergecount + 1
    print("\nProcessing addresses...")
    file = filename
    alladdrcount = 0
    skippedcount = 0
    source = "StatsCan NAR"
    provinceIndex = 0
    for df in dataframes:
        print("\nProcessing province ID " + str(provinceCodes[provinceIndex]))
        data = []
        addrcount = 0
        for index, row in df.iterrows():
            if (addrcount % 100 == 0):
                print(" " + str(addrcount) + " ", end="\r", flush=True)
            # Civic number plus optional suffix (e.g. "123A"); street built
            # from name + type + direction.
            number = ("".join(filter(None, [row["CIVIC_NO"], row["CIVIC_NO_SUFFIX"]]))).strip().upper()
            street = (" ".join(filter(None, [row["MAIL_STREET_NAME"], row["MAIL_STREET_TYPE"], row["MAIL_STREET_DIR"]]))).strip().upper()
            apt = row["APT_NO_LABEL"].strip().upper()
            if street == "":
                # PO BOX probably
                if row["BU_N_CIVIC_ADD"].startswith("PO BOX "):
                    data.append([row["BU_N_CIVIC_ADD"].replace("PO BOX ", "").strip(), "PO BOX", "", row["MAIL_MUN_NAME"], row["MAIL_PROV_ABVN"], row["MAIL_POSTAL_CODE"], "", row["BG_LATITUDE"], row["BG_LONGITUDE"], source])
                else:
                    skippedcount = skippedcount + 1
            else:
                data.append([number, street, apt, row["MAIL_MUN_NAME"], row["MAIL_PROV_ABVN"], row["MAIL_POSTAL_CODE"], "", row["BG_LATITUDE"], row["BG_LONGITUDE"], source])
            addrcount = addrcount + 1
            if len(data) >= 1000: # Dump to file so we don't use tons of RAM
                out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
                out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
                data = []
        # Flush the remainder for this province.
        out = pd.DataFrame(data=data, columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        out.to_csv(outfilename, mode='a', index=False, header=not os.path.exists(outfilename), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        alladdrcount = alladdrcount + addrcount
        provinceIndex = provinceIndex + 1
    print("\nDone importing NAR! Processed " + str(alladdrcount) + " addresses.")
    print("Skipped " + str(skippedcount) + " invalid mailing addresses.")
    print("Saved to output file " + outfilename)
def removeDupes(filepath):
    """Remove duplicate and incomplete addresses from an ingested CSV.

    Streams `filepath` in 20M-row chunks, drops rows missing any required
    field, keeps the +4-bearing copy of each duplicate, and appends results
    to `filepath + ".dedup.csv"`. Only catches duplicates within a chunk.
    """
    print("Removing duplicate and incomplete addresses from " + filepath)
    chunkcount = 0
    chunksize = 20000000
    # BUG FIX: "plus4" was missing from usecols, so sort_values(by="plus4")
    # below raised KeyError; it's now read and written through, so the kept
    # +4 actually survives deduplication.
    for chunk in pd.read_csv(filepath, chunksize=chunksize, keep_default_na=False, dtype="str", usecols=["number", "street", "street2", "city", "state", "zip", "plus4", "latitude", "longitude", "source"]):
        print(".", end="", flush=True)
        chunk.replace('', None, inplace=True)
        # Drop rows missing any required field (street2/plus4 may be empty).
        chunk.dropna(subset=['zip','number','street','city','state','latitude','longitude'], inplace=True)
        chunk.sort_values(by="plus4", ascending=False, inplace=True, na_position="last") # Make sure the address duplicate with a +4 is kept
        chunk.drop_duplicates(subset=["number", "street", "street2", "city", "state", "zip"], keep="first", inplace=True)
        chunk.to_csv(filepath + ".dedup.csv", mode='a', index=False, header=not os.path.exists(filepath + ".dedup.csv"), columns=["number","street","street2","city","state","zip","plus4","latitude","longitude","source"])
        chunkcount = chunkcount + 1
    print("\nDone removing duplicates from " + filepath + "! Processed " + str(chunkcount) + " chunks of " + str(chunksize) + " records.")
def tosqlite(addressfile, dbfile):
    """Load an ingested address CSV (optionally gzipped) into a SQLite database.

    Creates the `addresses` table and its indexes if needed, then streams the
    CSV in 5000-row chunks through a staging table (`addresses_temp`) so that
    `INSERT OR IGNORE` enforces the UNIQUE constraint without aborting the
    whole batch on a duplicate.

    Returns the number of rows inserted (plus rows deleted by the final
    number="0" cleanup).
    """
    global countrycode
    cfg = src.config.get_config()
    print("\nReading addresses from " + addressfile)
    file = addressfile
    if addressfile.endswith(".gz"):
        file = gzip.open(addressfile, 'rb')
    else:
        file = open(addressfile, 'r')
    connection = sqlite3.connect(dbfile)
    cursor = connection.cursor()
    # Destination table; the UNIQUE clause is what deduplicates on insert.
    cursor.execute("""CREATE TABLE IF NOT EXISTS `addresses` (
        `zipcode` VARCHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        `latitude` DECIMAL ( 8 , 6 ) NOT NULL,
        `longitude` DECIMAL( 9 , 6 ) NOT NULL,
        `source` VARCHAR( 40 ),
        UNIQUE (zipcode, number, street, street2, country)
    )""")
    # Staging table is rebuilt fresh each run (no UNIQUE constraint here).
    cursor.execute("DROP TABLE IF EXISTS `addresses_temp`")
    cursor.execute("""CREATE TABLE IF NOT EXISTS `addresses_temp` (
        `zipcode` CHAR ( 6 ) NOT NULL,
        `number` VARCHAR ( 30 ) NOT NULL,
        `street` VARCHAR ( 200 ) NOT NULL,
        `street2` VARCHAR ( 20 ),
        `city` VARCHAR ( 50 ) NOT NULL,
        `state` CHAR ( 2 ) NOT NULL,
        `plus4` CHAR ( 4 ),
        `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
        `latitude` DECIMAL ( 8 , 6 ) NOT NULL,
        `longitude` DECIMAL( 9 , 6 ) NOT NULL,
        `source` VARCHAR( 40 )
    )""")
    # Lookup indexes for the common query patterns.
    cursor.execute("""CREATE INDEX IF NOT EXISTS `latitude_longitude` ON `addresses` (
        `latitude`,
        `longitude`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `number_street` ON `addresses` (
        `number`,
        `street`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `state_city` ON `addresses` (
        `state`,
        `city`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `zipcode_number` ON `addresses` (
        `zipcode`,
        `number`
    )""")
    cursor.execute("""CREATE INDEX IF NOT EXISTS `country` ON `addresses` (
        `country`
    )""")
    chunksize = 5000
    chunkcount = 0
    rowschanged = 0
    columns = ["number","street","street2","city","state","zip","latitude","longitude","source"]
    if cfg.appendPlus4:
        columns.append("plus4")
    for chunk in pd.read_csv(file, chunksize=chunksize, usecols=columns, keep_default_na=False, dtype="str"):
        chunk = chunk.rename(columns={'zip': 'zipcode'})
        chunk.insert(7, "country", countrycode)
        # Replace empty values with NULL
        chunk.replace('', None, inplace=True)
        # Replace null street2 with empty string so the SQLite UNIQUE clause will work
        chunk.fillna({"street2": ""}, inplace=True)
        # Remove null values that aren't allowed
        chunk.dropna(subset=['zipcode','number','street','city','state','latitude','longitude'], inplace=True)
        print(" " + str(chunkcount * chunksize) + " ", end="\r", flush=True)
        # Write chunk to SQLite via the staging table, then merge with
        # INSERT OR IGNORE so UNIQUE violations drop only the duplicate rows.
        cursor.execute("DELETE FROM addresses_temp")
        chunk.to_sql("addresses_temp", connection, if_exists='append', index=False, dtype={
            "zipcode": "CHAR(6)",
            "number": "VARCHAR(30)",
            "street": "VARCHAR(200)",
            "street2": "VARCHAR(20)",
            "city": "VARCHAR(50)",
            "state": "CHAR(2)",
            "plus4": "CHAR(4)",
            "country": "CHAR(2)",
            "latitude": "DECIMAL(8,6)",
            "longitude": "DECIMAL(9,6)",
            "source": "VARCHAR(40)"
        })
        chunkcount = chunkcount + 1
        cursor.execute("INSERT OR IGNORE INTO addresses SELECT * FROM addresses_temp")
        rowschanged = rowschanged + cursor.rowcount
        if chunkcount % 5000 == 0: # VACUUM every 5000 chunks (= 25 million rows read)
            print(" Optimizing database...", end="\r", flush=True)
            connection.executescript("VACUUM")
            print(" ", end="\r", flush=True)
    connection.executescript("DROP TABLE addresses_temp")
    # Cleanup: house number "0" is a placeholder, not a real address.
    cursor.execute("DELETE FROM addresses WHERE number=\"0\"")
    rowschanged = rowschanged + cursor.rowcount
    if rowschanged > 10000000:
        print("\nOptimizing database...")
        connection.executescript("VACUUM; ANALYZE; PRAGMA optimize;")
    print("Done converting to SQLite! Processed " + str(chunkcount) + " chunks (" + str(chunksize) + " records per chunk).")
    print(str(rowschanged) + " records inserted.")
    connection.close()
    print("Saved to output file " + dbfile)
    return rowschanged
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Tools to build a standardized U.S. address database from free source data."
)
parser.add_argument("file", help="Address file(s) to process.", nargs='+')
parser.add_argument("--outputfile", help="Filename to output address data to. If unspecified, set to \"./data/out.csv\" or \"./data/out.sqlite\", depending on options set.")
parser.add_argument(
"--filetype",
help="Type of address file to ingest. nad=National Address Database, oa=OpenAddresses, adb=CSV created by this script, osm=OpenStreetMap Overpass API (see main.py source for query to use), nar=Statistics Canada National Address Register",
choices=["nad", "oa", "adb", "osm", "nar"],
)
parser.add_argument("--state", help="Some OpenAddresses files don't have the state field set. Do it manually here.")
parser.add_argument("--ignorestates", help="Comma-separated two-letter state names. Addresses in these states will be skipped over.")
parser.add_argument("--onlystates", help="Comma-separated two-letter state names. Addresses NOT in these states will be skipped over.")
parser.add_argument("--source", help="Set the data source name (OpenAddresses only). Autodetected based on filename if not set.")
parser.add_argument("--dedup", help="Remove duplicate records in an already-ingested address file, and saves it to folder/file.dedup.csv. Only catches \"nearby\" duplicates; processes 20,000,000 records at a time.", action='store_true')
parser.add_argument("--fixlatlon", help="Detect and repair flipped latitude/longitude pairs in an already-ingested address file, and saves it to [filename].coordfix.csv.", action='store_true')
parser.add_argument("--tosqlite", help="Output to a SQLite3 database. Only works on output CSV data from this script.", action='store_true')
parser.add_argument("--appendplus4", help="Append ZIP+4 data to all records. Fairly slow.", action='store_true')
parser.add_argument("--appendunitlabel", help="Append unit label (APT, STE, etc) to unit numbers using ZIP+4 data.", action='store_true')
parser.add_argument("--zipprefix", help="When searching for a ZIP, assume it starts with the digits provided for faster lookups.")
parser.add_argument("-a", help="Allow appending to existing output file.", action='store_true')
parser.add_argument("--cpu", help="Number of CPU cores to use for parallel processing.")
parser.add_argument("--country", help="Two-letter country code. Default is US.")
parser.add_argument("--city", help="City name to assume when there's no city or postal code in the source data. Useful for OpenAddresses city_of_ data files.")
parser.add_argument("--startat", help="Skip to this line number in the input file (NAD)")
parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")
args = parser.parse_args()
# Defaults for run-wide settings; overridden by the CLI flags below.
startAtLine = 0
appendPlus4 = False
appendUnitLabel = False
useCensusToFillEmptyZIPs = False
countryCode = "US"
citySuggestion = False
advancedMode = False
noSkip4 = False
# --libpostal implies --appendplus4.
if args.libpostal:
    advancedMode = True
    appendPlus4 = True
if advancedMode:
    from src.advancedparsing import advancedNormalize
    print("Using libpostal to work harder on bad addresses.")
if args.appendplus4:
    appendPlus4 = True
if appendPlus4:
    print("Trying to match to ZIP+4 codes for every address!")
if args.noskip4:
    noSkip4 = True
if noSkip4:
    print("Also normalizing records that have a +4 in the input data.")
if args.appendunitlabel:
    appendUnitLabel = True
# store_true flags are already booleans; the old if/else assignment was redundant.
useCensusToFillEmptyZIPs = args.census
if useCensusToFillEmptyZIPs:
    print("Census geocoder enabled! RIP your network maybe")
# State filters: strip anything that isn't a letter or comma, then split on commas.
statesToIgnore = []
if args.ignorestates:
    statesToIgnore = re.sub(r"[^a-zA-Z,]+", "", args.ignorestates.upper()).split(",")
statesToKeep = []
if args.onlystates:
    statesToKeep = re.sub(r"[^a-zA-Z,]+", "", args.onlystates.upper()).split(",")
zipprefix = False
if args.zipprefix:
    zipprefix = args.zipprefix
if args.cpu:
    maxthreads = int(args.cpu)
if args.country:
    if len(args.country) != 2:
        print("Invalid country code " + args.country + ", exiting.")
        sys.exit(1)
    countrycode = args.country.upper()
    countryCode = countrycode
if args.startat and args.startat.isdigit():
    startAtLine = int(args.startat)
if args.city:
    # FIX: str has no toUpper() in Python; this branch raised AttributeError
    # whenever --city was passed. Use upper().
    citySuggestion = args.city.strip().upper()
# Freeze the settings into the shared, immutable AppConfig before any work starts.
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4)
src.config.set_config(cfg)
# Maintenance modes: fan each input file out to its own worker process.
if args.dedup:
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads) as executor:
        for file in args.file:
            executor.submit(removeDupes, file)
elif args.fixlatlon:
    with concurrent.futures.ProcessPoolExecutor(max_workers=maxthreads) as executor:
        for file in args.file:
            executor.submit(fixLatLon, file)
elif args.tosqlite:
outputfile = "./data/out.sqlite"
if args.outputfile:
outputfile = args.outputfile
if args.a != True and os.path.exists(args.outputfile):
print("Output file already exists, exiting!")
sys.exit()
rowschanged = 0
filesimported = 0
for file in args.file:
rowschanged = rowschanged + tosqlite(file, outputfile)
filesimported = filesimported + 1
print("\nDone importing " + str(filesimported) + " files. " + str(rowschanged) + " records inserted.")
elif args.file:
outputfile = "./data/out.csv"
if args.outputfile:
outputfile = args.outputfile
if args.a != True and os.path.exists(args.outputfile):
print("Output file already exists, exiting!")
sys.exit()
if args.filetype == "nad":
for file in args.file:
importNadFile(file, outputfile, statesToIgnore, statesToKeep, startAtLine)
elif args.filetype == "adb":
for file in args.file:
importOwnFile(file, outputfile, statesToIgnore, statesToKeep)
elif args.filetype == "osm":
for file in args.file:
importOSMFile(file, outputfile)
elif args.filetype == "nar":
countrycode = "CA"
for file in args.file:
importNARFile(file, outputfile)
elif args.filetype == "oa":
source = ""
if args.source:
source = args.source
for file in args.file:
importOpenAddressFile(file, outputfile, statesToIgnore, source, args.state, zipprefix)

61
rendermap.py Executable file
View File

@ -0,0 +1,61 @@
#!/usr/bin/python3
from PIL import Image, ImageDraw
from argparse import ArgumentParser
import sqlite3
# Raise PIL's decompression-bomb guard so the full 36000x18000 world canvas
# (360*100 x 180*100) can be created without a DecompressionBombError.
Image.MAX_IMAGE_PIXELS = 648000000 # 100 pixels per degree
def render(filename, outfile, ppd):
    """Plot every address point from the database onto a transparent world
    overlay, save it, then composite it over a basemap image.

    filename -- SQLite database containing an `addresses` table with
                `longitude` and `latitude` columns
    outfile  -- path for the overlay PNG; the composite map is written to
                outfile + ".map.png"
    ppd      -- pixels per degree (canvas is 360*ppd wide, 180*ppd tall)
    """
    print("Creating map overlay")
    pixelsperdegree = ppd
    width = 360 * pixelsperdegree
    height = 180 * pixelsperdegree
    img = Image.new('RGBA', (width, height), (255, 255, 255, 0))
    draw = ImageDraw.Draw(img)
    print("Connecting to database")
    connection = sqlite3.connect(filename)
    c = connection.cursor()
    print("Drawing map overlay")
    c.execute('SELECT longitude, latitude FROM addresses')
    count = 0
    try:
        for (x, y) in c:
            try:
                # FIX: coerce both coordinates up front; previously only the
                # range check converted, so NULL/garbage values blew up in the
                # arithmetic below and were silently swallowed.
                x = float(x)
                y = float(y)
                if y < -90.0 or y > 90.0:
                    # Latitude out of range: the pair is almost certainly flipped.
                    x, y = y, x
                px = round((x + 180) * pixelsperdegree)
                py = height - round((y + 90) * pixelsperdegree)
                draw.point((px, py), fill=(0, 255, 0))
            except (TypeError, ValueError):
                # FIX: narrowed from a bare except — skip unparseable rows but
                # let real errors (and Ctrl-C) propagate to the outer handler.
                pass
            count = count + 1
            if count % 1000 == 0:
                print(" " + str(count) + " ", end="\r", flush=True)
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt: Stopping draw and saving image early")
    connection.close()  # FIX: release the database handle before the slow image work
    print("\nSaving overlay image")
    img.save(outfile, format="PNG")
    print("Rendering map image")
    # Pick the basemap resolution closest to the requested detail level.
    if (pixelsperdegree > 50):
        basemap = Image.open("basemap-100.png")
    else:
        basemap = Image.open("basemap-50.png")
    Image.alpha_composite(basemap.resize((width, height)), img).save(outfile + ".map.png", format="PNG")
    img.close()
    basemap.close()
    print("Done! Saved map to " + outfile)
# Command-line interface for the renderer.
parser = ArgumentParser(description='Draw a map of a database\'s address points.')
parser.add_argument('src_db', help='Input SQLite database with "addresses" table containing "latitude" and "longitude" columns')
parser.add_argument('png_filename', help='Output PNG filename.')
# FIX: a bare positional argument is always required, so the old
# parser.set_defaults(ppd=50) never applied; nargs='?' with a default makes
# ppd genuinely optional while keeping explicit values working.
parser.add_argument('ppd', nargs='?', default=50, help='Pixels per degree of latitude/longitude.', type=int)
if __name__ == "__main__":
    args = parser.parse_args()
    render(args.src_db, args.png_filename, args.ppd)

51
sqlite-from-sqfull.py Executable file
View File

@ -0,0 +1,51 @@
#!/usr/bin/python3
from argparse import ArgumentParser
import sqlite3
def process(filename, outfile):
    """Copy the `addresses` table from a "full" database into a slimmed-down
    SQLite file, rebuilding the table with fewer columns/indexes and
    de-duplicating rows via the UNIQUE constraint + INSERT OR IGNORE.

    filename -- path of the source SQLite database
    outfile  -- path of the destination SQLite database (table is recreated)
    """
    print("Connecting to databases")
    connection = sqlite3.connect(filename)
    c = connection.cursor()
    connection2 = sqlite3.connect(outfile)
    c2 = connection2.cursor()
    print("Creating lite database")
    c2.execute("DROP TABLE IF EXISTS `addresses`")
    c2.execute("""CREATE TABLE `addresses` (
    `zipcode` VARCHAR ( 6 ) NOT NULL,
    `number` VARCHAR ( 30 ) NOT NULL,
    `street` VARCHAR ( 200 ) NOT NULL,
    `street2` VARCHAR ( 20 ),
    `city` VARCHAR ( 50 ) NOT NULL,
    `state` CHAR ( 2 ) NOT NULL,
    `plus4` CHAR ( 4 ),
    `country` CHAR ( 2 ) NOT NULL DEFAULT "US",
    UNIQUE (zipcode, number, street, street2, country)
    )""")
    c2.execute("CREATE INDEX `zipcode_number` ON `addresses` (`zipcode`,`number`)")
    c2.execute("CREATE INDEX `number_street_state` ON `addresses` (`number`,`street`,`state`)")
    print("Copying records")
    c.execute('SELECT zipcode, number, street, street2, city, state, plus4, country FROM addresses')
    count = 0
    for row in c:
        c2.execute("INSERT OR IGNORE INTO addresses(zipcode, number, street, street2, city, state, plus4, country) VALUES (?,?,?,?,?,?,?,?)", row)
        count = count + 1
        if count % 10000 == 0:
            print(" " + str(count) + " ", end="\r", flush=True)
    # FIX: commit the inserts explicitly. Previously persistence relied on the
    # implicit commit that executescript() performs before VACUUM.
    connection2.commit()
    print("\nVacuuming...")
    connection2.executescript("VACUUM")
    # FIX: close both connections so handles are released and the output file
    # is fully flushed.
    connection.close()
    connection2.close()
    print("Done! Copied " + str(count) + " rows to " + outfile + ".")
# FIX: the description previously read "Draw a map of a database's address
# points." — copy-pasted from rendermap.py; it now describes this tool.
parser = ArgumentParser(description='Copy a "full" address database into a slimmed-down SQLite file with fewer columns and indexes.')
parser.add_argument('src_db', help='"Full" SQLite database')
parser.add_argument('dest_db', help='Output database with some columns and indexes removed')
if __name__ == "__main__":
    args = parser.parse_args()
    process(args.src_db, args.dest_db)

0
src/__init__.py Normal file
View File

333
src/addressfunctions.py Normal file
View File

@ -0,0 +1,333 @@
# Created on : Aug 29, 2024, 12:57:40AM
# Author : Skylar Ittner
import re
import pandas as pd
from pythonnet import load
from scourgify import NormalizeAddress, normalize_address_record
from src.zipfunctions import checkZIPCode, getZIP, getZIP4, getCityStateForZIP
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, ValidationException
import src.config
import json
import sys
load("mono")
import clr
clr.AddReference("KellermanSoftware.USPSStandardization")
from KellermanSoftware.USPSStandardization import StandardizationLogic
# Optional proprietary Kellerman standardizer; normalizeAddress() falls back
# to it only when scourgify fails, and skips it entirely when init failed.
standardization = False
try:
    standardization = StandardizationLogic("Netsyms Technologies LLC 203206", "2;770AE30D7A5F217E77C857B29618A6E8DD")
except Exception:
    # FIX: narrowed from a bare except so KeyboardInterrupt/SystemExit still
    # propagate during startup; any library failure just disables the fallback.
    print("Kellerman USPSStandardization failed to initialize, skipping.")
# ZIP code reference table, loaded as strings so leading zeros survive.
zipcodes = pd.read_csv("zip_code_database.csv", keep_default_na=False, dtype="str")
# Regexes applied to the raw street name BEFORE USPS-style standardization,
# in insertion order (see preStandardizeStreet).
# FIX: the key " VLY " appeared twice with the same value; the duplicate was
# removed. Dict content and iteration order are unchanged.
PRE_STANDARDIZATION_STREET_REGEXES = {
    " POINTADDRESS$": "",
    "S U S HWY ": "S US HIGHWAY ",
    "^U S ([0-9]+) HWY": r"US HIGHWAY \1",
    "^U S HWY ": "US HIGHWAY ",
    "[–—−]": "-",  # normalize unicode dashes to the ASCII hyphen
    " PW$": " PKWY",
    " VIS ": " VISTA ",
    " VLY ": " VALLEY ",
    " MTN ": " MOUNTAIN ",
    " CTR ": " CENTER ",
    " CLB ": " CLUB ",
    "HBR ": "HARBOR ",
    "^PNE ": "PINE ",
    "^SPG ": "SPRING ",
    "^M L KING JR ": "MARTIN LUTHER KING JR ",
    "^NONE$": "",
    "^VLY ": "VALLEY ",
    "BEN-DIER": "BEN DIER",
    " ROCK RIV$": ""  # Albany county WY misspelled their own city name and put it in the street field
}
# Regexes applied AFTER USPS-style standardization, in insertion order, to
# catch highway/route spellings the standardizer mangles (see
# postStandardizeStreet).
# FIX: three keys appeared twice with identical values —
# "UNITED STATES HWY ([0-9]+)", "^US ([0-9]+)" and " US HWY ([0-9]+)" —
# the later duplicates were removed. Because a duplicate dict key keeps its
# first insertion position, content and iteration order are unchanged.
POST_STANDARDIZATION_STREET_REGEXES = {
    ", BASE$": "",
    ", BASE CP$": "",
    "UNITED STATES HWY ([0-9]+)": r"US HIGHWAY \1",
    "^U.S. HWY ": "US HIGHWAY ",
    "^U.S. HIGHWAY ": "US HIGHWAY ",
    "^U S ([0-9]+) HWY": r"US HIGHWAY \1",
    "^US ([0-9]+)": r"US HIGHWAY \1",
    " US HWY ([0-9]+)": r" US HIGHWAY \1",
    "UNITED STATES FOREST SERVICE ROAD ([0-9]+) RD$": r"FOREST SERVICE ROAD \1",
    "^IH?([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^INTERSTATE HWY ([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^I ?([0-9]{1,3})(\s|$)": r"INTERSTATE \1\2",
    "^I-([0-9]{1,3})$": r"INTERSTATE \1",
    "^([EW]) I-([0-9]{1,3})$": r"\1 INTERSTATE \2",
    "^HWY FM ([0-9]+)": r"FM \1",
    "^FARM TO MARKET ([0-9]+)": r"FM \1",
    " (HIWAY) ([0-9]+)$": r" HWY \2",
    " (RTE|RT) ([0-9]+)$": r" ROUTE \2",
    " RD ([0-9]+)$": r" ROAD \1",
    "^ST (HIGHWAY|ROUTE|ROAD|HWY) ([0-9]+)$": r"STATE \1 \2",
    "^CNTY (HIGHWAY|ROUTE|ROAD|HWY) ([0-9]+)$": r"COUNTY \1 \2",
    "^CR ([0-9]+)": r"COUNTY ROAD \1",
    "^COUNTY RD ([0-9]+) ([NSEW]{1,2})": r"COUNTY ROAD \1 \2",
    "^(SR|ST RD) ([0-9]+)": r"STATE ROAD \2",
    "^(ST RT|ST RTE) ([0-9]+)": r"STATE ROUTE \2",
    "^(HWY|HIWAY) ([0-9]+)": r"HIGHWAY \2",
    "^(RTE|RT) ([0-9]+)": r"ROUTE \2",
    "^RD ([0-9]+)": r"ROAD \1",
    "^TSR ([0-9]+)": r"TOWNSHIP ROAD \1",
    "([0-9]+) BYP RD": r"\1 BYPASS RD",
    "([0-9]+) BYPASS": r"\1 BYP",
    " HIGHWAY ([0-9]+)": r" HWY \1",
    "^(STATE|COUNTY) HWY ": r"\1 HIGHWAY ",
    "UNITED STATES HWY ": "US HIGHWAY ",
    "^US HWY ": r"US HIGHWAY ",
    "^FIRST ": "1ST ",
    "^SECOND ": "2ND ",
    "^THIRD ": "3RD ",
    "^FOURTH ": "4TH ",
    "^FIFTH ": "5TH ",
    "^SIXTH ": "6TH ",
    "^SEVENTH ": "7TH ",
    "^EIGHTH ": "8TH ",
    "^NINTH ": "9TH ",
    "^TENTH ": "10TH ",
    " STREET ST$": " ST",
    " AVENUE AVE$": " AVE",
    " DRIVE DR$": " DR",
    " DR DRIVE$": " DR",
    " PARKS PARK$": " PARK",
    " ROAD RD$": " RD",
    " LK ": " LAKE ",
    " ST ST$": " ST",
    "^(N|S|E|W) COUNTY (ROAD|RD) ([0-9]{3,}) (N|S|E|W)$": r"\1 \3 \4", # Indiana has "County Road" in NAD as the street name for some reason
    "^COUNTY RD COUNTY ROAD ": "COUNTY ROAD ",
    " CI$": " CIR",
    " CM$": " CMN",
    " BL$": " BLVD",
    " TE$": " TER",
    " LP$": " LOOP",
    "^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason
    "^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason
    "^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2" # Athens TX does this too for some reason
}
# Regexes applied to the house-number field (see standardizeNumber).
# FIX: keys are now raw strings. In the old non-raw "^([0-9]+) \1$" the "\1"
# was a Python octal escape (chr(1)), NOT a regex backreference, so the
# repeated-number rule could never match — which is why the original comment
# said it "doesn't always work for some reason". r"..." makes \1 a real
# backreference. r"^\.$" also silences the invalid-escape warning.
STANDARDIZATION_NUMBER_REGEXES = {
    r"^([0-9]+) \1$": r"\1",  # collapse address numbers that repeat with a space between ("1234 1234" -> "1234")
    r"^([0-9]+) ([A-Z])$": r"\1\2",  # "1234 A ROAD ST" to "1234A ROAD ST"
    "^0$": "",  # Blank out 0 as a house number
    r"^\.$": "",  # Blank out .
}
ABBREV_PATTERN = ""
ABBREV_PATTERN_LIST = []
for (a, b) in LONGHAND_STREET_TYPES.items():
ABBREV_PATTERN_LIST.append(a)
ABBREV_PATTERN = "|".join(ABBREV_PATTERN_LIST)
STREET_INNER_ABBREV_FIND_REGEX = re.compile("(^|\s)("+ABBREV_PATTERN+") ("+ABBREV_PATTERN+")( [NSEW]{0,2})?$")
def postStandardizeStreet(street):
    """Apply post-standardization fixups to an already-USPS-normalized street:
    ordered regex corrections, doubled-abbreviation expansion, and Pub 28
    state-highway rewrites. Returns the cleaned, stripped street string.
    """
    # Catch edge cases with USPS formatting
    for (find, replace) in POST_STANDARDIZATION_STREET_REGEXES.items():
        street = re.sub(find, replace, street)
    # Unshorten things like "S CRK RD", correcting to "S CREEK RD"
    matches = STREET_INNER_ABBREV_FIND_REGEX.search(street)
    if matches:
        street = street.replace(matches.group(2), LONGHAND_STREET_TYPES[matches.group(2)], 1)
    # "KY 1234" to "KY HIGHWAY 1234" per Pub 28
    if re.match(r"^[A-Z]{2} [0-9]+$", street):
        for (full, abbr) in STATES.items():
            if street.startswith(abbr + " "):
                street = street.replace(abbr, abbr + " HIGHWAY", 1)
                break
    # "KENTUCKY STATE HIGHWAY 625" to "KY STATE HIGHWAY 625" per Pub 28
    if re.match(r"^[A-Z]{2,} STATE HIGHWAY [0-9]+", street):
        for (full, abbr) in STATES.items():
            if street.startswith(full + " "):
                street = street.replace(full, abbr, 1)
                break
    return street.strip()
def preStandardizeStreet(street):
    """Clean a raw street name before USPS-style standardization: apply the
    pre-standardization regexes, then drop any trailing "#unit" fragment.
    """
    for (find, replace) in PRE_STANDARDIZATION_STREET_REGEXES.items():
        street = re.sub(find, replace, street)
    # Remove unit from end of street
    # NOTE(review): find() > 0 means a street that STARTS with '#' is kept
    # untouched — presumably intentional; confirm.
    hashtag = street.find("#")
    if hashtag > 0:
        street = street[:hashtag].strip()
    return street
def standardizeNumber(number):
    """Normalize a house-number string via STANDARDIZATION_NUMBER_REGEXES,
    then collapse "1234 1234"-style repeats as an explicit fallback, and
    return the stripped result.
    """
    for (find, replace) in STANDARDIZATION_NUMBER_REGEXES.items():
        number = re.sub(find, replace, number)
    # Detect "1234 1234" which some sources have sometimes (like Kentucky NAD v20)
    if (parts := number.split(" ")) and len(parts) == 2 and parts[0] == parts[1]:
        number = parts[0]
    return number.strip()
def splitNumberAndUnit(number):
    """Split a house-number field that also carries a unit designator.

    Returns (number, unit) where the number has been re-standardized and the
    unit has had its designator label (APT, STE, ...) stripped off.
    """
    # Some places have unit numbers in the primary address, remove them
    num = number
    unit = ""
    for label in UNITS:
        pos = number.find(label)
        if pos > 0:  # pos 0 would mean the whole field is the label, not a suffix
            num = number[:pos]
            unit = number[pos:]
            break
    num = standardizeNumber(num)
    unit = removeUnitText(unit)
    return num, unit
def removeUnitText(subaddr):
    """Strip unit-designator labels (APT, STE, ...) from a secondary-address
    string, returning just the unit value, uppercased with whitespace
    collapsed. None is passed through unchanged so callers can forward
    missing data.
    """
    if subaddr is None:  # FIX: identity comparison; was "== None"
        return subaddr
    subaddr = subaddr.upper()
    for label in UNITS:
        subaddr = subaddr.replace(label, "")
    # FIX: collapse ALL runs of whitespace and strip the ends; the old single
    # replace("  ", " ") left residue when three or more spaces were adjacent.
    return " ".join(subaddr.split())
def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4="", county=False):
    """Normalize one address record into USPS-style components.

    Runs the street through pre-standardization regexes, the scourgify
    normalizer (with the Kellerman library as fallback), post-standardization
    regexes, and ZIP/ZIP+4 lookups.

    Returns a dict with keys: number, street, unit, city, state, zip, plus4,
    latitude, longitude.

    Raises ValidationException when the record has no house number or no
    street name; re-raises parser errors when both standardizers fail.
    """
    cfg = src.config.get_config()
    if not number:
        raise ValidationException("No address number")
    if not street:
        # FIX: this message previously read "No address number" — copied from
        # the check above; it now names the actual problem.
        raise ValidationException("No street")
    # Detect flipped coordinates: |lat| > 90 means the pair is swapped.
    if lat < -90 or lat > 90:
        lon, lat = lat, lon
    number = standardizeNumber(str(number).upper().strip())
    street = preStandardizeStreet(street.strip().upper())
    unit = unit.strip().upper()
    city = city.strip().upper()
    state = state.strip().upper()
    zipcode = (zipcode or "").strip()
    plus4 = plus4.strip()
    if (not city or city == "") and (not zipcode or zipcode == "") and cfg.citySuggestion:
        # Use the city specified on the CLI, hopefully it'll help
        city = cfg.citySuggestion
    if unit == "":
        number, unit = splitNumberAndUnit(number)
    city = city.replace("CITY OF ", "").replace("TOWN OF ", "").replace("VILLAGE OF ", "")
    city = city.replace("UNINCORPORATED", "")
    #
    # Standardize address
    #
    try:
        # Python library. The street is prefixed with the placeholder number
        # "999999999" so the parser treats the entire field as the street;
        # the real number is substituted back in afterwards.
        addr = normalize_address_record(
            {
                "address_line_1": "".join(["999999999", " ", street]),
                "address_line_2": unit,
                "city": city,
                "state": state,
                "postal_code": zipcode
            }
        )
    except Exception as e:
        try:
            # Proprietary Mono library (optional: may have failed to init)
            if standardization == False:
                raise e
            addr = {
                "address_line_1": "999999999 " + standardization.StandardizeStreetAddress(street),
                "address_line_2": unit,
                "city": city,
                "state": state,
                "postal_code": zipcode
            }
        except Exception as ex:
            # This basically never happens
            print("W: Couldn't parse address:")
            print(ex)
            raise ex
    #
    # Remove number from street address field
    #
    addr['address_line_1'] = addr['address_line_1'].replace("999999999", number)
    streetonly = addr['address_line_1']
    if streetonly.startswith(str(number) + " "):
        streetonly = streetonly[len(str(number) + " "):]
    #
    # Run extra regexes on street to fix standardization problems
    #
    streetonly = postStandardizeStreet(streetonly)
    #
    # Special conditional rules
    #
    if addr["state"] == "PR":
        # Puerto Rico special rules: letter-prefixed numbers drop their hyphens
        if re.match("[A-Z]", number):
            number = number.replace("-", "")
    #
    # Clean second line
    #
    addr['address_line_2'] = removeUnitText(addr['address_line_2'])
    #
    # Standardize and validate and/or append ZIP Code
    #
    zipcode = addr["postal_code"]
    unitprefix = ""
    unit = addr['address_line_2']
    if zipcode is not None:
        zipcode = addr["postal_code"][0:5]
    # Skip these if we already have a ZIP+4 code, assume it's accurate
    if zipcode is not None and len(zipcode) == 5 and not plus4:
        zipinfo = getCityStateForZIP(zipcode)
        if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
            zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county)
            zipinfo = getCityStateForZIP(zipcode)
            if zipinfo != False:
                addr["city"] = zipinfo["city"]
                addr["state"] = zipinfo["state"]
        else:
            # ZIP looks plausible: adopt its canonical city/state.
            addr["city"] = zipinfo["city"]
            addr["state"] = zipinfo["state"]
    elif not plus4:
        # No usable ZIP at all: ask the ZIP+4 lookup to find one from scratch.
        zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county)
        zipinfo = getCityStateForZIP(zipcode)
        if zipinfo != False:
            addr["city"] = zipinfo["city"]
            addr["state"] = zipinfo["state"]
    if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
        # Standardization moved the route number into the unit; rebuild the street.
        streetonly = f"US HIGHWAY {unit}"
        unit = ""
    return {
        "number": number,
        "street": streetonly,
        "unit": ' '.join(filter(None, (unitprefix, unit))),
        "city": addr["city"],
        "state": addr["state"],
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    }

109
src/advancedparsing.py Normal file
View File

@ -0,0 +1,109 @@
# Use "AI" to parse problem addresses and find more matches
# First expand the address to possible forms, then normalize each one, and keep the one that has a ZIP+4
from postal.parser import parse_address
from postal.expand import expand_address
from src.addressfunctions import normalizeAddress
import re
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""):
    """Use libpostal parsing/expansion to find a better normalization.

    Re-parses the merged address string, generates candidate expansions of
    the number/street, normalizes each candidate, and returns the candidate
    whose populated fields score highest (a ZIP+4 match weighs most).
    Records that already carry a 4-digit plus4 are returned unchanged.
    """
    if len(plus4 or "") == 4:
        # Return as-is, it's got a +4 match already
        return {
            "number": number,
            "street": street,
            "unit": unit,
            "city": city,
            "state": state,
            "zip": zipcode,
            "plus4": plus4,
            "latitude": lat,
            "longitude": lon
        }
    # Merge and re-split the address to catch odd things like the street having the city and zip too
    parsed = parse_address(f"{number} {street}, {city} {state} {zipcode}")
    pNumber = number
    pStreet = street
    pUnit = unit
    pCity = city
    pState = state
    pZip = zipcode
    for part in parsed:
        if part[1] == "house_number" and (pNumber == "" or pNumber == number): # Don't overwrite it with values found later, which might be a zip code or something
            pNumber = part[0].upper()
        elif part[1] == "road":
            pStreet = part[0].upper()
        elif part[1] == "unit":
            pUnit = part[0].upper()
        elif part[1] == "city":
            pCity = part[0].upper()
        elif part[1] == "state":
            pState = part[0].upper()
        elif part[1] == "postcode":
            pZip = part[0].upper()
    # Expand the number/street to all possible forms
    expanded = expand_address(f"{pNumber} {pStreet}")
    normalizedMatches = []
    # Add the original address as a candidate so if no better matches come up, it'll probably just use it as-is
    normalizedMatches.append({
        "number": number,
        "street": street,
        "unit": unit,
        "city": city,
        "state": state,
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    })
    # Also add one where we remove any non-numeric data from the number and unit fields
    normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
    if number != pNumber or unit != pUnit or street != pStreet:
        # The re-parse changed something: add the re-parsed form as a candidate too.
        normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
    for exp in expanded:
        # Normalize every libpostal expansion; unparseable ones are skipped.
        parsed = parse_address(exp)
        pN = ""
        pS = ""
        pU = ""
        for part in parsed:
            if part[1] == "house_number" and pN == "":
                pN = part[0]
            elif part[1] == "road":
                pS = part[0]
            elif part[1] == "unit":
                pU = part[0]
        try:
            normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4))
        except Exception as e:
            pass
    if len(normalizedMatches) > 1:
        # Score each candidate by summing weights of its truthy fields; a
        # ZIP+4 (weight 8) dominates, coordinates never influence the score.
        weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
        sortedMatches = sorted(
            normalizedMatches,
            key=lambda item: sum(weights[k] for k, v in item.items() if v),
            reverse=True
        )
        return sortedMatches[0]
    elif len(normalizedMatches) == 1:
        return normalizedMatches[0]
    # No matches, give up on the whole thing
    return {
        "number": number,
        "street": street,
        "unit": unit,
        "city": city,
        "state": state,
        "zip": zipcode,
        "plus4": plus4,
        "latitude": lat,
        "longitude": lon
    }

23
src/config.py Normal file
View File

@ -0,0 +1,23 @@
from dataclasses import dataclass
from typing import Optional
@dataclass(frozen=True)
class AppConfig:
    """Immutable, run-wide settings derived from the CLI flags."""
    appendPlus4: bool  # try to attach a ZIP+4 code to every record
    appendUnitLabel: bool  # prepend unit labels (APT, STE, ...) using ZIP+4 data
    countryCode: str  # two-letter country code; "US" by default
    citySuggestion: bool  # NOTE(review): actually holds a city-name str, or False when unset — the bool annotation looks wrong; confirm
    useCensusToFillEmptyZIPs: bool  # allow Census geocoder lookups for missing ZIPs
    advancedMode: bool  # use libpostal-based advanced parsing
    noSkip4: bool  # re-normalize records that already carry a +4
# Process-global config singleton; installed once via set_config().
_CFG: Optional[AppConfig] = None
def set_config(cfg: AppConfig) -> None:
    """Install the process-wide AppConfig. Call once at startup."""
    global _CFG
    _CFG = cfg # set once at start (or in child initializer)
def get_config() -> AppConfig:
    """Return the process-wide AppConfig, raising if set_config was never called."""
    cfg = _CFG
    if cfg is None:
        raise RuntimeError("Config not initialized yet")
    return cfg

291
src/constants.py Normal file
View File

@ -0,0 +1,291 @@
class ValidationException(Exception):
    """Raised when an address record is missing required fields (e.g. house number or street)."""
    pass
# Map of USPS street-type abbreviations to a longhand spelling, used to
# re-expand doubled abbreviations like "S CRK RD" -> "S CREEK RD".
# NOTE(review): a few entries expand to a plural ('FRST': 'FORESTS',
# 'PARK': 'PARKS', 'PATH': 'PATHS', 'PIKE': 'PIKES', 'SPUR': 'SPURS') —
# possibly intentional, but confirm against USPS Pub 28 Appendix C1.
LONGHAND_STREET_TYPES = {
    'ALY': 'ALLEY',
    'ANX': 'ANNEX',
    'ARC': 'ARCADE',
    'AV': 'AVENUE',
    'AVE': 'AVENUE',
    'BYU': 'BAYOU',
    'BCH': 'BEACH',
    'BND': 'BEND',
    'BLF': 'BLUFF',
    'BLFS': 'BLUFFS',
    'BTM': 'BOTTOM',
    'BLVD': 'BOULEVARD',
    'BL': 'BOULEVARD',
    'BR': 'BRANCH',
    'BRG': 'BRIDGE',
    'BRK': 'BROOK',
    'BRKS': 'BROOKS',
    'BGS': 'BURGS',
    'BYP': 'BYPASS',
    'CP': 'CAMP',
    'CYN': 'CANYON',
    'CPE': 'CAPE',
    'CSWY': 'CAUSEWAY',
    'CTR': 'CENTER',
    'CTRS': 'CENTERS',
    'CI': 'CIRCLE',
    'CIR': 'CIRCLE',
    'CIRS': 'CIRCLES',
    'CLF': 'CLIFF',
    'CLFS': 'CLIFFS',
    'CMN': 'COMMON',
    'CM': 'COMMON',
    'COR': 'CORNER',
    'CORS': 'CORNERS',
    'CRSE': 'COURSE',
    'CT': 'COURT',
    'CTS': 'COURTS',
    'CVS': 'COVES',
    'CRK': 'CREEK',
    'CRES': 'CRESCENT',
    'CRST': 'CREST',
    'XING': 'CROSSING',
    'XRD': 'CROSSROAD',
    'CURV': 'CURVE',
    'DL': 'DALE',
    'DM': 'DAM',
    'DV': 'DIVIDE',
    'DR': 'DRIVE',
    'DRS': 'DRIVES',
    'EST': 'ESTATE',
    'ESTS': 'ESTATES',
    'EXPY': 'EXPRESSWAY',
    'EXT': 'EXTENSION',
    'EXTS': 'EXTENSIONS',
    'FALL': 'FALL',
    'FLS': 'FALLS',
    'FRY': 'FERRY',
    'FLD': 'FIELD',
    'FLDS': 'FIELDS',
    'FLT': 'FLAT',
    'FLTS': 'FLATS',
    'FRD': 'FORD',
    'FRDS': 'FORDS',
    'FRST': 'FORESTS',
    'FRG': 'FORGE',
    'FRGS': 'FORGES',
    'FRK': 'FORK',
    'FRKS': 'FORKS',
    'FT': 'FORT',
    'FWY': 'FREEWAY',
    'GDN': 'GARDEN',
    'GDNS': 'GARDENS',
    'GTWY': 'GATEWAY',
    'GLN': 'GLEN',
    'GLNS': 'GLENS',
    'GRNS': 'GREENS',
    'GRV': 'GROVE',
    'GRVS': 'GROVES',
    'HBR': 'HARBOR',
    'HBRS': 'HARBORS',
    'HVN': 'HAVEN',
    'HTS': 'HEIGHTS',
    'HWY': 'HIGHWAY',
    'HL': 'HILL',
    'HLS': 'HILLS',
    'HOLW': 'HOLLOW',
    'INLT': 'INLET',
    'IS': 'ISLAND',
    'ISS': 'ISLANDS',
    'ISLE': 'ISLE',
    'JCT': 'JUNCTION',
    'JCTS': 'JUNCTIONS',
    'KY': 'KEY',
    'KYS': 'KEYS',
    'KNL': 'KNOLL',
    'KNLS': 'KNOLLS',
    'LK': 'LAKE',
    'LKS': 'LAKES',
    'LAND': 'LAND',
    'LNDG': 'LANDING',
    'LN': 'LANE',
    'LGT': 'LIGHT',
    'LGTS': 'LIGHTS',
    'LF': 'LOAF',
    'LCK': 'LOCK',
    'LCKS': 'LOCKS',
    'LDG': 'LODGE',
    'LOOP': 'LOOP',
    'LP': 'LOOP',
    'MALL': 'MALL',
    'MNR': 'MANOR',
    'MNRS': 'MANORS',
    'MDW': 'MEADOW',
    'MDWS': 'MEADOWS',
    'MEWS': 'MEWS',
    'ML': 'MILL',
    'MLS': 'MILLS',
    'MSN': 'MISSION',
    'MTWY': 'MOTORWAY',
    'MT': 'MOUNT',
    'MTN': 'MOUNTAIN',
    'MTNS': 'MOUNTAINS',
    'NCK': 'NECK',
    'ORCH': 'ORCHARD',
    'OVAL': 'OVAL',
    'OPAS': 'OVERPASS',
    'PARK': 'PARKS',
    'PKWY': 'PARKWAY',
    'PASS': 'PASS',
    'PSGE': 'PASSAGE',
    'PATH': 'PATHS',
    'PIKE': 'PIKES',
    'PNE': 'PINE',
    'PNES': 'PINES',
    'PL': 'PLACE',
    'PLN': 'PLAIN',
    'PLNS': 'PLAINS',
    'PLZ': 'PLAZA',
    'PT': 'POINT',
    'PTS': 'POINTS',
    'PRT': 'PORT',
    'PRTS': 'PORTS',
    'PR': 'PRAIRIE',
    'PW': 'PARKWAY',
    'RADL': 'RADIAL',
    'RAMP': 'RAMP',
    'RNCH': 'RANCH',
    'RPD': 'RAPID',
    'RPDS': 'RAPIDS',
    'RST': 'REST',
    'RDG': 'RIDGE',
    'RDGS': 'RIDGES',
    'RIV': 'RIVER',
    'RD': 'ROAD',
    'RDS': 'ROADS',
    'RTE': 'ROUTE',
    'ROW': 'ROW',
    'RUE': 'RUE',
    'RUN': 'RUN',
    'SHL': 'SHOAL',
    'SHLS': 'SHOALS',
    'SHR': 'SHORE',
    'SHRS': 'SHORES',
    'SKWY': 'SKYWAY',
    'SPG': 'SPRING',
    'SPGS': 'SPRINGS',
    'SPUR': 'SPURS',
    'SQ': 'SQUARE',
    'SQS': 'SQUARES',
    'STA': 'STATION',
    'STRA': 'STRAVENUE',
    'STRM': 'STREAM',
    'ST': 'STREET',
    'STS': 'STREETS',
    'SMT': 'SUMMIT',
    'TER': 'TERRACE',
    'TRWY': 'THROUGHWAY',
    'TRCE': 'TRACE',
    'TRAK': 'TRACK',
    'TRFY': 'TRAFFICWAY',
    'TRL': 'TRAIL',
    'TUNL': 'TUNNEL',
    'TPKE': 'TURNPIKE',
    'UPAS': 'UNDERPASS',
    'UN': 'UNION',
    'UNS': 'UNIONS',
    'VLY': 'VALLEY',
    'VLYS': 'VALLEYS',
    'VIA': 'VIADUCT',
    'VW': 'VIEW',
    'VWS': 'VIEWS',
    'VLG': 'VILLAGE',
    'VLGS': 'VILLAGES',
    'VL': 'VILLE',
    'VIS': 'VISTA',
    'WALK': 'WALK',
    'WALL': 'WALL',
    'WAY': 'WAY',
    'WL': 'WELL',
    'WLS': 'WELLS'
}
# USPS secondary-unit designator labels, matched as substrings by
# splitNumberAndUnit/removeUnitText.
# NOTE(review): short labels like 'FL', 'PH', 'KEY' are also ordinary words
# or abbreviations — substring matching could clip legitimate text; confirm
# callers only apply these to number/unit fields.
UNITS = [
    'APT',
    'BLDG',
    'BUILDING',
    'BSMT',
    'DEPT',
    'FL',
    'FRNT',
    'HNGR',
    'KEY',
    'LBBY',
    'LOT',
    'LOWR',
    'OFC',
    'PH',
    'PIER',
    'REAR',
    'RM',
    'SIDE',
    'SLIP',
    'SPC',
    'STOP',
    'STE',
    'TRLR',
    'UNIT',
    'UPPER',
    '#',
    'BASE', # Not a real unit designator but appears in some NAD AZ data for some reason
    '(VACANT)' # One dataset does this...
]
# Full US state names (plus DC) mapped to their two-letter abbreviations,
# used for Pub 28-style highway-name rewrites in postStandardizeStreet.
# NOTE: ordering is not fully alphabetical (MD-MO appear after OR) — iteration
# order does not affect the prefix matches performed on this dict.
STATES = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "DISTRICT OF COLUMBIA": "DC",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY"
}

614
src/streetcleaner.py Normal file
View File

@ -0,0 +1,614 @@
# Created on : Aug 28, 2024, 11:58:00PM
# Author : Skylar Ittner
# Directional words mapped to their USPS abbreviations; covers both English
# and Spanish spellings (the Spanish forms appear in Puerto Rico data).
DIRECTIONAL_REPLACEMENTS = {
    'EAST': 'E',
    'WEST': 'W',
    'NORTH': 'N',
    'SOUTH': 'S',
    'NORTHEAST': 'NE',
    'NORTHWEST': 'NW',
    'SOUTHEAST': 'SE',
    'SOUTHWEST': 'SW',
    'ESTE': 'E',
    'OESTE': 'W',
    'NORTE': 'N',
    'SUR': 'S',
    'NORESTE': 'NE',
    'NOROESTE': 'NW',
    'SURESTE': 'SE',
    'SUROESTE': 'SW'
}
STREET_TYPE_ABBREVIATIONS = {
'ALLEE': 'ALY',
'ALLEY': 'ALY',
'ALLY': 'ALY',
'ALY': 'ALY',
'ANEX': 'ANX',
'ANNEX': 'ANX',
'ANNX': 'ANX',
'ANX': 'ANX',
'ARC': 'ARC',
'ARCADE': 'ARC',
'AV': 'AVE',
'AVE': 'AVE',
'AVEN': 'AVE',
'AVENU': 'AVE',
'AVENUE': 'AVE',
'AVN': 'AVE',
'AVNUE': 'AVE',
'BAYOO': 'BYU',
'BAYOU': 'BYU',
'BCH': 'BCH',
'BEACH': 'BCH',
'BEND': 'BND',
'BND': 'BND',
'BLF': 'BLF',
'BLUF': 'BLF',
'BLUFF': 'BLF',
'BLUFFS': 'BLFS',
'BOT': 'BTM',
'BOTTM': 'BTM',
'BOTTOM': 'BTM',
'BTM': 'BTM',
'BLVD': 'BLVD',
'BOUL': 'BLVD',
'BOULEVARD': 'BLVD',
'BOULV': 'BLVD',
'BR': 'BR',
'BRANCH': 'BR',
'BRNCH': 'BR',
'BRDGE': 'BRG',
'BRG': 'BRG',
'BRIDGE': 'BRG',
'BRK': 'BRK',
'BROOK': 'BRK',
'BROOKS': 'BRKS',
'BURG': 'BG',
'BURGS': 'BGS',
'BYP': 'BYP',
'BYPA': 'BYP',
'BYPAS': 'BYP',
'BYPASS': 'BYP',
'BYPS': 'BYP',
'CAMP': 'CP',
'CMP': 'CP',
'CP': 'CP',
'CANYN': 'CYN',
'CANYON': 'CYN',
'CNYN': 'CYN',
'CYN': 'CYN',
'CAPE': 'CPE',
'CPE': 'CPE',
'CAUSEWAY': 'CSWY',
'CAUSWAY': 'CSWY',
'CSWY': 'CSWY',
'CEN': 'CTR',
'CENT': 'CTR',
'CENTER': 'CTR',
'CENTR': 'CTR',
'CENTRE': 'CTR',
'CNTER': 'CTR',
'CNTR': 'CTR',
'CTR': 'CTR',
'CENTERS': 'CTRS',
'CIR': 'CIR',
'CIRC': 'CIR',
'CIRCL': 'CIR',
'CIRCLE': 'CIR',
'CRCL': 'CIR',
'CRCLE': 'CIR',
'CIRCLES': 'CIRS',
'CLF': 'CLF',
'CLIFF': 'CLF',
'CLFS': 'CLFS',
'CLIFFS': 'CLFS',
'CLB': 'CLB',
'CLUB': 'CLB',
'COMMON': 'CMN',
'COR': 'COR',
'CORNER': 'COR',
'CORNERS': 'CORS',
'CORS': 'CORS',
'COURSE': 'CRSE',
'CRSE': 'CRSE',
'COURT': 'CT',
'CRT': 'CT',
'CT': 'CT',
'COURTS': 'CTS',
'COVE': 'CV',
'CV': 'CV',
'COVES': 'CVS',
'CK': 'CRK',
'CR': 'CRK',
'CREEK': 'CRK',
'CRK': 'CRK',
'CRECENT': 'CRES',
'CRES': 'CRES',
'CRESCENT': 'CRES',
'CRESENT': 'CRES',
'CRSCNT': 'CRES',
'CRSENT': 'CRES',
'CRSNT': 'CRES',
'CREST': 'CRST',
'CROSSING': 'XING',
'CRSSING': 'XING',
'CRSSNG': 'XING',
'XING': 'XING',
'CROSSROAD': 'XRD',
'CURVE': 'CURV',
'DALE': 'DL',
'DL': 'DL',
'DAM': 'DM',
'DM': 'DM',
'DIV': 'DV',
'DIVIDE': 'DV',
'DV': 'DV',
'DVD': 'DV',
'DR': 'DR',
'DRIV': 'DR',
'DRIVE': 'DR',
'DRV': 'DR',
'DRIVES': 'DRS',
'EST': 'EST',
'ESTATE': 'EST',
'ESTATES': 'ESTS',
'ESTS': 'ESTS',
'EXP': 'EXPY',
'EXPR': 'EXPY',
'EXPRESS': 'EXPY',
'EXPRESSWAY': 'EXPY',
'EXPW': 'EXPY',
'EXPY': 'EXPY',
'EXT': 'EXT',
'EXTENSION': 'EXT',
'EXTN': 'EXT',
'EXTNSN': 'EXT',
'EXTENSIONS': 'EXTS',
'EXTS': 'EXTS',
'FALL': 'FALL',
'FALLS': 'FLS',
'FLS': 'FLS',
'FERRY': 'FRY',
'FRRY': 'FRY',
'FRY': 'FRY',
'FIELD': 'FLD',
'FLD': 'FLD',
'FIELDS': 'FLDS',
'FLDS': 'FLDS',
'FLAT': 'FLT',
'FLT': 'FLT',
'FLATS': 'FLTS',
'FLTS': 'FLTS',
'FORD': 'FRD',
'FRD': 'FRD',
'FORDS': 'FRDS',
'FOREST': 'FRST',
'FORESTS': 'FRST',
'FRST': 'FRST',
'FORG': 'FRG',
'FORGE': 'FRG',
'FRG': 'FRG',
'FORGES': 'FRGS',
'FORK': 'FRK',
'FRK': 'FRK',
'FORKS': 'FRKS',
'FRKS': 'FRKS',
'FORT': 'FT',
'FRT': 'FT',
'FT': 'FT',
'FREEWAY': 'FWY',
'FREEWY': 'FWY',
'FRWAY': 'FWY',
'FRWY': 'FWY',
'FWY': 'FWY',
'GARDEN': 'GDN',
'GARDN': 'GDN',
'GDN': 'GDN',
'GRDEN': 'GDN',
'GRDN': 'GDN',
'GARDENS': 'GDNS',
'GDNS': 'GDNS',
'GRDNS': 'GDNS',
'GATEWAY': 'GTWY',
'GATEWY': 'GTWY',
'GATWAY': 'GTWY',
'GTWAY': 'GTWY',
'GTWY': 'GTWY',
'GLEN': 'GLN',
'GLN': 'GLN',
'GLENS': 'GLNS',
'GREEN': 'GRN',
'GRN': 'GRN',
'GREENS': 'GRNS',
'GROV': 'GRV',
'GROVE': 'GRV',
'GRV': 'GRV',
'GROVES': 'GRVS',
'HARB': 'HBR',
'HARBOR': 'HBR',
'HARBR': 'HBR',
'HBR': 'HBR',
'HRBOR': 'HBR',
'HARBORS': 'HBRS',
'HAVEN': 'HVN',
'HAVN': 'HVN',
'HVN': 'HVN',
'HEIGHT': 'HTS',
'HEIGHTS': 'HTS',
'HGTS': 'HTS',
'HT': 'HTS',
'HTS': 'HTS',
'HIGHWAY': 'HWY',
'HIGHWY': 'HWY',
'HIWAY': 'HWY',
'HIWY': 'HWY',
'HWAY': 'HWY',
'HWY': 'HWY',
'HILL': 'HL',
'HL': 'HL',
'HILLS': 'HLS',
'HLS': 'HLS',
'HLLW': 'HOLW',
'HOLLOW': 'HOLW',
'HOLLOWS': 'HOLW',
'HOLW': 'HOLW',
'HOLWS': 'HOLW',
'INLET': 'INLT',
'INLT': 'INLT',
'IS': 'IS',
'ISLAND': 'IS',
'ISLND': 'IS',
'ISLANDS': 'ISS',
'ISLNDS': 'ISS',
'ISS': 'ISS',
'ISLE': 'ISLE',
'ISLES': 'ISLE',
'JCT': 'JCT',
'JCTION': 'JCT',
'JCTN': 'JCT',
'JUNCTION': 'JCT',
'JUNCTN': 'JCT',
'JUNCTON': 'JCT',
'JCTNS': 'JCTS',
'JCTS': 'JCTS',
'JUNCTIONS': 'JCTS',
'KEY': 'KY',
'KY': 'KY',
'KEYS': 'KYS',
'KYS': 'KYS',
'KNL': 'KNL',
'KNOL': 'KNL',
'KNOLL': 'KNL',
'KNLS': 'KNLS',
'KNOLLS': 'KNLS',
'LAKE': 'LK',
'LK': 'LK',
'LAKES': 'LKS',
'LKS': 'LKS',
'LAND': 'LAND',
'LANDING': 'LNDG',
'LNDG': 'LNDG',
'LNDNG': 'LNDG',
'LA': 'LN',
'LANE': 'LN',
'LANES': 'LN',
'LN': 'LN',
'LGT': 'LGT',
'LIGHT': 'LGT',
'LIGHTS': 'LGTS',
'LF': 'LF',
'LOAF': 'LF',
'LCK': 'LCK',
'LOCK': 'LCK',
'LCKS': 'LCKS',
'LOCKS': 'LCKS',
'LDG': 'LDG',
'LDGE': 'LDG',
'LODG': 'LDG',
'LODGE': 'LDG',
'LOOP': 'LOOP',
'LOOPS': 'LOOP',
'MALL': 'MALL',
'MANOR': 'MNR',
'MNR': 'MNR',
'MANORS': 'MNRS',
'MNRS': 'MNRS',
'MDW': 'MDW',
'MEADOW': 'MDW',
'MDWS': 'MDWS',
'MEADOWS': 'MDWS',
'MEDOWS': 'MDWS',
'MEWS': 'MEWS',
'MILL': 'ML',
'ML': 'ML',
'MILLS': 'MLS',
'MLS': 'MLS',
'MISSION': 'MSN',
'MISSN': 'MSN',
'MSN': 'MSN',
'MSSN': 'MSN',
'MOTORWAY': 'MTWY',
'MNT': 'MT',
'MOUNT': 'MT',
'MT': 'MT',
'MNTAIN': 'MTN',
'MNTN': 'MTN',
'MOUNTAIN': 'MTN',
'MOUNTIN': 'MTN',
'MTIN': 'MTN',
'MTN': 'MTN',
'MNTNS': 'MTNS',
'MOUNTAINS': 'MTNS',
'NCK': 'NCK',
'NECK': 'NCK',
'ORCH': 'ORCH',
'ORCHARD': 'ORCH',
'ORCHRD': 'ORCH',
'OVAL': 'OVAL',
'OVL': 'OVAL',
'OVERPASS': 'OPAS',
'PARK': 'PARK',
'PK': 'PARK',
'PRK': 'PARK',
'PARKS': 'PARK',
'PARKWAY': 'PKWY',
'PARKWY': 'PKWY',
'PKWAY': 'PKWY',
'PKWY': 'PKWY',
'PKY': 'PKWY',
'PW': 'PKWY',
'PARKWAYS': 'PKWY',
'PKWYS': 'PKWY',
'PASS': 'PASS',
'PASSAGE': 'PSGE',
'PATH': 'PATH',
'PATHS': 'PATH',
'PIKE': 'PIKE',
'PIKES': 'PIKE',
'PINE': 'PNE',
'PINES': 'PNES',
'PNES': 'PNES',
'PL': 'PL',
'PLACE': 'PL',
'PLAIN': 'PLN',
'PLN': 'PLN',
'PLAINES': 'PLNS',
'PLAINS': 'PLNS',
'PLNS': 'PLNS',
'PLAZA': 'PLZ',
'PLZ': 'PLZ',
'PLZA': 'PLZ',
'POINT': 'PT',
'PT': 'PT',
'POINTS': 'PTS',
'PTS': 'PTS',
'PORT': 'PRT',
'PRT': 'PRT',
'PORTS': 'PRTS',
'PRTS': 'PRTS',
'PR': 'PR',
'PRAIRIE': 'PR',
'PRARIE': 'PR',
'PRR': 'PR',
'RAD': 'RADL',
'RADIAL': 'RADL',
'RADIEL': 'RADL',
'RADL': 'RADL',
'RAMP': 'RAMP',
'RANCH': 'RNCH',
'RANCHES': 'RNCH',
'RNCH': 'RNCH',
'RNCHS': 'RNCH',
'RAPID': 'RPD',
'RPD': 'RPD',
'RAPIDS': 'RPDS',
'RPDS': 'RPDS',
'REST': 'RST',
'RST': 'RST',
'RDG': 'RDG',
'RDGE': 'RDG',
'RIDGE': 'RDG',
'RDGS': 'RDGS',
'RIDGES': 'RDGS',
'RIV': 'RIV',
'RIVER': 'RIV',
'RIVR': 'RIV',
'RVR': 'RIV',
'RD': 'RD',
'ROAD': 'RD',
'RDS': 'RDS',
'ROADS': 'RDS',
'ROUTE': 'RTE',
'ROW': 'ROW',
'RUE': 'RUE',
'RUN': 'RUN',
'SHL': 'SHL',
'SHOAL': 'SHL',
'SHLS': 'SHLS',
'SHOALS': 'SHLS',
'SHOAR': 'SHR',
'SHORE': 'SHR',
'SHR': 'SHR',
'SHOARS': 'SHRS',
'SHORES': 'SHRS',
'SHRS': 'SHRS',
'SKYWAY': 'SKWY',
'SPG': 'SPG',
'SPNG': 'SPG',
'SPRING': 'SPG',
'SPRNG': 'SPG',
'SPGS': 'SPGS',
'SPNGS': 'SPGS',
'SPRINGS': 'SPGS',
'SPRNGS': 'SPGS',
'SPUR': 'SPUR',
'SPURS': 'SPUR',
'SQ': 'SQ',
'SQR': 'SQ',
'SQRE': 'SQ',
'SQU': 'SQ',
'SQUARE': 'SQ',
'SQRS': 'SQS',
'SQUARES': 'SQS',
'STA': 'STA',
'STATION': 'STA',
'STATN': 'STA',
'STN': 'STA',
'STRA': 'STRA',
'STRAV': 'STRA',
'STRAVE': 'STRA',
'STRAVEN': 'STRA',
'STRAVENUE': 'STRA',
'STRAVN': 'STRA',
'STRVN': 'STRA',
'STRVNUE': 'STRA',
'STREAM': 'STRM',
'STREME': 'STRM',
'STRM': 'STRM',
'ST': 'ST',
'STR': 'ST',
'STREET': 'ST',
'STRT': 'ST',
'STREETS': 'STS',
'SMT': 'SMT',
'SUMIT': 'SMT',
'SUMITT': 'SMT',
'SUMMIT': 'SMT',
'TER': 'TER',
'TERR': 'TER',
'TERRACE': 'TER',
'THROUGHWAY': 'TRWY',
'TRACE': 'TRCE',
'TRACES': 'TRCE',
'TRCE': 'TRCE',
'TRACK': 'TRAK',
'TRACKS': 'TRAK',
'TRAK': 'TRAK',
'TRK': 'TRAK',
'TRKS': 'TRAK',
'TRAFFICWAY': 'TRFY',
'TRFY': 'TRFY',
'TR': 'TRL',
'TRAIL': 'TRL',
'TRAILS': 'TRL',
'TRL': 'TRL',
'TRLS': 'TRL',
'TUNEL': 'TUNL',
'TUNL': 'TUNL',
'TUNLS': 'TUNL',
'TUNNEL': 'TUNL',
'TUNNELS': 'TUNL',
'TUNNL': 'TUNL',
'TPK': 'TPKE',
'TPKE': 'TPKE',
'TRNPK': 'TPKE',
'TRPK': 'TPKE',
'TURNPIKE': 'TPKE',
'TURNPK': 'TPKE',
'UNDERPASS': 'UPAS',
'UN': 'UN',
'UNION': 'UN',
'UNIONS': 'UNS',
'VALLEY': 'VLY',
'VALLY': 'VLY',
'VLLY': 'VLY',
'VLY': 'VLY',
'VALLEYS': 'VLYS',
'VLYS': 'VLYS',
'VDCT': 'VIA',
'VIA': 'VIA',
'VIADCT': 'VIA',
'VIADUCT': 'VIA',
'VIEW': 'VW',
'VW': 'VW',
'VIEWS': 'VWS',
'VWS': 'VWS',
'VILL': 'VLG',
'VILLAG': 'VLG',
'VILLAGE': 'VLG',
'VILLG': 'VLG',
'VILLIAGE': 'VLG',
'VLG': 'VLG',
'VILLAGES': 'VLGS',
'VLGS': 'VLGS',
'VILLE': 'VL',
'VL': 'VL',
'VIS': 'VIS',
'VIST': 'VIS',
'VISTA': 'VIS',
'VST': 'VIS',
'VSTA': 'VIS',
'WALK': 'WALK',
'WALKS': 'WALK',
'WALL': 'WALL',
'WAY': 'WAY',
'WY': 'WAY',
'WAYS': 'WAYS',
'WELL': 'WL',
'WELLS': 'WLS',
'WLS': 'WLS'
}
# USPS secondary-unit ("occupancy") designators, keyed by the spelled-out
# form, mapping to the standard abbreviation (cf. USPS Publication 28,
# Appendix C2).  Identity entries (KEY, LOT, PIER, ...) keep already-standard
# input unchanged.
# NOTE(review): Pub 28 abbreviates UPPER as "UPPR" — confirm the
# 'UPPER': 'UPPER' mapping is intentional.
OCCUPANCY_TYPE_ABBREVIATIONS = {
    'APARTMENT': 'APT',
    'BUILDING': 'BLDG',
    'BASEMENT': 'BSMT',
    'DEPARTMENT': 'DEPT',
    'FLOOR': 'FL',
    'FRONT': 'FRNT',
    'HANGER': 'HNGR',
    'KEY': 'KEY',
    'LOBBY': 'LBBY',
    'LOT': 'LOT',
    'LOWER': 'LOWR',
    'OFFICE': 'OFC',
    'PENTHOUSE': 'PH',
    'PIER': 'PIER',
    'REAR': 'REAR',
    'ROOM': 'RM',
    'SIDE': 'SIDE',
    'SLIP': 'SLIP',
    'SPACE': 'SPC',
    'STOP': 'STOP',
    'SUITE': 'STE',
    'TRAILER': 'TRLR',
    'UNIT': 'UNIT',
    'UPPER': 'UPPER',
    '#': '#'
}
# Unusual/one-off phrase replacements applied by oddHandling() below.
ODD_REPLACEMENTS = {
    "UNITED STATES HIGHWAY": "US HIGHWAY"
}
# Highway-name normalizations.
# NOTE(review): currently identical to ODD_REPLACEMENTS — confirm whether one
# of the two tables is redundant or whether more entries are planned here.
HIGHWAY_REPLACEMENTS = {
    "UNITED STATES HIGHWAY": "US HIGHWAY",
}
# Replace directionals with abbreviated versions.
def abbrevDirectionals(string):
    """Uppercase *string* and substitute each directional phrase with its
    abbreviation from DIRECTIONAL_REPLACEMENTS.

    Bug fix: the loop previously iterated the mapping directly
    (``for (find, replace) in DIRECTIONAL_REPLACEMENTS``), which yields bare
    keys; unpacking a key string into two names raises ValueError for any key
    whose length is not exactly 2.  Iterate ``.items()`` to get (key, value)
    pairs, consistent with the other *_REPLACEMENTS tables in this module.
    """
    string = string.upper()
    for find, replace in DIRECTIONAL_REPLACEMENTS.items():
        # Plain substring replacement — callers are expected to pass
        # whole street strings in USPS-style uppercase.
        string = string.replace(find, replace)
    return string
def abbrevStreetTypes(string):
    """Uppercase *string* and substitute street-type words (STREET, AVENUE,
    ...) with their USPS abbreviations from STREET_TYPE_ABBREVIATIONS.

    Bug fix: STREET_TYPE_ABBREVIATIONS is a dict, so the previous
    ``for (find, replace) in STREET_TYPE_ABBREVIATIONS`` iterated bare key
    strings and raised ValueError when unpacking (e.g. key 'JCT' has 3
    characters).  Iterate ``.items()`` to get (longhand, abbreviation) pairs.
    """
    string = string.upper()
    for find, replace in STREET_TYPE_ABBREVIATIONS.items():
        # NOTE(review): substring replacement can also hit matches inside
        # words (e.g. 'ST' within a name) — confirm callers pre-tokenize.
        string = string.replace(find, replace)
    return string
# Odd/unusual string replacement
def oddHandling(string):
    """Uppercase *string* and apply the one-off phrase substitutions from
    ODD_REPLACEMENTS (e.g. 'UNITED STATES HIGHWAY' -> 'US HIGHWAY').

    Bug fix: ODD_REPLACEMENTS is a dict, so the previous
    ``for (find, replace) in ODD_REPLACEMENTS`` iterated bare keys and raised
    ValueError trying to unpack the multi-word key string.  Iterate
    ``.items()`` to get (find, replace) pairs.
    """
    string = string.upper()
    for find, replace in ODD_REPLACEMENTS.items():
        string = string.replace(find, replace)
    return string
def highwayStandardize(street):
def normalize(number, streetPreMod, streetPreDir, streetPreType, streetPreSep, streetName, streetPostType, streetPostDir, streetPostMod):

429
src/zipfunctions.py Normal file
View File

@ -0,0 +1,429 @@
# Created on : Aug 29, 2024, 12:57:40AM
# Author : Skylar Ittner
import re, sys
#import pandas as pd
from uszipcode import SearchEngine, ZipcodeTypeEnum
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import sqlite3
from src.constants import LONGHAND_STREET_TYPES
import src.config
import sys
import urllib.request
from urllib.parse import quote
import json
#zipcodes = pd.read_csv("zip_code_database.csv", keep_default_na=False, dtype="str")
# Two uszipcode search engines: the "simple" database is smaller and faster
# (used for quick city+state lookups); the "comprehensive" one supports the
# radius/prefix queries and carries ZIP polygons used elsewhere in this file.
fastsearch = SearchEngine(db_file_path="zipcode_db_simple.sqlite", simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.simple)
search = SearchEngine(db_file_path="zipcode_db.sqlite", simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive)
# Read-only, immutable connection to the ZIP+4 database.
# check_same_thread=False permits use from multiple threads; the DB is opened
# immutable so concurrent reads are safe.
# NOTE(review): hard-coded absolute path — consider making it configurable.
zip4 = sqlite3.connect("file:/home/skylar/AddressDatabase/zip4.sqlite?mode=ro&immutable=1", uri=True, check_same_thread=False)
# Performance pragmas for the read-only ZIP+4 connection.
# NOTE(review): mmap_size=4294967296 is 4 GiB, not 1 GiB as the inline SQL
# comment claims — the comment, not the value, appears stale.
zip4.executescript("""
PRAGMA query_only=ON; -- belt-and-suspenders (cant write)
PRAGMA temp_store=MEMORY; -- sorts/temps in RAM
PRAGMA cache_size=-800000; -- ~800 MB page cache (negative = KB units)
PRAGMA mmap_size=4294967296; -- 1 GiB memory-mapped I/O (bump if you have RAM)
PRAGMA automatic_index=ON; -- leave enabled (default), can help odd joins
PRAGMA threads=4; -- allow parallel ops for sorts/expr eval (if available)
""")
# Rows come back as sqlite3.Row so columns are addressable by name.
zip4.row_factory = sqlite3.Row
cur = zip4.cursor()
# SQL query cache when finding ZIP+4, prevents running duplicate queries for nearby addresses on the same road
querycache = {}
querycachelimit = 3000  # max cached queries before oldest entries are evicted
def checkZIPCode(lat, lon, state, zip):
    """Validate *zip* against *state*; fall back to a coordinate lookup.

    *zip* may arrive as str, int, or float (e.g. straight out of a CSV/pandas
    parse); it is normalized to a zero-padded 5-character string first.

    Returns a {"zip", "city", "state"} dict from getCityStateForZIP(), or a
    dict of empty strings when nothing can be resolved.

    Bug fixes versus the previous version:
    - numeric ZIPs were passed through as int/float, which crashed len()
      inside getCityStateForZIP(); they are now always stringified;
    - a short string ZIP was zero-padded but then discarded (zipok was left
      False); the padded value is now used;
    - the state comparison used attribute access (``zipInfo.state``) on the
      dict returned by getCityStateForZIP(), raising AttributeError.
    """
    zipok = False
    if not zip or zip != zip:  # falsy/None, or float NaN (NaN != NaN)
        zipok = False
    elif isinstance(zip, str):
        if len(zip) != 5:
            zip = zip.rjust(5, '0')  # pad short ZIPs like "901" -> "00901"
        zipok = True
    else:
        # int, float, or anything else int()-convertible: normalize to a
        # zero-padded 5-character string.
        zip = str(int(zip)).rjust(5, '0')
        zipok = True
    zipInfo = False
    if zipok:
        zipInfo = getCityStateForZIP(zip)
        if not zipInfo:
            zipok = False
        elif zipInfo["state"] != state:
            # ZIP exists but belongs to a different state — distrust it.
            zipok = False
    if not zipok:
        # Fall back to the nearest ZIP by coordinates.
        result = search.by_coordinates(lat = lat, lng = lon, returns = 1)
        if len(result) == 1:
            return getCityStateForZIP(result[0].zipcode)
        elif len(result) > 1:
            print(result[0])
            print(result[1])
            return getCityStateForZIP(result[0].zipcode)
        else:
            return {"zip": "", "city": "", "state": ""}
    else:
        return zipInfo
def getCityStateForZIP(zipcode):
    """Look up the USPS-preferred city/state for a 5-digit ZIP Code string.

    Returns {"zip", "city", "state"} or False when *zipcode* is falsy,
    malformed, or unknown.
    """
    if not zipcode or len(zipcode) != 5:
        return False
    # Parameterized query; the previous version interpolated the value
    # directly into the SQL string.
    cur.execute(
        "SELECT ZipCode,City,State FROM ZIPCodes "
        "WHERE ZipCode=? AND CityStateKey=PreferredLastLineKey LIMIT 1",
        (zipcode,))
    row = cur.fetchone()
    if row is None:
        return False
    return {
        "zip": row["ZipCode"],
        "city": row["City"],
        "state": row["State"]
    }
def getZIPFromGeo(lat, lon, prefix=False, state=False):
    """Resolve a ZIP Code from coordinates.

    When *prefix* and/or *state* are given, a narrowed 20-mile radius query
    is used (faster); otherwise a plain nearest-ZIP lookup.  Returns the
    best-match ZIP Code string, or None when nothing is found.
    """
    if prefix or state:
        params = {"lat": lat, "lng": lon, "returns": 1, "radius": 20}
        if prefix:
            params["prefix"] = str(prefix)
        if state:
            params["state"] = state
        matches = search.query(**params)
    else:
        matches = search.by_coordinates(lat=lat, lng=lon, returns=1)
    # returns=1 yields at most one hit, but keep the guard for safety.
    return matches[0].zipcode if matches else None
def subaddrMatchRows(rows, unit):
    """Filter ZIP+4 *rows* down to those whose secondary-address (unit)
    range contains *unit*.

    Purely numeric units are zero-padded to 8 characters to match the
    database's fixed-width encoding.  Rows with no unit range at all are
    kept aside and returned only when no row's range matches.
    """
    if re.match(r"^[0-9]+$", unit):
        unit = unit.zfill(8)
    matched = []
    unitless = []
    for candidate in rows:
        low = candidate["AddressSecLowNumber"]
        if low == "":
            unitless.append(candidate)
        elif low <= unit and candidate["AddressSecHighNumber"] >= unit:
            matched.append(candidate)
    return matched if matched else unitless
def getZIPsForCityState(city, state):
    """Return (rows, ziplist) for every ZIP Code record matching *city* (by
    name, alias, or alias abbreviation) in *state*.

    Records whose CityStateKey is not the PreferredLastLineKey (i.e. not the
    USPS-preferred city name for that ZIP) are filtered out.
    """
    city = city.upper().strip()
    # Parameterized query; the previous version concatenated values into the
    # SQL and only escaped single quotes in the city name (not the state).
    cur.execute(
        "SELECT * FROM ZIPCodes WHERE State = ? "
        "AND (CityAliasName = ? OR City = ? OR CityAliasAbbreviation = ?)",
        (state, city, city, city))
    citylist = cur.fetchall()
    resultlist = []
    ziplist = []
    # Remove entries that aren't the preferred city name for the ZIP Code
    for cityrow in citylist:
        if cityrow["CityStateKey"] == cityrow["PreferredLastLineKey"]:
            resultlist.append(cityrow)
            ziplist.append(cityrow["ZipCode"])
    return resultlist, ziplist
def getZIPsForCounty(county, state):
    """Return a de-duplicated, order-preserving list of ZIP Codes located in
    *county*, *state* — including ZIPs that are only partially inside the
    county (from the ZIPCodesMultiCounty table)."""
    county = county.upper().strip()
    # Parameterized queries; the previous version concatenated values into
    # the SQL and only escaped single quotes in the county name.
    cur.execute(
        "SELECT ZipCode FROM ZIPCodes WHERE State = ? AND County = ?",
        (state, county))
    countylist = cur.fetchall()
    # Also get records where the ZIP isn't mainly in the county but some of it is
    cur.execute(
        "SELECT ZipCode FROM ZIPCodesMultiCounty WHERE State = ? AND County = ?",
        (state, county))
    multicountylist = cur.fetchall()
    # dict keys preserve first-seen order while de-duplicating.
    seen = dict.fromkeys(row["ZipCode"] for row in countylist)
    for row in multicountylist:
        seen.setdefault(row["ZipCode"], None)
    return list(seen)
def addressRangeContainsNumber(low, high, evenodd, number):
    """Report whether *number* falls inside the primary-address range
    [*low*, *high*] with even/odd flag *evenodd* ("E", "O", or "B" = both).

    Purely numeric numbers get their parity computed and are zero-padded to
    10 characters (the ZIP+4 file's fixed width).  Non-numeric numbers match
    immediately when equal to either bound.  The final range test is a
    lexicographic string comparison on the padded values.
    """
    parity = "B"
    if re.match(r"^[0-9]+$", number):
        parity = "E" if int(number) % 2 == 0 else "O"
        number = number.zfill(10)
    elif number in (low, high):
        # Non-numeric identifier exactly equal to a bound.
        return True
    if evenodd != "B" and evenodd != parity:
        return False
    # NOTE: lexicographic comparison of zero-padded strings.
    return low <= number <= high
# Check if the address number range is actually just a single address that matches the number provided.
def addressRangeIsExactNumber(low, high, number):
    """True when [*low*, *high*] is a single address equal to *number*,
    compared both raw and zero-padded to the ZIP+4 file's 10-char width."""
    if low != high:
        return False
    return low in (number, number.zfill(10))
def getZIP4(number, street, unit, state, lat, lon, city=False, zip=False, county=False):
    """Find the ZIP+4 record for an address.

    Builds progressively less specific SQL queries against the ZIP4 table
    (exact full street, then street basename, with ZIP / city-derived ZIP
    list / county-derived ZIP list filters) and returns on the first
    confident match.  Falls back to the US Census Geocoder (if enabled in
    config) and finally to a coordinate-based 5-digit ZIP guess.

    Returns a 5-tuple (zip5, plus4, street, unit_designator, unit); plus4
    and unit_designator may be "" for partial matches.

    NOTE(review): all queries below interpolate street/state/zip values
    directly into the SQL text; unlike the sibling helpers, nothing escapes
    single quotes here, so a street like O'BRIEN produces a malformed query.
    Parameterizing would also require reworking querycache keys (currently
    keyed on the raw SQL string).
    """
    number = number.strip()
    street = street.strip()
    if not unit:
        unit = ""
    # Get list of 5-digit ZIP Codes matching the city and state
    citystateresults = False  # NOTE(review): assigned but never read below
    zipfilter = False
    if city:
        citystateresults, zipfilter = getZIPsForCityState(city, state)
        if len(zipfilter) == 0:
            zipfilter = False
    elif county:
        zipfilter = getZIPsForCounty(county, state)
        if len(zipfilter) == 0:
            zipfilter = False
    queries = []
    basenamequeries = [] # Queries that only match on street basename, try after "main" queries don't return a match
    # Get street base name for broader matching in case suffix or directional differs
    typelessStreet = street
    for (short, long) in LONGHAND_STREET_TYPES.items():
        typelessStreet = re.sub(r"\s" + re.escape(short) + r"\b", "", typelessStreet)
    # Strip a leading and a trailing directional (N, SW, ...) to get the bare name.
    streetBasename = re.sub("^[NSEW]{1,2} ", "", typelessStreet)
    streetBasename = re.sub(" [NSEW]{1,2}$", "", streetBasename)
    #print(street, typelessStreet, streetBasename)
    # Build a list of queries to run, starting with the most specific and getting more desperate until a match is found
    if zip:
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
    if zipfilter:
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
    if not unit and re.match(".* ([0-9]{1,5}|[A-Z]{1})$", street):
        # Maybe the street has the apartment number in it for some reason
        newStreet = re.sub(" ([0-9]{1,5}|[A-Z]{1})$", "", street)
        newUnit = street[len(newStreet):].strip()  # NOTE(review): never used afterwards
        typelessStreet = newStreet
        for (short, long) in LONGHAND_STREET_TYPES.items():
            typelessStreet = re.sub(r"\s" + re.escape(short) + r"\b", "", typelessStreet)
        newstreetBasename = re.sub("^[NSEW]{1,2} ", "", typelessStreet)
        newstreetBasename = re.sub(" [NSEW]{1,2}$", "", newstreetBasename)
        if zip:
            queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + newStreet + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
            basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + newstreetBasename + "' AND State = '" + state + "'" + " AND ZipCode='"+zip+"'")
        if zipfilter:
            queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + newStreet + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
            basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + newstreetBasename + "' AND State = '" + state + "'" + " AND ZipCode IN ('" + ("','".join(zipfilter)) + "')")
    if not zip and not zipfilter:
        # Who needs ZIP Codes and city names anyways
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull = '" + street + "' AND State = '" + state + "'")
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + street + "' AND State = '" + state + "'")
        basenamequeries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName = '" + streetBasename + "' AND State = '" + state + "'")
        queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StreetFull LIKE '" + street + "%' AND State = '" + state + "'")
        #queries.append("SELECT ZipCode, Plus4Low, StreetFull, AddressSecAbbr, AddressSecLowNumber, AddressSecHighNumber, AddressPrimaryLowNumber, AddressPrimaryHighNumber, AddressPrimaryEvenOdd FROM ZIP4 WHERE StName LIKE '" + streetBasename + "%' AND State = '" + state + "'")
    resultrows = []  # accumulates across queries; not reset per query
    suggestZip = ""
    suggestStreet = ""
    # Basename-only queries are tried after all the exact-street ones.
    queries = queries + basenamequeries
    for query in queries:
        #print(query)
        if query in querycache:
            rows = querycache[query]
            #print("CACHED: " + query)
        else:
            cur.execute(query)
            #print("NOCACHE: " +query)
            rows = cur.fetchall()
            # Add to query cache
            querycache[query] = rows
            if len(querycache) > querycachelimit:
                # Evict the oldest entry (dicts preserve insertion order).
                querycache.pop(next(iter(querycache)))
        unitfilterrows = rows
        if unit:
            # Filter to rows that match the unit number
            unitfilterrows = subaddrMatchRows(rows, unit)
        # Try matching range against unit-filtered rows, if that doesn't work, try the non-filtered ones (address might be more specific than ZIP4 file)
        for row in unitfilterrows:
            if addressRangeContainsNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], row["AddressPrimaryEvenOdd"], number):
                resultrows.append(row)
        if len(resultrows) == 0 and len(unitfilterrows) < len(rows):
            for row in rows:
                if addressRangeContainsNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], row["AddressPrimaryEvenOdd"], number):
                    resultrows.append(row)
        if len(resultrows) == 1:
            # One match found, it's probably the right one!
            return resultrows[0]["ZipCode"], resultrows[0]["Plus4Low"], resultrows[0]["StreetFull"], resultrows[0]["AddressSecAbbr"], unit
        if len(resultrows) > 1:
            # First check if our source address has a unit, and if not, remove all match rows that DO have a unit.
            if not unit:
                base_rows = [
                    r for r in resultrows
                    if not (
                        r["AddressSecAbbr"] or
                        r["AddressSecLowNumber"] or
                        r["AddressSecHighNumber"]
                    )
                ]
                # If we found at least one base-address row, narrow resultrows to those
                if base_rows:
                    # Narrow further by looking for exact number matches (low and high are the same and what we're looking for)
                    exact_rows = []
                    for row in base_rows:
                        if addressRangeIsExactNumber(row["AddressPrimaryLowNumber"], row["AddressPrimaryHighNumber"], number):
                            exact_rows.append(row)
                    if len(exact_rows) > 0:
                        resultrows = exact_rows
                    else:
                        resultrows = base_rows
                    # If that left us with exactly one, we can return it immediately
                    if len(resultrows) == 1:
                        row = resultrows[0]
                        return (
                            row["ZipCode"],
                            row["Plus4Low"],
                            row["StreetFull"],
                            "", # no unit designator when no unit was given
                            unit, # still the original (empty) unit
                        )
            suggestZip = resultrows[0]["ZipCode"]
            suggestStreet = resultrows[0]["StreetFull"]
            for row in resultrows:
                # Check if the 5-digit ZIP and/or street are the same for all results, clear it if not
                if suggestZip != row["ZipCode"]:
                    suggestZip = ""
                if suggestStreet != row["StreetFull"]:
                    suggestStreet = ""
                # Return an address-specific row if it exists
                if row["AddressPrimaryLowNumber"] == number and row["AddressPrimaryHighNumber"] == number:
                    return row["ZipCode"], row["Plus4Low"], row["StreetFull"], row["AddressSecAbbr"], unit
            #print("Multiple possible ZIP+4 matches for", number, street, "#"+unit, city, state, zip)
            #for row in resultrows:
            #    print(row["ZipCode"],row["AddressPrimaryLowNumber"],row["AddressPrimaryHighNumber"], row["StreetFull"], row["AddressPrimaryEvenOdd"], row["Plus4Low"], row["AddressSecAbbr"])
    # No match found
    cfg = src.config.get_config()
    if cfg.useCensusToFillEmptyZIPs and number != "" and street != "" and city != "" and city != False and state != "":
        # Query the Census Geocoder, because this address probably exists
        print("US Census Geo:" + number + " " + street + ", " + city + " " + state + " ", end="\r", flush=True)
        try:
            result = urllib.request.urlopen("https://geocoding.geo.census.gov/geocoder/locations/address?street="+quote(number + " " + street)+"&city="+quote(city)+"&state="+state+"&zip=&benchmark=4&format=json").read()
            jsonresult = json.loads(result)
            if len(jsonresult["result"]["addressMatches"]) == 1:
                comps = jsonresult["result"]["addressMatches"][0]["addressComponents"]
                streetparts = [comps["preDirection"], comps["preType"], comps["streetName"], comps["suffixType"], comps["suffixDirection"]]
                street = " ".join(x for x in streetparts if x)
                return jsonresult["result"]["addressMatches"][0]["addressComponents"]["zip"], "", street, "", unit
        except:
            # NOTE(review): bare except deliberately treats any geocoder
            # failure (network, JSON shape) as best-effort — consider
            # narrowing to (urllib.error.URLError, KeyError, ValueError).
            pass
    if suggestZip == "":
        # Last resort: derive a 5-digit ZIP from the coordinates.
        suggestZip = getZIP(lat, lon, False, state, city)
    if suggestStreet == "":
        suggestStreet = street
    return suggestZip, "", suggestStreet, "", unit
def getZIP(lat, lon, prefix=False, state=False, city=False):
    """Best-effort 5-digit ZIP lookup from coordinates, optionally narrowed
    by a ZIP prefix, state, and/or city.

    Strategy: (1) if city+state map to exactly one standard ZIP, use it;
    (2) otherwise query by coordinates (narrowed by prefix/state when given);
    (3) on multiple candidates, intersect with the city+state result set and
    finally test which candidate ZIP's polygon actually contains the point.
    Returns a ZIP Code string or None.

    Bug fix: the city guard previously only handled city == "" and called
    ``city.upper()`` on the default ``city=False`` (as passed by getZIP4),
    raising AttributeError; any falsy city is now normalized to False.
    """
    if not city:
        city = False
    else:
        city = city.upper()
    if state == "":
        state = False
    lat = float(lat)
    lon = float(lon)
    citystateresult = False
    if city and state: # Check if city and state combo only has one standard ZIP Code
        try:
            citystateresult = fastsearch.by_city_and_state(city, state, returns=30, zipcode_type=ZipcodeTypeEnum.Standard) # Use simple database because it's like 2x faster
            if len(citystateresult) == 1:
                #print("Exact city match found: "+city+ " "+state+" "+citystateresult[0].zipcode)
                return citystateresult[0].zipcode
        except ValueError:
            # Sometimes it objects to a city name and says it isn't valid
            pass
    if prefix and state: # Get ZIPs by lat/lon that start with prefix and are in state for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, prefix=str(prefix), state=state, zipcode_type=ZipcodeTypeEnum.Standard)
    elif state: # Get ZIPs filtered by state for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, state=state, zipcode_type=ZipcodeTypeEnum.Standard)
    elif prefix: # Get ZIPs by lat/lon that start with prefix for faster queries
        result = search.query(lat = lat, lng = lon, returns = 20, radius=10, prefix=str(prefix), zipcode_type=ZipcodeTypeEnum.Standard)
    else: # Get ZIPs by lat/lon
        result = search.by_coordinates(lat = lat, lng = lon, returns = 20, zipcode_type=ZipcodeTypeEnum.Standard)
    if len(result) == 1:
        return result[0].zipcode
    elif len(result) > 1:
        matchzips = []
        if citystateresult:
            # Find zip codes that both queries have in common, maybe there's only one that overlaps with both!
            for val in citystateresult:
                for res in result:
                    if res.zipcode == val.zipcode:
                        matchzips.append(res)
            if len(matchzips) == 1:
                #print("Exact match found between lat/lon and city/state queries: "+matchzips[0])
                return matchzips[0].zipcode
        else:
            matchzips = result
        #print("W: Multiple equally-valid ZIP matches for "+str(lat)+", "+str(lon)+" "+str(city)+", "+str(state)+": ")
        # Tie-break: pick the candidate whose boundary polygon contains the point.
        addrpoint = Point(lon, lat)
        for zip in matchzips:
            #print("    "+zip.zipcode)
            zippolys = zip.polygon
            if zippolys == None:
                continue
            # A single polygon (2-D coordinate list) is wrapped so the loop
            # below can treat single- and multi-polygon ZIPs uniformly.
            if dimensionality(zippolys) == 2:
                zippolys = [zippolys]
            for poly in zippolys:
                zipborder = Polygon(poly)
                if zipborder.contains(addrpoint):
                    #print("Found probable ZIP based on border: " + zip.zipcode)
                    return zip.zipcode
        return None
    else:
        return None
def dimensionality(matrix):
    """Return the nesting depth of *matrix*, probing along first elements.

    A non-list scalar yields 0, a flat list 1, a list of lists 2, and so on.

    Fixes versus the previous version: an empty list at any level no longer
    raises IndexError from ``matrix[0]`` (it simply stops, still counting
    that level), and the redundant ``matrix is not None`` test (a list is
    never None) is dropped.
    """
    depth = 0
    while isinstance(matrix, list):
        depth += 1
        if not matrix:
            break
        matrix = matrix[0]
    return depth

42736
zip_code_database.csv Normal file

File diff suppressed because it is too large Load Diff

272
zipdbgen.py Executable file
View File

@ -0,0 +1,272 @@
#!/usr/bin/python3
# Generate a ZIP+4 database from the data at https://www.zip-codes.com/zip-plus-4-database.asp
from argparse import ArgumentParser
import sqlite3, zipfile, re
import pandas as pd
def process(infile, outfile):
    """Import a zip-codes.com archive *infile* into the SQLite DB *outfile*.

    The archive may contain: per-state ZIP4-XX.zip members (ZIP+4 detail),
    zip-codes-database-STANDARD.csv (5-digit ZIPs), and a MULTI-COUNTY CSV.
    Each part found is loaded into its own table via pandas in chunks.
    """
    print("Reading " + infile)
    zf = zipfile.ZipFile(infile, mode="r")
    zipFiles = zf.namelist()
    ziplist = []       # inner ZIP4-<STATE>.zip members
    zip5list = []      # the 5-digit standard CSV
    zipcountylist = [] # List of ZIPs in multiple counties
    for fname in zipFiles:
        if re.match("ZIP4-[A-Z]{2}.zip", fname):
            ziplist.append(fname)
        elif fname == "zip-codes-database-STANDARD.csv":
            zip5list.append(fname)
        elif "MULTI-COUNTY" in fname and fname.endswith(".csv"):
            zipcountylist.append(fname)
    filesprocessed = 0
    chunksprocessed = 0
    chunksize = 5000
    if len(ziplist) > 0:
        print("Creating ZIP+4 database")
        connection = sqlite3.connect(outfile)
        connection.executescript("PRAGMA foreign_keys=OFF;")
        c = connection.cursor()
        # Speed-over-durability pragmas: the DB is regenerated from source
        # data, so losing it on a crash mid-import is acceptable.
        c.execute("PRAGMA journal_mode=OFF;") # or MEMORY; fastest is OFF (risk if crash)
        c.execute("PRAGMA synchronous=OFF;") # biggest win: no fsync on each commit
        c.execute("PRAGMA temp_store=MEMORY;") # keep temp B-trees in RAM
        c.execute("PRAGMA cache_size=-1600000;") # ~1600MB page cache (negative = KB)
        c.execute("PRAGMA locking_mode=EXCLUSIVE;") # avoid lock thrash
        c.execute("PRAGMA mmap_size=1073741824;") # 1GB mmap; helps reads, slight write help
        c.execute("PRAGMA page_size=65536;")
        createZIP4DB(c)
        # Join the street-part columns into one full street string per row.
        def mergeStreet(row):
            return ' '.join(filter(None, [row["StPreDirAbbr"], row["StName"], row["StSuffixAbbr"], row["StPostDirAbbr"]]))
        for file in ziplist:
            # Each member is itself a zip archive holding a single CSV.
            with zf.open(file, mode="r", force_zip64=True) as innerfile:
                with zipfile.ZipFile(innerfile, mode="r") as innerzip:
                    with innerzip.open(innerzip.namelist()[0], mode="r") as csvfile:
                        print("\nImporting " + file + " ..." + " ", end="\r", flush=True)
                        for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                            chunk["StreetFull"] = chunk.apply(mergeStreet, axis=1)
                            chunk.to_sql("ZIP4", connection, if_exists='append', index=False, method='multi')
                            chunksprocessed = chunksprocessed + 1
                            print("Importing " + file + " ... " + str(chunksprocessed * chunksize) +" ", end="\r", flush=True)
            #print("\nVacuuming database...")
            #connection.executescript("VACUUM")
            filesprocessed = filesprocessed + 1
    # NOTE(review): zf is closed here, but the STANDARD/MULTI-COUNTY sections
    # below call zf.open() again — if those members are present this raises
    # ValueError on a closed ZipFile.  Confirm intended placement (likely it
    # belongs at the end of the function).
    zf.close()
    if len(zip5list) > 0:
        print("Creating 5-digit ZIP database")
        connection = sqlite3.connect(outfile)
        c = connection.cursor()
        createZIP5DB(c)
        filesprocessed = 1  # NOTE(review): resets rather than increments the counter
        with zf.open(zip5list[0], mode="r", force_zip64=True) as csvfile:
            print("\nImporting " + zip5list[0] + " ..." + " ", end="\r", flush=True)
            for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                chunk.to_sql("ZIPCodes", connection, if_exists='append', index=False)
                chunksprocessed = chunksprocessed + 1
                print("Importing " + zip5list[0] + " ... " + str(chunksprocessed * chunksize) +" ", end="\r", flush=True)
    if len(zipcountylist) > 0:
        print("Creating Multi-county ZIP database")
        connection = sqlite3.connect(outfile)
        c = connection.cursor()
        createZIPMultiCountyDB(c)
        filesprocessed = 1  # NOTE(review): resets rather than increments the counter
        with zf.open(zipcountylist[0], mode="r", force_zip64=True) as csvfile:
            print("\nImporting " + zipcountylist[0] + " ..." + " ", end="\r", flush=True)
            for chunk in pd.read_csv(csvfile, chunksize=chunksize, keep_default_na=False, dtype="str"):
                chunk.to_sql("ZIPCodesMultiCounty", connection, if_exists='append', index=False)
                chunksprocessed = chunksprocessed + 1
                print("Importing " + zipcountylist[0] + " ... " + str(chunksprocessed * chunksize) +" ", end="\r", flush=True)
    print("\nFiles processed: " + str(filesprocessed))
    print("Records processed: " + str(chunksprocessed * chunksize))
    print("Done! Saved to " + outfile)
    print("\nOne last thing: optimizing output database (this might take a few minutes)...")
    # NOTE(review): 'connection' is unbound if the archive contained none of
    # the expected members — this line would raise NameError in that case.
    connection.executescript("VACUUM; ANALYZE; PRAGMA optimize;")
def createZIP5DB(c):
    """(Re)create the 5-digit ZIPCodes table (with its indexes) and the
    States reference table on cursor *c*.

    Any existing ZIPCodes/States tables are dropped first; the States table
    is repopulated with all state/territory code -> name pairs.

    Improvement: the 62 individual INSERT statements were replaced with a
    single parameterized ``executemany`` over a data list.
    """
    c.execute("DROP TABLE IF EXISTS ZIPCodes")
    c.execute('''CREATE TABLE ZIPCodes (
        ZipCode char(5) NOT NULL,
        City varchar(35) NULL,
        State char(2),
        County varchar(45) NULL,
        AreaCode varchar(55) NULL,
        CityType char(1) NULL,
        CityAliasAbbreviation varchar(13) NULL,
        CityAliasName varchar(35) NULL,
        Latitude decimal(12, 6),
        Longitude decimal(12, 6),
        TimeZone char(2) NULL,
        Elevation int,
        CountyFIPS char(5) NULL,
        DayLightSaving char(1) NULL,
        PreferredLastLineKey varchar(10) NULL,
        ClassificationCode char(1) NULL,
        MultiCounty char(1) NULL,
        StateFIPS char(2) NULL,
        CityStateKey char(6) NULL,
        CityAliasCode varchar(5) NULL,
        PrimaryRecord char(1),
        CityMixedCase varchar(35) NULL,
        CityAliasMixedCase varchar(35) NULL,
        StateANSI varchar(2) NULL,
        CountyANSI varchar(3) NULL,
        FacilityCode varchar(1) NULL,
        CityDeliveryIndicator varchar(1) NULL,
        CarrierRouteRateSortation varchar(1) NULL,
        FinanceNumber varchar(6) NULL,
        UniqueZIPName varchar(1) NULL,
        CountyMixedCase varchar(45) NULL
    );''')
    # Indexes covering the lookup columns used by zipfunctions.py.
    c.execute("CREATE INDEX Index_ZIPCodes_ZipCode ON ZIPCodes (ZipCode)")
    c.execute("CREATE INDEX Index_ZIPCodes_State ON ZIPCodes (State)")
    c.execute("CREATE INDEX Index_ZIPCodes_County ON ZIPCodes (County)")
    c.execute("CREATE INDEX Index_ZIPCodes_AreaCode ON ZIPCodes (AreaCode)")
    c.execute("CREATE INDEX Index_ZIPCodes_City ON ZIPCodes (City)")
    c.execute("CREATE INDEX Index_ZIPCodes_Latitude ON ZIPCodes (Latitude)")
    c.execute("CREATE INDEX Index_ZIPCodes_Longitude ON ZIPCodes (Longitude)")
    c.execute("CREATE INDEX Index_ZIPCodes_CityAliasName ON ZIPCodes (CityAliasName)")
    c.execute("CREATE INDEX Index_ZIPCodes_CityStateKey ON ZIPCodes (CityStateKey)")
    c.execute("DROP TABLE IF EXISTS States")
    c.execute("CREATE TABLE States (code TEXT, name TEXT)")
    # State/territory code -> full name reference data.
    states = [
        ("AE", "Armed Forces Europe, the Middle East, and Canada"),
        ("AP", "Armed Forces Pacific"),
        ("AA", "Armed Forces Americas"),
        ("AL", "Alabama"),
        ("AK", "Alaska"),
        ("AS", "American Samoa"),
        ("AZ", "Arizona"),
        ("AR", "Arkansas"),
        ("CA", "California"),
        ("CO", "Colorado"),
        ("CT", "Connecticut"),
        ("DE", "Delaware"),
        ("DC", "District of Columbia"),
        ("FM", "Federated States of Micronesia"),
        ("FL", "Florida"),
        ("GA", "Georgia"),
        ("GU", "Guam"),
        ("HI", "Hawaii"),
        ("ID", "Idaho"),
        ("IL", "Illinois"),
        ("IN", "Indiana"),
        ("IA", "Iowa"),
        ("KS", "Kansas"),
        ("KY", "Kentucky"),
        ("LA", "Louisiana"),
        ("ME", "Maine"),
        ("MH", "Marshall Islands"),
        ("MD", "Maryland"),
        ("MA", "Massachusetts"),
        ("MI", "Michigan"),
        ("MN", "Minnesota"),
        ("MS", "Mississippi"),
        ("MO", "Missouri"),
        ("MT", "Montana"),
        ("NE", "Nebraska"),
        ("NV", "Nevada"),
        ("NH", "New Hampshire"),
        ("NJ", "New Jersey"),
        ("NM", "New Mexico"),
        ("NY", "New York"),
        ("NC", "North Carolina"),
        ("ND", "North Dakota"),
        ("MP", "Northern Mariana Islands"),
        ("OH", "Ohio"),
        ("OK", "Oklahoma"),
        ("OR", "Oregon"),
        ("PW", "Palau"),
        ("PA", "Pennsylvania"),
        ("PR", "Puerto Rico"),
        ("RI", "Rhode Island"),
        ("SC", "South Carolina"),
        ("SD", "South Dakota"),
        ("TN", "Tennessee"),
        ("TX", "Texas"),
        ("UT", "Utah"),
        ("VT", "Vermont"),
        ("VI", "Virgin Islands"),
        ("VA", "Virginia"),
        ("WA", "Washington"),
        ("WV", "West Virginia"),
        ("WI", "Wisconsin"),
        ("WY", "Wyoming"),
    ]
    c.executemany('INSERT INTO "States" ("code", "name") VALUES (?, ?)', states)
def createZIPMultiCountyDB(c):
    """Drop and recreate the ZIPCodesMultiCounty table plus its lookup indexes.

    c: a DB-API cursor (sqlite3) on which the DDL statements are executed.

    Each row maps a ZIP code to one county it overlaps; a multi-county ZIP
    appears once per county.
    """
    c.execute("DROP TABLE IF EXISTS ZIPCodesMultiCounty")
    c.execute(
        "CREATE TABLE ZIPCodesMultiCounty ( "
        "ZipCode char(5) NOT NULL, "
        "StateFIPS char(2), "
        "State char(2), "
        "CountyFIPS char(5) NULL, "
        "County varchar(45), "
        "CountyMixedCase varchar(45) )"
    )
    # One index per commonly-filtered column.
    for column in ("ZipCode", "State", "County"):
        c.execute(
            "CREATE INDEX Index_ZIPCodesMultiCounty_%s "
            "ON ZIPCodesMultiCounty (%s)" % (column, column)
        )
def createZIP4DB(c):
    """Drop and recreate the ZIP4 table and its query indexes.

    c: a DB-API cursor (sqlite3) on which the DDL statements are executed.

    The table holds one USPS ZIP+4 range record per row; the indexes cover
    lookups by primary address-number range, preferred last-line key,
    ZIP code / carrier route, state, and street name (raw and full form).
    """
    c.execute("DROP TABLE IF EXISTS `ZIP4`")
    c.execute('''
    CREATE TABLE "ZIP4" (
    "ZipCode"char(5),
    "UpdateKey"varchar(10),
    "Action"char(1),
    "RecordType"varchar(1),
    "CarrierRoute"varchar(4),
    "StPreDirAbbr"varchar(2),
    "StName"varchar(28),
    "StSuffixAbbr"varchar(4),
    "StPostDirAbbr"varchar(2),
    "AddressPrimaryLowNumber"varchar(10),
    "AddressPrimaryHighNumber"varchar(10),
    "AddressPrimaryEvenOdd"varchar(1),
    "BuildingName"varchar(40),
    "AddressSecAbbr"varchar(4),
    "AddressSecLowNumber"varchar(10),
    "AddressSecHighNumber"varchar(10),
    "AddressSecOddEven"varchar(1),
    "Plus4Low"varchar(4),
    "Plus4High"varchar(4),
    "BaseAlternateCode"varchar(1),
    "LACSStatus"varchar(1),
    "GovernmentBuilding"varchar(1),
    "FinanceNumber"varchar(6),
    "State"varchar(2),
    "CountyFIPS"varchar(3),
    "CongressionalDistrict"varchar(2),
    "MunicipalityKey"varchar(6),
    "UrbanizationKey"varchar(6),
    "PreferredLastLineKey"varchar(6),
    "ToLatitude"decimal(18, 10),
    "FromLatitude"decimal(18, 10),
    "ToLongitude"decimal(18, 10),
    "FromLongitude"decimal(18, 10),
    "CensusTract"varchar(15),
    "CensusBlock"varchar(15),
    "TLID"varchar(15),
    "LatLonMultiMatch"varchar(1),
    "StreetFull" varchar(36)
    )
    ''')
    # Bug fix: this index previously referenced "AddressPrimaryOddEven", which
    # does not exist (the column is declared "AddressPrimaryEvenOdd" above),
    # so SQLite raised "no such column" and the function never finished.
    c.execute('''CREATE INDEX "addressnumber" ON "ZIP4" ("AddressPrimaryLowNumber","AddressPrimaryHighNumber","AddressPrimaryEvenOdd")''')
    c.execute('''CREATE INDEX "key" ON "ZIP4" ("PreferredLastLineKey")''')
    c.execute('''CREATE INDEX "zipcode_route" ON "ZIP4" ("ZipCode", "CarrierRoute")''')
    c.execute('''CREATE INDEX "state" ON "ZIP4" ("State")''')
    c.execute('''CREATE INDEX "streetfull_state" ON "ZIP4" ("StreetFull", "State")''')
    c.execute('''CREATE INDEX "stname_state" ON "ZIP4" ("StName", "State")''')
    c.execute('''CREATE INDEX "zip" ON "ZIP4" ("ZipCode")''')
    c.execute('''CREATE INDEX "streetfull_state_zip" ON "ZIP4" ("StreetFull", "State", "ZipCode")''')
    c.execute('''CREATE INDEX "stname_state_zip" ON "ZIP4" ("StName", "State", "ZipCode")''')
# Command-line interface: a source .zip archive and a destination SQLite file.
parser = ArgumentParser(
    description='Create a SQLite ZIP Code database from CSV data from '
                'https://www.zip-codes.com/zip-plus-4-database.asp. '
                'Supports both 5-digit ZIP and ZIP+4 products.'
)
parser.add_argument('src', help='Input .zip archive')
parser.add_argument('dest', help='Output SQLite3 database file')
if __name__ == "__main__":
args = parser.parse_args()
process(args.src, args.dest)