From 915bd43907003317922e6192c8b0a6328e83c226 Mon Sep 17 00:00:00 2001 From: Skylar Ittner Date: Thu, 20 Nov 2025 20:48:07 -0700 Subject: [PATCH] Fix libpostal and normalization errors, add verbose mode for debugging --- main.py | 34 ++++++++++++++++----- src/addressfunctions.py | 65 +++++++++++++++++++++++++++++------------ src/advancedparsing.py | 33 +++++++++++++++++---- src/config.py | 1 + 4 files changed, 103 insertions(+), 30 deletions(-) diff --git a/main.py b/main.py index 712d0c5..738551e 100755 --- a/main.py +++ b/main.py @@ -58,27 +58,41 @@ def normalize(number, street, street2, city, state, zipcode, latitude, longitude if len(city) > 4 and street1.endswith(" " + city): # City name leaked into street field (Albany County Wyoming, for one) street1 = street1.removesuffix(" " + city) + if cfg.verbose: + print("Starting to normalize address:") + print(" ", number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), plus4, county) addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county) if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4: - + if cfg.verbose: + print(" Address didn't match to a full ZIP+4 code. Trying more things.") + print(" ", re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip']) # Try removing letters from address numbers, and ignore city field - addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) - - # If that didn't work, try instead stripping the city name because it might be wrong - if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4): - addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) + addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) # Use libpostal to analyze address deeper if cfg.advancedMode and len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4: try: + if cfg.verbose: + print(" Using libpostal to break down and analyze address.") + print(" ",addrstrip) + from src.advancedparsing import advancedNormalize addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) except Exception as e: + if cfg.verbose: + print(" libpostal crashed.") + raise e pass # Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4) if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4: + if cfg.verbose: + print(" Doing a final normalization attempt after libpostal.") + print(" ", addr) addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) else: addr = addrstrip + if cfg.verbose: + print(" Final result after normalization:") + print(" ", addr) return addr def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates): @@ -661,6 +675,7 @@ if __name__ == "__main__": parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true') parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true') parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true") + parser.add_argument("-v", help="Verbose output (for development)", action="store_true") args = parser.parse_args() @@ -673,6 +688,11 @@ if __name__ == "__main__": citySuggestion = False advancedMode = False noSkip4 = False + verbose = False + + if args.v: + verbose = True + print("Verbose mode engaged!") if args.libpostal: advancedMode = True @@ -727,7 +747,7 @@ if __name__ == "__main__": if args.city: citySuggestion = args.city.strip().toUpper() - cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4) + cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4, verbose=verbose) src.config.set_config(cfg) diff --git a/src/addressfunctions.py b/src/addressfunctions.py index f3e837e..0e9dbaf 100644 --- a/src/addressfunctions.py +++ b/src/addressfunctions.py @@ -116,7 +116,8 @@ POST_STANDARDIZATION_STREET_REGEXES = { " LP$": " LOOP", "^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason "^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason - "^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2" # Athens TX does this too for some reason + "^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2", # Athens TX does this too for some reason + "^GLN HOLLOW ": "GLEN HOLLOW " } STANDARDIZATION_NUMBER_REGEXES = { @@ -211,6 +212,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr lon, lat = lat, lon number = standardizeNumber(str(number).upper().strip()) street = preStandardizeStreet(street.strip().upper()) + if cfg.verbose: + print(" Preprocessed street:", street) unit = unit.strip().upper() city = city.strip().upper() state = state.strip().upper() @@ -230,18 +233,32 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr # # Standardize address # + if cfg.verbose: + print(" Standardizing street: ", street) try: # Python library + # Won't work at all without a zip code + incity = city + if not city: + incity = "XXXXXXXXXX" + instate = state + if not state: + instate = "XX" + inzip = zipcode + if not zipcode: + inzip = "00000" addr = normalize_address_record( { "address_line_1": "".join(["999999999", " ", street]), "address_line_2": unit, - "city": city, - "state": state, - "postal_code": zipcode + "city": incity, + "state": instate, + "postal_code": inzip } ) except Exception as e: + if cfg.verbose: + print(" Error standardizing street with usaddress-scourgify: ", e) try: # Proprietary Mono library if standardization == False: @@ -266,10 +283,15 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr streetonly = addr['address_line_1'] if streetonly.startswith(str(number) + " "): streetonly = streetonly[len(str(number) + " "):] + + if cfg.verbose: + print(" Standardized street to: ", streetonly) # # Run extra regexes on street to fix standardization problems # - streetonly = postStandardizeStreet(streetonly) + streetonly = postStandardizeStreet(streetonly) + if cfg.verbose: + print(" Postprocessed street: ", streetonly) # # Special conditional rules @@ -287,31 +309,38 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr # # Standardize and validate and/or append ZIP Code # - zipcode = addr["postal_code"] + if addr["postal_code"] != "00000": + zipcode = addr["postal_code"] unitprefix = "" unit = addr['address_line_2'] - if zipcode is not None: + if zipcode is not None and addr["postal_code"] != "00000": zipcode = addr["postal_code"][0:5] + if addr["state"] != "XX": + state = addr["state"] + + if addr["city"] != "XXXXXXXXXX": + city = addr["city"] + # Skip these if we already have a ZIP+4 code, assume it's accurate if zipcode is not None and len(zipcode) == 5 and not plus4: zipinfo = getCityStateForZIP(zipcode) - if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""): - zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county) + if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""): + zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county) zipinfo = getCityStateForZIP(zipcode) if zipinfo != False: - addr["city"] = zipinfo["city"] - addr["state"] = zipinfo["state"] + city = zipinfo["city"] + state = zipinfo["state"] else: - addr["city"] = zipinfo["city"] - addr["state"] = zipinfo["state"] + city = zipinfo["city"] + state = zipinfo["state"] elif not plus4: - zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county) + zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, state, False, county) zipinfo = getCityStateForZIP(zipcode) if zipinfo != False: - addr["city"] = zipinfo["city"] - addr["state"] = zipinfo["state"] + city = zipinfo["city"] + state = zipinfo["state"] if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit): streetonly = f"US HIGHWAY {unit}" @@ -324,8 +353,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr "number": number, "street": streetonly, "unit": ' '.join(filter(None, (unitprefix, unit))), - "city": addr["city"], - "state": addr["state"], + "city": city, + "state": state, "zip": zipcode, "plus4": plus4, "latitude": lat, diff --git a/src/advancedparsing.py b/src/advancedparsing.py index 2dce464..557c744 100644 --- a/src/advancedparsing.py +++ b/src/advancedparsing.py @@ -5,8 +5,10 @@ from postal.parser import parse_address from postal.expand import expand_address from src.addressfunctions import normalizeAddress import re +import src.config -def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""): +def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = "", county=False): + cfg = src.config.get_config() if len(plus4 or "") == 4: # Return as-is, it's got a +4 match already return { @@ -60,11 +62,14 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp "longitude": lon }) # Also add one where we remove any non-numeric data from the number and unit fields - normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4)) + normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county)) if number != pNumber or unit != pUnit or street != pStreet: - normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4)) + normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county)) for exp in expanded: + if cfg.verbose: + print(" libpostal expanded match:") + print(" ", exp) parsed = parse_address(exp) pN = "" pS = "" @@ -77,11 +82,21 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp elif part[1] == "unit": pU = part[0] try: - normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4)) + if cfg.verbose: + print(" >", pN, pS, pU) + norm = normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4, county) + if cfg.verbose: + print(" >>", norm) + normalizedMatches.append(norm) except Exception as e: + if cfg.verbose: + print(" libpostal failure during normalizing expanded matches.") + print(" ", e) pass - + if cfg.verbose: + print(" libpostal address results:") + print(" ", normalizedMatches) if len(normalizedMatches) > 1: weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0} sortedMatches = sorted( @@ -89,11 +104,19 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp key=lambda item: sum(weights[k] for k, v in item.items() if v), reverse=True ) + if cfg.verbose: + print(" libpostal selected best match:") + print(" ", sortedMatches[0]) return sortedMatches[0] elif len(normalizedMatches) == 1: + if cfg.verbose: + print(" libpostal selected only match:") + print(" ", normalizedMatches[0]) return normalizedMatches[0] # No matches, give up on the whole thing + if cfg.verbose: + print(" libpostal no address match found") return { "number": number, "street": street, diff --git a/src/config.py b/src/config.py index 49b7460..ea69507 100644 --- a/src/config.py +++ b/src/config.py @@ -10,6 +10,7 @@ class AppConfig: useCensusToFillEmptyZIPs: bool advancedMode: bool noSkip4: bool + verbose: bool _CFG: Optional[AppConfig] = None