Fix libpostal and normalization errors, add verbose mode for debugging

This commit is contained in:
Skylar Ittner 2025-11-20 20:48:07 -07:00
parent 5a901f37a2
commit 915bd43907
4 changed files with 103 additions and 30 deletions

34
main.py
View File

@ -58,27 +58,41 @@ def normalize(number, street, street2, city, state, zipcode, latitude, longitude
if len(city) > 4 and street1.endswith(" " + city): if len(city) > 4 and street1.endswith(" " + city):
# City name leaked into street field (Albany County Wyoming, for one) # City name leaked into street field (Albany County Wyoming, for one)
street1 = street1.removesuffix(" " + city) street1 = street1.removesuffix(" " + city)
if cfg.verbose:
print("Starting to normalize address:")
print(" ", number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), plus4, county)
addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county) addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4: if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
if cfg.verbose:
print(" Address didn't match to a full ZIP+4 code. Trying more things.")
print(" ", re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'])
# Try removing letters from address numbers, and ignore city field # Try removing letters from address numbers, and ignore city field
addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
# If that didn't work, retry with the original address number (letters kept), still leaving the city blank because the city name might be wrong
if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
# Use libpostal to analyze address deeper # Use libpostal to analyze address deeper
if cfg.advancedMode and len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4: if cfg.advancedMode and len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4:
try: try:
if cfg.verbose:
print(" Using libpostal to break down and analyze address.")
print(" ",addrstrip)
from src.advancedparsing import advancedNormalize
addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
except Exception as e: except Exception as e:
if cfg.verbose:
print(" libpostal crashed.")
raise e
pass pass
# Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4) # Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4: if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
if cfg.verbose:
print(" Doing a final normalization attempt after libpostal.")
print(" ", addr)
addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county) addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
else: else:
addr = addrstrip addr = addrstrip
if cfg.verbose:
print(" Final result after normalization:")
print(" ", addr)
return addr return addr
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates): def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
@ -661,6 +675,7 @@ if __name__ == "__main__":
parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true') parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true') parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true") parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")
parser.add_argument("-v", help="Verbose output (for development)", action="store_true")
args = parser.parse_args() args = parser.parse_args()
@ -673,6 +688,11 @@ if __name__ == "__main__":
citySuggestion = False citySuggestion = False
advancedMode = False advancedMode = False
noSkip4 = False noSkip4 = False
verbose = False
if args.v:
verbose = True
print("Verbose mode engaged!")
if args.libpostal: if args.libpostal:
advancedMode = True advancedMode = True
@ -727,7 +747,7 @@ if __name__ == "__main__":
if args.city: if args.city:
citySuggestion = args.city.strip().toUpper() citySuggestion = args.city.strip().toUpper()
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4) cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4, verbose=verbose)
src.config.set_config(cfg) src.config.set_config(cfg)

View File

@ -116,7 +116,8 @@ POST_STANDARDIZATION_STREET_REGEXES = {
" LP$": " LOOP", " LP$": " LOOP",
"^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason "^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason
"^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason "^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason
"^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2" # Athens TX does this too for some reason "^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2", # Athens TX does this too for some reason
"^GLN HOLLOW ": "GLEN HOLLOW "
} }
STANDARDIZATION_NUMBER_REGEXES = { STANDARDIZATION_NUMBER_REGEXES = {
@ -211,6 +212,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
lon, lat = lat, lon lon, lat = lat, lon
number = standardizeNumber(str(number).upper().strip()) number = standardizeNumber(str(number).upper().strip())
street = preStandardizeStreet(street.strip().upper()) street = preStandardizeStreet(street.strip().upper())
if cfg.verbose:
print(" Preprocessed street:", street)
unit = unit.strip().upper() unit = unit.strip().upper()
city = city.strip().upper() city = city.strip().upper()
state = state.strip().upper() state = state.strip().upper()
@ -230,18 +233,32 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
# #
# Standardize address # Standardize address
# #
if cfg.verbose:
print(" Standardizing street: ", street)
try: try:
# Python library # Python library
# Won't work at all without a zip code
incity = city
if not city:
incity = "XXXXXXXXXX"
instate = state
if not state:
instate = "XX"
inzip = zipcode
if not zipcode:
inzip = "00000"
addr = normalize_address_record( addr = normalize_address_record(
{ {
"address_line_1": "".join(["999999999", " ", street]), "address_line_1": "".join(["999999999", " ", street]),
"address_line_2": unit, "address_line_2": unit,
"city": city, "city": incity,
"state": state, "state": instate,
"postal_code": zipcode "postal_code": inzip
} }
) )
except Exception as e: except Exception as e:
if cfg.verbose:
print(" Error standardizing street with usaddress-scourgify: ", e)
try: try:
# Proprietary Mono library # Proprietary Mono library
if standardization == False: if standardization == False:
@ -266,10 +283,15 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
streetonly = addr['address_line_1'] streetonly = addr['address_line_1']
if streetonly.startswith(str(number) + " "): if streetonly.startswith(str(number) + " "):
streetonly = streetonly[len(str(number) + " "):] streetonly = streetonly[len(str(number) + " "):]
if cfg.verbose:
print(" Standardized street to: ", streetonly)
# #
# Run extra regexes on street to fix standardization problems # Run extra regexes on street to fix standardization problems
# #
streetonly = postStandardizeStreet(streetonly) streetonly = postStandardizeStreet(streetonly)
if cfg.verbose:
print(" Postprocessed street: ", streetonly)
# #
# Special conditional rules # Special conditional rules
@ -287,31 +309,38 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
# #
# Standardize and validate and/or append ZIP Code # Standardize and validate and/or append ZIP Code
# #
if addr["postal_code"] != "00000":
zipcode = addr["postal_code"] zipcode = addr["postal_code"]
unitprefix = "" unitprefix = ""
unit = addr['address_line_2'] unit = addr['address_line_2']
if zipcode is not None: if zipcode is not None and addr["postal_code"] != "00000":
zipcode = addr["postal_code"][0:5] zipcode = addr["postal_code"][0:5]
if addr["state"] != "XX":
state = addr["state"]
if addr["city"] != "XXXXXXXXXX":
city = addr["city"]
# Skip these if we already have a ZIP+4 code, assume it's accurate # Skip these if we already have a ZIP+4 code, assume it's accurate
if zipcode is not None and len(zipcode) == 5 and not plus4: if zipcode is not None and len(zipcode) == 5 and not plus4:
zipinfo = getCityStateForZIP(zipcode) zipinfo = getCityStateForZIP(zipcode)
if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""): if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county) zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county)
zipinfo = getCityStateForZIP(zipcode) zipinfo = getCityStateForZIP(zipcode)
if zipinfo != False: if zipinfo != False:
addr["city"] = zipinfo["city"] city = zipinfo["city"]
addr["state"] = zipinfo["state"] state = zipinfo["state"]
else: else:
addr["city"] = zipinfo["city"] city = zipinfo["city"]
addr["state"] = zipinfo["state"] state = zipinfo["state"]
elif not plus4: elif not plus4:
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county) zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, state, False, county)
zipinfo = getCityStateForZIP(zipcode) zipinfo = getCityStateForZIP(zipcode)
if zipinfo != False: if zipinfo != False:
addr["city"] = zipinfo["city"] city = zipinfo["city"]
addr["state"] = zipinfo["state"] state = zipinfo["state"]
if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit): if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
streetonly = f"US HIGHWAY {unit}" streetonly = f"US HIGHWAY {unit}"
@ -324,8 +353,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
"number": number, "number": number,
"street": streetonly, "street": streetonly,
"unit": ' '.join(filter(None, (unitprefix, unit))), "unit": ' '.join(filter(None, (unitprefix, unit))),
"city": addr["city"], "city": city,
"state": addr["state"], "state": state,
"zip": zipcode, "zip": zipcode,
"plus4": plus4, "plus4": plus4,
"latitude": lat, "latitude": lat,

View File

@ -5,8 +5,10 @@ from postal.parser import parse_address
from postal.expand import expand_address from postal.expand import expand_address
from src.addressfunctions import normalizeAddress from src.addressfunctions import normalizeAddress
import re import re
import src.config
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""): def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = "", county=False):
cfg = src.config.get_config()
if len(plus4 or "") == 4: if len(plus4 or "") == 4:
# Return as-is, it's got a +4 match already # Return as-is, it's got a +4 match already
return { return {
@ -60,11 +62,14 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
"longitude": lon "longitude": lon
}) })
# Also add one where we remove any non-numeric data from the number and unit fields # Also add one where we remove any non-numeric data from the number and unit fields
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4)) normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county))
if number != pNumber or unit != pUnit or street != pStreet: if number != pNumber or unit != pUnit or street != pStreet:
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4)) normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county))
for exp in expanded: for exp in expanded:
if cfg.verbose:
print(" libpostal expanded match:")
print(" ", exp)
parsed = parse_address(exp) parsed = parse_address(exp)
pN = "" pN = ""
pS = "" pS = ""
@ -77,11 +82,21 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
elif part[1] == "unit": elif part[1] == "unit":
pU = part[0] pU = part[0]
try: try:
normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4)) if cfg.verbose:
print(" >", pN, pS, pU)
norm = normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4, county)
if cfg.verbose:
print(" >>", norm)
normalizedMatches.append(norm)
except Exception as e: except Exception as e:
if cfg.verbose:
print(" libpostal failure during normalizing expanded matches.")
print(" ", e)
pass pass
if cfg.verbose:
print(" libpostal address results:")
print(" ", normalizedMatches)
if len(normalizedMatches) > 1: if len(normalizedMatches) > 1:
weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0} weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
sortedMatches = sorted( sortedMatches = sorted(
@ -89,11 +104,19 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
key=lambda item: sum(weights[k] for k, v in item.items() if v), key=lambda item: sum(weights[k] for k, v in item.items() if v),
reverse=True reverse=True
) )
if cfg.verbose:
print(" libpostal selected best match:")
print(" ", sortedMatches[0])
return sortedMatches[0] return sortedMatches[0]
elif len(normalizedMatches) == 1: elif len(normalizedMatches) == 1:
if cfg.verbose:
print(" libpostal selected only match:")
print(" ", normalizedMatches[0])
return normalizedMatches[0] return normalizedMatches[0]
# No matches, give up on the whole thing # No matches, give up on the whole thing
if cfg.verbose:
print(" libpostal no address match found")
return { return {
"number": number, "number": number,
"street": street, "street": street,

View File

@ -10,6 +10,7 @@ class AppConfig:
useCensusToFillEmptyZIPs: bool useCensusToFillEmptyZIPs: bool
advancedMode: bool advancedMode: bool
noSkip4: bool noSkip4: bool
verbose: bool
_CFG: Optional[AppConfig] = None _CFG: Optional[AppConfig] = None