Fix libpostal and normalization errors, add verbose mode for debugging

This commit is contained in:
Skylar Ittner 2025-11-20 20:48:07 -07:00
parent 5a901f37a2
commit 915bd43907
4 changed files with 103 additions and 30 deletions

34
main.py
View File

@ -58,27 +58,41 @@ def normalize(number, street, street2, city, state, zipcode, latitude, longitude
if len(city) > 4 and street1.endswith(" " + city):
# City name leaked into street field (Albany County Wyoming, for one)
street1 = street1.removesuffix(" " + city)
if cfg.verbose:
print("Starting to normalize address:")
print(" ", number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), plus4, county)
addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
if cfg.verbose:
print(" Address didn't match to a full ZIP+4 code. Trying more things.")
print(" ", re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'])
# Try removing letters from address numbers, and ignore city field
addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
# If that didn't work, try instead stripping the city name because it might be wrong
if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
# Use libpostal to analyze the address deeper, but only when advanced mode is
# enabled AND the stripped-down attempt above still lacks a full ZIP+4.
# The parentheses are required: `and` binds tighter than `or`, so without them
# this branch also ran with advanced mode disabled whenever the +4 was missing,
# and the addrstrip result was thrown away.
# NOTE(review): indentation is not visible in this view; the nesting below is
# reconstructed from the statement order -- confirm against the real file.
if cfg.advancedMode and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
    try:
        if cfg.verbose:
            print(" Using libpostal to break down and analyze address.")
            print(" ",addrstrip)
        # Imported lazily so libpostal is only needed when this path is used.
        from src.advancedparsing import advancedNormalize
        addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
    except Exception:
        # Best effort: outside verbose mode a libpostal failure is ignored and
        # addr keeps its previous value. In verbose (development) mode the
        # error is re-raised; a bare `raise` keeps the original traceback.
        # NOTE(review): assumes the re-raise is verbose-only, as the trailing
        # `pass` in the original suggests -- confirm.
        if cfg.verbose:
            print(" libpostal crashed.")
            raise
    # Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
    if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
        if cfg.verbose:
            print(" Doing a final normalization attempt after libpostal.")
            print(" ", addr)
        addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
else:
    # Advanced mode is off, or the stripped attempt already produced a full
    # ZIP+4: keep that result.
    addr = addrstrip
if cfg.verbose:
print(" Final result after normalization:")
print(" ", addr)
return addr
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
@ -661,6 +675,7 @@ if __name__ == "__main__":
parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")
parser.add_argument("-v", help="Verbose output (for development)", action="store_true")
args = parser.parse_args()
@ -673,6 +688,11 @@ if __name__ == "__main__":
citySuggestion = False
advancedMode = False
noSkip4 = False
# Verbose (development) output stays off unless -v was passed.
verbose = bool(args.v)
if verbose:
    print("Verbose mode engaged!")
if args.libpostal:
advancedMode = True
@ -727,7 +747,7 @@ if __name__ == "__main__":
# Store the optional --city value trimmed and upper-cased.
# Python strings have no toUpper() method; the previous call raised
# AttributeError whenever --city was supplied. str.upper() is the correct API.
if args.city:
    citySuggestion = args.city.strip().upper()
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4)
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4, verbose=verbose)
src.config.set_config(cfg)

View File

@ -116,7 +116,8 @@ POST_STANDARDIZATION_STREET_REGEXES = {
" LP$": " LOOP",
"^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason
"^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason
"^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2" # Athens TX does this too for some reason
"^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2", # Athens TX does this too for some reason
"^GLN HOLLOW ": "GLEN HOLLOW "
}
STANDARDIZATION_NUMBER_REGEXES = {
@ -211,6 +212,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
lon, lat = lat, lon
number = standardizeNumber(str(number).upper().strip())
street = preStandardizeStreet(street.strip().upper())
if cfg.verbose:
print(" Preprocessed street:", street)
unit = unit.strip().upper()
city = city.strip().upper()
state = state.strip().upper()
@ -230,18 +233,32 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
#
# Standardize address
#
if cfg.verbose:
print(" Standardizing street: ", street)
try:
# Python library
# Won't work at all without a zip code
# Substitute placeholder values for any empty field before calling the
# standardizer (it won't work at all without a zip code). The same sentinel
# strings are checked again after standardization so they are not copied
# back into the result.
incity = city or "XXXXXXXXXX"
instate = state or "XX"
inzip = zipcode or "00000"
addr = normalize_address_record(
{
"address_line_1": "".join(["999999999", " ", street]),
"address_line_2": unit,
"city": city,
"state": state,
"postal_code": zipcode
"city": incity,
"state": instate,
"postal_code": inzip
}
)
except Exception as e:
if cfg.verbose:
print(" Error standardizing street with usaddress-scourgify: ", e)
try:
# Proprietary Mono library
if standardization == False:
@ -266,10 +283,15 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
streetonly = addr['address_line_1']
if streetonly.startswith(str(number) + " "):
streetonly = streetonly[len(str(number) + " "):]
if cfg.verbose:
print(" Standardized street to: ", streetonly)
#
# Run extra regexes on street to fix standardization problems
#
streetonly = postStandardizeStreet(streetonly)
if cfg.verbose:
print(" Postprocessed street: ", streetonly)
#
# Special conditional rules
@ -287,31 +309,38 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
#
# Standardize and validate and/or append ZIP Code
#
if addr["postal_code"] != "00000":
zipcode = addr["postal_code"]
unitprefix = ""
unit = addr['address_line_2']
if zipcode is not None:
if zipcode is not None and addr["postal_code"] != "00000":
zipcode = addr["postal_code"][0:5]
if addr["state"] != "XX":
state = addr["state"]
if addr["city"] != "XXXXXXXXXX":
city = addr["city"]
# Skip these if we already have a ZIP+4 code, assume it's accurate
if zipcode is not None and len(zipcode) == 5 and not plus4:
zipinfo = getCityStateForZIP(zipcode)
if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county)
if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county)
zipinfo = getCityStateForZIP(zipcode)
if zipinfo != False:
addr["city"] = zipinfo["city"]
addr["state"] = zipinfo["state"]
city = zipinfo["city"]
state = zipinfo["state"]
else:
addr["city"] = zipinfo["city"]
addr["state"] = zipinfo["state"]
city = zipinfo["city"]
state = zipinfo["state"]
elif not plus4:
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county)
# No 5-digit ZIP to start from (False is passed as the ZIP argument), so look
# the address up by street, state and city. The seventh argument is the city:
# the call this line replaces and the other getZIP4 calls in this function all
# pass the city there; passing the state a second time was a typo.
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, False, county)
zipinfo = getCityStateForZIP(zipcode)
if zipinfo != False:
addr["city"] = zipinfo["city"]
addr["state"] = zipinfo["state"]
city = zipinfo["city"]
state = zipinfo["state"]
if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
streetonly = f"US HIGHWAY {unit}"
@ -324,8 +353,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
"number": number,
"street": streetonly,
"unit": ' '.join(filter(None, (unitprefix, unit))),
"city": addr["city"],
"state": addr["state"],
"city": city,
"state": state,
"zip": zipcode,
"plus4": plus4,
"latitude": lat,

View File

@ -5,8 +5,10 @@ from postal.parser import parse_address
from postal.expand import expand_address
from src.addressfunctions import normalizeAddress
import re
import src.config
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""):
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = "", county=False):
cfg = src.config.get_config()
if len(plus4 or "") == 4:
# Return as-is, it's got a +4 match already
return {
@ -60,11 +62,14 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
"longitude": lon
})
# Also add one where we remove any non-numeric data from the number and unit fields
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county))
if number != pNumber or unit != pUnit or street != pStreet:
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county))
for exp in expanded:
if cfg.verbose:
print(" libpostal expanded match:")
print(" ", exp)
parsed = parse_address(exp)
pN = ""
pS = ""
@ -77,11 +82,21 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
elif part[1] == "unit":
pU = part[0]
try:
normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4))
if cfg.verbose:
print(" >", pN, pS, pU)
norm = normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4, county)
if cfg.verbose:
print(" >>", norm)
normalizedMatches.append(norm)
except Exception as e:
if cfg.verbose:
print(" libpostal failure during normalizing expanded matches.")
print(" ", e)
pass
if cfg.verbose:
print(" libpostal address results:")
print(" ", normalizedMatches)
if len(normalizedMatches) > 1:
weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
sortedMatches = sorted(
@ -89,11 +104,19 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
key=lambda item: sum(weights[k] for k, v in item.items() if v),
reverse=True
)
if cfg.verbose:
print(" libpostal selected best match:")
print(" ", sortedMatches[0])
return sortedMatches[0]
elif len(normalizedMatches) == 1:
if cfg.verbose:
print(" libpostal selected only match:")
print(" ", normalizedMatches[0])
return normalizedMatches[0]
# No matches, give up on the whole thing
if cfg.verbose:
print(" libpostal no address match found")
return {
"number": number,
"street": street,

View File

@ -10,6 +10,7 @@ class AppConfig:
useCensusToFillEmptyZIPs: bool
advancedMode: bool
noSkip4: bool
verbose: bool
_CFG: Optional[AppConfig] = None