Fix libpostal and normalization errors, add verbose mode for debugging
This commit is contained in:
parent
5a901f37a2
commit
915bd43907
34
main.py
34
main.py
@ -58,27 +58,41 @@ def normalize(number, street, street2, city, state, zipcode, latitude, longitude
|
|||||||
if len(city) > 4 and street1.endswith(" " + city):
|
if len(city) > 4 and street1.endswith(" " + city):
|
||||||
# City name leaked into street field (Albany County Wyoming, for one)
|
# City name leaked into street field (Albany County Wyoming, for one)
|
||||||
street1 = street1.removesuffix(" " + city)
|
street1 = street1.removesuffix(" " + city)
|
||||||
|
if cfg.verbose:
|
||||||
|
print("Starting to normalize address:")
|
||||||
|
print(" ", number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), plus4, county)
|
||||||
addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
|
addr = normalizeAddress(number, street1, street2, city, state, zipcode, round(float(latitude),7), round(float(longitude), 7), zipprefix, plus4, county)
|
||||||
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
|
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Address didn't match to a full ZIP+4 code. Trying more things.")
|
||||||
|
print(" ", re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'])
|
||||||
# Try removing letters from address numbers, and ignore city field
|
# Try removing letters from address numbers, and ignore city field
|
||||||
addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
addrstrip = normalizeAddress(re.sub("[^0-9]", "", addr['number']), addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
||||||
|
|
||||||
# If that didn't work, try instead stripping the city name because it might be wrong
|
|
||||||
if addr['city'] != "" and (len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4):
|
|
||||||
addrstrip = normalizeAddress(addr['number'], addr['street'], addr['unit'], "", addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
|
||||||
|
|
||||||
# Use libpostal to analyze address deeper
|
# Use libpostal to analyze address deeper
|
||||||
if cfg.advancedMode and len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4:
|
if cfg.advancedMode and len(addrstrip['zip'] or "") < 5 or len(addrstrip['plus4'] or "") != 4:
|
||||||
try:
|
try:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Using libpostal to break down and analyze address.")
|
||||||
|
print(" ",addrstrip)
|
||||||
|
from src.advancedparsing import advancedNormalize
|
||||||
addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
addr = advancedNormalize(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal crashed.")
|
||||||
|
raise e
|
||||||
pass
|
pass
|
||||||
# Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
|
# Do another normalize pass for good luck (maybe the previous one got the ZIP and now we can get the +4)
|
||||||
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
|
if len(addr['zip'] or "") < 5 or len(addr['plus4'] or "") != 4:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Doing a final normalization attempt after libpostal.")
|
||||||
|
print(" ", addr)
|
||||||
addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
addr = normalizeAddress(addr['number'], addr['street'], addr['unit'], addr['city'], addr['state'], addr['zip'], addr['latitude'], addr['longitude'], False, addr['plus4'], county)
|
||||||
else:
|
else:
|
||||||
addr = addrstrip
|
addr = addrstrip
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Final result after normalization:")
|
||||||
|
print(" ", addr)
|
||||||
return addr
|
return addr
|
||||||
|
|
||||||
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
|
def processOwnChunk(chunk, chunkcount, outfilename, ignorestates, keeponlystates):
|
||||||
@ -661,6 +675,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
|
parser.add_argument("--census", help="Enable looking up missing ZIP codes in the U.S. Census Geocoder when we have a full address, city, and state but no ZIP.", action='store_true')
|
||||||
parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
|
parser.add_argument("--libpostal", help="Use libpostal address parsing and expansions to match bad addresses to a ZIP+4. Automatically enables --appendplus4.", action='store_true')
|
||||||
parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")
|
parser.add_argument("--noskip4", help="When processing own file format, don't skip normalizing records that have a ZIP+4 already.", action="store_true")
|
||||||
|
parser.add_argument("-v", help="Verbose output (for development)", action="store_true")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@ -673,6 +688,11 @@ if __name__ == "__main__":
|
|||||||
citySuggestion = False
|
citySuggestion = False
|
||||||
advancedMode = False
|
advancedMode = False
|
||||||
noSkip4 = False
|
noSkip4 = False
|
||||||
|
verbose = False
|
||||||
|
|
||||||
|
if args.v:
|
||||||
|
verbose = True
|
||||||
|
print("Verbose mode engaged!")
|
||||||
|
|
||||||
if args.libpostal:
|
if args.libpostal:
|
||||||
advancedMode = True
|
advancedMode = True
|
||||||
@ -727,7 +747,7 @@ if __name__ == "__main__":
|
|||||||
if args.city:
|
if args.city:
|
||||||
citySuggestion = args.city.strip().toUpper()
|
citySuggestion = args.city.strip().toUpper()
|
||||||
|
|
||||||
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4)
|
cfg = src.config.AppConfig(appendPlus4=appendPlus4, appendUnitLabel=appendUnitLabel, countryCode=countryCode, citySuggestion=citySuggestion, useCensusToFillEmptyZIPs=useCensusToFillEmptyZIPs, advancedMode=advancedMode, noSkip4=noSkip4, verbose=verbose)
|
||||||
|
|
||||||
src.config.set_config(cfg)
|
src.config.set_config(cfg)
|
||||||
|
|
||||||
|
|||||||
@ -116,7 +116,8 @@ POST_STANDARDIZATION_STREET_REGEXES = {
|
|||||||
" LP$": " LOOP",
|
" LP$": " LOOP",
|
||||||
"^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason
|
"^CRK ([0-9]+)$": r"COUNTY ROAD \1", # Athens TX does this for some reason
|
||||||
"^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason
|
"^PR ([0-9]+)$": r"PRIVATE ROAD \1", # Athens TX does this for some reason
|
||||||
"^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2" # Athens TX does this too for some reason
|
"^SH ([0-9]+) ([NSEW])$": r"STATE HIGHWAY \1 \2", # Athens TX does this too for some reason
|
||||||
|
"^GLN HOLLOW ": "GLEN HOLLOW "
|
||||||
}
|
}
|
||||||
|
|
||||||
STANDARDIZATION_NUMBER_REGEXES = {
|
STANDARDIZATION_NUMBER_REGEXES = {
|
||||||
@ -211,6 +212,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
lon, lat = lat, lon
|
lon, lat = lat, lon
|
||||||
number = standardizeNumber(str(number).upper().strip())
|
number = standardizeNumber(str(number).upper().strip())
|
||||||
street = preStandardizeStreet(street.strip().upper())
|
street = preStandardizeStreet(street.strip().upper())
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Preprocessed street:", street)
|
||||||
unit = unit.strip().upper()
|
unit = unit.strip().upper()
|
||||||
city = city.strip().upper()
|
city = city.strip().upper()
|
||||||
state = state.strip().upper()
|
state = state.strip().upper()
|
||||||
@ -230,18 +233,32 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
#
|
#
|
||||||
# Standardize address
|
# Standardize address
|
||||||
#
|
#
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Standardizing street: ", street)
|
||||||
try:
|
try:
|
||||||
# Python library
|
# Python library
|
||||||
|
# Won't work at all without a zip code
|
||||||
|
incity = city
|
||||||
|
if not city:
|
||||||
|
incity = "XXXXXXXXXX"
|
||||||
|
instate = state
|
||||||
|
if not state:
|
||||||
|
instate = "XX"
|
||||||
|
inzip = zipcode
|
||||||
|
if not zipcode:
|
||||||
|
inzip = "00000"
|
||||||
addr = normalize_address_record(
|
addr = normalize_address_record(
|
||||||
{
|
{
|
||||||
"address_line_1": "".join(["999999999", " ", street]),
|
"address_line_1": "".join(["999999999", " ", street]),
|
||||||
"address_line_2": unit,
|
"address_line_2": unit,
|
||||||
"city": city,
|
"city": incity,
|
||||||
"state": state,
|
"state": instate,
|
||||||
"postal_code": zipcode
|
"postal_code": inzip
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Error standardizing street with usaddress-scourgify: ", e)
|
||||||
try:
|
try:
|
||||||
# Proprietary Mono library
|
# Proprietary Mono library
|
||||||
if standardization == False:
|
if standardization == False:
|
||||||
@ -266,10 +283,15 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
streetonly = addr['address_line_1']
|
streetonly = addr['address_line_1']
|
||||||
if streetonly.startswith(str(number) + " "):
|
if streetonly.startswith(str(number) + " "):
|
||||||
streetonly = streetonly[len(str(number) + " "):]
|
streetonly = streetonly[len(str(number) + " "):]
|
||||||
|
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Standardized street to: ", streetonly)
|
||||||
#
|
#
|
||||||
# Run extra regexes on street to fix standardization problems
|
# Run extra regexes on street to fix standardization problems
|
||||||
#
|
#
|
||||||
streetonly = postStandardizeStreet(streetonly)
|
streetonly = postStandardizeStreet(streetonly)
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" Postprocessed street: ", streetonly)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Special conditional rules
|
# Special conditional rules
|
||||||
@ -287,31 +309,38 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
#
|
#
|
||||||
# Standardize and validate and/or append ZIP Code
|
# Standardize and validate and/or append ZIP Code
|
||||||
#
|
#
|
||||||
|
if addr["postal_code"] != "00000":
|
||||||
zipcode = addr["postal_code"]
|
zipcode = addr["postal_code"]
|
||||||
unitprefix = ""
|
unitprefix = ""
|
||||||
unit = addr['address_line_2']
|
unit = addr['address_line_2']
|
||||||
|
|
||||||
if zipcode is not None:
|
if zipcode is not None and addr["postal_code"] != "00000":
|
||||||
zipcode = addr["postal_code"][0:5]
|
zipcode = addr["postal_code"][0:5]
|
||||||
|
|
||||||
|
if addr["state"] != "XX":
|
||||||
|
state = addr["state"]
|
||||||
|
|
||||||
|
if addr["city"] != "XXXXXXXXXX":
|
||||||
|
city = addr["city"]
|
||||||
|
|
||||||
# Skip these if we already have a ZIP+4 code, assume it's accurate
|
# Skip these if we already have a ZIP+4 code, assume it's accurate
|
||||||
if zipcode is not None and len(zipcode) == 5 and not plus4:
|
if zipcode is not None and len(zipcode) == 5 and not plus4:
|
||||||
zipinfo = getCityStateForZIP(zipcode)
|
zipinfo = getCityStateForZIP(zipcode)
|
||||||
if cfg.appendPlus4 or zipinfo == False or addr["state"] != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
|
if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
|
||||||
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], zipcode, county)
|
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county)
|
||||||
zipinfo = getCityStateForZIP(zipcode)
|
zipinfo = getCityStateForZIP(zipcode)
|
||||||
if zipinfo != False:
|
if zipinfo != False:
|
||||||
addr["city"] = zipinfo["city"]
|
city = zipinfo["city"]
|
||||||
addr["state"] = zipinfo["state"]
|
state = zipinfo["state"]
|
||||||
else:
|
else:
|
||||||
addr["city"] = zipinfo["city"]
|
city = zipinfo["city"]
|
||||||
addr["state"] = zipinfo["state"]
|
state = zipinfo["state"]
|
||||||
elif not plus4:
|
elif not plus4:
|
||||||
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], addr["state"], lat, lon, addr["city"], False, county)
|
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, state, False, county)
|
||||||
zipinfo = getCityStateForZIP(zipcode)
|
zipinfo = getCityStateForZIP(zipcode)
|
||||||
if zipinfo != False:
|
if zipinfo != False:
|
||||||
addr["city"] = zipinfo["city"]
|
city = zipinfo["city"]
|
||||||
addr["state"] = zipinfo["state"]
|
state = zipinfo["state"]
|
||||||
|
|
||||||
if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
|
if not plus4 and streetonly == "UNITED STATES HWY" and re.match(r"^\d+$", unit):
|
||||||
streetonly = f"US HIGHWAY {unit}"
|
streetonly = f"US HIGHWAY {unit}"
|
||||||
@ -324,8 +353,8 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
"number": number,
|
"number": number,
|
||||||
"street": streetonly,
|
"street": streetonly,
|
||||||
"unit": ' '.join(filter(None, (unitprefix, unit))),
|
"unit": ' '.join(filter(None, (unitprefix, unit))),
|
||||||
"city": addr["city"],
|
"city": city,
|
||||||
"state": addr["state"],
|
"state": state,
|
||||||
"zip": zipcode,
|
"zip": zipcode,
|
||||||
"plus4": plus4,
|
"plus4": plus4,
|
||||||
"latitude": lat,
|
"latitude": lat,
|
||||||
|
|||||||
@ -5,8 +5,10 @@ from postal.parser import parse_address
|
|||||||
from postal.expand import expand_address
|
from postal.expand import expand_address
|
||||||
from src.addressfunctions import normalizeAddress
|
from src.addressfunctions import normalizeAddress
|
||||||
import re
|
import re
|
||||||
|
import src.config
|
||||||
|
|
||||||
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = ""):
|
def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4 = "", county=False):
|
||||||
|
cfg = src.config.get_config()
|
||||||
if len(plus4 or "") == 4:
|
if len(plus4 or "") == 4:
|
||||||
# Return as-is, it's got a +4 match already
|
# Return as-is, it's got a +4 match already
|
||||||
return {
|
return {
|
||||||
@ -60,11 +62,14 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
|
|||||||
"longitude": lon
|
"longitude": lon
|
||||||
})
|
})
|
||||||
# Also add one where we remove any non-numeric data from the number and unit fields
|
# Also add one where we remove any non-numeric data from the number and unit fields
|
||||||
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
|
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", number), street, re.sub("[^0-9]", "", unit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county))
|
||||||
if number != pNumber or unit != pUnit or street != pStreet:
|
if number != pNumber or unit != pUnit or street != pStreet:
|
||||||
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4))
|
normalizedMatches.append(normalizeAddress(re.sub("[^0-9]", "", pNumber), pStreet, re.sub("[^0-9]", "", pUnit), pCity, pState, pZip, lat, lon, zipprefix, plus4, county))
|
||||||
|
|
||||||
for exp in expanded:
|
for exp in expanded:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal expanded match:")
|
||||||
|
print(" ", exp)
|
||||||
parsed = parse_address(exp)
|
parsed = parse_address(exp)
|
||||||
pN = ""
|
pN = ""
|
||||||
pS = ""
|
pS = ""
|
||||||
@ -77,11 +82,21 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
|
|||||||
elif part[1] == "unit":
|
elif part[1] == "unit":
|
||||||
pU = part[0]
|
pU = part[0]
|
||||||
try:
|
try:
|
||||||
normalizedMatches.append(normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4))
|
if cfg.verbose:
|
||||||
|
print(" >", pN, pS, pU)
|
||||||
|
norm = normalizeAddress(pN, pS, pU, pCity, pState, pZip, lat, lon, zipprefix, plus4, county)
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" >>", norm)
|
||||||
|
normalizedMatches.append(norm)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal failure during normalizing expanded matches.")
|
||||||
|
print(" ", e)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal address results:")
|
||||||
|
print(" ", normalizedMatches)
|
||||||
if len(normalizedMatches) > 1:
|
if len(normalizedMatches) > 1:
|
||||||
weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
|
weights = {"number": 5, "street": 5, "unit": 1, "city": 1, "state": 1, "zip": 3, "plus4": 8, "latitude": 0, "longitude": 0}
|
||||||
sortedMatches = sorted(
|
sortedMatches = sorted(
|
||||||
@ -89,11 +104,19 @@ def advancedNormalize(number, street, unit, city, state, zipcode, lat, lon, zipp
|
|||||||
key=lambda item: sum(weights[k] for k, v in item.items() if v),
|
key=lambda item: sum(weights[k] for k, v in item.items() if v),
|
||||||
reverse=True
|
reverse=True
|
||||||
)
|
)
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal selected best match:")
|
||||||
|
print(" ", sortedMatches[0])
|
||||||
return sortedMatches[0]
|
return sortedMatches[0]
|
||||||
elif len(normalizedMatches) == 1:
|
elif len(normalizedMatches) == 1:
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal selected only match:")
|
||||||
|
print(" ", normalizedMatches[0])
|
||||||
return normalizedMatches[0]
|
return normalizedMatches[0]
|
||||||
|
|
||||||
# No matches, give up on the whole thing
|
# No matches, give up on the whole thing
|
||||||
|
if cfg.verbose:
|
||||||
|
print(" libpostal no address match found")
|
||||||
return {
|
return {
|
||||||
"number": number,
|
"number": number,
|
||||||
"street": street,
|
"street": street,
|
||||||
|
|||||||
@ -10,6 +10,7 @@ class AppConfig:
|
|||||||
useCensusToFillEmptyZIPs: bool
|
useCensusToFillEmptyZIPs: bool
|
||||||
advancedMode: bool
|
advancedMode: bool
|
||||||
noSkip4: bool
|
noSkip4: bool
|
||||||
|
verbose: bool
|
||||||
|
|
||||||
_CFG: Optional[AppConfig] = None
|
_CFG: Optional[AppConfig] = None
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user