Improve unit handling
This commit is contained in:
parent
2927059834
commit
8cd7e5c6f6
@ -7,7 +7,7 @@ import pandas as pd
|
||||
from pythonnet import load
|
||||
from scourgify import NormalizeAddress, normalize_address_record
|
||||
from src.zipfunctions import checkZIPCode, getZIP, getZIP4, getCityStateForZIP
|
||||
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, ValidationException
|
||||
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, LONG_UNITS, ValidationException
|
||||
import src.config
|
||||
import json
|
||||
import sys
|
||||
@ -127,6 +127,11 @@ STANDARDIZATION_NUMBER_REGEXES = {
|
||||
"^\.$": "", # Blank out .
|
||||
}
|
||||
|
||||
# Regex -> replacement pairs that are applied, in order, to a unit string.
# The patterns must be raw strings: in an ordinary string literal "\1" is the
# octal escape for chr(1) rather than a regex backreference, which would stop
# the repeated-unit pattern from ever matching.
STANDARDIZATION_UNIT_REGEXES = {
    r"^\s*(.+?)\s*[, ]\s*\1\s*$": r"\1",  # Match repeated unit number with a space and/or comma between, example "12, 12"
    r"^,$": "",  # Blank out a lone comma
}
|
||||
|
||||
ABBREV_PATTERN = ""
|
||||
ABBREV_PATTERN_LIST = []
|
||||
for (a, b) in LONGHAND_STREET_TYPES.items():
|
||||
@ -194,11 +199,23 @@ def removeUnitText(subaddr):
|
||||
if subaddr == None:
|
||||
return subaddr
|
||||
subaddr = subaddr.upper()
|
||||
for (lng, short) in LONG_UNITS.items():
|
||||
subaddr = subaddr.replace(lng, short)
|
||||
for label in UNITS:
|
||||
subaddr = subaddr.replace(label, "")
|
||||
subaddr = subaddr.replace(" ", " ")
|
||||
subaddr = subaddr.replace(" ", " ").replace(" ", " ")
|
||||
return subaddr.strip()
|
||||
|
||||
def cleanUnit(unit):
    """Normalize a raw unit / sub-address value.

    The value is first passed through removeUnitText(). A value that is
    repeated with a space and/or comma in between (e.g. "12, 12") is then
    collapsed to a single copy, and finally every substitution in
    STANDARDIZATION_UNIT_REGEXES is applied in turn.

    Args:
        unit: Raw unit text, or None.

    Returns:
        The cleaned unit string, or None when removeUnitText() yields None.
    """
    unit = removeUnitText(unit)
    if unit is None:
        # removeUnitText() hands None straight back; the regex calls below
        # would raise TypeError on it, so stop here.
        return None

    # Collapse "<value><sep><value>" down to "<value>".
    repeated = re.match(r'^\s*(.+?)\s*[, ]\s*\1\s*$', unit)
    if repeated:
        unit = repeated.group(1)

    for find, replace in STANDARDIZATION_UNIT_REGEXES.items():
        unit = re.sub(find, replace, unit)
    return unit
|
||||
|
||||
def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4="", county=False):
|
||||
cfg = src.config.get_config()
|
||||
if not number:
|
||||
@ -214,7 +231,9 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
||||
street = preStandardizeStreet(street.strip().upper())
|
||||
if cfg.verbose:
|
||||
print(" Preprocessed street:", street)
|
||||
unit = unit.strip().upper()
|
||||
unit = cleanUnit(unit.strip().upper())
|
||||
if cfg.verbose and unit:
|
||||
print(" Preprocessed unit:", unit)
|
||||
city = city.strip().upper()
|
||||
state = state.strip().upper()
|
||||
zipcode = (zipcode or "").strip()
|
||||
@ -324,7 +343,7 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
||||
city = addr["city"]
|
||||
|
||||
# Skip these if we already have a ZIP+4 code, assume it's accurate
|
||||
if zipcode is not None and len(zipcode) == 5 and not plus4:
|
||||
if zipcode is not None and len(zipcode) == 5 and (not plus4 or cfg.noSkip4):
|
||||
zipinfo = getCityStateForZIP(zipcode)
|
||||
if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
|
||||
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county)
|
||||
|
||||
@ -221,9 +221,9 @@ UNITS = [
|
||||
'OFC',
|
||||
'PH',
|
||||
'PIER',
|
||||
'REAR',
|
||||
# 'REAR',
|
||||
'RM',
|
||||
'SIDE',
|
||||
# 'SIDE',
|
||||
'SLIP',
|
||||
'SPC',
|
||||
'STOP',
|
||||
@ -236,6 +236,25 @@ UNITS = [
|
||||
'(VACANT)' # One dataset does this...
|
||||
]
|
||||
|
||||
# Spelled-out unit designators mapped to their abbreviated forms.
# Insertion order is preserved when the mapping is iterated.
LONG_UNITS = {
    'APARTMENT': 'APT',
    'BUILDING': 'BLDG',
    'BASEMENT': 'BSMT',
    'DEPARTMENT': 'DEPT',
    'FLOOR': 'FL',
    'FRONT': 'FRNT',
    'HANGER': 'HNGR',
    'LOBBY': 'LBBY',
    'LOWER': 'LOWR',
    'OFFICE': 'OFC',
    'PENTHOUSE': 'PH',
    'PENT HOUSE': 'PH',
    'ROOM': 'RM',
    'SPACE': 'SPC',
    'SUITE': 'STE',
    'TRAILER': 'TRLR',
}
|
||||
|
||||
STATES = {
|
||||
"ALABAMA": "AL",
|
||||
"ALASKA": "AK",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user