Improve unit handling
This commit is contained in:
parent
2927059834
commit
8cd7e5c6f6
@ -7,7 +7,7 @@ import pandas as pd
|
|||||||
from pythonnet import load
|
from pythonnet import load
|
||||||
from scourgify import NormalizeAddress, normalize_address_record
|
from scourgify import NormalizeAddress, normalize_address_record
|
||||||
from src.zipfunctions import checkZIPCode, getZIP, getZIP4, getCityStateForZIP
|
from src.zipfunctions import checkZIPCode, getZIP, getZIP4, getCityStateForZIP
|
||||||
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, ValidationException
|
from src.constants import STATES, LONGHAND_STREET_TYPES, UNITS, LONG_UNITS, ValidationException
|
||||||
import src.config
|
import src.config
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
@ -127,6 +127,11 @@ STANDARDIZATION_NUMBER_REGEXES = {
|
|||||||
"^\.$": "", # Blank out .
|
"^\.$": "", # Blank out .
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Regex -> replacement pairs applied to a unit value, in insertion order, via re.sub.
# The patterns MUST be raw strings: in a normal string literal "\1" is the octal
# escape for chr(1), not a regex backreference, so the rule would never match.
STANDARDIZATION_UNIT_REGEXES = {
    r"^\s*(.+?)\s*[, ]\s*\1\s*$": r"\1",  # Match repeated unit number with a space and/or comma between, example "12, 12"
    r"^,$": "",  # Blank out a lone comma
}
|
||||||
|
|
||||||
ABBREV_PATTERN = ""
|
ABBREV_PATTERN = ""
|
||||||
ABBREV_PATTERN_LIST = []
|
ABBREV_PATTERN_LIST = []
|
||||||
for (a, b) in LONGHAND_STREET_TYPES.items():
|
for (a, b) in LONGHAND_STREET_TYPES.items():
|
||||||
@ -194,11 +199,23 @@ def removeUnitText(subaddr):
|
|||||||
if subaddr == None:
|
if subaddr == None:
|
||||||
return subaddr
|
return subaddr
|
||||||
subaddr = subaddr.upper()
|
subaddr = subaddr.upper()
|
||||||
|
for (lng, short) in LONG_UNITS.items():
|
||||||
|
subaddr = subaddr.replace(lng, short)
|
||||||
for label in UNITS:
|
for label in UNITS:
|
||||||
subaddr = subaddr.replace(label, "")
|
subaddr = subaddr.replace(label, "")
|
||||||
subaddr = subaddr.replace(" ", " ")
|
subaddr = subaddr.replace(" ", " ").replace(" ", " ")
|
||||||
return subaddr.strip()
|
return subaddr.strip()
|
||||||
|
|
||||||
|
# A unit value repeated with a space and/or comma between the copies, e.g. "12, 12".
# Compiled once at import time instead of on every cleanUnit() call.
_REPEATED_UNIT_PATTERN = re.compile(r'^\s*(.+?)\s*[, ]\s*\1\s*$')


def cleanUnit(unit):
    """Clean up a raw unit/secondary-address value.

    Steps, in order:
      1. Pass the value through removeUnitText().
      2. If the whole value is one token repeated with a space and/or comma
         between the copies (e.g. "12, 12"), keep a single copy.
      3. Apply every (pattern, replacement) pair in
         STANDARDIZATION_UNIT_REGEXES with re.sub.

    Args:
        unit: The raw unit string, or None.

    Returns:
        The cleaned unit string, or None when removeUnitText() returns None
        (which it does for None input).
    """
    unit = removeUnitText(unit)
    if unit is None:
        # removeUnitText passes None straight through; matching a regex
        # against None would raise TypeError, so stop here.
        return unit
    repeated = _REPEATED_UNIT_PATTERN.match(unit)
    if repeated:
        unit = repeated.group(1)
    for find, replace in STANDARDIZATION_UNIT_REGEXES.items():
        unit = re.sub(find, replace, unit)
    return unit
|
||||||
|
|
||||||
def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4="", county=False):
|
def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zipprefix=False, plus4="", county=False):
|
||||||
cfg = src.config.get_config()
|
cfg = src.config.get_config()
|
||||||
if not number:
|
if not number:
|
||||||
@ -214,7 +231,9 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
street = preStandardizeStreet(street.strip().upper())
|
street = preStandardizeStreet(street.strip().upper())
|
||||||
if cfg.verbose:
|
if cfg.verbose:
|
||||||
print(" Preprocessed street:", street)
|
print(" Preprocessed street:", street)
|
||||||
unit = unit.strip().upper()
|
unit = cleanUnit(unit.strip().upper())
|
||||||
|
if cfg.verbose and unit:
|
||||||
|
print(" Preprocessed unit:", unit)
|
||||||
city = city.strip().upper()
|
city = city.strip().upper()
|
||||||
state = state.strip().upper()
|
state = state.strip().upper()
|
||||||
zipcode = (zipcode or "").strip()
|
zipcode = (zipcode or "").strip()
|
||||||
@ -324,7 +343,7 @@ def normalizeAddress(number, street, unit, city, state, zipcode, lat, lon, zippr
|
|||||||
city = addr["city"]
|
city = addr["city"]
|
||||||
|
|
||||||
# Skip these if we already have a ZIP+4 code, assume it's accurate
|
# Skip these if we already have a ZIP+4 code, assume it's accurate
|
||||||
if zipcode is not None and len(zipcode) == 5 and not plus4:
|
if zipcode is not None and len(zipcode) == 5 and (not plus4 or cfg.noSkip4):
|
||||||
zipinfo = getCityStateForZIP(zipcode)
|
zipinfo = getCityStateForZIP(zipcode)
|
||||||
if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
|
if cfg.appendPlus4 or zipinfo == False or state != zipinfo["state"] or (cfg.appendUnitLabel and addr["address_line_2"] != ""):
|
||||||
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county)
|
zipcode, plus4, streetonly, unitprefix, unit = getZIP4(number, streetonly, addr['address_line_2'], state, lat, lon, city, zipcode, county)
|
||||||
|
|||||||
@ -221,9 +221,9 @@ UNITS = [
|
|||||||
'OFC',
|
'OFC',
|
||||||
'PH',
|
'PH',
|
||||||
'PIER',
|
'PIER',
|
||||||
'REAR',
|
# 'REAR',
|
||||||
'RM',
|
'RM',
|
||||||
'SIDE',
|
# 'SIDE',
|
||||||
'SLIP',
|
'SLIP',
|
||||||
'SPC',
|
'SPC',
|
||||||
'STOP',
|
'STOP',
|
||||||
@ -236,6 +236,25 @@ UNITS = [
|
|||||||
'(VACANT)' # One dataset does this...
|
'(VACANT)' # One dataset does this...
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Longhand unit designators mapped to their abbreviations.
# removeUnitText() applies these as plain substring replacements, in insertion
# order, on the upper-cased value before the UNITS labels are removed.
LONG_UNITS = {
    "APARTMENT": "APT",
    "BUILDING": "BLDG",
    "BASEMENT": "BSMT",
    "DEPARTMENT": "DEPT",
    "FLOOR": "FL",
    "FRONT": "FRNT",
    "HANGER": "HNGR",
    "HANGAR": "HNGR",  # Standard English spelling; accepted alongside HANGER
    "LOBBY": "LBBY",
    "LOWER": "LOWR",
    "OFFICE": "OFC",
    "PENTHOUSE": "PH",
    "PENT HOUSE": "PH",
    "ROOM": "RM",
    "SPACE": "SPC",
    "SUITE": "STE",
    "TRAILER": "TRLR",
}
|
||||||
|
|
||||||
STATES = {
|
STATES = {
|
||||||
"ALABAMA": "AL",
|
"ALABAMA": "AL",
|
||||||
"ALASKA": "AK",
|
"ALASKA": "AK",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user