AddressDatabase/checkoa.py
2025-11-15 19:51:14 -07:00

100 lines
3.6 KiB
Python
Executable File

#!/usr/bin/python3
import os, json, traceback
from argparse import ArgumentParser
rowstocheck = 15000 # Stop reading a file after this many rows, speeds up analysis of many/large address files

# Result buckets filled in by checkGeojson(). Each list holds the names of the
# files that were sorted into that category; a file can appear in several
# problem lists at once, but in oklist only if it appears in none of them.
oklist = []
emptygeometrylist = []
emptyaddresslist = []
nocitylist = []
noziplist = []
totallyemptylist = []


def checkGeojson(filename):
    """Scan one line-delimited GeoJSON file and record it in the result lists.

    Each non-blank line is parsed as a JSON object. A row is counted as:

    * missing its address when "number" or "street" is empty or absent,
    * missing its geometry when there is no geometry or either of the first
      two coordinates is empty (a coordinate of exactly 0 is falsy and is
      therefore counted as missing too),
    * missing its postal code when "postcode" is empty or absent,
    * missing its city only when both "city" and "postcode" are empty.

    At most ``rowstocheck`` rows are examined. Afterwards the file name is
    appended to every module-level problem list whose threshold is exceeded
    (geometry > 25 %, address > 67 %, city > 67 %, postal code > 75 % of the
    examined rows), or to ``oklist`` when no threshold is exceeded. A file
    with no rows at all is recorded in ``totallyemptylist``.

    Rows that cannot be parsed are reported on stdout with a traceback and
    skipped; they still count towards the number of examined rows.

    Args:
        filename: Path of the file to check.

    Returns:
        None. Results are communicated through the module-level lists.
    """
    linecount = 0
    okcount = 0
    emptygeometrycount = 0
    emptyaddresscount = 0
    emptycitycount = 0
    emptyzipcount = 0

    # GeoJSON is UTF-8 by specification, so do not rely on the locale default.
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r', encoding="utf-8") as filedata:
        for line in filedata:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            if linecount >= rowstocheck:
                break
            # Count only rows that are actually examined, so the ratios below
            # use the true denominator even when the row limit cuts in.
            linecount += 1
            try:
                data = json.loads(line)
                # Absent keys and explicit nulls are treated like empty
                # values; otherwise a file whose rows lack a field entirely
                # would raise on every row and end up looking healthy.
                properties = data.get("properties") or {}
                geometry = data.get("geometry") or {}
                coordinates = geometry.get("coordinates") or []
                noaddress = not properties.get("number") or not properties.get("street")
                nogeometry = len(coordinates) < 2 or not coordinates[0] or not coordinates[1]
                nozip = not properties.get("postcode")
                # Flag missing city unless postal code exists, because city
                # can probably be filled from that.
                nocity = not properties.get("city") and nozip
            except Exception:
                # Best effort: report the broken row and keep going.
                traceback.print_exc()
                print("Error encountered while processing", filename, "at", line)
                continue
            if noaddress:
                emptyaddresscount += 1
            if nogeometry:
                emptygeometrycount += 1
            if nocity:
                emptycitycount += 1
            if nozip:
                emptyzipcount += 1
            if not (noaddress or nogeometry or nocity or nozip):
                okcount += 1

    # One status line per file. The trailing carriage return lets the next
    # file's status overwrite this one instead of scrolling the terminal.
    print(filename, ": OK:", str(okcount), "Bad: geometry:", str(emptygeometrycount), "address:", str(emptyaddresscount), "city:", str(emptycitycount), "zip:", str(emptyzipcount), " ", end="\r", flush=True)

    if linecount == 0:
        # No rows means no addresses at all; report it without dividing by zero.
        totallyemptylist.append(filename)
        return

    bad = False
    if emptygeometrycount / linecount > .25:
        emptygeometrylist.append(filename)
        bad = True
    addressflagged = emptyaddresscount / linecount > .67
    if addressflagged:
        emptyaddresslist.append(filename)
        bad = True
    if emptycitycount / linecount > .67:
        nocitylist.append(filename)
        bad = True
    if emptyzipcount / linecount > .75:
        noziplist.append(filename)
        bad = True
    # Allow a couple not-fully-empty addresses, otherwise some broken ones
    # won't be reported. The ratio test must pass as well: on its own the
    # ten-row allowance would flag every file that has ten rows or fewer.
    if addressflagged and emptyaddresscount >= (linecount - 10):
        totallyemptylist.append(filename)
        bad = True
    if not bad:
        oklist.append(filename)
# Command-line interface: one or more file paths, collected into a list
# available as the parsed namespace's "source" attribute.
parser = ArgumentParser(description="Check OpenAddresses GeoJSON files and report on any problems found.")
parser.add_argument("source", nargs='+', help="File(s) to check.")
if __name__ == "__main__":
    # Script entry point: check every file named on the command line, then
    # print which files were sorted into each problem category.
    args = parser.parse_args()
    print(f"Checking {len(args.source)} OpenAddresses data files.")
    for path in args.source:
        checkGeojson(path)

    # The per-file status line is printed with end="\r", so move off that
    # line before the report starts.
    print(" ")
    print()
    print("== Report ==")

    # Heading paired with the result list it introduces, in output order.
    sections = (
        (" Files missing geometry:", emptygeometrylist),
        (" Files missing street address:", emptyaddresslist),
        (" Files missing city:", nocitylist),
        (" Files missing postal code:", noziplist),
        (" Files missing all street addresses:", totallyemptylist),
    )
    for heading, flagged in sections:
        print(heading)
        for path in flagged:
            print(" ", path)