加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
geoname2fips.py 37.77 KB
一键复制 编辑 原始数据 按行查看 历史
sherpya 提交于 2019-01-25 01:40 . geoname2fips: added a bunch of countries
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320
#!/usr/bin/env python3
# The MIT License (MIT)
#
# Copyright (c) 2019 Gianluigi Tiesi
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os
import re
import csv
import argparse
from typing import Pattern
# noinspection PyPackageRequirements
from unidecode import unidecode
from collections import defaultdict
# dataset from: https://github.com/datasets/fips-10-4
# with the help of: https://en.wikipedia.org/wiki/List_of_FIPS_region_codes
# FIPS 10-4 country codes to maxmind country names
# Country name -> two-letter country code.
# Keys are country names in the upper-case form produced by cleanup(); correlate()
# looks every country of the location file up here and uses the resulting code to
# select that country's regions in the table built by parse_fips().
FIPS_COUNTRIES = {
    'AFGHANISTAN': 'AF',
    'AKROTIRI SOVEREIGN BASE AREA': 'AX',
    'ALBANIA': 'AL',
    'ALGERIA': 'AG',
    'AMERICAN SAMOA': 'AQ',
    'ANDORRA': 'AN',
    'ANGOLA': 'AO',
    'ANGUILLA': 'AV',
    'ANTARCTICA': 'AY',
    'ANTIGUA AND BARBUDA': 'AC',
    'ARGENTINA': 'AR',
    'ARMENIA': 'AM',
    'ARUBA': 'AA',
    'ASHMORE AND CARTIER ISLANDS': 'AT',
    'AUSTRALIA': 'AS',
    'AUSTRIA': 'AU',
    'AZERBAIJAN': 'AJ',
    'BAHAMAS': 'BF',
    'BAHRAIN': 'BA',
    'BAKER ISLAND': 'FQ',
    'BANGLADESH': 'BG',
    'BARBADOS': 'BB',
    'BASSAS DA INDIA': 'BS',
    'BELARUS': 'BO',
    'BELGIUM': 'BE',
    'BELIZE': 'BH',
    'BENIN': 'BN',
    'BERMUDA': 'BD',
    'BHUTAN': 'BT',
    'BOLIVIA': 'BL',
    'BOSNIA AND HERZEGOVINA': 'BK',
    'BOTSWANA': 'BC',
    'BOUVET ISLAND': 'BV',
    'BRAZIL': 'BR',
    'BRITISH INDIAN OCEAN TERRITORY': 'IO',
    'BRITISH VIRGIN ISLANDS': 'VI',
    'BRUNEI': 'BX',
    'BULGARIA': 'BU',
    'BURKINA FASO': 'UV',
    'BURMA': 'BM',
    'BURUNDI': 'BY',
    'CABO VERDE': 'CV',
    'CAMBODIA': 'CB',
    'CAMEROON': 'CM',
    'CANADA': 'CA',
    'CAYMAN ISLANDS': 'CJ',
    'CENTRAL AFRICAN REPUBLIC': 'CT',
    'CHAD': 'CD',
    'CHILE': 'CI',
    'CHINA': 'CH',
    'CHRISTMAS ISLAND': 'KT',
    'CLIPPERTON ISLAND': 'IP',
    'COCOS ISLANDS': 'CK',
    'COLOMBIA': 'CO',
    'COMOROS': 'CN',
    'CONGO': 'CG',
    'COOK ISLANDS': 'CW',
    'CORAL SEA ISLANDS': 'CR',
    'COSTA RICA': 'CS',
    'IVORY COAST': 'IV',
    'CROATIA': 'HR',
    'CUBA': 'CU',
    'CYPRUS': 'CY',
    'CZECHIA': 'EZ',
    'DEMOCRATIC REPUBLIC OF TIMOR LESTE': 'TT',
    'DENMARK': 'DA',
    'DHEKELIA SOVEREIGN BASE AREA': 'DX',
    'DJIBOUTI': 'DJ',
    'DOMINICA': 'DO',
    'DOMINICAN REPUBLIC': 'DR',
    'ECUADOR': 'EC',
    'EGYPT': 'EG',
    'EL SALVADOR': 'ES',
    'EQUATORIAL GUINEA': 'EK',
    'ERITREA': 'ER',
    'ESTONIA': 'EN',
    'ESWATINI': 'WZ',
    'ETHIOPIA': 'ET',
    'ETOROFU HABOMAI KUNASHIRI AND SHIKOTAN ISLANDS': 'PJ',
    'EUROPA ISLAND': 'EU',
    'FALKLAND ISLANDS': 'FK',
    'FAROE ISLANDS': 'FO',
    'FEDERATED STATES OF MICRONESIA': 'FM',
    'FIJI': 'FJ',
    'FINLAND': 'FI',
    'FRANCE': 'FR',
    'FRENCH GUIANA': 'FG',
    'FRENCH POLYNESIA': 'FP',
    'FRENCH SOUTHERN TERRITORIES': 'FS',
    'GABON': 'GB',
    'GAMBIA': 'GA',
    'GEORGIA': 'GG',
    'GERMANY': 'GM',
    'GHANA': 'GH',
    'GIBRALTAR': 'GI',
    'GLORIOSO ISLANDS': 'GO',
    'GREECE': 'GR',
    'GREENLAND': 'GL',
    'GRENADA': 'GJ',
    'GUADELOUPE': 'GP',
    'GUAM': 'GQ',
    'GUATEMALA': 'GT',
    'GUERNSEY': 'GK',
    'GUINEA': 'GV',
    'GUINEA BISSAU': 'PU',
    'GUYANA': 'GY',
    'HAITI': 'HA',
    'HASHEMITE KINGDOM OF JORDAN': 'JO',
    'HEARD ISLAND AND MCDONALD ISLANDS': 'HM',
    'HONDURAS': 'HO',
    'HONG KONG': 'HK',
    'HOWLAND ISLAND': 'HQ',
    'HUNGARY': 'HU',
    'ICELAND': 'IC',
    'INDIA': 'IN',
    'INDONESIA': 'ID',
    'IRAN': 'IR',
    'IRAQ': 'IZ',
    'IRELAND': 'EI',
    'ISLE OF MAN': 'IM',
    'ISRAEL': 'IS',
    'ITALY': 'IT',
    'JAMAICA': 'JM',
    'JAPAN': 'JA',
    'JARVIS ISLAND': 'DQ',
    'JERSEY': 'JE',
    'JOHNSTON ATOLL': 'JQ',
    'JUAN DE NOVA ISLAND': 'JU',
    'KAZAKHSTAN': 'KZ',
    'KENYA': 'KE',
    'KINGMAN REEF': 'KQ',
    'KIRIBATI': 'KR',
    'KOSOVO': 'KV',
    'KUWAIT': 'KU',
    'KYRGYZSTAN': 'KG',
    'LAOS': 'LA',
    'LATVIA': 'LG',
    'LEBANON': 'LE',
    'LESOTHO': 'LT',
    'LIBERIA': 'LI',
    'LIBYA': 'LY',
    'LIECHTENSTEIN': 'LS',
    'LUXEMBOURG': 'LU',
    'MACAO': 'MC',
    'MACEDONIA': 'MK',
    'MADAGASCAR': 'MA',
    'MALAWI': 'MI',
    'MALAYSIA': 'MY',
    'MALDIVES': 'MV',
    'MALI': 'ML',
    'MALTA': 'MT',
    'MARSHALL ISLANDS': 'RM',
    'MARTINIQUE': 'MB',
    'MAURITANIA': 'MR',
    'MAURITIUS': 'MP',
    'MAYOTTE': 'MF',
    'MEXICO': 'MX',
    'MIDWAY ISLANDS': 'MQ',
    'MONACO': 'MN',
    'MONGOLIA': 'MG',
    'MONTENEGRO': 'MJ',
    'MONTSERRAT': 'MH',
    'MOROCCO': 'MO',
    'MOZAMBIQUE': 'MZ',
    'MYANMAR': 'MM',
    'NAMIBIA': 'WA',
    'NAURU': 'NR',
    'NAVASSA ISLAND': 'BQ',
    'NEPAL': 'NP',
    'NETHERLANDS': 'NL',
    'NETHERLANDS ANTILLES': 'NT',
    'NEW CALEDONIA': 'NC',
    'NEW ZEALAND': 'NZ',
    'NICARAGUA': 'NU',
    'NIGER': 'NG',
    'NIGERIA': 'NI',
    'NIUE': 'NE',
    'NORFOLK ISLAND': 'NF',
    'NORTH KOREA': 'KN',
    'NORTHERN MARIANA ISLANDS': 'CQ',
    'NORWAY': 'NO',
    'OMAN': 'MU',
    'PAKISTAN': 'PK',
    'PALAU': 'PS',
    'PALMYRA ATOLL': 'LQ',
    'PANAMA': 'PM',
    'PAPUA NEW GUINEA': 'PP',
    'PARACEL ISLANDS': 'PF',
    'PARAGUAY': 'PA',
    'PERU': 'PE',
    'PHILIPPINES': 'RP',
    'PITCAIRN ISLANDS': 'PC',
    'POLAND': 'PL',
    'PORTUGAL': 'PO',
    'PUERTO RICO': 'RQ',
    'QATAR': 'QA',
    'REPUBLIC OF KOREA': 'KS',
    'REPUBLIC OF LITHUANIA': 'LH',
    'REPUBLIC OF MOLDOVA': 'MD',
    'REPUBLIC OF CONGO': 'CF',
    'ROMANIA': 'RO',
    'RUSSIA': 'RS',
    'RWANDA': 'RW',
    'REUNION': 'RE',
    'SAINT BARTHELEMY': 'TB',
    'SAINT HELENA': 'SH',
    'SAINT LUCIA': 'ST',
    'SAINT MARTIN': 'RN',
    'SAINT PIERRE AND MIQUELON': 'SB',
    'SAINT VINCENT AND GRENADINES': 'VC',
    'SAMOA': 'WS',
    'SAN MARINO': 'SM',
    'SAUDI ARABIA': 'SA',
    'SENEGAL': 'SG',
    'SERBIA': 'RI',
    'SEYCHELLES': 'SE',
    'SIERRA LEONE': 'SL',
    'SINGAPORE': 'SN',
    'SLOVAKIA': 'LO',
    'SLOVENIA': 'SI',
    'SOLOMON ISLANDS': 'BP',
    'SOMALIA': 'SO',
    'SOUTH AFRICA': 'SF',
    'SOUTH GEORGIA AND SOUTH SANDWICH ISLANDS': 'SX',
    'SOUTH SUDAN': 'SS',
    'SPAIN': 'SP',
    'SPRATLY ISLANDS': 'PG',
    'SRI LANKA': 'CE',
    'ST KITTS AND NEVIS': 'SC',
    'SUDAN': 'SU',
    'SURINAME': 'NS',
    'SVALBARD AND JAN MAYEN': 'SV',
    'SWEDEN': 'SW',
    'SWITZERLAND': 'SZ',
    'SYRIA': 'SY',
    'SAO TOME AND PRINCIPE': 'TP',
    'TAIWAN': 'TW',
    'TAJIKISTAN': 'TI',
    'TANZANIA': 'TZ',
    'THAILAND': 'TH',
    'TOGO': 'TO',
    'TOKELAU': 'TL',
    'TONGA': 'TN',
    'TRINIDAD AND TOBAGO': 'TD',
    'TROMELIN ISLAND': 'TE',
    'TUNISIA': 'TS',
    'TURKEY': 'TU',
    'TURKMENISTAN': 'TX',
    'TURKS AND CAICOS ISLANDS': 'TK',
    'TUVALU': 'TV',
    'US VIRGIN ISLANDS': 'VQ',
    'UGANDA': 'UG',
    'UKRAINE': 'UP',
    'UNDESIGNATED SOVEREIGNTY': 'UU',
    'UNITED ARAB EMIRATES': 'AE',
    'UNITED KINGDOM': 'UK',
    'UNITED STATES': 'US',
    'URUGUAY': 'UY',
    'UZBEKISTAN': 'UZ',
    'VANUATU': 'NH',
    'VATICAN CITY': 'VT',
    'VENEZUELA': 'VE',
    'VIETNAM': 'VM',
    'WAKE ISLAND': 'WQ',
    'WALLIS AND FUTUNA': 'WF',
    'WEST BANK': 'WE',
    'WESTERN SAHARA': 'WI',
    'YEMEN': 'YM',
    'ZAMBIA': 'ZA',
    'ZIMBABWE': 'ZI'
}
# Countries (names in cleanup() form) for which correlate() does no region matching.
# If the country also has an entry in FIPS_COUNTRIES, all of its locations are
# written with region code '00'; if it has none, its locations are skipped and the
# "country not found" exception is suppressed.
COUNTRY_IGNORE = (
    # without FIPS 10-4 code
    'ALAND',
    'BONAIRE, SINT EUSTATIUS, AND SABA',
    'CURACAO',
    'SINT MAARTEN',
    'US MINOR OUTLYING ISLANDS',
    'PALESTINE',  # WE? no entries on wikipedia
    # empty in fips csv
    'AMERICAN SAMOA',
    'COOK ISLANDS',
    'DEMOCRATIC REPUBLIC OF TIMOR LESTE',
    'FRENCH POLYNESIA',
    'FRENCH SOUTHERN TERRITORIES',
    'HONG KONG',
    'MALTA',
    'MARSHALL ISLANDS',
    'MONTENEGRO',
    'NEW CALEDONIA',
    'NORTHERN MARIANA ISLANDS',
    'SAINT PIERRE AND MIQUELON',
    'SINGAPORE',
    'SVALBARD AND JAN MAYEN',
    'TOKELAU',
    'TUVALU',
    'US VIRGIN ISLANDS',
    'WALLIS AND FUTUNA',
    # missing in fips csv
    'ISLE OF MAN',
    'MYANMAR',
    'SOUTH SUDAN',
    # wip
    'INDIA',
    'INDONESIA',
    'IRAN',
    'ISRAEL',
    'IVORY COAST',
    'KAZAKHSTAN',
    'KENYA',
    'LAOS',
    'LATVIA',
    'LIBERIA',
    'LIBYA',
    'LUXEMBOURG',
    'MACEDONIA',
    'MALAWI',
    'MAURITANIA',
    'MEXICO',
    'MONGOLIA',
    'MOROCCO',
    'MOZAMBIQUE',
    'NAMIBIA',
    'NEPAL',
    'NIGERIA',
    'NORTH KOREA',
    'PAKISTAN',
    'PAPUA NEW GUINEA',
    'PARAGUAY',
    'PHILIPPINES',
    'POLAND',
    'QATAR',
    'REPUBLIC OF CONGO',
    'REPUBLIC OF KOREA',
    'REPUBLIC OF LITHUANIA',
    'REPUBLIC OF MOLDOVA',
    'RUSSIA',
    'SAINT LUCIA',
    'SAUDI ARABIA',
    'SENEGAL',
    'SERBIA',
    'SLOVAKIA',
    'SLOVENIA',
    'SOMALIA',
    'SOUTH AFRICA',
    'SRI LANKA',
    'SUDAN',
    'SWITZERLAND',
    'TANZANIA',
    'THAILAND',
    'TUNISIA',
    'TURKMENISTAN',
    'UGANDA',
    'UKRAINE',
    'UNITED KINGDOM',
    'VIETNAM'
)
# Country name -> tuple of location names (both in cleanup() form) consulted by
# ignore_city(). When correlate() cannot match such a location to any region it
# writes region code '00' for it instead of raising.
CITY_IGNORE = {
    'AZERBAIJAN': ('AGHSU RAYON',),
    'BAHAMAS': ('CENTRAL ABACO DISTRICT', 'EAST GRAND BAHAMA DISTRICT', 'HOPE TOWN DISTRICT', 'NORTH ELEUTHERA'),
    'CZECHIA': ('CZECHIA',),  # city?
    'CONGO': ('BOENDE',),
    'ESWATINI': ('ESWATINI',),  # city?
}
# Country name -> tuple of region names (both in cleanup() form) consulted by
# ignore_region(). In correlate() an unmatched location whose own name, or whose
# resolved region name, is listed here is written with region code '00' instead of
# raising; conversely, a location that *was* matched through a listed region raises
# 'Ignored region found', signalling that the entry is stale.
REGION_IGNORE = {
    'BAHAMAS': ('NORTH ANDROS DISTRICT', 'CENTRAL ANDROS DISTRICT', 'MOORES ISLAND DISTRICT', 'NORTH ABACO DISTRICT',
                'GRAND CAY DISTRICT', 'SPANISH WELLS DISTRICT', 'EAST GRAND BAHAMA DISTRICT',
                'WEST GRAND BAHAMA DISTRICT'),
    'BAHRAIN': ('NORTHERN', 'SOUTHERN GOVERNORATE'),
    'BOSNIA AND HERZEGOVINA': ('BRCKO',),
    'BOTSWANA': ('CHOBE DISTRICT', 'JWANENG', 'LOBATSE'),
    'BHUTAN': ('GASA', 'TRASHI YANGSTE'),
    'BURKINA FASO': ('CASCADES REGION', 'CENTRE', 'CENTRE EST', 'CENTRE NORD', 'CENTRE OUEST', 'EST', 'HAUTS BASSINS',
                     'NORD', 'PLATEAU CENTRAL', 'SUD OUEST'),
    'CAMBODIA': ('TBOUNG KHMUM',),
    # since 2018
    'CHILE': ('NUBLE',),
    # since 2010
    'CUBA': ('MAYABEQUE',),
    'EGYPT': ('LUXOR',),
    'FINLAND': ('CENTRAL FINLAND', 'NORTHERN OSTROBOTHNIA', 'SOUTHERN OSTROBOTHNIA', 'CENTRAL OSTROBOTHNIA',
                'OSTROBOTHNIA', 'FINLAND PROPER', 'TAVASTIA PROPER', 'PIRKANMAA', 'KYMENLAAKSO', 'UUSIMAA',
                'NORTH KARELIA', 'SATAKUNTA', 'SOUTHERN SAVONIA', 'PAIJANNE TAVASTIA', 'NORTHERN SAVO',
                'SOUTH KARELIA', 'KAINUU'),
    # since 2016
    'FRANCE': ('NOUVELLE AQUITAINE', 'BOURGOGNE FRANCHE COMTE', 'HAUTS DE FRANCE', 'OCCITANIE', 'GRAND EST',
               'AUVERGNE RHONE ALPES', 'NORMANDY'),
    'GEORGIA': ('SAMTSKHE JAVAKHETI', 'GURIA', 'IMERETI', 'KAKHETI', 'KVEMO KARTLI',
                'RACHA LECHKHUMI AND KVEMO SVANETI', 'SAMEGRELO AND ZEMO SVANETI', 'SHIDA KARTLI'),
    'GREECE': ('NORTH AEGEAN', 'SOUTH AEGEAN', 'CENTRAL GREECE', 'WEST GREECE', 'CENTRAL MACEDONIA', 'WEST MACEDONIA',
               'CRETE', 'EPIRUS', 'IONIAN ISLANDS', 'PELOPONNESE', 'MOUNT ATHOS'),
    # since 2018
    'GREENLAND': ('QEQERTALIK', 'AVANNAATA'),
    'GRENADA': ('CARRIACOU AND PETITE MARTINIQUE',),
    'IRELAND': ('LEINSTER', 'MUNSTER', 'CONNAUGHT', 'ULSTER'),
    'MALDIVES': ('NORTHERN ARI ATOLL',),
    'NEW ZEALAND': ('TASMAN',),  # Tasman Region was not assigned a code for unknown reasons. (wikipedia)
    'NORWAY': ('TRONDELAG',),
    'OMAN': ('AL BATINAH NORTH GOVERNORATE', 'AL BATINAH SOUTH', 'NORTHEASTERN GOVERNORATE',
             'SOUTHEASTERN GOVERNORATE'),
    'TAIWAN': ('YUNLIN', 'NANTOU', 'CHIAYI', 'CHIAYI COUNTY', 'PENGHU COUNTY', 'HSINCHU COUNTY', 'CHANGHUA',
               'TAICHUNG CITY', 'TAINAN', 'TAOYUAN', 'KEELUNG', 'YILAN', 'MIAOLI', 'KINMEN COUNTY', 'PINGTUNG',
               'LIENCHIANG', 'HUALIEN', 'TAITUNG', 'HSINCHU', 'NEW TAIPEI'),
    'TAJIKISTAN': ('DUSHANBE', 'GORNO BADAKHSHAN', 'REPUBLICAN SUBORDINATION'),
    'TRINIDAD AND TOBAGO': ('SAN JUAN/LAVENTILLE', 'TUNAPUNA/PIARCO', 'PENAL/DEBE', 'POINT FORTIN',
                            'COUVA TABAQUITE TALPARO', 'DIEGO MARTIN', 'CHAGUANAS', 'SIPARIA', 'PRINCES TOWN',
                            'SANGRE GRANDE'),
    'PANAMA': ('NGOEBE BUGLE', 'PANAMA OESTE', 'EMBERA WOUNAAN', 'GUNA YALA'),
    # since 2017
    'SIERRA LEONE': ('NORTH WEST',),
    'SOLOMON ISLANDS': ('HONIARA',),
    'SPAIN': ('CEUTA', 'MELILLA'),
    'TONGA': ('EUA', 'NIUAS'),
    # since 2004
    'YEMEN': ('RAYMAH', 'SOQATRA'),
    # since 2011
    'ZAMBIA': ('MUCHINGA',)
}
# Two-letter country code -> {region name: replacement name}, all names in
# cleanup() form. parse_fips() applies it to every region name read from the fips
# csv before storing it; correlate() applies it to the region names it obtains
# from the city and subdivision-2 tables before searching the fips table again.
REGION_REPLACE = {
    'AE': {
        'ABU ZABY': 'ABU DHABI',
        'DUBAYY': 'DUBAI'
    },
    'AF': {
        'KABOL': 'KABUL',
        'KANDAHAR KANDAHAR': 'KANDAHAR'
    },
    'AG': {
        'ALGER': 'ALGIERS',
        'TAMANGHASSET': 'TAMANRASSET',
    },
    'AJ': {
        'ABSERON': 'ABSHERON',
        'BAKI': 'BAKU CITY',
        'GANCA': 'GANJA CITY',
        'NAXCIVAN': 'NAKHICHEVAN',
        'YARDIMLI': 'YARDYMLI',
        'SAKI': 'SHAKI CITY',
        'SUMQAYIT': 'SUMQAYIT CITY',
    },
    'AL': {
        'BERAT': 'BERATIT',
        'DIBER': 'DIBRES',
        'DURRES': 'DURRESIT',
        'ELBASAN': 'ELBASANIT',
        'FIER': 'FIERIT',
        'TIRANE': 'TIRANA',
        'GJIROKASTER': 'GJIROKASTRES',
        'KORCE': 'KORCES',
        'LEZHE': 'LEZHES',
        'SHKODER': 'SHKODRES',
        'VLORE': 'VLORES'
    },
    'AM': {
        'LORRI': 'LORI'
    },
    'AO': {
        'CUANDO CUBANGO': 'CUANDO COBANGO',
        'CUANZA SUL': 'KWANZA SUL',
        'LUNDA NORTE': 'LUANDA NORTE',
    },
    'AR': {
        'TIERRA DEL FUEGO ANTARTIDA E ISLAS DEL ATLANTICO SUR': 'TIERRA DEL FUEGO'
    },
    'AU': {
        'KARNTEN': 'CARINTHIA',
        'NIEDEROSTERREICH': 'LOWER AUSTRIA',
        'OBEROSTERREICH': 'UPPER AUSTRIA',
        'STEIERMARK': 'STYRIA',
        'TIROL': 'TYROL',
        'WIEN': 'VIENNA'
    },
    'BA': {
        'AL MUHARRAQ': 'MUHARRAQ',
        'AL ASIMAH': 'MANAMA',
        'AR RIFA WA AL MINTAQAH AL JANUBIYAH': 'AR RIFA'
    },
    'BC': {
        'CENTRAL': 'CENTRAL DISTRICT',
        'FRANCISTOWN': 'CENTRAL',
        'SOUTH EAST': 'GABORONE',
        'SOUTHERN': 'NGWAKETSI',
    },
    'BD': {
        'HAMILTON': 'HAMILTON CITY',
    },
    'BE': {
        'BRABANT WALLON': 'WALLONIA',
        'BRUSSELS HOOFDSTEDELIJK GEWEST/REGION DE BRUXELLES CAPITALE': 'BRUSSELS CAPITAL',
        'VLAMMS BRABANT': 'FLANDERS'
    },
    'BF': {
        # 'FREEPORT': 'CITY OF FREEPORT',
        'NICHOLLSTOWN AND BERRY ISLANDS': 'BERRY ISLANDS',
        'SANDY POINT': 'SOUTH ABACO'
    },
    'BG': {
        'RANGPUR DIVISION': 'RAJSHAHI',
        'MYMENSINGH DIVISION': 'DHAKA'
    },
    'BK': {
        'FEDERATION OF BOSNIA AND HERZEGOVINA': 'FEDERATION OF B&H',
        'REPUBLICA SRPSKA': 'REPUBLIKA SRPSKA'
    },
    'BO': {
        'BRESTSKAYA VOBLASTS': 'BREST',
        'HOMYELSKAYA VOBLASTS': 'GOMEL',
        'HRODZYENSKAYA VOBLASTS': 'GRODNENSKAYA',
        'MAHILYOWSKAYA VOBLASTS': 'MOGILEV',
        'MINSKAYA VOBLASTS': 'MINSK CITY',
        'VITSYEBSKAYA VOBLASTS': 'VITEBSK'
    },
    'BP': {
        'MAKIRA': 'MAKIRA ULAWA'
    },
    'BR': {
        'DISTRITO FEDERAL': 'FEDERAL DISTRICT'
    },
    'BT': {
        'CHHUKHA': 'CHUKHA',
        'CHIRANG': 'TSIRANG',
        'DAGA': 'DAGANA',
        'GEYLEGPHUG': 'SARPANG',
        'HA': 'HAA',
        'LHUNTSHI': 'LHUNTSE',
        'PEMAGATSEL': 'PEMAGATSHEL',
        'SAMDRUP': 'SAMDRUP JONGKHAR',
        'TASHIGANG': 'TRASHIGANG',
        'TONGSA': 'TRONGSA',
        'WANGDI PHODRANG': 'WANGDUE PHODRANG'
    },
    'BU': {
        'KHASKOVO': 'HASKOVO',
        'KURDZHALI': 'KARDZHALI',
        'SOFIYA': 'SOFIA',
        'SOFIYA GRAD': 'SOFIA CAPITAL',
        'TURGOVISHTE': 'TARGOVISHTE',
        'VELIKO TURNOVO': 'VELIKO TARNOVO'
    },
    'CA': {
        'YUKON TERRITORY': 'YUKON'
    },
    'CB': {
        'BANTEAY MEAN CHEAY': 'BANTEAY MEANCHEY',
        'BATDAMBANG': 'BATTAMBANG',
        'KAMPONG SPOE': 'KAMPONG SPEU',
        'KAMPONG THUM': 'KAMPONG THOM',
        'KAOH KONG': 'KOH KONG',
        'KRACHEH': 'KRATIE',
        'KEB': 'KEP',
        'MONDOL KIRI': 'MONDOLKIRI',
        'PHNUM PENH': 'PHNOM PENH',
        'POUTHISAT': 'PURSAT',
        'PREAH SEIHANU': 'PREAH SIHANOUK',
        'ROTANAH KIRI': 'RATANAKIRI',
        'SIEM REAB': 'SIEM REAP',
        'STOENG TRENG': 'STUNG TRENG',
        'TAKEV': 'TAKEO',
        'OTDAR MEAN CHEAY': 'OTAR MEANCHEY'
    },
    'CD': {
        'OUADDAI': 'OUADAI'
    },
    'CG': {
        'SUD KIVU': 'SOUTH KIVU',
        'KINSHASA': 'KINSHASA CITY'
    },
    'CH': {
        'NEI MONGOL INNER MONGOLIA': 'INNER MONGOLIA AUTONOMOUS REGION',
        'XIZANG TIBET': 'TIBET',
        'NINGXIA NINGXIA': 'NINGXIA HUI AUTONOMOUS REGION'
    },
    'CI': {
        'BIO BIO': 'BIOBIO',
        'AISEN DEL GENERAL CARLOS IBANEZ DEL CAMPO': 'AYSEN',
        'REGION METROPOLITANA': 'SANTIAGO METROPOLITAN',
        'LIBERTADOR GENERAL BERNARDO OHIGGINS': 'OHIGGINS REGION',
        'MAGALLANES Y DE LA ANTARTICA CHILENA': 'REGION OF MAGALLANES'
    },
    'CM': {
        'OUEST WEST': 'WEST REGION',
        'NORD OUEST NORTH WEST': 'NORTH WEST REGION',
        'SUD OUEST SOUTH WEST': 'SOUTH WEST REGION',
        'SUD SOUTH': 'SOUTH'
    },
    'CN': {
        'ANJOUAN': 'NDZUWANI'
    },
    'CO': {
        'DISTRITO CAPITAL': 'BOGOTA DC'
    },
    'CT': {
        'NANA GREBINGUI': 'NANA GREBIZI'
    },
    'CU': {
        'ISLA DE LA JUVENTUD': 'MUNICIPIO ESPECIAL ISLA DE LA JUVENTUD',
        'LA HABANA': 'HAVANA'
    },
    'CV': {
        'CAPE VERDE': 'CABO VERDE',
        'RIBEIRA GRANDE': 'RIBEIRA GRANDE DE SANTIAGO',
        'SANTA CRUZ': 'SAO LOURENCO DOS ORGAOS',  # since 2005
    },
    'CY': {
        'FAMAGUSTA': 'AMMOCHOSTOS',
        'PAPHOS': 'PAFOS',
        'LARNACA': 'LARNAKA',
        'KYRENIA': 'KERYNEIA'
    },
    'DA': {
        'SYDDANMARK': 'SOUTH DENMARK',
        'NORDJYLLAND': 'NORTH DENMARK',
        'MIDTJYLLEN': 'CENTRAL JUTLAND',
        'HOVEDSTADEN': 'CAPITAL REGION',
        'SJAELLAND': 'ZEALAND'
    },
    'DJ': {
        'TADJOURA': 'TADJOURAH'
    },
    'DR': {
        'DISTRITO NACIONAL': 'NACIONAL',
        'BAHORUCO': 'BAORUCO',
        'SALCEDO': 'HERMANAS MIRABAL'
    },
    'EC': {
        'ORELLANA': 'FRANCISCO DE ORELLANA'
    },
    'EG': {
        'AL JIZAH': 'GIZA',
        'ASH SHARQIYAH': 'SHARQIA',
        'AD DAQAHLIYAH': 'DAKAHLIA',
        'AL QALYUBIYAH': 'QALYUBIA',
        'AL GHARBIYAH': 'GHARBIA',
        'AL FAYYUM': 'FAIYUM',
        'AL ISKANDARIYAH': 'ALEXANDRIA',
        'AL QAHIRAH': 'CAIRO GOVERNORATE',
        'QINA': 'QENA',
        'AL MINUFIYAH': 'MONUFIA',
        'KAFR ASH SHAYKH': 'KAFR EL SHEIKH',
        'AL BUHAYRAH': 'BEHEIRA',
        'BANI SUWAYF': 'BENI SUWEIF',
        'DUMYAT': 'DAMIETTA GOVERNORATE',
        'AL BAHR AL AHMAR': 'RED SEA',
        'AL ISMAILIYAH': 'ISMAILIA GOVERNORATE',
        'AL MINYA': 'MINYA',
        'SHAMAL SINA': 'NORTH SINAI',
        'JANUB SINA': 'SOUTH SINAI',
        'BUR SAID': 'PORT SAID',
        'SUHAJ': 'SOHAG',
        'AS SUWAYS': 'SUEZ'
    },
    'EN': {
        'TARTUMAA': 'TARTU',
        'LAANEMAA': 'LAANE',
        'SAAREMAA': 'SAARE'
    },
    'ER': {
        'MAAKEL': 'MAEKEL',
        'DEBUBAWI KEYIH BAHRI': 'SOUTHERN RED SEA',
        'SEMENAWI KEYIH BAHRI': 'NORTHERN RED SEA'
    },
    'ET': {
        'AMARA': 'AMHARA',
        'ADIS ABEBA': 'ADDIS ABABA',
        'YEDEBUB BIHEROCH BIHERESEBOCH NA HIZBOCH': 'SOUTHERN NATIONS, NATIONALITIES, AND PEOPLES REGION',
        'GAMBELA HIZBOCH': 'GAMBELA',
        'HARERI HIZB': 'HARARI',
        'SUMALE': 'SOMALI'
    },
    'EZ': {
        'JIHOMORAVKY KRAJ': 'SOUTH MORAVIAN',
        'ZLINSKY KRAJ': 'ZLIN',
        'STREDOCESKY KRAJ': 'CENTRAL BOHEMIA',
        'MORAVSKOLEZSKY KRAJ': 'MORAVSKOSLEZSKY'
    },
    'FI': {
        'LAPPI': 'LAPLAND'
    },
    'FR': {
        'CENTRE': 'CENTRE VAL DE LOIRE',
        'BRETAGNE': 'BRITTANY',
        'CORSE': 'CORSICA',
        'LIMOUSIN': 'LIMOSINE'
    },
    'GA': {
        'WESTERN': 'WEST COAST'
    },
    'GB': {
        'NGOUNIE': 'NGOUNI'
    },
    'GG': {
        'TBILISI': 'KALAKI TBILISI',
        'MTSKHETIS RAIONI': 'MTSKHETA MTIANETI',  # wtf
    },
    'GM': {
        'BAYERN': 'BAVARIA',
        'NORDRHEIN WESTFALEN': 'NORTH RHINE WESTPHALIA',
        'HESSEN': 'HESSE',
        'NIEDERSACHSEN': 'LOWER SAXONY',
        'THURINGEN': 'THURINGIA',
        'SACHSEN ANHALT': 'SAXONY ANHALT',
        'SACHSEN': 'SAXONY',
        'BERLIN': 'LAND BERLIN'
    },
    'GR': {
        'ATTIKI': 'ATTICA',
        'LARISA': 'THESSALY',
        'EVROS': 'EAST MACEDONIA AND THRACE'
    },
    'GT': {
        'SUCHITEPEQUEZ': 'SUCHITEPEQUE'
    },
    'HA': {
        'GRAND ANSE': 'GRANDANS'
    },
    'HO': {
        'ISLAS DE LA BAHIA': 'BAY ISLANDS'
    },
    'HR': {
        'VUKOVARSKO SRIJEMSKA': 'VUKOVAR SIRMIUM',
        'SPLITSKO DALMATINSKA': 'SPLIT DALMATIA',
        'ISTARSKA': 'ISTRIA',
        'BRODSKO POSAVSKA': 'SLAVONSKI BROD POSAVINA',
        'MEDIMURSKA': 'MEGIMURSKA',
        'ZAGREBACKA': 'ZAGREB COUNTY',
        'GRAD ZAGREB': 'ZAGREB'
    },
    'IC': {
        'AUSTURLAND': 'EAST',
        'HOFUDBORGARSVAEDI': 'CAPITAL',
        'NORDURLAND EYSTRA': 'NORTHEAST',
        'NORDURLAND VESTRA': 'NORTHWEST',
        'SUDURLAND': 'SOUTH',
        'SUDURNES': 'SOUTHERN PENINSULA',
        'VESTFIRDIR': 'WESTFJORDS',
        'VESTURLAND': 'WEST'
    },
    'IT': {
        'LOMBARDIA': 'LOMBARDY',
        'TOSCANA': 'TUSCANY',
        'SARDEGNA': 'SARDINIA',
        'ABRUZZI': 'ABRUZZO',
        'SICILIA': 'SICILY',
        'PUGLIA': 'APULIA',
        'BASILICATA': 'BASILICATE',
        'LAZIO': 'LATIUM',
        'MARCHE': 'MARCHES',
        'PIEMONTE': 'PIEDMONT',
        'VALLE DAOSTA': 'AOSTA VALLEY'
    },
    'IZ': {
        'AL ANBAR': 'ANBAR',
        'AL BASRAH': 'BASRA',
        'DAHUK': 'DIHOK',
        'AT TAMIM': 'KIRKUK',
        'NINAWA': 'NINEVEH'
    },
    'JA': {
        'FUKUSHIMA': 'FUKUSHIMA KEN',
        'GUMMA': 'GUNMA'
    },
    'JO': {
        'AJLUN': 'AJLOUN',
        'AL AQABAH': 'AQABA',
        'AL KARAK': 'KARAK',
        'AL BALQA': 'BALQA',
        'AL MAFRAQ': 'MAFRAQ',
        'AT TAFILAH': 'TAFIELAH',
        'AZ ZARQA': 'ZARQA',
        'JARASH': 'JERASH',
        'JORDAN': 'HASHEMITE KINGDOM OF JORDAN',
    },
    'KG': {
        'YSYK KOL': 'ISSYK KUL',
        'CHUY': 'CHUYSKAYA',
        'BISHKEK': 'GOROD BISHKEK',
        'OSH': 'OSH OBLASTY'
    },
    'LE': {
        'LIBAN SUD': 'SOUTH GOVERNORATE'
    },
    'MC': {
        'MACAU': 'MACAO'
    },
    'MU': {
        'AL BURAYMI': 'AL BURAIMI',
        'MASQAT': 'MUSCAT',
        'ZUFAR': 'DHOFAR'
    },
    'MV': {
        'GNAVIYANI': 'GNYAVIYANI'
    },
    'MY': {
        'PULAU PINANG': 'PENANG'
    },
    'NL': {
        'NOORD BRABANT': 'NORTH BRABANT',
        'NOORD HOLLAND': 'NORTH HOLLAND',
        'ZUID HOLLAND': 'SOUTH HOLLAND'
    },
    'NU': {
        'ATLANTICO SUR': 'COSTA CARIBE SUR'
    },
    'PO': {
        'LISBOA': 'LISBON'
    },
    'PS': {
        'AIMELIK': 'AIMELIIK'
    },
    'PU': {
        'BOLAMA': 'BOLAMA AND BIJAGOS'
    },
    'RW': {
        'OUEST': 'WESTERN',
        'SUD': 'SOUTHERN',
        'NORD': 'NORTHERN'
    },
    'SC': {
        'SAINT JOHN CAPISTERRE': 'SAINT JOHN CAPESTERRE',
        'SAINT PAUL CAPISTERRE': 'SAINT PAUL CAPESTERRE',
        'SAINT THOMAS MIDDLE ISLAND': 'MIDDLE ISLAND'
    },
    'SE': {
        'GRAND ANSE': 'GRAND ANSE PRASLIN'
    },
    'SL': {
    },
    'SM': {
        'MONTE GIARDINO': 'MONTEGIARDINO',
        'SAN MARINO': 'SAN MARINO CITTA'
    },
    'SP': {
        'PAIS VASCO': 'BASQUE COUNTRY',
        'NAVARRA': 'NAVARRE',
        'ANDALUCIA': 'ANDALUSIA',
        'CASTILLA LA MANCHA': 'CASTILLE LA MANCHA',
        'CATALUNA': 'CATALONIA',
        'CASTILLA Y LEON': 'CASTILLE AND LEON',
        'CANARIAS': 'CANARY ISLANDS',
        'VALENCIANA': 'VALENCIA',
        'ISLAS BALEARES': 'BALEARIC ISLANDS'
    },
    'SY': {
        'HALAB': 'ALEPPO',
        'DIMASHQ': 'DAMASCUS',
        'HAMAH': 'HAMA',
        'AL LADHIQIYAH': 'LATAKIA'
    },
    'TD': {
        'TRINIDAD': 'TRINIDAD AND TOBAGO'  # not sure
    },
    'TP': {
        'SAO TOME': 'SAO TOME ISLAND'
    },
    'TU': {
        'DUZCE': 'DUEZCE',
        'BARTIN ILI': 'BARTIN',
        'ICEL': 'MERSIN',
        'GUMUSHANE': 'GUEMUESHANE',
        'KARABUK': 'KARABUEK'
    },
    'TW': {
        'KAO HSIUNG': 'KAOHSIUNG',
        'TAI PEI': 'TAIPEI CITY'
    },
    'UZ': {
        'SURHKONDARYO': 'SURXONDARYO',
        'FARGHONA': 'FERGANA',
        'KHORAZM': 'XORAZM',
        'QORAQALPOGHISTON': 'KARAKALPAKSTAN',
        'NAWOIY': 'NAVOIY'
    },
    'UV': {
        'MOUHOUN': 'BOUCLE DU MOUHOUN'
    },
    'YM': {
        'ADAN': 'ADEN',
        'AMRAN': 'OMRAN',
        'SANA': 'SANAA'
    }
}
# Two-letter country code -> {location name: name to search for instead}.
# correlate() substitutes the mapped name before looking a location up in the
# fips table, so that places without a region of their own resolve to an
# enclosing (or predecessor) region.
#
# Keys must be written exactly as cleanup() produces them: upper case and plain
# ASCII (cleanup() transliterates with unidecode), otherwise they can never match.
LOCATION_TO_PARENT = {
    'BY': {
        'BUJUMBURA MAIRIE PROVINCE': 'BUJUMBURA',
        'BUJUMBURA RURAL PROVINCE': 'BUJUMBURA',
        'ISALE': 'BUJUMBURA',
        'ROHERO': 'BUJUMBURA',
        'RUMONGE': 'BUJUMBURA',  # since 2015
    },
    'CD': {
        'AM DJARASS': 'BORKOU ENNEDI TIBESTI',
        'AOZOU': 'BORKOU ENNEDI TIBESTI',
        'BORKOU REGION': 'BORKOU ENNEDI TIBESTI',
        'ENNEDI EST': 'BORKOU ENNEDI TIBESTI',
        'ENNEDI OUEST': 'BORKOU ENNEDI TIBESTI',
        'FADA': 'BORKOU ENNEDI TIBESTI',
        'FAYA LARGEAU': 'BORKOU ENNEDI TIBESTI',
        'TIBESTI REGION': 'BORKOU ENNEDI TIBESTI',
        'BARH EL GAZEL': 'KANEM',  # since 2008
        'SALAL': 'KANEM'
    },
    'CG': {
        'AKETI': 'ORIENTALE',
        'BAS UELE': 'ORIENTALE',
        'BUNIA': 'ORIENTALE',
        'HAUT UELE': 'ORIENTALE',
        'HAUT LOMANI': 'KATANGA',
        'INONGO': 'BANDUNDU',
        'ISIRO': 'ORIENTALE',
        'ITURI': 'ORIENTALE',
        'KASAI': 'KASAI OCCIDENTAL',
        'KIKWIT': 'BANDUNDU',
        'KOLWEZI': 'KATANGA',
        'KONGOLO': 'KATANGA',
        'KWILU': 'BANDUNDU',
        'LODJA': 'KASAI ORIENTAL',
        'LOMAMI': 'KATANGA',
        'LUALABA': 'KATANGA',
        'LUBUMBASHI': 'KATANGA',
        'LUEBO': 'KASAI OCCIDENTAL',
        'MAI NDOMBE': 'BANDUNDU',
        'MALUKU': 'EQUATEUR',
        'MONGALA': 'EQUATEUR',
        'MWENE DITU': 'KATANGA',
        'PROVINCE DU SUD UBANGI': 'EQUATEUR',
        'SANKURU': 'KASAI ORIENTAL',
        'SUNGU MONGA': 'KATANGA',
        'TANGANIKA': 'KATANGA',
        'TSHOPO': 'ORIENTALE',
        'TSHUAPA': 'EQUATEUR',
        'UPPER KATANGA': 'KATANGA',
        'YANGA LIBENGE': 'EQUATEUR',
        'YANGAMBI': 'ORIENTALE'
    },
    'CV': {
        'PICOS': 'SANTA CATARINA',  # since 2005
        'COVA FIGUEIRA': 'SAO FILIPE',
        'PORTO NOVO': 'RIBEIRA GRANDE',
        'RIBEIRA BRAVA': 'SAO NICOLAU',
        'QUEIMADAS': 'SAO NICOLAU',
        'SANTA CATARINA DO FOGO': 'SANTA CATARINA',
        'SAO SALVADOR DO MUNDO': 'SANTA CATARINA',  # since 2005
        # the accented spelling of this key was dropped: names are transliterated
        # to ASCII before lookup, so only this form can match
        'TARRAFAL DE SAO NICOLAU': 'SAO NICOLAU',
        'VILA DA RIBEIRA BRAVA': 'SAO NICOLAU'
    },
    'EC': {
        # since 2007
        'PROVINCIA DE SANTA ELENA': 'GUAYAS',
        'SANTA ELENA': 'GUAYAS',
        'SALINAS': 'GUAYAS',
        'PROVINCIA DE SANTO DOMINGO DE LOS TSACHILAS': 'PICHINCHA',
        'SANTO DOMINGO DE LOS COLORADOS': 'PICHINCHA'
    },
    'UV': {
        'BANFORA': 'COMOE',
        'BOBO DIOULASSO': 'HOUET',
        'DIAPAGA': 'TAPOA',
        'DIEBOUGOU': 'BOUGOURIBA',  # was spelled with an accent and never matched
        'KONGOUSSI': 'BAM',
        'KOUDOUGOU': 'BOULKIEMDE',
        'OUAGADOUGOU': 'KADIOGO',
        'OUAHIGOUYA': 'YATENGA',
        'TENKODOGO': 'BOULGOU',
        'ZINIARE': 'OUBRITENGA'  # was spelled with an accent and never matched
    },
    'UZ': {
        # city and region have same name but different fips code :(
        'TASHKENT': 'TOSHKENT',
        'TOSHKENT SHAHRI': 'TOSHKENT'
    },
    'VC': {
        'SAINT VINCENT AND GRENADINES': 'GRENADINES'
    }
}
# Two-letter country code -> description of how that country's region names are
# decorated with an administrative-division term. The script copies this table
# into `region_divisions` at start-up; parse_fips() then only adds division names
# for countries that have no entry here. search() interprets the value by type:
#   * str            - tried as "<div> OF <name>", "<name> <div>", "<div> <name>",
#                      and stripped from the name when it occurs inside it;
#   * tuple of str   - each element is appended to the name as a suffix;
#   * compiled regex - every match is removed from the name.
DIVISION_OVERRIDE = {
    'AC': 'PARISH OF',
    'AL': 'QARKU I',
    'AR': 'FD',
    'BL': 'DEPARTAMENTO DE',
    'BN': 'DEPARTMENT',
    'BT': 'DZONGKHAG',
    'BU': 'OBLAST',
    'CI': re.compile(r'(REGION DE LA |REGION DE |REGION DEL | REGION)'),
    'CM': 'REGION',
    'CO': re.compile(r'(DEPARTAMENTO DE.*?\s|\sDEPARTMENT)'),
    'CS': 'PROVINCIA DE',
    'CU': 'PROVINCIA DE',
    'DJ': 'REGION',
    'DR': re.compile(r'(PROVINCIA DE |PROVINCIA )'),
    'EC': re.compile(r'(PROVINCIA DE |PROVINCIA DEL )'),
    'ES': 'DEPARTAMENTO DE',
    'ET': 'REGION',
    'EZ': 'KRAJ',
    'FM': 'STATE OF',
    'GA': 'DIVISION',
    'GG': 'RAIONI',
    'GM': 'REGION',
    'GT': re.compile(r'(DEPARTAMENTO DE.*?\s|\sDEPARTMENT)'),
    'GV': 'REGION',
    'HA': re.compile(r'(DEPARTEMENT DE L|DEPARTEMENT DE )'),
    'HO': 'DEPARTAMENTO DE',
    'HR': 'ZUPANIJA',
    'HU': 'MEGYE',
    'IZ': 'MUHAFAZAT',
    'JM': 'PARISH OF',
    'KG': 'OBLAST',
    'KU': 'MUHAFAZAT',
    'LE': 'MOHAFAZAT',
    'ML': 'REGION',
    'MU': 'MUHAFAZAT',
    'MV': re.compile(r'( ATHOLHU| ATOLL)'),
    'NG': 'REGION',
    'NL': 'PROVINCIE',
    'NO': 'COUNTY',
    'NS': 'DISTRIKT',
    'NU': 'DEPARTAMENTO DE',
    'PE': re.compile(r'(REGION DE | REGION|DEPARTAMENTO DE )'),
    'PM': re.compile(r'(PROVINCIA DE |PROVINCIA DEL )'),
    'PS': 'STATE OF',
    'RO': 'JUDETUL',
    'SM': 'CASTELLO DI',
    'SP': 'PRINCIPALITY OF',
    'SW': (' LAN', 'S LAN'),
    'SY': 'GOVERNORATE',
    'TD': re.compile(r'(BOROUGH OF )'),
    'TI': 'VILOYATI',
    'UP': 'OBLAST',
    'UV': 'PROVINCE DE LA ',
    'UY': 'DEPARTAMENTO DE',
    'UZ': re.compile(r'( VILOYATI| PROVINCE)'),
    'VC': 'PARISH OF',
    'VE': 'ESTADO',
    'YM': 'MUHAFAZAT'
}
# Innermost "(...)" and "[...]" annotations; each pattern refuses to span its own
# kind of bracket, so two separate groups on one line are removed independently.
re_par1 = re.compile(r'\([^()]*\)')
re_par2 = re.compile(r'\[[^\[\]]*\]')


# FIXME: better
def cleanup(text: str) -> str:
    """Normalise a place name so names from different sources compare equal.

    The name is upper-cased and transliterated to ASCII with unidecode, '@' is
    turned into 'A', DEL characters are dropped, parenthesised and bracketed
    annotations are removed, the fragments 'THE ', ' THE' and 'CITY OF ' are
    deleted, hyphens become spaces, and dots and quote characters are removed.

    :param text: raw name as read from a csv file
    :return: the normalised name, stripped of leading/trailing whitespace
    """
    text = unidecode(text.upper()).replace('@', 'A').replace('\x7f', '')
    text = re_par1.sub('', text)
    text = re_par2.sub('', text)
    # Plain substring removal: this also hits words that merely start or end with
    # these letters; the lookup tables in this file are written against that.
    for part in ('THE ', ' THE', 'CITY OF '):
        text = text.replace(part, '')
    # NOTE(review): as written this replaces a single space with a single space,
    # i.e. it does nothing - presumably meant to collapse double spaces; confirm
    # against the upstream file before changing it.
    text = text.replace(' ', ' ')
    text = text.replace('-', ' ')
    text = text.replace('.', '')
    for quote in "ʼ’‘ʻ`'":
        text = text.replace(quote, '')
    return text.strip()
def search(rn, rd, c, e, verbose=False):
    """Look a name, and its division-decorated variants, up in a table.

    Candidates are tried in a fixed order - the plain name first, then the
    variants in the order they are built below - so that the result does not
    depend on string hashing when more than one candidate is present in ``e``.

    :param rn: name to look for (already normalised)
    :param rd: division descriptor for the country: ``None`` (no variants), a
        ``str`` (division term), a ``tuple`` of suffixes, or a compiled regex
        whose matches are removed from the name
    :param c: country code; only compared with the ``C`` environment variable
        to decide whether debug output is printed
    :param e: container supporting ``in`` and ``.keys()`` (a dict in practice)
    :param verbose: also print the candidate list when debugging is active
    :return: the first candidate found in ``e``, or ``None`` when none is
    """
    names = [rn]
    if isinstance(rd, Pattern):
        names.append(rd.sub('', rn))
    elif isinstance(rd, tuple):
        names.extend('{}{}'.format(rn, suffix) for suffix in rd)
    elif isinstance(rd, str):
        names.append('{} OF {}'.format(rd, rn))
        names.append('{} {}'.format(rn, rd))
        names.append('{} {}'.format(rd, rn))
        if rd in rn:
            # the name already carries the division term: also try it bare
            names.append(rn.replace(rd, '').strip())

    debug = c == os.environ.get('C')
    if debug and verbose:
        print('\nSearching for {}:\n > {}'.format(', '.join(sorted(set(names))), ', '.join(sorted(e.keys()))))

    for n in names:
        if n in e:
            if debug:
                print('Found {}'.format(n))
            return n
    return None
def fill(ids, ccode, rcode):
    """Record ``(ccode, rcode)`` for every geoname id in ``ids``.

    Writes into the module-level ``geoid2fips`` mapping; an id that is already
    present is overwritten.
    """
    codes = (ccode, rcode)
    geoid2fips.update((geoid, codes) for geoid in ids)
def parse_locations():
    """Read the location csv named by ``opts.location_file``.

    Besides building the returned mapping this fills two module-level tables:
    ``cities[country][city] = region`` and, for rows that carry a second-level
    subdivision, ``sub2[country][city] = subdivision_2``. All names are passed
    through cleanup() first.

    :return: ``{country: {region or city name: set of geoname ids}}``; every
        row contributes its id to its region (the country name stands in when
        the row has no first-level subdivision) and, if it has one, to its city
    """
    locations = defaultdict(lambda: defaultdict(set))
    # newline='' hands line-ending handling to the csv module, as its
    # documentation asks for; otherwise newlines inside quoted fields are mangled
    with open(opts.location_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            country = cleanup(row['country_name'])
            if not country:
                continue  # row without a country name
            region = cleanup(row['subdivision_1_name'] or row['country_name'])
            city_name = cleanup(row['city_name']) if row['city_name'] else None
            geoname_id = int(row['geoname_id'])
            locations[country][region].add(geoname_id)
            if city_name is None:
                continue
            locations[country][city_name].add(geoname_id)
            # noinspection PyTypeChecker
            cities[country][city_name] = region
            sub2_name = cleanup(row['subdivision_2_name'])
            if sub2_name:
                # noinspection PyTypeChecker
                sub2[country][city_name] = sub2_name
    return locations
def parse_fips():
    """Read the fips region csv named by ``opts.input_file``.

    As a side effect, the first non-'COUNTRY' division name seen for a country
    is stored in the module-level ``region_divisions`` unless that country
    already has an entry there.

    :return: ``{country code: {region name: (division, country code,
        region code, region name)}}`` where the region name has been cleaned
        with cleanup() and mapped through REGION_REPLACE; a later row with the
        same resulting name replaces the earlier one
    """
    fips = defaultdict(dict)
    # newline='' hands line-ending handling to the csv module, as its
    # documentation asks for
    with open(opts.input_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            fips_country_code = row['region_code'][0:2]
            fips_region_code = row['region_code'][2:4]
            region_division = row['region_division'].split()[0].upper()
            if region_division != 'COUNTRY' and fips_country_code not in region_divisions:
                region_divisions[fips_country_code] = region_division
            # NOTE(review): as written this keeps only the first space-separated
            # word, yet REGION_REPLACE has multi-word keys - the separator may
            # have been altered in transit; confirm against the upstream file.
            region_name = row['region_name'].split(' ')[0]
            region_name = cleanup(region_name)
            region_name = REGION_REPLACE.get(fips_country_code, {}).get(region_name, region_name)
            value = (region_division, fips_country_code, fips_region_code, region_name)
            # NOTE(review): the per-country table is keyed by region *name*, so
            # testing the region *code* against it practically never succeeds
            # and this duplicate guard does not fire. Left unchanged because
            # enabling it could abort runs on data that currently passes.
            if (fips_country_code in fips) \
                    and (fips_region_code in fips[fips_country_code]) \
                    and (value in fips[fips_country_code][region_name]):
                raise Exception('Duplicate key for {}'.format(row))
            # noinspection PyTypeChecker
            fips[fips_country_code][region_name] = value
    return fips
def ignore_city(country, city):
    """Tell whether ``city`` is listed for ``country`` in CITY_IGNORE."""
    return city in CITY_IGNORE.get(country, ())
def ignore_region(country, region):
    """Tell whether ``region`` is listed for ``country`` in REGION_IGNORE."""
    return region in REGION_IGNORE.get(country, ())
def correlate(locations, fips):
    """Assign a country/region code pair to every geoname id.

    For each country of ``locations`` (in sorted order) and each of its
    location names (in sorted order) the region is resolved by trying, in turn:

    1. the location name itself (after LOCATION_TO_PARENT substitution) in the
       country's fips table;
    2. the region recorded for a city of that name in ``cities``;
    3. the second-level subdivision recorded for that name in ``sub2``.

    Results are stored through fill(); region code '00' is used for countries in
    COUNTRY_IGNORE, for names listed in CITY_IGNORE / REGION_IGNORE and for
    subdivision-2 lookups that lead nowhere.

    :param locations: mapping as returned by parse_locations()
    :param fips: mapping as returned by parse_fips()
    :raises Exception: for a country missing from both FIPS_COUNTRIES and
        COUNTRY_IGNORE, for a location that cannot be resolved (unless the ``N``
        environment variable is set, in which case it is printed and skipped),
        and for a location resolved through a region listed in REGION_IGNORE
    """
    for country in sorted(locations.keys()):
        fips_country_code = FIPS_COUNTRIES.get(country)
        skip_country = country in COUNTRY_IGNORE
        if fips_country_code is None and not skip_country:
            raise Exception('Country {} not found in fips country table'.format(country))
        if skip_country:
            if fips_country_code:
                for location in locations[country].values():
                    fill(location, fips_country_code, '00')
            continue

        entry = fips[fips_country_code]
        parents = LOCATION_TO_PARENT.get(fips_country_code, {})
        replacements = REGION_REPLACE.get(fips_country_code, {})
        region_division = region_divisions.get(fips_country_code)

        for original_name in sorted(locations[country].keys()):
            # The ids belong to the location as named in the input. Fetch them
            # before substituting the parent name: looking them up under the
            # parent would leave this location's own ids without any code.
            location = locations[country][original_name]
            location_name = parents.get(original_name, original_name)
            region_name = None

            found = search(location_name, region_division, fips_country_code, entry, verbose=True)

            if found is None:
                # not a region name: maybe a city whose region is known
                city = search(location_name, region_division, fips_country_code, cities[country])
                if city is not None:
                    region_name = cities[country][city]
                    region_name = replacements.get(region_name, region_name)
                    found = search(region_name, region_division, fips_country_code, entry, verbose=True)

            if found is None:
                # last resort: the second-level subdivision recorded for it
                sub2_name = search(location_name, region_division, fips_country_code, sub2[country])
                if sub2_name is not None:
                    region_name = sub2[country][sub2_name]
                    region_name = replacements.get(region_name, region_name)
                    found = search(region_name, region_division, fips_country_code, entry, verbose=True)
                    if found is None:
                        fill(location, fips_country_code, '00')
                        continue

            if found is None:
                if ignore_city(country, location_name) or ignore_region(country, location_name) or \
                        (region_name and ignore_region(country, region_name)):
                    if fips_country_code == os.environ.get('I'):
                        print('Ignoring: {} ({}) - {} ({})'.format(location_name, region_name, country,
                                                                   fips_country_code))
                    fill(location, fips_country_code, '00')
                    continue
                if 'N' in os.environ:
                    print('Location {} ({}) not found in {} ({})'.format(location_name, region_name, country,
                                                                         fips_country_code))
                    continue
                raise Exception('Location {} ({}) not found in {} ({})'.format(location_name, region_name, country,
                                                                               fips_country_code))

            # a region that resolves after all must not stay on the ignore list
            if region_name and ignore_region(country, region_name):
                raise Exception('Ignored region found: {} in {}'.format(location_name, country))
            # element 2 of the stored tuple is the fips region code
            fill(location, fips_country_code, entry[found][2])
def writecsv():
    """Write ``geoid2fips`` to ``opts.output_file`` as csv.

    The file gets a ``geoname_id,country,region`` header followed by one row
    per geoname id, in ascending id order.
    """
    # newline='' keeps the platform's newline translation from rewriting the row
    # terminator emitted by csv.writer (which would yield "\r\r\n" on Windows)
    with open(opts.output_file, 'w', encoding='utf-8', newline='') as _out:
        writer = csv.writer(_out)
        writer.writerow(('geoname_id', 'country', 'region'))
        for geoname_id in sorted(geoid2fips.keys()):
            country_code, fipscode = geoid2fips[geoname_id]
            writer.writerow((geoname_id, country_code, fipscode))
if __name__ == '__main__':
    # Module-level state shared by the functions above:
    #   geoid2fips       - geoname id -> (country code, region code), written by fill()
    #   cities / sub2    - per-country lookup tables filled by parse_locations()
    #   region_divisions - per-country division descriptors, seeded from the
    #                      overrides and completed by parse_fips()
    geoid2fips = dict()
    cities, sub2 = defaultdict(dict), defaultdict(dict)
    region_divisions = dict(DIVISION_OVERRIDE)

    # All three file arguments are mandatory.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input-file', help='input csv fips 10-4 data file', required=True)
    parser.add_argument('-o', '--output-file', help='output csv file', required=True)
    parser.add_argument('-l', '--location-file', help='location file csv', required=True)
    opts = parser.parse_args()

    # Parse both inputs, match them up, then dump the result.
    _locations = parse_locations()
    _fips = parse_fips()
    correlate(_locations, _fips)
    writecsv()
# import json
# with open('regions.json', 'w', encoding='utf-8') as f:
# json.dump(_fips, f, sort_keys=True, indent=4, ensure_ascii=False)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化