507 lines
17 KiB
PHP
507 lines
17 KiB
PHP
<?PHP
|
||
/**
|
||
* Gathers functions for setting uniform place names.
|
||
*/
|
||
declare(strict_types = 1);
|
||
|
||
/**
|
||
* Gathers functions for setting uniform place names.
|
||
*/
|
||
final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract {
|
||
|
||
/**
 * Substrings of a place name listed as a key in this array will be replaced
 * by the corresponding value.
 */
private const _NAME_SANITIZATIONS = [
    ' - '       => '-',
    'unbekannt' => '',
    'Unbekannt' => '',
    'unknown'   => '',
    'Unknown'   => '',
];

/**
 * Blacklist for comparison with country names: if one part of a two-part
 * name equals one of these terms, the name is not rewritten into the
 * "Place (Country)" form.
 */
private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [
    'District',
    'Distrikt',
    'India',
    'Indien',
    'Insel',
    'Inseln',
    'Tal',
    'Yue',
];

/**
 * German place type words that are moved into brackets when they trail a
 * name after a comma, e.g. "ABC, Insel" > "ABC (Insel)".
 */
private const _PLACE_TYPE_INDICATORS_GERMAN = [
    'Insel',
    'Stadt',
];

/**
 * German indicators signifying that a place is likely subordinate to the
 * other if two places are provided in a comma-separated list.
 */
private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
    'gasse',
    'straße',
    ' Straße',
];

/**
 * Hungarian indicators signifying that a place is likely subordinate to the
 * other if two places are provided in a comma-separated list.
 */
private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
    ' körut ',
    ' utca ',
    ' út ',
];

/**
 * Roman numerals I to XX with their Arabic counterparts. Used when rewriting
 * trailing Budapest district numbers.
 */
private const _RELEVANT_ROMAN_NUMERALS = [
    'I'     => '1',
    'II'    => '2',
    'III'   => '3',
    'IV'    => '4',
    'V'     => '5',
    'VI'    => '6',
    'VII'   => '7',
    'VIII'  => '8',
    'IX'    => '9',
    'X'     => '10',
    'XI'    => '11',
    'XII'   => '12',
    'XIII'  => '13',
    'XIV'   => '14',
    'XV'    => '15',
    'XVI'   => '16',
    'XVII'  => '17',
    'XVIII' => '18',
    'XIX'   => '19',
    'XX'    => '20',
];
|
||
|
||
/**
 * Cache of already decoded JSON lists, keyed by the file name they were
 * loaded from.
 *
 * @var array<string, list<string>>
 */
private static array $_placeNameListCaches = [];
|
||
|
||
/**
 * Rewrites indicators for narrower locations paired with a superordinate location
 * into the format "Narrower (Broader)".
 * E.g.: "Adalbrechtstraße 12, Berlin" > "Adalbrechtstraße 12 (Berlin)".
 *
 * The name is returned unchanged unless it contains the indicator exactly
 * once, the separator exactly once and no opening bracket.
 *
 * @param string $name      Name in which to rewrite.
 * @param string $indicator Indicator for narrower place. E.g. "straße".
 * @param string $separator Separating character between narrower and broader, e.g. ', '.
 *
 * @return string
 */
private static function _rewrite_narrower_broader_pairs_to_brackets(string $name, string $indicator, string $separator = ', '):string {

    if (substr_count($name, $indicator) !== 1
        || substr_count($name, $separator) !== 1
        || str_contains($name, "(")
    ) {
        return $name;
    }

    // Split on the very separator that was counted above. Splitting on a
    // hard-coded ', ' would leave the second part undefined for any other
    // separator.
    [$first, $second] = explode($separator, $name);

    // Skip entries like "Vaci utca 12 Budapest, Vaci utca"
    $indicatorTrimmed = trim($indicator);
    if ((str_ends_with($first, $indicatorTrimmed) && str_contains($second, $indicatorTrimmed))
        || (str_ends_with($second, $indicatorTrimmed) && str_contains($first, $indicatorTrimmed))
    ) {
        return $name;
    }

    // Prevent errors in case of "Adalbrechtstraße 12, "
    if (empty($first) || empty($second)) {
        return $name;
    }

    if (str_contains($first, $indicator)) { // Adalbrechtstraße 12, Berlin
        $street = $first;
        $town   = $second;
    }
    else { // Berlin, Adalbrechtstraße 12
        $street = $second;
        $town   = $first;
    }

    // Prevent rewrites in cases like "Deák Ferenc utca 16-18. Budapest, V."
    if (str_contains($town, '.')) {
        return $name;
    }

    return $street . ' (' . $town . ')';

}
|
||
|
||
/**
 * Cleans and consolidates name parts appearing regularly in German place names.
 *
 * Three steps are run in order:
 * 1. A trailing place type ("ABC, Insel") is moved into brackets.
 * 2. "str. " is expanded to "straße " if it follows a letter and precedes
 *    a house number.
 * 3. Street / town pairs are rewritten to "Street (Town)".
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_german_abbreviations(string $name):string {

    // ABC, Inseln > ABC (Inseln)
    foreach (self::_PLACE_TYPE_INDICATORS_GERMAN as $indicator) {
        $suffix = ', ' . $indicator;
        if (str_ends_with($name, $suffix)) {
            // Only the trailing occurrence is rewritten; earlier occurrences
            // of the same text stay untouched.
            $name = substr($name, 0, -strlen($suffix)) . ' (' . $indicator . ')';
        }
    }

    // Adalbrechtstr. 12 > Adalbrechtstraße 12
    // The pattern is matched in UTF-8 mode against any letter, so that names
    // like "Schloßstr. 12" are covered, and the dot is matched literally.
    if (str_contains($name, "str. ") && \preg_match('/\p{L}str\. [0-9]/u', $name) === 1) {
        $name = str_replace("str. ", "straße ", $name);
    }

    // "Adalbrechtstraße 12, Berlin" > Adalbrechtstraße 12 (Berlin)
    foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_GERMAN as $indicator) {
        $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
    }

    return $name;

}
|
||
|
||
/**
 * Cleans and consolidates name parts appearing regularly in Hungarian place names.
 *
 * Abbreviated or outdated street type words are expanded if a number follows
 * them, street / town pairs are rewritten to "Street (Town)" and a trailing
 * Budapest district given as a Roman numeral is rewritten to
 * "(Budapest, <number>. kerület)".
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_hungarian_abbreviations(string $name):string {

    // Each entry: [form to replace, pattern requiring a following digit, replacement]
    $expansions = [
        [" krt. ", "/\ krt\.\ [0-9]/", " körut "],
        [" u. ", "/\ u\.\ [0-9]/", " utca "],
        [" ucca ", "/\ ucca\ [0-9]/", " utca "],
        [" utcza ", "/\ utcza\ [0-9]/", " utca "],
        [" rkp. ", "/\ rkp\.\ [0-9]/", " rakpart "],
    ];

    foreach ($expansions as [$abbreviated, $pattern, $expanded]) {
        if (!str_contains($name, $abbreviated)) {
            continue;
        }
        if (!\preg_match($pattern, $name)) {
            continue;
        }
        $name = str_replace($abbreviated, $expanded, $name);
    }

    // "Adalbrecht utca 12, Berlin" > Adalbrecht utca 12 (Berlin)
    $name = array_reduce(
        self::_PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN,
        static fn (string $carry, string $indicator): string
            => self::_rewrite_narrower_broader_pairs_to_brackets($carry, $indicator, ', '),
        $name
    );

    // District rewriting only applies if Budapest is mentioned exactly once
    if (substr_count($name, 'Budapest') !== 1) {
        return $name;
    }

    foreach (self::_RELEVANT_ROMAN_NUMERALS as $roman => $arabic) {
        $districtSuffix = ' Budapest, ' . $roman . '.';
        if (!str_ends_with($name, $districtSuffix)) {
            continue;
        }
        $name = str_replace($districtSuffix, ' (Budapest, ' . $arabic . '. kerület)', $name);
    }

    return $name;

}
|
||
|
||
/**
 * Rewrites a Ukrainian language name based on abbreviations explaining the
 * hierarchy of named places.
 *
 * The name is split at commas and each part is assigned to a hierarchy level
 * by means of its prefix or suffix (e.g. "обл.", "м."), or identified as a
 * country. The narrowest level found becomes the main name, all broader
 * levels follow in brackets, ordered from narrow to broad.
 *
 * The input is returned unchanged if a part cannot be assigned to any level
 * or if a level would be used twice.
 *
 * @param string $name Input name to rewrite.
 *
 * @return string
 */
private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

    $identifiersByLevel = [
        'state'    => [' РСР', 'РСР ', ' АРСР', 'АРСР ', ' губернія', 'губернія '],
        'oblast'   => ['обл.', 'область', 'області', 'округа', 'губернії'],
        'region'   => ['р-н', 'район'],
        'county'   => ['повіт'],
        'city'     => ['м.', 'м '],
        'parish'   => ['волость'],
        'village'  => ['смт', 'сільська', 'с. ', 'село'],
        'district' => [], // Is also р-н; which it is is determined based on position
        'street'   => ['вул. '],
    ];

    // Ordered from the broadest to the narrowest level
    $levels = [
        'country'  => '',
        'state'    => '',
        'oblast'   => '',
        'region'   => '',
        'county'   => '',
        'city'     => '',
        'parish'   => '',
        'village'  => '',
        'district' => '',
        'street'   => '',
    ];

    // Loaded lazily: only needed if a part carries no level identifier
    $countryNames = null;

    foreach (explode(',', $name) as $part) {
        $part = trim($part);
        foreach ($identifiersByLevel as $level => $identifiers) {
            foreach ($identifiers as $identifier) {

                if (!str_starts_with($part, $identifier) && !str_ends_with($part, $identifier)) {
                    continue;
                }

                // Special case: Region can both be rajon or a district within a city
                // If both oblast and city are already known, the region will be a
                // district within the city.
                // Otherwise, it is to be assumed that it is a super-city region.
                if ($level === 'region' && !empty($levels['oblast'])
                    && (!empty($levels['city']) || !empty($levels['village']))
                ) {
                    $level = 'district';
                }

                // The same level was used twice: give up and keep the input
                if (!empty($levels[$level])) {
                    return $name;
                }
                $levels[$level] = $part;

                // Part is identified; proceed with the next part of the name
                continue 3;

            }
        }

        // Special case: Abbreviated SSRs
        if (in_array($part, ['УРСР', 'УССР', 'УСРР'], true)) {
            $levels['state'] = $part;
            continue;
        }

        // Unspecified part level: Attempt identifying country.
        // The lists are merged using array_merge(): the array union operator
        // would drop all entries of the second list whose numeric index
        // also exists in the first one.
        $countryNames ??= array_merge(
            self::_loadJsonList(__DIR__ . "/../static/countries.uk.json"),
            self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json"),
            [
                'СРСР',
                'УНР',
                'Російська імперія',
                'Рос.імперія',
                'Рос.имперія',
                'Російська імперія-УНР',
            ]
        );
        if (in_array($part, $countryNames, true)) {
            $levels['country'] = $part;
            continue;
        }

        // Unspecified level; return
        return $name;
    }

    $main_name = '';
    $specifiers = [];

    // Walk from the narrowest to the broadest level
    foreach (array_reverse($levels) as $level => $partname) {
        if (empty($partname)) continue;

        // Cities and villages lose their type identifier (e.g. "м.")
        if ($level === 'city' || $level === 'village') {
            $strtr = [];
            foreach ($identifiersByLevel[$level] as $identifier) $strtr[$identifier] = '';
            $partname = trim(strtr($partname, $strtr));
        }

        if (empty($main_name)) {
            $main_name = $partname;
        }
        else {
            $specifiers[] = $partname;
        }
    }

    $output = $main_name;
    if (!empty($specifiers)) {
        $output .= ' (' . implode(', ', $specifiers) . ')';
    }

    return $output;

}
|
||
|
||
/**
 * Cleans and consolidates name parts appearing regularly in Ukrainian place names.
 *
 * The abbreviation "р-н" is expanded to "район" if it stands as a word of
 * its own. Names containing a comma are then handed to
 * _rewrite_ukrainian_names_by_hierarchy().
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_ukrainian_abbreviations(string $name):string {

    $hasAbbreviatedRajon = str_ends_with($name, " р-н")
        || str_contains($name, " р-н ")
        || str_contains($name, " р-н,");

    if ($hasAbbreviatedRajon) {
        $name = str_replace(" р-н", " район", $name);
    }

    return str_contains($name, ',')
        ? self::_rewrite_ukrainian_names_by_hierarchy($name)
        : $name;

}
|
||
|
||
/**
 * Loads a JSON file, optionally loading it cached through a private static variable
 * if reuse is expectable (= in the case of CLI usage).
 *
 * A missing file results in an empty list.
 *
 * @param non-empty-string $filename File name to load.
 *
 * @return list<string>
 *
 * @throws Exception If the file contents cannot be decoded to an array.
 */
private static function _loadJsonList(string $filename):array {

    $useCache = (PHP_SAPI === 'cli');

    if ($useCache && isset(self::$_placeNameListCaches[$filename])) {
        return self::$_placeNameListCaches[$filename];
    }

    try {
        $output = json_decode(MD_STD::file_get_contents($filename), true);
    }
    catch (MDFileDoesNotExist $e) {
        self::$_placeNameListCaches[$filename] = [];
        return [];
    }

    // json_decode() signals failure by returning null, not false. Anything
    // but an array (null, scalars) would also violate the return type.
    if (!is_array($output)) {
        throw new Exception("Failed to get list");
    }

    if ($useCache) {
        self::$_placeNameListCaches[$filename] = $output;
    }

    return $output;

}
|
||
|
||
/**
 * Moves names of regions to brackets using pre-generated lists of countries,
 * historical country names, etc.
 *
 * For each supported separator ("-" and ", ") occurring exactly once, the
 * name is split into two parts. If exactly one part is a known country name,
 * the name is rewritten to "Other part (Country)". Names that are country
 * names as a whole, or contain a blacklisted term or a cardinal direction as
 * a part, are returned unchanged.
 *
 * NOTE(review): $lang is interpolated into a file path; presumably callers
 * only pass vetted language codes - confirm.
 *
 * @param string $lang Instance language.
 * @param string $name Input string to clean.
 *
 * @return string
 */
private static function _move_region_names_to_brackets(string $lang, string $name):string {

    foreach (['-', ', '] as $separator) {

        if (substr_count($name, $separator) !== 1) continue;

        // Get parts and trim them
        [$first, $second] = array_map(
            static fn (string $part): string => trim($part),
            explode($separator, $name)
        );

        // Load place names.
        // The lists are merged using array_merge(): the array union operator
        // would drop all entries of the second list whose numeric index
        // also exists in the first one.
        $countryNames = array_merge(
            self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json"),
            self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json")
        );
        $cardinalDirections = self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");

        // Skip if the full name is in the list of country names
        if (in_array($name, $countryNames, true)) {
            return $name;
        }

        // If one of the parts is a blacklisted term or a cardinal direction, skip this
        $isBlocked = static fn (string $part): bool
            => in_array($part, self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
            || in_array($part, $cardinalDirections, true)
            || in_array(strtolower($part), $cardinalDirections, true);

        if ($isBlocked($first) || $isBlocked($second)) {
            return $name;
        }

        $firstIsCountry  = in_array($first, $countryNames, true);
        $secondIsCountry = in_array($second, $countryNames, true);

        if ($firstIsCountry && !$secondIsCountry) {
            return $second . ' (' . $first . ')';
        }
        if (!$firstIsCountry && $secondIsCountry) {
            return $first . ' (' . $second . ')';
        }

        // Neither or both parts are countries: try the next separator

    }

    return $name;

}
|
||
|
||
/**
 * Removes duplicates after commas: of several identical parts separated by
 * ", " only the first one is kept.
 * E.g.: "Västerdås, Schweden, Schweden" > "Västerdås, Schweden".
 *
 * @param string $ort_name Place name to clean.
 *
 * @return string
 */
private static function _remove_duplicates_after_commas(string $ort_name):string {

    // Nothing to deduplicate without a comma
    if (!str_contains($ort_name, ',')) {
        return $ort_name;
    }

    $kept = [];
    foreach (explode(', ', $ort_name) as $segment) {
        if (!in_array($segment, $kept, true)) {
            $kept[] = $segment;
        }
    }

    return implode(', ', $kept);

}
|
||
|
||
/**
 * Cleans a place name and brings it into a uniform format.
 *
 * The steps are, in order: basic sanitization and substring replacements,
 * cleanup through NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace()
 * (presumably removing uncertainty indicators), removal of duplicate
 * comma-separated parts, language-specific cleanup for German, Hungarian and
 * Ukrainian, and finally moving country names into brackets.
 *
 * @param string $lang     Instance language.
 * @param string $ort_name Input string to clean.
 *
 * @return string
 */
public static function consolidate_name(string $lang, string $ort_name):string {

    // Run basic replacements
    $replacements = self::_NAME_SANITIZATIONS;
    /*
    if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) {
        $replacements["/"] = "-";
    }
    */
    $replaced = strtr(self::sanitizeInputString($ort_name), $replacements);
    $withoutUncertainty = NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($replaced);
    $cleaned = self::sanitizeInputString($withoutUncertainty);

    // Remove duplicates after commas
    // Västerdås, Schweden, Schweden > Västerdås, Schweden
    $cleaned = self::_remove_duplicates_after_commas($cleaned);

    // Language-specific cleanup; other languages are passed through as is
    if ($lang === 'de') {
        $cleaned = self::_clean_german_abbreviations($cleaned);
    }
    elseif ($lang === 'hu') {
        $cleaned = self::_clean_hungarian_abbreviations($cleaned);
    }
    elseif ($lang === 'uk') {
        $cleaned = self::_clean_ukrainian_abbreviations($cleaned);
    }

    return self::_move_region_names_to_brackets($lang, $cleaned);

}
|
||
}
|