MDNodaHelpers/src/NodaConsolidatedNamesForPlaces.php

507 lines
17 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?PHP
/**
* Gathers functions for setting uniform place names.
*/
declare(strict_types = 1);
/**
* Gathers functions for setting uniform place names.
*/
final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract {
/**
* Substrings of a place name listed as a key in this array will be replaced
* by the corresponding value. Applied via strtr() in consolidate_name().
*/
private const _NAME_SANITIZATIONS = [
" - " => "-",
"unbekannt" => "",
"Unbekannt" => "",
"unknown" => "",
"Unknown" => "",
];
/**
* Blacklist for comparison with country names: if either part of a two-part
* name equals one of these terms, _move_region_names_to_brackets() returns
* the name unchanged.
*/
private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [
'District',
'Distrikt',
'India',
'Indien',
'Insel',
'Inseln',
'Tal',
'Yue',
];
/**
* German place type terms. A name ending in ", <term>" gets the term moved
* into brackets by _clean_german_abbreviations().
*/
private const _PLACE_TYPE_INDICATORS_GERMAN = [
'Insel',
'Stadt',
];
/**
* German indicators signifying that a place is likely subordinate to the other
* if two places are provided in a comma-separated list (street-level names).
*/
private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
'gasse',
'straße',
' Straße',
];
/**
* Hungarian indicators signifying that a place is likely subordinate to the
* other if two places are provided in a comma-separated list.
* The surrounding spaces are part of the indicator.
*
* NOTE(review): the standard Hungarian spelling is "körút"; confirm whether
* "körut" is intended. _clean_hungarian_abbreviations() expands "krt." to the
* same spelling, so both places would need to change together.
*/
private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
' körut ',
' utca ',
' út ',
];
/**
* Roman numerals I to XX mapped to their Arabic equivalents (as strings).
* Used by _clean_hungarian_abbreviations() to rewrite Budapest district numbers.
*/
private const _RELEVANT_ROMAN_NUMERALS = [
'I' => '1',
'II' => '2',
'III' => '3',
'IV' => '4',
'V' => '5',
'VI' => '6',
'VII' => '7',
'VIII' => '8',
'IX' => '9',
'X' => '10',
'XI' => '11',
'XII' => '12',
'XIII' => '13',
'XIV' => '14',
'XV' => '15',
'XVI' => '16',
'XVII' => '17',
'XVIII' => '18',
'XIX' => '19',
'XX' => '20',
];
/**
* Cache of decoded JSON lists, keyed by file name. Written and read by
* _loadJsonList().
*
* @var array<string, list<string>>
*/
private static $_placeNameListCaches = [];
/**
 * Rewrites a narrower location paired with a superordinate location
 * into the format "Narrower (Broader)".
 * E.g.: "Adalbrechtstraße 12, Berlin" > "Adalbrechtstraße 12 (Berlin)".
 *
 * The name is returned unchanged unless it contains the indicator exactly
 * once, the separator exactly once, and no opening bracket.
 *
 * @param string $name      Name in which to rewrite.
 * @param string $indicator Indicator for narrower place. E.g. "straße".
 * @param string $separator Separating character between narrower and broader, e.g. ', '.
 *
 * @return string
 */
private static function _rewrite_narrower_broader_pairs_to_brackets(string $name, string $indicator, string $separator = ', '):string {

    // substr_count() and explode() throw a ValueError on an empty needle / separator.
    if ($indicator === '' || $separator === '') {
        return $name;
    }

    if (substr_count($name, $indicator) !== 1
        || substr_count($name, $separator) !== 1
        || str_contains($name, "(")
    ) {
        return $name;
    }

    // Split on the very separator that was counted above. As it occurs exactly
    // once, this always yields exactly two parts.
    [$first, $second] = explode($separator, $name);

    // Skip entries like "Vaci utca 12 Budapest, Vaci utca"
    $indicatorTrimmed = trim($indicator);
    if ((str_ends_with($first, $indicatorTrimmed) && str_contains($second, $indicatorTrimmed))
        || (str_ends_with($second, $indicatorTrimmed) && str_contains($first, $indicatorTrimmed))
    ) {
        return $name;
    }

    // Prevent rewrites in case of "Adalbrechtstraße 12, "
    if ($first === '' || $second === '') {
        return $name;
    }

    if (str_contains($first, $indicator)) { // Adalberthstraße 12, Berlin
        $street = $first;
        $town   = $second;
    }
    else { // Berlin, Adalberthstraße 12
        $street = $second;
        $town   = $first;
    }

    // Prevent rewrites in cases like "Deák Ferenc utca 16-18. Budapest, V."
    if (str_contains($town, '.')) {
        return $name;
    }

    return $street . ' (' . $town . ')';

}
/**
 * Cleans and consolidates name parts appearing regularly in German place names.
 *
 * Steps, in order:
 * - "ABC, Insel" > "ABC (Insel)" for the terms in _PLACE_TYPE_INDICATORS_GERMAN.
 * - "Adalbrechtstr. 12" > "Adalbrechtstraße 12".
 * - "Adalbrechtstraße 12, Berlin" > "Adalbrechtstraße 12 (Berlin)".
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_german_abbreviations(string $name):string {

    // ABC, Insel > ABC (Insel)
    foreach (self::_PLACE_TYPE_INDICATORS_GERMAN as $indicator) {
        $suffix = ', ' . $indicator;
        if (str_ends_with($name, $suffix)) {
            // Only rewrite the trailing occurrence. A global replacement would
            // also hit e.g. ", Stadtteil" earlier in the name.
            $name = substr($name, 0, -strlen($suffix)) . ' (' . $indicator . ')';
        }
    }

    // Adalbrechtstr. 12 > Adalbrechtstraße 12
    // Only expand "str. " where it follows a letter (including ß and umlauts)
    // and precedes a house number. Other occurrences such as "Distr. " stay.
    if (str_contains($name, "str. ")) {
        $expanded = \preg_replace('/(?<=\p{L})str\. (?=[0-9])/u', 'straße ', $name);
        // preg_replace() returns null on failure (e.g. invalid UTF-8); keep the input then.
        if ($expanded !== null) {
            $name = $expanded;
        }
    }

    // "Adalbrechtstraße 12, Berlin" > Adalbrechtstraße 12 (Berlin)
    foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_GERMAN as $indicator) {
        $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
    }

    return $name;

}
/**
 * Cleans and consolidates name parts appearing regularly in Hungarian place names.
 *
 * Expands common street type abbreviations and old spellings when a number
 * follows, moves the broader place of a street / town pair into brackets, and
 * rewrites a trailing Budapest district given as a Roman numeral.
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_hungarian_abbreviations(string $name):string {

    // Substring to replace => [pattern requiring a digit to follow, replacement].
    // Applied in this order, each on the result of the previous one.
    $expansions = [
        " krt. "  => ["/\ krt\.\ [0-9]/", " körut "],
        " u. "    => ["/\ u\.\ [0-9]/", " utca "],
        " ucca "  => ["/\ ucca\ [0-9]/", " utca "],
        " utcza " => ["/\ utcza\ [0-9]/", " utca "],
        " rkp. "  => ["/\ rkp\.\ [0-9]/", " rakpart "],
    ];
    foreach ($expansions as $abbreviation => [$pattern, $expanded]) {
        if (str_contains($name, $abbreviation) && \preg_match($pattern, $name)) {
            $name = str_replace($abbreviation, $expanded, $name);
        }
    }

    // "Adalbrecht utca 12, Berlin" > Adalbrecht utca 12 (Berlin)
    foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN as $streetIndicator) {
        $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $streetIndicator, ', ');
    }

    // "... Budapest, V." > "... (Budapest, 5. kerület)"
    if (substr_count($name, 'Budapest') !== 1) {
        return $name;
    }
    foreach (self::_RELEVANT_ROMAN_NUMERALS as $roman => $districtNo) {
        $districtSuffix = ' Budapest, ' . $roman . '.';
        if (!str_ends_with($name, $districtSuffix)) {
            continue;
        }
        $name = str_replace($districtSuffix, ' (Budapest, ' . $districtNo . '. kerület)', $name);
    }

    return $name;

}
/**
 * Rewrites a Ukrainian language name based on abbreviations explaining the
 * hierarchy of named places.
 *
 * The name is split at commas and each part is assigned to a hierarchy level
 * (country, state, oblast, ... street) by the identifier it starts or ends
 * with. The most specific level becomes the main name, all broader levels
 * follow in brackets from narrow to broad, e.g.
 * "Київська обл., м. Біла Церква" > "Біла Церква (Київська обл.)".
 *
 * The input is returned unchanged if a part cannot be assigned to a level or
 * if a level is named twice.
 *
 * @param string $name Input name to rewrite.
 *
 * @return string
 */
private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

    $identifiersByLevel = [
        'state'    => [' РСР', 'РСР ', ' АРСР', 'АРСР ', ' губернія', 'губернія '],
        'oblast'   => ['обл.', 'область', 'області', 'округа', 'губернії'],
        'region'   => ['р-н', 'район'],
        'county'   => ['повіт'],
        'city'     => ['м.', 'м '],
        'parish'   => ['волость'],
        'village'  => ['смт', 'сільська', 'с. ', 'село'],
        'district' => [], // Is also р-н; which it is is determined based on position
        'street'   => ['вул. '],
    ];

    // Ordered from broadest to narrowest.
    $levels = [
        'country'  => '',
        'state'    => '',
        'oblast'   => '',
        'region'   => '',
        'county'   => '',
        'city'     => '',
        'parish'   => '',
        'village'  => '',
        'district' => '',
        'street'   => '',
    ];

    // Loaded lazily: only needed once a part matches no identifier.
    $countryNames = null;

    foreach (explode(',', $name) as $part) {

        $part = trim($part);

        foreach ($identifiersByLevel as $level => $identifiers) {
            foreach ($identifiers as $identifier) {

                if (!str_starts_with($part, $identifier) && !str_ends_with($part, $identifier)) {
                    continue;
                }

                // Special case: Region can both be rajon or a district within a city
                // If both oblast and city are already known, the region will be a
                // district within the city.
                // Otherwise, it is to be assumed that it is a super-city region.
                if ($level === 'region' && $levels['oblast'] !== ''
                    && ($levels['city'] !== '' || $levels['village'] !== '')
                ) {
                    $level = 'district';
                }

                // The same level was named twice: ambiguous, leave the name untouched.
                if ($levels[$level] !== '') {
                    return $name;
                }

                $levels[$level] = $part;
                continue 3;

            }
        }

        // Special case: Abbreviated SSRs
        if (in_array($part, ['УРСР', 'УССР', 'УСРР'], true)) {
            $levels['state'] = $part;
            continue;
        }

        // Unspecified part level: Attempt identifying country
        if ($countryNames === null) {
            // array_merge(), not "+": the array union operator drops every entry
            // of the second list whose integer index already exists in the first.
            $countryNames = array_merge(
                self::_loadJsonList(__DIR__ . "/../static/countries.uk.json"),
                self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json"),
                [
                    'СРСР',
                    'УНР',
                    'Російська імперія',
                    'Рос.імперія',
                    'Рос.имперія',
                    'Російська імперія-УНР',
                ],
            );
        }
        if (in_array($part, $countryNames, true)) {
            $levels['country'] = $part;
            continue;
        }

        // Unspecified level; return
        return $name;

    }

    $main_name  = '';
    $specifiers = [];

    // Walk from the narrowest to the broadest level.
    foreach (array_reverse($levels) as $level => $partname) {

        if ($partname === '') continue;

        if ($level === 'city' || $level === 'village') {
            // Strip the type identifier only where it was detected, i.e. at the
            // start or the end. Removing it anywhere would damage names that
            // merely contain e.g. "село" or "м.".
            foreach ($identifiersByLevel[$level] as $identifier) {
                if (str_starts_with($partname, $identifier)) {
                    $partname = substr($partname, strlen($identifier));
                    break;
                }
                if (str_ends_with($partname, $identifier)) {
                    $partname = substr($partname, 0, -strlen($identifier));
                    break;
                }
            }
            $partname = trim($partname);
        }

        if ($main_name === '') {
            $main_name = $partname;
        }
        else {
            $specifiers[] = $partname;
        }

    }

    $output = $main_name;
    if (!empty($specifiers)) {
        $output .= ' (' . implode(', ', $specifiers) . ')';
    }
    return $output;

}
/**
 * Cleans and consolidates name parts appearing regularly in Ukrainian place names.
 *
 * Expands the abbreviation "р-н" to "район" and, for comma-separated names,
 * rewrites the name by its place hierarchy.
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_ukrainian_abbreviations(string $name):string {

    // Expand "р-н" only where it stands as a token of its own: preceded by a
    // space and followed by a comma, a space or the end of the name.
    // Replacing every " р" would mangle any word starting with "р",
    // e.g. " район" > " районайон".
    $expanded = \preg_replace('/ р-н(?=,| |$)/u', ' район', $name);
    // preg_replace() returns null on failure (e.g. invalid UTF-8); keep the input then.
    if ($expanded !== null) {
        $name = $expanded;
    }

    // A bare trailing " р" is expanded as well.
    if (str_ends_with($name, " р")) {
        $name .= 'айон';
    }

    if (str_contains($name, ',')) {
        $name = self::_rewrite_ukrainian_names_by_hierarchy($name);
    }

    return $name;

}
/**
 * Loads a JSON file, optionally loading it cached through a private static variable
 * if reuse is expectable (= in the case of CLI usage).
 *
 * A missing file (MDFileDoesNotExist) yields an empty list.
 *
 * @param non-empty-string $filename File name to load.
 *
 * @throws Exception If the file contents do not decode to an array.
 *
 * @return list<string>
 */
private static function _loadJsonList(string $filename):array {

    $useCache = (PHP_SAPI === 'cli');

    if ($useCache && isset(self::$_placeNameListCaches[$filename])) {
        return self::$_placeNameListCaches[$filename];
    }

    try {
        $output = json_decode(MD_STD::file_get_contents($filename), true);
    }
    catch (MDFileDoesNotExist $e) {
        self::$_placeNameListCaches[$filename] = [];
        return [];
    }

    // json_decode() signals invalid JSON with null (not false). A valid scalar
    // document is no usable list either. Without this check a TypeError would
    // be raised by the array return type instead.
    if (!is_array($output)) {
        throw new Exception("Failed to get list");
    }

    if ($useCache) {
        self::$_placeNameListCaches[$filename] = $output;
    }

    return $output;

}
/**
 * Moves names of regions to brackets using pre-generated lists of countries,
 * historical country names, etc.
 * E.g. "Schweden, Västerås" or "Västerås, Schweden" > "Västerås (Schweden)".
 *
 * Only names containing a separator ("-" or ", ") exactly once are considered.
 * The name is returned unchanged if it is a country name as a whole, or if
 * one of its parts is a blacklisted term or a cardinal direction.
 *
 * @param string $lang Instance language.
 * @param string $name Input string to clean.
 *
 * @return string
 */
private static function _move_region_names_to_brackets(string $lang, string $name):string {

    // Loaded lazily and at most once per call, not once per separator.
    $countryNames       = null;
    $cardinalDirections = null;

    foreach (['-', ', '] as $separator) {

        if (substr_count($name, $separator) !== 1) continue;

        // Get parts and trim them
        [$first, $second] = array_map('trim', explode($separator, $name));

        // Load place names.
        // array_merge(), not "+": the array union operator drops every entry of
        // the second list whose integer index already exists in the first.
        $countryNames ??= array_merge(
            self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json"),
            self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json"),
        );
        $cardinalDirections ??= self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");

        // Skip if the full name is in the list of country names
        if (in_array($name, $countryNames, true)) {
            return $name;
        }

        // If one of the parts is a blacklisted term or a cardinal direction, skip this
        foreach ([$first, $second] as $part) {
            if (in_array($part, self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
                || in_array($part, $cardinalDirections, true)
                || in_array(strtolower($part), $cardinalDirections, true)
            ) {
                return $name;
            }
        }

        $firstIsCountry  = in_array($first, $countryNames, true);
        $secondIsCountry = in_array($second, $countryNames, true);

        // Rewrite only if exactly one of the two parts is a country.
        if ($firstIsCountry && !$secondIsCountry) {
            return $second . ' (' . $first . ')';
        }
        if (!$firstIsCountry && $secondIsCountry) {
            return $first . ' (' . $second . ')';
        }

    }

    return $name;

}
/**
 * Drops repeated entries from a list of names separated by ", ", keeping the
 * first occurrence of each entry.
 * E.g. "Västerås, Schweden, Schweden" > "Västerås, Schweden".
 *
 * @param string $ort_name Place name to clean.
 *
 * @return string
 */
private static function _remove_duplicates_after_commas(string $ort_name):string {

    // Nothing to deduplicate without a comma.
    if (!str_contains($ort_name, ',')) {
        return $ort_name;
    }

    $kept = [];
    foreach (explode(', ', $ort_name) as $segment) {
        if (in_array($segment, $kept, true)) {
            continue;
        }
        $kept[] = $segment;
    }

    return implode(', ', $kept);

}
/**
 * Consolidates a place name: sanitizes the input, applies the basic
 * replacements, cleans uncertainty indicators, removes duplicated entries,
 * applies the language-specific clean-up and finally moves country / region
 * names to brackets.
 *
 * @param string $lang     Instance language.
 * @param string $ort_name Input string to clean.
 *
 * @return string
 */
public static function consolidate_name(string $lang, string $ort_name):string {

    // Run basic replacements.
    // Disabled additional rule, kept for reference:
    //   if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) {
    //       $nameSanitizations["/"] = "-";
    //   }
    $cleaned = strtr(self::sanitizeInputString($ort_name), self::_NAME_SANITIZATIONS);

    // Clean uncertainty indicators, then sanitize once more
    $cleaned = NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($cleaned);
    $cleaned = self::sanitizeInputString($cleaned);

    // Remove duplicates after commas
    // Västerdås, Schweden, Schweden > Västerdås, Schweden
    $cleaned = self::_remove_duplicates_after_commas($cleaned);

    // Language-specific clean-up; other languages pass through untouched
    if ($lang === 'de') {
        $cleaned = self::_clean_german_abbreviations($cleaned);
    }
    elseif ($lang === 'hu') {
        $cleaned = self::_clean_hungarian_abbreviations($cleaned);
    }
    elseif ($lang === 'uk') {
        $cleaned = self::_clean_ukrainian_abbreviations($cleaned);
    }

    return self::_move_region_names_to_brackets($lang, $cleaned);

}
}