MDNodaHelpers/src/NodaConsolidatedNamesForPlaces.php
2023-11-26 00:54:14 +01:00

286 lines
8.8 KiB
PHP

<?PHP
/**
* Gathers functions for setting uniform place names.
*/
declare(strict_types = 1);
/**
* Gathers functions for setting uniform place names.
*/
final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract {

    /**
     * Substrings of a place name listed as a key in this array will be replaced
     * by the corresponding value.
     */
    private const _NAME_SANITIZATIONS = [
        " - " => "-",
        "unbekannt" => "",
        "Unbekannt" => "",
        "unknown" => "",
        "Unknown" => "",
    ];

    /**
     * German place type terms. If a name ends with ", <term>", the term is
     * moved into brackets (see _clean_german_abbreviations()).
     */
    private const _PLACE_TYPE_INDICATORS_GERMAN = [
        'Insel',
        'Stadt',
    ];

    /**
     * Indicators signifying that a place is likely subordinate to the other
     * if two places are provided in a comma-separated list (German).
     */
    private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
        'gasse',
        'straße',
    ];

    /**
     * Indicators signifying that a place is likely subordinate to the other
     * if two places are provided in a comma-separated list (Hungarian).
     */
    private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
        'körut',
        'utca',
        'út',
    ];

    /**
     * Roman numerals (I - XX) mapped to their Arabic counterparts. Used for
     * rewriting trailing Budapest district numbers.
     */
    private const _RELEVANT_ROMAN_NUMERALS = [
        'I' => '1',
        'II' => '2',
        'III' => '3',
        'IV' => '4',
        'V' => '5',
        'VI' => '6',
        'VII' => '7',
        'VIII' => '8',
        'IX' => '9',
        'X' => '10',
        'XI' => '11',
        'XII' => '12',
        'XIII' => '13',
        'XIV' => '14',
        'XV' => '15',
        'XVI' => '16',
        'XVII' => '17',
        'XVIII' => '18',
        'XIX' => '19',
        'XX' => '20',
    ];

    /**
     * Per-file cache of decoded JSON lists, keyed by file name.
     * Only populated when running on the CLI (see _loadJsonList()).
     *
     * @var array<string, list<string>>
     */
    private static array $_placeNameListCaches = [];

    /**
     * Rewrites indicators for narrower locations paired with a superordinate location
     * into the format "Narrower (Broader)".
     * E.g.: "Adalbrechtstraße 12, Berlin" > "Adalbrechtstraße 12 (Berlin)".
     *
     * The rewrite only happens in unambiguous cases: the indicator and the
     * separator each occur exactly once and the name contains no opening bracket.
     *
     * @param string $name      Name in which to rewrite.
     * @param string $indicator Indicator for narrower place. E.g. "straße".
     * @param string $separator Separating character between narrower and broader, e.g. ', '.
     *
     * @return string
     */
    private static function _rewrite_narrower_broader_pairs_to_brackets(string $name, string $indicator, string $separator = ', '):string {

        if (substr_count($name, $indicator) !== 1
            || substr_count($name, $separator) !== 1
            || str_contains($name, "(")
        ) {
            return $name;
        }

        // Split on the separator that was actually counted above, so that
        // exactly two parts are guaranteed.
        [$first, $second] = explode($separator, $name);

        // Prevent errors in case of "Adalbrechtstraße 12, "
        if ($first === '' || $second === '') {
            return $name;
        }

        if (str_contains($first, $indicator)) { // Adalberthstraße 12, Berlin
            $street = $first;
            $town   = $second;
        }
        else { // Berlin, Adalberthstraße 12
            $street = $second;
            $town   = $first;
        }

        // Prevent rewrites in cases like "Deák Ferenc utca 16-18. Budapest, V."
        if (str_contains($town, '.')) {
            return $name;
        }

        return $street . ' (' . $town . ')';

    }

    /**
     * Cleans and consolidates name parts appearing regularly in German place names.
     *
     * @param string $name Name of a place.
     *
     * @return string
     */
    private static function _clean_german_abbreviations(string $name):string {

        // ABC, Insel > ABC (Insel)
        foreach (self::_PLACE_TYPE_INDICATORS_GERMAN as $indicator) {
            $suffix = ', ' . $indicator;
            if (str_ends_with($name, $suffix)) {
                // Rewrite the trailing occurrence only; the same substring may
                // appear earlier in the name as part of another word.
                $name = substr($name, 0, -strlen($suffix)) . ' (' . $indicator . ')';
            }
        }

        // Adalbrechtstr. 12 > Adalbrechtstraße 12
        // \p{L} with the u modifier also accepts non-ASCII letters directly
        // before the abbreviation (e.g. "Schloßstr. 5"); the dot is matched literally.
        if (str_contains($name, "str. ") && \preg_match('/\p{L}str\. [0-9]/u', $name) === 1) {
            $name = str_replace("str. ", "straße ", $name);
        }

        // "Adalbrechtstraße 12, Berlin" > Adalbrechtstraße 12 (Berlin)
        foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_GERMAN as $indicator) {
            $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
        }

        return $name;

    }

    /**
     * Cleans and consolidates name parts appearing regularly in Hungarian place names.
     *
     * @param string $name Name of a place.
     *
     * @return string
     */
    private static function _clean_hungarian_abbreviations(string $name):string {

        // Expand abbreviations only if they are followed by a (house) number.
        if (str_contains($name, " krt. ") && \preg_match("/\ krt\.\ [0-9]/", $name)) {
            $name = str_replace(" krt. ", " körut ", $name);
        }
        if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) {
            $name = str_replace(" u. ", " utca ", $name);
        }

        // "Adalbrecht utca 12, Berlin" > Adalbrecht utca 12 (Berlin)
        foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN as $indicator) {
            $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
        }

        // "... Budapest, V." > "... (Budapest, 5. kerület)"
        if (substr_count($name, 'Budapest') === 1) {
            foreach (self::_RELEVANT_ROMAN_NUMERALS as $roman_numeral => $arabic) {
                $to_match = ' Budapest, ' . $roman_numeral . '.';
                if (str_ends_with($name, $to_match)) {
                    $name = str_replace($to_match, ' (Budapest, ' . $arabic . '. kerület)', $name);
                    // The name now ends with a bracket, no other numeral can match.
                    break;
                }
            }
        }

        return $name;

    }

    /**
     * Loads a JSON file, optionally loading it cached through a private static variable
     * if reuse is expectable (= in the case of CLI usage).
     *
     * @param non-empty-string $filename File name to load.
     *
     * @throws Exception If the file contents do not decode to an array.
     *
     * @return list<string>
     */
    private static function _loadJsonList(string $filename):array {

        if (PHP_SAPI === 'cli' && isset(self::$_placeNameListCaches[$filename])) {
            return self::$_placeNameListCaches[$filename];
        }

        $output = json_decode(MD_STD::file_get_contents($filename), true);

        // json_decode() returns null (not false) on invalid input. Anything
        // that is not an array cannot be a list of names either way.
        if (!is_array($output)) {
            throw new Exception("Failed to get list");
        }

        if (PHP_SAPI === 'cli') {
            self::$_placeNameListCaches[$filename] = $output;
        }

        return $output;

    }

    /**
     * Moves names of regions to brackets using pre-generated lists of countries,
     * historical country names, etc.
     * E.g. "Berlin, Deutschland" > "Berlin (Deutschland)", provided that exactly
     * one of the two parts is found in the lists.
     *
     * @param string $lang Instance language.
     * @param string $name Input string to clean.
     *
     * @return string
     */
    private static function _move_region_names_to_brackets(string $lang, string $name):string {

        // Loaded lazily: the lists are only read once a separator actually matches.
        $countryNames = null;

        foreach (['-', ', '] as $separator) {

            // Only names consisting of exactly two parts are considered
            if (substr_count($name, $separator) !== 1) continue;

            // Get parts and trim them
            $parts = array_map('trim', explode($separator, $name));

            // Load place names. The lists need to be appended to each other:
            // The array union operator (+) would discard all entries of the
            // second list whose numeric index already exists in the first.
            $countryNames ??= array_merge(
                self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json"),
                self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json"),
            );

            $part0IsCountry = in_array($parts[0], $countryNames, true);
            $part1IsCountry = in_array($parts[1], $countryNames, true);

            if ($part0IsCountry && !$part1IsCountry) {
                return $parts[1] . ' (' . $parts[0] . ')';
            }
            if (!$part0IsCountry && $part1IsCountry) {
                return $parts[0] . ' (' . $parts[1] . ')';
            }

        }

        return $name;

    }

    /**
     * Cleans a place name by trimming etc. Also removes uncertainty indicators.
     *
     * Steps, in order: basic substring replacements (a slash is turned into
     * a hyphen only if the input contains exactly one), removal of uncertainty
     * indicators via NodaUncertaintyHelper, language-specific cleanup for
     * German ('de') and Hungarian ('hu'), and finally moving country names
     * into brackets.
     *
     * @param string $lang     Instance language.
     * @param string $ort_name Input string to clean.
     *
     * @return string
     */
    public static function consolidate_name(string $lang, string $ort_name):string {

        // Run basic replacements
        $nameSanitizations = self::_NAME_SANITIZATIONS;
        if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-";

        $ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations);
        $ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name));

        // Language-specific cleanup
        $ort_name = match ($lang) {
            'de' => self::_clean_german_abbreviations($ort_name),
            'hu' => self::_clean_hungarian_abbreviations($ort_name),
            default => $ort_name,
        };

        return self::_move_region_names_to_brackets($lang, $ort_name);

    }

}