"-", "unbekannt" => "", "Unbekannt" => "", "unknown" => "", "Unknown" => "", ]; private const _PLACE_TYPE_INDICATORS_GERMAN = [ 'Insel', 'Stadt', ]; // Indicators signifying that a place is likely subordinate to the other // if two places are provided in a comma-separated list private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [ 'gasse', 'straße', ]; // Indicators signifying that a place is likely subordinate to the other // if two places are provided in a comma-separated list private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [ 'körut', 'utca', 'út', ]; private const _RELEVANT_ROMAN_NUMERALS = [ 'I' => '1', 'II' => '2', 'III' => '3', 'IV' => '4', 'V' => '5', 'VI' => '6', 'VII' => '7', 'VIII' => '8', 'IX' => '9', 'X' => '10', 'XI' => '11', 'XII' => '12', 'XIII' => '13', 'XIV' => '14', 'XV' => '15', 'XVI' => '16', 'XVII' => '17', 'XVIII' => '18', 'XIX' => '19', 'XX' => '20', ]; /** * @var array> */ private static $_placeNameListCaches = []; /** * Rewrites indicators for narrower locations paired with a superordinate location * into the format "Narrower (Broader)". * E.g.: "Adalbrechtstr. 12, Berlin" > Adalbrechtstraße 12 (Berlin). * * @param string $name Name in which to rewrite. * @param string $indicator Indicator for narrower place. E.g. "straße". * @param string $separator Separating character between narrower and broader, e.g. ', '. 
*
     * Only unambiguous inputs are rewritten: the indicator and the separator must
     * each occur exactly once and the name must not contain an opening bracket.
     *
     * @return string The rewritten name, or the input unchanged if it does not qualify.
     */
    private static function _rewrite_narrower_broader_pairs_to_brackets(string $name, string $indicator, string $separator = ', '):string {

        // substr_count() and explode() throw a ValueError on empty needles.
        if ($indicator === '' || $separator === '') {
            return $name;
        }

        if (substr_count($name, $indicator) !== 1
            || substr_count($name, $separator) !== 1
            || str_contains($name, "(")
        ) {
            return $name;
        }

        // Split on the separator that was actually counted above. As it occurs
        // exactly once, this always yields exactly two parts.
        [$first, $second] = explode($separator, $name);

        // Prevent rewrites in case of "Adalbrechtstraße 12, "
        if ($first === '' || $second === '') {
            return $name;
        }

        if (str_contains($first, $indicator)) {
            // Adalberthstraße 12, Berlin
            $street = $first;
            $town   = $second;
        } else {
            // Berlin, Adalberthstraße 12
            $street = $second;
            $town   = $first;
        }

        // Prevent rewrites in cases like "Deák Ferenc utca 16-18. Budapest, V."
        if (str_contains($town, '.')) {
            return $name;
        }

        return $street . ' (' . $town . ')';

    }

    /**
     * Cleans and consolidates name parts appearing regularly in German place names.
     *
     * @param string $name Place name to clean.
     *
     * @return string
     */
    private static function _clean_german_abbreviations(string $name):string {

        // ABC, Insel > ABC (Insel)
        foreach (self::_PLACE_TYPE_INDICATORS_GERMAN as $indicator) {
            $suffix = ', ' . $indicator;
            if (str_ends_with($name, $suffix)) {
                // Rewrite the trailing occurrence only; earlier occurrences of
                // the same text inside the name are left alone.
                $name = substr($name, 0, -strlen($suffix)) . ' (' . $indicator . ')';
            }
        }

        // Adalbrechtstr. 12 > Adalbrechtstraße 12
        // \p{L} with the u modifier also covers non-ASCII letters directly
        // before "str." (e.g. "Schloßstr. 5"); the dot is escaped so that it
        // only matches a literal dot.
        if (str_contains($name, "str. ") && \preg_match('/\p{L}str\. [0-9]/u', $name) === 1) {
            $name = str_replace("str. ", "straße ", $name);
        }

        // "Adalbrechtstraße 12, Berlin" > Adalbrechtstraße 12 (Berlin)
        foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_GERMAN as $indicator) {
            $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
        }

        return $name;

    }

    /**
     * Cleans and consolidates name parts appearing regularly in Hungarian place names.
     *
     * @param string $name Place name to clean.
     *
     * @return string
     */
    private static function _clean_hungarian_abbreviations(string $name):string {

        // NOTE(review): the standard Hungarian spelling is "körút". "körut" is
        // kept because it has to stay in sync with the entry in
        // _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN.
        if (str_contains($name, " krt. ") && \preg_match('/ krt\. [0-9]/', $name) === 1) {
            $name = str_replace(" krt. ", " körut ", $name);
        }
        if (str_contains($name, " u. ") && \preg_match('/ u\. [0-9]/', $name) === 1) {
            $name = str_replace(" u. ", " utca ", $name);
        }

        // "Adalbrecht utca 12, Berlin" > Adalbrecht utca 12 (Berlin)
        foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN as $indicator) {
            $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
        }

        // "... Budapest, V." > "... (Budapest, 5. kerület)"
        if (substr_count($name, 'Budapest') === 1) {
            foreach (self::_RELEVANT_ROMAN_NUMERALS as $roman_numeral => $arabic) {
                $to_match = ' Budapest, ' . $roman_numeral . '.';
                if (str_ends_with($name, $to_match)) {
                    $name = substr($name, 0, -strlen($to_match)) . ' (Budapest, ' . $arabic . '. kerület)';
                    // At most one numeral can match the suffix.
                    break;
                }
            }
        }

        return $name;

    }

    /**
     * Loads a JSON file, optionally loading it cached through a private static variable
     * if reuse is expectable (= in the case of CLI usage).
     *
     * @param non-empty-string $filename File name to load.
     *
     * @throws Exception If the file contents do not decode to an array.
     *
     * @return array<mixed> Decoded contents of the file.
     */
    private static function _loadJsonList(string $filename):array {

        $useCache = (PHP_SAPI === 'cli');

        if ($useCache && isset(self::$_placeNameListCaches[$filename])) {
            return self::$_placeNameListCaches[$filename];
        }

        $output = json_decode(MD_STD::file_get_contents($filename), true);

        // json_decode() reports failure by returning null (not false). Valid JSON
        // holding a scalar is just as unusable here, so anything that is not an
        // array is rejected.
        if (!is_array($output)) {
            throw new Exception("Failed to get list");
        }

        if ($useCache) {
            self::$_placeNameListCaches[$filename] = $output;
        }

        return $output;

    }

    /**
     * Moves names of regions to brackets using pre-generated lists of countries,
     * historical country names, etc.
     * E.g. (given "Deutschland" is listed): "Deutschland, Berlin" > "Berlin (Deutschland)".
     *
     * @param string $lang Instance language.
     * @param string $name Input string to clean.
     *
     * @return string
     */
    private static function _move_region_names_to_brackets(string $lang, string $name):string {

        // Loaded lazily (only once a separator qualifies) and at most once per call.
        $countryNames = null;

        foreach (['-', ', '] as $separator) {

            if (substr_count($name, $separator) !== 1) {
                continue;
            }

            // Get parts and trim them. The separator occurs exactly once, so
            // there are always exactly two parts.
            [$first, $second] = array_map(
                static fn (string $part): string => trim($part),
                explode($separator, $name)
            );

            // Load place names. The lists are appended to each other: the array
            // union operator (+) would drop every entry of the second list
            // whose key already exists in the first.
            $countryNames ??= array_merge(
                array_values(self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json")),
                array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json"))
            );

            $firstIsCountry  = in_array($first, $countryNames, true);
            $secondIsCountry = in_array($second, $countryNames, true);

            // Rewrite only if exactly one of the two parts is a known country.
            if ($firstIsCountry && !$secondIsCountry) {
                return $second . ' (' . $first . ')';
            }
            if (!$firstIsCountry && $secondIsCountry) {
                return $first . ' (' . $second . ')';
            }

        }

        return $name;

    }

    /**
     * Cleans a place name by trimming etc. Also removes uncertainty indicators.
     *
     * Steps, in order: basic string replacements, removal of uncertainty
     * indicators, language-specific clean-up of abbreviations (German and
     * Hungarian) and finally moving country names to brackets.
     *
     * @param string $lang     Instance language.
     * @param string $ort_name Input string to clean.
     *
     * @return string
     */
    public static function consolidate_name(string $lang, string $ort_name):string {

        // Run basic replacements. A slash is only turned into a dash if it
        // appears exactly once in the raw input.
        $nameSanitizations = self::_NAME_SANITIZATIONS;
        if (substr_count($ort_name, "/") === 1) {
            $nameSanitizations["/"] = "-";
        }

        $ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations);
        $ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name));

        // Language-specific clean-up
        $ort_name = match ($lang) {
            'de' => self::_clean_german_abbreviations($ort_name),
            'hu' => self::_clean_hungarian_abbreviations($ort_name),
            default => $ort_name,
        };

        return self::_move_region_names_to_brackets($lang, $ort_name);

    }
}