"-", "unbekannt" => "", "Unbekannt" => "", "unknown" => "", "Unknown" => "", ]; /** Blacklist for comparison with country names */ private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [ 'District', 'Distrikt', 'India', 'Indien', 'Insel', 'Inseln', 'Tal', 'Yue', ]; private const _PLACE_TYPE_INDICATORS_GERMAN = [ 'Insel', 'Stadt', ]; // Indicators signifying that a place is likely subordinate to the other // if two places are provided in a comma-separated list private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [ 'gasse', 'straße', ' Straße', ]; // Indicators signifying that a place is likely subordinate to the other // if two places are provided in a comma-separated list private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [ ' körut ', ' utca ', ' út ', ]; private const _RELEVANT_ROMAN_NUMERALS = [ 'I' => '1', 'II' => '2', 'III' => '3', 'IV' => '4', 'V' => '5', 'VI' => '6', 'VII' => '7', 'VIII' => '8', 'IX' => '9', 'X' => '10', 'XI' => '11', 'XII' => '12', 'XIII' => '13', 'XIV' => '14', 'XV' => '15', 'XVI' => '16', 'XVII' => '17', 'XVIII' => '18', 'XIX' => '19', 'XX' => '20', ]; /** * @var array> */ private static $_placeNameListCaches = []; /** * Rewrites indicators for narrower locations paired with a superordinate location * into the format "Narrower (Broader)". * E.g.: "Adalbrechtstr. 12, Berlin" > Adalbrechtstraße 12 (Berlin). * * @param string $name Name in which to rewrite. * @param string $indicator Indicator for narrower place. E.g. "straße". * @param string $separator Separating character between narrower and broader, e.g. ', '. 
*
     * @return string
     */
    private static function _rewrite_narrower_broader_pairs_to_brackets(string $name, string $indicator, string $separator = ', '):string {

        // An empty needle would make substr_count() / explode() throw.
        if ($indicator === '' || $separator === '') {
            return $name;
        }

        // Only rewrite unambiguous pairs: exactly one indicator, exactly one
        // separator and no bracket already present.
        if (substr_count($name, $indicator) !== 1
            || substr_count($name, $separator) !== 1
            || str_contains($name, "(")
        ) {
            return $name;
        }

        // Split on the separator that was counted above. As it occurs exactly
        // once, this always yields two parts.
        [$first, $second] = explode($separator, $name);

        // Skip entries like "Vaci utca 12 Budapest, Vaci utca"
        $indicatorTrimmed = trim($indicator);
        if ((str_ends_with($first, $indicatorTrimmed) && str_contains($second, $indicatorTrimmed))
            || (str_ends_with($second, $indicatorTrimmed) && str_contains($first, $indicatorTrimmed))
        ) {
            return $name;
        }

        // Prevent errors in case of "Adalbrechtstraße 12, "
        if (empty($first) || empty($second)) {
            return $name;
        }

        if (str_contains($first, $indicator)) {
            // Adalberthstraße 12, Berlin
            $street = $first;
            $town = $second;
        }
        else {
            // Berlin, Adalberthstraße 12
            $street = $second;
            $town = $first;
        }

        // Prevent rewrites in cases like "Deák Ferenc utca 16-18. Budapest, V."
        if (str_contains($town, '.')) {
            return $name;
        }

        return $street . ' (' . $town . ')';

    }

    /**
     * Cleans and consolidates name parts appearing regularly in German place names.
     *
     * Applies, in order: moving a trailing ", <place type>" into brackets,
     * expanding "str. " to "straße " when a letter precedes it and a digit follows,
     * and rewriting "street, town" pairs into "street (town)".
     *
     * @param string $name Name of a place.
     *
     * @return string
     */
    private static function _clean_german_abbreviations(string $name):string {

        // ABC, Inseln > ABC (Inseln)
        foreach (self::_PLACE_TYPE_INDICATORS_GERMAN as $indicator) {
            $suffix = ', ' . $indicator;
            if (str_ends_with($name, $suffix)) {
                // Rewrite the trailing occurrence only; the same character
                // sequence earlier in the name (", Inselberg") is left alone.
                $name = substr($name, 0, -strlen($suffix)) . ' (' . $indicator . ')';
            }
        }

        // Adalbrechtstr. 12 > Adalbrechtstraße 12
        // \p{L} with the u modifier also accepts non-ASCII letters in front of
        // "str." (e.g. "Schloßstr. 12"); the dot is matched literally.
        if (str_contains($name, "str. ") && \preg_match('/\p{L}str\. [0-9]/u', $name) === 1) {
            $name = str_replace("str. ", "straße ", $name);
        }

        // "Adalbrechtstraße 12, Berlin" > Adalbrechtstraße 12 (Berlin)
        foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_GERMAN as $indicator) {
            $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
        }

        return $name;

    }

    /**
     * Cleans and consolidates name parts appearing regularly in Hungarian place names.
     *
     * Expands street type abbreviations that are followed by a digit, rewrites
     * "street, town" pairs into "street (town)" and turns a trailing
     * " Budapest, <roman numeral>." into " (Budapest, <number>. kerület)".
     *
     * @param string $name Name of a place.
     *
     * @return string
     */
    private static function _clean_hungarian_abbreviations(string $name):string {

        // Abbreviation => expansion. An abbreviation is only expanded if at least
        // one of its occurrences is directly followed by a digit.
        $abbreviations = [
            " krt. "  => " körut ",
            " u. "    => " utca ",
            " ucca "  => " utca ",
            " utcza " => " utca ",
            " rkp. "  => " rakpart ",
        ];

        foreach ($abbreviations as $abbreviation => $expansion) {
            if (\preg_match('/' . preg_quote($abbreviation, '/') . '[0-9]/', $name) === 1) {
                $name = str_replace($abbreviation, $expansion, $name);
            }
        }

        // "Adalbrecht utca 12, Berlin" > Adalbrecht utca 12 (Berlin)
        foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN as $indicator) {
            $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', ');
        }

        // "... Budapest, V." > "... (Budapest, 5. kerület)"
        if (substr_count($name, 'Budapest') === 1) {
            foreach (self::_RELEVANT_ROMAN_NUMERALS as $roman_numeral => $arabic) {
                $to_match = ' Budapest, ' . $roman_numeral . '.';
                if (str_ends_with($name, $to_match)) {
                    $name = substr($name, 0, -strlen($to_match)) . ' (Budapest, ' . $arabic . '. kerület)';
                    // The rewritten name ends in ")", so no other numeral can match.
                    break;
                }
            }
        }

        return $name;

    }

    /**
     * Rewrites a Ukrainian language name based on abbreviations explaining the
     * hierarchy of named places.
     *
     * @param string $name Input name to rewrite.
*
     * @return string
     */
    private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

        // Markers that identify the hierarchy level of a comma-separated part
        // when they appear at the start or the end of that part.
        $identifiersByLevel = [
            'state' => [' РСР', 'РСР ', ' губернія', 'губернія '],
            'oblast' => ['обл.', 'округа', 'губернії'],
            'region' => ['р-н', 'район'],
            'county' => ['повіт'],
            'city' => ['м.'],
            'parish' => ['волость'],
            'village' => ['смт', 'сільська', 'с. '],
            'district' => [], // Is also р-н; which it is is determined based on position
            'street' => ['вул. '],
        ];

        // Ordered from broadest to narrowest; filled while walking the parts.
        $levels = [
            'country' => '',
            'state' => '',
            'oblast' => '',
            'region' => '',
            'county' => '',
            'city' => '',
            'parish' => '',
            'village' => '',
            'district' => '',
            'street' => '',
        ];

        // Loaded lazily: only needed once a part carries no level marker.
        $countryNames = null;

        foreach (explode(',', $name) as $part) {

            $part = trim($part);

            foreach ($identifiersByLevel as $level => $identifiers) {
                foreach ($identifiers as $identifier) {

                    if (!str_starts_with($part, $identifier) && !str_ends_with($part, $identifier)) {
                        continue;
                    }

                    // Special case: Region can both be rajon or a district within a city.
                    // If the oblast and a city or village are already known, the
                    // region is taken to be a district within that settlement.
                    // Otherwise, it is assumed to be a super-city region.
                    $targetLevel = $level;
                    if ($level === 'region'
                        && !empty($levels['oblast'])
                        && (!empty($levels['city']) || !empty($levels['village']))
                    ) {
                        $targetLevel = 'district';
                    }

                    // The same level was used twice: ambiguous, so leave the name alone.
                    if (!empty($levels[$targetLevel])) {
                        return $name;
                    }

                    $levels[$targetLevel] = $part;
                    continue 3;

                }
            }

            // Special case: Abbreviated SSRs
            if (in_array($part, ['УРСР', 'УССР'], true)) {
                $levels['state'] = $part;
                continue;
            }

            // Unspecified part level: Attempt identifying country.
            // The lists are concatenated with array_merge(): the array union
            // operator (+) would drop every entry of the second list whose integer
            // key already exists in the first one.
            $countryNames ??= array_merge(
                array_values(self::_loadJsonList(__DIR__ . "/../static/countries.uk.json")),
                array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json")),
                [
                    'СРСР',
                    'УНР',
                    'Російська імперія',
                    'Рос.імперія',
                    'Рос.имперія',
                    'Російська імперія-УНР',
                ],
            );

            if (in_array($part, $countryNames, true)) {
                $levels['country'] = $part;
                continue;
            }

            // Unspecified level; return
            return $name;

        }

        // Walk from the narrowest to the broadest level: the narrowest known part
        // becomes the main name, everything broader goes into the brackets.
        $main_name = '';
        $specifiers = [];
        foreach (array_reverse($levels) as $level => $partname) {

            if (empty($partname)) {
                continue;
            }

            if ($level === 'city' || $level === 'village') {
                // Strip the level marker where it was detected (start or end of
                // the part) only, so that the same character sequence inside the
                // actual name is not removed as well.
                foreach ($identifiersByLevel[$level] as $identifier) {
                    if (str_starts_with($partname, $identifier)) {
                        $partname = substr($partname, strlen($identifier));
                    }
                    elseif (str_ends_with($partname, $identifier)) {
                        $partname = substr($partname, 0, -strlen($identifier));
                    }
                }
                $partname = trim($partname);
            }

            if (empty($main_name)) {
                $main_name = $partname;
            }
            else {
                $specifiers[] = $partname;
            }

        }

        $output = $main_name;
        if (!empty($specifiers)) {
            $output .= ' (' . implode(', ', $specifiers) . ')';
        }

        return $output;

    }

    /**
     * Cleans and consolidates name parts appearing regularly in Ukrainian place names.
     *
     * Expands " р-н" to " район" and, if the name contains a comma, rewrites it
     * based on the hierarchy of its parts.
     *
     * @param string $name Name of a place.
     *
     * @return string
     */
    private static function _clean_ukrainian_abbreviations(string $name):string {

        $hasAbbreviatedRajon = str_contains($name, " р-н,")
            || str_contains($name, " р-н ")
            || str_ends_with($name, " р-н");

        if ($hasAbbreviatedRajon) {
            $name = str_replace(" р-н", " район", $name);
        }

        if (str_contains($name, ',')) {
            $name = self::_rewrite_ukrainian_names_by_hierarchy($name);
        }

        return $name;

    }

    /**
     * Loads a JSON file, optionally loading it cached through a private static variable
     * if reuse is expectable (= in the case of CLI usage).
     *
     * @param non-empty-string $filename File name to load.
*
     * @return array<mixed> Decoded JSON content; callers in this class use it as a list of names.
     *
     * @throws Exception If the file content does not decode to an array.
     */
    private static function _loadJsonList(string $filename):array {

        $isCli = PHP_SAPI === 'cli';

        if ($isCli && isset(self::$_placeNameListCaches[$filename])) {
            return self::$_placeNameListCaches[$filename];
        }

        try {
            $output = json_decode(MD_STD::file_get_contents($filename), true);
        }
        catch (MDFileDoesNotExist $e) {
            // A missing list is treated as an empty one.
            self::$_placeNameListCaches[$filename] = [];
            return [];
        }

        // json_decode() reports invalid JSON with null (not false). Anything that
        // is not an array is rejected here instead of surfacing as a TypeError
        // from the return type.
        if (!is_array($output)) {
            throw new Exception("Failed to get list");
        }

        if ($isCli) {
            self::$_placeNameListCaches[$filename] = $output;
        }

        return $output;

    }

    /**
     * Moves names of regions to brackets using pre-generated lists of countries,
     * historical country names, etc.
     *
     * A name is only rewritten if it consists of exactly two parts (split by "-"
     * or ", "), exactly one of which is a known country name.
     *
     * @param string $lang Instance language.
     * @param string $name Input string to clean.
     *
     * @return string
     */
    private static function _move_region_names_to_brackets(string $lang, string $name):string {

        // Loaded on first use and shared between both separators.
        $countryNames = null;
        $cardinal_directions = null;

        foreach (['-', ', '] as $separator) {

            if (substr_count($name, $separator) !== 1) {
                continue;
            }

            // Get parts and trim them
            $parts = explode($separator, $name);
            $first = trim($parts[0]);
            $second = trim($parts[1]);

            // "Deutschland-" would otherwise be rewritten to " (Deutschland)".
            if ($first === '' || $second === '') {
                continue;
            }

            // Load place names. The lists are concatenated with array_merge():
            // the array union operator (+) would drop every entry of the second
            // list whose integer key already exists in the first one.
            $countryNames ??= array_merge(
                array_values(self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json")),
                array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json")),
            );
            $cardinal_directions ??= self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");

            // Skip if the full name is in the list of country names
            if (in_array($name, $countryNames, true)) {
                return $name;
            }

            // If one of the parts is a blacklisted term or a cardinal direction, skip this.
            // NOTE(review): strtolower() only lowercases ASCII letters; whether the
            // list contains directions starting with a non-ASCII letter is not
            // visible from here - confirm.
            $isBlocked = static fn (string $part): bool =>
                in_array($part, self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
                || in_array($part, $cardinal_directions, true)
                || in_array(strtolower($part), $cardinal_directions, true);

            if ($isBlocked($first) || $isBlocked($second)) {
                return $name;
            }

            $firstIsCountry = in_array($first, $countryNames, true);
            $secondIsCountry = in_array($second, $countryNames, true);

            if ($firstIsCountry && !$secondIsCountry) {
                return $second . ' (' . $first . ')';
            }
            if (!$firstIsCountry && $secondIsCountry) {
                return $first . ' (' . $second . ')';
            }

        }

        return $name;

    }

    /**
     * Removes duplicates after commas.
     *
     * Every repeated ", "-separated part is dropped; the first occurrence is kept.
     *
     * @param string $ort_name Place name to clean.
     *
     * @return string
     */
    private static function _remove_duplicates_after_commas(string $ort_name):string {

        if (!str_contains($ort_name, ',')) {
            return $ort_name;
        }

        $uniqueParts = array_unique(explode(', ', $ort_name));

        return implode(', ', $uniqueParts);

    }

    /**
     * Cleans a place name by trimming etc. Also removes uncertainty indicators.
     *
     * @param string $lang     Instance language.
     * @param string $ort_name Input string to clean.
*
     * @return string
     */
    public static function consolidate_name(string $lang, string $ort_name):string {

        // Basic replacements on the sanitized input.
        // (A rule mapping a single "/" to "-" for names without "." is disabled.)
        $sanitized = strtr(self::sanitizeInputString($ort_name), self::_NAME_SANITIZATIONS);

        // Strip uncertainty indicators, then sanitize once more.
        $withoutUncertainty = self::sanitizeInputString(
            NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($sanitized)
        );

        // Remove duplicates after commas
        // Västerdås, Schweden, Schweden > Västerdås, Schweden
        $deduplicated = self::_remove_duplicates_after_commas($withoutUncertainty);

        // Language-specific expansion of abbreviations.
        $expanded = match ($lang) {
            'de' => self::_clean_german_abbreviations($deduplicated),
            'hu' => self::_clean_hungarian_abbreviations($deduplicated),
            'uk' => self::_clean_ukrainian_abbreviations($deduplicated),
            default => $deduplicated,
        };

        return self::_move_region_names_to_brackets($lang, $expanded);

    }

}