/**
 * Rewrites a Ukrainian language name based on abbreviations explaining the
 * hierarchy of named places.
 *
 * The name is split at commas and each part is assigned to a hierarchy level
 * (country, state, oblast, region, city, district, street), either through a
 * known abbreviation at the start or end of the part, or - for countries -
 * through a lookup in the list of known country names.
 * The most specific level present becomes the main name, all broader levels
 * follow in brackets, ordered from most to least specific. E.g.:
 * "Латвійська РСР, м.Рига" becomes "Рига (Латвійська РСР)".
 *
 * If any part cannot be assigned to a level, the name is returned unchanged.
 *
 * @param string $name Input name to rewrite.
 *
 * @throws Exception If two parts of the name resolve to the same level.
 *
 * @return string
 */
private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

    $identifiersByLevel = [
        'state'    => [' РСР', 'РСР ', ' губернія', 'губернія '],
        'oblast'   => ['обл.'],
        'region'   => ['р-н'],
        'city'     => ['м.'],
        'district' => [], // Is also р-н; which it is is determined based on position
        'street'   => ['вул. '],
    ];

    // Ordered from the broadest to the most specific level.
    $levels = [
        'country'  => '',
        'state'    => '',
        'oblast'   => '',
        'region'   => '',
        'city'     => '',
        'district' => '',
        'street'   => '',
    ];

    // Country names are only loaded once a part without any known
    // abbreviation is encountered.
    $countryNames = null;

    foreach (explode(',', $name) as $part) {

        $part = trim($part);

        // Identify the level by an abbreviation at the start or end of the part.
        $matchedLevel = null;
        foreach ($identifiersByLevel as $level => $identifiers) {
            foreach ($identifiers as $identifier) {
                if (str_starts_with($part, $identifier) || str_ends_with($part, $identifier)) {
                    $matchedLevel = $level;
                    break 2;
                }
            }
        }

        if ($matchedLevel === null) {

            // Unspecified part level: Attempt identifying country
            if ($countryNames === null) {
                // Merge the values of both lists. Using the array union
                // operator (+) would drop entries of the second list wherever
                // both lists share a key (e.g. if both are plain lists).
                $countryNames = array_merge(
                    array_values(self::_loadJsonList(__DIR__ . "/../static/countries.uk.json")),
                    array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json")),
                    ['СРСР', 'УНР', 'Російська імперія', 'Рос.імперія', 'Рос.имперія'],
                );
            }

            // Unspecified level; return the name as it was passed
            if (!in_array($part, $countryNames, true)) {
                return $name;
            }

            $levels['country'] = $part;
            continue;

        }

        // Special case: Region can both be rajon or a district within a city
        // If both oblast and city are already known, the region will be a
        // district within the city.
        // Otherwise, it is to be assumed that it is a super-city region.
        if ($matchedLevel === 'region' && $levels['oblast'] !== '' && $levels['city'] !== '') {
            $matchedLevel = 'district';
        }

        // NOTE(review): Ambiguous input is surfaced as an exception here;
        // consider returning $name unchanged instead - confirm with callers.
        if ($levels[$matchedLevel] !== '') {
            throw new Exception("Used the same level (" . $matchedLevel . ") twice");
        }
        $levels[$matchedLevel] = $part;

    }

    $main_name = '';
    $specifiers = [];

    // Walk from the most specific level to the broadest one.
    foreach (array_reverse($levels) as $level => $partname) {

        if ($partname === '') continue;

        if ($level === 'city') {
            // Only strip the abbreviation where it marks the level (start or
            // end of the part). Removing it anywhere in the string would
            // damage names that contain the same characters, e.g. "ім.".
            foreach ($identifiersByLevel['city'] as $identifier) {
                if (str_starts_with($partname, $identifier)) {
                    $partname = substr($partname, strlen($identifier));
                }
                if (str_ends_with($partname, $identifier)) {
                    $partname = substr($partname, 0, -strlen($identifier));
                }
            }
            $partname = trim($partname);
        }

        if ($main_name === '') {
            $main_name = $partname;
        }
        else {
            $specifiers[] = $partname;
        }

    }

    $output = $main_name;
    if (!empty($specifiers)) $output .= ' (' . implode(', ', $specifiers) . ')';

    return $output;

}


/**
 * Cleans and consolidates name parts appearing regularly in Ukrainian place names.
 *
 * Names containing a comma are taken to list several hierarchy levels and are
 * passed on to be rewritten by hierarchy. All other names are returned as is.
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_ukrainian_abbreviations(string $name):string {

    if (str_contains($name, ',')) {
        return self::_rewrite_ukrainian_names_by_hierarchy($name);
    }

    return $name;

}
@@ -361,6 +486,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract $ort_name = match ($lang) { 'de' => self::_clean_german_abbreviations($ort_name), 'hu' => self::_clean_hungarian_abbreviations($ort_name), + 'uk' => self::_clean_ukrainian_abbreviations($ort_name), default => $ort_name, }; diff --git a/tests/NodaConsolidatedNamesForPlacesTest.php b/tests/NodaConsolidatedNamesForPlacesTest.php index 171dfa7..57e3293 100644 --- a/tests/NodaConsolidatedNamesForPlacesTest.php +++ b/tests/NodaConsolidatedNamesForPlacesTest.php @@ -71,5 +71,24 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase { self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden")); + // + // Ukrainian: Rewrite by hierarchy + // + # self::assertEquals("Приморський (Україна, Запорізька обл., м. Запоріжжя, р-н", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н")); + self::assertEquals("Рига (Латвійська РСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Латвійська РСР, м.Рига")); + self::assertEquals("Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Латвійська РСР, м.Рига")); + self::assertEquals("Рига (Латвійська РСР, СРСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "СРСР, Латвійська РСР, м.Рига")); + self::assertEquals("СРСР, Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "СРСР, Латвійська РСР, м.Рига")); + + // Steet + self::assertEquals("вул. Шевченка (Рівне)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "м. Рівне, вул. Шевченка")); + self::assertEquals("м. Рівне, вул. Шевченка", NodaConsolidatedNamesForPlaces::consolidate_name("de", "м. Рівне, вул. 
Шевченка")); + + // Special case: Region can both be rajon or a district within a city + // If both oblast and city are already known, the region will be a + // district within the city. + // Otherwise, it is to be assumed that it is a super-city region. + self::assertEquals("Приморський р-н (Запоріжжя, Запорізька обл., Україна)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н")); + } }