Add rewriting for Ukrainian place names based on specified hierarchies
This commit is contained in:
		| @@ -144,7 +144,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract | |||||||
|     /** |     /** | ||||||
|      * Cleans and consolidates name parts appearing regularly in German place names. |      * Cleans and consolidates name parts appearing regularly in German place names. | ||||||
|      * |      * | ||||||
|      * @param string $name Name of an actor. |      * @param string $name Name of a place. | ||||||
|      * |      * | ||||||
|      * @return string |      * @return string | ||||||
|      */ |      */ | ||||||
| @@ -175,7 +175,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract | |||||||
|     /** |     /** | ||||||
|      * Cleans and consolidates name parts appearing regularly in Hungarian place names. |      * Cleans and consolidates name parts appearing regularly in Hungarian place names. | ||||||
|      * |      * | ||||||
|      * @param string $name Name of an actor. |      * @param string $name Name of a place. | ||||||
|      * |      * | ||||||
|      * @return string |      * @return string | ||||||
|      */ |      */ | ||||||
| @@ -218,6 +218,131 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract | |||||||
|  |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Rewrites a Ukrainian language name based on abbreviations explaining the | ||||||
|  |      * hierarchy of named places. | ||||||
|  |      * | ||||||
|  |      * @param string $name Input name to rewrite. | ||||||
|  |      * | ||||||
|  |      * @return string | ||||||
|  |      */ | ||||||
|  |     private static function _rewrite_ukrainian_names_by_hierarchy($name):string { | ||||||
|  |  | ||||||
|  |         $identifiersByLevel = [ | ||||||
|  |             'state' => [' РСР', 'РСР ', ' губернія', 'губернія '], | ||||||
|  |             'oblast' => ['обл.'], | ||||||
|  |             'region' => ['р-н'], | ||||||
|  |             'city' => ['м.'], | ||||||
|  |             'district' => [],   // Is also р-н; which it is is determined based on position | ||||||
|  |             'street' => ['вул. '], | ||||||
|  |         ]; | ||||||
|  |  | ||||||
|  |         $levels = [ | ||||||
|  |             'country' => '', | ||||||
|  |             'state' => '', | ||||||
|  |             'oblast' => '', | ||||||
|  |             'region' => '', | ||||||
|  |             'city' => '', | ||||||
|  |             'district' => '', | ||||||
|  |             'street' => '', | ||||||
|  |         ]; | ||||||
|  |  | ||||||
|  |         $parts = explode(',', $name); | ||||||
|  |         foreach ($parts as $part) { | ||||||
|  |             $part = trim($part); | ||||||
|  |             foreach ($identifiersByLevel as $level => $identifiers) { | ||||||
|  |                 foreach ($identifiers as $identifier) { | ||||||
|  |  | ||||||
|  |                     if (str_starts_with($part, $identifier) || str_ends_with($part, $identifier)) { | ||||||
|  |  | ||||||
|  |                         // Special case: A region can be either a rajon or a district within a city | ||||||
|  |                         // If both oblast and city are already known, the region will be a | ||||||
|  |                         // district within the city. | ||||||
|  |                         // Otherwise, it is to be assumed that it is a super-city region. | ||||||
|  |                         if ($level === 'region' && !empty($levels['oblast']) && !empty($levels['city'])) { | ||||||
|  |                             $level = 'district'; | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         if (!empty($levels[$level])) { | ||||||
|  |                             throw new Exception("Used the same level (" . $level . ") twice"); | ||||||
|  |                         } | ||||||
|  |                         $levels[$level] = $part; | ||||||
|  |                         continue 3; | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // Unspecified part level: Attempt identifying country | ||||||
|  |             if (!isset($countryNames)) { | ||||||
|  |                 $countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.uk.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json"); | ||||||
|  |                 $countryNames[] = 'СРСР'; | ||||||
|  |                 $countryNames[] = 'УНР'; | ||||||
|  |                 $countryNames[] = 'Російська імперія'; | ||||||
|  |                 $countryNames[] = 'Рос.імперія'; | ||||||
|  |                 $countryNames[] = 'Рос.имперія'; | ||||||
|  |             } | ||||||
|  |             if (in_array($part, $countryNames, true)) { | ||||||
|  |                 $levels['country'] = $part; | ||||||
|  |                 continue; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // Unspecified level; return | ||||||
|  |             throw new Exception("Unknown " . $part); | ||||||
|  |             return $name; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         $main_name = ''; | ||||||
|  |         $specifiers = []; | ||||||
|  |  | ||||||
|  |         foreach (array_reverse($levels) as $level => $partname) { | ||||||
|  |             if (empty($partname)) continue; | ||||||
|  |  | ||||||
|  |             if ($level === 'city') { | ||||||
|  |                 $strtr = []; | ||||||
|  |                 foreach ($identifiersByLevel[$level] as $identifier) $strtr[$identifier] = ''; | ||||||
|  |                 $partname = trim(strtr($partname, $strtr)); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             if (empty($main_name)) { | ||||||
|  |                 $main_name = $partname; | ||||||
|  |             } | ||||||
|  |             else { | ||||||
|  |                 $specifiers[] = $partname; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         $output = $main_name; | ||||||
|  |         if (!empty($specifiers)) $output .= ' (' . implode(', ', $specifiers) . ')'; | ||||||
|  |  | ||||||
|  |         return $output; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     /** | ||||||
|  |      * Cleans and consolidates name parts appearing regularly in Ukrainian place names. | ||||||
|  |      * | ||||||
|  |      * @param string $name Name of a place. | ||||||
|  |      * | ||||||
|  |      * @return string | ||||||
|  |      */ | ||||||
|  |     private static function _clean_ukrainian_abbreviations(string $name):string { | ||||||
|  |  | ||||||
|  |         /* | ||||||
|  |         if (str_contains($name, " krt. ") && \preg_match("/\ krt\.\ [0-9]/", $name)) { | ||||||
|  |             $name = str_replace(" krt. ", " körut ", $name); | ||||||
|  |         } | ||||||
|  |          */ | ||||||
|  |  | ||||||
|  |         if (str_contains($name, ',')) { | ||||||
|  |             $name = self::_rewrite_ukrainian_names_by_hierarchy($name); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         return $name; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /** |     /** | ||||||
|      * Loads a JSON file, optionally loading it cached through a private static variable |      * Loads a JSON file, optionally loading it cached through a private static variable | ||||||
|      * if reuse is expectable (= in the case of CLI usage). |      * if reuse is expectable (= in the case of CLI usage). | ||||||
| @@ -361,6 +486,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract | |||||||
|         $ort_name = match ($lang) { |         $ort_name = match ($lang) { | ||||||
|             'de' => self::_clean_german_abbreviations($ort_name), |             'de' => self::_clean_german_abbreviations($ort_name), | ||||||
|             'hu' => self::_clean_hungarian_abbreviations($ort_name), |             'hu' => self::_clean_hungarian_abbreviations($ort_name), | ||||||
|  |             'uk' => self::_clean_ukrainian_abbreviations($ort_name), | ||||||
|             default => $ort_name, |             default => $ort_name, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -71,5 +71,24 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase { | |||||||
|  |  | ||||||
|         self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden")); |         self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden")); | ||||||
|  |  | ||||||
|  |         // | ||||||
|  |         // Ukrainian: Rewrite by hierarchy | ||||||
|  |         // | ||||||
|  |         # self::assertEquals("Приморський (Україна, Запорізька обл., м. Запоріжжя,  р-н", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н")); | ||||||
|  |         self::assertEquals("Рига (Латвійська РСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Латвійська РСР, м.Рига")); | ||||||
|  |         self::assertEquals("Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Латвійська РСР, м.Рига")); | ||||||
|  |         self::assertEquals("Рига (Латвійська РСР, СРСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "СРСР, Латвійська РСР, м.Рига")); | ||||||
|  |         self::assertEquals("СРСР, Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "СРСР, Латвійська РСР, м.Рига")); | ||||||
|  |  | ||||||
|  |         // Street | ||||||
|  |         self::assertEquals("вул. Шевченка (Рівне)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "м. Рівне, вул. Шевченка")); | ||||||
|  |         self::assertEquals("м. Рівне, вул. Шевченка", NodaConsolidatedNamesForPlaces::consolidate_name("de", "м. Рівне, вул. Шевченка")); | ||||||
|  |  | ||||||
|  |         // Special case: A region can be either a rajon or a district within a city | ||||||
|  |         // If both oblast and city are already known, the region will be a | ||||||
|  |         // district within the city. | ||||||
|  |         // Otherwise, it is to be assumed that it is a super-city region. | ||||||
|  |         self::assertEquals("Приморський р-н (Запоріжжя, Запорізька обл., Україна)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н")); | ||||||
|  |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user