Add rewriting for Ukrainian place names based on specified hierarchies

This commit is contained in:
Joshua Ramon Enslin 2023-12-02 15:21:02 +01:00
parent b4c941f441
commit 2badc67405
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
2 changed files with 147 additions and 2 deletions

View File

@ -144,7 +144,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
/**
* Cleans and consolidates name parts appearing regularly in German place names.
*
* @param string $name Name of an actor.
* @param string $name Name of a place.
*
* @return string
*/
@ -175,7 +175,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
/**
* Cleans and consolidates name parts appearing regularly in Hungarian place names.
*
* @param string $name Name of an actor.
* @param string $name Name of a place.
*
* @return string
*/
@ -218,6 +218,131 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
}
/**
 * Rewrites a Ukrainian language name based on abbreviations explaining the
 * hierarchy of named places.
 *
 * The name is split on commas. Each part is assigned to a hierarchy level
 * (country, state, oblast, region, city, district, street) based on the
 * abbreviation it starts or ends with, or - if it carries no abbreviation -
 * by looking it up in the lists of known country names. The most specific
 * level found becomes the main name; all broader levels follow in brackets,
 * ordered from most specific to least specific.
 *
 * If any part cannot be assigned to a level, the name is returned unchanged.
 *
 * @param string $name Input name to rewrite.
 *
 * @throws Exception If two parts of the name resolve to the same hierarchy level.
 *
 * @return string
 */
private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

    $identifiersByLevel = [
        'state' => [' РСР', 'РСР ', ' губернія', 'губернія '],
        'oblast' => ['обл.'],
        'region' => ['р-н'],
        'city' => ['м.'],
        'district' => [], // Is also р-н; which it is is determined based on position
        'street' => ['вул. '],
    ];

    // Ordered from the broadest to the most specific level.
    $levels = [
        'country' => '',
        'state' => '',
        'oblast' => '',
        'region' => '',
        'city' => '',
        'district' => '',
        'street' => '',
    ];

    // Loaded lazily: only needed once a part without any abbreviation shows up.
    $countryNames = null;

    foreach (explode(',', $name) as $part) {

        $part = trim($part);

        foreach ($identifiersByLevel as $level => $identifiers) {
            foreach ($identifiers as $identifier) {

                if (!str_starts_with($part, $identifier) && !str_ends_with($part, $identifier)) {
                    continue;
                }

                // Special case: Region can both be rajon or a district within a city
                // If both oblast and city are already known, the region will be a
                // district within the city.
                // Otherwise, it is to be assumed that it is a super-city region.
                if ($level === 'region' && $levels['oblast'] !== '' && $levels['city'] !== '') {
                    $level = 'district';
                }

                if ($levels[$level] !== '') {
                    throw new Exception("Used the same level (" . $level . ") twice");
                }

                $levels[$level] = $part;
                continue 3;

            }
        }

        // Unspecified part level: Attempt identifying country
        if ($countryNames === null) {
            // array_merge() over re-indexed lists is used instead of the array
            // union operator: "+" silently drops entries of the second array
            // whose keys already exist in the first one.
            $countryNames = array_merge(
                array_values(self::_loadJsonList(__DIR__ . "/../static/countries.uk.json")),
                array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json")),
                ['СРСР', 'УНР', 'Російська імперія', 'Рос.імперія', 'Рос.имперія'],
            );
        }

        if (in_array($part, $countryNames, true)) {
            $levels['country'] = $part;
            continue;
        }

        // Unspecified level: the hierarchy cannot be determined reliably,
        // so the input is returned as is.
        return $name;

    }

    $main_name = '';
    $specifiers = [];

    // Walk from the most specific level to the broadest one.
    foreach (array_reverse($levels) as $level => $partname) {

        if ($partname === '') continue;

        if ($level === 'city') {
            // Strip the city marker only where it was detected (start or end
            // of the part), so that a "м." inside the actual name (e.g. as
            // part of "ім.") stays intact.
            foreach ($identifiersByLevel[$level] as $identifier) {
                if (str_starts_with($partname, $identifier)) {
                    $partname = substr($partname, strlen($identifier));
                }
                else if (str_ends_with($partname, $identifier)) {
                    $partname = substr($partname, 0, -strlen($identifier));
                }
            }
            $partname = trim($partname);
            if ($partname === '') continue;
        }

        if ($main_name === '') {
            $main_name = $partname;
        }
        else {
            $specifiers[] = $partname;
        }

    }

    $output = $main_name;
    if ($specifiers !== []) $output .= ' (' . implode(', ', $specifiers) . ')';

    return $output;

}
/**
 * Consolidates recurring name parts of Ukrainian place names.
 *
 * Names containing a comma are handed over to the hierarchy-based rewriting;
 * any other name is passed through untouched.
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_ukrainian_abbreviations(string $name):string {

    // Without a comma there is no hierarchy to rewrite.
    if (!str_contains($name, ',')) {
        return $name;
    }

    return self::_rewrite_ukrainian_names_by_hierarchy($name);

}
/**
* Loads a JSON file, optionally loading it cached through a private static variable
* if reuse is expectable (= in the case of CLI usage).
@ -361,6 +486,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
$ort_name = match ($lang) {
'de' => self::_clean_german_abbreviations($ort_name),
'hu' => self::_clean_hungarian_abbreviations($ort_name),
'uk' => self::_clean_ukrainian_abbreviations($ort_name),
default => $ort_name,
};

View File

@ -71,5 +71,24 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden"));
//
// Ukrainian: Rewrite by hierarchy
//
# self::assertEquals("Приморський (Україна, Запорізька обл., м. Запоріжжя, р-н", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н"));
self::assertEquals("Рига (Латвійська РСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Латвійська РСР, м.Рига"));
self::assertEquals("Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Латвійська РСР, м.Рига"));
self::assertEquals("Рига (Латвійська РСР, СРСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "СРСР, Латвійська РСР, м.Рига"));
self::assertEquals("СРСР, Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "СРСР, Латвійська РСР, м.Рига"));
// Street
self::assertEquals("вул. Шевченка (Рівне)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "м. Рівне, вул. Шевченка"));
self::assertEquals("м. Рівне, вул. Шевченка", NodaConsolidatedNamesForPlaces::consolidate_name("de", "м. Рівне, вул. Шевченка"));
// Special case: Region can both be rajon or a district within a city
// If both oblast and city are already known, the region will be a
// district within the city.
// Otherwise, it is to be assumed that it is a super-city region.
self::assertEquals("Приморський р-н (Запоріжжя, Запорізька обл., Україна)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р"));
}
}