Add rewriting for Ukrainian place names based on specified hierarchies
This commit is contained in:
parent
b4c941f441
commit
2badc67405
@ -144,7 +144,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
/**
|
||||
* Cleans and consolidates name parts appearing regularly in German place names.
|
||||
*
|
||||
* @param string $name Name of an actor.
|
||||
* @param string $name Name of a place.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
@ -175,7 +175,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
/**
|
||||
* Cleans and consolidates name parts appearing regularly in Hungarian place names.
|
||||
*
|
||||
* @param string $name Name of an actor.
|
||||
* @param string $name Name of a place.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
@ -218,6 +218,131 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
|
||||
}
|
||||
|
||||
/**
 * Rewrites a Ukrainian language name based on abbreviations explaining the
 * hierarchy of named places.
 *
 * The input is split on commas. Each trimmed part is assigned to a hierarchy
 * level, either through a known prefix / suffix (e.g. "обл.", "м.", "вул. ")
 * or, failing that, through an exact match against the list of country names.
 * The most specific level present becomes the main name; all remaining levels
 * follow in brackets, ordered from most to least specific.
 *
 * Example: "м. Рівне, вул. Шевченка" becomes "вул. Шевченка (Рівне)".
 *
 * @param string $name Input name to rewrite.
 *
 * @throws Exception If a part cannot be assigned to any level, or if two
 *                   parts end up on the same (non-country) level.
 *
 * @return string
 */
private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

    // Prefixes / suffixes marking the hierarchy level of a (trimmed) part.
    $identifiersByLevel = [
        'state'    => [' РСР', 'РСР ', ' губернія', 'губернія '],
        'oblast'   => ['обл.'],
        'region'   => ['р-н'],
        'city'     => ['м.'],
        'district' => [], // Is also р-н; which it is is determined based on position
        'street'   => ['вул. '],
    ];

    // Ordered from least to most specific; the order drives the output below.
    $levels = [
        'country'  => '',
        'state'    => '',
        'oblast'   => '',
        'region'   => '',
        'city'     => '',
        'district' => '',
        'street'   => '',
    ];

    // Loaded lazily: only needed once a part carries no level identifier.
    $countryNames = null;

    foreach (explode(',', $name) as $part) {

        $part = trim($part);

        // The first identifier matching at the start or end of the part wins.
        $matchedLevel = null;
        foreach ($identifiersByLevel as $level => $identifiers) {
            foreach ($identifiers as $identifier) {
                if (str_starts_with($part, $identifier) || str_ends_with($part, $identifier)) {
                    $matchedLevel = $level;
                    break 2;
                }
            }
        }

        if ($matchedLevel !== null) {

            // Special case: Region can both be rajon or a district within a city
            // If both oblast and city are already known, the region will be a
            // district within the city.
            // Otherwise, it is to be assumed that it is a super-city region.
            if ($matchedLevel === 'region' && $levels['oblast'] !== '' && $levels['city'] !== '') {
                $matchedLevel = 'district';
            }

            if ($levels[$matchedLevel] !== '') {
                throw new Exception("Used the same level (" . $matchedLevel . ") twice");
            }

            $levels[$matchedLevel] = $part;
            continue;

        }

        // Unspecified part level: Attempt identifying country
        if ($countryNames === null) {
            // Merge by value rather than with the `+` union operator: `+` is
            // key-based and would drop every entry of the second list whose
            // key already exists in the first (i.e. nearly all of it, if both
            // files decode to plain lists).
            $countryNames = array_merge(
                array_values(self::_loadJsonList(__DIR__ . "/../static/countries.uk.json")),
                array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json")),
                ['СРСР', 'УНР', 'Російська імперія', 'Рос.імперія', 'Рос.имперія'],
            );
        }

        if (in_array($part, $countryNames, true)) {
            $levels['country'] = $part;
            continue;
        }

        // Neither a marked level nor a known country: refuse to guess.
        throw new Exception("Unknown " . $part);

    }

    $main_name = '';
    $specifiers = [];

    // Walk from the most specific level (street) to the least specific (country).
    foreach (array_reverse($levels) as $level => $partname) {

        if ($partname === '') continue;

        // Only the city marker is stripped; all other levels keep their identifier.
        if ($level === 'city') {
            $strtr = [];
            foreach ($identifiersByLevel[$level] as $identifier) $strtr[$identifier] = '';
            $partname = trim(strtr($partname, $strtr));
        }

        if ($main_name === '') {
            $main_name = $partname;
        }
        else {
            $specifiers[] = $partname;
        }

    }

    $output = $main_name;
    if (!empty($specifiers)) $output .= ' (' . implode(', ', $specifiers) . ')';

    return $output;

}
|
||||
|
||||
|
||||
/**
 * Cleans and consolidates name parts appearing regularly in Ukrainian place names.
 *
 * Names without a comma are returned unchanged. Names containing a comma are
 * passed on to the hierarchy-based rewriting.
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_ukrainian_abbreviations(string $name):string {

    // Without a comma there is no hierarchy of places to rewrite.
    if (!str_contains($name, ',')) {
        return $name;
    }

    return self::_rewrite_ukrainian_names_by_hierarchy($name);

}
|
||||
|
||||
/**
|
||||
* Loads a JSON file, optionally loading it cached through a private static variable
|
||||
* if reuse is expectable (= in the case of CLI usage).
|
||||
@ -361,6 +486,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
$ort_name = match ($lang) {
|
||||
'de' => self::_clean_german_abbreviations($ort_name),
|
||||
'hu' => self::_clean_hungarian_abbreviations($ort_name),
|
||||
'uk' => self::_clean_ukrainian_abbreviations($ort_name),
|
||||
default => $ort_name,
|
||||
};
|
||||
|
||||
|
@ -71,5 +71,24 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
|
||||
|
||||
self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden"));
|
||||
|
||||
//
|
||||
// Ukrainian: Rewrite by hierarchy
|
||||
//
|
||||
# self::assertEquals("Приморський (Україна, Запорізька обл., м. Запоріжжя, р-н", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н"));
|
||||
self::assertEquals("Рига (Латвійська РСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Латвійська РСР, м.Рига"));
|
||||
self::assertEquals("Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Латвійська РСР, м.Рига"));
|
||||
self::assertEquals("Рига (Латвійська РСР, СРСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "СРСР, Латвійська РСР, м.Рига"));
|
||||
self::assertEquals("СРСР, Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "СРСР, Латвійська РСР, м.Рига"));
|
||||
|
||||
// Street
|
||||
self::assertEquals("вул. Шевченка (Рівне)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "м. Рівне, вул. Шевченка"));
|
||||
self::assertEquals("м. Рівне, вул. Шевченка", NodaConsolidatedNamesForPlaces::consolidate_name("de", "м. Рівне, вул. Шевченка"));
|
||||
|
||||
// Special case: Region can both be rajon or a district within a city
|
||||
// If both oblast and city are already known, the region will be a
|
||||
// district within the city.
|
||||
// Otherwise, it is to be assumed that it is a super-city region.
|
||||
self::assertEquals("Приморський р-н (Запоріжжя, Запорізька обл., Україна)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н"));
|
||||
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user