Add rewriting for Ukrainian place names based on specified hierarchies
This commit is contained in:
parent
b4c941f441
commit
2badc67405
|
@ -144,7 +144,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||||
/**
|
/**
|
||||||
* Cleans and consolidates name parts appearing regularly in German place names.
|
* Cleans and consolidates name parts appearing regularly in German place names.
|
||||||
*
|
*
|
||||||
* @param string $name Name of an actor.
|
* @param string $name Name of a place.
|
||||||
*
|
*
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
|
@ -175,7 +175,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||||
/**
|
/**
|
||||||
* Cleans and consolidates name parts appearing regularly in Hungarian place names.
|
* Cleans and consolidates name parts appearing regularly in Hungarian place names.
|
||||||
*
|
*
|
||||||
* @param string $name Name of an actor.
|
* @param string $name Name of a place.
|
||||||
*
|
*
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
|
@ -218,6 +218,131 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Rewrites a Ukrainian language name based on abbreviations explaining the
 * hierarchy of named places.
 *
 * The input is split at commas and each part is assigned to a hierarchy level
 * (country, state, oblast, region, city, district, street), either by a known
 * prefix / suffix or - failing that - by being a known country name.
 * The most specific level found becomes the main name; all broader levels
 * follow in parentheses, ordered from most to least specific, e.g.
 * "СРСР, Латвійська РСР, м.Рига" => "Рига (Латвійська РСР, СРСР)".
 *
 * If a part can neither be assigned to a level nor be recognized as a country,
 * the input name is returned unchanged.
 *
 * @param string $name Input name to rewrite.
 *
 * @throws Exception If two parts of the name resolve to the same hierarchy level.
 *
 * @return string
 */
private static function _rewrite_ukrainian_names_by_hierarchy(string $name):string {

    $identifiersByLevel = [
        'state'    => [' РСР', 'РСР ', ' губернія', 'губернія '],
        'oblast'   => ['обл.'],
        'region'   => ['р-н'],
        'city'     => ['м.'],
        'district' => [], // Is also р-н; which it is is determined based on position
        'street'   => ['вул. '],
    ];

    // Ordered from the broadest to the most specific level.
    $levels = [
        'country'  => '',
        'state'    => '',
        'oblast'   => '',
        'region'   => '',
        'city'     => '',
        'district' => '',
        'street'   => '',
    ];

    // Loaded lazily: the country lists are only needed once a part carries
    // none of the level identifiers above.
    $countryNames = null;

    foreach (explode(',', $name) as $part) {

        $part = trim($part);

        foreach ($identifiersByLevel as $level => $identifiers) {
            foreach ($identifiers as $identifier) {

                if (!str_starts_with($part, $identifier) && !str_ends_with($part, $identifier)) {
                    continue;
                }

                // Special case: Region can both be rajon or a district within a city
                // If both oblast and city are already known, the region will be a
                // district within the city.
                // Otherwise, it is to be assumed that it is a super-city region.
                if ($level === 'region' && $levels['oblast'] !== '' && $levels['city'] !== '') {
                    $level = 'district';
                }

                if ($levels[$level] !== '') {
                    throw new Exception("Used the same level (" . $level . ") twice");
                }

                $levels[$level] = $part;
                continue 3; // Part is classified; proceed with the next part

            }
        }

        // Unspecified part level: Attempt identifying country
        if ($countryNames === null) {
            // array_merge() over the plain values keeps every entry of both
            // lists. The array union operator (+) would silently drop entries
            // of the second list whose keys already exist in the first one
            // (which is the case for all overlapping indexes if the JSON files
            // contain plain lists).
            $countryNames = array_merge(
                array_values(self::_loadJsonList(__DIR__ . "/../static/countries.uk.json")),
                array_values(self::_loadJsonList(__DIR__ . "/../static/historical_countries.uk.json")),
                [
                    'СРСР',
                    'УНР',
                    'Російська імперія',
                    'Рос.імперія',
                    'Рос.имперія',
                ],
            );
        }

        if (in_array($part, $countryNames, true)) {
            $levels['country'] = $part;
            continue;
        }

        // Unspecified level: the hierarchy cannot be determined reliably,
        // so the name is left untouched.
        return $name;

    }

    $main_name = '';
    $specifiers = [];

    // Walk from the most specific level to the broadest one.
    foreach (array_reverse($levels) as $level => $partname) {

        if ($partname === '') continue;

        // Cities are listed without their identifier ("м. Рівне" => "Рівне").
        if ($level === 'city') {
            $strtr = [];
            foreach ($identifiersByLevel[$level] as $identifier) $strtr[$identifier] = '';
            $partname = trim(strtr($partname, $strtr));

            // Nothing but the identifier was given; there is no name to output.
            if ($partname === '') continue;
        }

        if ($main_name === '') {
            $main_name = $partname;
        }
        else {
            $specifiers[] = $partname;
        }

    }

    $output = $main_name;
    if (!empty($specifiers)) $output .= ' (' . implode(', ', $specifiers) . ')';

    return $output;

}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Consolidates recurring name parts of Ukrainian place names.
 *
 * Names containing a comma are handed over to the hierarchy-based rewrite;
 * any other name is returned as it was passed in.
 *
 * @param string $name Name of a place.
 *
 * @return string
 */
private static function _clean_ukrainian_abbreviations(string $name):string {

    return str_contains($name, ',')
        ? self::_rewrite_ukrainian_names_by_hierarchy($name)
        : $name;

}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Loads a JSON file, optionally loading it cached through a private static variable
|
* Loads a JSON file, optionally loading it cached through a private static variable
|
||||||
* if reuse is expectable (= in the case of CLI usage).
|
* if reuse is expectable (= in the case of CLI usage).
|
||||||
|
@ -361,6 +486,7 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||||
$ort_name = match ($lang) {
|
$ort_name = match ($lang) {
|
||||||
'de' => self::_clean_german_abbreviations($ort_name),
|
'de' => self::_clean_german_abbreviations($ort_name),
|
||||||
'hu' => self::_clean_hungarian_abbreviations($ort_name),
|
'hu' => self::_clean_hungarian_abbreviations($ort_name),
|
||||||
|
'uk' => self::_clean_ukrainian_abbreviations($ort_name),
|
||||||
default => $ort_name,
|
default => $ort_name,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -71,5 +71,24 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
|
||||||
|
|
||||||
self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden"));
|
self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden"));
|
||||||
|
|
||||||
|
//
|
||||||
|
// Ukrainian: Rewrite by hierarchy
|
||||||
|
//
|
||||||
|
# self::assertEquals("Приморський (Україна, Запорізька обл., м. Запоріжжя, р-н", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н"));
|
||||||
|
self::assertEquals("Рига (Латвійська РСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Латвійська РСР, м.Рига"));
|
||||||
|
self::assertEquals("Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Латвійська РСР, м.Рига"));
|
||||||
|
self::assertEquals("Рига (Латвійська РСР, СРСР)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "СРСР, Латвійська РСР, м.Рига"));
|
||||||
|
self::assertEquals("СРСР, Латвійська РСР, м.Рига", NodaConsolidatedNamesForPlaces::consolidate_name("de", "СРСР, Латвійська РСР, м.Рига"));
|
||||||
|
|
||||||
|
// Street
|
||||||
|
self::assertEquals("вул. Шевченка (Рівне)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "м. Рівне, вул. Шевченка"));
|
||||||
|
self::assertEquals("м. Рівне, вул. Шевченка", NodaConsolidatedNamesForPlaces::consolidate_name("de", "м. Рівне, вул. Шевченка"));
|
||||||
|
|
||||||
|
// Special case: Region can both be rajon or a district within a city
|
||||||
|
// If both oblast and city are already known, the region will be a
|
||||||
|
// district within the city.
|
||||||
|
// Otherwise, it is to be assumed that it is a super-city region.
|
||||||
|
self::assertEquals("Приморський р-н (Запоріжжя, Запорізька обл., Україна)", NodaConsolidatedNamesForPlaces::consolidate_name("uk", "Україна, Запорізька обл., м. Запоріжжя, Приморський р-н"));
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user