Add blacklist for unwanted rewrites in consolidating place names
This commit is contained in:
@ -21,6 +21,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
"Unknown" => "",
|
||||
];
|
||||
|
||||
/**
 * Blacklist for comparison with country names.
 *
 * When either of the two name parts is exactly equal to one of these terms
 * (strict, case-sensitive in_array() check), the name is returned unchanged
 * instead of being rewritten into the "Place (Country)" form.
 *
 * NOTE(review): presumably these are generic or ambiguous words that would
 * otherwise trigger an unwanted country-based rewrite; confirm against the
 * country name lists.
 */
|
||||
private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [
|
||||
'District',
|
||||
'Distrikt',
|
||||
'India',
|
||||
'Indien',
|
||||
'Insel',
|
||||
'Inseln',
|
||||
'Tal',
|
||||
'Yue',
|
||||
];
|
||||
|
||||
private const _PLACE_TYPE_INDICATORS_GERMAN = [
|
||||
'Insel',
|
||||
'Stadt',
|
||||
@ -31,14 +43,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
|
||||
'gasse',
|
||||
'straße',
|
||||
' Straße',
|
||||
];
|
||||
|
||||
// Indicators signifying that a place is likely subordinate to the other
|
||||
// if two places are provided in a comma-separated list
|
||||
private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
|
||||
'körut',
|
||||
'utca',
|
||||
'út',
|
||||
' körut ',
|
||||
' utca ',
|
||||
' út ',
|
||||
];
|
||||
|
||||
private const _RELEVANT_ROMAN_NUMERALS = [
|
||||
@ -87,8 +100,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
&& substr_count($name, $separator) === 1
|
||||
&& !str_contains($name, "(")
|
||||
) {
|
||||
|
||||
$parts = explode(', ', $name);
|
||||
|
||||
// Skip entries like "Vaci utca 12 Budapest, Vaci utca"
|
||||
$indicatorTrimmed = trim($indicator);
|
||||
if (
|
||||
(str_ends_with($parts[0], $indicatorTrimmed) && str_contains($parts[1], $indicatorTrimmed))
|
||||
|| (str_ends_with($parts[1], $indicatorTrimmed) && str_contains($parts[0], $indicatorTrimmed))
|
||||
) {
|
||||
return $name;
|
||||
}
|
||||
|
||||
// Prevent errors in case of "Adalbrechtstraße 12, "
|
||||
if (!empty($parts[0]) && !empty($parts[1])) {
|
||||
|
||||
@ -164,6 +187,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) {
|
||||
$name = str_replace(" u. ", " utca ", $name);
|
||||
}
|
||||
if (str_contains($name, " ucca ") && \preg_match("/\ ucca\ [0-9]/", $name)) {
|
||||
$name = str_replace(" ucca ", " utca ", $name);
|
||||
}
|
||||
if (str_contains($name, " utcza ") && \preg_match("/\ utcza\ [0-9]/", $name)) {
|
||||
$name = str_replace(" utcza ", " utca ", $name);
|
||||
}
|
||||
if (str_contains($name, " rkp. ") && \preg_match("/\ rkp\.\ [0-9]/", $name)) {
|
||||
$name = str_replace(" rkp. ", " rakpart ", $name);
|
||||
}
|
||||
|
||||
// "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin)
|
||||
|
||||
@ -200,7 +232,14 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
return self::$_placeNameListCaches[$filename];
|
||||
}
|
||||
|
||||
$output = json_decode(MD_STD::file_get_contents($filename), true);
|
||||
try {
|
||||
$output = json_decode(MD_STD::file_get_contents($filename), true);
|
||||
}
|
||||
catch (MDFileDoesNotExist $e) {
|
||||
self::$_placeNameListCaches[$filename] = [];
|
||||
return [];
|
||||
}
|
||||
|
||||
if ($output === false) {
|
||||
throw new Exception("Failed to get list");
|
||||
}
|
||||
@ -238,10 +277,31 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
|
||||
// Load place names
|
||||
$countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json");
|
||||
$cardinal_directions = self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");
|
||||
|
||||
$part0IsCountry = in_array($parts[0], $countryNames, true);
|
||||
$part1IsCountry = in_array($parts[1], $countryNames, true);
|
||||
|
||||
// Skip if the full name is in the list of country names
|
||||
if (in_array($name, $countryNames, true)) {
|
||||
return $name;
|
||||
}
|
||||
|
||||
// If one of the parts is a blacklisted term or a cardinal direction, skip this
|
||||
|
||||
if (
|
||||
(in_array($parts[0], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
|
||||
|| in_array($parts[0], $cardinal_directions, true)
|
||||
|| in_array(strtolower($parts[0]), $cardinal_directions, true)
|
||||
)
|
||||
|| (in_array($parts[1], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
|
||||
|| in_array($parts[1], $cardinal_directions, true)
|
||||
|| in_array(strtolower($parts[1]), $cardinal_directions, true)
|
||||
)
|
||||
) {
|
||||
return $name;
|
||||
}
|
||||
|
||||
if ($part0IsCountry === true && $part1IsCountry === false) {
|
||||
return $parts[1] . ' (' . $parts[0] . ')';
|
||||
}
|
||||
@ -255,6 +315,25 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes duplicates after commas.
|
||||
*
|
||||
* @param string $ort_name Place name to clean.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
private static function _remove_duplicates_after_commas(string $ort_name):string {

    // Nothing to de-duplicate if the name holds no comma at all.
    if (!str_contains($ort_name, ',')) {
        return $ort_name;
    }

    // Walk the ", "-separated segments in order and keep only the first
    // occurrence of each one (exact string match).
    $keptSegments = [];
    foreach (explode(', ', $ort_name) as $segment) {
        if (in_array($segment, $keptSegments, true)) {
            continue;
        }
        $keptSegments[] = $segment;
    }

    return implode(', ', $keptSegments);

}
|
||||
|
||||
/**
|
||||
* Cleans a place name by trimming etc. Also removes uncertainty indicators.
|
||||
*
|
||||
@ -267,10 +346,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
||||
|
||||
// Run basic replacements
|
||||
$nameSanitizations = self::_NAME_SANITIZATIONS;
|
||||
if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-";
|
||||
/*
|
||||
if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) {
|
||||
$nameSanitizations["/"] = "-";
|
||||
}
|
||||
*/
|
||||
$ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations);
|
||||
$ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name));
|
||||
|
||||
// Remove duplicates after commas
|
||||
// Västerdås, Schweden, Schweden > Västerdås, Schweden
|
||||
$ort_name = self::_remove_duplicates_after_commas($ort_name);
|
||||
|
||||
$ort_name = match ($lang) {
|
||||
'de' => self::_clean_german_abbreviations($ort_name),
|
||||
'hu' => self::_clean_hungarian_abbreviations($ort_name),
|
||||
|
Reference in New Issue
Block a user