Add blacklist for unwanted rewrites in consolidating place names

This commit is contained in:
Joshua Ramon Enslin 2023-11-26 23:55:22 +01:00
parent e610723107
commit b36a504277
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
4 changed files with 135 additions and 5 deletions

View File

@ -53,9 +53,11 @@ function getNames(array $data):array {
// Q6256 => country // Q6256 => country
$targets = [ $targets = [
/*
'Q6256' => 'countries', 'Q6256' => 'countries',
'Q3024240' => 'historical_countries', 'Q3024240' => 'historical_countries',
'Q10864048' => 'first_lvl_administrative_units', 'Q10864048' => 'first_lvl_administrative_units',
*/
]; ];
$langs = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh']; $langs = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh'];
@ -69,3 +71,31 @@ foreach ($langs as $lang) {
} }
} }
// Language-independent term lists: the names fetched for every language are
// merged into a single file per target instead of one file per language.
$targetsForMerge = [
    'Q23718' => 'cardinal_directions',
];

$mergedValues = [];
foreach ($langs as $lang) {
    foreach ($targetsForMerge as $qid => $filename) {
        // Start each target with an empty list the first time it is seen.
        $mergedValues[$filename] ??= [];
        $mergedValues[$filename] = array_merge($mergedValues[$filename], getNames(query($lang, $qid)));
        echo "Fetched $lang : $filename ($qid)" . PHP_EOL;
    }
}

// Manually appended short forms, added on top of the fetched names.
foreach (['Nord', 'Ost', 'West', 'Süd'] as $manualTerm) {
    $mergedValues['cardinal_directions'][] = $manualTerm;
}

// Write one de-duplicated, re-indexed JSON list per target.
foreach ($mergedValues as $filename => $values) {
    $targetFile = __DIR__ . '/../static/' . $filename . '.json';

    // JSON_THROW_ON_ERROR: without it a failed encode returns false, and
    // file_put_contents() would silently write an empty file.
    $json = json_encode(array_values(array_unique($values)), JSON_THROW_ON_ERROR);

    if (file_put_contents($targetFile, $json) === false) {
        throw new \RuntimeException("Failed to write $targetFile");
    }
}

View File

@ -21,6 +21,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
"Unknown" => "", "Unknown" => "",
]; ];
/**
 * Blacklist for comparison with country names.
 *
 * Where this constant is used, a two-part name is returned unchanged as soon
 * as one of its parts is strictly (case-sensitively) equal to an entry of
 * this list, so no country-based rewrite takes place for it.
 * NOTE(review): presumably these are terms that would otherwise be mistaken
 * for country names - confirm against the country name lists.
 */
private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [
'District',
'Distrikt',
'India',
'Indien',
'Insel',
'Inseln',
'Tal',
'Yue',
];
private const _PLACE_TYPE_INDICATORS_GERMAN = [ private const _PLACE_TYPE_INDICATORS_GERMAN = [
'Insel', 'Insel',
'Stadt', 'Stadt',
@ -31,14 +43,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [ private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
'gasse', 'gasse',
'straße', 'straße',
' Straße',
]; ];
// Indicators signifying that a place is likely subordinate to the other // Indicators signifying that a place is likely subordinate to the other
// if two places are provided in a comma-separated list // if two places are provided in a comma-separated list
private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [ private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
'körut', ' körut ',
'utca', ' utca ',
'út', ' út ',
]; ];
private const _RELEVANT_ROMAN_NUMERALS = [ private const _RELEVANT_ROMAN_NUMERALS = [
@ -87,8 +100,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
&& substr_count($name, $separator) === 1 && substr_count($name, $separator) === 1
&& !str_contains($name, "(") && !str_contains($name, "(")
) { ) {
$parts = explode(', ', $name); $parts = explode(', ', $name);
// Skip entries like "Vaci utca 12 Budapest, Vaci utca"
$indicatorTrimmed = trim($indicator);
if (
(str_ends_with($parts[0], $indicatorTrimmed) && str_contains($parts[1], $indicatorTrimmed))
|| (str_ends_with($parts[1], $indicatorTrimmed) && str_contains($parts[0], $indicatorTrimmed))
) {
return $name;
}
// Prevent errors in case of "Adalbrechtstraße 12, " // Prevent errors in case of "Adalbrechtstraße 12, "
if (!empty($parts[0]) && !empty($parts[1])) { if (!empty($parts[0]) && !empty($parts[1])) {
@ -164,6 +187,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) { if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) {
$name = str_replace(" u. ", " utca ", $name); $name = str_replace(" u. ", " utca ", $name);
} }
if (str_contains($name, " ucca ") && \preg_match("/\ ucca\ [0-9]/", $name)) {
$name = str_replace(" ucca ", " utca ", $name);
}
if (str_contains($name, " utcza ") && \preg_match("/\ utcza\ [0-9]/", $name)) {
$name = str_replace(" utcza ", " utca ", $name);
}
if (str_contains($name, " rkp. ") && \preg_match("/\ rkp\.\ [0-9]/", $name)) {
$name = str_replace(" rkp. ", " rakpart ", $name);
}
// "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin) // "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin)
@ -200,7 +232,14 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
return self::$_placeNameListCaches[$filename]; return self::$_placeNameListCaches[$filename];
} }
try {
$output = json_decode(MD_STD::file_get_contents($filename), true); $output = json_decode(MD_STD::file_get_contents($filename), true);
}
catch (MDFileDoesNotExist $e) {
self::$_placeNameListCaches[$filename] = [];
return [];
}
if ($output === false) { if ($output === false) {
throw new Exception("Failed to get list"); throw new Exception("Failed to get list");
} }
@ -238,10 +277,31 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
// Load place names // Load place names
$countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json"); $countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json");
$cardinal_directions = self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");
$part0IsCountry = in_array($parts[0], $countryNames, true); $part0IsCountry = in_array($parts[0], $countryNames, true);
$part1IsCountry = in_array($parts[1], $countryNames, true); $part1IsCountry = in_array($parts[1], $countryNames, true);
// Skip if the full name is in the list of country names
if (in_array($name, $countryNames, true)) {
return $name;
}
// If one of the parts is a blacklisted term or a cardinal direction, skip this
if (
(in_array($parts[0], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
|| in_array($parts[0], $cardinal_directions, true)
|| in_array(strtolower($parts[0]), $cardinal_directions, true)
)
|| (in_array($parts[1], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
|| in_array($parts[1], $cardinal_directions, true)
|| in_array(strtolower($parts[1]), $cardinal_directions, true)
)
) {
return $name;
}
if ($part0IsCountry === true && $part1IsCountry === false) { if ($part0IsCountry === true && $part1IsCountry === false) {
return $parts[1] . ' (' . $parts[0] . ')'; return $parts[1] . ' (' . $parts[0] . ')';
} }
@ -255,6 +315,25 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
} }
/**
 * Drops repeated segments from a comma-separated place name.
 *
 * The name is split at ", "; each segment is kept only the first time it
 * occurs, and the remaining segments are joined with ", " again.
 * Names that contain no comma at all are returned as they are.
 *
 * @param string $ort_name Place name to clean.
 *
 * @return string
 */
private static function _remove_duplicates_after_commas(string $ort_name):string {

    if (str_contains($ort_name, ',')) {
        $uniqueSegments = array_unique(explode(', ', $ort_name));
        return implode(', ', $uniqueSegments);
    }

    return $ort_name;

}
/** /**
* Cleans a place name by trimming etc. Also removes uncertainty indicators. * Cleans a place name by trimming etc. Also removes uncertainty indicators.
* *
@ -267,10 +346,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
// Run basic replacements // Run basic replacements
$nameSanitizations = self::_NAME_SANITIZATIONS; $nameSanitizations = self::_NAME_SANITIZATIONS;
if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-"; /*
if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) {
$nameSanitizations["/"] = "-";
}
*/
$ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations); $ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations);
$ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name)); $ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name));
// Remove duplicates after commas
// Västerdås, Schweden, Schweden > Västerdås, Schweden
$ort_name = self::_remove_duplicates_after_commas($ort_name);
$ort_name = match ($lang) { $ort_name = match ($lang) {
'de' => self::_clean_german_abbreviations($ort_name), 'de' => self::_clean_german_abbreviations($ort_name),
'hu' => self::_clean_hungarian_abbreviations($ort_name), 'hu' => self::_clean_hungarian_abbreviations($ort_name),

View File

@ -0,0 +1 @@
["\u0634\u0645\u0627\u0644","\u062c\u0646\u0648\u0628","\u063a\u0631\u0628","\u0634\u0631\u0642","\u0441\u0435\u0432\u0435\u0440","\u042e\u0433","\u0417\u0430\u043f\u0430\u0434","\u0418\u0437\u0442\u043e\u043a","\u0989\u09a4\u09cd\u09a4\u09b0","\u09a6\u0995\u09cd\u09b7\u09bf\u09a3","\u09aa\u09b6\u09cd\u099a\u09bf\u09ae","\u09aa\u09c2\u09b0\u09cd\u09ac","sever","jih","z\u00e1pad","v\u00fdchod","nord","syd","vest","\u00f8st","Norden","S\u00fcden","Westen","Osten","\u03b2\u03bf\u03c1\u03c1\u03ac\u03c2","\u03bd\u03cc\u03c4\u03bf\u03c2","\u03b4\u03cd\u03c3\u03b7","\u03b1\u03bd\u03b1\u03c4\u03bf\u03bb\u03ae","north","south","west","east","norte","sur","oeste","este","\u0628\u0627\u062e\u062a\u0631","\u062e\u0627\u0648\u0631","pohjoinen","etel\u00e4","l\u00e4nsi","it\u00e4","sud","ouest","est","Arewa","Kudu","Yamma","Gabas","\u05e6\u05e4\u05d5\u05df","\u05d3\u05e8\u05d5\u05dd","\u05de\u05e2\u05e8\u05d1","\u05de\u05d6\u05e8\u05d7","\u0909\u0924\u094d\u0924\u0930","\u0926\u0915\u094d\u0937\u093f\u0923","\u092a\u0936\u094d\u091a\u093f\u092e","\u092a\u0942\u0930\u094d\u0935","\u00e9szak","d\u00e9l","nyugat","kelet","utara","selatan","barat","Timur","ovest","\u5317","\u5357","\u897f","\u6771","\u10e9\u10e0\u10d3\u10d8\u10da\u10dd\u10d4\u10d7\u10d8","\u10e1\u10d0\u10db\u10ee\u10e0\u10d4\u10d7\u10d8","\u10d3\u10d0\u10e1\u10d0\u10d5\u10da\u10d4\u10d7\u10d8","\u10d0\u10e6\u10db\u10dd\u10e1\u10d0\u10d5\u10da\u10d4\u10d7\u10d8","\ubd81\ucabd","\ub0a8\ucabd","\uc11c\ucabd","\ub3d9\ucabd","noord","zuid","oost","p\u00f3\u0142noc","po\u0142udnie","zach\u00f3d","wsch\u00f3d","sul","leste","Sud","Vest","Est","\u044e\u0433","\u0437\u0430\u043f\u0430\u0434","\u0432\u043e\u0441\u0442\u043e\u043a","norr","s\u00f6der","v\u00e4ster","\u00f6ster","kaskazini","Kusini","Magharibi","Mashariki","\u0bb5\u0b9f\u0b95\u0bcd\u0b95\u0bc1","\u0ba4\u0bc6\u0bb1\u0bcd\u0b95\u0bc1","\u0bae\u0bc7\u0bb1\u0bcd\u0b95\u0bc1","\u0b95\u0bbf\u0bb4\u0b95\u0bcd\u0b95\u0bc1","\u0e17\u0e34\u0e28\u0e40\u0e2b\u0e19\u0e37\u0e
2d","\u0e17\u0e34\u0e28\u0e43\u0e15\u0e49","\u0e17\u0e34\u0e28\u0e15\u0e30\u0e27\u0e31\u0e19\u0e15\u0e01","\u0e17\u0e34\u0e28\u0e15\u0e30\u0e27\u0e31\u0e19\u0e2d\u0e2d\u0e01","Hilaga","Timog","kanluran","kuzey","g\u00fcney","bat\u0131","do\u011fu","\u043f\u0456\u0432\u043d\u0456\u0447","\u043f\u0456\u0432\u0434\u0435\u043d\u044c","\u0437\u0430\u0445\u0456\u0434","\u0441\u0445\u0456\u0434","\u0645\u063a\u0631\u0628-\u0633\u0645\u062a","\u0645\u0634\u0631\u0642","h\u01b0\u1edbng b\u1eafc","h\u01b0\u1edbng nam","h\u01b0\u1edbng t\u00e2y","h\u01b0\u1edbng \u0111\u00f4ng","Nord","Ost","West","S\u00fcd"]

View File

@ -43,6 +43,9 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
self::assertEquals("Adalbrechtstraße 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Adalbrechtstr. 12, Berlin")); self::assertEquals("Adalbrechtstraße 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Adalbrechtstr. 12, Berlin"));
self::assertEquals("Berlin, Adalbrechtstr. 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrechtstr. 12")); self::assertEquals("Berlin, Adalbrechtstr. 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrechtstr. 12"));
self::assertEquals("Ferenc József rakpart 21. Budapest", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Ferenc József rkp. 21. Budapest"));
self::assertEquals("Ferenc József rkp. 21. Budapest", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Ferenc József rkp. 21. Budapest"));
// Same in Hungarian // Same in Hungarian
self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrecht utca 12")); self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrecht utca 12"));
self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Adalbrecht utca 12, Berlin")); self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Adalbrecht utca 12, Berlin"));
@ -59,5 +62,14 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Deutschland, Köln")); self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Deutschland, Köln"));
self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Köln, Deutschland")); self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Köln, Deutschland"));
self::assertEquals("Yue-Öfen", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Yue-Öfen"));
self::assertEquals("Transkei-Distrikt", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Transkei-Distrikt"));
self::assertEquals("Ost-Deutschland", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Ost-Deutschland"));
self::assertEquals("Vaci utca 12 Budapest, Vaci utca", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Vaci utca 12 Budapest, Vaci utca"));
self::assertEquals("Gambia-Tal", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Gambia-Tal"));
self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden"));
} }
} }