Add blacklist for unwanted rewrites in consolidating place names
This commit is contained in:
parent
e610723107
commit
b36a504277
|
@ -53,9 +53,11 @@ function getNames(array $data):array {
|
|||
// Q6256 => country
|
||||
|
||||
$targets = [
|
||||
/*
|
||||
'Q6256' => 'countries',
|
||||
'Q3024240' => 'historical_countries',
|
||||
'Q10864048' => 'first_lvl_administrative_units',
|
||||
*/
|
||||
];
|
||||
|
||||
$langs = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh'];
|
||||
|
@ -69,3 +71,31 @@ foreach ($langs as $lang) {
|
|||
|
||||
}
|
||||
}
|
||||
|
||||
// The following should be lists of terms that are independent of language:
// the names fetched for every language are merged into ONE list per target
// and written to a single file without a language suffix.
$targetsForMerge = [
    'Q23718' => 'cardinal_directions',
];

// Output filename => flat list of all names collected so far (duplicates
// are removed only once, right before writing).
$mergedValues = [];

foreach ($langs as $lang) {
    foreach ($targetsForMerge as $qid => $filename) {

        // Lazily create the bucket the first time a target file is seen.
        $mergedValues[$filename] ??= [];

        $mergedValues[$filename] = array_merge($mergedValues[$filename], getNames(query($lang, $qid)));
        echo "Fetched $lang : $filename ($qid)" . PHP_EOL;

    }
}

// Manually added variants. NOTE(review): presumably these are the German
// short forms missing from the fetched data (the generated list contains
// "Norden", "Osten", ... but not these) -- confirm.
foreach (['Nord', 'Ost', 'West', 'Süd'] as $additionalDirection) {
    $mergedValues['cardinal_directions'][] = $additionalDirection;
}

foreach ($mergedValues as $filename => $values) {

    $outputFile = __DIR__ . '/../static/' . $filename . '.json';

    // array_unique() keeps the first occurrence but leaves gaps in the keys;
    // array_values() re-indexes so that json_encode() emits a JSON list
    // instead of an object. JSON_THROW_ON_ERROR turns an encoding failure
    // into a JsonException instead of silently writing an empty file.
    $json = json_encode(array_values(array_unique($values)), JSON_THROW_ON_ERROR);

    // file_put_contents() returns false on failure; do not let a failed write
    // pass unnoticed, as the consolidation code would keep using a stale list.
    if (file_put_contents($outputFile, $json) === false) {
        throw new \RuntimeException("Failed to write $outputFile");
    }

}
|
||||
|
|
|
@ -21,6 +21,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
"Unknown" => "",
|
||||
];
|
||||
|
||||
/** Blacklist for comparison with country names */
|
||||
private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [
|
||||
'District',
|
||||
'Distrikt',
|
||||
'India',
|
||||
'Indien',
|
||||
'Insel',
|
||||
'Inseln',
|
||||
'Tal',
|
||||
'Yue',
|
||||
];
|
||||
|
||||
private const _PLACE_TYPE_INDICATORS_GERMAN = [
|
||||
'Insel',
|
||||
'Stadt',
|
||||
|
@ -31,14 +43,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
|
||||
'gasse',
|
||||
'straße',
|
||||
' Straße',
|
||||
];
|
||||
|
||||
// Indicators signifying that a place is likely subordinate to the other
|
||||
// if two places are provided in a comma-separated list
|
||||
private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
|
||||
'körut',
|
||||
'utca',
|
||||
'út',
|
||||
' körut ',
|
||||
' utca ',
|
||||
' út ',
|
||||
];
|
||||
|
||||
private const _RELEVANT_ROMAN_NUMERALS = [
|
||||
|
@ -87,8 +100,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
&& substr_count($name, $separator) === 1
|
||||
&& !str_contains($name, "(")
|
||||
) {
|
||||
|
||||
$parts = explode(', ', $name);
|
||||
|
||||
// Skip entries like "Vaci utca 12 Budapest, Vaci utca"
|
||||
$indicatorTrimmed = trim($indicator);
|
||||
if (
|
||||
(str_ends_with($parts[0], $indicatorTrimmed) && str_contains($parts[1], $indicatorTrimmed))
|
||||
|| (str_ends_with($parts[1], $indicatorTrimmed) && str_contains($parts[0], $indicatorTrimmed))
|
||||
) {
|
||||
return $name;
|
||||
}
|
||||
|
||||
// Prevent errors in case of "Adalbrechtstraße 12, "
|
||||
if (!empty($parts[0]) && !empty($parts[1])) {
|
||||
|
||||
|
@ -164,6 +187,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) {
|
||||
$name = str_replace(" u. ", " utca ", $name);
|
||||
}
|
||||
if (str_contains($name, " ucca ") && \preg_match("/\ ucca\ [0-9]/", $name)) {
|
||||
$name = str_replace(" ucca ", " utca ", $name);
|
||||
}
|
||||
if (str_contains($name, " utcza ") && \preg_match("/\ utcza\ [0-9]/", $name)) {
|
||||
$name = str_replace(" utcza ", " utca ", $name);
|
||||
}
|
||||
if (str_contains($name, " rkp. ") && \preg_match("/\ rkp\.\ [0-9]/", $name)) {
|
||||
$name = str_replace(" rkp. ", " rakpart ", $name);
|
||||
}
|
||||
|
||||
// "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin)
|
||||
|
||||
|
@ -200,7 +232,14 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
return self::$_placeNameListCaches[$filename];
|
||||
}
|
||||
|
||||
try {
|
||||
$output = json_decode(MD_STD::file_get_contents($filename), true);
|
||||
}
|
||||
catch (MDFileDoesNotExist $e) {
|
||||
self::$_placeNameListCaches[$filename] = [];
|
||||
return [];
|
||||
}
|
||||
|
||||
if ($output === false) {
|
||||
throw new Exception("Failed to get list");
|
||||
}
|
||||
|
@ -238,10 +277,31 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
|
||||
// Load place names
|
||||
$countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json");
|
||||
$cardinal_directions = self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");
|
||||
|
||||
$part0IsCountry = in_array($parts[0], $countryNames, true);
|
||||
$part1IsCountry = in_array($parts[1], $countryNames, true);
|
||||
|
||||
// Skip if the full name is in the list of country names
|
||||
if (in_array($name, $countryNames, true)) {
|
||||
return $name;
|
||||
}
|
||||
|
||||
// If one of the parts is a blacklisted term or a cardinal direction, skip this
|
||||
|
||||
if (
|
||||
(in_array($parts[0], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
|
||||
|| in_array($parts[0], $cardinal_directions, true)
|
||||
|| in_array(strtolower($parts[0]), $cardinal_directions, true)
|
||||
)
|
||||
|| (in_array($parts[1], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
|
||||
|| in_array($parts[1], $cardinal_directions, true)
|
||||
|| in_array(strtolower($parts[1]), $cardinal_directions, true)
|
||||
)
|
||||
) {
|
||||
return $name;
|
||||
}
|
||||
|
||||
if ($part0IsCountry === true && $part1IsCountry === false) {
|
||||
return $parts[1] . ' (' . $parts[0] . ')';
|
||||
}
|
||||
|
@ -255,6 +315,25 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
|
||||
}
|
||||
|
||||
/**
 * Drops repeated segments from a comma-separated place name, keeping the
 * first occurrence of each segment and the original order.
 *
 * Segments are delimited by ", " (comma followed by a space) and compared
 * as exact strings. Names without any comma are returned untouched.
 *
 * Example: "Västerdås, Schweden, Schweden" > "Västerdås, Schweden"
 *
 * @param string $ort_name Place name to clean.
 *
 * @return string
 */
private static function _remove_duplicates_after_commas(string $ort_name):string {

    // Nothing to deduplicate if there is no comma at all.
    if (!str_contains($ort_name, ',')) {
        return $ort_name;
    }

    $delimiter = ', ';

    // Collect each segment only the first time it is seen.
    $uniqueSegments = [];
    foreach (explode($delimiter, $ort_name) as $segment) {
        if (in_array($segment, $uniqueSegments, true)) {
            continue;
        }
        $uniqueSegments[] = $segment;
    }

    return implode($delimiter, $uniqueSegments);

}
|
||||
|
||||
/**
|
||||
* Cleans a place name by trimming etc. Also removes uncertainty indicators.
|
||||
*
|
||||
|
@ -267,10 +346,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
|
|||
|
||||
// Run basic replacements
|
||||
$nameSanitizations = self::_NAME_SANITIZATIONS;
|
||||
if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-";
|
||||
/*
|
||||
if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) {
|
||||
$nameSanitizations["/"] = "-";
|
||||
}
|
||||
*/
|
||||
$ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations);
|
||||
$ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name));
|
||||
|
||||
// Remove duplicates after commas
|
||||
// Västerdås, Schweden, Schweden > Västerdås, Schweden
|
||||
$ort_name = self::_remove_duplicates_after_commas($ort_name);
|
||||
|
||||
$ort_name = match ($lang) {
|
||||
'de' => self::_clean_german_abbreviations($ort_name),
|
||||
'hu' => self::_clean_hungarian_abbreviations($ort_name),
|
||||
|
|
1
static/cardinal_directions.json
Normal file
1
static/cardinal_directions.json
Normal file
|
@ -0,0 +1 @@
|
|||
["\u0634\u0645\u0627\u0644","\u062c\u0646\u0648\u0628","\u063a\u0631\u0628","\u0634\u0631\u0642","\u0441\u0435\u0432\u0435\u0440","\u042e\u0433","\u0417\u0430\u043f\u0430\u0434","\u0418\u0437\u0442\u043e\u043a","\u0989\u09a4\u09cd\u09a4\u09b0","\u09a6\u0995\u09cd\u09b7\u09bf\u09a3","\u09aa\u09b6\u09cd\u099a\u09bf\u09ae","\u09aa\u09c2\u09b0\u09cd\u09ac","sever","jih","z\u00e1pad","v\u00fdchod","nord","syd","vest","\u00f8st","Norden","S\u00fcden","Westen","Osten","\u03b2\u03bf\u03c1\u03c1\u03ac\u03c2","\u03bd\u03cc\u03c4\u03bf\u03c2","\u03b4\u03cd\u03c3\u03b7","\u03b1\u03bd\u03b1\u03c4\u03bf\u03bb\u03ae","north","south","west","east","norte","sur","oeste","este","\u0628\u0627\u062e\u062a\u0631","\u062e\u0627\u0648\u0631","pohjoinen","etel\u00e4","l\u00e4nsi","it\u00e4","sud","ouest","est","Arewa","Kudu","Yamma","Gabas","\u05e6\u05e4\u05d5\u05df","\u05d3\u05e8\u05d5\u05dd","\u05de\u05e2\u05e8\u05d1","\u05de\u05d6\u05e8\u05d7","\u0909\u0924\u094d\u0924\u0930","\u0926\u0915\u094d\u0937\u093f\u0923","\u092a\u0936\u094d\u091a\u093f\u092e","\u092a\u0942\u0930\u094d\u0935","\u00e9szak","d\u00e9l","nyugat","kelet","utara","selatan","barat","Timur","ovest","\u5317","\u5357","\u897f","\u6771","\u10e9\u10e0\u10d3\u10d8\u10da\u10dd\u10d4\u10d7\u10d8","\u10e1\u10d0\u10db\u10ee\u10e0\u10d4\u10d7\u10d8","\u10d3\u10d0\u10e1\u10d0\u10d5\u10da\u10d4\u10d7\u10d8","\u10d0\u10e6\u10db\u10dd\u10e1\u10d0\u10d5\u10da\u10d4\u10d7\u10d8","\ubd81\ucabd","\ub0a8\ucabd","\uc11c\ucabd","\ub3d9\ucabd","noord","zuid","oost","p\u00f3\u0142noc","po\u0142udnie","zach\u00f3d","wsch\u00f3d","sul","leste","Sud","Vest","Est","\u044e\u0433","\u0437\u0430\u043f\u0430\u0434","\u0432\u043e\u0441\u0442\u043e\u043a","norr","s\u00f6der","v\u00e4ster","\u00f6ster","kaskazini","Kusini","Magharibi","Mashariki","\u0bb5\u0b9f\u0b95\u0bcd\u0b95\u0bc1","\u0ba4\u0bc6\u0bb1\u0bcd\u0b95\u0bc1","\u0bae\u0bc7\u0bb1\u0bcd\u0b95\u0bc1","\u0b95\u0bbf\u0bb4\u0b95\u0bcd\u0b95\u0bc1","\u0e17\u0e34\u0e28\u0e40\u0e2b\u0e19\u0e37\u0e
2d","\u0e17\u0e34\u0e28\u0e43\u0e15\u0e49","\u0e17\u0e34\u0e28\u0e15\u0e30\u0e27\u0e31\u0e19\u0e15\u0e01","\u0e17\u0e34\u0e28\u0e15\u0e30\u0e27\u0e31\u0e19\u0e2d\u0e2d\u0e01","Hilaga","Timog","kanluran","kuzey","g\u00fcney","bat\u0131","do\u011fu","\u043f\u0456\u0432\u043d\u0456\u0447","\u043f\u0456\u0432\u0434\u0435\u043d\u044c","\u0437\u0430\u0445\u0456\u0434","\u0441\u0445\u0456\u0434","\u0645\u063a\u0631\u0628-\u0633\u0645\u062a","\u0645\u0634\u0631\u0642","h\u01b0\u1edbng b\u1eafc","h\u01b0\u1edbng nam","h\u01b0\u1edbng t\u00e2y","h\u01b0\u1edbng \u0111\u00f4ng","Nord","Ost","West","S\u00fcd"]
|
|
@ -43,6 +43,9 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
|
|||
self::assertEquals("Adalbrechtstraße 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Adalbrechtstr. 12, Berlin"));
|
||||
self::assertEquals("Berlin, Adalbrechtstr. 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrechtstr. 12"));
|
||||
|
||||
self::assertEquals("Ferenc József rakpart 21. Budapest", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Ferenc József rkp. 21. Budapest"));
|
||||
self::assertEquals("Ferenc József rkp. 21. Budapest", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Ferenc József rkp. 21. Budapest"));
|
||||
|
||||
// Same in Hungarian
|
||||
self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrecht utca 12"));
|
||||
self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Adalbrecht utca 12, Berlin"));
|
||||
|
@ -59,5 +62,14 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase {
|
|||
self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Deutschland, Köln"));
|
||||
self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Köln, Deutschland"));
|
||||
|
||||
self::assertEquals("Yue-Öfen", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Yue-Öfen"));
|
||||
self::assertEquals("Transkei-Distrikt", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Transkei-Distrikt"));
|
||||
self::assertEquals("Ost-Deutschland", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Ost-Deutschland"));
|
||||
|
||||
self::assertEquals("Vaci utca 12 Budapest, Vaci utca", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Vaci utca 12 Budapest, Vaci utca"));
|
||||
self::assertEquals("Gambia-Tal", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Gambia-Tal"));
|
||||
|
||||
self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden"));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user