diff --git a/scripts/get_wikidata_country_names.php b/scripts/get_wikidata_country_names.php index 32eb286..3c3ba3b 100644 --- a/scripts/get_wikidata_country_names.php +++ b/scripts/get_wikidata_country_names.php @@ -53,9 +53,11 @@ function getNames(array $data):array { // Q6256 => country $targets = [ + /* 'Q6256' => 'countries', 'Q3024240' => 'historical_countries', 'Q10864048' => 'first_lvl_administrative_units', + */ ]; $langs = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh']; @@ -69,3 +71,31 @@ foreach ($langs as $lang) { } } + +// The following should be lists of terms that are independent of language +$targetsForMerge = [ + 'Q23718' => 'cardinal_directions', +]; +$mergedValues = []; +foreach ($langs as $lang) { + foreach ($targetsForMerge as $qid => $filename) { + + if (!isset($mergedValues[$filename])) { + $mergedValues[$filename] = []; + } + $mergedValues[$filename] = array_merge($mergedValues[$filename], getNames(query($lang, $qid))); + echo "Fetched $lang : $filename ($qid)" . PHP_EOL; + + } +} + +$mergedValues['cardinal_directions'][] = 'Nord'; +$mergedValues['cardinal_directions'][] = 'Ost'; +$mergedValues['cardinal_directions'][] = 'West'; +$mergedValues['cardinal_directions'][] = 'Süd'; + +foreach ($mergedValues as $filename => $values) { + + file_put_contents(__DIR__ . '/../static/' . $filename . 
'.json', json_encode(array_values(array_unique($values)))); + +} diff --git a/src/NodaConsolidatedNamesForPlaces.php b/src/NodaConsolidatedNamesForPlaces.php index c9eafb2..de4af37 100644 --- a/src/NodaConsolidatedNamesForPlaces.php +++ b/src/NodaConsolidatedNamesForPlaces.php @@ -21,6 +21,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract "Unknown" => "", ]; + /** Blacklist for comparison with country names */ + private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [ + 'District', + 'Distrikt', + 'India', + 'Indien', + 'Insel', + 'Inseln', + 'Tal', + 'Yue', + ]; + private const _PLACE_TYPE_INDICATORS_GERMAN = [ 'Insel', 'Stadt', @@ -31,14 +43,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [ 'gasse', 'straße', + ' Straße', ]; // Indicators signifying that a place is likely subordinate to the other // if two places are provided in a comma-separated list private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [ - 'körut', - 'utca', - 'út', + ' körut ', + ' utca ', + ' út ', ]; private const _RELEVANT_ROMAN_NUMERALS = [ @@ -87,8 +100,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract && substr_count($name, $separator) === 1 && !str_contains($name, "(") ) { + $parts = explode(', ', $name); + // Skip entries like "Vaci utca 12 Budapest, Vaci utca" + $indicatorTrimmed = trim($indicator); + if ( + (str_ends_with($parts[0], $indicatorTrimmed) && str_contains($parts[1], $indicatorTrimmed)) + || (str_ends_with($parts[1], $indicatorTrimmed) && str_contains($parts[0], $indicatorTrimmed)) + ) { + return $name; + } + // Prevent errors in case of "Adalbrechtstraße 12, " if (!empty($parts[0]) && !empty($parts[1])) { @@ -164,6 +187,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract if (str_contains($name, " u. 
") && \preg_match("/\ u\.\ [0-9]/", $name)) { $name = str_replace(" u. ", " utca ", $name); } + if (str_contains($name, " ucca ") && \preg_match("/\ ucca\ [0-9]/", $name)) { + $name = str_replace(" ucca ", " utca ", $name); + } + if (str_contains($name, " utcza ") && \preg_match("/\ utcza\ [0-9]/", $name)) { + $name = str_replace(" utcza ", " utca ", $name); + } + if (str_contains($name, " rkp. ") && \preg_match("/\ rkp\.\ [0-9]/", $name)) { + $name = str_replace(" rkp. ", " rakpart ", $name); + } // "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin) @@ -200,7 +232,14 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract return self::$_placeNameListCaches[$filename]; } - $output = json_decode(MD_STD::file_get_contents($filename), true); + try { + $output = json_decode(MD_STD::file_get_contents($filename), true); + } + catch (MDFileDoesNotExist $e) { + self::$_placeNameListCaches[$filename] = []; + return []; + } + if ($output === false) { throw new Exception("Failed to get list"); } @@ -238,10 +277,31 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract // Load place names $countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json"); + $cardinal_directions = self::_loadJsonList(__DIR__ . 
"/../static/cardinal_directions.json"); $part0IsCountry = in_array($parts[0], $countryNames, true); $part1IsCountry = in_array($parts[1], $countryNames, true); + // Skip if the full name is in the list of country names + if (in_array($name, $countryNames, true)) { + return $name; + } + + // If one of the parts is a blacklisted term or a cardinal direction, skip this + + if ( + (in_array($parts[0], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true) + || in_array($parts[0], $cardinal_directions, true) + || in_array(strtolower($parts[0]), $cardinal_directions, true) + ) + || (in_array($parts[1], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true) + || in_array($parts[1], $cardinal_directions, true) + || in_array(strtolower($parts[1]), $cardinal_directions, true) + ) + ) { + return $name; + } + if ($part0IsCountry === true && $part1IsCountry === false) { return $parts[1] . ' (' . $parts[0] . ')'; } @@ -255,6 +315,25 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract } + /** + * Removes repeated parts of a comma-separated place name, keeping the first occurrence of each. + * + * @param string $ort_name Place name to clean. + * + * @return string + */ + private static function _remove_duplicates_after_commas(string $ort_name):string { + + if (str_contains($ort_name, ',') === false) { + return $ort_name; + } + + $parts = explode(', ', $ort_name); + + return implode(', ', array_unique($parts)); + + } + /** * Cleans a place name by trimming etc. Also removes uncertainty indicators. 
* @@ -267,10 +346,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract // Run basic replacements $nameSanitizations = self::_NAME_SANITIZATIONS; - if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-"; + /* + if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) { + $nameSanitizations["/"] = "-"; + } + */ $ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations); $ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name)); + // Remove duplicates after commas + // Västerdås, Schweden, Schweden > Västerdås, Schweden + $ort_name = self::_remove_duplicates_after_commas($ort_name); + $ort_name = match ($lang) { 'de' => self::_clean_german_abbreviations($ort_name), 'hu' => self::_clean_hungarian_abbreviations($ort_name), diff --git a/static/cardinal_directions.json b/static/cardinal_directions.json new file mode 100644 index 0000000..f51168a --- /dev/null +++ b/static/cardinal_directions.json @@ -0,0 +1 @@ 
+["\u0634\u0645\u0627\u0644","\u062c\u0646\u0648\u0628","\u063a\u0631\u0628","\u0634\u0631\u0642","\u0441\u0435\u0432\u0435\u0440","\u042e\u0433","\u0417\u0430\u043f\u0430\u0434","\u0418\u0437\u0442\u043e\u043a","\u0989\u09a4\u09cd\u09a4\u09b0","\u09a6\u0995\u09cd\u09b7\u09bf\u09a3","\u09aa\u09b6\u09cd\u099a\u09bf\u09ae","\u09aa\u09c2\u09b0\u09cd\u09ac","sever","jih","z\u00e1pad","v\u00fdchod","nord","syd","vest","\u00f8st","Norden","S\u00fcden","Westen","Osten","\u03b2\u03bf\u03c1\u03c1\u03ac\u03c2","\u03bd\u03cc\u03c4\u03bf\u03c2","\u03b4\u03cd\u03c3\u03b7","\u03b1\u03bd\u03b1\u03c4\u03bf\u03bb\u03ae","north","south","west","east","norte","sur","oeste","este","\u0628\u0627\u062e\u062a\u0631","\u062e\u0627\u0648\u0631","pohjoinen","etel\u00e4","l\u00e4nsi","it\u00e4","sud","ouest","est","Arewa","Kudu","Yamma","Gabas","\u05e6\u05e4\u05d5\u05df","\u05d3\u05e8\u05d5\u05dd","\u05de\u05e2\u05e8\u05d1","\u05de\u05d6\u05e8\u05d7","\u0909\u0924\u094d\u0924\u0930","\u0926\u0915\u094d\u0937\u093f\u0923","\u092a\u0936\u094d\u091a\u093f\u092e","\u092a\u0942\u0930\u094d\u0935","\u00e9szak","d\u00e9l","nyugat","kelet","utara","selatan","barat","Timur","ovest","\u5317","\u5357","\u897f","\u6771","\u10e9\u10e0\u10d3\u10d8\u10da\u10dd\u10d4\u10d7\u10d8","\u10e1\u10d0\u10db\u10ee\u10e0\u10d4\u10d7\u10d8","\u10d3\u10d0\u10e1\u10d0\u10d5\u10da\u10d4\u10d7\u10d8","\u10d0\u10e6\u10db\u10dd\u10e1\u10d0\u10d5\u10da\u10d4\u10d7\u10d8","\ubd81\ucabd","\ub0a8\ucabd","\uc11c\ucabd","\ub3d9\ucabd","noord","zuid","oost","p\u00f3\u0142noc","po\u0142udnie","zach\u00f3d","wsch\u00f3d","sul","leste","Sud","Vest","Est","\u044e\u0433","\u0437\u0430\u043f\u0430\u0434","\u0432\u043e\u0441\u0442\u043e\u043a","norr","s\u00f6der","v\u00e4ster","\u00f6ster","kaskazini","Kusini","Magharibi","Mashariki","\u0bb5\u0b9f\u0b95\u0bcd\u0b95\u0bc1","\u0ba4\u0bc6\u0bb1\u0bcd\u0b95\u0bc1","\u0bae\u0bc7\u0bb1\u0bcd\u0b95\u0bc1","\u0b95\u0bbf\u0bb4\u0b95\u0bcd\u0b95\u0bc1","\u0e17\u0e34\u0e28\u0e40\u0e2b\u0e19\u0e37\u0
e2d","\u0e17\u0e34\u0e28\u0e43\u0e15\u0e49","\u0e17\u0e34\u0e28\u0e15\u0e30\u0e27\u0e31\u0e19\u0e15\u0e01","\u0e17\u0e34\u0e28\u0e15\u0e30\u0e27\u0e31\u0e19\u0e2d\u0e2d\u0e01","Hilaga","Timog","kanluran","kuzey","g\u00fcney","bat\u0131","do\u011fu","\u043f\u0456\u0432\u043d\u0456\u0447","\u043f\u0456\u0432\u0434\u0435\u043d\u044c","\u0437\u0430\u0445\u0456\u0434","\u0441\u0445\u0456\u0434","\u0645\u063a\u0631\u0628-\u0633\u0645\u062a","\u0645\u0634\u0631\u0642","h\u01b0\u1edbng b\u1eafc","h\u01b0\u1edbng nam","h\u01b0\u1edbng t\u00e2y","h\u01b0\u1edbng \u0111\u00f4ng","Nord","Ost","West","S\u00fcd"] \ No newline at end of file diff --git a/tests/NodaConsolidatedNamesForPlacesTest.php b/tests/NodaConsolidatedNamesForPlacesTest.php index 308a88c..171dfa7 100644 --- a/tests/NodaConsolidatedNamesForPlacesTest.php +++ b/tests/NodaConsolidatedNamesForPlacesTest.php @@ -43,6 +43,9 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase { self::assertEquals("Adalbrechtstraße 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Adalbrechtstr. 12, Berlin")); self::assertEquals("Berlin, Adalbrechtstr. 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrechtstr. 12")); + self::assertEquals("Ferenc József rakpart 21. Budapest", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Ferenc József rkp. 21. Budapest")); + self::assertEquals("Ferenc József rkp. 21. Budapest", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Ferenc József rkp. 21. 
Budapest")); + // Same in Hungarian self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrecht utca 12")); self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Adalbrecht utca 12, Berlin")); @@ -59,5 +62,14 @@ final class NodaConsolidatedNamesForPlacesTest extends TestCase { self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Deutschland, Köln")); self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Köln, Deutschland")); + self::assertEquals("Yue-Öfen", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Yue-Öfen")); + self::assertEquals("Transkei-Distrikt", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Transkei-Distrikt")); + self::assertEquals("Ost-Deutschland", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Ost-Deutschland")); + + self::assertEquals("Vaci utca 12 Budapest, Vaci utca", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Vaci utca 12 Budapest, Vaci utca")); + self::assertEquals("Gambia-Tal", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Gambia-Tal")); + + self::assertEquals("Västerdås, Schweden", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Västerdås, Schweden, Schweden")); + } }