Add blacklist for unwanted rewrites in consolidating place names

2023-11-26 23:55:22 +01:00
parent e610723107
commit b36a504277
4 changed files with 135 additions and 5 deletions
--- a/src/NodaConsolidatedNamesForPlaces.php
+++ b/src/NodaConsolidatedNamesForPlaces.php
@@ -21,6 +21,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
        "Unknown" => "",
    ];

+    /** Terms that block the country-name rewrite: if either compared part of a name exactly matches one of them, the name is returned unchanged */
+    private const _COUNTRY_REWRITE_BLACKLISTED_TERMS = [
+        'District',
+        'Distrikt',
+        'India',
+        'Indien',
+        'Insel',
+        'Inseln',
+        'Tal',
+        'Yue',
+    ];
+
    private const _PLACE_TYPE_INDICATORS_GERMAN = [
        'Insel',
        'Stadt',
@@ -31,14 +43,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
    private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [
        'gasse',
        'straße',
+        ' Straße',
    ];

    // Indicators signifying that a place is likely subordinate to the other
    // if two places are provided in a comma-separated list
    private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [
-        'körut',
-        'utca',
-        'út',
+        ' körut ',
+        ' utca ',
+        ' út ',
    ];

    private const _RELEVANT_ROMAN_NUMERALS = [
@@ -87,8 +100,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
            && substr_count($name, $separator) === 1
            && !str_contains($name, "(")
        ) {
+
            $parts = explode(', ', $name);

+            // Skip entries like "Vaci utca 12 Budapest, Vaci utca"
+            $indicatorTrimmed = trim($indicator);
+            if (
+                (str_ends_with($parts[0], $indicatorTrimmed) && str_contains($parts[1], $indicatorTrimmed))
+                || (str_ends_with($parts[1], $indicatorTrimmed) && str_contains($parts[0], $indicatorTrimmed))
+            ) {
+                return $name;
+            }
+
            // Prevent errors in case of "Adalbrechtstraße 12, "
            if (!empty($parts[0]) && !empty($parts[1])) {

@@ -164,6 +187,15 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
        if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) {
            $name = str_replace(" u. ", " utca ", $name);
        }
+        if (str_contains($name, " ucca ") && \preg_match("/\ ucca\ [0-9]/", $name)) {
+            $name = str_replace(" ucca ", " utca ", $name);
+        }
+        if (str_contains($name, " utcza ") && \preg_match("/\ utcza\ [0-9]/", $name)) {
+            $name = str_replace(" utcza ", " utca ", $name);
+        }
+        if (str_contains($name, " rkp. ") && \preg_match("/\ rkp\.\ [0-9]/", $name)) {
+            $name = str_replace(" rkp. ", " rakpart ", $name);
+        }

        // "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin)

@@ -200,7 +232,14 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
            return self::$_placeNameListCaches[$filename];
        }

-        $output = json_decode(MD_STD::file_get_contents($filename), true);
+        try {
+            $output = json_decode(MD_STD::file_get_contents($filename), true);
+        }
+        catch (MDFileDoesNotExist $e) {
+            self::$_placeNameListCaches[$filename] = [];
+            return [];
+        }
+
        if ($output === false) {
            throw new Exception("Failed to get list");
        }
@@ -238,10 +277,31 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract

            // Load place names
            $countryNames = self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json") + self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json");
+            $cardinal_directions = self::_loadJsonList(__DIR__ . "/../static/cardinal_directions.json");

            $part0IsCountry = in_array($parts[0], $countryNames, true);
            $part1IsCountry = in_array($parts[1], $countryNames, true);

+            // Skip if the full name is in the list of country names
+            if (in_array($name, $countryNames, true)) {
+                return $name;
+            }
+
+            // If one of the parts is a blacklisted term or a cardinal direction, skip the rewrite
+
+            if (
+                (in_array($parts[0], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
+                    || in_array($parts[0], $cardinal_directions, true)
+                    || in_array(strtolower($parts[0]), $cardinal_directions, true)
+                )
+                || (in_array($parts[1], self::_COUNTRY_REWRITE_BLACKLISTED_TERMS, true)
+                    || in_array($parts[1], $cardinal_directions, true)
+                    || in_array(strtolower($parts[1]), $cardinal_directions, true)
+                )
+            ) {
+                return $name;
+            }
+
            if ($part0IsCountry === true && $part1IsCountry === false) {
                return $parts[1] . ' (' . $parts[0] . ')';
            }
@@ -255,6 +315,25 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract

    }

+    /**
+     * Drops repeated entries from a ", "-separated place name, keeping the first occurrence of each (e.g. "A, B, B" > "A, B").
+     *
+     * @param string $ort_name Place name to clean; returned unchanged if it contains no comma.
+     *
+     * @return string Place name with the remaining unique parts re-joined by ", ".
+     */
+    private static function _remove_duplicates_after_commas(string $ort_name):string {
+
+        if (str_contains($ort_name, ',') === false) {
+            return $ort_name;
+        }
+
+        $parts = explode(', ', $ort_name);
+
+        return implode(', ', array_unique($parts));
+
+    }
+
    /**
     * Cleans a place name by trimming etc. Also removes uncertainty indicators.
     *
@@ -267,10 +346,18 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract

        // Run basic replacements
        $nameSanitizations = self::_NAME_SANITIZATIONS;
-        if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-";
+        /*
+        if (substr_count($ort_name, "/") === 1 && !str_contains($ort_name, '.')) {
+            $nameSanitizations["/"] = "-";
+        }
+         */
        $ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations);
        $ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name));

+        // Remove duplicates after commas
+        // Västerdås, Schweden, Schweden > Västerdås, Schweden
+        $ort_name = self::_remove_duplicates_after_commas($ort_name);
+
        $ort_name = match ($lang) {
            'de' => self::_clean_german_abbreviations($ort_name),
            'hu' => self::_clean_hungarian_abbreviations($ort_name),