From f6409322e568b67a41bf66d43fb6eff6faab8af1 Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sat, 25 Nov 2023 22:42:07 +0100 Subject: [PATCH] Add classes for writing consolidating spellings of actor and place names --- src/NodaConsolidatedNamesAbstract.php | 68 ++++++ src/NodaConsolidatedNamesForPersinst.php | 124 +++++++++++ src/NodaConsolidatedNamesForPlaces.php | 209 ++++++++++++++++++ .../NodaConsolidatedNamesForPersinstTest.php | 50 +++++ tests/NodaConsolidatedNamesForPlacesTest.php | 63 ++++++ 5 files changed, 514 insertions(+) create mode 100644 src/NodaConsolidatedNamesAbstract.php create mode 100644 src/NodaConsolidatedNamesForPersinst.php create mode 100644 src/NodaConsolidatedNamesForPlaces.php create mode 100644 tests/NodaConsolidatedNamesForPersinstTest.php create mode 100644 tests/NodaConsolidatedNamesForPlacesTest.php diff --git a/src/NodaConsolidatedNamesAbstract.php b/src/NodaConsolidatedNamesAbstract.php new file mode 100644 index 0000000..06f7194 --- /dev/null +++ b/src/NodaConsolidatedNamesAbstract.php @@ -0,0 +1,68 @@ + + */ +declare(strict_types = 1); + +/** + * Abstract class to be inherited by classes for writing consolidated vocabulary names. + */ +abstract class NodaConsolidatedNamesAbstract { + + /** + * This function sanitizes a string. + * + * @param string $inputString Input string. + * + * @return string + */ + final protected static function _sanitizeInputStringStatic(string $inputString):string { + + $string = trim($inputString, "; \t" . PHP_EOL); + $string = strtr($string, ["<" => "[", ">" => "]", "\t" => " ", '\n' => ' ', + '
' => ' ', '
' => ' ', '
' => ' ', + "<br />" => ' ', '§' => '"' + ]); + + $string = str_replace(PHP_EOL, ' ', $string); + while (strpos($string, " ") !== false) { + $string = str_replace(" ", " ", $string); + } + + $string = strip_tags((string)$string); + + return trim(trim($string), ',| '); + + } + + /** + * Does general cleanup for vocabulary entries. + * + * @param string $input Input string. + * + * @return string + */ + final public static function sanitizeInputString(string $input):string { + + $output = strtr( + self::_sanitizeInputStringStatic($input), + [ + '<' => '(', + '>' => ')', + '[' => '(', + ']' => ')', + "unbekannt" => "", + ], + ); + + // If the first and last character of the name are brackets, remove those. + if (substr($output, 0, 1) === '(' && substr($output, -1) === ')') { + $output = trim($output, '()'); + } + + return $output; + + } +} diff --git a/src/NodaConsolidatedNamesForPersinst.php b/src/NodaConsolidatedNamesForPersinst.php new file mode 100644 index 0000000..fa90923 --- /dev/null +++ b/src/NodaConsolidatedNamesForPersinst.php @@ -0,0 +1,124 @@ + "Mythologie", + "Mythologische Figur" => "Mythologie", + "Mythologische Gestalt" => "Mythologie", + "()" => "", + ]; + + /** + * Replaces last characters of a string if $from matches the end of the string, + * + * @param string $from Replace from. + * @param string $to Replace to. + * @param string $name Input name. + * + * @return string + */ + private static function _replaceFromEnd(string $from, string $to, string $name):string { + + $length = mb_strlen($from); + if (str_ends_with($name, $from) === true && substr($name, -1 * $length - 1, 1) !== '.') { + $name = str_replace(" ", " ", substr($name, 0, -1 * $length) . $to); + } + + return $name; + + } + + /** + * Cleans and consolidates name parts appearing regularly in German names + * that have a default writing in md. + * + * @param string $name Name of an actor. + * + * @return string + */ + private static function _clean_german_abbreviations(string $name):string { + + $name = self::_replaceFromEnd(" d.Ä.", " (der Ältere)", $name); + $name = self::_replaceFromEnd(" d. Ä.", " (der Ältere)", $name); + $name = self::_replaceFromEnd(" (d.Ä.)", " (der Ältere)", $name); + $name = self::_replaceFromEnd(" (d. Ä.)", " (der Ältere)", $name); + + $name = self::_replaceFromEnd(" d.J.", " (der Jüngere)", $name); + $name = self::_replaceFromEnd(" d. J.", " (der Jüngere)", $name); + $name = self::_replaceFromEnd(" (d.J.)", " (der Jüngere)", $name); + $name = self::_replaceFromEnd(" (d. J.)", " (der Jüngere)", $name); + + return $name; + + } + + /** + * Tries to make sense of life dates in brackets at the end of an actor's name. + * + * @param string $name Input name. + * + * @return array{name: string, birth: string, death: string}|array{} + */ + public static function parse_life_dates_from_name(string $name):array { + + if (str_contains($name, "(") === false || str_ends_with($name, ")") === false) return []; + + $parts = explode("(", $name); + if (count($parts) !== 2) return []; + + $nameOnly = trim($parts[0]); + $dateString = rtrim($parts[1], ')'); // + + if (!empty($dates = NodaTimeSplitter::is_timespan($dateString)) + && $dates[0] !== '?' + && $dates[1] !== '?' + && intval($dates[1]) - intval($dates[0]) < 150 + ) { + return [ + 'name' => $nameOnly, + 'birth' => $dates[0], + 'death' => $dates[1], + ]; + } + + return []; + + } + + /** + * Cleans a persinst name by trimming etc. Also removes uncertainty indicators. + * + * @param string $lang Instance language. + * @param string $persinst_name Input string to clean. + * + * @return string + */ + public static function consolidate_name(string $lang, string $persinst_name):string { + + // Run basic replacements + $name = \strtr(self::sanitizeInputString($persinst_name), + self::_NAME_SANITIZATIONS); + $name = NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst($name); + + if (mb_strlen($name) > 10 && $lang === 'de') { + $name = self::_clean_german_abbreviations($name); + } + + // If the persinst name is empty, unset persinst ID + return \trim($name, " ;.\t" . PHP_EOL); + + } + +} diff --git a/src/NodaConsolidatedNamesForPlaces.php b/src/NodaConsolidatedNamesForPlaces.php new file mode 100644 index 0000000..498f116 --- /dev/null +++ b/src/NodaConsolidatedNamesForPlaces.php @@ -0,0 +1,209 @@ + "-", + "unbekannt" => "", + "Unbekannt" => "", + "unknown" => "", + "Unknown" => "", + ]; + + private const _PLACE_TYPE_INDICATORS_GERMAN = [ + 'Insel', + 'Stadt', + ]; + + // Indicators signifying that a place is likely subordinate to the other + // if two places are provided in a comma-separated list + private const _PLACE_NARROWER_LOCATION_INDICATORS_GERMAN = [ + 'gasse', + 'straße', + ]; + + // Indicators signifying that a place is likely subordinate to the other + // if two places are provided in a comma-separated list + private const _PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN = [ + 'körut', + 'utca', + 'út', + ]; + + private const _RELEVANT_ROMAN_NUMERALS = [ + 'I' => '1', + 'II' => '2', + 'III' => '3', + 'IV' => '4', + 'V' => '5', + 'VI' => '6', + 'VII' => '7', + 'VIII' => '8', + 'IX' => '9', + 'X' => '10', + 'XI' => '11', + 'XII' => '12', + 'XIII' => '13', + 'XIV' => '14', + 'XV' => '15', + 'XVI' => '16', + 'XVII' => '17', + 'XVIII' => '18', + 'XIX' => '19', + 'XX' => '20', + ]; + + /** + * Rewrites indicators for narrower locations paired with a superordinate location + * into the format "Narrower (Broader)". + * E.g.: "Adalbrechtstr. 12, Berlin" > Adalbrechtstraße 12 (Berlin). + * + * @param string $name Name in which to rewrite. + * @param string $indicator Indicator for narrower place. E.g. "straße". + * @param string $separator Separating character between narrower and broader, e.g. ', '. + * + * @return string + */ + private static function _rewrite_narrower_broader_pairs_to_brackets(string $name, string $indicator, $separator = ', '):string { + + if (str_contains($name, $indicator) + && substr_count($name, $indicator) === 1 + && substr_count($name, $separator) === 1 + && !str_contains($name, "(") + ) { + $parts = explode(', ', $name); + + // Prevent errors in case of "Adalbrechtstraße 12, " + if (!empty($parts[0]) && !empty($parts[1])) { + + if (str_contains($parts[0], $indicator)) { // Adalberthstraße 12, Berlin + $street = $parts[0]; + $town = $parts[1]; + } + else { // Berlin, Adalberthstraße 12 + $street = $parts[1]; + $town = $parts[0]; + } + + // Prevent rewrites in cases like "Deák Ferenc utca 16-18. Budapest, V." + if (str_contains($town, '.')) { + return $name; + } + + return $street . ' (' . $town . ')'; + + } + + + } + + + return $name; + + } + + /** + * Cleans and consolidates name parts appearing regularly in German place names. + * + * @param string $name Name of an actor. + * + * @return string + */ + private static function _clean_german_abbreviations(string $name):string { + + // ABC, Inseln > ABC (Inseln) + foreach (self::_PLACE_TYPE_INDICATORS_GERMAN as $indicator) { + if (str_ends_with($name, ', ' . $indicator)) { + $name = str_replace(', ' . $indicator, ' (' . $indicator . ')', $name); + } + } + + // Adalbrechtstr. 12 > Adalbrechtstraße 12 + if (str_contains($name, "str. ") && \preg_match("/[a-zA-Z]str. [0-9]/", $name)) { + $name = str_replace("str. ", "straße ", $name); + } + + // "Adalbrechtstraße. 12, Berlin" > Adalbrechtstraße 12 (Berlin) + + foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_GERMAN as $indicator) { + $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', '); + } + + return $name; + + } + + /** + * Cleans and consolidates name parts appearing regularly in Hungarian place names. + * + * @param string $name Name of an actor. + * + * @return string + */ + private static function _clean_hungarian_abbreviations(string $name):string { + + if (str_contains($name, " krt. ") && \preg_match("/\ krt\.\ [0-9]/", $name)) { + $name = str_replace(" krt. ", " körut ", $name); + } + if (str_contains($name, " u. ") && \preg_match("/\ u\.\ [0-9]/", $name)) { + $name = str_replace(" u. ", " utca ", $name); + } + + // "Adalbrecht utca. 12, Berlin" > Adalbrecht utca 12 (Berlin) + + foreach (self::_PLACE_NARROWER_LOCATION_INDICATORS_HUNGARIAN as $indicator) { + $name = self::_rewrite_narrower_broader_pairs_to_brackets($name, $indicator, ', '); + } + + if (str_contains($name, 'Budapest') && substr_count($name, 'Budapest') === 1) { + foreach(self::_RELEVANT_ROMAN_NUMERALS as $roman_numeral => $arabic) { + + $to_match = ' Budapest, ' . $roman_numeral . '.'; + if (str_ends_with($name, $to_match)) { + $name = str_replace($to_match, ' (Budapest, ' . $arabic . '. kerület)', $name); + } + + } + } + + return $name; + + } + + /** + * Cleans a place name by trimming etc. Also removes uncertainty indicators. + * + * @param string $lang Instance language. + * @param string $ort_name Input string to clean. + * + * @return string + */ + public static function consolidate_name(string $lang, string $ort_name):string { + + // Run basic replacements + $nameSanitizations = self::_NAME_SANITIZATIONS; + if (substr_count($ort_name, "/") === 1) $nameSanitizations["/"] = "-"; + $ort_name = strtr(self::sanitizeInputString($ort_name), $nameSanitizations); + $ort_name = self::sanitizeInputString(NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($ort_name)); + + $ort_name = match ($lang) { + 'de' => self::_clean_german_abbreviations($ort_name), + 'hu' => self::_clean_hungarian_abbreviations($ort_name), + default => $ort_name, + }; + + return $ort_name; + + } +} diff --git a/tests/NodaConsolidatedNamesForPersinstTest.php b/tests/NodaConsolidatedNamesForPersinstTest.php new file mode 100644 index 0000000..373c05d --- /dev/null +++ b/tests/NodaConsolidatedNamesForPersinstTest.php @@ -0,0 +1,50 @@ + + */ +declare(strict_types = 1); +use PHPUnit\Framework\TestCase; + +/** + * Tests for setting uniform actor names. + * + * @covers \NodaConsolidatedNamesForPersinst + */ +final class NodaConsolidatedNamesForPersinstTest extends TestCase { + /** + * Test that cleanup function returns expected values. + * + * @small + * + * @return void + */ + public function testCleaningNamesWithCanonicalForms():void { + + self::assertEquals("Friedrich Barbarossa (Kaiser)", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Friedrich Barbarossa ")); + self::assertEquals("Friedrich Barbarossa (Kaiser)", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Friedrich Barbarossa , ")); + self::assertEquals("Friedrich Barbarossa (Kaiser)", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Friedrich Barbarossa , ||")); + self::assertEquals("Friedrich Barbarossa", NodaConsolidatedNamesForPersinst::consolidate_name("de", "(Friedrich Barbarossa)")); + self::assertEquals("Friedrich Barbarossa", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Friedrich Barbarossa.")); + + self::assertEquals("Fr d.Ä", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Fr d.Ä.")); + self::assertEquals("Raffaelli, C. d. J", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Raffaelli, C. d. J.")); + self::assertEquals("Friedrich Barbarossa d.Ä", NodaConsolidatedNamesForPersinst::consolidate_name("en", "Friedrich Barbarossa d.Ä.")); + + self::assertEquals("Friedrich Barbarossa (der Ältere)", NodaConsolidatedNamesForPersinst::consolidate_name("de", "Friedrich Barbarossa d.Ä.")); + self::assertEquals("Friedrich Barbarossa (der Ältere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa d. Ä.")); + self::assertEquals("Friedrich Barbarossa (der Ältere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa (d.Ä.)")); + self::assertEquals("Friedrich Barbarossa (der Ältere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa (d. Ä.)")); + self::assertEquals("Friedrich Barbarossa (der Ältere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa [d.Ä.]")); + self::assertEquals("Friedrich Barbarossa (der Ältere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa [d. Ä.]")); + + self::assertEquals("Friedrich Barbarossa (der Jüngere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa d.J.")); + self::assertEquals("Friedrich Barbarossa (der Jüngere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa d. J.")); + self::assertEquals("Friedrich Barbarossa (der Jüngere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa (d.J.)")); + self::assertEquals("Friedrich Barbarossa (der Jüngere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa (d. J.)")); + self::assertEquals("Friedrich Barbarossa (der Jüngere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa [d.J.]")); + self::assertEquals("Friedrich Barbarossa (der Jüngere)", NodaConsolidatedNamesForPersinst::consolidate_name('de', "Friedrich Barbarossa [d. J.]")); + + } +} diff --git a/tests/NodaConsolidatedNamesForPlacesTest.php b/tests/NodaConsolidatedNamesForPlacesTest.php new file mode 100644 index 0000000..308a88c --- /dev/null +++ b/tests/NodaConsolidatedNamesForPlacesTest.php @@ -0,0 +1,63 @@ + + */ +declare(strict_types = 1); +use PHPUnit\Framework\TestCase; + +/** + * Tests for setting uniform place names. + * + * @covers \NodaConsolidatedNamesForPlaces + */ +final class NodaConsolidatedNamesForPlacesTest extends TestCase { + /** + * Test that cleanup function returns expected values. + * + * @small + * + * @return void + */ + public function testCleaningNamesWithCanonicalForms():void { + + // Hungarian: u. [0-9] > utca. + + self::assertEquals("Test u. 12", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Test u. 12")); + self::assertEquals("Test utca 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Test u. 12")); + self::assertEquals("Test u. Test", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Test u. Test")); + + // German: Ending in standard terms that be in brackets + + self::assertEquals("Berlin, Insel Borneo", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Berlin, Insel Borneo")); + self::assertEquals("Berlin (Insel)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Berlin, Insel")); + self::assertEquals("Berlin, Insel", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Insel")); + + // German: Ending in standard terms that be in brackets + + self::assertEquals("Adalbrechtstraße 12", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Adalbrechtstr. 12")); + self::assertEquals("Adalbrechtstr. 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Adalbrechtstr. 12")); + + self::assertEquals("Adalbrechtstraße 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Berlin, Adalbrechtstr. 12")); + self::assertEquals("Adalbrechtstraße 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Adalbrechtstr. 12, Berlin")); + self::assertEquals("Berlin, Adalbrechtstr. 12", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrechtstr. 12")); + + // Same in Hungarian + self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Berlin, Adalbrecht utca 12")); + self::assertEquals("Adalbrecht utca 12 (Berlin)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Adalbrecht utca 12, Berlin")); + self::assertEquals("Berlin, Adalbrecht utca 12", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Berlin, Adalbrecht utca 12")); + + // Deák Ferenc utca 16-18. (Budapest, 5. kerület) + self::assertEquals("Deák Ferenc utca 16-18. (Budapest, 5. kerület)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Deák Ferenc utca 16-18. Budapest, V.,")); + self::assertEquals("Deák Ferenc utca 16-18. (Budapest, 5. kerület)", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Deák Ferenc utca 16-18. Budapest, V.")); + self::assertEquals("Deák Ferenc utca 16-18. Budapest, V. abc", NodaConsolidatedNamesForPlaces::consolidate_name("hu", "Deák Ferenc utca 16-18. Budapest, V. abc")); + + + // Rewriting country names in brackets + self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Deutschland-Köln")); + self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Deutschland, Köln")); + self::assertEquals("Köln (Deutschland)", NodaConsolidatedNamesForPlaces::consolidate_name("de", "Köln, Deutschland")); + + } +}