From 29ca05f552974226e1ce7219c89c8c367cc2cc4e Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sat, 9 Nov 2024 15:33:49 +0100 Subject: [PATCH 1/4] Properly handle commas at the end of names when guessing certainty --- src/NodaUncertaintyHelper.php | 384 ++++++++++++++-------------- tests/NodaUncertaintyHelperTest.php | 12 +- 2 files changed, 202 insertions(+), 194 deletions(-) diff --git a/src/NodaUncertaintyHelper.php b/src/NodaUncertaintyHelper.php index 0030a47..23a6b58 100644 --- a/src/NodaUncertaintyHelper.php +++ b/src/NodaUncertaintyHelper.php @@ -12,7 +12,7 @@ declare(strict_types = 1); */ final class NodaUncertaintyHelper { - const PERSINST_INDICATORS_DISALLOWED = [ + public const PERSINST_INDICATORS_DISALLOWED = [ "Unbekannt", "unbekannt", "Anonymus", @@ -41,7 +41,7 @@ final class NodaUncertaintyHelper { "Невідомий артист", // Unknown artist ]; - const PERSINST_UNCERTAINTY_PREFIXES = [ + public const PERSINST_UNCERTAINTY_PREFIXES = [ "verm. ", "Verm. ", "vermtl. ", @@ -57,7 +57,7 @@ final class NodaUncertaintyHelper { "?", ]; - const PERSINST_UNCERTAINTY_SUFFIXES = [ + public const PERSINST_UNCERTAINTY_SUFFIXES = [ "(?)", "?", " [vermutlich]", @@ -65,7 +65,7 @@ final class NodaUncertaintyHelper { " [wahrscheinlich]", ]; - const TIME_INDICATORS_DISALLOWED = [ + public const TIME_INDICATORS_DISALLOWED = [ "Nachgewiesen", "nachgewiesen", "o.D.", @@ -94,9 +94,9 @@ final class NodaUncertaintyHelper { "Без датування", // No dating "б.р.", // No dating "б.д.", // No dating - ]; + ]; - const TIME_UNCERTAINTY_PREFIXES = [ + public const TIME_UNCERTAINTY_PREFIXES = [ "c. ", "ca ", "ca. 
", @@ -130,9 +130,9 @@ final class NodaUncertaintyHelper { "майже", // UK: Almost / nearly / about "орієнтовно", // UK: approximately "Прибл.", // UK: approximately - ]; + ]; - const TIME_UNCERTAINTY_SUFFIXES = [ + public const TIME_UNCERTAINTY_SUFFIXES = [ "(?)", "?", " (ca.)", @@ -145,12 +145,12 @@ final class NodaUncertaintyHelper { ", um", " (um)", " (ок.)", - ]; + ]; - /** - * Substrings used to express uncertainty about the validity of a place name. - */ - const PLACE_INDICATORS_DISALLOWED = [ + /** + * Substrings used to express uncertainty about the validity of a place name. + */ + public const PLACE_INDICATORS_DISALLOWED = [ "Unbekannt", "unbekannt", "Unknown", @@ -173,9 +173,9 @@ final class NodaUncertaintyHelper { "не вказано", // No place "не вказане", // No place "невідоме", // No place - ]; + ]; - const PLACE_UNCERTAINTY_PREFIXES = [ + public const PLACE_UNCERTAINTY_PREFIXES = [ "ca ", "Ca ", "ca. ", @@ -210,9 +210,9 @@ final class NodaUncertaintyHelper { "Wahrscheinlich ", "можливо", "?", - ]; + ]; - const PLACE_UNCERTAINTY_SUFFIXES = [ + public const PLACE_UNCERTAINTY_SUFFIXES = [ "(?)", "(vermutl.)", "[vermutl.]", @@ -221,206 +221,206 @@ final class NodaUncertaintyHelper { "(wohl)", "[wohl]", "?", - ]; + ]; - /** - * Trims common characters and charater marks. - * - * @param string $input Input text. - * - * @return string - */ - public static function trim(string $input):string { + /** + * Trims common characters and charater marks. + * + * @param string $input Input text. + * + * @return string + */ + public static function trim(string $input):string { - $input = \trim($input, ", \t\n\r\n;-:"); - return $input; + $input = \trim($input, ", \t\n\r\n;-:"); + return $input; + } + + /** + * Removes uncertainty indicators from an time name. + * + * @param string $name Input string. 
+ * + * @return string + */ + public static function cleanUncertaintyIndicatorsTime(string $name):string { + + $name = self::trim($name); + + if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) { + return ""; } - /** - * Removes uncertainty indicators from an time name. - * - * @param string $name Input string. - * - * @return string - */ - public static function cleanUncertaintyIndicatorsTime(string $name):string { - - $name = self::trim($name); - - if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) { - return ""; + // Remove uncertainty prefixes + foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) { + if (\substr($name, 0, \strlen($prefix)) === "$prefix") { + $name = substr($name, \strlen($prefix)); } - - // Remove uncertainty prefixes - foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) { - if (\substr($name, 0, \strlen($prefix)) === "$prefix") { - $name = substr($name, \strlen($prefix)); - } - } - - // Remove uncertainty sufixes - foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) { - if (\substr($name, \strlen($suffix) * -1) === "$suffix") { - $name = \substr($name, 0, \strlen($suffix) * -1); - } - } - - return self::trim($name); - } - /** - * Attempts guessing whether time is uncertain. Returns true if the name - * indicates certainty, false if it indicates uncertainty. - * - * @param string $zeit_name Time name. - * - * @return boolean - */ - public static function guessTimeCertainty(string $zeit_name):bool { - - $zeit_name = \strtolower($zeit_name); - - // Attempt to guess uncertainty based on prefixes. 
- foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) { - if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) { - return false; // Uncertainty found - } + // Remove uncertainty sufixes + foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) { + if (\substr($name, \strlen($suffix) * -1) === "$suffix") { + $name = \substr($name, 0, \strlen($suffix) * -1); } - - // Attempt to guess uncertainty based on prefixes. - foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) { - if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) { - return false; // Uncertainty found - } - } - - return true; // No uncertainty found - } - /** - * Removes uncertainty indicators from an place name. - * - * @param string $ort_name Input string. - * - * @return string - */ - public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string { + return self::trim($name); - $ort_name = self::trim($ort_name); + } - if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) { - return ""; + /** + * Attempts guessing whether time is uncertain. Returns true if the name + * indicates certainty, false if it indicates uncertainty. + * + * @param string $zeit_name Time name. + * + * @return boolean + */ + public static function guessTimeCertainty(string $zeit_name):bool { + + $zeit_name = \strtolower($zeit_name); + + // Attempt to guess uncertainty based on prefixes. 
+ foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) { + if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) { + return false; // Uncertainty found } - - // Remove uncertainty prefixes - foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { - if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") { - $ort_name = substr($ort_name, \strlen($prefix)); - } - } - - // Remove uncertainty sufixes - foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { - if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") { - $ort_name = \substr($ort_name, 0, \strlen($suffix) * -1); - } - } - - return self::trim($ort_name); - } - /** - * Attempts guessing whether place is uncertain. Returns true if the name - * indicates certainty, false if it indicates uncertainty. - * - * @param string $ort_name Place name. - * - * @return boolean - */ - public static function guessPlaceCertainty(string $ort_name):bool { - - $ort_name = \strtolower($ort_name); - - // Attempt to guess uncertainty based on prefixes. - foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { - if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) { - return false; // Uncertain - } + // Attempt to guess uncertainty based on prefixes. + foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) { + if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) { + return false; // Uncertainty found } - - // Attempt to guess uncertainty based on prefixes. - foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) { - if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) { - return false; // Uncertain - } - } - - return true; // Certain / no uncertainty found - } - /** - * Removes uncertainty indicators from an actor name. - * - * @param string $value Input string. 
- * - * @return string - */ - public static function cleanUncertaintyIndicatorsPersinst(string $value):string { + return true; // No uncertainty found - $value = self::trim($value); + } - if (\in_array(trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) { - return ""; - } + /** + * Removes uncertainty indicators from an place name. + * + * @param string $ort_name Input string. + * + * @return string + */ + public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string { - foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) { - if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) { - $value = substr($value, \mb_strlen($toRemove)); - } - } - - foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { - if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") { - $value = \mb_substr($value, 0, \mb_strlen($suffix) * -1); - } - } - - return self::trim($value); + $ort_name = self::trim($ort_name); + if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) { + return ""; } - /** - * Attempts guessing whether persinst is uncertain. Returns true if the name - * indicates certainty, false if it indicates uncertainty. - * - * @param string $name Persinst name. - * - * @return boolean - */ - public static function guessPersinstCertainty(string $name):bool { - - $name = \trim(\strtolower($name)); - - // Attempt to guess uncertainty based on prefixes. - foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) { - if (\substr($name, 0, \strlen($prefix)) === $prefix) { - return false; // Uncertain - } + // Remove uncertainty prefixes + foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { + if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") { + $ort_name = substr($ort_name, \strlen($prefix)); } - - // Attempt to guess uncertainty based on prefixes. 
- foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) { - if (\substr($name, -1 * \strlen($prefix)) === $prefix) { - return false; // Uncertain - } - } - - return true; // Certain / no uncertainty found - } + + // Remove uncertainty sufixes + foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { + if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") { + $ort_name = \substr($ort_name, 0, \strlen($suffix) * -1); + } + } + + return self::trim($ort_name); + + } + + /** + * Attempts guessing whether place is uncertain. Returns true if the name + * indicates certainty, false if it indicates uncertainty. + * + * @param string $ort_name Place name. + * + * @return boolean + */ + public static function guessPlaceCertainty(string $ort_name):bool { + + $ort_name = \trim(\strtolower($ort_name), ', ;-_'); + + // Attempt to guess uncertainty based on prefixes. + foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { + if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) { + return false; // Uncertain + } + } + + // Attempt to guess uncertainty based on prefixes. + foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) { + if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) { + return false; // Uncertain + } + } + + return true; // Certain / no uncertainty found + + } + + /** + * Removes uncertainty indicators from an actor name. + * + * @param string $value Input string. + * + * @return string + */ + public static function cleanUncertaintyIndicatorsPersinst(string $value):string { + + $value = self::trim($value); + + if (\in_array(trim($value, ";. 
"), self::PERSINST_INDICATORS_DISALLOWED, true)) { + return ""; + } + + foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) { + if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) { + $value = substr($value, \mb_strlen($toRemove)); + } + } + + foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { + if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") { + $value = \mb_substr($value, 0, \mb_strlen($suffix) * -1); + } + } + + return self::trim($value); + + } + + /** + * Attempts guessing whether persinst is uncertain. Returns true if the name + * indicates certainty, false if it indicates uncertainty. + * + * @param string $name Persinst name. + * + * @return boolean + */ + public static function guessPersinstCertainty(string $name):bool { + + $name = \trim(\strtolower($name), ', ;-_'); + + // Attempt to guess uncertainty based on prefixes. + foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) { + if (\substr($name, 0, \strlen($prefix)) === $prefix) { + return false; // Uncertain + } + } + + // Attempt to guess uncertainty based on prefixes. + foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) { + if (\substr($name, -1 * \strlen($prefix)) === $prefix) { + return false; // Uncertain + } + } + + return true; // Certain / no uncertainty found + + } } diff --git a/tests/NodaUncertaintyHelperTest.php b/tests/NodaUncertaintyHelperTest.php index ecafed6..14c0594 100644 --- a/tests/NodaUncertaintyHelperTest.php +++ b/tests/NodaUncertaintyHelperTest.php @@ -6,12 +6,14 @@ */ declare(strict_types = 1); use PHPUnit\Framework\TestCase; +use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\Small; /** * This script contains tests for the uncertainty helper. - * - * @covers \NodaUncertaintyHelper */ +#[small] +#[CoversClass(\NodaUncertaintyHelper::class)] final class NodaUncertaintyHelperTest extends TestCase { /** * Removes uncertainty indicators from an time name. 
@@ -76,6 +78,9 @@ final class NodaUncertaintyHelperTest extends TestCase { self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg")); self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin")); + self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?")); + self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?,")); + self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin,")); self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin")); self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien")); @@ -109,6 +114,9 @@ final class NodaUncertaintyHelperTest extends TestCase { public static function testGuessPersinstCertainty():void { self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa")); + self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?")); + self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?,")); + self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa,")); self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa")); } From 7cfe752c941cc535df1abd877a5be84109707bf7 Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sat, 9 Nov 2024 15:40:27 +0100 Subject: [PATCH 2/4] Handle commas when guessing time certainty --- src/NodaUncertaintyHelper.php | 9 ++++----- tests/NodaUncertaintyHelperTest.php | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/NodaUncertaintyHelper.php b/src/NodaUncertaintyHelper.php index 23a6b58..c9f2a19 100644 --- a/src/NodaUncertaintyHelper.php +++ b/src/NodaUncertaintyHelper.php @@ -232,8 +232,7 @@ final class NodaUncertaintyHelper { */ public static function trim(string $input):string { - $input = \trim($input, ", \t\n\r\n;-:"); - return $input; + return \trim($input, ", \t\n\r\n;-:"); } @@ -280,7 +279,7 @@ final class NodaUncertaintyHelper { */ public static function guessTimeCertainty(string 
$zeit_name):bool { - $zeit_name = \strtolower($zeit_name); + $zeit_name = self::trim(strtolower($zeit_name)); // Attempt to guess uncertainty based on prefixes. foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) { @@ -343,7 +342,7 @@ final class NodaUncertaintyHelper { */ public static function guessPlaceCertainty(string $ort_name):bool { - $ort_name = \trim(\strtolower($ort_name), ', ;-_'); + $ort_name = self::trim(\strtolower($ort_name)); // Attempt to guess uncertainty based on prefixes. foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { @@ -404,7 +403,7 @@ final class NodaUncertaintyHelper { */ public static function guessPersinstCertainty(string $name):bool { - $name = \trim(\strtolower($name), ', ;-_'); + $name = self::trim(\strtolower($name)); // Attempt to guess uncertainty based on prefixes. foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) { diff --git a/tests/NodaUncertaintyHelperTest.php b/tests/NodaUncertaintyHelperTest.php index 14c0594..b8fc387 100644 --- a/tests/NodaUncertaintyHelperTest.php +++ b/tests/NodaUncertaintyHelperTest.php @@ -26,6 +26,9 @@ final class NodaUncertaintyHelperTest extends TestCase { public function testCleanUncertaintyIndicatorsTime():void { self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("wohl 1950")); + self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950?")); + self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950?,")); + self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950,")); self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950")); } From 48355a6a365a3a35fed059581a7628e45cf7877c Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sat, 9 Nov 2024 18:42:18 +0100 Subject: [PATCH 3/4] Identify uncertainty before brackets ("Berlin ? 
(Germany)" > "Berlin (Germany)" + Uncertain) --- src/NodaUncertaintyHelper.php | 15 ++++ tests/NodaUncertaintyHelperTest.php | 125 ++++++++++++++-------------- 2 files changed, 79 insertions(+), 61 deletions(-) diff --git a/src/NodaUncertaintyHelper.php b/src/NodaUncertaintyHelper.php index c9f2a19..fd378ef 100644 --- a/src/NodaUncertaintyHelper.php +++ b/src/NodaUncertaintyHelper.php @@ -328,6 +328,14 @@ final class NodaUncertaintyHelper { } } + // If brackets are included in the name, try removing prefixes and suffixes + // from the beginning. + if (($bracketPos = strpos($ort_name, "(")) !== false) { + $start = substr($ort_name, 0, $bracketPos); + $end = substr($ort_name, $bracketPos); + $ort_name = self::cleanUncertaintyIndicatorsPlace($start) . ' ' . $end; + } + return self::trim($ort_name); } @@ -358,6 +366,13 @@ final class NodaUncertaintyHelper { } } + // If brackets are included in the name, try the same for everything up to the + // first brackets. + if (($bracketPos = strpos($ort_name, "(")) !== false) { + $name = substr($ort_name, 0, $bracketPos); + return self::guessPlaceCertainty($name); + } + return true; // Certain / no uncertainty found } diff --git a/tests/NodaUncertaintyHelperTest.php b/tests/NodaUncertaintyHelperTest.php index b8fc387..9c40c0f 100644 --- a/tests/NodaUncertaintyHelperTest.php +++ b/tests/NodaUncertaintyHelperTest.php @@ -8,6 +8,7 @@ declare(strict_types = 1); use PHPUnit\Framework\TestCase; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\Small; +use PHPUnit\Framework\Attributes\DataProvider; /** * This script contains tests for the uncertainty helper. @@ -16,111 +17,113 @@ use PHPUnit\Framework\Attributes\Small; #[CoversClass(\NodaUncertaintyHelper::class)] final class NodaUncertaintyHelperTest extends TestCase { /** - * Removes uncertainty indicators from an time name. + * Returns time names with expected cleaned version and expected parsed certainty. 
* - * @group ValidOutput - * @small - * - * @return void + * @return array */ - public function testCleanUncertaintyIndicatorsTime():void { + public static function uncertainTimesProvider():array { - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("wohl 1950")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950?")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950?,")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950,")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950")); + return [ + 'uncertainty prefix: "wohl 1950"' => ["wohl 1950", "1950", false], + 'uncertainty suffix: "1950?"' => ["1950?", "1950", false], + 'uncertainty suffix and superfluous chars: "1950 ?,"' => ["1950 ?,", "1950", false], + 'certain term with superfluous chars: "1950 ,"' => ["1950 ,", "1950", true], + 'certain term: 1950' => ["1950", "1950", true], + ]; } /** - * Attempts guessing whether time is uncertain. + * Returns place names with expected cleaned version and expected parsed certainty. * - * @group ValidOutput - * @small - * - * @return void + * @return array */ - public function testGuessTimeCertainty():void { + public static function uncertainPlacesProvider():array { - self::assertFalse(NodaUncertaintyHelper::guessTimeCertainty("wohl 1950")); - self::assertTrue(NodaUncertaintyHelper::guessTimeCertainty("1950")); + return [ + + 'uncertainty prefix: "wohl Berlin"' => ["wohl Berlin", "Berlin", false], + 'uncertainty prefix: "vermutl. Berlin"' => ["vermutl. 
Berlin", "Berlin", false], + 'uncertainty prefix and superfluous chars: "?-Berlin"' => ["?-Berlin", "Berlin", false], + 'uncertainty suffix: "Berlin?"' => ["Berlin?", "Berlin", false], + 'uncertainty suffix: "Berlin (?)"' => ["Berlin (?)", "Berlin", false], + 'uncertainty suffix and superfluous chars: "Berlin ?,"' => ["Berlin ?,", "Berlin", false], + 'certain term with superfluous chars: "Berlin ,"' => ["Berlin ,", "Berlin", true], + 'certain term: Berlin' => ["Berlin", "Berlin", true], + 'Berlin ? (Deutschland)' => ["Berlin ? (Deutschland)", "Berlin (Deutschland)", false], + 'Berli?n (Deutschland)' => ["Berl?n (Deutschland)", "Berl?n (Deutschland)", true], + + ]; } /** - * Removes uncertainty indicators from an place name. + * Returns actor names with expected cleaned version and expected parsed certainty. * - * @group ValidOutput - * @small - * - * @return void + * @return array */ - public static function testCleanUncertaintyIndicatorsPlace():void { + public static function uncertainPersinstProvider():array { - self::assertEquals("Berlin", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("wohl Berlin")); - self::assertEquals("Berlin", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("Berlin")); - - // Real-life examples that previously passed unencumbered - self::assertEquals("Augsburg", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("vermutlich: Augsburg")); - self::assertEquals("Augsburg", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("vermutl. Augsburg")); + return [ + 'uncertainty prefix: "wohl Barbarossa"' => ["wohl Barbarossa", "Barbarossa", false], + 'uncertainty prefix: "vermutl. Barbarossa"' => ["vermutl. 
Barbarossa", "Barbarossa", false], + 'uncertainty prefix and superfluous chars: "?-Barbarossa"' => ["?-Barbarossa", "Barbarossa", false], + 'uncertainty suffix: "Barbarossa?"' => ["Barbarossa?", "Barbarossa", false], + 'uncertainty suffix and superfluous chars: "Barbarossa ?,"' => ["Barbarossa ?,", "Barbarossa", false], + 'certain term with superfluous chars: "Barbarossa ,"' => ["Barbarossa ,", "Barbarossa", true], + 'certain term: Barbarossa' => ["Barbarossa", "Barbarossa", true], + ]; } /** - * Attempts guessing whether place is uncertain. + * Test to ensure times are correctly cleaned and parsed. * - * @group ValidOutput - * @small + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param boolean $target_certainty Expected output certainty. * * @return void */ - public static function testGuessPlaceCertainty():void { + #[DataProvider('uncertainTimesProvider')] + public function testParsingUncertaintyFromTimes(string $term, string $target_output, bool $target_certainty):void { - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutlich: Augsburg")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg")); - - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin")); - - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien")); + self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime($term)); + self::assertEquals($target_certainty, NodaUncertaintyHelper::guessTimeCertainty($term)); } /** - * Removes uncertainty indicators from an actor name. + * Test to ensure places are correctly cleaned and parsed. 
* - * @group ValidOutput - * @small + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param boolean $target_certainty Expected output certainty. * * @return void */ - public static function testCleanUncertaintyIndicatorsPersinst():void { + #[DataProvider('uncertainPlacesProvider')] + public function testParsingUncertaintyFromPlaces(string $term, string $target_output, bool $target_certainty):void { - self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("wohl Barbarossa")); - self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("Barbarossa")); - self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("?-Barbarossa")); + self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($term)); + self::assertEquals($target_certainty, NodaUncertaintyHelper::guessPlaceCertainty($term)); } /** - * Attempts guessing whether persinst is uncertain. + * Test to ensure actor names are correctly cleaned and parsed. * - * @group ValidOutput - * @small + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param boolean $target_certainty Expected output certainty. 
* * @return void */ - public static function testGuessPersinstCertainty():void { + #[DataProvider('uncertainPersinstProvider')] + public function testParsingUncertaintyFromPersinst(string $term, string $target_output, bool $target_certainty):void { - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa")); + self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst($term)); + self::assertEquals($target_certainty, NodaUncertaintyHelper::guessPersinstCertainty($term)); } } From 6f7ad13c4e6632a5827cd46b94e392a163272c5d Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sat, 9 Nov 2024 19:44:09 +0100 Subject: [PATCH 4/4] Add class NodaTagRelationIdentifier for parsing tag relation types from input tag names --- src/NodaTagRelationIdentifier.php | 54 +++++++++++++++++++++++++ tests/NodaTagRelationIdentifierTest.php | 50 +++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 src/NodaTagRelationIdentifier.php create mode 100644 tests/NodaTagRelationIdentifierTest.php diff --git a/src/NodaTagRelationIdentifier.php b/src/NodaTagRelationIdentifier.php new file mode 100644 index 0000000..719bc7b --- /dev/null +++ b/src/NodaTagRelationIdentifier.php @@ -0,0 +1,54 @@ + + */ +declare(strict_types = 1); + +/** + * Contains static functions for identifying uncertainty or blocking + * completely uncertain inputs for actors, times, and places. 
+ */ +final class NodaTagRelationIdentifier { + + private const SUFFIXES = [ + 'de' => [ + ' (Motiv)' => MDTagRelationType::display_subject, + ] + ]; + + public readonly string $name; + public readonly MDTagRelationType|false $relation; + + /** + * Constructor: Removes identifiers for well-known tag relations and determines cleaned name and relation type. + * + * @param string $lang Current language. + * @param string $input_string Input string to clean. + * + * @return void + */ + public function __construct(string $lang, string $input_string) { + + if (empty(self::SUFFIXES[$lang])) { + $this->name = $input_string; + $this->relation = false; + return; + } + + $relation = false; + + $suffixes = self::SUFFIXES[$lang]; + foreach (array_keys($suffixes) as $suffix) { + if (\mb_substr($input_string, \mb_strlen($suffix) * -1) === "$suffix") { + $input_string = \mb_substr($input_string, 0, \mb_strlen($suffix) * -1); + $relation = $suffixes[$suffix]; + } + } + + $this->name = $input_string; + $this->relation = $relation; + + } +} diff --git a/tests/NodaTagRelationIdentifierTest.php b/tests/NodaTagRelationIdentifierTest.php new file mode 100644 index 0000000..9470776 --- /dev/null +++ b/tests/NodaTagRelationIdentifierTest.php @@ -0,0 +1,50 @@ + + */ +declare(strict_types = 1); +use PHPUnit\Framework\TestCase; +use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\Small; +use PHPUnit\Framework\Attributes\DataProvider; + +/** + * This script contains tests for the tag relation identifier. + */ +#[small] +#[CoversClass(\NodaTagRelationIdentifier::class)] +final class NodaTagRelationIdentifierTest extends TestCase { + /** + * Returns input tag names with and without known suffixes signaling the relation type.
+ * + * @return array + */ + public static function tagNameAndRelationTypeProvider():array { + + return [ + 'Delfin' => ["Delfin", "Delfin", false], + 'Delfin (Motiv)' => ["Delfin (Motiv)", "Delfin", MDTagRelationType::display_subject], + ]; + + } + + /** + * Test to ensure tag names are correctly cleaned and their relation types identified. + * + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param false|MDTagRelationType $target_type Expected identified relation type (or false for lack thereof). + * + * @return void + */ + #[DataProvider('tagNameAndRelationTypeProvider')] + public function testIdentificationWorksCorrectly(string $term, string $target_output, false|MDTagRelationType $target_type):void { + + $identification = new NodaTagRelationIdentifier("de", $term); + self::assertEquals($target_output, $identification->name); + self::assertEquals($target_type, $identification->relation); + + } +}