From 48355a6a365a3a35fed059581a7628e45cf7877c Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sat, 9 Nov 2024 18:42:18 +0100 Subject: [PATCH] Identify uncertainty before brackets ("Berlin ? (Germany)" > "Berlin (Germany)" + Uncertain) --- src/NodaUncertaintyHelper.php | 15 ++++ tests/NodaUncertaintyHelperTest.php | 125 ++++++++++++++-------------- 2 files changed, 79 insertions(+), 61 deletions(-) diff --git a/src/NodaUncertaintyHelper.php b/src/NodaUncertaintyHelper.php index c9f2a19..fd378ef 100644 --- a/src/NodaUncertaintyHelper.php +++ b/src/NodaUncertaintyHelper.php @@ -328,6 +328,14 @@ final class NodaUncertaintyHelper { } } + // If brackets are included in the name, try removing uncertainty indicators + // from the part before the first opening bracket. + if (($bracketPos = strpos($ort_name, "(")) !== false) { + $start = substr($ort_name, 0, $bracketPos); + $end = substr($ort_name, $bracketPos); + $ort_name = self::cleanUncertaintyIndicatorsPlace($start) . ' ' . $end; + } + return self::trim($ort_name); } @@ -358,6 +366,13 @@ final class NodaUncertaintyHelper { } } + // If brackets are included in the name, run the same certainty check on + // everything before the first opening bracket. + if (($bracketPos = strpos($ort_name, "(")) !== false) { + $name = substr($ort_name, 0, $bracketPos); + return self::guessPlaceCertainty($name); + } + return true; // Certain / no uncertainty found } diff --git a/tests/NodaUncertaintyHelperTest.php b/tests/NodaUncertaintyHelperTest.php index b8fc387..9c40c0f 100644 --- a/tests/NodaUncertaintyHelperTest.php +++ b/tests/NodaUncertaintyHelperTest.php @@ -8,6 +8,7 @@ declare(strict_types = 1); use PHPUnit\Framework\TestCase; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\Small; +use PHPUnit\Framework\Attributes\DataProvider; /** * This script contains tests for the uncertainty helper. 
@@ -16,111 +17,113 @@ use PHPUnit\Framework\Attributes\Small; #[CoversClass(\NodaUncertaintyHelper::class)] final class NodaUncertaintyHelperTest extends TestCase { /** - * Removes uncertainty indicators from an time name. + * Returns time names with expected cleaned version and expected parsed certainty. * - * @group ValidOutput - * @small - * - * @return void + * @return array */ - public function testCleanUncertaintyIndicatorsTime():void { + public static function uncertainTimesProvider():array { - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("wohl 1950")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950?")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950?,")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950,")); - self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950")); + return [ + 'uncertainty prefix: "wohl 1950"' => ["wohl 1950", "1950", false], + 'uncertainty suffix: "1950?"' => ["1950?", "1950", false], + 'uncertainty suffix and superfluous chars: "1950 ?,"' => ["1950 ?,", "1950", false], + 'certain term with superfluous chars: "1950 ,"' => ["1950 ,", "1950", true], + 'certain term: 1950' => ["1950", "1950", true], + ]; } /** - * Attempts guessing whether time is uncertain. + * Returns place names with expected cleaned version and expected parsed certainty. * - * @group ValidOutput - * @small - * - * @return void + * @return array */ - public function testGuessTimeCertainty():void { + public static function uncertainPlacesProvider():array { - self::assertFalse(NodaUncertaintyHelper::guessTimeCertainty("wohl 1950")); - self::assertTrue(NodaUncertaintyHelper::guessTimeCertainty("1950")); + return [ + + 'uncertainty prefix: "wohl Berlin"' => ["wohl Berlin", "Berlin", false], + 'uncertainty prefix: "vermutl. Berlin"' => ["vermutl. 
Berlin", "Berlin", false], + 'uncertainty prefix and superfluous chars: "?-Berlin"' => ["?-Berlin", "Berlin", false], + 'uncertainty suffix: "Berlin?"' => ["Berlin?", "Berlin", false], + 'uncertainty suffix: "Berlin (?)"' => ["Berlin (?)", "Berlin", false], + 'uncertainty suffix and superfluous chars: "Berlin ?,"' => ["Berlin ?,", "Berlin", false], + 'certain term with superfluous chars: "Berlin ,"' => ["Berlin ,", "Berlin", true], + 'certain term: Berlin' => ["Berlin", "Berlin", true], + 'Berlin ? (Deutschland)' => ["Berlin ? (Deutschland)", "Berlin (Deutschland)", false], + 'Berli?n (Deutschland)' => ["Berl?n (Deutschland)", "Berl?n (Deutschland)", true], + + ]; } /** - * Removes uncertainty indicators from an place name. + * Returns actor names with expected cleaned version and expected parsed certainty. * - * @group ValidOutput - * @small - * - * @return void + * @return array */ - public static function testCleanUncertaintyIndicatorsPlace():void { + public static function uncertainPersinstProvider():array { - self::assertEquals("Berlin", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("wohl Berlin")); - self::assertEquals("Berlin", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("Berlin")); - - // Real-life examples that previously passed unencumbered - self::assertEquals("Augsburg", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("vermutlich: Augsburg")); - self::assertEquals("Augsburg", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("vermutl. Augsburg")); + return [ + 'uncertainty prefix: "wohl Barbarossa"' => ["wohl Barbarossa", "Barbarossa", false], + 'uncertainty prefix: "vermutl. Barbarossa"' => ["vermutl. 
Barbarossa", "Barbarossa", false], + 'uncertainty prefix and superfluous chars: "?-Barbarossa"' => ["?-Barbarossa", "Barbarossa", false], + 'uncertainty suffix: "Barbarossa?"' => ["Barbarossa?", "Barbarossa", false], + 'uncertainty suffix and superfluous chars: "Barbarossa ?,"' => ["Barbarossa ?,", "Barbarossa", false], + 'certain term with superfluous chars: "Barbarossa ,"' => ["Barbarossa ,", "Barbarossa", true], + 'certain term: Barbarossa' => ["Barbarossa", "Barbarossa", true], + ]; } /** - * Attempts guessing whether place is uncertain. + * Test to ensure times are correctly cleaned and parsed. * - * @group ValidOutput - * @small + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param boolean $target_certainty Expected output certainty. * * @return void */ - public static function testGuessPlaceCertainty():void { + #[DataProvider('uncertainTimesProvider')] + public function testParsingUncertaintyFromTimes(string $term, string $target_output, bool $target_certainty):void { - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutlich: Augsburg")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg")); - - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin")); - - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien")); + self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime($term)); + self::assertEquals($target_certainty, NodaUncertaintyHelper::guessTimeCertainty($term)); } /** - * Removes uncertainty indicators from an actor name. + * Test to ensure places are correctly cleaned and parsed. 
* - * @group ValidOutput - * @small + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param boolean $target_certainty Expected output certainty. * * @return void */ - public static function testCleanUncertaintyIndicatorsPersinst():void { + #[DataProvider('uncertainPlacesProvider')] + public function testParsingUncertaintyFromPlaces(string $term, string $target_output, bool $target_certainty):void { - self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("wohl Barbarossa")); - self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("Barbarossa")); - self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("?-Barbarossa")); + self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($term)); + self::assertEquals($target_certainty, NodaUncertaintyHelper::guessPlaceCertainty($term)); } /** - * Attempts guessing whether persinst is uncertain. + * Test to ensure actor names are correctly cleaned and parsed. * - * @group ValidOutput - * @small + * @param string $term Term to check. + * @param string $target_output Expected output name. + * @param boolean $target_certainty Expected output certainty. 
* * @return void */ - public static function testGuessPersinstCertainty():void { + #[DataProvider('uncertainPersinstProvider')] + public function testParsingUncertaintyFromPersinst(string $term, string $target_output, bool $target_certainty):void { - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?")); - self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa,")); - self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa")); + self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst($term)); + self::assertEquals($target_certainty, NodaUncertaintyHelper::guessPersinstCertainty($term)); } }