Properly handle commas at the end of names when guessing certainty
This commit is contained in:
		| @@ -12,7 +12,7 @@ declare(strict_types = 1); | ||||
|  */ | ||||
| final class NodaUncertaintyHelper { | ||||
|  | ||||
|     const PERSINST_INDICATORS_DISALLOWED = [ | ||||
|     public const PERSINST_INDICATORS_DISALLOWED = [ | ||||
|         "Unbekannt", | ||||
|         "unbekannt", | ||||
|         "Anonymus", | ||||
| @@ -41,7 +41,7 @@ final class NodaUncertaintyHelper { | ||||
|         "Невідомий артист",     // Unknown artist | ||||
|     ]; | ||||
|  | ||||
|     const PERSINST_UNCERTAINTY_PREFIXES = [ | ||||
|     public const PERSINST_UNCERTAINTY_PREFIXES = [ | ||||
|         "verm. ", | ||||
|         "Verm. ", | ||||
|         "vermtl. ", | ||||
| @@ -57,7 +57,7 @@ final class NodaUncertaintyHelper { | ||||
|         "?", | ||||
|     ]; | ||||
|  | ||||
|     const PERSINST_UNCERTAINTY_SUFFIXES = [ | ||||
|     public const PERSINST_UNCERTAINTY_SUFFIXES = [ | ||||
|         "(?)", | ||||
|         "?", | ||||
|         " [vermutlich]", | ||||
| @@ -65,7 +65,7 @@ final class NodaUncertaintyHelper { | ||||
|         " [wahrscheinlich]", | ||||
|     ]; | ||||
|  | ||||
|         const TIME_INDICATORS_DISALLOWED = [ | ||||
|     public const TIME_INDICATORS_DISALLOWED = [ | ||||
|         "Nachgewiesen", | ||||
|         "nachgewiesen", | ||||
|         "o.D.", | ||||
| @@ -94,9 +94,9 @@ final class NodaUncertaintyHelper { | ||||
|         "Без датування", // No dating | ||||
|         "б.р.", // No dating | ||||
|         "б.д.", // No dating | ||||
|         ]; | ||||
|     ]; | ||||
|  | ||||
|         const TIME_UNCERTAINTY_PREFIXES = [ | ||||
|     public const TIME_UNCERTAINTY_PREFIXES = [ | ||||
|         "c. ", | ||||
|         "ca ", | ||||
|         "ca. ", | ||||
| @@ -130,9 +130,9 @@ final class NodaUncertaintyHelper { | ||||
|         "майже",      // UK: Almost / nearly / about | ||||
|         "орієнтовно", // UK: approximately | ||||
|         "Прибл.",     // UK: approximately | ||||
|         ]; | ||||
|     ]; | ||||
|  | ||||
|         const TIME_UNCERTAINTY_SUFFIXES = [ | ||||
|     public const TIME_UNCERTAINTY_SUFFIXES = [ | ||||
|         "(?)", | ||||
|         "?", | ||||
|         " (ca.)", | ||||
| @@ -145,12 +145,12 @@ final class NodaUncertaintyHelper { | ||||
|         ", um", | ||||
|         " (um)", | ||||
|         " (ок.)", | ||||
|         ]; | ||||
|     ]; | ||||
|  | ||||
|         /** | ||||
|          * Substrings used to express uncertainty about the validity of a place name. | ||||
|          */ | ||||
|         const PLACE_INDICATORS_DISALLOWED = [ | ||||
|     /** | ||||
|      * Substrings used to express uncertainty about the validity of a place name. | ||||
|      */ | ||||
|     public const PLACE_INDICATORS_DISALLOWED = [ | ||||
|         "Unbekannt", | ||||
|         "unbekannt", | ||||
|         "Unknown", | ||||
| @@ -173,9 +173,9 @@ final class NodaUncertaintyHelper { | ||||
|         "не вказано", // No place | ||||
|         "не вказане", // No place | ||||
|         "невідоме", // No place | ||||
|         ]; | ||||
|     ]; | ||||
|  | ||||
|         const PLACE_UNCERTAINTY_PREFIXES = [ | ||||
|     public const PLACE_UNCERTAINTY_PREFIXES = [ | ||||
|         "ca ", | ||||
|         "Ca ", | ||||
|         "ca. ", | ||||
| @@ -210,9 +210,9 @@ final class NodaUncertaintyHelper { | ||||
|         "Wahrscheinlich ", | ||||
|         "можливо", | ||||
|         "?", | ||||
|         ]; | ||||
|     ]; | ||||
|  | ||||
|         const PLACE_UNCERTAINTY_SUFFIXES = [ | ||||
|     public const PLACE_UNCERTAINTY_SUFFIXES = [ | ||||
|         "(?)", | ||||
|         "(vermutl.)", | ||||
|         "[vermutl.]", | ||||
| @@ -221,206 +221,206 @@ final class NodaUncertaintyHelper { | ||||
|         "(wohl)", | ||||
|         "[wohl]", | ||||
|         "?", | ||||
|         ]; | ||||
|     ]; | ||||
|  | ||||
|         /** | ||||
|          * Trims common characters and charater marks. | ||||
|          * | ||||
|          * @param string $input Input text. | ||||
|          * | ||||
|          * @return string | ||||
|          */ | ||||
|         public static function trim(string $input):string { | ||||
|     /** | ||||
|      * Trims common characters and charater marks. | ||||
|      * | ||||
|      * @param string $input Input text. | ||||
|      * | ||||
|      * @return string | ||||
|      */ | ||||
|     public static function trim(string $input):string { | ||||
|  | ||||
|             $input = \trim($input, ", \t\n\r\n;-:"); | ||||
|             return $input; | ||||
|         $input = \trim($input, ", \t\n\r\n;-:"); | ||||
|         return $input; | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Removes uncertainty indicators from an time name. | ||||
|      * | ||||
|      * @param string $name Input string. | ||||
|      * | ||||
|      * @return string | ||||
|      */ | ||||
|     public static function cleanUncertaintyIndicatorsTime(string $name):string { | ||||
|  | ||||
|         $name = self::trim($name); | ||||
|  | ||||
|         if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) { | ||||
|             return ""; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Removes uncertainty indicators from an time name. | ||||
|          * | ||||
|          * @param string $name Input string. | ||||
|          * | ||||
|          * @return string | ||||
|          */ | ||||
|         public static function cleanUncertaintyIndicatorsTime(string $name):string { | ||||
|  | ||||
|             $name = self::trim($name); | ||||
|  | ||||
|             if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) { | ||||
|                 return ""; | ||||
|         // Remove uncertainty prefixes | ||||
|         foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|             if (\substr($name, 0, \strlen($prefix)) === "$prefix") { | ||||
|                 $name = substr($name, \strlen($prefix)); | ||||
|             } | ||||
|  | ||||
|             // Remove uncertainty prefixes | ||||
|             foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|                 if (\substr($name, 0, \strlen($prefix)) === "$prefix") { | ||||
|                     $name = substr($name, \strlen($prefix)); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // Remove uncertainty sufixes | ||||
|             foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) { | ||||
|                 if (\substr($name, \strlen($suffix) * -1) === "$suffix") { | ||||
|                     $name = \substr($name, 0, \strlen($suffix) * -1); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             return self::trim($name); | ||||
|  | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Attempts guessing whether time is uncertain. Returns true if the name | ||||
|          * indicates certainty, false if it indicates uncertainty. | ||||
|          * | ||||
|          * @param string $zeit_name Time name. | ||||
|          * | ||||
|          * @return boolean | ||||
|          */ | ||||
|         public static function guessTimeCertainty(string $zeit_name):bool { | ||||
|  | ||||
|             $zeit_name = \strtolower($zeit_name); | ||||
|  | ||||
|             // Attempt to guess uncertainty based on prefixes. | ||||
|             foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|                 if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) { | ||||
|                     return false; // Uncertainty found | ||||
|                 } | ||||
|         // Remove uncertainty sufixes | ||||
|         foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) { | ||||
|             if (\substr($name, \strlen($suffix) * -1) === "$suffix") { | ||||
|                 $name = \substr($name, 0, \strlen($suffix) * -1); | ||||
|             } | ||||
|  | ||||
|             // Attempt to guess uncertainty based on prefixes. | ||||
|             foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) { | ||||
|                 if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) { | ||||
|                     return false; // Uncertainty found | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             return true; // No uncertainty found | ||||
|  | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Removes uncertainty indicators from an place name. | ||||
|          * | ||||
|          * @param string $ort_name Input string. | ||||
|          * | ||||
|          * @return string | ||||
|          */ | ||||
|         public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string { | ||||
|         return self::trim($name); | ||||
|  | ||||
|             $ort_name = self::trim($ort_name); | ||||
|     } | ||||
|  | ||||
|             if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) { | ||||
|                 return ""; | ||||
|     /** | ||||
|      * Attempts guessing whether time is uncertain. Returns true if the name | ||||
|      * indicates certainty, false if it indicates uncertainty. | ||||
|      * | ||||
|      * @param string $zeit_name Time name. | ||||
|      * | ||||
|      * @return boolean | ||||
|      */ | ||||
|     public static function guessTimeCertainty(string $zeit_name):bool { | ||||
|  | ||||
|         $zeit_name = \strtolower($zeit_name); | ||||
|  | ||||
|         // Attempt to guess uncertainty based on prefixes. | ||||
|         foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|             if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) { | ||||
|                 return false; // Uncertainty found | ||||
|             } | ||||
|  | ||||
|             // Remove uncertainty prefixes | ||||
|             foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|                 if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") { | ||||
|                     $ort_name = substr($ort_name, \strlen($prefix)); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // Remove uncertainty sufixes | ||||
|             foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { | ||||
|                 if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") { | ||||
|                     $ort_name = \substr($ort_name, 0, \strlen($suffix) * -1); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             return self::trim($ort_name); | ||||
|  | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Attempts guessing whether place is uncertain. Returns true if the name | ||||
|          * indicates certainty, false if it indicates uncertainty. | ||||
|          * | ||||
|          * @param string $ort_name Place name. | ||||
|          * | ||||
|          * @return boolean | ||||
|          */ | ||||
|         public static function guessPlaceCertainty(string $ort_name):bool { | ||||
|  | ||||
|             $ort_name = \strtolower($ort_name); | ||||
|  | ||||
|             // Attempt to guess uncertainty based on prefixes. | ||||
|             foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|                 if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) { | ||||
|                     return false; // Uncertain | ||||
|                 } | ||||
|         // Attempt to guess uncertainty based on prefixes. | ||||
|         foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) { | ||||
|             if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) { | ||||
|                 return false; // Uncertainty found | ||||
|             } | ||||
|  | ||||
|             // Attempt to guess uncertainty based on prefixes. | ||||
|             foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) { | ||||
|                 if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) { | ||||
|                     return false; // Uncertain | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             return true; // Certain / no uncertainty found | ||||
|  | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Removes uncertainty indicators from an actor name. | ||||
|          * | ||||
|          * @param string $value Input string. | ||||
|          * | ||||
|          * @return string | ||||
|          */ | ||||
|         public static function cleanUncertaintyIndicatorsPersinst(string $value):string { | ||||
|         return true; // No uncertainty found | ||||
|  | ||||
|             $value = self::trim($value); | ||||
|     } | ||||
|  | ||||
|             if (\in_array(trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) { | ||||
|                 return ""; | ||||
|             } | ||||
|     /** | ||||
|      * Removes uncertainty indicators from an place name. | ||||
|      * | ||||
|      * @param string $ort_name Input string. | ||||
|      * | ||||
|      * @return string | ||||
|      */ | ||||
|     public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string { | ||||
|  | ||||
|             foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) { | ||||
|                 if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) { | ||||
|                     $value = substr($value, \mb_strlen($toRemove)); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { | ||||
|                 if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") { | ||||
|                     $value = \mb_substr($value, 0, \mb_strlen($suffix) * -1); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             return self::trim($value); | ||||
|         $ort_name = self::trim($ort_name); | ||||
|  | ||||
|         if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) { | ||||
|             return ""; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Attempts guessing whether persinst is uncertain. Returns true if the name | ||||
|          * indicates certainty, false if it indicates uncertainty. | ||||
|          * | ||||
|          * @param string $name Persinst name. | ||||
|          * | ||||
|          * @return boolean | ||||
|          */ | ||||
|         public static function guessPersinstCertainty(string $name):bool { | ||||
|  | ||||
|             $name = \trim(\strtolower($name)); | ||||
|  | ||||
|             // Attempt to guess uncertainty based on prefixes. | ||||
|             foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|                 if (\substr($name, 0, \strlen($prefix)) === $prefix) { | ||||
|                     return false; // Uncertain | ||||
|                 } | ||||
|         // Remove uncertainty prefixes | ||||
|         foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { | ||||
|             if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") { | ||||
|                 $ort_name = substr($ort_name, \strlen($prefix)); | ||||
|             } | ||||
|  | ||||
|             // Attempt to guess uncertainty based on prefixes. | ||||
|             foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) { | ||||
|                 if (\substr($name, -1 * \strlen($prefix)) === $prefix) { | ||||
|                     return false; // Uncertain | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             return true; // Certain / no uncertainty found | ||||
|  | ||||
|         } | ||||
|  | ||||
|         // Remove uncertainty sufixes | ||||
|         foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) { | ||||
|             if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") { | ||||
|                 $ort_name = \substr($ort_name, 0, \strlen($suffix) * -1); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         return self::trim($ort_name); | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Guesses whether a place name is given with certainty. | ||||
|      * | ||||
|      * The name is lowercased and stripped of surrounding commas, spaces, | ||||
|      * semicolons, hyphens and underscores before it is compared with the | ||||
|      * known uncertainty markers. Because of the lowercasing, markers that | ||||
|      * contain ASCII uppercase letters cannot match here. | ||||
|      * | ||||
|      * @param string $ort_name Place name. | ||||
|      * | ||||
|      * @return boolean True if no uncertainty marker was found, false otherwise. | ||||
|      */ | ||||
|     public static function guessPlaceCertainty(string $ort_name):bool { | ||||
|  | ||||
|         $normalized = \trim(\strtolower($ort_name), ', ;-_'); | ||||
|  | ||||
|         // A leading uncertainty marker flags the place as uncertain. | ||||
|         foreach (self::PLACE_UNCERTAINTY_PREFIXES as $marker) { | ||||
|             $length = \strlen($marker); | ||||
|             if (\substr($normalized, 0, $length) === $marker) { | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // A trailing uncertainty marker flags the place as uncertain as well. | ||||
|         foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $marker) { | ||||
|             $length = \strlen($marker); | ||||
|             if (\substr($normalized, -$length) === $marker) { | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Neither end of the name carries an uncertainty marker. | ||||
|         return true; | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Removes uncertainty indicators from an actor name. | ||||
|      * | ||||
|      * Returns an empty string if the trimmed name is listed in | ||||
|      * PERSINST_INDICATORS_DISALLOWED. Otherwise known uncertainty prefixes | ||||
|      * and suffixes are cut off and the remainder is trimmed again. | ||||
|      * | ||||
|      * @param string $value Input string. | ||||
|      * | ||||
|      * @return string | ||||
|      */ | ||||
|     public static function cleanUncertaintyIndicatorsPersinst(string $value):string { | ||||
|  | ||||
|         $value = self::trim($value); | ||||
|  | ||||
|         // Names that only state "unknown" carry no usable actor name. | ||||
|         if (\in_array(\trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) { | ||||
|             return ""; | ||||
|         } | ||||
|  | ||||
|         foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) { | ||||
|             $length = \mb_strlen($toRemove); | ||||
|             if (\mb_substr($value, 0, $length) === $toRemove) { | ||||
|                 // The length is counted in characters, so the cut has to be | ||||
|                 // character-based too; a byte-based substr() would split | ||||
|                 // multibyte prefixes in the middle of a character. | ||||
|                 $value = \mb_substr($value, $length); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // The place suffixes are checked first, in their original order, so | ||||
|         // that everything stripped so far is still stripped. The actor-specific | ||||
|         // suffixes follow, so names flagged by guessPersinstCertainty() are | ||||
|         // cleaned as well. | ||||
|         $suffixes = \array_unique(\array_merge( | ||||
|             self::PLACE_UNCERTAINTY_SUFFIXES, | ||||
|             self::PERSINST_UNCERTAINTY_SUFFIXES | ||||
|         )); | ||||
|  | ||||
|         foreach ($suffixes as $suffix) { | ||||
|             $length = \mb_strlen($suffix); | ||||
|             if (\mb_substr($value, $length * -1) === $suffix) { | ||||
|                 $value = \mb_substr($value, 0, $length * -1); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         return self::trim($value); | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Guesses whether an actor (persinst) name is given with certainty. | ||||
|      * | ||||
|      * The name is lowercased and stripped of surrounding commas, spaces, | ||||
|      * semicolons, hyphens and underscores before it is compared with the | ||||
|      * known uncertainty markers. Because of the lowercasing, markers that | ||||
|      * contain ASCII uppercase letters cannot match here. | ||||
|      * | ||||
|      * @param string $name Persinst name. | ||||
|      * | ||||
|      * @return boolean True if no uncertainty marker was found, false otherwise. | ||||
|      */ | ||||
|     public static function guessPersinstCertainty(string $name):bool { | ||||
|  | ||||
|         $normalized = \trim(\strtolower($name), ', ;-_'); | ||||
|  | ||||
|         // A leading uncertainty marker flags the actor as uncertain. | ||||
|         foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $marker) { | ||||
|             $length = \strlen($marker); | ||||
|             if (\substr($normalized, 0, $length) === $marker) { | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // A trailing uncertainty marker flags the actor as uncertain as well. | ||||
|         foreach (self::PERSINST_UNCERTAINTY_SUFFIXES as $marker) { | ||||
|             $length = \strlen($marker); | ||||
|             if (\substr($normalized, -$length) === $marker) { | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Neither end of the name carries an uncertainty marker. | ||||
|         return true; | ||||
|  | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -6,12 +6,14 @@ | ||||
|  */ | ||||
| declare(strict_types = 1); | ||||
| use PHPUnit\Framework\TestCase; | ||||
| use PHPUnit\Framework\Attributes\CoversClass; | ||||
| use PHPUnit\Framework\Attributes\Small; | ||||
|  | ||||
| /** | ||||
|  * This script contains tests for the uncertainty helper. | ||||
|  * | ||||
|  * @covers \NodaUncertaintyHelper | ||||
|  */ | ||||
| #[small] | ||||
| #[CoversClass(\NodaUncertaintyHelper::class)] | ||||
| final class NodaUncertaintyHelperTest extends TestCase { | ||||
|     /** | ||||
|      * Removes uncertainty indicators from an time name. | ||||
| @@ -76,6 +78,9 @@ final class NodaUncertaintyHelperTest extends TestCase { | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg")); | ||||
|  | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin")); | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?")); | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?,")); | ||||
|         self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin,")); | ||||
|         self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin")); | ||||
|  | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien")); | ||||
| @@ -109,6 +114,9 @@ final class NodaUncertaintyHelperTest extends TestCase { | ||||
|     public static function testGuessPersinstCertainty():void { | ||||
|  | ||||
|         // Exercise the persinst guesser itself; calling guessPlaceCertainty() | ||||
|         // here would leave guessPersinstCertainty() untested. | ||||
|         // NOTE(review): the first assertion assumes "wohl " is listed in | ||||
|         // PERSINST_UNCERTAINTY_PREFIXES - confirm against the constant. | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPersinstCertainty("wohl Barbarossa")); | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPersinstCertainty("Barbarossa?")); | ||||
|         // A trailing comma must not hide the uncertainty marker ... | ||||
|         self::assertFalse(NodaUncertaintyHelper::guessPersinstCertainty("Barbarossa?,")); | ||||
|         // ... and must not be mistaken for one either. | ||||
|         self::assertTrue(NodaUncertaintyHelper::guessPersinstCertainty("Barbarossa,")); | ||||
|         self::assertTrue(NodaUncertaintyHelper::guessPersinstCertainty("Barbarossa")); | ||||
|  | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user