Properly handle commas at the end of names when guessing certainty

This commit is contained in:
Joshua Ramon Enslin 2024-11-09 15:33:49 +01:00
parent eb371d4270
commit 29ca05f552
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
2 changed files with 202 additions and 194 deletions

View File

@ -12,7 +12,7 @@ declare(strict_types = 1);
*/ */
final class NodaUncertaintyHelper { final class NodaUncertaintyHelper {
const PERSINST_INDICATORS_DISALLOWED = [ public const PERSINST_INDICATORS_DISALLOWED = [
"Unbekannt", "Unbekannt",
"unbekannt", "unbekannt",
"Anonymus", "Anonymus",
@ -41,7 +41,7 @@ final class NodaUncertaintyHelper {
"Невідомий артист", // Unknown artist "Невідомий артист", // Unknown artist
]; ];
const PERSINST_UNCERTAINTY_PREFIXES = [ public const PERSINST_UNCERTAINTY_PREFIXES = [
"verm. ", "verm. ",
"Verm. ", "Verm. ",
"vermtl. ", "vermtl. ",
@ -57,7 +57,7 @@ final class NodaUncertaintyHelper {
"?", "?",
]; ];
const PERSINST_UNCERTAINTY_SUFFIXES = [ public const PERSINST_UNCERTAINTY_SUFFIXES = [
"(?)", "(?)",
"?", "?",
" [vermutlich]", " [vermutlich]",
@ -65,7 +65,7 @@ final class NodaUncertaintyHelper {
" [wahrscheinlich]", " [wahrscheinlich]",
]; ];
const TIME_INDICATORS_DISALLOWED = [ public const TIME_INDICATORS_DISALLOWED = [
"Nachgewiesen", "Nachgewiesen",
"nachgewiesen", "nachgewiesen",
"o.D.", "o.D.",
@ -94,9 +94,9 @@ final class NodaUncertaintyHelper {
"Без датування", // No dating "Без датування", // No dating
"б.р.", // No dating "б.р.", // No dating
"б.д.", // No dating "б.д.", // No dating
]; ];
const TIME_UNCERTAINTY_PREFIXES = [ public const TIME_UNCERTAINTY_PREFIXES = [
"c. ", "c. ",
"ca ", "ca ",
"ca. ", "ca. ",
@ -130,9 +130,9 @@ final class NodaUncertaintyHelper {
"майже", // UK: Almost / nearly / about "майже", // UK: Almost / nearly / about
"орієнтовно", // UK: approximately "орієнтовно", // UK: approximately
"Прибл.", // UK: approximately "Прибл.", // UK: approximately
]; ];
const TIME_UNCERTAINTY_SUFFIXES = [ public const TIME_UNCERTAINTY_SUFFIXES = [
"(?)", "(?)",
"?", "?",
" (ca.)", " (ca.)",
@ -145,12 +145,12 @@ final class NodaUncertaintyHelper {
", um", ", um",
" (um)", " (um)",
" (ок.)", " (ок.)",
]; ];
/** /**
* Substrings used to express uncertainty about the validity of a place name. * Substrings used to express uncertainty about the validity of a place name.
*/ */
const PLACE_INDICATORS_DISALLOWED = [ public const PLACE_INDICATORS_DISALLOWED = [
"Unbekannt", "Unbekannt",
"unbekannt", "unbekannt",
"Unknown", "Unknown",
@ -173,9 +173,9 @@ final class NodaUncertaintyHelper {
"не вказано", // No place "не вказано", // No place
"не вказане", // No place "не вказане", // No place
"невідоме", // No place "невідоме", // No place
]; ];
const PLACE_UNCERTAINTY_PREFIXES = [ public const PLACE_UNCERTAINTY_PREFIXES = [
"ca ", "ca ",
"Ca ", "Ca ",
"ca. ", "ca. ",
@ -210,9 +210,9 @@ final class NodaUncertaintyHelper {
"Wahrscheinlich ", "Wahrscheinlich ",
"можливо", "можливо",
"?", "?",
]; ];
const PLACE_UNCERTAINTY_SUFFIXES = [ public const PLACE_UNCERTAINTY_SUFFIXES = [
"(?)", "(?)",
"(vermutl.)", "(vermutl.)",
"[vermutl.]", "[vermutl.]",
@ -221,206 +221,206 @@ final class NodaUncertaintyHelper {
"(wohl)", "(wohl)",
"[wohl]", "[wohl]",
"?", "?",
]; ];
/** /**
* Trims common characters and charater marks. * Trims common characters and charater marks.
* *
* @param string $input Input text. * @param string $input Input text.
* *
* @return string * @return string
*/ */
public static function trim(string $input):string { public static function trim(string $input):string {
$input = \trim($input, ", \t\n\r\n;-:"); $input = \trim($input, ", \t\n\r\n;-:");
return $input; return $input;
}
/**
* Removes uncertainty indicators from a time name.
*
* @param string $name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsTime(string $name):string {
$name = self::trim($name);
if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
return "";
} }
/** // Remove uncertainty prefixes
* Removes uncertainty indicators from an time name. foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) {
* if (\substr($name, 0, \strlen($prefix)) === "$prefix") {
* @param string $name Input string. $name = substr($name, \strlen($prefix));
*
* @return string
*/
public static function cleanUncertaintyIndicatorsTime(string $name):string {
$name = self::trim($name);
if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
return "";
} }
// Remove uncertainty prefixes
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($name, 0, \strlen($prefix)) === "$prefix") {
$name = substr($name, \strlen($prefix));
}
}
// Remove uncertainty sufixes
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($name, \strlen($suffix) * -1) === "$suffix") {
$name = \substr($name, 0, \strlen($suffix) * -1);
}
}
return self::trim($name);
} }
/** // Remove uncertainty sufixes
* Attempts guessing whether time is uncertain. Returns true if the name foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) {
* indicates certainty, false if it indicates uncertainty. if (\substr($name, \strlen($suffix) * -1) === "$suffix") {
* $name = \substr($name, 0, \strlen($suffix) * -1);
* @param string $zeit_name Time name.
*
* @return boolean
*/
public static function guessTimeCertainty(string $zeit_name):bool {
$zeit_name = \strtolower($zeit_name);
// Attempt to guess uncertainty based on prefixes.
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
}
} }
// Attempt to guess uncertainty based on prefixes.
foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
}
}
return true; // No uncertainty found
} }
/** return self::trim($name);
* Removes uncertainty indicators from an place name.
*
* @param string $ort_name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {
$ort_name = self::trim($ort_name); }
if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) { /**
return ""; * Attempts guessing whether time is uncertain. Returns true if the name
* indicates certainty, false if it indicates uncertainty.
*
* @param string $zeit_name Time name.
*
* @return boolean
*/
public static function guessTimeCertainty(string $zeit_name):bool {
$zeit_name = \strtolower($zeit_name);
// Attempt to guess uncertainty based on prefixes.
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
} }
// Remove uncertainty prefixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") {
$ort_name = substr($ort_name, \strlen($prefix));
}
}
// Remove uncertainty sufixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") {
$ort_name = \substr($ort_name, 0, \strlen($suffix) * -1);
}
}
return self::trim($ort_name);
} }
/** // Attempt to guess uncertainty based on prefixes.
* Attempts guessing whether place is uncertain. Returns true if the name foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) {
* indicates certainty, false if it indicates uncertainty. if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) {
* return false; // Uncertainty found
* @param string $ort_name Place name.
*
* @return boolean
*/
public static function guessPlaceCertainty(string $ort_name):bool {
$ort_name = \strtolower($ort_name);
// Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
} }
// Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
}
return true; // Certain / no uncertainty found
} }
/** return true; // No uncertainty found
* Removes uncertainty indicators from an actor name.
*
* @param string $value Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsPersinst(string $value):string {
$value = self::trim($value); }
if (\in_array(trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) { /**
return ""; * Removes uncertainty indicators from an place name.
} *
* @param string $ort_name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {
foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) { $ort_name = self::trim($ort_name);
if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) {
$value = substr($value, \mb_strlen($toRemove));
}
}
foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") {
$value = \mb_substr($value, 0, \mb_strlen($suffix) * -1);
}
}
return self::trim($value);
if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
return "";
} }
/** // Remove uncertainty prefixes
* Attempts guessing whether persinst is uncertain. Returns true if the name foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
* indicates certainty, false if it indicates uncertainty. if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") {
* $ort_name = substr($ort_name, \strlen($prefix));
* @param string $name Persinst name.
*
* @return boolean
*/
public static function guessPersinstCertainty(string $name):bool {
$name = \trim(\strtolower($name));
// Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
} }
// Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
}
return true; // Certain / no uncertainty found
} }
// Remove uncertainty suffixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") {
$ort_name = \substr($ort_name, 0, \strlen($suffix) * -1);
}
}
return self::trim($ort_name);
}
/**
 * Attempts guessing whether place is uncertain. Returns true if the name
 * indicates certainty, false if it indicates uncertainty.
 *
 * Before matching, the name is lower-cased and stripped of surrounding
 * separator characters (comma, space, semicolon, hyphen, underscore), so
 * that e.g. "Berlin?," is still recognised as uncertain. Lower-casing is
 * multibyte-aware and is applied to the indicators as well, which makes the
 * comparison case-insensitive for non-ASCII indicators, too.
 *
 * @param string $ort_name Place name.
 *
 * @return boolean
 */
public static function guessPlaceCertainty(string $ort_name):bool {

    $ort_name = \trim(\mb_strtolower($ort_name, 'UTF-8'), ', ;-_');

    // Attempt to guess uncertainty based on prefixes.
    foreach (self::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
        // The indicator is folded the same way as the name; otherwise
        // capitalised list entries could never match the lower-cased name.
        if (\str_starts_with($ort_name, \mb_strtolower($prefix, 'UTF-8'))) {
            return false; // Uncertain
        }
    }

    // Attempt to guess uncertainty based on suffixes.
    foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
        if (\str_ends_with($ort_name, \mb_strtolower($suffix, 'UTF-8'))) {
            return false; // Uncertain
        }
    }

    return true; // Certain / no uncertainty found

}
/**
 * Removes uncertainty indicators from an actor name.
 *
 * Returns an empty string if the name, ignoring surrounding semicolons, dots
 * and spaces, is one of the disallowed placeholder names. Otherwise, known
 * uncertainty prefixes and suffixes are cut off and the remainder is trimmed.
 *
 * @param string $value Input string.
 *
 * @return string
 */
public static function cleanUncertaintyIndicatorsPersinst(string $value):string {

    $value = self::trim($value);

    if (\in_array(\trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) {
        return "";
    }

    // Remove uncertainty prefixes. Matching and cutting are both byte-based,
    // so a multibyte prefix is always removed in full (mixing character
    // counts with a byte-based cut would split characters).
    foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {
        if (\str_starts_with($value, $prefix)) {
            $value = \substr($value, \strlen($prefix));
        }
    }

    // Remove uncertainty suffixes. The place suffixes are applied first, as
    // they always have been for actor names; the actor-specific suffixes
    // follow, so that every suffix guessPersinstCertainty() reacts to is
    // also removed from the cleaned name.
    $suffixes = [...self::PLACE_UNCERTAINTY_SUFFIXES, ...self::PERSINST_UNCERTAINTY_SUFFIXES];
    foreach ($suffixes as $suffix) {
        // Guard against empty entries: cutting zero bytes from the end
        // would otherwise empty the whole name.
        if ($suffix !== '' && \str_ends_with($value, $suffix)) {
            $value = \substr($value, 0, -\strlen($suffix));
        }
    }

    return self::trim($value);

}
/**
 * Attempts guessing whether persinst is uncertain. Returns true if the name
 * indicates certainty, false if it indicates uncertainty.
 *
 * Before matching, the name is lower-cased and stripped of surrounding
 * separator characters (comma, space, semicolon, hyphen, underscore), so
 * that e.g. "Barbarossa?," is still recognised as uncertain. Lower-casing is
 * multibyte-aware and is applied to the indicators as well, which makes the
 * comparison case-insensitive for non-ASCII indicators, too.
 *
 * @param string $name Persinst name.
 *
 * @return boolean
 */
public static function guessPersinstCertainty(string $name):bool {

    $name = \trim(\mb_strtolower($name, 'UTF-8'), ', ;-_');

    // Attempt to guess uncertainty based on prefixes.
    foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {
        // The indicator is folded the same way as the name; otherwise
        // capitalised list entries such as "Verm. " could never match.
        if (\str_starts_with($name, \mb_strtolower($prefix, 'UTF-8'))) {
            return false; // Uncertain
        }
    }

    // Attempt to guess uncertainty based on suffixes.
    foreach (self::PERSINST_UNCERTAINTY_SUFFIXES as $suffix) {
        if (\str_ends_with($name, \mb_strtolower($suffix, 'UTF-8'))) {
            return false; // Uncertain
        }
    }

    return true; // Certain / no uncertainty found

}
} }

View File

@ -6,12 +6,14 @@
*/ */
declare(strict_types = 1); declare(strict_types = 1);
use PHPUnit\Framework\TestCase; use PHPUnit\Framework\TestCase;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\Small;
/** /**
* This script contains tests for the uncertainty helper. * This script contains tests for the uncertainty helper.
*
* @covers \NodaUncertaintyHelper
*/ */
#[small]
#[CoversClass(\NodaUncertaintyHelper::class)]
final class NodaUncertaintyHelperTest extends TestCase { final class NodaUncertaintyHelperTest extends TestCase {
/** /**
* Removes uncertainty indicators from an time name. * Removes uncertainty indicators from an time name.
@ -76,6 +78,9 @@ final class NodaUncertaintyHelperTest extends TestCase {
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg")); self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin")); self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?,"));
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin,"));
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin")); self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien")); self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien"));
@ -109,6 +114,9 @@ final class NodaUncertaintyHelperTest extends TestCase {
public static function testGuessPersinstCertainty():void { public static function testGuessPersinstCertainty():void {
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa")); self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?,"));
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa,"));
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa")); self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa"));
} }