Merge branch 'master' of gitea:museum-digital/MDNodaHelpers

This commit is contained in:
Joshua Ramon Enslin 2024-11-11 09:11:35 +01:00
commit c72ad51dda
4 changed files with 378 additions and 246 deletions

View File

@ -0,0 +1,54 @@
<?PHP
/**
* Identifies the type of tag relation to an object based on known suffixes.
*
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
*/
declare(strict_types = 1);
/**
 * Identifies the relation type between a tag and an object based on
 * well-known suffixes appended to the tag name (e.g. "Delfin (Motiv)").
 *
 * The cleaned tag name and the identified relation type are exposed as
 * read-only properties once the object has been constructed.
 */
final class NodaTagRelationIdentifier {

    /**
     * Known suffixes by language code, each mapped to the relation type it signals.
     * The suffixes include their leading space, so that removing one does not
     * leave a trailing blank behind.
     */
    private const SUFFIXES = [
        'de' => [
            ' (Motiv)' => MDTagRelationType::display_subject,
        ],
    ];

    /** @var string Input name with any recognized suffix removed. */
    public readonly string $name;

    /** @var MDTagRelationType|false Relation type signaled by the suffix; false if none was recognized. */
    public readonly MDTagRelationType|false $relation;

    /**
     * Constructor: Removes identifiers for well-known tag relations and determines cleaned name and relation type.
     *
     * If no suffixes are registered for the given language, the input is kept
     * unchanged and the relation is set to false.
     *
     * @param string $lang         Current language.
     * @param string $input_string Input string to clean.
     *
     * @return void
     */
    public function __construct(string $lang, string $input_string) {

        $relation = false;

        // Languages without registered suffixes yield an empty list, so the
        // loop is skipped and the input passes through untouched.
        foreach (self::SUFFIXES[$lang] ?? [] as $suffix => $type) {
            // Matching is deliberately not stopped at the first hit: each
            // matching suffix is stripped in turn and the last one wins.
            if (\str_ends_with($input_string, $suffix)) {
                $input_string = \mb_substr($input_string, 0, -\mb_strlen($suffix));
                $relation = $type;
            }
        }

        $this->name = $input_string;
        $this->relation = $relation;

    }
}

View File

@ -12,7 +12,7 @@ declare(strict_types = 1);
*/ */
final class NodaUncertaintyHelper { final class NodaUncertaintyHelper {
const PERSINST_INDICATORS_DISALLOWED = [ public const PERSINST_INDICATORS_DISALLOWED = [
"Unbekannt", "Unbekannt",
"unbekannt", "unbekannt",
"Anonymus", "Anonymus",
@ -41,7 +41,7 @@ final class NodaUncertaintyHelper {
"Невідомий артист", // Unknown artist "Невідомий артист", // Unknown artist
]; ];
const PERSINST_UNCERTAINTY_PREFIXES = [ public const PERSINST_UNCERTAINTY_PREFIXES = [
"verm. ", "verm. ",
"Verm. ", "Verm. ",
"vermtl. ", "vermtl. ",
@ -57,7 +57,7 @@ final class NodaUncertaintyHelper {
"?", "?",
]; ];
const PERSINST_UNCERTAINTY_SUFFIXES = [ public const PERSINST_UNCERTAINTY_SUFFIXES = [
"(?)", "(?)",
"?", "?",
" [vermutlich]", " [vermutlich]",
@ -65,7 +65,7 @@ final class NodaUncertaintyHelper {
" [wahrscheinlich]", " [wahrscheinlich]",
]; ];
const TIME_INDICATORS_DISALLOWED = [ public const TIME_INDICATORS_DISALLOWED = [
"Nachgewiesen", "Nachgewiesen",
"nachgewiesen", "nachgewiesen",
"o.D.", "o.D.",
@ -96,7 +96,7 @@ final class NodaUncertaintyHelper {
"б.д.", // No dating "б.д.", // No dating
]; ];
const TIME_UNCERTAINTY_PREFIXES = [ public const TIME_UNCERTAINTY_PREFIXES = [
"c. ", "c. ",
"ca ", "ca ",
"ca. ", "ca. ",
@ -132,7 +132,7 @@ final class NodaUncertaintyHelper {
"Прибл.", // UK: approximately "Прибл.", // UK: approximately
]; ];
const TIME_UNCERTAINTY_SUFFIXES = [ public const TIME_UNCERTAINTY_SUFFIXES = [
"(?)", "(?)",
"?", "?",
" (ca.)", " (ca.)",
@ -150,7 +150,7 @@ final class NodaUncertaintyHelper {
/** /**
* Substrings used to express uncertainty about the validity of a place name. * Substrings used to express uncertainty about the validity of a place name.
*/ */
const PLACE_INDICATORS_DISALLOWED = [ public const PLACE_INDICATORS_DISALLOWED = [
"Unbekannt", "Unbekannt",
"unbekannt", "unbekannt",
"Unknown", "Unknown",
@ -175,7 +175,7 @@ final class NodaUncertaintyHelper {
"невідоме", // No place "невідоме", // No place
]; ];
const PLACE_UNCERTAINTY_PREFIXES = [ public const PLACE_UNCERTAINTY_PREFIXES = [
"ca ", "ca ",
"Ca ", "Ca ",
"ca. ", "ca. ",
@ -212,7 +212,7 @@ final class NodaUncertaintyHelper {
"?", "?",
]; ];
const PLACE_UNCERTAINTY_SUFFIXES = [ public const PLACE_UNCERTAINTY_SUFFIXES = [
"(?)", "(?)",
"(vermutl.)", "(vermutl.)",
"[vermutl.]", "[vermutl.]",
@ -232,8 +232,7 @@ final class NodaUncertaintyHelper {
*/ */
public static function trim(string $input):string { public static function trim(string $input):string {
$input = \trim($input, ", \t\n\r\n;-:"); return \trim($input, ", \t\n\r\n;-:");
return $input;
} }
@ -280,7 +279,7 @@ final class NodaUncertaintyHelper {
*/ */
public static function guessTimeCertainty(string $zeit_name):bool { public static function guessTimeCertainty(string $zeit_name):bool {
$zeit_name = \strtolower($zeit_name); $zeit_name = self::trim(strtolower($zeit_name));
// Attempt to guess uncertainty based on prefixes. // Attempt to guess uncertainty based on prefixes.
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) { foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
@ -329,6 +328,14 @@ final class NodaUncertaintyHelper {
} }
} }
// If brackets are included in the name, try removing prefixes and suffixes
// from the beginning.
if (($bracketPos = strpos($ort_name, "(")) !== false) {
$start = substr($ort_name, 0, $bracketPos);
$end = substr($ort_name, $bracketPos);
$ort_name = self::cleanUncertaintyIndicatorsPlace($start) . ' ' . $end;
}
return self::trim($ort_name); return self::trim($ort_name);
} }
@ -343,7 +350,7 @@ final class NodaUncertaintyHelper {
*/ */
public static function guessPlaceCertainty(string $ort_name):bool { public static function guessPlaceCertainty(string $ort_name):bool {
$ort_name = \strtolower($ort_name); $ort_name = self::trim(\strtolower($ort_name));
// Attempt to guess uncertainty based on prefixes. // Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) { foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
@ -359,6 +366,13 @@ final class NodaUncertaintyHelper {
} }
} }
// If brackets are included in the name, try the same for everything up to the
// first brackets.
if (($bracketPos = strpos($ort_name, "(")) !== false) {
$name = substr($ort_name, 0, $bracketPos);
return self::guessPlaceCertainty($name);
}
return true; // Certain / no uncertainty found return true; // Certain / no uncertainty found
} }
@ -404,7 +418,7 @@ final class NodaUncertaintyHelper {
*/ */
public static function guessPersinstCertainty(string $name):bool { public static function guessPersinstCertainty(string $name):bool {
$name = \trim(\strtolower($name)); $name = self::trim(\strtolower($name));
// Attempt to guess uncertainty based on prefixes. // Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) { foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {

View File

@ -0,0 +1,50 @@
<?PHP
/**
* Tests for the identification of tag relation types to objects based on input tag names.
*
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
*/
declare(strict_types = 1);
use PHPUnit\Framework\TestCase;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\Small;
use PHPUnit\Framework\Attributes\DataProvider;
/**
 * Tests for NodaTagRelationIdentifier: identifying the tag relation type
 * from suffixes in input tag names and returning the cleaned name.
 */
#[Small]
#[CoversClass(\NodaTagRelationIdentifier::class)]
final class NodaTagRelationIdentifierTest extends TestCase {
    /**
     * Returns input tag names with and without known suffixes signaling the relation type.
     *
     * Each dataset holds: input term, expected cleaned name, expected relation type
     * (or false if no relation type should be identified).
     *
     * @return array<string, array{0: string, 1: string, 2: false|MDTagRelationType}>
     */
    public static function tagNameAndRelationTypeProvider():array {

        return [
            'Delfin' => ["Delfin", "Delfin", false],
            'Delfin (Motiv)' => ["Delfin (Motiv)", "Delfin", MDTagRelationType::display_subject],
            // The marker only counts as a suffix, i.e. at the very end of the name.
            'Delfin (Motiv) groß' => ["Delfin (Motiv) groß", "Delfin (Motiv) groß", false],
        ];

    }

    /**
     * Test to ensure tag names are correctly cleaned and their relation type is identified.
     *
     * @param string                  $term          Term to check.
     * @param string                  $target_output Expected output name.
     * @param false|MDTagRelationType $target_type   Expected identified relation type (or false for lack thereof).
     *
     * @return void
     */
    #[DataProvider('tagNameAndRelationTypeProvider')]
    public function testIdentificationWorksCorrectly(string $term, string $target_output, false|MDTagRelationType $target_type):void {

        $identification = new NodaTagRelationIdentifier("de", $term);

        // Strict comparison: names must be identical strings, and the relation
        // must be exactly false or the very same enum case.
        self::assertSame($target_output, $identification->name);
        self::assertSame($target_type, $identification->relation);

    }
}

View File

@ -6,110 +6,124 @@
*/ */
declare(strict_types = 1); declare(strict_types = 1);
use PHPUnit\Framework\TestCase; use PHPUnit\Framework\TestCase;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\Small;
use PHPUnit\Framework\Attributes\DataProvider;
/** /**
* This script contains tests for the uncertainty helper. * This script contains tests for the uncertainty helper.
*
* @covers \NodaUncertaintyHelper
*/ */
#[small]
#[CoversClass(\NodaUncertaintyHelper::class)]
final class NodaUncertaintyHelperTest extends TestCase { final class NodaUncertaintyHelperTest extends TestCase {
/** /**
* Removes uncertainty indicators from an time name. * Returns time names with expected cleaned version and expected parsed certainty.
* *
* @group ValidOutput * @return array<array{0: string, 1: string, 2: boolean}>
* @small
*
* @return void
*/ */
public function testCleanUncertaintyIndicatorsTime():void { public static function uncertainTimesProvider():array {
self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("wohl 1950")); return [
self::assertEquals("1950", NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime("1950")); 'uncertainty prefix: "wohl 1950"' => ["wohl 1950", "1950", false],
'uncertainty suffix: "1950?"' => ["1950?", "1950", false],
'uncertainty suffix and superfluous chars: "1950 ?,"' => ["1950 ?,", "1950", false],
'certain term with superfluous chars: "1950 ,"' => ["1950 ,", "1950", true],
'certain term: 1950' => ["1950", "1950", true],
];
} }
/** /**
* Attempts guessing whether time is uncertain. * Returns place names with expected cleaned version and expected parsed certainty.
* *
* @group ValidOutput * @return array<array{0: string, 1: string, 2: boolean}>
* @small
*
* @return void
*/ */
public function testGuessTimeCertainty():void { public static function uncertainPlacesProvider():array {
self::assertFalse(NodaUncertaintyHelper::guessTimeCertainty("wohl 1950")); return [
self::assertTrue(NodaUncertaintyHelper::guessTimeCertainty("1950"));
'uncertainty prefix: "wohl Berlin"' => ["wohl Berlin", "Berlin", false],
'uncertainty prefix: "vermutl. Berlin"' => ["vermutl. Berlin", "Berlin", false],
'uncertainty prefix and superfluous chars: "?-Berlin"' => ["?-Berlin", "Berlin", false],
'uncertainty suffix: "Berlin?"' => ["Berlin?", "Berlin", false],
'uncertainty suffix: "Berlin (?)"' => ["Berlin (?)", "Berlin", false],
'uncertainty suffix and superfluous chars: "Berlin ?,"' => ["Berlin ?,", "Berlin", false],
'certain term with superfluous chars: "Berlin ,"' => ["Berlin ,", "Berlin", true],
'certain term: Berlin' => ["Berlin", "Berlin", true],
'Berlin ? (Deutschland)' => ["Berlin ? (Deutschland)", "Berlin (Deutschland)", false],
'Berli?n (Deutschland)' => ["Berl?n (Deutschland)", "Berl?n (Deutschland)", true],
];
} }
/** /**
* Removes uncertainty indicators from an place name. * Returns actor names with expected cleaned version and expected parsed certainty.
* *
* @group ValidOutput * @return array<array{0: string, 1: string, 2: boolean}>
* @small
*
* @return void
*/ */
public static function testCleanUncertaintyIndicatorsPlace():void { public static function uncertainPersinstProvider():array {
self::assertEquals("Berlin", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("wohl Berlin")); return [
self::assertEquals("Berlin", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("Berlin")); 'uncertainty prefix: "wohl Barbarossa"' => ["wohl Barbarossa", "Barbarossa", false],
'uncertainty prefix: "vermutl. Barbarossa"' => ["vermutl. Barbarossa", "Barbarossa", false],
// Real-life examples that previously passed unencumbered 'uncertainty prefix and superfluous chars: "?-Barbarossa"' => ["?-Barbarossa", "Barbarossa", false],
self::assertEquals("Augsburg", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("vermutlich: Augsburg")); 'uncertainty suffix: "Barbarossa?"' => ["Barbarossa?", "Barbarossa", false],
self::assertEquals("Augsburg", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace("vermutl. Augsburg")); 'uncertainty suffix and superfluous chars: "Barbarossa ?,"' => ["Barbarossa ?,", "Barbarossa", false],
'certain term with superfluous chars: "Barbarossa ,"' => ["Barbarossa ,", "Barbarossa", true],
'certain term: Barbarossa' => ["Barbarossa", "Barbarossa", true],
];
} }
/** /**
* Attempts guessing whether place is uncertain. * Test to ensure times are correctly cleaned and parsed.
* *
* @group ValidOutput * @param string $term Term to check.
* @small * @param string $target_output Expected output name.
* @param boolean $target_certainty Expected output certainty.
* *
* @return void * @return void
*/ */
public static function testGuessPlaceCertainty():void { #[DataProvider('uncertainTimesProvider')]
public function testParsingUncertaintyFromTimes(string $term, string $target_output, bool $target_certainty):void {
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutlich: Augsburg")); self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsTime($term));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg")); self::assertEquals($target_certainty, NodaUncertaintyHelper::guessTimeCertainty($term));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin"));
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin"));
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien"));
} }
/** /**
* Removes uncertainty indicators from an actor name. * Test to ensure places are correctly cleaned and parsed.
* *
* @group ValidOutput * @param string $term Term to check.
* @small * @param string $target_output Expected output name.
* @param boolean $target_certainty Expected output certainty.
* *
* @return void * @return void
*/ */
public static function testCleanUncertaintyIndicatorsPersinst():void { #[DataProvider('uncertainPlacesProvider')]
public function testParsingUncertaintyFromPlaces(string $term, string $target_output, bool $target_certainty):void {
self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("wohl Barbarossa")); self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsPlace($term));
self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("Barbarossa")); self::assertEquals($target_certainty, NodaUncertaintyHelper::guessPlaceCertainty($term));
self::assertEquals("Barbarossa", NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst("?-Barbarossa"));
} }
/** /**
* Attempts guessing whether persinst is uncertain. * Test to ensure actor names are correctly cleaned and parsed.
* *
* @group ValidOutput * @param string $term Term to check.
* @small * @param string $target_output Expected output name.
* @param boolean $target_certainty Expected output certainty.
* *
* @return void * @return void
*/ */
public static function testGuessPersinstCertainty():void { #[DataProvider('uncertainPersinstProvider')]
public function testParsingUncertaintyFromPersinst(string $term, string $target_output, bool $target_certainty):void {
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa")); self::assertEquals($target_output, NodaUncertaintyHelper::cleanUncertaintyIndicatorsPersinst($term));
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa")); self::assertEquals($target_certainty, NodaUncertaintyHelper::guessPersinstCertainty($term));
} }
} }