Merge branch 'master' of gitea:museum-digital/MDNodaHelpers

This commit is contained in:
2024-11-11 09:11:35 +01:00
4 changed files with 378 additions and 246 deletions

View File

@ -0,0 +1,54 @@
<?PHP
/**
* Identifies the type of tag relation to an object based on known suffixes.
*
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
*/
declare(strict_types = 1);
/**
 * Splits a well-known, language-specific relation suffix (e.g. " (Motiv)")
 * off a tag name and exposes the cleaned name together with the relation
 * type that is mapped to the suffix.
 */
final class NodaTagRelationIdentifier {

    /**
     * Known suffixes, grouped by language code. Each suffix maps to the
     * relation type it stands for.
     *
     * @var array<string, array<string, MDTagRelationType>>
     */
    private const SUFFIXES = [
        'de' => [
            ' (Motiv)' => MDTagRelationType::display_subject,
        ],
    ];

    /** Input string with every recognized suffix removed. */
    public readonly string $name;

    /** Relation type mapped to the removed suffix; false if no known suffix was found. */
    public readonly MDTagRelationType|false $relation;

    /**
     * Constructor: Removes identifiers for well-known tag relations and
     * determines cleaned name and relation type.
     *
     * If no suffixes are registered for the language, the input is kept
     * unchanged and the relation is set to false.
     *
     * @param string $lang         Current language.
     * @param string $input_string Input string to clean.
     */
    public function __construct(string $lang, string $input_string) {

        $relation = false;

        foreach (self::SUFFIXES[$lang] ?? [] as $suffix => $relationType) {

            // PHP casts numeric-looking array keys to int; the string
            // functions below require a string under strict types.
            $suffix = (string)$suffix;

            if ($suffix !== '' && \str_ends_with($input_string, $suffix)) {
                // The match is byte-exact, so cutting by byte length removes
                // exactly the suffix without splitting a character.
                $input_string = \substr($input_string, 0, -\strlen($suffix));
                $relation = $relationType;
            }

        }

        $this->name = $input_string;
        $this->relation = $relation;

    }
}

View File

@ -12,7 +12,7 @@ declare(strict_types = 1);
*/
final class NodaUncertaintyHelper {
const PERSINST_INDICATORS_DISALLOWED = [
public const PERSINST_INDICATORS_DISALLOWED = [
"Unbekannt",
"unbekannt",
"Anonymus",
@ -41,7 +41,7 @@ final class NodaUncertaintyHelper {
"Невідомий артист", // Unknown artist
];
const PERSINST_UNCERTAINTY_PREFIXES = [
public const PERSINST_UNCERTAINTY_PREFIXES = [
"verm. ",
"Verm. ",
"vermtl. ",
@ -57,7 +57,7 @@ final class NodaUncertaintyHelper {
"?",
];
const PERSINST_UNCERTAINTY_SUFFIXES = [
public const PERSINST_UNCERTAINTY_SUFFIXES = [
"(?)",
"?",
" [vermutlich]",
@ -65,7 +65,7 @@ final class NodaUncertaintyHelper {
" [wahrscheinlich]",
];
const TIME_INDICATORS_DISALLOWED = [
public const TIME_INDICATORS_DISALLOWED = [
"Nachgewiesen",
"nachgewiesen",
"o.D.",
@ -94,9 +94,9 @@ final class NodaUncertaintyHelper {
"Без датування", // No dating
"б.р.", // No dating
"б.д.", // No dating
];
];
const TIME_UNCERTAINTY_PREFIXES = [
public const TIME_UNCERTAINTY_PREFIXES = [
"c. ",
"ca ",
"ca. ",
@ -130,9 +130,9 @@ final class NodaUncertaintyHelper {
"майже", // UK: Almost / nearly / about
"орієнтовно", // UK: approximately
"Прибл.", // UK: approximately
];
];
const TIME_UNCERTAINTY_SUFFIXES = [
public const TIME_UNCERTAINTY_SUFFIXES = [
"(?)",
"?",
" (ca.)",
@ -145,12 +145,12 @@ final class NodaUncertaintyHelper {
", um",
" (um)",
" (ок.)",
];
];
/**
* Substrings used to express uncertainty about the validity of a place name.
*/
const PLACE_INDICATORS_DISALLOWED = [
/**
* Substrings used to express uncertainty about the validity of a place name.
*/
public const PLACE_INDICATORS_DISALLOWED = [
"Unbekannt",
"unbekannt",
"Unknown",
@ -173,9 +173,9 @@ final class NodaUncertaintyHelper {
"не вказано", // No place
"не вказане", // No place
"невідоме", // No place
];
];
const PLACE_UNCERTAINTY_PREFIXES = [
public const PLACE_UNCERTAINTY_PREFIXES = [
"ca ",
"Ca ",
"ca. ",
@ -210,9 +210,9 @@ final class NodaUncertaintyHelper {
"Wahrscheinlich ",
"можливо",
"?",
];
];
const PLACE_UNCERTAINTY_SUFFIXES = [
public const PLACE_UNCERTAINTY_SUFFIXES = [
"(?)",
"(vermutl.)",
"[vermutl.]",
@ -221,206 +221,220 @@ final class NodaUncertaintyHelper {
"(wohl)",
"[wohl]",
"?",
];
];
/**
* Trims common characters and character marks.
*
* @param string $input Input text.
*
* @return string
*/
public static function trim(string $input):string {
/**
* Trims common characters and character marks.
*
* @param string $input Input text.
*
* @return string
*/
public static function trim(string $input):string {
$input = \trim($input, ", \t\n\r\n;-:");
return $input;
return \trim($input, ", \t\n\r\n;-:");
}
/**
* Removes uncertainty indicators from a time name.
*
* @param string $name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsTime(string $name):string {
$name = self::trim($name);
if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
return "";
}
/**
* Removes uncertainty indicators from a time name.
*
* @param string $name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsTime(string $name):string {
$name = self::trim($name);
if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
return "";
// Remove uncertainty prefixes
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($name, 0, \strlen($prefix)) === "$prefix") {
$name = substr($name, \strlen($prefix));
}
// Remove uncertainty prefixes
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($name, 0, \strlen($prefix)) === "$prefix") {
$name = substr($name, \strlen($prefix));
}
}
// Remove uncertainty suffixes
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($name, \strlen($suffix) * -1) === "$suffix") {
$name = \substr($name, 0, \strlen($suffix) * -1);
}
}
return self::trim($name);
}
/**
* Attempts guessing whether time is uncertain. Returns true if the name
* indicates certainty, false if it indicates uncertainty.
*
* @param string $zeit_name Time name.
*
* @return boolean
*/
public static function guessTimeCertainty(string $zeit_name):bool {
$zeit_name = \strtolower($zeit_name);
// Attempt to guess uncertainty based on prefixes.
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
}
// Remove uncertainty suffixes
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($name, \strlen($suffix) * -1) === "$suffix") {
$name = \substr($name, 0, \strlen($suffix) * -1);
}
// Attempt to guess uncertainty based on suffixes.
foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
}
}
return true; // No uncertainty found
}
/**
* Removes uncertainty indicators from a place name.
*
* @param string $ort_name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {
return self::trim($name);
$ort_name = self::trim($ort_name);
}
if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
return "";
/**
* Attempts guessing whether time is uncertain. Returns true if the name
* indicates certainty, false if it indicates uncertainty.
*
* @param string $zeit_name Time name.
*
* @return boolean
*/
public static function guessTimeCertainty(string $zeit_name):bool {
$zeit_name = self::trim(strtolower($zeit_name));
// Attempt to guess uncertainty based on prefixes.
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
}
// Remove uncertainty prefixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") {
$ort_name = substr($ort_name, \strlen($prefix));
}
}
// Remove uncertainty suffixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") {
$ort_name = \substr($ort_name, 0, \strlen($suffix) * -1);
}
}
return self::trim($ort_name);
}
/**
* Attempts guessing whether place is uncertain. Returns true if the name
* indicates certainty, false if it indicates uncertainty.
*
* @param string $ort_name Place name.
*
* @return boolean
*/
public static function guessPlaceCertainty(string $ort_name):bool {
$ort_name = \strtolower($ort_name);
// Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
// Attempt to guess uncertainty based on suffixes.
foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertainty found
}
// Attempt to guess uncertainty based on suffixes.
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
}
return true; // Certain / no uncertainty found
}
/**
* Removes uncertainty indicators from an actor name.
*
* @param string $value Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsPersinst(string $value):string {
return true; // No uncertainty found
$value = self::trim($value);
}
if (\in_array(trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) {
return "";
}
/**
* Removes uncertainty indicators from a place name.
*
* @param string $ort_name Input string.
*
* @return string
*/
public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {
foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) {
if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) {
$value = substr($value, \mb_strlen($toRemove));
}
}
foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") {
$value = \mb_substr($value, 0, \mb_strlen($suffix) * -1);
}
}
return self::trim($value);
$ort_name = self::trim($ort_name);
if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
return "";
}
/**
* Attempts guessing whether persinst is uncertain. Returns true if the name
* indicates certainty, false if it indicates uncertainty.
*
* @param string $name Persinst name.
*
* @return boolean
*/
public static function guessPersinstCertainty(string $name):bool {
$name = \trim(\strtolower($name));
// Attempt to guess uncertainty based on prefixes.
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($name, 0, \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
// Remove uncertainty prefixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") {
$ort_name = substr($ort_name, \strlen($prefix));
}
// Attempt to guess uncertainty based on suffixes.
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) {
if (\substr($name, -1 * \strlen($prefix)) === $prefix) {
return false; // Uncertain
}
}
return true; // Certain / no uncertainty found
}
// Remove uncertainty suffixes
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") {
$ort_name = \substr($ort_name, 0, \strlen($suffix) * -1);
}
}
// If brackets are included in the name, try removing prefixes and suffixes
// from the beginning.
if (($bracketPos = strpos($ort_name, "(")) !== false) {
$start = substr($ort_name, 0, $bracketPos);
$end = substr($ort_name, $bracketPos);
$ort_name = self::cleanUncertaintyIndicatorsPlace($start) . ' ' . $end;
}
return self::trim($ort_name);
}
/**
 * Guesses whether a place name is certain, judging by known uncertainty markers.
 *
 * The name is lower-cased and trimmed, then compared byte-wise against the
 * place prefix and suffix lists. If neither list matches and the name contains
 * an opening bracket, the part before the first bracket is checked the same way.
 *
 * @param string $ort_name Place name.
 *
 * @return boolean True if no uncertainty marker was found, false otherwise.
 */
public static function guessPlaceCertainty(string $ort_name):bool {

    $normalized = self::trim(\strtolower($ort_name));

    // A leading marker signals uncertainty.
    foreach (self::PLACE_UNCERTAINTY_PREFIXES as $marker) {
        if (\strncmp($normalized, $marker, \strlen($marker)) === 0) {
            return false;
        }
    }

    // A trailing marker signals uncertainty as well.
    foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $marker) {
        $tail = \substr($normalized, -\strlen($marker));
        if ($tail === $marker) {
            return false;
        }
    }

    // Without brackets there is nothing left to inspect.
    $bracketPos = \strpos($normalized, "(");
    if ($bracketPos === false) {
        return true;
    }

    // Repeat the check for everything up to the first opening bracket.
    return self::guessPlaceCertainty(\substr($normalized, 0, $bracketPos));

}
/**
 * Removes uncertainty indicators from an actor name.
 *
 * The trimmed input is first checked against the list of disallowed
 * placeholder names. Known uncertainty prefixes and suffixes are then
 * stripped and the result is trimmed again.
 *
 * @param string $value Input string.
 *
 * @return string Cleaned name; an empty string if the input is a disallowed placeholder.
 */
public static function cleanUncertaintyIndicatorsPersinst(string $value):string {

    $value = self::trim($value);

    if (\in_array(\trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) {
        return "";
    }

    foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) {
        if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) {
            // The offset is a character count, so the cut must be
            // character-based too; a byte-based substr() would split
            // multi-byte prefixes in the middle of a character.
            $value = \mb_substr($value, \mb_strlen($toRemove));
        }
    }

    // guessPersinstCertainty() treats PERSINST_UNCERTAINTY_SUFFIXES as markers,
    // so they are stripped here as well. The place suffixes that were used
    // here before are kept, so that previously cleaned inputs are still cleaned.
    $suffixes = \array_unique(\array_merge(
        self::PERSINST_UNCERTAINTY_SUFFIXES,
        self::PLACE_UNCERTAINTY_SUFFIXES
    ));

    foreach ($suffixes as $suffix) {
        if (\mb_substr($value, -\mb_strlen($suffix)) === $suffix) {
            $value = \mb_substr($value, 0, -\mb_strlen($suffix));
        }
    }

    return self::trim($value);

}
/**
 * Guesses whether an actor (persinst) name is certain, judging by known
 * uncertainty markers.
 *
 * The name is lower-cased and trimmed, then compared byte-wise against the
 * persinst prefix and suffix lists.
 *
 * @param string $name Persinst name.
 *
 * @return boolean True if no uncertainty marker was found, false otherwise.
 */
public static function guessPersinstCertainty(string $name):bool {

    $normalized = self::trim(\strtolower($name));

    // A leading marker signals uncertainty.
    foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $marker) {
        if (\strncmp($normalized, $marker, \strlen($marker)) === 0) {
            return false;
        }
    }

    // A trailing marker signals uncertainty as well.
    foreach (self::PERSINST_UNCERTAINTY_SUFFIXES as $marker) {
        $tail = \substr($normalized, -\strlen($marker));
        if ($tail === $marker) {
            return false;
        }
    }

    // Neither list matched: treat the name as certain.
    return true;

}
}