MDNodaHelpers/src/NodaUncertaintyHelper.php

373 lines
9.3 KiB
PHP

<?PHP
/**
* Contains class NodaUncertaintyHelper.
*
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
*/
declare(strict_types = 1);
/**
* Contains static functions for identifying uncertainty or blocking
* completely uncertain inputs for actors, times, and places.
*/
final class NodaUncertaintyHelper {

    /**
     * Actor names that carry no information. An input equal to one of these
     * entries is cleaned to an empty string.
     */
    const PERSINST_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "Anonymus",
        "Anonym",
        "n.n.",
        "N.N.",
        "Künstler, unbekannt",
        "Unbekannte Person",
        "Unbekannter Maler",
        "Unbekannter Künstler",
        "nicht benannt",
        "nomen nominandum",
        "Nomen Nominandum",
        "ismeretlen",
        "Ismeretlen",
        "ismeretlen.",
        "Ismeretlen.",
        "ism.",
    ];

    /**
     * Leading markers expressing uncertainty about an actor name.
     */
    const PERSINST_UNCERTAINTY_PREFIXES = [
        "verm. ",
        "Verm. ",
        "vermtl. ",
        "Vermtl. ",
        "vermutlich ",
        "Vermutlich ",
        "wahrscheinlich ",
        "Wahrscheinlich ",
        "wohl ",
        "Wohl ",
    ];

    /**
     * Trailing markers expressing uncertainty about an actor name.
     */
    const PERSINST_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " [vermutlich]",
        " [verm.]",
        " [wahrscheinlich]",
    ];

    /**
     * Time names that carry no information. An input equal to one of these
     * entries is cleaned to an empty string.
     */
    const TIME_INDICATORS_DISALLOWED = [
        "Nachgewiesen",
        "nachgewiesen",
        "o.D.",
        "O.D.",
        "o.J.",
        "O.J.",
        "o. D.",
        "O. D.",
        "o. J.",
        "O. J.",
        "Ohne Datum",
        "ohne Datum",
        "Ohne Jahr",
        "ohne Jahr",
        "Unbekannt",
        "unbekannt",
        "ismeretlen",
        "Ismeretlen",
        "Neu",
        "Neu hergestellt",
        "zeitl. nicht faßbar",
    ];

    /**
     * Leading markers expressing uncertainty about a time name.
     */
    const TIME_UNCERTAINTY_PREFIXES = [
        "c. ",
        "ca ",
        "ca. ",
        "Ca ",
        "Ca. ",
        "za. ",
        "~",
        "circa ",
        "gegen ",
        "um ",
        "Um ",
        "verm. ",
        "Verm. ",
        "vermtl. ",
        "Vermtl. ",
        "vermutlich ",
        "Vermutlich ",
        "vermutlich um ",
        "Vermutlich um ",
        "wohl ",
        "wohl um ",
        "Wohl ",
        "Wohl um ",
    ];

    /**
     * Trailing markers expressing uncertainty about a time name.
     */
    const TIME_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " (ca.)",
        " [ca.]",
        " (circa)",
        " [circa]",
        " (verm.)",
        " (vermutl.)",
        " körül",
        ", um",
        " (um)",
    ];

    /**
     * Place names that carry no information. An input equal to one of these
     * entries (exact match, not a substring match) is cleaned to an empty
     * string.
     */
    const PLACE_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "Unknown",
        "unknown",
        "keine Angaben",
        "Keine Angaben",
        "nicht benannt",
        "ohne Angabe",
        "Ohne Angabe",
        "Ohne Ort",
        "ismeretlen",
        "Ismeretlen",
        "ism.",
        "o.O",
        "versch. O.",
        "o. O.",
        "Diverse O. u. o.O.",
        "o.O.",
    ];

    /**
     * Leading markers expressing uncertainty about a place name.
     */
    const PLACE_UNCERTAINTY_PREFIXES = [
        "ca ",
        "ca. ",
        "circa ",
        "evtl ",
        "evtl. ",
        "vlt. ",
        "wohl ",
        "Wohl ",
        "verm. ",
        "vermut. ",
        "vermtl. ",
        "vermutlich ",
        "Vermutlich ",
    ];

    /**
     * Trailing markers expressing uncertainty about a place name.
     */
    const PLACE_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "(vermutl.)",
        "[vermutl.]",
        "(vermutlich)",
        "[vermutlich]",
        "(wohl)",
        "[wohl]",
        "?",
    ];

    /**
     * Checks byte-wise whether a string begins with a given prefix.
     *
     * @param string $haystack String to inspect.
     * @param string $prefix   Expected beginning.
     *
     * @return boolean
     */
    private static function startsWith(string $haystack, string $prefix):bool {

        return \strncmp($haystack, $prefix, \strlen($prefix)) === 0;

    }

    /**
     * Checks byte-wise whether a string ends with a given suffix.
     *
     * @param string $haystack String to inspect.
     * @param string $suffix   Expected ending.
     *
     * @return boolean
     */
    private static function endsWith(string $haystack, string $suffix):bool {

        return \substr($haystack, -1 * \strlen($suffix)) === $suffix;

    }

    /**
     * Strips listed prefixes, then listed suffixes, from a string. Each list
     * is walked exactly once, in list order; an entry is removed if the string
     * starts (or ends) with it at the moment the entry is reached.
     *
     * @param string        $value    Input string.
     * @param array<string> $prefixes Prefixes to remove.
     * @param array<string> $suffixes Suffixes to remove.
     *
     * @return string
     */
    private static function stripAffixes(string $value, array $prefixes, array $suffixes):string {

        foreach ($prefixes as $prefix) {
            if (self::startsWith($value, $prefix)) {
                $value = (string)\substr($value, \strlen($prefix));
            }
        }

        foreach ($suffixes as $suffix) {
            if (self::endsWith($value, $suffix)) {
                $value = (string)\substr($value, 0, -1 * \strlen($suffix));
            }
        }

        return $value;

    }

    /**
     * Returns whether a string starts with any of the listed prefixes or ends
     * with any of the listed suffixes.
     *
     * @param string        $value    Input string.
     * @param array<string> $prefixes Prefixes to look for.
     * @param array<string> $suffixes Suffixes to look for.
     *
     * @return boolean
     */
    private static function hasAffix(string $value, array $prefixes, array $suffixes):bool {

        foreach ($prefixes as $prefix) {
            if (self::startsWith($value, $prefix)) {
                return true;
            }
        }

        foreach ($suffixes as $suffix) {
            if (self::endsWith($value, $suffix)) {
                return true;
            }
        }

        return false;

    }

    /**
     * Trims common characters and punctuation marks (comma, space, tab,
     * line breaks, semicolon, hyphen, colon) from both ends of the input.
     *
     * @param string $input Input text.
     *
     * @return string
     */
    public static function trim(string $input):string {

        return \trim($input, ", \t\n\r;-:");

    }

    /**
     * Removes uncertainty indicators from a time name. Returns an empty string
     * if the trimmed input is one of the disallowed time indicators.
     *
     * @param string $name Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsTime(string $name):string {

        $name = self::trim($name);

        if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
            return "";
        }

        $name = self::stripAffixes(
            $name,
            self::TIME_UNCERTAINTY_PREFIXES,
            self::TIME_UNCERTAINTY_SUFFIXES
        );

        return self::trim($name);

    }

    /**
     * Attempts guessing whether time is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * The input is lowercased before comparison, so only the lowercase list
     * entries can match.
     *
     * @param string $zeit_name Time name.
     *
     * @return boolean
     */
    public static function guessTimeCertainty(string $zeit_name):bool {

        $zeit_name = \strtolower($zeit_name);

        return !self::hasAffix(
            $zeit_name,
            self::TIME_UNCERTAINTY_PREFIXES,
            self::TIME_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from a place name. Returns an empty
     * string if the trimmed input is one of the disallowed place indicators.
     *
     * @param string $ort_name Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {

        $ort_name = self::trim($ort_name);

        if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
            return "";
        }

        $ort_name = self::stripAffixes(
            $ort_name,
            self::PLACE_UNCERTAINTY_PREFIXES,
            self::PLACE_UNCERTAINTY_SUFFIXES
        );

        return self::trim($ort_name);

    }

    /**
     * Attempts guessing whether place is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * The input is lowercased before comparison, so only the lowercase list
     * entries can match.
     *
     * @param string $ort_name Place name.
     *
     * @return boolean
     */
    public static function guessPlaceCertainty(string $ort_name):bool {

        $ort_name = \strtolower($ort_name);

        return !self::hasAffix(
            $ort_name,
            self::PLACE_UNCERTAINTY_PREFIXES,
            self::PLACE_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from an actor name. Returns an empty
     * string if the input is one of the disallowed actor indicators.
     *
     * @param string $value Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsPersinst(string $value):string {

        $value = self::trim($value);

        // Compare the value as written first: several disallowed entries end
        // in a period ("N.N.", "ism.") and would no longer match once the
        // period has been stripped. The stripped form is checked as well, so
        // that e.g. "Unbekannt." is still recognized.
        if (\in_array($value, self::PERSINST_INDICATORS_DISALLOWED, true)
            || \in_array(\trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)
        ) {
            return "";
        }

        // Use the actor-specific suffix list, matching guessPersinstCertainty().
        $value = self::stripAffixes(
            $value,
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

        return self::trim($value);

    }

    /**
     * Attempts guessing whether persinst is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * The input is lowercased and stripped of surrounding whitespace before
     * comparison, so only the lowercase list entries can match.
     *
     * @param string $name Persinst name.
     *
     * @return boolean
     */
    public static function guessPersinstCertainty(string $name):bool {

        $name = \trim(\strtolower($name));

        return !self::hasAffix(
            $name,
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

    }

}