417 lines
11 KiB
PHP
417 lines
11 KiB
PHP
<?PHP
|
||
/**
|
||
* Contains class NodaUncertaintyHelper.
|
||
*
|
||
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
|
||
*/
|
||
declare(strict_types = 1);
|
||
|
||
/**
 * Static helpers for dealing with uncertainty markers in actor ("persinst"),
 * time, and place names.
 *
 * For each of the three entity types the class offers:
 * - a `cleanUncertaintyIndicators*()` function, which returns an empty string
 *   for inputs that only state that the value is unknown (e.g. "o.J.") and
 *   otherwise strips uncertainty prefixes / suffixes (e.g. "ca. ", "(?)"),
 * - a `guess*Certainty()` function, which returns false if the name carries
 *   one of the known uncertainty prefixes or suffixes and true otherwise.
 */
final class NodaUncertaintyHelper {

    /**
     * Actor names that only express that the actor is unknown. Compared
     * against the whole (trimmed) input, not as substrings.
     */
    const PERSINST_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "Anonymus",
        "Anonym",
        "n.n.",
        "N.N.",
        "Künstler, unbekannt",
        "Unbekannte Person",
        "Unbekannter Maler",
        "Unbekannter Künstler",
        "nicht benannt",
        "nomen nominandum",
        "Nomen Nominandum",
        "ismeretlen",
        "Ismeretlen",
        "ismeretlen.",
        "Ismeretlen.",
        "ism.",
        "невизначена", // "Uncertain" (ukr)
        "невизначений", // "Unspecified" (ukr)
        "не встановлено", // "Not established" (ukr)
        "невизначено", // "Not established" (ukr)
        "Невідом", // Unknown
        "невизн", // Unknown
        "Не визначен", // Not determined
        "Невідомий артист", // Unknown artist
    ];

    /**
     * Prefixes marking an actor attribution as uncertain.
     */
    const PERSINST_UNCERTAINTY_PREFIXES = [
        "verm. ",
        "Verm. ",
        "vermtl. ",
        "Vermtl. ",
        "vermutl. ",
        "Vermutl. ",
        "vermutlich ",
        "Vermutlich ",
        "wahrscheinlich ",
        "Wahrscheinlich ",
        "wohl ",
        "Wohl ",
        "?",
    ];

    /**
     * Suffixes marking an actor attribution as uncertain.
     */
    const PERSINST_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " [vermutlich]",
        " [verm.]",
        " [wahrscheinlich]",
    ];

    /**
     * Time names that only express that the time is unknown. Compared
     * against the whole (trimmed) input, not as substrings.
     */
    const TIME_INDICATORS_DISALLOWED = [
        "Nachgewiesen",
        "nachgewiesen",
        "o.D.",
        "O.D.",
        "o.J.",
        "O.J.",
        "o. D.",
        "O. D.",
        "o. J.",
        "O. J.",
        "Ohne Datum",
        "ohne Datum",
        "Ohne Jahr",
        "ohne Jahr",
        "Unbekannt",
        "unbekannt",
        "ismeretlen",
        "Ismeretlen",
        "Neu",
        "Neu hergestellt",
        "zeitl. nicht faßbar",
        "Без року", // Without a year
        "Без дати", // Unknown date
        "Не датовано", // Not dated
        "Н.д.", // Not dated
        "Без датування", // No dating
        "б.р.", // No dating
        "б.д.", // No dating
    ];

    /**
     * Prefixes marking a time as uncertain / approximate.
     */
    const TIME_UNCERTAINTY_PREFIXES = [
        "c. ",
        "ca ",
        "ca. ",
        "Ca ",
        "Ca. ",
        "za. ",
        "~",
        "circa ",
        "gegen ",
        "um ",
        "Um ",
        "verm. ",
        "Verm. ",
        "vermtl. ",
        "Vermtl. ",
        "vermutl. ",
        "Vermutl. ",
        "vermutlich ",
        "Vermutlich ",
        "vermutlich um ",
        "Vermutlich um ",
        "wohl ",
        "wohl um ",
        "Wohl ",
        "Wohl um ",
        "Приблизно", // UK: About / approximately
        "близько", // UK: About
        "около", // UK: About
        "коло", // UK: About
        "неточно", // UK: Inaccurate
        "майже", // UK: Almost / nearly / about
        "орієнтовно", // UK: approximately
        "Прибл.", // UK: approximately
    ];

    /**
     * Suffixes marking a time as uncertain / approximate.
     */
    const TIME_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " (ca.)",
        " [ca.]",
        " (circa)",
        " [circa]",
        " (verm.)",
        " (vermutl.)",
        " körül",
        ", um",
        " (um)",
        " (ок.)",
    ];

    /**
     * Place names that only express that the place is unknown. Compared
     * against the whole (trimmed) input, not as substrings.
     */
    const PLACE_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "Unknown",
        "unknown",
        "keine Angaben",
        "Keine Angaben",
        "nicht benannt",
        "ohne Angabe",
        "Ohne Angabe",
        "Ohne Ort",
        "ismeretlen",
        "Ismeretlen",
        "ism.",
        "o.O",
        "versch. O.",
        "o. O.",
        "Diverse O. u. o.O.",
        "o.O.",
        "Без місця", // No place
        "не вказано", // No place
        "не вказане", // No place
        "невідоме", // No place
    ];

    /**
     * Prefixes marking a place as uncertain.
     */
    const PLACE_UNCERTAINTY_PREFIXES = [
        "ca ",
        "ca. ",
        "circa ",
        "evtl ",
        "evtl. ",
        "möglicherweise ",
        "vlt. ",
        "verm. ",
        "vermut. ",
        "vermtl. ",
        "vermutl. ",
        "Vermutl. ",
        "vermutlich ",
        "vermutlich: ",
        "Vermutlich ",
        "Vermutlich: ",
        "wohl ",
        "Wohl ",
        "wahrsch. ",
        "Wahrsch. ",
        "wahrscheinlich ",
        "Wahrscheinlich ",
        "можливо",
        "?",
    ];

    /**
     * Suffixes marking a place as uncertain.
     */
    const PLACE_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "(vermutl.)",
        "[vermutl.]",
        "(vermutlich)",
        "[vermutlich]",
        "(wohl)",
        "[wohl]",
        "?",
    ];

    /**
     * Returns the given indicators ordered from longest to shortest (in bytes),
     * so that e.g. "vermutlich um " is tried before "vermutlich " and "um ".
     *
     * @param array<int, string> $indicators List of prefixes or suffixes.
     *
     * @return array<int, string>
     */
    private static function sortLongestFirst(array $indicators):array {

        \usort($indicators, static function (string $a, string $b):int {
            return \strlen($b) <=> \strlen($a);
        });
        return $indicators;

    }

    /**
     * Strips all leading occurrences of the given prefixes from a value.
     *
     * The longest matching prefix is removed first and matching is repeated
     * until no prefix applies anymore, so stacked markers are removed
     * regardless of the order in which they are listed in the constant.
     *
     * @param string             $value    Input string.
     * @param array<int, string> $prefixes Prefixes to remove.
     *
     * @return string
     */
    private static function stripPrefixes(string $value, array $prefixes):string {

        $prefixes = self::sortLongestFirst($prefixes);

        do {
            $changed = false;
            foreach ($prefixes as $prefix) {
                $length = \strlen($prefix);
                // Byte-wise comparison is safe here: an exact byte match of a
                // valid UTF-8 prefix always ends on a character boundary.
                if ($length !== 0 && \strncmp($value, $prefix, $length) === 0) {
                    // Cast: substr() may return false on PHP < 8 if nothing is left.
                    $value = (string)\substr($value, $length);
                    $changed = true;
                    break;
                }
            }
        } while ($changed === true);

        return $value;

    }

    /**
     * Strips all trailing occurrences of the given suffixes from a value.
     * Works like stripPrefixes(), but at the end of the string.
     *
     * @param string             $value    Input string.
     * @param array<int, string> $suffixes Suffixes to remove.
     *
     * @return string
     */
    private static function stripSuffixes(string $value, array $suffixes):string {

        $suffixes = self::sortLongestFirst($suffixes);

        do {
            $changed = false;
            foreach ($suffixes as $suffix) {
                $length = \strlen($suffix);
                if ($length === 0 || \strlen($value) < $length) {
                    continue;
                }
                if (\substr($value, -1 * $length) === $suffix) {
                    $value = (string)\substr($value, 0, -1 * $length);
                    $changed = true;
                    break;
                }
            }
        } while ($changed === true);

        return $value;

    }

    /**
     * Checks whether a name starts with one of the given prefixes or ends with
     * one of the given suffixes, ignoring case.
     *
     * Both the name and the indicators are lower-cased using mb_strtolower(),
     * so that capitalized and non-ASCII (e.g. Cyrillic) indicators match
     * independently of the case used in the input.
     *
     * @param string             $name     Name to check.
     * @param array<int, string> $prefixes Uncertainty prefixes.
     * @param array<int, string> $suffixes Uncertainty suffixes.
     *
     * @return boolean
     */
    private static function hasUncertaintyAffix(string $name, array $prefixes, array $suffixes):bool {

        $name = \mb_strtolower($name, 'UTF-8');
        $nameLength = \strlen($name);

        foreach ($prefixes as $prefix) {
            $prefix = \mb_strtolower($prefix, 'UTF-8');
            if (\strncmp($name, $prefix, \strlen($prefix)) === 0) {
                return true;
            }
        }

        foreach ($suffixes as $suffix) {
            $suffix = \mb_strtolower($suffix, 'UTF-8');
            $length = \strlen($suffix);
            if ($length <= $nameLength && \substr($name, -1 * $length) === $suffix) {
                return true;
            }
        }

        return false;

    }

    /**
     * Trims whitespace and common separator characters (comma, semicolon,
     * hyphen, colon) from both ends of the input.
     *
     * @param string $input Input text.
     *
     * @return string
     */
    public static function trim(string $input):string {

        return \trim($input, ", \t\n\r;-:");

    }

    /**
     * Removes uncertainty indicators from a time name.
     *
     * Returns an empty string if the (trimmed) input is one of
     * TIME_INDICATORS_DISALLOWED, i.e. it carries no usable time at all.
     *
     * @param string $name Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsTime(string $name):string {

        $name = self::trim($name);

        if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
            return "";
        }

        $name = self::stripPrefixes($name, self::TIME_UNCERTAINTY_PREFIXES);
        $name = self::stripSuffixes($name, self::TIME_UNCERTAINTY_SUFFIXES);

        return self::trim($name);

    }

    /**
     * Attempts guessing whether time is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * @param string $zeit_name Time name.
     *
     * @return boolean
     */
    public static function guessTimeCertainty(string $zeit_name):bool {

        return !self::hasUncertaintyAffix(
            $zeit_name,
            self::TIME_UNCERTAINTY_PREFIXES,
            self::TIME_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from a place name.
     *
     * Returns an empty string if the (trimmed) input is one of
     * PLACE_INDICATORS_DISALLOWED, i.e. it carries no usable place at all.
     *
     * @param string $ort_name Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {

        $ort_name = self::trim($ort_name);

        if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
            return "";
        }

        $ort_name = self::stripPrefixes($ort_name, self::PLACE_UNCERTAINTY_PREFIXES);
        $ort_name = self::stripSuffixes($ort_name, self::PLACE_UNCERTAINTY_SUFFIXES);

        return self::trim($ort_name);

    }

    /**
     * Attempts guessing whether place is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * @param string $ort_name Place name.
     *
     * @return boolean
     */
    public static function guessPlaceCertainty(string $ort_name):bool {

        return !self::hasUncertaintyAffix(
            $ort_name,
            self::PLACE_UNCERTAINTY_PREFIXES,
            self::PLACE_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from an actor name.
     *
     * Returns an empty string if the input is one of
     * PERSINST_INDICATORS_DISALLOWED. The input is checked both as-is and with
     * surrounding semicolons, dots and spaces removed: the former is needed
     * for entries that themselves end in a dot ("N.N.", "ism."), the latter
     * for inputs like "Unbekannt.".
     *
     * @param string $value Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsPersinst(string $value):string {

        $value = self::trim($value);

        if (\in_array($value, self::PERSINST_INDICATORS_DISALLOWED, true)
            || \in_array(\trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)
        ) {
            return "";
        }

        $value = self::stripPrefixes($value, self::PERSINST_UNCERTAINTY_PREFIXES);
        // Use the actor-specific suffixes, matching guessPersinstCertainty().
        $value = self::stripSuffixes($value, self::PERSINST_UNCERTAINTY_SUFFIXES);

        return self::trim($value);

    }

    /**
     * Attempts guessing whether persinst is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * Surrounding whitespace of the name is ignored.
     *
     * @param string $name Persinst name.
     *
     * @return boolean
     */
    public static function guessPersinstCertainty(string $name):bool {

        return !self::hasUncertaintyAffix(
            \trim($name),
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

    }
}
|