2020-09-21 01:24:07 +02:00
|
|
|
<?PHP
|
|
|
|
/**
|
|
|
|
* Contains class NodaUncertaintyHelper.
|
|
|
|
*
|
|
|
|
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
|
|
|
|
*/
|
|
|
|
declare(strict_types = 1);
|
|
|
|
|
|
|
|
/**
 * Static helpers for recognizing and removing markers of uncertainty
 * (e.g. "ca. ", "wohl ", "(?)") and "unknown" placeholders in the names of
 * times, places and actors (persinst).
 */
final class NodaUncertaintyHelper {

    /**
     * Actor names that only state that the actor is unknown (German and
     * Hungarian spellings). An input equal to one of these is cleaned to an
     * empty string.
     */
    const PERSINST_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "n.n.",
        "N.N.",
        "Künstler, unbekannt",
        "ismeretlen",
        "Ismeretlen",
        "ismeretlen.",
        "Ismeretlen.",
    ];

    /**
     * Prefixes marking an actor attribution as uncertain. The capitalized
     * variants are needed because cleaning compares case-sensitively.
     */
    const PERSINST_UNCERTAINTY_PREFIXES = [
        "wohl ",
        "wahrscheinlich ",
        "Wohl ",
        "Wahrscheinlich ",
    ];

    /**
     * Suffixes marking an actor attribution as uncertain.
     */
    const PERSINST_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
    ];

    /**
     * Time names that only state that the date is unknown. An input equal to
     * one of these is cleaned to an empty string.
     */
    const TIME_INDICATORS_DISALLOWED = [
        "o.D.",
        "O.D.",
        "o.J.",
        "O.J.",
        "Ohne Datum",
        "ohne Datum",
        "Unbekannt",
        "unbekannt",
        "ismeretlen",
        "Ismeretlen",
    ];

    /**
     * Prefixes marking a time as approximate or uncertain.
     */
    const TIME_UNCERTAINTY_PREFIXES = [
        "um ",
        "wohl um ",
        "circa ",
        "ca. ",
        "ca ",
    ];

    /**
     * Suffixes marking a time as approximate or uncertain.
     */
    const TIME_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " körül",
    ];

    /**
     * Place names that only state that the place is unknown. An input equal
     * to one of these is cleaned to an empty string.
     */
    const PLACE_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "keine Angaben",
        "Keine Angaben",
        "ohne Angabe",
        "Ohne Angabe",
        "ismeretlen",
        "Ismeretlen",
    ];

    /**
     * Prefixes used to express uncertainty about the validity of a place name.
     */
    const PLACE_UNCERTAINTY_PREFIXES = [
        "vlt. ",
        "circa ",
        "ca. ",
        "ca ",
    ];

    /**
     * Suffixes used to express uncertainty about the validity of a place name.
     */
    const PLACE_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
    ];

    /**
     * Trims whitespace and common punctuation marks (comma, semicolon,
     * hyphen, colon) from both ends of a string.
     *
     * @param string $input Input text.
     *
     * @return string
     */
    public static function trim(string $input):string {

        return \trim($input, ", \t\n\r;-:");

    }

    /**
     * Shared implementation of the cleanUncertaintyIndicators*() methods.
     *
     * The input is trimmed first. If it then equals a placeholder, an empty
     * string is returned. Otherwise each prefix and each suffix is tested
     * once, in list order, and cut off where it matches. Matching is
     * case-sensitive and byte-wise, which is safe for prefix / suffix
     * comparison of UTF-8 strings.
     *
     * @param string        $input      Name to clean.
     * @param array<string> $disallowed Placeholders that empty the name.
     * @param array<string> $prefixes   Prefixes to remove.
     * @param array<string> $suffixes   Suffixes to remove.
     *
     * @return string
     */
    private static function stripIndicators(string $input, array $disallowed, array $prefixes, array $suffixes):string {

        $cleaned = self::trim($input);

        if (\in_array($cleaned, $disallowed, true)) {
            return "";
        }

        foreach ($prefixes as $prefix) {
            $length = \strlen($prefix);
            if (\strncmp($cleaned, $prefix, $length) === 0) {
                $cleaned = \substr($cleaned, $length);
            }
        }

        foreach ($suffixes as $suffix) {
            $length = \strlen($suffix);
            if (\substr($cleaned, -$length) === $suffix) {
                $cleaned = \substr($cleaned, 0, -$length);
            }
        }

        return self::trim($cleaned);

    }

    /**
     * Shared implementation of the guess*Certainty() methods: checks whether
     * a name starts with one of the prefixes or ends with one of the suffixes.
     *
     * The name is lowercased and trimmed the same way the cleaning methods
     * trim it, so that surrounding whitespace or punctuation cannot hide an
     * indicator.
     *
     * @param string        $input    Name to check.
     * @param array<string> $prefixes Prefixes expressing uncertainty.
     * @param array<string> $suffixes Suffixes expressing uncertainty.
     *
     * @return boolean True if an uncertainty indicator was found.
     */
    private static function hasIndicator(string $input, array $prefixes, array $suffixes):bool {

        $normalized = self::trim(\strtolower($input));

        foreach ($prefixes as $prefix) {
            if (\strncmp($normalized, $prefix, \strlen($prefix)) === 0) {
                return true;
            }
        }

        foreach ($suffixes as $suffix) {
            if (\substr($normalized, -\strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;

    }

    /**
     * Removes uncertainty indicators from a time name.
     *
     * @param string $name Input string.
     *
     * @return string Cleaned name; empty if the input only states that the
     *                time is unknown.
     */
    public static function cleanUncertaintyIndicatorsTime(string $name):string {

        return self::stripIndicators(
            $name,
            self::TIME_INDICATORS_DISALLOWED,
            self::TIME_UNCERTAINTY_PREFIXES,
            self::TIME_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Attempts guessing whether time is uncertain.
     *
     * @param string $zeit_name Time name.
     *
     * @return boolean True if no uncertainty indicator was found (certain),
     *                 false if one was found (uncertain).
     */
    public static function guessTimeCertainty(string $zeit_name):bool {

        return !self::hasIndicator(
            $zeit_name,
            self::TIME_UNCERTAINTY_PREFIXES,
            self::TIME_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from a place name.
     *
     * @param string $ort_name Input string.
     *
     * @return string Cleaned name; empty if the input only states that the
     *                place is unknown.
     */
    public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {

        return self::stripIndicators(
            $ort_name,
            self::PLACE_INDICATORS_DISALLOWED,
            self::PLACE_UNCERTAINTY_PREFIXES,
            self::PLACE_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Attempts guessing whether place is uncertain.
     *
     * @param string $ort_name Place name.
     *
     * @return boolean True if no uncertainty indicator was found (certain),
     *                 false if one was found (uncertain).
     */
    public static function guessPlaceCertainty(string $ort_name):bool {

        return !self::hasIndicator(
            $ort_name,
            self::PLACE_UNCERTAINTY_PREFIXES,
            self::PLACE_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from an actor name.
     *
     * @param string $value Input string.
     *
     * @return string Cleaned name; empty if the input only states that the
     *                actor is unknown.
     */
    public static function cleanUncertaintyIndicatorsPersinst(string $value):string {

        return self::stripIndicators(
            $value,
            self::PERSINST_INDICATORS_DISALLOWED,
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Attempts guessing whether persinst is uncertain.
     *
     * @param string $name Persinst name.
     *
     * @return boolean True if no uncertainty indicator was found (certain),
     *                 false if one was found (uncertain).
     */
    public static function guessPersinstCertainty(string $name):bool {

        return !self::hasIndicator(
            $name,
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

    }

}
|