361 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			361 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?PHP
 | |
| /**
 | |
|  * Contains class NodaUncertaintyHelper.
 | |
|  *
 | |
|  * @author Joshua Ramon Enslin <joshua@museum-digital.de>
 | |
|  */
 | |
| declare(strict_types = 1);
 | |
| 
 | |
/**
 * Contains static functions for identifying uncertainty or blocking
 * completely uncertain inputs for actors, times, and places.
 *
 * For each of the three entity types there is a list of values that are
 * rejected outright (*_INDICATORS_DISALLOWED), a list of leading markers
 * (*_UNCERTAINTY_PREFIXES) and a list of trailing markers
 * (*_UNCERTAINTY_SUFFIXES). The clean* functions remove the markers, the
 * guess* functions only report whether a marker is present.
 */
final class NodaUncertaintyHelper {

    /**
     * Actor names that carry no information. They are matched against the
     * whole (trimmed) input, never as substrings.
     */
    const PERSINST_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "Anonymus",
        "Anonym",
        "n.n.",
        "N.N.",
        "Künstler, unbekannt",
        "Unbekannte Person",
        "Unbekannter Maler",
        "Unbekannter Künstler",
        "nicht benannt",
        "nomen nominandum",
        "Nomen Nominandum",
        "ismeretlen",
        "Ismeretlen",
        "ismeretlen.",
        "Ismeretlen.",
        "ism.",
    ];

    /**
     * Leading markers expressing uncertainty about an actor name.
     */
    const PERSINST_UNCERTAINTY_PREFIXES = [
        "verm. ",
        "Verm. ",
        "vermtl. ",
        "Vermtl. ",
        "vermutlich ",
        "Vermutlich ",
        "wahrscheinlich ",
        "Wahrscheinlich ",
        "wohl ",
        "Wohl ",
    ];

    /**
     * Trailing markers expressing uncertainty about an actor name.
     */
    const PERSINST_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " [vermutlich]",
        " [verm.]",
        " [wahrscheinlich]",
    ];

    /**
     * Time names that carry no information. They are matched against the
     * whole (trimmed) input, never as substrings.
     */
    const TIME_INDICATORS_DISALLOWED = [
        "o.D.",
        "O.D.",
        "o.J.",
        "O.J.",
        "Ohne Datum",
        "ohne Datum",
        "Ohne Jahr",
        "ohne Jahr",
        "Unbekannt",
        "unbekannt",
        "ismeretlen",
        "Ismeretlen",
        "Neu",
        "Neu hergestellt",
    ];

    /**
     * Leading markers expressing uncertainty about a time name.
     */
    const TIME_UNCERTAINTY_PREFIXES = [
        "c. ",
        "ca ",
        "ca. ",
        "za. ",
        "~",
        "circa ",
        "gegen ",
        "um ",
        "Um ",
        "verm. ",
        "Verm. ",
        "vermtl. ",
        "Vermtl. ",
        "vermutlich ",
        "Vermutlich ",
        "wohl ",
        "wohl um ",
        "Wohl ",
        "Wohl um ",
    ];

    /**
     * Trailing markers expressing uncertainty about a time name.
     */
    const TIME_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "?",
        " (ca.)",
        " [ca.]",
        " (circa)",
        " [circa]",
        " (verm.)",
        " (vermutl.)",
        " körül",
        ", um",
        " (um)",
    ];

    /**
     * Place names that carry no information. They are matched against the
     * whole (trimmed) input, never as substrings.
     */
    const PLACE_INDICATORS_DISALLOWED = [
        "Unbekannt",
        "unbekannt",
        "Unknown",
        "unknown",
        "keine Angaben",
        "Keine Angaben",
        "nicht benannt",
        "ohne Angabe",
        "Ohne Angabe",
        "Ohne Ort",
        "ismeretlen",
        "Ismeretlen",
        "ism.",
        "o.O",
        "versch. O.",
        "o. O.",
        "Diverse O. u. o.O.",
        "o.O.",
    ];

    /**
     * Leading markers expressing uncertainty about a place name.
     */
    const PLACE_UNCERTAINTY_PREFIXES = [
        "ca ",
        "ca. ",
        "circa ",
        "evtl ",
        "evtl. ",
        "vlt. ",
        "wohl ",
        "Wohl ",
        "verm. ",
        "vermut. ",
        "vermtl. ",
        "vermutlich ",
    ];

    /**
     * Trailing markers expressing uncertainty about a place name.
     */
    const PLACE_UNCERTAINTY_SUFFIXES = [
        "(?)",
        "(vermutl.)",
        "[vermutl.]",
        "(vermutlich)",
        "[vermutlich]",
        "(wohl)",
        "[wohl]",
        "?",
    ];

    /**
     * Removes the given prefixes and suffixes from a value.
     *
     * Each list is applied in order and re-applied until a full pass removes
     * nothing more, so that stacked markers ("wohl um 1900", "1900 (?)?") are
     * removed completely regardless of the order of the list entries.
     * Comparison is byte-wise and case-sensitive; for valid UTF-8 input this
     * is equivalent to a character-wise comparison.
     *
     * @param string        $value    Input text.
     * @param array<string> $prefixes Leading markers to remove.
     * @param array<string> $suffixes Trailing markers to remove.
     *
     * @return string
     */
    private static function stripAffixes(string $value, array $prefixes, array $suffixes):string {

        do {
            $lengthBefore = \strlen($value);
            foreach ($prefixes as $prefix) {
                $len = \strlen($prefix);
                // Empty markers are skipped: they would match everything.
                if ($len !== 0 && \strncmp($value, $prefix, $len) === 0) {
                    // Cast guards against substr() returning false before PHP 8.
                    $value = (string)\substr($value, $len);
                }
            }
        } while (\strlen($value) !== $lengthBefore);

        do {
            $lengthBefore = \strlen($value);
            foreach ($suffixes as $suffix) {
                $len = \strlen($suffix);
                if ($len !== 0 && \substr($value, -$len) === $suffix) {
                    $value = (string)\substr($value, 0, -$len);
                }
            }
        } while (\strlen($value) !== $lengthBefore);

        return $value;

    }

    /**
     * Checks whether a value starts with one of the given prefixes or ends
     * with one of the given suffixes.
     *
     * The value is lowercased with strtolower() before comparison, so only
     * the lowercase entries of the marker lists can match. Leading whitespace
     * is ignored for the prefix check and trailing whitespace for the suffix
     * check.
     *
     * @param string        $value    Input text.
     * @param array<string> $prefixes Leading markers.
     * @param array<string> $suffixes Trailing markers.
     *
     * @return boolean
     */
    private static function hasUncertaintyAffix(string $value, array $prefixes, array $suffixes):bool {

        $lowered = \strtolower($value);

        $head = \ltrim($lowered);
        foreach ($prefixes as $prefix) {
            if (\strncmp($head, $prefix, \strlen($prefix)) === 0) {
                return true;
            }
        }

        $tail = \rtrim($lowered);
        foreach ($suffixes as $suffix) {
            if (\substr($tail, -1 * \strlen($suffix)) === $suffix) {
                return true;
            }
        }

        return false;

    }

    /**
     * Trims whitespace and common separator characters (comma, semicolon,
     * hyphen, colon) from both ends of the input.
     *
     * @param string $input Input text.
     *
     * @return string
     */
    public static function trim(string $input):string {

        return \trim($input, ", \t\n\r\n;-:");

    }

    /**
     * Removes uncertainty indicators from a time name.
     *
     * Returns an empty string if the trimmed input is one of
     * TIME_INDICATORS_DISALLOWED.
     *
     * @param string $name Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsTime(string $name):string {

        $name = self::trim($name);

        if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
            return "";
        }

        $name = self::stripAffixes($name, self::TIME_UNCERTAINTY_PREFIXES, self::TIME_UNCERTAINTY_SUFFIXES);

        return self::trim($name);

    }

    /**
     * Attempts guessing whether time is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * @param string $zeit_name Time name.
     *
     * @return boolean
     */
    public static function guessTimeCertainty(string $zeit_name):bool {

        return !self::hasUncertaintyAffix(
            $zeit_name,
            self::TIME_UNCERTAINTY_PREFIXES,
            self::TIME_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from a place name.
     *
     * Returns an empty string if the trimmed input is one of
     * PLACE_INDICATORS_DISALLOWED.
     *
     * @param string $ort_name Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {

        $ort_name = self::trim($ort_name);

        if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
            return "";
        }

        $ort_name = self::stripAffixes($ort_name, self::PLACE_UNCERTAINTY_PREFIXES, self::PLACE_UNCERTAINTY_SUFFIXES);

        return self::trim($ort_name);

    }

    /**
     * Attempts guessing whether place is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * @param string $ort_name Place name.
     *
     * @return boolean
     */
    public static function guessPlaceCertainty(string $ort_name):bool {

        return !self::hasUncertaintyAffix(
            $ort_name,
            self::PLACE_UNCERTAINTY_PREFIXES,
            self::PLACE_UNCERTAINTY_SUFFIXES
        );

    }

    /**
     * Removes uncertainty indicators from an actor name.
     *
     * Returns an empty string if the trimmed input is one of
     * PERSINST_INDICATORS_DISALLOWED.
     *
     * @param string $value Input string.
     *
     * @return string
     */
    public static function cleanUncertaintyIndicatorsPersinst(string $value):string {

        $value = self::trim($value);

        // The value is checked both as-is and with surrounding dots removed:
        // some listed values end in a dot ("N.N.", "ism.") and would never
        // match if only the dot-trimmed form were compared.
        if (\in_array($value, self::PERSINST_INDICATORS_DISALLOWED, true)
            || \in_array(\trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)
        ) {
            return "";
        }

        $value = self::stripAffixes(
            $value,
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

        return self::trim($value);

    }

    /**
     * Attempts guessing whether persinst is uncertain. Returns true if the name
     * indicates certainty, false if it indicates uncertainty.
     *
     * @param string $name Persinst name.
     *
     * @return boolean
     */
    public static function guessPersinstCertainty(string $name):bool {

        return !self::hasUncertaintyAffix(
            $name,
            self::PERSINST_UNCERTAINTY_PREFIXES,
            self::PERSINST_UNCERTAINTY_SUFFIXES
        );

    }
}
 |