Properly handle commas at the end of names when guessing certainty
This commit is contained in:
parent
eb371d4270
commit
29ca05f552
@ -12,7 +12,7 @@ declare(strict_types = 1);
|
||||
*/
|
||||
final class NodaUncertaintyHelper {
|
||||
|
||||
const PERSINST_INDICATORS_DISALLOWED = [
|
||||
public const PERSINST_INDICATORS_DISALLOWED = [
|
||||
"Unbekannt",
|
||||
"unbekannt",
|
||||
"Anonymus",
|
||||
@ -41,7 +41,7 @@ final class NodaUncertaintyHelper {
|
||||
"Невідомий артист", // Unknown artist
|
||||
];
|
||||
|
||||
const PERSINST_UNCERTAINTY_PREFIXES = [
|
||||
public const PERSINST_UNCERTAINTY_PREFIXES = [
|
||||
"verm. ",
|
||||
"Verm. ",
|
||||
"vermtl. ",
|
||||
@ -57,7 +57,7 @@ final class NodaUncertaintyHelper {
|
||||
"?",
|
||||
];
|
||||
|
||||
const PERSINST_UNCERTAINTY_SUFFIXES = [
|
||||
public const PERSINST_UNCERTAINTY_SUFFIXES = [
|
||||
"(?)",
|
||||
"?",
|
||||
" [vermutlich]",
|
||||
@ -65,7 +65,7 @@ final class NodaUncertaintyHelper {
|
||||
" [wahrscheinlich]",
|
||||
];
|
||||
|
||||
const TIME_INDICATORS_DISALLOWED = [
|
||||
public const TIME_INDICATORS_DISALLOWED = [
|
||||
"Nachgewiesen",
|
||||
"nachgewiesen",
|
||||
"o.D.",
|
||||
@ -94,9 +94,9 @@ final class NodaUncertaintyHelper {
|
||||
"Без датування", // No dating
|
||||
"б.р.", // No dating
|
||||
"б.д.", // No dating
|
||||
];
|
||||
];
|
||||
|
||||
const TIME_UNCERTAINTY_PREFIXES = [
|
||||
public const TIME_UNCERTAINTY_PREFIXES = [
|
||||
"c. ",
|
||||
"ca ",
|
||||
"ca. ",
|
||||
@ -130,9 +130,9 @@ final class NodaUncertaintyHelper {
|
||||
"майже", // UK: Almost / nearly / about
|
||||
"орієнтовно", // UK: approximately
|
||||
"Прибл.", // UK: approximately
|
||||
];
|
||||
];
|
||||
|
||||
const TIME_UNCERTAINTY_SUFFIXES = [
|
||||
public const TIME_UNCERTAINTY_SUFFIXES = [
|
||||
"(?)",
|
||||
"?",
|
||||
" (ca.)",
|
||||
@ -145,12 +145,12 @@ final class NodaUncertaintyHelper {
|
||||
", um",
|
||||
" (um)",
|
||||
" (ок.)",
|
||||
];
|
||||
];
|
||||
|
||||
/**
|
||||
* Substrings used to express uncertainty about the validity of a place name.
|
||||
*/
|
||||
const PLACE_INDICATORS_DISALLOWED = [
|
||||
/**
|
||||
* Substrings used to express uncertainty about the validity of a place name.
|
||||
*/
|
||||
public const PLACE_INDICATORS_DISALLOWED = [
|
||||
"Unbekannt",
|
||||
"unbekannt",
|
||||
"Unknown",
|
||||
@ -173,9 +173,9 @@ final class NodaUncertaintyHelper {
|
||||
"не вказано", // No place
|
||||
"не вказане", // No place
|
||||
"невідоме", // No place
|
||||
];
|
||||
];
|
||||
|
||||
const PLACE_UNCERTAINTY_PREFIXES = [
|
||||
public const PLACE_UNCERTAINTY_PREFIXES = [
|
||||
"ca ",
|
||||
"Ca ",
|
||||
"ca. ",
|
||||
@ -210,9 +210,9 @@ final class NodaUncertaintyHelper {
|
||||
"Wahrscheinlich ",
|
||||
"можливо",
|
||||
"?",
|
||||
];
|
||||
];
|
||||
|
||||
const PLACE_UNCERTAINTY_SUFFIXES = [
|
||||
public const PLACE_UNCERTAINTY_SUFFIXES = [
|
||||
"(?)",
|
||||
"(vermutl.)",
|
||||
"[vermutl.]",
|
||||
@ -221,206 +221,206 @@ final class NodaUncertaintyHelper {
|
||||
"(wohl)",
|
||||
"[wohl]",
|
||||
"?",
|
||||
];
|
||||
];
|
||||
|
||||
/**
|
||||
* Trims common characters and character marks.
|
||||
*
|
||||
* @param string $input Input text.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function trim(string $input):string {
|
||||
/**
|
||||
* Trims common characters and character marks.
|
||||
*
|
||||
* @param string $input Input text.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function trim(string $input):string {
|
||||
|
||||
$input = \trim($input, ", \t\n\r\n;-:");
|
||||
return $input;
|
||||
$input = \trim($input, ", \t\n\r\n;-:");
|
||||
return $input;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes uncertainty indicators from a time name.
|
||||
*
|
||||
* @param string $name Input string.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function cleanUncertaintyIndicatorsTime(string $name):string {
|
||||
|
||||
$name = self::trim($name);
|
||||
|
||||
if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes uncertainty indicators from a time name.
|
||||
*
|
||||
* @param string $name Input string.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function cleanUncertaintyIndicatorsTime(string $name):string {
|
||||
|
||||
$name = self::trim($name);
|
||||
|
||||
if (\in_array($name, self::TIME_INDICATORS_DISALLOWED, true)) {
|
||||
return "";
|
||||
// Remove uncertainty prefixes
|
||||
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($name, 0, \strlen($prefix)) === "$prefix") {
|
||||
$name = substr($name, \strlen($prefix));
|
||||
}
|
||||
|
||||
// Remove uncertainty prefixes
|
||||
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($name, 0, \strlen($prefix)) === "$prefix") {
|
||||
$name = substr($name, \strlen($prefix));
|
||||
}
|
||||
}
|
||||
|
||||
// Remove uncertainty suffixes
|
||||
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) {
|
||||
if (\substr($name, \strlen($suffix) * -1) === "$suffix") {
|
||||
$name = \substr($name, 0, \strlen($suffix) * -1);
|
||||
}
|
||||
}
|
||||
|
||||
return self::trim($name);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts guessing whether time is uncertain. Returns true if the name
|
||||
* indicates certainty, false if it indicates uncertainty.
|
||||
*
|
||||
* @param string $zeit_name Time name.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public static function guessTimeCertainty(string $zeit_name):bool {
|
||||
|
||||
$zeit_name = \strtolower($zeit_name);
|
||||
|
||||
// Attempt to guess uncertainty based on prefixes.
|
||||
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertainty found
|
||||
}
|
||||
// Remove uncertainty suffixes
|
||||
foreach (NodaUncertaintyHelper::TIME_UNCERTAINTY_SUFFIXES as $suffix) {
|
||||
if (\substr($name, \strlen($suffix) * -1) === "$suffix") {
|
||||
$name = \substr($name, 0, \strlen($suffix) * -1);
|
||||
}
|
||||
|
||||
// Attempt to guess uncertainty based on suffixes.
|
||||
foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) {
|
||||
if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertainty found
|
||||
}
|
||||
}
|
||||
|
||||
return true; // No uncertainty found
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes uncertainty indicators from a place name.
|
||||
*
|
||||
* @param string $ort_name Input string.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {
|
||||
return self::trim($name);
|
||||
|
||||
$ort_name = self::trim($ort_name);
|
||||
}
|
||||
|
||||
if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
|
||||
return "";
|
||||
/**
|
||||
* Attempts guessing whether time is uncertain. Returns true if the name
|
||||
* indicates certainty, false if it indicates uncertainty.
|
||||
*
|
||||
* @param string $zeit_name Time name.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public static function guessTimeCertainty(string $zeit_name):bool {
|
||||
|
||||
$zeit_name = \strtolower($zeit_name);
|
||||
|
||||
// Attempt to guess uncertainty based on prefixes.
|
||||
foreach (self::TIME_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($zeit_name, 0, \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertainty found
|
||||
}
|
||||
|
||||
// Remove uncertainty prefixes
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") {
|
||||
$ort_name = substr($ort_name, \strlen($prefix));
|
||||
}
|
||||
}
|
||||
|
||||
// Remove uncertainty suffixes
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
|
||||
if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") {
|
||||
$ort_name = \substr($ort_name, 0, \strlen($suffix) * -1);
|
||||
}
|
||||
}
|
||||
|
||||
return self::trim($ort_name);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts guessing whether place is uncertain. Returns true if the name
|
||||
* indicates certainty, false if it indicates uncertainty.
|
||||
*
|
||||
* @param string $ort_name Place name.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public static function guessPlaceCertainty(string $ort_name):bool {
|
||||
|
||||
$ort_name = \strtolower($ort_name);
|
||||
|
||||
// Attempt to guess uncertainty based on prefixes.
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
// Attempt to guess uncertainty based on suffixes.
|
||||
foreach (self::TIME_UNCERTAINTY_SUFFIXES as $prefix) {
|
||||
if (\substr($zeit_name, -1 * \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertainty found
|
||||
}
|
||||
|
||||
// Attempt to guess uncertainty based on suffixes.
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) {
|
||||
if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
}
|
||||
|
||||
return true; // Certain / no uncertainty found
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes uncertainty indicators from an actor name.
|
||||
*
|
||||
* @param string $value Input string.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function cleanUncertaintyIndicatorsPersinst(string $value):string {
|
||||
return true; // No uncertainty found
|
||||
|
||||
$value = self::trim($value);
|
||||
}
|
||||
|
||||
if (\in_array(trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) {
|
||||
return "";
|
||||
}
|
||||
/**
|
||||
* Removes uncertainty indicators from a place name.
|
||||
*
|
||||
* @param string $ort_name Input string.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function cleanUncertaintyIndicatorsPlace(string $ort_name):string {
|
||||
|
||||
foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) {
|
||||
if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) {
|
||||
$value = substr($value, \mb_strlen($toRemove));
|
||||
}
|
||||
}
|
||||
|
||||
foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
|
||||
if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") {
|
||||
$value = \mb_substr($value, 0, \mb_strlen($suffix) * -1);
|
||||
}
|
||||
}
|
||||
|
||||
return self::trim($value);
|
||||
$ort_name = self::trim($ort_name);
|
||||
|
||||
if (\in_array($ort_name, self::PLACE_INDICATORS_DISALLOWED, true)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts guessing whether persinst is uncertain. Returns true if the name
|
||||
* indicates certainty, false if it indicates uncertainty.
|
||||
*
|
||||
* @param string $name Persinst name.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public static function guessPersinstCertainty(string $name):bool {
|
||||
|
||||
$name = \trim(\strtolower($name));
|
||||
|
||||
// Attempt to guess uncertainty based on prefixes.
|
||||
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($name, 0, \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
// Remove uncertainty prefixes
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($ort_name, 0, \strlen($prefix)) === "$prefix") {
|
||||
$ort_name = substr($ort_name, \strlen($prefix));
|
||||
}
|
||||
|
||||
// Attempt to guess uncertainty based on suffixes.
|
||||
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) {
|
||||
if (\substr($name, -1 * \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
}
|
||||
|
||||
return true; // Certain / no uncertainty found
|
||||
|
||||
}
|
||||
|
||||
// Remove uncertainty suffixes
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
|
||||
if (\substr($ort_name, \strlen($suffix) * -1) === "$suffix") {
|
||||
$ort_name = \substr($ort_name, 0, \strlen($suffix) * -1);
|
||||
}
|
||||
}
|
||||
|
||||
return self::trim($ort_name);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts guessing whether place is uncertain. Returns true if the name
|
||||
* indicates certainty, false if it indicates uncertainty.
|
||||
*
|
||||
* @param string $ort_name Place name.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public static function guessPlaceCertainty(string $ort_name):bool {
|
||||
|
||||
$ort_name = \trim(\strtolower($ort_name), ', ;-_');
|
||||
|
||||
// Attempt to guess uncertainty based on prefixes.
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($ort_name, 0, \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
}
|
||||
|
||||
// Attempt to guess uncertainty based on suffixes.
|
||||
foreach (NodaUncertaintyHelper::PLACE_UNCERTAINTY_SUFFIXES as $prefix) {
|
||||
if (\substr($ort_name, -1 * \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
}
|
||||
|
||||
return true; // Certain / no uncertainty found
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes uncertainty indicators from an actor name.
|
||||
*
|
||||
* @param string $value Input string.
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public static function cleanUncertaintyIndicatorsPersinst(string $value):string {
|
||||
|
||||
$value = self::trim($value);
|
||||
|
||||
if (\in_array(trim($value, ";. "), self::PERSINST_INDICATORS_DISALLOWED, true)) {
|
||||
return "";
|
||||
}
|
||||
|
||||
foreach (self::PERSINST_UNCERTAINTY_PREFIXES as $toRemove) {
|
||||
if (\mb_substr($value, 0, \mb_strlen($toRemove)) === $toRemove) {
|
||||
$value = substr($value, \mb_strlen($toRemove));
|
||||
}
|
||||
}
|
||||
|
||||
foreach (self::PLACE_UNCERTAINTY_SUFFIXES as $suffix) {
|
||||
if (\mb_substr($value, \mb_strlen($suffix) * -1) === "$suffix") {
|
||||
$value = \mb_substr($value, 0, \mb_strlen($suffix) * -1);
|
||||
}
|
||||
}
|
||||
|
||||
return self::trim($value);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts guessing whether persinst is uncertain. Returns true if the name
|
||||
* indicates certainty, false if it indicates uncertainty.
|
||||
*
|
||||
* @param string $name Persinst name.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public static function guessPersinstCertainty(string $name):bool {
|
||||
|
||||
$name = \trim(\strtolower($name), ', ;-_');
|
||||
|
||||
// Attempt to guess uncertainty based on prefixes.
|
||||
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_PREFIXES as $prefix) {
|
||||
if (\substr($name, 0, \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
}
|
||||
|
||||
// Attempt to guess uncertainty based on suffixes.
|
||||
foreach (NodaUncertaintyHelper::PERSINST_UNCERTAINTY_SUFFIXES as $prefix) {
|
||||
if (\substr($name, -1 * \strlen($prefix)) === $prefix) {
|
||||
return false; // Uncertain
|
||||
}
|
||||
}
|
||||
|
||||
return true; // Certain / no uncertainty found
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -6,12 +6,14 @@
|
||||
*/
|
||||
declare(strict_types = 1);
|
||||
use PHPUnit\Framework\TestCase;
|
||||
use PHPUnit\Framework\Attributes\CoversClass;
|
||||
use PHPUnit\Framework\Attributes\Small;
|
||||
|
||||
/**
|
||||
* This script contains tests for the uncertainty helper.
|
||||
*
|
||||
* @covers \NodaUncertaintyHelper
|
||||
*/
|
||||
#[small]
|
||||
#[CoversClass(\NodaUncertaintyHelper::class)]
|
||||
final class NodaUncertaintyHelperTest extends TestCase {
|
||||
/**
|
||||
* Removes uncertainty indicators from an time name.
|
||||
@ -76,6 +78,9 @@ final class NodaUncertaintyHelperTest extends TestCase {
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("vermutl. Augsburg"));
|
||||
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Berlin"));
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?"));
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Berlin?,"));
|
||||
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin,"));
|
||||
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Berlin"));
|
||||
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("?-Italien"));
|
||||
@ -109,6 +114,9 @@ final class NodaUncertaintyHelperTest extends TestCase {
|
||||
public static function testGuessPersinstCertainty():void {
|
||||
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("wohl Barbarossa"));
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?"));
|
||||
self::assertFalse(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa?,"));
|
||||
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa,"));
|
||||
self::assertTrue(NodaUncertaintyHelper::guessPlaceCertainty("Barbarossa"));
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user