Handle large numbers in GND IDs

This commit is contained in:
Joshua Ramon Enslin 2023-10-18 22:35:31 +02:00
parent c57d180aa1
commit 8f32c30fbd
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
2 changed files with 56 additions and 21 deletions

View File

@ -69,6 +69,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
'http://d-nb.info/gnd' => self::gnd,
'http://d-nb.info/gnd/' => self::gnd,
'd-nb.info' => self::gnd,
'd-nb' => self::gnd,
'https://portal.dnb.de' => self::gnd,
'grobsystematik' => self::grobsystematik,
'iconclass' => self::iconclass,
@ -102,6 +103,9 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
'WIKIPEDIA' => self::wikidata,
'wikipedia' => self::wikipedia,
'Wikipedia' => self::wikipedia,
'de.wikipedia.org' => self::wikipedia,
'en.wikipedia.org' => self::wikipedia,
'fr.wikipedia.org' => self::wikipedia,
default => throw new MDpageParameterNotFromListException("Unknown norm data repository: '" . $input . "'"),
};
@ -258,6 +262,35 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
}
/**
 * Checks whether a string is an integer-like value, independent of its length.
 *
 * Unlike a plain FILTER_VALIDATE_INT check, this accepts digit strings with
 * leading zeros (linked quite often, notably with the NDL) as well as digit
 * strings too long to fit into a PHP integer (e.g. VIAF IDs having more than
 * 20 digits).
 *
 * @param string $value Input to check.
 *
 * @return boolean True if the input consists solely of digits, or if it is
 *                 accepted by FILTER_VALIDATE_INT once leading zeros have
 *                 been stripped. False otherwise, including for an empty string.
 */
private static function _is_numeric(string $value):bool {

    // A non-empty string made up only of decimal digits is numeric no matter
    // how long it is. Checking this first covers oversized IDs and strings
    // consisting only of zeros ("0", "000"), which the integer filter below
    // rejects because stripping the zeros leaves an empty string.
    if (ctype_digit($value)) {
        return true;
    }

    // Fallback for everything else the integer filter accepts.
    // PHP's FILTER_VALIDATE_INT does not accept a leading 0, so it
    // is stripped before checking.
    return filter_var(ltrim($value, '0'), FILTER_VALIDATE_INT) !== false;

}
/**
* Validates a numeric ID, returning a string or false.
*
@ -276,22 +309,9 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
$id = strtr($id, $toRemove);
}
// FILTER_VALIDATE_INT fails on overly large IDs (e.g. VIAF IDs having
// more than 20 digits).
// In these cases, simply check for the existence of non-numeric characters.
if (strlen($id) > 9) {
if (empty(trim($id, '0123456789'))) {
return $id;
}
}
// Strings starting with 0 are quite often linked, notably with the NDL.
// PHP's FILTER_VALIDATE_INT does not accept a leading 0 however, so it
// is stripped before checking.
if (filter_var(ltrim($id, '0'), FILTER_VALIDATE_INT) === false) {
if (!self::_is_numeric($id)) {
return false;
}
return $id;
}
@ -340,6 +360,10 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
*/
private static function validateGndId(string $id, array $prefixes):string|false {
if (str_contains($id, ' ')) {
$id = strtr($id, [' ' => '', "\t" => '']);
}
if (filter_var($id, FILTER_VALIDATE_URL) !== false) {
$toRemove = [];
foreach ($prefixes as $prefix) $toRemove[$prefix] = "";
@ -350,7 +374,8 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
if (preg_match("/^[0-9-X]*$/", $id) === false) {
return false;
}
if (is_numeric(strtr($id, ['-' => '', 'X' => ''])) === false) {
if (self::_is_numeric(strtr($id, ['-' => '', 'X' => ''])) === false) {
throw new Exception($id);
return false;
}
@ -447,7 +472,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
if (preg_match("/^[0-9-PIM]*$/", $id) === false) {
return false;
}
if (is_numeric(substr($id, 3)) === false) {
if (self::_is_numeric(substr($id, 3)) === false) {
return false;
}
@ -468,11 +493,11 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
$id = strtr($id, ['https://www.npg.org.uk/collections/search/person/' => '']);
}
if (substr($id, 0, 2) === 'mp' && is_numeric(substr($id, 2))) {
if (substr($id, 0, 2) === 'mp' && self::_is_numeric(substr($id, 2))) {
return $id;
}
if (filter_var($id, FILTER_VALIDATE_INT) === false) {
if (self::_is_numeric($id) === false) {
return false;
}
@ -490,11 +515,14 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
private static function validateWikidataId(string $id):string|false {
if (filter_var($id, FILTER_VALIDATE_URL) !== false) {
$id = strtr($id, ['https://www.wikidata.org/wiki/' => '']);
$id = strtr($id, [
'https://www.wikidata.org/wiki/' => '',
'https://www.wikidata.org/w/index.php?search=&search=' => '',
]);
}
if (substr($id, 0, 1) !== 'Q') {
throw new MDgenericInvalidInputsException("Wikidata IDs must be Q IDs - and start with that letter");
throw new MDInvalidNodaLinkException("Wikidata IDs must be Q IDs - and start with that letter (provided: $id)");
}
if (filter_var(substr($id, 1), FILTER_VALIDATE_INT) === false) {
@ -537,10 +565,16 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
$validation = strtr($id, [
'http://de.wikipedia.org/wiki/' => '',
'https://de.wikipedia.org/wiki/' => '',
'http://da.wikipedia.org/wiki/' => '',
'https://da.wikipedia.org/wiki/' => '',
'http://en.wikipedia.org/wiki/' => '',
'https://en.wikipedia.org/wiki/' => '',
'http://es.wikipedia.org/wiki/' => '',
'https://es.wikipedia.org/wiki/' => '',
'http://fr.wikipedia.org/wiki/' => '',
'https://fr.wikipedia.org/wiki/' => '',
'http://hu.wikipedia.org/wiki/' => '',
'https://hu.wikipedia.org/wiki/' => '',
'http://id.wikipedia.org/wiki/' => '',
'https://id.wikipedia.org/wiki/' => '',
'http://it.wikipedia.org/wiki/' => '',
@ -590,7 +624,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
self::mbl => self::validateNumericId($id, ['http://www.uni-magdeburg.de/mbl/PHP_Skripte/mbl_verwaltung/mbl_verw_anzeige_biog.php?auswahl=3&liste_biog_name=']),
self::mindatorg => self::validateNumericId($id, ['https://www.mindat.org/min-', '.html']),
self::moebeltypologie => self::validateNumericId($id, ['https://term.museum-digital.de/moebel/tag/']),
self::ndb_adb => self::validateGndId($id, ['https://www.deutsche-biographie.de/pnd', '.html', '#adbcontent', '#ndbcontent']),
self::ndb_adb => self::validateGndId($id, ['https://www.deutsche-biographie.de/pnd', '.html', '#adbcontent', '#ndbcontent', '#indexcontent']),
self::ndl => self::validateNumericId($id, [
'http://id.ndl.go.jp/auth/ndlna/',
'https://id.ndl.go.jp/auth/ndlna/',

View File

@ -28,6 +28,7 @@ final class MDNodaRepositoryTest extends TestCase {
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("https://d-nb.info/gnd/102423008"));
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/102423008"));
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/ 102423008"));
self::assertEquals("1037602218", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/1037602218"));
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("102423008"));
// NDL (Japan)