Handle large numbers in GND IDs

This commit is contained in:
Joshua Ramon Enslin 2023-10-18 22:35:31 +02:00
parent c57d180aa1
commit 8f32c30fbd
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
2 changed files with 56 additions and 21 deletions

View File

@ -69,6 +69,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
'http://d-nb.info/gnd' => self::gnd, 'http://d-nb.info/gnd' => self::gnd,
'http://d-nb.info/gnd/' => self::gnd, 'http://d-nb.info/gnd/' => self::gnd,
'd-nb.info' => self::gnd, 'd-nb.info' => self::gnd,
'd-nb' => self::gnd,
'https://portal.dnb.de' => self::gnd, 'https://portal.dnb.de' => self::gnd,
'grobsystematik' => self::grobsystematik, 'grobsystematik' => self::grobsystematik,
'iconclass' => self::iconclass, 'iconclass' => self::iconclass,
@ -102,6 +103,9 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
'WIKIPEDIA' => self::wikidata, 'WIKIPEDIA' => self::wikidata,
'wikipedia' => self::wikipedia, 'wikipedia' => self::wikipedia,
'Wikipedia' => self::wikipedia, 'Wikipedia' => self::wikipedia,
'de.wikipedia.org' => self::wikipedia,
'en.wikipedia.org' => self::wikipedia,
'fr.wikipedia.org' => self::wikipedia,
default => throw new MDpageParameterNotFromListException("Unknown norm data repository: '" . $input . "'"), default => throw new MDpageParameterNotFromListException("Unknown norm data repository: '" . $input . "'"),
}; };
@ -258,6 +262,35 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
} }
/**
 * Checks whether a string is an integer-like numeric ID.
 *
 * Unlike PHP's built-in integer validation, this also accepts:
 * - digit strings of arbitrary length (e.g. IDs with more than 20 digits,
 *   which exceed the integer range and fail FILTER_VALIDATE_INT),
 * - digit strings with leading zeros,
 * - strings consisting only of zeros (e.g. "0").
 *
 * @param string $value Input to check.
 *
 * @return boolean True if the input is considered numeric, false otherwise
 *                 (including for the empty string).
 */
private static function _is_numeric(string $value):bool {

    // Any non-empty string made up exclusively of the digits 0-9 is numeric,
    // regardless of its length or of leading zeros. This covers overly large
    // IDs as well as all-zero strings such as "0", which would otherwise be
    // reduced to an empty string by the leading-zero stripping below.
    if ($value !== '' && strspn($value, '0123456789') === strlen($value)) {
        return true;
    }

    // Everything else is left to FILTER_VALIDATE_INT, so that any input
    // accepted before is still accepted. The filter does not accept a
    // leading 0, so leading zeros are stripped before checking.
    return filter_var(ltrim($value, '0'), FILTER_VALIDATE_INT) !== false;

}
/** /**
* Validates a numeric ID, returning a string or false. * Validates a numeric ID, returning a string or false.
* *
@ -276,22 +309,9 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
$id = strtr($id, $toRemove); $id = strtr($id, $toRemove);
} }
// FILTER_VALIDATE_INT fails on overly large IDs (e.g. VIAF IDs having if (!self::_is_numeric($id)) {
// more than 20 digits).
// In these cases, simply check for the existence of non-numeric characters.
if (strlen($id) > 9) {
if (empty(trim($id, '0123456789'))) {
return $id;
}
}
// Strings starting with 0 are quite often linked, notably with the NDL.
// PHP's FILTER_VALIDATE_INT does not accept a leading 0 however, so it
// is stripped before checking.
if (filter_var(ltrim($id, '0'), FILTER_VALIDATE_INT) === false) {
return false; return false;
} }
return $id; return $id;
} }
@ -340,6 +360,10 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
*/ */
private static function validateGndId(string $id, array $prefixes):string|false { private static function validateGndId(string $id, array $prefixes):string|false {
if (str_contains($id, ' ')) {
$id = strtr($id, [' ' => '', "\t" => '']);
}
if (filter_var($id, FILTER_VALIDATE_URL) !== false) { if (filter_var($id, FILTER_VALIDATE_URL) !== false) {
$toRemove = []; $toRemove = [];
foreach ($prefixes as $prefix) $toRemove[$prefix] = ""; foreach ($prefixes as $prefix) $toRemove[$prefix] = "";
@ -350,7 +374,8 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
if (preg_match("/^[0-9-X]*$/", $id) === false) { if (preg_match("/^[0-9-X]*$/", $id) === false) {
return false; return false;
} }
if (is_numeric(strtr($id, ['-' => '', 'X' => ''])) === false) { if (self::_is_numeric(strtr($id, ['-' => '', 'X' => ''])) === false) {
throw new Exception($id);
return false; return false;
} }
@ -447,7 +472,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
if (preg_match("/^[0-9-PIM]*$/", $id) === false) { if (preg_match("/^[0-9-PIM]*$/", $id) === false) {
return false; return false;
} }
if (is_numeric(substr($id, 3)) === false) { if (self::_is_numeric(substr($id, 3)) === false) {
return false; return false;
} }
@ -468,11 +493,11 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
$id = strtr($id, ['https://www.npg.org.uk/collections/search/person/' => '']); $id = strtr($id, ['https://www.npg.org.uk/collections/search/person/' => '']);
} }
if (substr($id, 0, 2) === 'mp' && is_numeric(substr($id, 2))) { if (substr($id, 0, 2) === 'mp' && self::_is_numeric(substr($id, 2))) {
return $id; return $id;
} }
if (filter_var($id, FILTER_VALIDATE_INT) === false) { if (self::_is_numeric($id) === false) {
return false; return false;
} }
@ -490,11 +515,14 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
private static function validateWikidataId(string $id):string|false { private static function validateWikidataId(string $id):string|false {
if (filter_var($id, FILTER_VALIDATE_URL) !== false) { if (filter_var($id, FILTER_VALIDATE_URL) !== false) {
$id = strtr($id, ['https://www.wikidata.org/wiki/' => '']); $id = strtr($id, [
'https://www.wikidata.org/wiki/' => '',
'https://www.wikidata.org/w/index.php?search=&search=' => '',
]);
} }
if (substr($id, 0, 1) !== 'Q') { if (substr($id, 0, 1) !== 'Q') {
throw new MDgenericInvalidInputsException("Wikidata IDs must be Q IDs - and start with that letter"); throw new MDInvalidNodaLinkException("Wikidata IDs must be Q IDs - and start with that letter (provided: $id)");
} }
if (filter_var(substr($id, 1), FILTER_VALIDATE_INT) === false) { if (filter_var(substr($id, 1), FILTER_VALIDATE_INT) === false) {
@ -537,10 +565,16 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
$validation = strtr($id, [ $validation = strtr($id, [
'http://de.wikipedia.org/wiki/' => '', 'http://de.wikipedia.org/wiki/' => '',
'https://de.wikipedia.org/wiki/' => '', 'https://de.wikipedia.org/wiki/' => '',
'http://da.wikipedia.org/wiki/' => '',
'https://da.wikipedia.org/wiki/' => '',
'http://en.wikipedia.org/wiki/' => '', 'http://en.wikipedia.org/wiki/' => '',
'https://en.wikipedia.org/wiki/' => '', 'https://en.wikipedia.org/wiki/' => '',
'http://es.wikipedia.org/wiki/' => '',
'https://es.wikipedia.org/wiki/' => '',
'http://fr.wikipedia.org/wiki/' => '', 'http://fr.wikipedia.org/wiki/' => '',
'https://fr.wikipedia.org/wiki/' => '', 'https://fr.wikipedia.org/wiki/' => '',
'http://hu.wikipedia.org/wiki/' => '',
'https://hu.wikipedia.org/wiki/' => '',
'http://id.wikipedia.org/wiki/' => '', 'http://id.wikipedia.org/wiki/' => '',
'https://id.wikipedia.org/wiki/' => '', 'https://id.wikipedia.org/wiki/' => '',
'http://it.wikipedia.org/wiki/' => '', 'http://it.wikipedia.org/wiki/' => '',
@ -590,7 +624,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
self::mbl => self::validateNumericId($id, ['http://www.uni-magdeburg.de/mbl/PHP_Skripte/mbl_verwaltung/mbl_verw_anzeige_biog.php?auswahl=3&liste_biog_name=']), self::mbl => self::validateNumericId($id, ['http://www.uni-magdeburg.de/mbl/PHP_Skripte/mbl_verwaltung/mbl_verw_anzeige_biog.php?auswahl=3&liste_biog_name=']),
self::mindatorg => self::validateNumericId($id, ['https://www.mindat.org/min-', '.html']), self::mindatorg => self::validateNumericId($id, ['https://www.mindat.org/min-', '.html']),
self::moebeltypologie => self::validateNumericId($id, ['https://term.museum-digital.de/moebel/tag/']), self::moebeltypologie => self::validateNumericId($id, ['https://term.museum-digital.de/moebel/tag/']),
self::ndb_adb => self::validateGndId($id, ['https://www.deutsche-biographie.de/pnd', '.html', '#adbcontent', '#ndbcontent']), self::ndb_adb => self::validateGndId($id, ['https://www.deutsche-biographie.de/pnd', '.html', '#adbcontent', '#ndbcontent', '#indexcontent']),
self::ndl => self::validateNumericId($id, [ self::ndl => self::validateNumericId($id, [
'http://id.ndl.go.jp/auth/ndlna/', 'http://id.ndl.go.jp/auth/ndlna/',
'https://id.ndl.go.jp/auth/ndlna/', 'https://id.ndl.go.jp/auth/ndlna/',

View File

@ -28,6 +28,7 @@ final class MDNodaRepositoryTest extends TestCase {
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("https://d-nb.info/gnd/102423008")); self::assertEquals("102423008", MDNodaRepository::gnd->validateId("https://d-nb.info/gnd/102423008"));
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/102423008")); self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/102423008"));
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/ 102423008")); self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/ 102423008"));
self::assertEquals("1037602218", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/1037602218"));
self::assertEquals("102423008", MDNodaRepository::gnd->validateId("102423008")); self::assertEquals("102423008", MDNodaRepository::gnd->validateId("102423008"));
// NDL (Japan) // NDL (Japan)