From 8f32c30fbd0d9b0e9170e83d975c150f6e287715 Mon Sep 17 00:00:00 2001
From: Joshua Ramon Enslin
Date: Wed, 18 Oct 2023 22:35:31 +0200
Subject: [PATCH] Handle large numbers in GND IDs

---
Review notes (free-form section, not part of the commit message):
 * Removed a stray debugging `throw new Exception($id);` that the
   validateGndId() hunk added in front of `return false;`. It made the
   return unreachable and broke the string|false contract. The hunk
   header, the new-side offsets of all later hunks and the diffstat
   were adjusted for the removed line; the `index` line still names the
   original post-image blob.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch. Blank lines follow from the hunk counts, but the
   indentation is a reconstruction: apply with --ignore-whitespace if
   the context does not match byte for byte.
 * Not changed here (pre-existing context): the
   `preg_match(...) === false` guards only trigger on a regex error,
   never on a non-matching ID. Worth a follow-up.

 src/enums/MDNodaRepository.php | 75 ++++++++++++++++++++++++----------
 tests/MDNodaRepositoryTest.php |  1 +
 2 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/src/enums/MDNodaRepository.php b/src/enums/MDNodaRepository.php
index 7f06d91..9a91043 100644
--- a/src/enums/MDNodaRepository.php
+++ b/src/enums/MDNodaRepository.php
@@ -69,6 +69,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
             'http://d-nb.info/gnd' => self::gnd,
             'http://d-nb.info/gnd/' => self::gnd,
             'd-nb.info' => self::gnd,
+            'd-nb' => self::gnd,
             'https://portal.dnb.de' => self::gnd,
             'grobsystematik' => self::grobsystematik,
             'iconclass' => self::iconclass,
@@ -102,6 +103,9 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
             'WIKIPEDIA' => self::wikidata,
             'wikipedia' => self::wikipedia,
             'Wikipedia' => self::wikipedia,
+            'de.wikipedia.org' => self::wikipedia,
+            'en.wikipedia.org' => self::wikipedia,
+            'fr.wikipedia.org' => self::wikipedia,
             default => throw new MDpageParameterNotFromListException("Unknown norm data repository: '" . $input . "'"),
         };
 
@@ -258,6 +262,35 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
 
     }
 
+    /**
+     * Extension of PHP's built-in is_numeric() that also supports large numbers.
+     *
+     * @param string $value Input to check.
+     *
+     * @return boolean
+     */
+    private static function _is_numeric(string $value):bool {
+
+        // Strings starting with 0 are quite often linked, notably with the NDL.
+        // PHP's FILTER_VALIDATE_INT does not accept a leading 0 however, so it
+        // is stripped before checking.
+        if (filter_var(ltrim($value, '0'), FILTER_VALIDATE_INT) !== false) {
+            return true;
+        }
+
+        // FILTER_VALIDATE_INT fails on overly large IDs (e.g. VIAF IDs having
+        // more than 20 digits).
+        // In these cases, simply check for the existence of non-numeric characters.
+        if (strlen($value) > 9) {
+            if (empty(trim($value, '0123456789'))) {
+                return true;
+            }
+        }
+
+        return false;
+
+    }
+
     /**
      * Validates a numeric ID, returning a string or false.
      *
@@ -276,22 +309,9 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
             $id = strtr($id, $toRemove);
         }
 
-        // FILTER_VALIDATE_INT fails on overly large IDs (e.g. VIAF IDs having
-        // more than 20 digits).
-        // In these cases, simply check for the existence of non-numeric characters.
-        if (strlen($id) > 9) {
-            if (empty(trim($id, '0123456789'))) {
-                return $id;
-            }
-        }
-
-        // Strings starting with 0 are quite often linked, notably with the NDL.
-        // PHP's FILTER_VALIDATE_INT does not accept a leading 0 however, so it
-        // is stripped before checking.
-        if (filter_var(ltrim($id, '0'), FILTER_VALIDATE_INT) === false) {
+        if (!self::_is_numeric($id)) {
             return false;
         }
-
         return $id;
 
     }
@@ -340,6 +360,10 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
      */
     private static function validateGndId(string $id, array $prefixes):string|false {
 
+        if (str_contains($id, ' ')) {
+            $id = strtr($id, [' ' => '', "\t" => '']);
+        }
+
         if (filter_var($id, FILTER_VALIDATE_URL) !== false) {
             $toRemove = [];
             foreach ($prefixes as $prefix) $toRemove[$prefix] = "";
@@ -350,7 +374,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
         if (preg_match("/^[0-9-X]*$/", $id) === false) {
             return false;
         }
-        if (is_numeric(strtr($id, ['-' => '', 'X' => ''])) === false) {
+        if (self::_is_numeric(strtr($id, ['-' => '', 'X' => ''])) === false) {
             return false;
         }
 
@@ -447,7 +471,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
         if (preg_match("/^[0-9-PIM]*$/", $id) === false) {
             return false;
         }
-        if (is_numeric(substr($id, 3)) === false) {
+        if (self::_is_numeric(substr($id, 3)) === false) {
             return false;
         }
 
@@ -468,11 +492,11 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
             $id = strtr($id, ['https://www.npg.org.uk/collections/search/person/' => '']);
         }
 
-        if (substr($id, 0, 2) === 'mp' && is_numeric(substr($id, 2))) {
+        if (substr($id, 0, 2) === 'mp' && self::_is_numeric(substr($id, 2))) {
             return $id;
         }
 
-        if (filter_var($id, FILTER_VALIDATE_INT) === false) {
+        if (self::_is_numeric($id) === false) {
             return false;
         }
 
@@ -490,11 +514,14 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
     private static function validateWikidataId(string $id):string|false {
 
         if (filter_var($id, FILTER_VALIDATE_URL) !== false) {
-            $id = strtr($id, ['https://www.wikidata.org/wiki/' => '']);
+            $id = strtr($id, [
+                'https://www.wikidata.org/wiki/' => '',
+                'https://www.wikidata.org/w/index.php?search=&search=' => '',
+            ]);
         }
 
         if (substr($id, 0, 1) !== 'Q') {
-            throw new MDgenericInvalidInputsException("Wikidata IDs must be Q IDs - and start with that letter");
+            throw new MDInvalidNodaLinkException("Wikidata IDs must be Q IDs - and start with that letter (provided: $id)");
         }
 
         if (filter_var(substr($id, 1), FILTER_VALIDATE_INT) === false) {
@@ -537,10 +564,16 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
             $validation = strtr($id, [
                 'http://de.wikipedia.org/wiki/' => '',
                 'https://de.wikipedia.org/wiki/' => '',
+                'http://da.wikipedia.org/wiki/' => '',
+                'https://da.wikipedia.org/wiki/' => '',
                 'http://en.wikipedia.org/wiki/' => '',
                 'https://en.wikipedia.org/wiki/' => '',
+                'http://es.wikipedia.org/wiki/' => '',
+                'https://es.wikipedia.org/wiki/' => '',
                 'http://fr.wikipedia.org/wiki/' => '',
                 'https://fr.wikipedia.org/wiki/' => '',
+                'http://hu.wikipedia.org/wiki/' => '',
+                'https://hu.wikipedia.org/wiki/' => '',
                 'http://id.wikipedia.org/wiki/' => '',
                 'https://id.wikipedia.org/wiki/' => '',
                 'http://it.wikipedia.org/wiki/' => '',
@@ -590,7 +623,7 @@ enum MDNodaRepository implements MDValueEnumInterface, JsonSerializable {
             self::mbl => self::validateNumericId($id, ['http://www.uni-magdeburg.de/mbl/PHP_Skripte/mbl_verwaltung/mbl_verw_anzeige_biog.php?auswahl=3&liste_biog_name=']),
             self::mindatorg => self::validateNumericId($id, ['https://www.mindat.org/min-', '.html']),
             self::moebeltypologie => self::validateNumericId($id, ['https://term.museum-digital.de/moebel/tag/']),
-            self::ndb_adb => self::validateGndId($id, ['https://www.deutsche-biographie.de/pnd', '.html', '#adbcontent', '#ndbcontent']),
+            self::ndb_adb => self::validateGndId($id, ['https://www.deutsche-biographie.de/pnd', '.html', '#adbcontent', '#ndbcontent', '#indexcontent']),
             self::ndl => self::validateNumericId($id, [
                 'http://id.ndl.go.jp/auth/ndlna/',
                 'https://id.ndl.go.jp/auth/ndlna/',
diff --git a/tests/MDNodaRepositoryTest.php b/tests/MDNodaRepositoryTest.php
index f23c728..48c5727 100644
--- a/tests/MDNodaRepositoryTest.php
+++ b/tests/MDNodaRepositoryTest.php
@@ -28,6 +28,7 @@ final class MDNodaRepositoryTest extends TestCase {
         self::assertEquals("102423008", MDNodaRepository::gnd->validateId("https://d-nb.info/gnd/102423008"));
         self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/102423008"));
         self::assertEquals("102423008", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/ 102423008"));
+        self::assertEquals("1037602218", MDNodaRepository::gnd->validateId("http://d-nb.info/gnd/1037602218"));
         self::assertEquals("102423008", MDNodaRepository::gnd->validateId("102423008"));
 
         // NDL (Japan)