Improve parsing of LOC / LCSH from Wikidata

This commit is contained in:
Joshua Ramon Enslin 2023-09-29 16:20:53 +02:00
parent 0a18449e06
commit 9942c58b12
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE

View File

@ -22,27 +22,6 @@ final class NodaWikidataFetcher {
const LANGUAGES_TO_CAPITALIZE = ["cs", "da", "de", "en", "es", "fr", "fi", "id", "it", "nl", "pl", "pt", "ru", "sv", "tl", "tr"];
const URL_PREFIXES_PLACES_NODA_SOURCE = [
"gnd" => "https://d-nb.info/gnd/",
"nomisma" => "http://nomisma.org/id/",
"osm" => "https://www.openstreetmap.org/relation/",
"loc" => "http://id.loc.gov/authorities/names/",
"cona" => "http://vocab.getty.edu/page/cona/",
"aat" => "http://vocab.getty.edu/page/aat/",
"iconclass" => "http://iconclass.org/rkd/",
"lcsh" => "http://id.loc.gov/authorities/subjects/",
"wikidata" => "https://www.wikidata.org/wiki/",
"bne" => "http://datos.bne.es/persona/",
"viaf" => "https://viaf.org/viaf/",
"bnf" => "https://catalogue.bnf.fr/ark:/12148/cb",
"ulan" => "http://vocab.getty.edu/page/ulan/",
"rkd" => "https://rkd.nl/explore/artists/",
"pim" => "https://opac-nevter.pim.hu/en/record/-/record/",
"ndl" => "https://id.ndl.go.jp/auth/ndlna/",
"npg" => "https://www.npg.org.uk/collections/search/person/",
"orcid" => "https://orcid.org/",
];
const P_IDS_NODA_TAGS = [
'gnd' => 'P227',
'lcsh' => 'P244',
@ -112,6 +91,40 @@ final class NodaWikidataFetcher {
}
/**
 * Decides which museum-digital repository a Library of Congress
 * identifier taken from Wikidata belongs to.
 *
 * Wikidata only knows one type of link to the LOC authority files,
 * whereas museum-digital distinguishes two ('loc' and 'lcsh'). The
 * identifier is handed to each repository's validateId() in turn,
 * 'loc' first; the first repository whose validation neither throws
 * nor returns false determines the result.
 *
 * @param string $url LOC ID to check.
 *
 * @return 'loc'|'lcsh'|'' Name of the matching repository, or an empty
 *                         string if neither validation succeeds.
 */
private function _determineLocRefMode(string $url):string {

    // Order matters: 'loc' is tried before 'lcsh'.
    $candidates = [
        'loc'  => MDNodaRepository::loc,
        'lcsh' => MDNodaRepository::lcsh,
    ];

    foreach ($candidates as $mode => $repository) {
        try {
            $validated = $repository->validateId($url);
        }
        catch (MDgenericInvalidInputsException $e) {
            // Validation failed for this repository; move on to the next candidate.
            continue;
        }

        if ($validated !== false) {
            return $mode;
        }
    }

    return '';

}
/**
* Cleans basic tags off Wikidata input.
*
@ -959,11 +972,19 @@ final class NodaWikidataFetcher {
new MDNodaLink(MDNodaRepository::wikidata, $wikidata_id)
];
foreach (self::P_IDS_NODA_TAGS as $vocabName => $pId) {
if (!in_array($vocabName, MDNodaRepositoriesSet::REPOSITORIES_ACTOR, true)) continue;
if ($vocabName === 'lcsh') continue;
if (isset($data['claims'][$pId])) {
if (empty($data['claims'][$pId][0]['mainsnak']['datavalue'])) continue;
$nodaLinks[] = new MDNodaLink(MDNodaRepository::fromString($vocabName), $data['claims'][$pId][0]['mainsnak']['datavalue']['value']);
$url = $data['claims'][$pId][0]['mainsnak']['datavalue']['value'];
if ($vocabName === 'loc' || ($vocabName === 'lcsh')) {
$vocabName = $this->_determineLocRefMode($url);
if (empty($vocabName)) continue;
}
if (!in_array($vocabName, MDNodaRepositoriesSet::REPOSITORIES_ACTOR, true)) continue;
$nodaLinks[] = new MDNodaLink(MDNodaRepository::fromString($vocabName), $url);
}
}
@ -1258,11 +1279,18 @@ final class NodaWikidataFetcher {
new MDNodaLink(MDNodaRepository::wikidata, $wikidata_id)
];
foreach (self::P_IDS_NODA_TAGS as $vocabName => $pId) {
if (!in_array($vocabName, MDNodaRepositoriesSet::REPOSITORIES_PLACE, true)) continue;
if ($vocabName === 'lcsh') continue;
if (isset($data['claims'][$pId])) {
if (empty($data['claims'][$pId][0]['mainsnak']['datavalue'])) continue;
$nodaLinks[] = new MDNodaLink(MDNodaRepository::fromString($vocabName), $data['claims'][$pId][0]['mainsnak']['datavalue']['value']);
$url = $data['claims'][$pId][0]['mainsnak']['datavalue']['value'];
if ($vocabName === 'loc' || ($vocabName === 'lcsh')) {
$vocabName = $this->_determineLocRefMode($url);
if (empty($vocabName)) continue;
}
if (!in_array($vocabName, MDNodaRepositoriesSet::REPOSITORIES_PLACE, true)) continue;
$nodaLinks[] = new MDNodaLink(MDNodaRepository::fromString($vocabName), $url);
}
}
@ -1274,7 +1302,7 @@ final class NodaWikidataFetcher {
NodaBatchInserter::linkNodaForPlace($this->_mysqli_noda, $onum, $nodaLinks, $erfasst_von);
$this->_mysqli_noda->autocommit(false);
if (!empty($tgn_id) and is_numeric($tgn_id)) {
if (!empty($tgn_id)) {
$updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte`
SET `ort_land` = ?
@ -1285,7 +1313,7 @@ final class NodaWikidataFetcher {
unset($updateStmt);
}
if (!empty($geonames_id) and is_numeric($geonames_id)) {
if (!empty($geonames_id)) {
$updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte`
SET `ort_geonames` = ?
@ -1512,11 +1540,19 @@ final class NodaWikidataFetcher {
new MDNodaLink(MDNodaRepository::wikidata, $wikidata_id)
];
foreach (self::P_IDS_NODA_TAGS as $vocabName => $pId) {
if (!in_array($vocabName, MDNodaRepositoriesSet::REPOSITORIES_TAG, true)) continue;
if ($vocabName === 'loc') continue;
if (isset($data['claims'][$pId])) {
if (empty($data['claims'][$pId][0]['mainsnak']['datavalue'])) continue;
$nodaLinks[] = new MDNodaLink(MDNodaRepository::fromString($vocabName), $data['claims'][$pId][0]['mainsnak']['datavalue']['value']);
$url = $data['claims'][$pId][0]['mainsnak']['datavalue']['value'];
if ($vocabName === 'loc' || ($vocabName === 'lcsh')) {
$vocabName = $this->_determineLocRefMode($url);
if (empty($vocabName)) continue;
}
if (!in_array($vocabName, MDNodaRepositoriesSet::REPOSITORIES_TAG, true)) continue;
$nodaLinks[] = new MDNodaLink(MDNodaRepository::fromString($vocabName), $url);
}
}