This repository has been archived on 2023-01-23. You can view files and clone it, but cannot push or open issues or pull requests.
importer-parsers-archive-2022/parsers/SRU-MODS.php
2023-01-23 15:21:35 +01:00

370 lines
17 KiB
PHP

<?PHP
/**
* Parser for MODS XML records retrieved via SRU (Kalliope).
*
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
* @author Stefan Rohde-Enslin <s.rohde-enslin@museum-digital.de>
* @link https://imports.museum-digital.org/importer/parsers/csvxml.php
*/
declare(strict_types = 1);
/**
 * Imports MODS records (as exported from Kalliope) into museum-digital.
 *
 * Each file in the XML folder is split into single records at the marker
 * "_____-----_____". For every record an MDObject is built (inventory number,
 * title, description, notes, actors / events, content language, collections,
 * source link, edit history) and written through MDObjectWriter. Every element
 * that has been handled is unset from the parsed record, so that whatever is
 * left over at the end reveals data the parser does not cover yet.
 *
 * Records whose inventory number already exists in the `objekt` table are
 * skipped entirely.
 *
 * @param array<mixed>     $version        Instance to import into.
 * @param integer          $institution_id Institution to import to.
 * @param non-empty-string $XMLFolder      Folder of the XML files to import.
 * @param string           $dataFolder     Data folder. Not used by this parser.
 * @param integer          $sammlung_id    Collection ID. Optional. Not used by this parser.
 * @param boolean          $visibility     Import objects to be directly visible?.
 * @param boolean          $insertOnly     If set to true, only new objects are added,
 *                                         old are not updated.
 *
 * @throws MDFileDoesNotExist If the folder to import from does not exist.
 * @throws MDParserIncomplete If a record contains elements this parser does not handle.
 * @throws Exception          If a record cannot be parsed as XML, or if an actor role,
 *                            a content language or a collection is unknown.
 *
 * @return void
 */
function parseImportXML(array $version, int $institution_id, string $XMLFolder, string $dataFolder = "", int $sammlung_id = 0, bool $visibility = false, bool $insertOnly = false): void {

    $importDir = MD_IMPORTER_CONF::$import_dir_xml . $XMLFolder;
    if (!is_dir($importDir)) {
        throw new MDFileDoesNotExist("The folder to import from ($XMLFolder) does not exist.");
    }

    // Set up writers.
    // NOTE(review): only $tagWriter and $objectWriter are used below. The other
    // writers are still constructed as before, in case their constructors have
    // side effects - confirm before removing them.
    $collectionWriter   = new MDCollectionWriter($version['mainDB']);
    $linkWriter         = new MDLinkWriter($version['mainDB']);
    $seriesWriter       = new MDSeriesWriter($version['mainDB']);
    $exhibitionWriter   = new MDExhibitionWriter($version['mainDB']);
    $objectRecordWriter = new MDObjectRecordWriter($version['mainDB']);
    $tagWriter          = new MDTagWriter($version['nodaDB']);

    $outputHandler = new MDOutputHandler();
    $outputHandler->setVerbosity(2);

    $objectWriter = new MDObjectWriter($version['mainDB'], $version['nodaDB'], $version['link'], $version['filepath'], $version['dataFolderLink']);

    // Switch off the import steps that are not wanted for this data source.
    // The flags for base data, additional data, collections, hyperlinks and
    // reception are deliberately left at their defaults.
    $objectWriter->disableImportImagesResources = true;
    $objectWriter->disableImportTranscriptions  = true;
    $objectWriter->disableImportSeries          = true;
    $objectWriter->disableImportObjectRecords   = true;
    $objectWriter->disableImportExhibitions     = true;
    $objectWriter->disableImportTags            = true;
    $objectWriter->disableImportLiterature      = true;
    $objectWriter->disableImportMarkings        = true;

    // Lookup table: flipped LANGUAGES_ISO639_2B, indexed by the language term
    // found in the record. Built once, as it does not change between files.
    $languages_iso639 = array_flip(MDLanguagesSet::LANGUAGES_ISO639_2B);

    // Record identifiers of related "constituent" items => collection ID.
    $collectionIdsByRecordIdentifier = [
        "ead_DE-F25_37_VirtuellerBestand"       => 5,
        "ead_DE-F25_37_VirtuellerBestand_added" => 5,
        "DE-611-BF-9624"                        => 521,
        "DE-611-BF-9626"                        => 534,
        "DE-611-BF-9627"                        => 543,
        "DE-611-BF-9629"                        => 541,
        "DE-611-BF-9630"                        => 540,
        "DE-611-BF-9621"                        => 539,
        "DE-611-BF-9623"                        => 538,
        "DE-611-BF-9625"                        => 537,
        "DE-611-BF-9622"                        => 536,
        "DE-611-BF-37593"                       => 545,
        "DE-611-BF-9636"                        => 544,
    ];

    // Returns the part of a URL / path after its last slash. If there is no
    // slash, the whole string is returned: strrpos() yields false in that
    // case, and "false + 1" would otherwise cut off the first character.
    $lastPathSegment = static function (string $path): string {
        $slashPos = strrpos($path, '/');
        return $slashPos === false ? $path : substr($path, $slashPos + 1);
    };

    $i = 0;
    // Raise this to resume an import: records counted below it are skipped.
    $startAtCounter = 0;

    foreach (MD_STD::scandir($importDir) as $xmlFile) {

        $fileContents = MD_STD::file_get_contents("{$importDir}/{$xmlFile}");
        $allRecords = explode('_____-----_____', $fileContents);
        $fileContents = null; // Drop the raw file contents, only the split records are needed

        foreach ($allRecords as $recordStr) {

            ++$i;
            if ($i < $startAtCounter) {
                continue;
            }

            $objectData = simplexml_load_string($recordStr, "SimpleXMLElement", LIBXML_NOCDATA);
            if (!$objectData) {
                throw new Exception("Cannot load raw data into SimpleXML ({$recordStr})");
            }

            // Determine the inventory number: prefer a shelf locator of the
            // form "Hs-1234" / "Hs-12345" (or the bare 4-5 digit number, which
            // gets the "Hs-" prefix); fall back to the last segment of the
            // record identifier.
            $shelfLocator = (string)$objectData->location->shelfLocator;
            $invNo = '';
            if (\preg_match("/^Hs\-[0-9]{4,5}$/", $shelfLocator) === 1) {
                $invNo = $shelfLocator;
                $outputHandler->toLog("Using inventory number $invNo (Hs- set) - " . $objectData->identifier, 2);
            }
            else if (\preg_match("/^[0-9]{4,5}$/", $shelfLocator) === 1) {
                $invNo = 'Hs-' . $shelfLocator;
                $outputHandler->toLog("Using inventory number $invNo", 2);
            }
            if ($invNo === '') {
                $invNo = $lastPathSegment((string)$objectData->identifier);
            }

            $description = $objectData->abstract . PHP_EOL . PHP_EOL . $objectData->physicalDescription;

            $title = (string)$objectData->titleInfo->title;
            $subTitle = (string)$objectData->titleInfo->subTitle;
            if (!empty($subTitle)) {
                $title .= ': ' . $subTitle;
                unset($objectData->titleInfo->subTitle);
            }

            // Check if inventory number is known already; known objects are skipped.
            $result = $version['mainDB']->query_by_stmt("SELECT 1\nFROM `objekt`\nWHERE `objekt_inventarnr` = ?", "s", $invNo);
            $alreadyKnown = ($result->num_rows !== 0);
            $result->close();
            $result = null;
            if ($alreadyKnown) {
                continue;
            }

            $object = new MDObject($version['mainDB'], $version['nodaDB'], $version['language'], $institution_id, $invNo, (string)$objectData->genre, $title, $description, $outputHandler);

            // Extent: short values ending in " Br." go to the field
            // "stueckzahl" (without the suffix); longer " Br." values are
            // dropped; anything else is appended to the description.
            if (!empty($objectData->physicalDescription->extent)) {
                $extent = (string)$objectData->physicalDescription->extent;
                if (str_ends_with($extent, " Br.")) {
                    if (strlen($extent) < 8) {
                        $object->set_string("stueckzahl", str_replace(" Br.", "", $extent));
                    }
                }
                else {
                    $object->append_objekt_beschreibung($extent);
                }
            }

            unset($objectData->titleInfo->title, $objectData->abstract, $objectData->physicalDescription);
            if (count($objectData->titleInfo->children()) === 0) {
                unset($objectData->titleInfo);
            }

            // The genre doubles as a tag.
            $object->appendTagByName((string)$objectData->genre, "", $tagWriter);
            unset($objectData->genre);

            // Notes
            if (count($objectData->note) > 1) {
                foreach ($objectData->note as $note) {
                    $object->append_string("notizen_text1", PHP_EOL . (string)$note);
                }
            }
            else if (!empty((string)$objectData->note)) {
                $object->set_string("notizen_text1", (string)$objectData->note);
            }
            unset($objectData->note);

            // Location
            unset($objectData->location->physicalLocation);
            if (!empty($shelfLocator)) {
                $object->append_string("standort_eigentlich", $shelfLocator);
            }
            unset($objectData->location->shelfLocator);
            unset($objectData->location);

            // Actors: each named actor becomes an event of the type that
            // belongs to the actor's role.
            foreach ($objectData->name as $actor) {

                // Prefer the second role term, fall back to the (first) role term.
                $linkTypeName = '';
                if (!empty((string)$actor->role->roleTerm[1])) {
                    $linkTypeName = (string)$actor->role->roleTerm[1];
                }
                else if (!empty((string)$actor->role->roleTerm)) {
                    $linkTypeName = (string)$actor->role->roleTerm;
                }

                if (!isset(MDConcActor::ACTOR_ROLES_TO_EVENT_TYPE[$linkTypeName])) {
                    throw new Exception("Unknown actor type: " . $linkTypeName . ' in ' . (string)$objectData->identifier . ' for ' . (string)$actor->namePart . ' ///// ' . PHP_EOL . PHP_EOL . var_export($actor->role->roleTerm, true));
                }
                $linkType = MDConcActor::ACTOR_ROLES_TO_EVENT_TYPE[$linkTypeName];

                // The last segment of the valueURI is passed on as the actor's GND ID.
                $gndId = $lastPathSegment((string)$actor->attributes()->valueURI);

                $md_event = new MDEvent($version['mainDB'], $version['nodaDB'], $version['language'], $linkType, $outputHandler);
                $md_event->set_persinst_id((string)$actor->namePart, "", "", ['gnd' => $gndId]);

                // Events without a resolved actor are not added.
                if ($md_event->get_persinst_id() === 0) {
                    continue;
                }

                // Handle origin information for production events. Once used,
                // date and place are unset, so only the first matching
                // production event receives them.
                if (!empty($objectData->originInfo)) {

                    $isProduction = in_array($linkType, MDEventsSet::EVENTS_PRODUCTION, true);

                    if ($isProduction && !empty($objectData->originInfo->dateCreated)) {
                        $dateCreated = (string)$objectData->originInfo->dateCreated[1];
                        if (str_starts_with($dateCreated, 'o.D. [')) {
                            // "o.D. [<date>]": keep only the bracketed part
                            $md_event->set_zeiten_id(trim(substr($dateCreated, 5), " []"));
                        }
                        else if (str_contains($dateCreated, "[")
                            && strlen($dateCreated) === 23
                            && substr($dateCreated, 0, 10) === substr($dateCreated, 12, 10)
                        ) {
                            // 23 characters holding the same 10-character date
                            // twice (second one from offset 12): use it once
                            $md_event->set_zeiten_id(substr($dateCreated, 0, 10));
                        }
                        else {
                            $md_event->set_zeiten_id($dateCreated);
                        }
                        unset($objectData->originInfo->dateCreated);
                    }

                    if ($isProduction && !empty($objectData->originInfo->place)) {
                        $md_event->set_orte_id((string)$objectData->originInfo->place->placeTerm);
                        unset($objectData->originInfo->place->placeTerm);
                        if (empty($objectData->originInfo->place->children())) {
                            unset($objectData->originInfo->place);
                        }
                    }

                    if (empty($objectData->originInfo->children())) {
                        unset($objectData->originInfo);
                    }
                }

                $object->appendEvent($md_event);
            }
            unset($objectData->name);

            // If originInfo is still set here, no actor event has consumed
            // it: attach it to a separate event of type 1.
            // NOTE(review): unlike above, the date is passed on without the
            // bracket clean-up - confirm whether that is intended.
            if (!empty($objectData->originInfo)) {

                $md_event = new MDEvent($version['mainDB'], $version['nodaDB'], $version['language'], 1, $outputHandler);

                if (!empty($objectData->originInfo->dateCreated)) {
                    $md_event->set_zeiten_id((string)$objectData->originInfo->dateCreated[1]);
                    unset($objectData->originInfo->dateCreated);
                }
                if (!empty($objectData->originInfo->place)) {
                    $md_event->set_orte_id((string)$objectData->originInfo->place->placeTerm);
                    unset($objectData->originInfo->place->placeTerm);
                    if (empty($objectData->originInfo->place->children())) {
                        unset($objectData->originInfo->place);
                    }
                }
                if (empty($objectData->originInfo->children())) {
                    unset($objectData->originInfo);
                }

                if ($md_event->get_zeiten_id() !== 0 || $md_event->get_orte_id() !== 0) {
                    $object->appendEvent($md_event);
                }
            }

            // Content language
            if (!empty($objectData->language)) {
                $languageTerm = (string)$objectData->language->languageTerm[0];
                // Fail with a clear message instead of reading an undefined
                // array key and passing null on.
                if (!isset($languages_iso639[$languageTerm])) {
                    throw new Exception("Unknown language: " . $languageTerm . ' in ' . (string)$objectData->identifier);
                }
                $object->set_string("content_language", $languages_iso639[$languageTerm]);
                $object->set_bool("content_language_show_md", true);
                $object->set_bool("content_language_show_extern", true);
                unset($objectData->language);
            }

            // Collections: related items of type "constituent" are mapped to
            // collections by their record identifier.
            if (!empty($objectData->relatedItem)) {
                foreach ($objectData->relatedItem as $relatedItem) {

                    if ((string)$relatedItem->attributes()->type !== 'constituent') {
                        continue;
                    }

                    $recordIdentifier = (string)$relatedItem->recordInfo->recordIdentifier;
                    if (isset($collectionIdsByRecordIdentifier[$recordIdentifier])) {
                        $object->appendCollectionByID($collectionIdsByRecordIdentifier[$recordIdentifier]);
                        continue;
                    }

                    // Unmapped items titled "Nachlaß ..." / "Teilnachlaß ..." are ignored.
                    $relatedTitle = (string)$relatedItem->titleInfo->title;
                    if (str_starts_with($relatedTitle, 'Nachlaß')
                        || str_starts_with($relatedTitle, 'Teilnachlaß')
                    ) {
                        continue;
                    }

                    throw new Exception("Unknown collection: " . var_export($relatedItem, true));
                }
            }
            unset($objectData->relatedItem);

            unset($objectData->recordInfo->recordContentSource);

            // Link back to the source record
            $object->appendLinkByName((string)$objectData->identifier, "Das Objekt bei Kalliope");
            unset($objectData->identifier);
            unset($objectData->recordInfo->recordIdentifier);

            // Import edit history at kalliope
            $editHistory = ["Ersterfassung bei Kalliope: " . $objectData->recordInfo->recordCreationDate];
            foreach ($objectData->recordInfo->recordChangeDate as $changeDate) {
                $editHistory[] = "Bearbeitung bei Kalliope: " . (string)$changeDate;
            }
            $object->append_string("notizen_text1", PHP_EOL . PHP_EOL . implode(PHP_EOL, $editHistory));
            unset($objectData->recordInfo->recordCreationDate, $objectData->recordInfo->recordChangeDate);

            if (count($objectData->recordInfo->children()) === 0) {
                unset($objectData->recordInfo);
            }
            if (count($objectData->children()) === 0) {
                unset($objectData);
            }

            $object->set_objekt_publik($visibility);

            // The object is written before the completeness check below, so a
            // record with unhandled elements is stored and then aborts the run.
            $objectWriter->writeObject($object, true, $insertOnly, $outputHandler);

            // Anything left in the record has not been handled by this parser.
            if (!empty($objectData)) {
                throw new MDParserIncomplete("Incomplete parser: " . var_export($objectData, true));
            }

            $outputHandler->toLog("Done with object $i", 2);

            // Throttle: pause for the configured per-object delay
            usleep(IMPORTER_DELAY_PER_OBJECT);
        }
    }
}