From cd49f194f2787905652dbcce853eddea217c367d Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Thu, 3 Oct 2024 15:56:31 +0200 Subject: [PATCH] Refactor wikidata fetcher --- src/NodaWikidataFetcher.php | 225 ++++++++++++++++++++++-------------- 1 file changed, 138 insertions(+), 87 deletions(-) diff --git a/src/NodaWikidataFetcher.php b/src/NodaWikidataFetcher.php index 4aee38e..d49bfcf 100644 --- a/src/NodaWikidataFetcher.php +++ b/src/NodaWikidataFetcher.php @@ -91,6 +91,89 @@ final class NodaWikidataFetcher { } + /** + * Returns Wikipedia links from Wikidata's API output. + * + * @param array $data Wikidata API output. + * + * @return array + */ + private static function _getWikipediaLinksFromWikidataOutput(array $data):array { + + $wikilinks = []; + + foreach (self::LANGUAGES_MAIN_DESC as $tLang) { + if (!isset($data['sitelinks'][$tLang . 'wiki']['url']) || !isset($data['sitelinks'][$tLang . 'wiki']['title'])) continue; + if (!is_string($data['sitelinks'][$tLang . 'wiki']['url']) || !is_string($data['sitelinks'][$tLang . 'wiki']['title'])) continue; + $wikilinks[$tLang] = [ + 'url' => $data['sitelinks'][$tLang . 'wiki']['url'], + 'title' => str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']), + ]; + } + + return $wikilinks; + + } + + /** + * Parses coordinates from Wikidata API output. + * + * @param array $data Wikidata API output. 
+ * + * @return array{}|array{longitude: float, latitude: float} + */ + private static function _getPlaceCoordinatesFromWikidata(array $data):array { + + if (!isset($data['claims']['P625'])) { + return []; + } + + $latitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'], FILTER_VALIDATE_FLOAT); + $longitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'], FILTER_VALIDATE_FLOAT); + + if ($latitude_wd === false || $longitude_wd === false) return []; + + return [ + 'longitude' => $longitude_wd, + 'latitude' => $latitude_wd, + ]; + + } + + /** + * Loads Wikipedia page and cleans output. + * + * @param string $lang Language to load in. + * @param string $title Title to load from. + * + * @return string + */ + private static function _getCleanedWikipediaSnippet(string $lang, string $title):string { + + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $title), 10000); + $datafromwiki = strval(json_decode($datafromwiki, true)['parse']['text']['*']); + + return self::_cleanWikidataInput($datafromwiki); + + } + + /** + * Loads data for a single entity from Wikidata. + * + * @param string $wikidata_id Wikidata Q-ID. + * + * @return array + */ + private static function _getWikidataEntity(string $wikidata_id):array { + + $data = json_decode(MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000), true); + if ($data === null) { + throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later."); + } + return $data['entities'][$wikidata_id]; + + } + /** * Parses wikidata results to MDNodaLink entries. 
* @@ -456,6 +539,17 @@ final class NodaWikidataFetcher { throw new MDExpectedException("Invalid URL"); } + if (strpos($linkUrl, "http://www.wikidata.org/entity/") !== false) { + if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) { + return $output; + } + } + if (strpos($linkUrl, "https://www.wikidata.org/entity/") !== false) { + if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) { + return $output; + } + } + if (strpos($linkUrl, "https://www.wikidata.org/wiki/") !== false) { if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) { return $output; @@ -481,7 +575,10 @@ final class NodaWikidataFetcher { */ public static function getWikidataIdFromWikidataLink(string $linkUrl):string { - if (strpos($linkUrl, "https://www.wikidata.org/wiki/") === false) { + if (str_contains($linkUrl, "https://www.wikidata.org/wiki/") === false + && str_contains($linkUrl, "https://www.wikidata.org/entity/") === false + && str_contains($linkUrl, "http://www.wikidata.org/entity/") === false + ) { return ''; } @@ -750,14 +847,13 @@ final class NodaWikidataFetcher { * @param integer $persinst_id Person ID. * @param string $wikidata_id Wikidata ID. * @param string $datafromwiki Data fetched from Wikipedia. - * @param string $wikilink Link to wikipedia entry. * @param string $preflang The user's currently used language. * @param string $lang Currently queried language. * @param string $erfasst_von User who adds the info. 
* * @return boolean */ - public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool { + public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool { $output = false; @@ -961,46 +1057,27 @@ final class NodaWikidataFetcher { public function retrievePersinstInfoFromWikidataID(string $lang, string $wikidata_id, int $persinst_id, string $erfasst_von) { self::validateWikidataId($wikidata_id); - - $data = json_decode(MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000), true); - if ($data === null) { - throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later."); - } - $data = $data['entities'][$wikidata_id]; + $data = self::_getWikidataEntity($wikidata_id); // Get links to wikipedia - $wikilink = $wikilinkterm = []; - - foreach (self::LANGUAGES_MAIN_DESC as $tLang) { - if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url']; - if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 
'wiki']['title']); - } + $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data); $alreadyEntered = false; - if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { - - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); - $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; - + if (isset($wikilinks[$lang])) { # Process data retrieved from wikipedia - if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von); + if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) { + $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, $lang, $erfasst_von); } } foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { - if ($alreadyEntered === true) break; - if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue; + if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue; - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); - $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; - - # Process data retrieved from wikipedia - if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { - $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$cur_lang], $lang, "$cur_lang", $erfasst_von); + if ($datafromwiki = self::_getCleanedWikipediaSnippet($cur_lang, $wikilinks[$cur_lang]['title'])) { + $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, "$cur_lang", $erfasst_von); } } @@ -1102,17 +1179,16 @@ final class NodaWikidataFetcher { /** * Function 
for entering base information about a place from wikidata. * - * @param string $cur_place_desc Mysqli result pointing to the current place. - * @param string $datafromwiki Data parsed from wikidata. - * @param array $wikilink Wikilink. - * @param string $preflang Language of the user interface in general. - * @param string $lang Language of the main entry. - * @param integer $placeID ID of the place. - * @param string $erfasst_von User name. + * @param string $cur_place_desc Mysqli result pointing to the current place. + * @param string $datafromwiki Data parsed from wikidata. + * @param string $preflang Language of the user interface in general. + * @param string $lang Language of the main entry. + * @param integer $placeID ID of the place. + * @param string $erfasst_von User name. * * @return boolean */ - public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, array $wikilink, string $preflang, string $lang, int $placeID, string $erfasst_von):bool { + public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, string $preflang, string $lang, int $placeID, string $erfasst_von):bool { $datafromwiki = '"' . $datafromwiki . '" - (Wikipedia (' . $lang . ') ' . date("d.m.Y") . ')'; @@ -1239,37 +1315,27 @@ final class NodaWikidataFetcher { public function retrievePlaceInfoFromWikidataID(string $lang, string $wikidata_id, int $onum, string $erfasst_von) { self::validateWikidataId($wikidata_id); + $data = self::_getWikidataEntity($wikidata_id); - $data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000); - if (!$data = json_decode($data, true)) { - throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later."); - } - $data = $data['entities'][$wikidata_id]; - - $wikilink = $wikilinkterm = []; - - foreach (self::LANGUAGES_MAIN_DESC as $tLang) { - if (isset($data['sitelinks'][$tLang . 
'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url']; - if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']); - } + $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data); // Get current description for overwriting - $cur_place_desc = $this->getPlaceDescription($onum); - $alreadyEntered = false; - // P131: Located in administrative unit if (isset($data['claims']['P131'])) { $this->retrieveSuperordinateAdministrativePlace($onum, $data['claims']['P131']); } - if (!empty($wikilink[$lang])) { + $cur_place_desc = $this->getPlaceDescription($onum); + $alreadyEntered = false; - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); + if (!empty($wikilinks[$lang])) { + + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $lang, $onum, $erfasst_von); + $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $lang, $onum, $erfasst_von); } } @@ -1277,12 +1343,12 @@ final class NodaWikidataFetcher { //if ($alreadyEntered === true) break; if ($alreadyEntered === true) break; - if (!isset($wikilink[$cur_lang])) continue; + if (!isset($wikilinks[$cur_lang]['url'])) continue; - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; if (!empty($datafromwiki) and !empty($datafromwiki = 
self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $cur_lang, $onum, $erfasst_von); + $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $cur_lang, $onum, $erfasst_von); } } @@ -1294,10 +1360,7 @@ final class NodaWikidataFetcher { NodaBatchInserter::linkNodaForPlace($this->_mysqli_noda, $onum, $nodaLinks, $erfasst_von); } - if (isset($data['claims']['P625'])) { - $latitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'], FILTER_VALIDATE_FLOAT); - $longitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'], FILTER_VALIDATE_FLOAT); - } + $coordinates_wd = self::_getPlaceCoordinatesFromWikidata($data); $this->_mysqli_noda->autocommit(false); if (!empty($tgn_id)) { @@ -1322,12 +1385,13 @@ final class NodaWikidataFetcher { unset($updateStmt); } - if (!empty($latitude_wd) and !empty($longitude_wd)) { + + if (!empty($coordinates_wd)) { $updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte` SET `ort_nord_sued` = ?, `ort_west_ost` = ? WHERE `ort_id` = ?"); - $updateStmt->bind_param("ddi", $latitude_wd, $longitude_wd, $onum); + $updateStmt->bind_param("ddi", $coordinates_wd['latitude'], $coordinates_wd['longitude'], $onum); $updateStmt->execute(); $updateStmt->close(); unset($updateStmt); @@ -1382,14 +1446,13 @@ final class NodaWikidataFetcher { * * @param integer $tag_id Tag ID. * @param string $datafromwiki Data fetched from Wikipedia. - * @param string $wikilink Link to wikipedia entry. * @param string $preflang The user's currently used language. * @param string $lang Currently queried language. * @param string $erfasst_von User who adds the info. 
* * @return boolean */ - public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool { + public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool { $output = false; @@ -1486,46 +1549,34 @@ final class NodaWikidataFetcher { public function retrieveTagInfoFromWikidataID(string $lang, string $wikidata_id, int $tag_id, string $erfasst_von) { self::validateWikidataId($wikidata_id); + $data = self::_getWikidataEntity($wikidata_id); - $data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000); - $data = json_decode($data, true); - if ($data === null) { - throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later."); - } - $data = $data['entities'][$wikidata_id]; - - $wikilink = $wikilinkterm = []; - - foreach (self::LANGUAGES_MAIN_DESC as $tLang) { - if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url']; - if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 
'wiki']['title']); - } + $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data); $alreadyEntered = false; - if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { + if (isset($wikilinks[$lang])) { - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von); + $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $lang, $erfasst_von); } } foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { - if ($alreadyEntered === true) break; - if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue; + if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue; - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { - $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$cur_lang], $lang, $cur_lang, $erfasst_von); + $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $cur_lang, $erfasst_von); } }