Refactor wikidata fetcher

This commit is contained in:
Joshua Ramon Enslin 2024-10-03 15:56:31 +02:00
parent 9b63a4d95d
commit cd49f194f2
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE

View File

@ -91,6 +91,89 @@ final class NodaWikidataFetcher {
} }
/**
 * Extracts Wikipedia sitelinks from a Wikidata entity record.
 *
 * @param array<mixed> $data Wikidata API output.
 *
 * @return array<string, array{url: string, title: string}> Links keyed by language code.
 */
private static function _getWikipediaLinksFromWikidataOutput(array $data):array {

    $links = [];
    foreach (self::LANGUAGES_MAIN_DESC as $curLang) {
        $sitelink = $data['sitelinks'][$curLang . 'wiki'] ?? null;
        if (!is_array($sitelink)) continue;
        if (!isset($sitelink['url']) || !is_string($sitelink['url'])) continue;
        if (!isset($sitelink['title']) || !is_string($sitelink['title'])) continue;
        // Wikipedia page titles use underscores instead of spaces in URLs.
        $links[$curLang] = [
            'url' => $sitelink['url'],
            'title' => str_replace(' ', '_', $sitelink['title']),
        ];
    }
    return $links;

}
/**
 * Parses coordinates (property P625) from Wikidata API output.
 *
 * @param array<mixed> $data Wikidata API output.
 *
 * @return array{}|array{longitude: float, latitude: float} Empty array if no valid coordinates are present.
 */
private static function _getPlaceCoordinatesFromWikidata(array $data):array {

    // Guard against partially filled claims: a P625 claim may exist
    // without a datavalue (e.g. "no value" / "unknown value" snaks),
    // which would otherwise trigger undefined-index warnings.
    if (!isset($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'])
        || !isset($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'])
    ) {
        return [];
    }

    $coordinates = $data['claims']['P625'][0]['mainsnak']['datavalue']['value'];
    $latitude_wd = \filter_var($coordinates['latitude'], FILTER_VALIDATE_FLOAT);
    $longitude_wd = \filter_var($coordinates['longitude'], FILTER_VALIDATE_FLOAT);

    // filter_var() returns false for non-numeric input.
    if ($latitude_wd === false || $longitude_wd === false) return [];

    return [
        'longitude' => $longitude_wd,
        'latitude' => $latitude_wd,
    ];

}
/**
 * Loads a Wikipedia page via the Wikipedia API and cleans its output.
 *
 * @param string $lang  Language to load in.
 * @param string $title Title to load from.
 *
 * @return string Cleaned snippet; empty string if the page could not be parsed.
 */
private static function _getCleanedWikipediaSnippet(string $lang, string $title):string {

    $response = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $title), 10000);
    $decoded = json_decode($response, true);

    // Guard against invalid JSON or an unexpected response structure
    // (e.g. an API error object without the parse->text->* path); the
    // previous unconditional array access emitted warnings in that case.
    if (!is_array($decoded) || !isset($decoded['parse']['text']['*'])) {
        return '';
    }

    return self::_cleanWikidataInput(strval($decoded['parse']['text']['*']));

}
/**
 * Loads the entity record for a single Q-ID from Wikidata.
 *
 * @param string $wikidata_id Wikidata Q-ID.
 *
 * @return array<mixed> The entity record as returned by Special:EntityData.
 *
 * @throws MDhttpFailedException If the fetch fails or the response does not contain the requested entity.
 */
private static function _getWikidataEntity(string $wikidata_id):array {

    $response = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000);
    $data = json_decode($response, true);
    if ($data === null) {
        throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
    }

    // Special:EntityData keys the result by entity ID. For redirected
    // entities the key may be the redirect target rather than the
    // requested ID, so its absence must be handled explicitly instead
    // of failing with a TypeError on the array return type.
    if (!isset($data['entities'][$wikidata_id]) || !is_array($data['entities'][$wikidata_id])) {
        throw new MDhttpFailedException("Failed fetching entity " . $wikidata_id . " from Wikidata. Try again later.");
    }

    return $data['entities'][$wikidata_id];

}
/** /**
* Parses wikidata results to MDNodaLink entries. * Parses wikidata results to MDNodaLink entries.
* *
@ -456,6 +539,17 @@ final class NodaWikidataFetcher {
throw new MDExpectedException("Invalid URL"); throw new MDExpectedException("Invalid URL");
} }
if (strpos($linkUrl, "http://www.wikidata.org/entity/") !== false) {
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
return $output;
}
}
if (strpos($linkUrl, "https://www.wikidata.org/entity/") !== false) {
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
return $output;
}
}
if (strpos($linkUrl, "https://www.wikidata.org/wiki/") !== false) { if (strpos($linkUrl, "https://www.wikidata.org/wiki/") !== false) {
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) { if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
return $output; return $output;
@ -481,7 +575,10 @@ final class NodaWikidataFetcher {
*/ */
public static function getWikidataIdFromWikidataLink(string $linkUrl):string { public static function getWikidataIdFromWikidataLink(string $linkUrl):string {
if (strpos($linkUrl, "https://www.wikidata.org/wiki/") === false) { if (str_contains($linkUrl, "https://www.wikidata.org/wiki/") === false
&& str_contains($linkUrl, "https://www.wikidata.org/entity/") === false
&& str_contains($linkUrl, "http://www.wikidata.org/entity/") === false
) {
return ''; return '';
} }
@ -750,14 +847,13 @@ final class NodaWikidataFetcher {
* @param integer $persinst_id Person ID. * @param integer $persinst_id Person ID.
* @param string $wikidata_id Wikidata ID. * @param string $wikidata_id Wikidata ID.
* @param string $datafromwiki Data fetched from Wikipedia. * @param string $datafromwiki Data fetched from Wikipedia.
* @param string $wikilink Link to wikipedia entry.
* @param string $preflang The user's currently used language. * @param string $preflang The user's currently used language.
* @param string $lang Currently queried language. * @param string $lang Currently queried language.
* @param string $erfasst_von User who adds the info. * @param string $erfasst_von User who adds the info.
* *
* @return boolean * @return boolean
*/ */
public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool { public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool {
$output = false; $output = false;
@ -961,46 +1057,27 @@ final class NodaWikidataFetcher {
public function retrievePersinstInfoFromWikidataID(string $lang, string $wikidata_id, int $persinst_id, string $erfasst_von) { public function retrievePersinstInfoFromWikidataID(string $lang, string $wikidata_id, int $persinst_id, string $erfasst_von) {
self::validateWikidataId($wikidata_id); self::validateWikidataId($wikidata_id);
$data = self::_getWikidataEntity($wikidata_id);
$data = json_decode(MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000), true);
if ($data === null) {
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
}
$data = $data['entities'][$wikidata_id];
// Get links to wikipedia // Get links to wikipedia
$wikilink = $wikilinkterm = [];
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
}
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
$alreadyEntered = false; $alreadyEntered = false;
if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { if (isset($wikilinks[$lang])) {
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia # Process data retrieved from wikipedia
if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) {
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von); $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, $lang, $erfasst_von);
} }
} }
foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
if ($alreadyEntered === true) break; if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); if ($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$cur_lang]['title'])) {
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, "$cur_lang", $erfasst_von);
# Process data retrieved from wikipedia
if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$cur_lang], $lang, "$cur_lang", $erfasst_von);
} }
} }
@ -1104,7 +1181,6 @@ final class NodaWikidataFetcher {
* *
* @param string $cur_place_desc Mysqli result pointing to the current place. * @param string $cur_place_desc Mysqli result pointing to the current place.
* @param string $datafromwiki Data parsed from wikidata. * @param string $datafromwiki Data parsed from wikidata.
* @param array<mixed> $wikilink Wikilink.
* @param string $preflang Language of the user interface in general. * @param string $preflang Language of the user interface in general.
* @param string $lang Language of the main entry. * @param string $lang Language of the main entry.
* @param integer $placeID ID of the place. * @param integer $placeID ID of the place.
@ -1112,7 +1188,7 @@ final class NodaWikidataFetcher {
* *
* @return boolean * @return boolean
*/ */
public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, array $wikilink, string $preflang, string $lang, int $placeID, string $erfasst_von):bool { public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, string $preflang, string $lang, int $placeID, string $erfasst_von):bool {
$datafromwiki = '"' . $datafromwiki . '" - (Wikipedia (' . $lang . ') ' . date("d.m.Y") . ')'; $datafromwiki = '"' . $datafromwiki . '" - (Wikipedia (' . $lang . ') ' . date("d.m.Y") . ')';
@ -1239,37 +1315,27 @@ final class NodaWikidataFetcher {
public function retrievePlaceInfoFromWikidataID(string $lang, string $wikidata_id, int $onum, string $erfasst_von) { public function retrievePlaceInfoFromWikidataID(string $lang, string $wikidata_id, int $onum, string $erfasst_von) {
self::validateWikidataId($wikidata_id); self::validateWikidataId($wikidata_id);
$data = self::_getWikidataEntity($wikidata_id);
$data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000); $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
if (!$data = json_decode($data, true)) {
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
}
$data = $data['entities'][$wikidata_id];
$wikilink = $wikilinkterm = [];
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
}
// Get current description for overwriting // Get current description for overwriting
$cur_place_desc = $this->getPlaceDescription($onum);
$alreadyEntered = false;
// P131: Located in administrative unit // P131: Located in administrative unit
if (isset($data['claims']['P131'])) { if (isset($data['claims']['P131'])) {
$this->retrieveSuperordinateAdministrativePlace($onum, $data['claims']['P131']); $this->retrieveSuperordinateAdministrativePlace($onum, $data['claims']['P131']);
} }
if (!empty($wikilink[$lang])) { $cur_place_desc = $this->getPlaceDescription($onum);
$alreadyEntered = false;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); if (!empty($wikilinks[$lang])) {
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $lang, $onum, $erfasst_von); $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $lang, $onum, $erfasst_von);
} }
} }
@ -1277,12 +1343,12 @@ final class NodaWikidataFetcher {
//if ($alreadyEntered === true) break; //if ($alreadyEntered === true) break;
if ($alreadyEntered === true) break; if ($alreadyEntered === true) break;
if (!isset($wikilink[$cur_lang])) continue; if (!isset($wikilinks[$cur_lang]['url'])) continue;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $cur_lang, $onum, $erfasst_von); $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $cur_lang, $onum, $erfasst_von);
} }
} }
@ -1294,10 +1360,7 @@ final class NodaWikidataFetcher {
NodaBatchInserter::linkNodaForPlace($this->_mysqli_noda, $onum, $nodaLinks, $erfasst_von); NodaBatchInserter::linkNodaForPlace($this->_mysqli_noda, $onum, $nodaLinks, $erfasst_von);
} }
if (isset($data['claims']['P625'])) { $coordinates_wd = self::_getPlaceCoordinatesFromWikidata($data);
$latitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'], FILTER_VALIDATE_FLOAT);
$longitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'], FILTER_VALIDATE_FLOAT);
}
$this->_mysqli_noda->autocommit(false); $this->_mysqli_noda->autocommit(false);
if (!empty($tgn_id)) { if (!empty($tgn_id)) {
@ -1322,12 +1385,13 @@ final class NodaWikidataFetcher {
unset($updateStmt); unset($updateStmt);
} }
if (!empty($latitude_wd) and !empty($longitude_wd)) {
if (!empty($coordinates_wd)) {
$updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte` $updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte`
SET `ort_nord_sued` = ?, `ort_west_ost` = ? SET `ort_nord_sued` = ?, `ort_west_ost` = ?
WHERE `ort_id` = ?"); WHERE `ort_id` = ?");
$updateStmt->bind_param("ddi", $latitude_wd, $longitude_wd, $onum); $updateStmt->bind_param("ddi", $coordinates_wd['latitude'], $coordinates_wd['longitude'], $onum);
$updateStmt->execute(); $updateStmt->execute();
$updateStmt->close(); $updateStmt->close();
unset($updateStmt); unset($updateStmt);
@ -1382,14 +1446,13 @@ final class NodaWikidataFetcher {
* *
* @param integer $tag_id Tag ID. * @param integer $tag_id Tag ID.
* @param string $datafromwiki Data fetched from Wikipedia. * @param string $datafromwiki Data fetched from Wikipedia.
* @param string $wikilink Link to wikipedia entry.
* @param string $preflang The user's currently used language. * @param string $preflang The user's currently used language.
* @param string $lang Currently queried language. * @param string $lang Currently queried language.
* @param string $erfasst_von User who adds the info. * @param string $erfasst_von User who adds the info.
* *
* @return boolean * @return boolean
*/ */
public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool { public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool {
$output = false; $output = false;
@ -1486,46 +1549,34 @@ final class NodaWikidataFetcher {
public function retrieveTagInfoFromWikidataID(string $lang, string $wikidata_id, int $tag_id, string $erfasst_von) { public function retrieveTagInfoFromWikidataID(string $lang, string $wikidata_id, int $tag_id, string $erfasst_von) {
self::validateWikidataId($wikidata_id); self::validateWikidataId($wikidata_id);
$data = self::_getWikidataEntity($wikidata_id);
$data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000); $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
$data = json_decode($data, true);
if ($data === null) {
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
}
$data = $data['entities'][$wikidata_id];
$wikilink = $wikilinkterm = [];
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
}
$alreadyEntered = false; $alreadyEntered = false;
if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { if (isset($wikilinks[$lang])) {
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia # Process data retrieved from wikipedia
if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von); $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $lang, $erfasst_von);
} }
} }
foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
if ($alreadyEntered === true) break; if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia # Process data retrieved from wikipedia
if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$cur_lang], $lang, $cur_lang, $erfasst_von); $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $cur_lang, $erfasst_von);
} }
} }