Refactor wikidata fetcher

This commit is contained in:
Joshua Ramon Enslin 2024-10-03 15:56:31 +02:00
parent 9b63a4d95d
commit cd49f194f2
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE

View File

@ -91,6 +91,89 @@ final class NodaWikidataFetcher {
}
/**
 * Collects Wikipedia sitelinks (URL plus underscore-normalised title) from a
 * Wikidata entity record, keyed by language code.
 *
 * @param array<mixed> $data Wikidata API output.
 *
 * @return array<string, array{url: string, title: string}>
 */
private static function _getWikipediaLinksFromWikidataOutput(array $data):array {

    $result = [];
    foreach (self::LANGUAGES_MAIN_DESC as $langCode) {

        $sitelink = $data['sitelinks'][$langCode . 'wiki'] ?? null;
        if (!is_array($sitelink)) continue;

        $url = $sitelink['url'] ?? null;
        $title = $sitelink['title'] ?? null;
        if (!is_string($url) || !is_string($title)) continue;

        // Wikipedia page names use underscores instead of spaces.
        $result[$langCode] = [
            'url' => $url,
            'title' => str_replace(' ', '_', $title),
        ];

    }
    return $result;

}
/**
 * Parses coordinates from Wikidata API output (property P625: coordinate
 * location).
 *
 * @param array<mixed> $data Wikidata API output.
 *
 * @return array{}|array{longitude: float, latitude: float}
 */
private static function _getPlaceCoordinatesFromWikidata(array $data):array {

    // Guard against partially populated claims: dereferencing a missing
    // nested offset would raise warnings before filter_var() ever runs.
    $value = $data['claims']['P625'][0]['mainsnak']['datavalue']['value'] ?? null;
    if (!is_array($value) || !isset($value['latitude']) || !isset($value['longitude'])) {
        return [];
    }

    $latitude_wd = \filter_var($value['latitude'], FILTER_VALIDATE_FLOAT);
    $longitude_wd = \filter_var($value['longitude'], FILTER_VALIDATE_FLOAT);
    if ($latitude_wd === false || $longitude_wd === false) return [];

    return [
        'longitude' => $longitude_wd,
        'latitude' => $latitude_wd,
    ];

}
/**
 * Loads a Wikipedia page through the API and returns the cleaned snippet.
 *
 * @param string $lang  Language (Wikipedia subdomain) to load in.
 * @param string $title Title of the Wikipedia page to load.
 *
 * @return string Cleaned snippet; empty if the page could not be parsed.
 */
private static function _getCleanedWikipediaSnippet(string $lang, string $title):string {

    $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $title), 10000);
    // "?? ''" shields against a failed json_decode() (null) or a missing
    // ['parse']['text']['*'] path, which would otherwise raise warnings.
    $datafromwiki = strval(json_decode($datafromwiki, true)['parse']['text']['*'] ?? '');
    return self::_cleanWikidataInput($datafromwiki);

}
/**
 * Loads data for a single entity from Wikidata.
 *
 * @param string $wikidata_id Wikidata Q-ID.
 *
 * @return array<mixed>
 *
 * @throws MDhttpFailedException If the response cannot be decoded or does
 *                               not contain the requested entity.
 */
private static function _getWikidataEntity(string $wikidata_id):array {

    $data = json_decode(MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000), true);
    if ($data === null) {
        throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
    }
    // The requested ID may be absent from the response (e.g. for merged or
    // redirected entities); fail with the same catchable exception instead
    // of hitting a TypeError on the array return type.
    if (!isset($data['entities'][$wikidata_id]) || !is_array($data['entities'][$wikidata_id])) {
        throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
    }
    return $data['entities'][$wikidata_id];

}
/**
* Parses wikidata results to MDNodaLink entries.
*
@ -456,6 +539,17 @@ final class NodaWikidataFetcher {
throw new MDExpectedException("Invalid URL");
}
if (strpos($linkUrl, "http://www.wikidata.org/entity/") !== false) {
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
return $output;
}
}
if (strpos($linkUrl, "https://www.wikidata.org/entity/") !== false) {
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
return $output;
}
}
if (strpos($linkUrl, "https://www.wikidata.org/wiki/") !== false) {
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
return $output;
@ -481,7 +575,10 @@ final class NodaWikidataFetcher {
*/
public static function getWikidataIdFromWikidataLink(string $linkUrl):string {
if (strpos($linkUrl, "https://www.wikidata.org/wiki/") === false) {
if (str_contains($linkUrl, "https://www.wikidata.org/wiki/") === false
&& str_contains($linkUrl, "https://www.wikidata.org/entity/") === false
&& str_contains($linkUrl, "http://www.wikidata.org/entity/") === false
) {
return '';
}
@ -750,14 +847,13 @@ final class NodaWikidataFetcher {
* @param integer $persinst_id Person ID.
* @param string $wikidata_id Wikidata ID.
* @param string $datafromwiki Data fetched from Wikipedia.
* @param string $wikilink Link to wikipedia entry.
* @param string $preflang The user's currently used language.
* @param string $lang Currently queried language.
* @param string $erfasst_von User who adds the info.
*
* @return boolean
*/
public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool {
public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool {
$output = false;
@ -961,46 +1057,27 @@ final class NodaWikidataFetcher {
public function retrievePersinstInfoFromWikidataID(string $lang, string $wikidata_id, int $persinst_id, string $erfasst_von) {
self::validateWikidataId($wikidata_id);
$data = json_decode(MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000), true);
if ($data === null) {
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
}
$data = $data['entities'][$wikidata_id];
$data = self::_getWikidataEntity($wikidata_id);
// Get links to wikipedia
$wikilink = $wikilinkterm = [];
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
}
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
$alreadyEntered = false;
if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) {
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
if (isset($wikilinks[$lang])) {
# Process data retrieved from wikipedia
if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von);
if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) {
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, $lang, $erfasst_von);
}
}
foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
if ($alreadyEntered === true) break;
if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue;
if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia
if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$cur_lang], $lang, "$cur_lang", $erfasst_von);
if ($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$cur_lang]['title'])) {
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, "$cur_lang", $erfasst_von);
}
}
@ -1102,17 +1179,16 @@ final class NodaWikidataFetcher {
/**
* Function for entering base information about a place from wikidata.
*
* @param string $cur_place_desc Mysqli result pointing to the current place.
* @param string $datafromwiki Data parsed from wikidata.
* @param array<mixed> $wikilink Wikilink.
* @param string $preflang Language of the user interface in general.
* @param string $lang Language of the main entry.
* @param integer $placeID ID of the place.
* @param string $erfasst_von User name.
* @param string $cur_place_desc Mysqli result pointing to the current place.
* @param string $datafromwiki Data parsed from wikidata.
* @param string $preflang Language of the user interface in general.
* @param string $lang Language of the main entry.
* @param integer $placeID ID of the place.
* @param string $erfasst_von User name.
*
* @return boolean
*/
public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, array $wikilink, string $preflang, string $lang, int $placeID, string $erfasst_von):bool {
public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, string $preflang, string $lang, int $placeID, string $erfasst_von):bool {
$datafromwiki = '"' . $datafromwiki . '" - (Wikipedia (' . $lang . ') ' . date("d.m.Y") . ')';
@ -1239,37 +1315,27 @@ final class NodaWikidataFetcher {
public function retrievePlaceInfoFromWikidataID(string $lang, string $wikidata_id, int $onum, string $erfasst_von) {
self::validateWikidataId($wikidata_id);
$data = self::_getWikidataEntity($wikidata_id);
$data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000);
if (!$data = json_decode($data, true)) {
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
}
$data = $data['entities'][$wikidata_id];
$wikilink = $wikilinkterm = [];
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
}
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
// Get current description for overwriting
$cur_place_desc = $this->getPlaceDescription($onum);
$alreadyEntered = false;
// P131: Located in administrative unit
if (isset($data['claims']['P131'])) {
$this->retrieveSuperordinateAdministrativePlace($onum, $data['claims']['P131']);
}
if (!empty($wikilink[$lang])) {
$cur_place_desc = $this->getPlaceDescription($onum);
$alreadyEntered = false;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
if (!empty($wikilinks[$lang])) {
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $lang, $onum, $erfasst_von);
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $lang, $onum, $erfasst_von);
}
}
@ -1277,12 +1343,12 @@ final class NodaWikidataFetcher {
//if ($alreadyEntered === true) break;
if ($alreadyEntered === true) break;
if (!isset($wikilink[$cur_lang])) continue;
if (!isset($wikilinks[$cur_lang]['url'])) continue;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000);
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $cur_lang, $onum, $erfasst_von);
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $cur_lang, $onum, $erfasst_von);
}
}
@ -1294,10 +1360,7 @@ final class NodaWikidataFetcher {
NodaBatchInserter::linkNodaForPlace($this->_mysqli_noda, $onum, $nodaLinks, $erfasst_von);
}
if (isset($data['claims']['P625'])) {
$latitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'], FILTER_VALIDATE_FLOAT);
$longitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'], FILTER_VALIDATE_FLOAT);
}
$coordinates_wd = self::_getPlaceCoordinatesFromWikidata($data);
$this->_mysqli_noda->autocommit(false);
if (!empty($tgn_id)) {
@ -1322,12 +1385,13 @@ final class NodaWikidataFetcher {
unset($updateStmt);
}
if (!empty($latitude_wd) and !empty($longitude_wd)) {
if (!empty($coordinates_wd)) {
$updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte`
SET `ort_nord_sued` = ?, `ort_west_ost` = ?
WHERE `ort_id` = ?");
$updateStmt->bind_param("ddi", $latitude_wd, $longitude_wd, $onum);
$updateStmt->bind_param("ddi", $coordinates_wd['latitude'], $coordinates_wd['longitude'], $onum);
$updateStmt->execute();
$updateStmt->close();
unset($updateStmt);
@ -1382,14 +1446,13 @@ final class NodaWikidataFetcher {
*
* @param integer $tag_id Tag ID.
* @param string $datafromwiki Data fetched from Wikipedia.
* @param string $wikilink Link to wikipedia entry.
* @param string $preflang The user's currently used language.
* @param string $lang Currently queried language.
* @param string $erfasst_von User who adds the info.
*
* @return boolean
*/
public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool {
public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool {
$output = false;
@ -1486,46 +1549,34 @@ final class NodaWikidataFetcher {
public function retrieveTagInfoFromWikidataID(string $lang, string $wikidata_id, int $tag_id, string $erfasst_von) {
self::validateWikidataId($wikidata_id);
$data = self::_getWikidataEntity($wikidata_id);
$data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000);
$data = json_decode($data, true);
if ($data === null) {
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
}
$data = $data['entities'][$wikidata_id];
$wikilink = $wikilinkterm = [];
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
}
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
$alreadyEntered = false;
if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) {
if (isset($wikilinks[$lang])) {
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia
if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von);
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $lang, $erfasst_von);
}
}
foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
if ($alreadyEntered === true) break;
if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue;
if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000);
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia
if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$cur_lang], $lang, $cur_lang, $erfasst_von);
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $cur_lang, $erfasst_von);
}
}