Refactor wikidata fetcher
This commit is contained in:
parent
9b63a4d95d
commit
cd49f194f2
|
@ -91,6 +91,89 @@ final class NodaWikidataFetcher {
|
|||
|
||||
}
|
||||
|
||||
/**
 * Extracts the Wikipedia sitelinks from a Wikidata entity.
 *
 * For each language in self::LANGUAGES_MAIN_DESC, the sitelink stored under
 * "<lang>wiki" is used if both its URL and its title are set and are
 * strings. Spaces in the title are replaced by underscores.
 *
 * @param array<mixed> $data Wikidata API output.
 *
 * @return array<string, array{url: string, title: string}>
 */
private static function _getWikipediaLinksFromWikidataOutput(array $data):array {

    $links = [];

    foreach (self::LANGUAGES_MAIN_DESC as $langCode) {

        $siteKey = $langCode . 'wiki';

        if (isset($data['sitelinks'][$siteKey]['url'], $data['sitelinks'][$siteKey]['title'])) {

            $url   = $data['sitelinks'][$siteKey]['url'];
            $title = $data['sitelinks'][$siteKey]['title'];

            // Entries whose URL or title is not a string are left out.
            if (is_string($url) && is_string($title)) {
                $links[$langCode] = [
                    'url'   => $url,
                    'title' => str_replace(' ', '_', $title),
                ];
            }

        }

    }

    return $links;

}
|
||||
|
||||
/**
 * Parses coordinates from Wikidata API output.
 *
 * Reads latitude and longitude from the first P625 claim of the entity.
 * An empty array is returned if the claim is absent, carries no value,
 * or if either coordinate cannot be validated as a float.
 *
 * @param array<mixed> $data Wikidata API output.
 *
 * @return array{}|array{longitude: float, latitude: float}
 */
private static function _getPlaceCoordinatesFromWikidata(array $data):array {

    // A claim may exist without a usable value (e.g. no "datavalue" key on
    // the main snak, or an empty claim list). Resolve the whole path with ??
    // so that missing levels yield null instead of raising warnings.
    $value = $data['claims']['P625'][0]['mainsnak']['datavalue']['value'] ?? null;

    if (!is_array($value) || !isset($value['latitude'], $value['longitude'])) {
        return [];
    }

    $latitude_wd = \filter_var($value['latitude'], FILTER_VALIDATE_FLOAT);
    $longitude_wd = \filter_var($value['longitude'], FILTER_VALIDATE_FLOAT);

    // filter_var() returns false for anything that is not a valid float.
    if ($latitude_wd === false || $longitude_wd === false) return [];

    return [
        'longitude' => $longitude_wd,
        'latitude' => $latitude_wd,
    ];

}
|
||||
|
||||
/**
 * Loads a Wikipedia page and cleans the output.
 *
 * Fetches the URL built by self::_getWikipediaApiLink(), takes the
 * parse > text > "*" entry of the decoded JSON response and passes it
 * through self::_cleanWikidataInput(). If the response cannot be decoded
 * or does not contain that entry, an empty string is passed on instead.
 *
 * @param string $lang  Language to load in.
 * @param string $title Title to load from.
 *
 * @return string
 */
private static function _getCleanedWikipediaSnippet(string $lang, string $title):string {

    $response = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $title), 10000);
    $decoded = json_decode($response, true);

    // Responses without a parsed text (invalid JSON, or a payload lacking
    // the "parse" entry) must not trigger array-offset warnings.
    $rawText = $decoded['parse']['text']['*'] ?? '';

    // strval() on a non-scalar would warn and produce the literal "Array".
    if (!is_scalar($rawText)) {
        $rawText = '';
    }

    return self::_cleanWikidataInput(strval($rawText));

}
|
||||
|
||||
/**
 * Loads data for a single entity from Wikidata.
 *
 * Requests Special:EntityData/<ID>.json and returns the entry stored under
 * the requested ID in the "entities" section of the response.
 *
 * @param string $wikidata_id Wikidata Q-ID.
 *
 * @throws MDhttpFailedException If the response cannot be decoded or does
 *                               not contain the requested entity.
 *
 * @return array<mixed>
 */
private static function _getWikidataEntity(string $wikidata_id):array {

    $url = "https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json";
    $data = json_decode(MD_STD::runCurl($url, 10000), true);

    // Validate the full path: a decoded scalar or a response lacking the
    // requested entity would otherwise violate the array return type and
    // surface as a TypeError rather than the exception callers expect.
    // NOTE(review): if Wikidata answers a redirected ID with the entity
    // keyed by its redirect target, this throws as well - confirm whether
    // such redirects should be followed instead.
    $entity = is_array($data) ? ($data['entities'][$wikidata_id] ?? null) : null;

    if (!is_array($entity)) {
        throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
    }

    return $entity;

}
|
||||
|
||||
/**
|
||||
* Parses wikidata results to MDNodaLink entries.
|
||||
*
|
||||
|
@ -456,6 +539,17 @@ final class NodaWikidataFetcher {
|
|||
throw new MDExpectedException("Invalid URL");
|
||||
}
|
||||
|
||||
if (strpos($linkUrl, "http://www.wikidata.org/entity/") !== false) {
|
||||
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
|
||||
return $output;
|
||||
}
|
||||
}
|
||||
if (strpos($linkUrl, "https://www.wikidata.org/entity/") !== false) {
|
||||
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
|
||||
return $output;
|
||||
}
|
||||
}
|
||||
|
||||
if (strpos($linkUrl, "https://www.wikidata.org/wiki/") !== false) {
|
||||
if ($output = self::getWikidataIdFromWikidataLink($linkUrl)) {
|
||||
return $output;
|
||||
|
@ -481,7 +575,10 @@ final class NodaWikidataFetcher {
|
|||
*/
|
||||
public static function getWikidataIdFromWikidataLink(string $linkUrl):string {
|
||||
|
||||
if (strpos($linkUrl, "https://www.wikidata.org/wiki/") === false) {
|
||||
if (str_contains($linkUrl, "https://www.wikidata.org/wiki/") === false
|
||||
&& str_contains($linkUrl, "https://www.wikidata.org/entity/") === false
|
||||
&& str_contains($linkUrl, "http://www.wikidata.org/entity/") === false
|
||||
) {
|
||||
return '';
|
||||
}
|
||||
|
||||
|
@ -750,14 +847,13 @@ final class NodaWikidataFetcher {
|
|||
* @param integer $persinst_id Person ID.
|
||||
* @param string $wikidata_id Wikidata ID.
|
||||
* @param string $datafromwiki Data fetched from Wikipedia.
|
||||
* @param string $wikilink Link to wikipedia entry.
|
||||
* @param string $preflang The user's currently used language.
|
||||
* @param string $lang Currently queried language.
|
||||
* @param string $erfasst_von User who adds the info.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool {
|
||||
public function retrievePersinstDescFromWikipedia(int $persinst_id, string $wikidata_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool {
|
||||
|
||||
$output = false;
|
||||
|
||||
|
@ -961,46 +1057,27 @@ final class NodaWikidataFetcher {
|
|||
public function retrievePersinstInfoFromWikidataID(string $lang, string $wikidata_id, int $persinst_id, string $erfasst_von) {
|
||||
|
||||
self::validateWikidataId($wikidata_id);
|
||||
|
||||
$data = json_decode(MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000), true);
|
||||
if ($data === null) {
|
||||
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
|
||||
}
|
||||
$data = $data['entities'][$wikidata_id];
|
||||
$data = self::_getWikidataEntity($wikidata_id);
|
||||
|
||||
// Get links to wikipedia
|
||||
$wikilink = $wikilinkterm = [];
|
||||
|
||||
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
|
||||
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
|
||||
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
|
||||
}
|
||||
|
||||
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
|
||||
$alreadyEntered = false;
|
||||
|
||||
if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) {
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
|
||||
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
|
||||
|
||||
if (isset($wikilinks[$lang])) {
|
||||
# Process data retrieved from wikipedia
|
||||
if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
|
||||
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von);
|
||||
if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) {
|
||||
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, $lang, $erfasst_von);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
|
||||
|
||||
if ($alreadyEntered === true) break;
|
||||
if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue;
|
||||
if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000);
|
||||
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
|
||||
|
||||
# Process data retrieved from wikipedia
|
||||
if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
|
||||
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$cur_lang], $lang, "$cur_lang", $erfasst_von);
|
||||
if ($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$cur_lang]['title'])) {
|
||||
$alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, "$cur_lang", $erfasst_von);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1104,7 +1181,6 @@ final class NodaWikidataFetcher {
|
|||
*
|
||||
* @param string $cur_place_desc Mysqli result pointing to the current place.
|
||||
* @param string $datafromwiki Data parsed from wikidata.
|
||||
* @param array<mixed> $wikilink Wikilink.
|
||||
* @param string $preflang Language of the user interface in general.
|
||||
* @param string $lang Language of the main entry.
|
||||
* @param integer $placeID ID of the place.
|
||||
|
@ -1112,7 +1188,7 @@ final class NodaWikidataFetcher {
|
|||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, array $wikilink, string $preflang, string $lang, int $placeID, string $erfasst_von):bool {
|
||||
public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, string $preflang, string $lang, int $placeID, string $erfasst_von):bool {
|
||||
|
||||
$datafromwiki = '"' . $datafromwiki . '" - (Wikipedia (' . $lang . ') ' . date("d.m.Y") . ')';
|
||||
|
||||
|
@ -1239,37 +1315,27 @@ final class NodaWikidataFetcher {
|
|||
public function retrievePlaceInfoFromWikidataID(string $lang, string $wikidata_id, int $onum, string $erfasst_von) {
|
||||
|
||||
self::validateWikidataId($wikidata_id);
|
||||
$data = self::_getWikidataEntity($wikidata_id);
|
||||
|
||||
$data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . urlencode($wikidata_id) . ".json", 10000);
|
||||
if (!$data = json_decode($data, true)) {
|
||||
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
|
||||
}
|
||||
$data = $data['entities'][$wikidata_id];
|
||||
|
||||
$wikilink = $wikilinkterm = [];
|
||||
|
||||
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
|
||||
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
|
||||
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
|
||||
}
|
||||
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
|
||||
|
||||
// Get current description for overwriting
|
||||
|
||||
$cur_place_desc = $this->getPlaceDescription($onum);
|
||||
$alreadyEntered = false;
|
||||
|
||||
// P131: Located in administrative unit
|
||||
if (isset($data['claims']['P131'])) {
|
||||
$this->retrieveSuperordinateAdministrativePlace($onum, $data['claims']['P131']);
|
||||
}
|
||||
|
||||
if (!empty($wikilink[$lang])) {
|
||||
$cur_place_desc = $this->getPlaceDescription($onum);
|
||||
$alreadyEntered = false;
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
|
||||
if (!empty($wikilinks[$lang])) {
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
|
||||
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
|
||||
|
||||
if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
|
||||
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $lang, $onum, $erfasst_von);
|
||||
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $lang, $onum, $erfasst_von);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1277,12 +1343,12 @@ final class NodaWikidataFetcher {
|
|||
|
||||
//if ($alreadyEntered === true) break;
|
||||
if ($alreadyEntered === true) break;
|
||||
if (!isset($wikilink[$cur_lang])) continue;
|
||||
if (!isset($wikilinks[$cur_lang]['url'])) continue;
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000);
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
|
||||
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
|
||||
if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
|
||||
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $cur_lang, $onum, $erfasst_von);
|
||||
$alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $cur_lang, $onum, $erfasst_von);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1294,10 +1360,7 @@ final class NodaWikidataFetcher {
|
|||
NodaBatchInserter::linkNodaForPlace($this->_mysqli_noda, $onum, $nodaLinks, $erfasst_von);
|
||||
}
|
||||
|
||||
if (isset($data['claims']['P625'])) {
|
||||
$latitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['latitude'], FILTER_VALIDATE_FLOAT);
|
||||
$longitude_wd = \filter_var($data['claims']['P625'][0]['mainsnak']['datavalue']['value']['longitude'], FILTER_VALIDATE_FLOAT);
|
||||
}
|
||||
$coordinates_wd = self::_getPlaceCoordinatesFromWikidata($data);
|
||||
|
||||
$this->_mysqli_noda->autocommit(false);
|
||||
if (!empty($tgn_id)) {
|
||||
|
@ -1322,12 +1385,13 @@ final class NodaWikidataFetcher {
|
|||
unset($updateStmt);
|
||||
|
||||
}
|
||||
if (!empty($latitude_wd) and !empty($longitude_wd)) {
|
||||
|
||||
if (!empty($coordinates_wd)) {
|
||||
|
||||
$updateStmt = $this->_mysqli_noda->do_prepare("UPDATE `orte`
|
||||
SET `ort_nord_sued` = ?, `ort_west_ost` = ?
|
||||
WHERE `ort_id` = ?");
|
||||
$updateStmt->bind_param("ddi", $latitude_wd, $longitude_wd, $onum);
|
||||
$updateStmt->bind_param("ddi", $coordinates_wd['latitude'], $coordinates_wd['longitude'], $onum);
|
||||
$updateStmt->execute();
|
||||
$updateStmt->close();
|
||||
unset($updateStmt);
|
||||
|
@ -1382,14 +1446,13 @@ final class NodaWikidataFetcher {
|
|||
*
|
||||
* @param integer $tag_id Tag ID.
|
||||
* @param string $datafromwiki Data fetched from Wikipedia.
|
||||
* @param string $wikilink Link to wikipedia entry.
|
||||
* @param string $preflang The user's currently used language.
|
||||
* @param string $lang Currently queried language.
|
||||
* @param string $erfasst_von User who adds the info.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $wikilink, string $preflang, string $lang, string $erfasst_von):bool {
|
||||
public function retrieveTagDescFromWikipedia(int $tag_id, string $datafromwiki, string $preflang, string $lang, string $erfasst_von):bool {
|
||||
|
||||
$output = false;
|
||||
|
||||
|
@ -1486,46 +1549,34 @@ final class NodaWikidataFetcher {
|
|||
public function retrieveTagInfoFromWikidataID(string $lang, string $wikidata_id, int $tag_id, string $erfasst_von) {
|
||||
|
||||
self::validateWikidataId($wikidata_id);
|
||||
$data = self::_getWikidataEntity($wikidata_id);
|
||||
|
||||
$data = MD_STD::runCurl("https://www.wikidata.org/wiki/Special:EntityData/" . $wikidata_id . ".json", 10000);
|
||||
$data = json_decode($data, true);
|
||||
if ($data === null) {
|
||||
throw new MDhttpFailedException("Failed fetching from Wikidata. Try again later.");
|
||||
}
|
||||
$data = $data['entities'][$wikidata_id];
|
||||
|
||||
$wikilink = $wikilinkterm = [];
|
||||
|
||||
foreach (self::LANGUAGES_MAIN_DESC as $tLang) {
|
||||
if (isset($data['sitelinks'][$tLang . 'wiki']['url'])) $wikilink[$tLang] = $data['sitelinks'][$tLang . 'wiki']['url'];
|
||||
if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']);
|
||||
}
|
||||
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
|
||||
|
||||
$alreadyEntered = false;
|
||||
|
||||
if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) {
|
||||
if (isset($wikilinks[$lang])) {
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000);
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
|
||||
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
|
||||
|
||||
# Process data retrieved from wikipedia
|
||||
if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
|
||||
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$lang], $lang, $lang, $erfasst_von);
|
||||
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $lang, $erfasst_von);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
|
||||
|
||||
if ($alreadyEntered === true) break;
|
||||
if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue;
|
||||
if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
|
||||
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000);
|
||||
$datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
|
||||
$datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
|
||||
|
||||
# Process data retrieved from wikipedia
|
||||
if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
|
||||
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$cur_lang], $lang, $cur_lang, $erfasst_von);
|
||||
$alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $cur_lang, $erfasst_von);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user