From 107a4cd640e362daf57df191f90fbe03ee993faa Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Thu, 31 Aug 2023 15:38:12 +0200 Subject: [PATCH] Improve NodaWikidataFetcher's loading of descriptions Close #15 --- src/NodaWikidataFetcher.php | 554 +++++++++++++++++------------- tests/NodaWikidataFetcherTest.php | 219 ++++++++++++ 2 files changed, 538 insertions(+), 235 deletions(-) diff --git a/src/NodaWikidataFetcher.php b/src/NodaWikidataFetcher.php index 504a647..d90179e 100644 --- a/src/NodaWikidataFetcher.php +++ b/src/NodaWikidataFetcher.php @@ -98,6 +98,256 @@ final class NodaWikidataFetcher { /** @var MDMysqli */ private MDMysqli $_mysqli_noda; + /** + * Returns the API link to Wikipedia's API for getting information on a page. + * + * @param string $lang Language / wikipedia version to fetch. + * @param string $searchTerm Search term. + * + * @return non-empty-string + */ + private static function _getWikipediaApiLink(string $lang, string $searchTerm):string { + + return "https://" . urlencode($lang) . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($searchTerm) . "&prop=text§ion=0&format=json"; + + } + + /** + * Cleans basic tags off Wikidata input. + * + * @param string $input Input string. + * + * @return string + */ + private static function _cleanWikidataInputHtml(string $input):string { + + // Clean off anything before first

+        if ($pStartPos = strpos($input, '<p')) {
+            $input = substr($input, $pStartPos);
+        }
+
+        // Clean off anything after the last closing </p>
+        if ($pEndPos = strrpos($input, '</p>')) {
+            $input = substr($input, 0, $pEndPos + 4);
+        }
+
+        $doc = new DOMDocument();
+        try {
+            $doc->loadXML('<div>' . trim($input) . '</div>
'); + } + catch (Exception $e) { + throw new Exception("Failed to load DOMDocument." . PHP_EOL . $e->getMessage() . PHP_EOL . PHP_EOL . $input); + } + + $list = $doc->getElementsByTagName("style"); + while ($list->length > 0) { + $p = $list->item(0); + if ($p === null || $p->parentNode === null) break; + $p->parentNode->removeChild($p); + } + + $list = $doc->getElementsByTagName("table"); + while ($list->length > 0) { + $p = $list->item(0); + if ($p === null || $p->parentNode === null) break; + $p->parentNode->removeChild($p); + } + + $list = $doc->getElementsByTagName("ol"); + while ($list->length > 0) { + $p = $list->item(0); + if ($p === null || $p->parentNode === null) break; + $p->parentNode->removeChild($p); + } + + if (($firstP = $doc->getElementsByTagName("p")->item(0)) !== null) { + if (($firstPhtml = $doc->saveHTML($firstP)) !== false) { + if (strpos($firstPhtml, 'geohack') !== false) { + if ($firstP->parentNode !== null) $firstP->parentNode->removeChild($firstP); + } + } + } + + $output = []; + foreach ($doc->getElementsByTagName("p") as $p) { + $output[] = trim($p->textContent); + } + + /* + if (strpos($doc->saveHTML(), 'Coordinates:') !== false) { + echo $doc->saveHTML(); + exit; + } + */ + return str_replace(PHP_EOL, PHP_EOL . PHP_EOL, trim(implode(PHP_EOL, $output))); + + } + + /** + * Cleans brackets ([1], [2]) off description text. + * + * @param string $input Input string. + * + * @return string + */ + private static function _cleanSourceBracketsOffTranslation(string $input):string { + + $bracketsToRemove = []; + for ($i = 0; $i < 100; $i++) { + $bracketsToRemove["[$i]"] = ""; + } + return strtr($input, $bracketsToRemove); + + } + + /** + * Cleans contents parsed from Wikipedia. + * + * @param string $input Input string. + * + * @return string + */ + private static function _cleanWikidataInput(string $input):string { + + $input = trim($input, '"'); + foreach (self::WIKIPEDIA_REMOVE_LITERALS as $tToRemove) $input = str_replace($tToRemove, "", $input); + + if (substr($input, 0, strlen('<')) === '<') { + + $input = self::_cleanWikidataInputHtml($input); + + if (mb_strlen($input) > 600) { + if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { + $input = substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600)); + } + } + + $input = self::_cleanSourceBracketsOffTranslation($input); + + $input = str_replace("\t", " ", $input); + + return $input; + // Remove newlines with ensuing spaces + while (strpos($input, PHP_EOL . " ") !== false) { + $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); + } + + // Remove double newlines + while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { + $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); + } + return MD_STD_IN::sanitize_text($input); + + } + + $input = str_replace(PHP_EOL, '', $input); + + if (empty($input)) return ""; + + // Remove infobox tables specifically + $firstParagraphPosition = strpos($input, '"); + if ($currentSearchPos !== false && $currentSearchPos < $firstParagraphPosition) { + if (($tableEndPos = strpos($input, "")) !== false) { + if (($pStartPos = strpos($input, '", "', '

' . PHP_EOL . PHP_EOL . PHP_EOL, $input); + # $input = str_replace('?/i', '', $input); + $input = strip_tags($input); + + # for ($i = 150; $i < 1000; $i++) $input = str_replace("&#$i;", " ", $input); + $i = 0; + while (strpos($input, ".mw-parser-output") !== false and strpos($input, "}", strpos($input, ".mw-parser-output")) !== false) { + $part1 = substr($input, 0, strpos($input, ".mw-parser-output")); + $part2 = substr($input, strpos($input, "}", strpos($input, ".mw-parser-output")) + 1); + $input = $part1 . $part2; + ++$i; + if ($i === 30) break; + } + + $input = self::_cleanSourceBracketsOffTranslation($input); + + $input = str_replace("\t", " ", $input); + + // Remove double whitespaces + while (strpos($input, " ") !== false) { + $input = str_replace(" ", " ", $input); + } + + // Remove newlines with ensuing spaces + while (strpos($input, PHP_EOL . " ") !== false) { + $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); + } + + // Remove double newlines + while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { + $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); + } + + $stableToRemove = [ + "Vous pouvez partager vos connaissances en l’améliorant (comment ?) selon les recommandations des projets correspondants.", + ]; + foreach ($stableToRemove as $tToRemove) $input = str_replace($tToRemove, "", $input); + + $endings = [ + "StubDenne artikel om et vandløb ", + ]; + foreach ($endings as $ending) { + if (strpos($input, $ending) !== false) $input = substr($input, 0, strpos($input, $ending)); + } + + $input = trim($input); + + // Cut off overly long articles + if (mb_strlen($input) > 600) { + if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { + $input = trim(substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600))); + } + } + + if (empty($input)) return ''; + + $input = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $input)); + + $input = html_entity_decode($input); + + return MD_STD_IN::sanitize_text($input); + + } + + /** + * Wrapper around _cleanWikidataInput for testing. + * + * @param string $input Input string. + * + * @return string + */ + public static function cleanWikidataInput(string $input):string { + + if (PHP_SAPI !== 'cli') throw new Exception("Use this function only for testing"); + return self::_cleanWikidataInput($input); + + } + /** * Sets the retrieval mode. * @@ -343,7 +593,7 @@ final class NodaWikidataFetcher { if (isset($wikilink)) { - $languagesToFetch[$lang] = "https://" . $lang . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm) . "&prop=text§ion=0&format=json"; + $languagesToFetch[$lang] = self::_getWikipediaApiLink($lang, $wikilinkterm); $wikilinks[$lang] = $wikilink; } @@ -379,6 +629,7 @@ final class NodaWikidataFetcher { $output = []; + $descs = []; foreach ($checkagainstLanguage as $lang) { if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki']) && !empty($wikilinks[$lang])) { @@ -393,18 +644,22 @@ final class NodaWikidataFetcher { if ($descFromWiki !== null) $tDescription = (string)$descFromWiki; else $tDescription = ""; - $tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . 
')'; - } else { $tDescription = ""; } - $output[$lang] = [ - 'label' => self::_cleanWikidataInput((string)$data['labels'][$lang]['value']), - 'description' => self::_cleanWikidataInput($tDescription), - 'link' => $wikilink, - ]; + if ($tDescription !== '') { + $descs[$lang] = $tDescription; + $desc_cleaned = self::_cleanWikidataInput($tDescription); + if ($desc_cleaned !== '') { + $output[$lang] = [ + 'label' => self::_cleanWikidataInput((string)$data['labels'][$lang]['value']), + 'description' => '"' . $desc_cleaned . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')', + 'link' => $wikilink, + ]; + } + } } // echo '
Wikipedia Links fehlen'; @@ -417,6 +672,7 @@ final class NodaWikidataFetcher { ]; } + # print_r($descs); } @@ -424,196 +680,6 @@ final class NodaWikidataFetcher { } - /** - * Cleans contents parsed from Wikipedia. - * - * @param string $input Input string. - * - * @return string - */ - private static function _cleanWikidataInput(string $input):string { - - if (substr($input, 0, strlen('<')) === '<') { - $doc = new DOMDocument(); - $doc->loadXML($input); - - $list = $doc->getElementsByTagName("style"); - while ($list->length > 0) { - $p = $list->item(0); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - $list = $doc->getElementsByTagName("table"); - while ($list->length > 0) { - $p = $list->item(0); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - $list = $doc->getElementsByTagName("div"); - while ($list->length > 1) { - $p = $list->item(1); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - $list = $doc->getElementsByTagName("ol"); - while ($list->length > 0) { - $p = $list->item(0); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - if (($firstP = $doc->getElementsByTagName("p")->item(0)) !== null) { - if (($firstPhtml = $doc->saveHTML($firstP)) !== false) { - if (strpos($firstPhtml, 'geohack') !== false) { - if ($firstP->parentNode !== null) $firstP->parentNode->removeChild($firstP); - } - } - } - - /* - if (strpos($doc->saveHTML(), 'Coordinates:') !== false) { - echo $doc->saveHTML(); - exit; - } - */ - - $input = str_replace(PHP_EOL, PHP_EOL . PHP_EOL, trim($doc->textContent)); - - if (mb_strlen($input) > 600) { - if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { - $input = substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600)); - } - } - - $bracketsToRemove = []; - for ($i = 0; $i < 100; $i++) { - $bracketsToRemove["[$i]"] = ""; - } - $input = strtr($input, $bracketsToRemove); - - $input = str_replace("\t", " ", $input); - - // Remove newlines with ensuing spaces - while (strpos($input, PHP_EOL . " ") !== false) { - $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); - } - - // Remove double newlines - while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { - $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); - } - return $input; - - } - - $input = str_replace(PHP_EOL, '', $input); - - foreach (self::WIKIPEDIA_REMOVE_LITERALS as $tToRemove) $input = str_replace($tToRemove, "", $input); - - $first_mention_of_paragraph = strpos($input, '
<p>
'); - if ($first_mention_of_paragraph !== false) $input = substr($input, $first_mention_of_paragraph, (strrpos($input, '

') ?: strlen($input)) - $first_mention_of_paragraph); - - // Remove infobox tables specifically - $removeFirstParagraph = false; - if (empty($input)) return ""; - $firstParagraphPosition = strpos($input, '"); - if ($currentSearchPos !== false && $currentSearchPos < $firstParagraphPosition) { - if (($tableEndPos = strpos($input, "")) !== false) { - if (($pStartPos = strpos($input, '", "', '

' . PHP_EOL . PHP_EOL . PHP_EOL, $input); - # $input = str_replace('?/i', '', $input); - $input = strip_tags($input); - - # for ($i = 150; $i < 1000; $i++) $input = str_replace("&#$i;", " ", $input); - $i = 0; - while (strpos($input, ".mw-parser-output") !== false and strpos($input, "}", strpos($input, ".mw-parser-output")) !== false) { - $part1 = substr($input, 0, strpos($input, ".mw-parser-output")); - $part2 = substr($input, strpos($input, "}", strpos($input, ".mw-parser-output")) + 1); - $input = $part1 . $part2; - ++$i; - if ($i === 30) break; - } - - $bracketsToRemove = []; - for ($i = 0; $i < 100; $i++) { - $bracketsToRemove["[$i]"] = ""; - } - $input = strtr($input, $bracketsToRemove); - - $input = str_replace("\t", " ", $input); - - // Remove double whitespaces - while (strpos($input, " ") !== false) { - $input = str_replace(" ", " ", $input); - } - - // Remove newlines with ensuing spaces - while (strpos($input, PHP_EOL . " ") !== false) { - $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); - } - - // Remove double newlines - while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { - $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); - } - - $stableToRemove = [ - "Vous pouvez partager vos connaissances en l’améliorant (comment ?) selon les recommandations des projets correspondants.", - ]; - foreach ($stableToRemove as $tToRemove) $input = str_replace($tToRemove, "", $input); - - $endings = [ - "StubDenne artikel om et vandløb ", - ]; - foreach ($endings as $ending) { - if (strpos($input, $ending) !== false) $input = substr($input, 0, strpos($input, $ending)); - } - - $input = trim($input); - - // Cut off overly long articles - if (mb_strlen($input) > 600) { - if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { - $input = trim(substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600))); - } - } - - if (empty($input)) return ''; - - $input = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $input)); - - $input = html_entity_decode($input); - - return $input; - - } - /** * Function for fetching description from Wikipedia * @@ -855,7 +921,7 @@ final class NodaWikidataFetcher { if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { - $datafromwiki = MD_STD::runCurl("https://" . $lang . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm[$lang]) . "&prop=text§ion=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia @@ -865,17 +931,17 @@ final class NodaWikidataFetcher { } - foreach (self::LANGUAGES_MAIN_DESC as $sprache) { + foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { if ($alreadyEntered === true) break; - if (!isset($wikilink[$sprache]) || !isset($wikilinkterm[$sprache]) || !is_string($wikilinkterm[$sprache])) continue; + if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue; - $datafromwiki = MD_STD::runCurl("https://" . $sprache . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode((string)$wikilinkterm[$sprache]) . 
"&prop=text§ion=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { - $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$sprache], $lang, "$sprache", $erfasst_von); + $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$cur_lang], $lang, "$cur_lang", $erfasst_von); } } @@ -940,33 +1006,54 @@ final class NodaWikidataFetcher { } + /** + * Gets the current description of a place. + * + * @param integer $onum Place ID. + * + * @return string + */ + private function getPlaceDescription(int $onum):string { + + $currentPlaceResult = $this->_mysqli_noda->query_by_stmt("SELECT `ort_anmerkung` + FROM `orte` + WHERE `ort_id` = ?", "i", $onum); + + if (!($curPlaceInfo = $currentPlaceResult->fetch_row())) { + $currentPlaceResult->close(); + throw new Exception("This place does not exist"); + } + $currentPlaceResult->close(); + + return $curPlaceInfo[0]; + + } + /** * Function for entering base information about a place from wikidata. * - * @param mysqli_result $currentPlaceResult Mysqli result pointing to the current place. - * @param string $datafromwiki Data parsed from wikidata. - * @param array $wikilink Wikilink. - * @param string $preflang Language of the user interface in general. - * @param string $lang Language of the main entry. - * @param integer $placeID ID of the place. - * @param string $erfasst_von User name. + * @param string $cur_place_desc Mysqli result pointing to the current place. + * @param string $datafromwiki Data parsed from wikidata. + * @param array $wikilink Wikilink. + * @param string $preflang Language of the user interface in general. + * @param string $lang Language of the main entry. + * @param integer $placeID ID of the place. + * @param string $erfasst_von User name. * * @return boolean */ - public function enterPlaceDescFromWikidata(mysqli_result $currentPlaceResult, string $datafromwiki, array $wikilink, string $preflang, string $lang, int $placeID, string $erfasst_von) { + public function enterPlaceDescFromWikidata(string $cur_place_desc, string $datafromwiki, array $wikilink, string $preflang, string $lang, int $placeID, string $erfasst_von):bool { $datafromwiki = '"' . $datafromwiki . '" - (Wikipedia (' . $lang . ') ' . date("d.m.Y") . ')'; - if (!($curPlaceInfo = $currentPlaceResult->fetch_assoc())) return false; - - if (!empty(trim($curPlaceInfo['ort_anmerkung'])) and substr($curPlaceInfo['ort_anmerkung'], 0, 3) !== 'GND') { + if (!empty(trim($cur_place_desc)) and substr($cur_place_desc, 0, 3) !== 'GND') { switch ($this->_retrievalMode) { case "add": - $datafromwiki = $curPlaceInfo['ort_anmerkung'] . PHP_EOL . PHP_EOL . $datafromwiki; + $datafromwiki = $cur_place_desc . PHP_EOL . PHP_EOL . $datafromwiki; break; case "keep": - $datafromwiki = $curPlaceInfo['ort_anmerkung']; + $datafromwiki = $cur_place_desc; break; case "replace": break; @@ -977,7 +1064,7 @@ final class NodaWikidataFetcher { echo '

There is already an entry for description ...

-

Actual entry

' . nl2br($curPlaceInfo['ort_anmerkung']) . '

+

Actual entry

' . nl2br($cur_place_desc) . '

Now found

@@ -1102,10 +1189,9 @@ final class NodaWikidataFetcher { if (isset($data['sitelinks'][$tLang . 'wiki']['title'])) $wikilinkterm[$tLang] = str_replace(' ', '_', $data['sitelinks'][$tLang . 'wiki']['title']); } - $currentPlaceResult = $this->_mysqli_noda->query_by_stmt("SELECT `ort_anmerkung` - FROM `orte` - WHERE `ort_id` = ?", "i", $onum); + // Get current description for overwriting + $cur_place_desc = $this->getPlaceDescription($onum); $alreadyEntered = false; // P131: Located in administrative unit @@ -1115,29 +1201,27 @@ final class NodaWikidataFetcher { if (!empty($wikilink[$lang])) { - $datafromwiki = MD_STD::runCurl("https://" . urlencode($lang) . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm[$lang]) . "&prop=text§ion=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->enterPlaceDescFromWikidata($currentPlaceResult, $datafromwiki, $wikilink, $lang, $lang, $onum, $erfasst_von); + $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $lang, $onum, $erfasst_von); } } - foreach (self::LANGUAGES_MAIN_DESC as $sprache) { + foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { //if ($alreadyEntered === true) break; if ($alreadyEntered === true) break; - if (!isset($wikilink[$sprache])) continue; + if (!isset($wikilink[$cur_lang])) continue; - $datafromwiki = MD_STD::runCurl("https://" . urlencode($sprache) . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm[$sprache]) . "&prop=text§ion=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->enterPlaceDescFromWikidata($currentPlaceResult, $datafromwiki, $wikilink, $lang, $sprache, $onum, $erfasst_von); + $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $wikilink, $lang, $cur_lang, $onum, $erfasst_von); } } - $currentPlaceResult->close(); - unset($currentPlaceResult); if (isset($data['claims']['P1566'])) $geonames_id = filter_var($data['claims']['P1566'][0]['mainsnak']['datavalue']['value'], FILTER_VALIDATE_INT); if (isset($data['claims']['P1667'])) $tgn_id = filter_var($data['claims']['P1667'][0]['mainsnak']['datavalue']['value'], FILTER_VALIDATE_INT); @@ -1381,7 +1465,7 @@ final class NodaWikidataFetcher { if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { - $datafromwiki = MD_STD::runCurl("https://" . $lang . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm[$lang]) . 
"&prop=text§ion=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia @@ -1391,17 +1475,17 @@ final class NodaWikidataFetcher { } - foreach (self::LANGUAGES_MAIN_DESC as $sprache) { + foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { if ($alreadyEntered === true) break; - if (!isset($wikilink[$sprache]) || !isset($wikilinkterm[$sprache]) || !is_string($wikilinkterm[$sprache])) continue; + if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue; - $datafromwiki = MD_STD::runCurl("https://" . $sprache . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode((string)$wikilinkterm[$sprache]) . "&prop=text§ion=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { - $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$sprache], $lang, "$sprache", $erfasst_von); + $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $wikilink[$cur_lang], $lang, $cur_lang, $erfasst_von); } } @@ -1724,13 +1808,13 @@ final class NodaWikidataFetcher { public static function generateWikidataFetcherHeader(MDTlLoader $tlLoader, string $additional = "", string $searchTerm = ""):string { if (empty($searchTerm) and !empty($_GET['suchbegriff'])) { - $searchTerm = $_GET['suchbegriff']; + $searchTerm = (string)$_GET['suchbegriff']; } $output = '

Logo: Wikidata' . $tlLoader->tl("wiki", "wiki", "fetch_from_wikidata");
-        $output .= ': ' . $searchTerm;
+        $output .= ': ' . htmlspecialchars($searchTerm);
         $output .= '

'; $output .= $additional; $output .= '
'; diff --git a/tests/NodaWikidataFetcherTest.php b/tests/NodaWikidataFetcherTest.php index dd0ffbb..20c75b8 100644 --- a/tests/NodaWikidataFetcherTest.php +++ b/tests/NodaWikidataFetcherTest.php @@ -9,6 +9,7 @@ use PHPUnit\Framework\TestCase; require_once __DIR__ . "/../src/NodaWikidataFetcher.php"; require_once __DIR__ . "/../../MDErrorReporter/exceptions/generic/MDExpectedException.php"; require_once __DIR__ . "/../../MD_STD/src/MD_STD.php"; +require_once __DIR__ . "/../../MD_STD/src/MD_STD_IN.php"; /** * This script contains tests for the Wikidata fetcher. @@ -56,4 +57,222 @@ final class NodaWikidataFetcherTest extends TestCase { self::assertEquals(NodaWikidataFetcher::getWikidataIdFromWikidataLink("https://www.wikidata.org/wiki/Q106697"), "Q106697"); } + + /** + * Test for cleaning wikidata info. + * + * @return void + */ + public function testCleanWikidataInput():void { + + $testStr = '"
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Werbowez (Kossiw) +
Вербовець +
Wappen fehlt + + +
Werbowez (Kossiw) (Ukraine)
Werbowez (Kossiw) (Ukraine)
+
Werbowez (Kossiw)
+
Basisdaten +
Oblast:Oblast Iwano-Frankiwsk +
Rajon:Rajon Kossiw +
Höhe:369 m +
Fläche:18,77 km² +
Einwohner:3.395 (2001) +
Bevölkerungsdichte: +181 Einwohner je km² +
Postleitzahlen:78605 +
Vorwahl:+380 3478 +
Geographische Lage:48° 21′ N, 25° 8′ OKoordinaten: 48° 20′ 32″ N, 25° 8′ 0″ O +
KATOTTH: +UA26100010030094355 +
KOATUU: +2623682401 +
Verwaltungsgliederung: +1 Dorf +
Adresse: +вул. Миру, буд. 15
78605 с. Вербовець +
Website: +Offizielle Webseite +
Statistische Informationen +
+ + +
Werbowez (Kossiw) (Oblast Iwano-Frankiwsk)
Werbowez (Kossiw) (Oblast Iwano-Frankiwsk)
+
Werbowez (Kossiw)
i1 +
+

Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).[1] +

+
Blick auf das Dorf
+

Das um 1650 erstmals schriftlich erwähnte Dorf[2] liegt im Osten der historischen Landschaft Galizien am Ufer der Rybnyzja (Рибниця), einem 56 km langen Nebenfluss des Pruth 7 km nordöstlich vom Rajonzentrum Kossiw und 95 km südlich vom Oblastzentrum Iwano-Frankiwsk. Südlich der Ortschaft verläuft die Territorialstraße T–09–09. +

Am 12. Juni 2020 wurde das Dorf ein Teil der neu gegründeten Stadtgemeinde Kossiw im Rajon Kossiw[3], bis dahin bildete es zusammen mit dem Dorf Staryj Kossiw (Старий Косів) die Landratsgemeinde Werbowez (Вербовецька сільська рада/Werbowezka silska rada) im Osten des Rajons. +

+
    +
  1. Ortswebseite auf der offiziellen Webpräsenz der Werchowna Rada; abgerufen am 14. November 2017 (ukrainisch) +
  2. +
  3. Ortsgeschichte Werbowez in der Geschichte der Städte und Dörfer der Ukrainischen SSR; abgerufen am 14. November 2017 (ukrainisch) +
  4. +
  5. Кабінет Міністрів України Розпорядження від 12 червня 2020 р. № 714-р "Про визначення адміністративних центрів та затвердження територій територіальних громад Івано-Франківської області" +
  6. +
+ + +
" - (de.wikipedia.org 31.08.2023)'; + + $output = NodaWikidataFetcher::cleanWikidataInput($testStr); + $expected = 'Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).'; + self::assertTrue( + str_starts_with($output, $expected), + "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250) + ); + + $output = NodaWikidataFetcher::cleanWikidataInput('
+

坐标48°20′32″N 25°8′0″E / 48.34222°N 25.13333°E / 48.34222; 25.13333 +

韋爾博韋齊烏克蘭語Вербовець),是烏克蘭的村落,位於該國西部伊萬諾-弗蘭科夫斯克州,由科索夫區負責管轄,始建於1456年,面積18.77平方公里,2001年人口3,395。 +

+ + +
'); + $expected = '韋爾博韋齊(烏克蘭語:Вербовець),是烏克蘭的村落,位於該國西部伊萬諾-弗蘭科夫斯克州,由科索夫區負責管轄,始建於1456年,面積18.77平方公里,2001年人口3,3'; + self::assertTrue( + str_starts_with($output, $expected), + "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250) + ); + + + + + } + + /** + * Test for cleaning wikidata info. + * + * @return void + */ + public function testCleanWikidataInputWithoutHtml():void { + + $output = NodaWikidataFetcher::cleanWikidataInput('Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).[1]'); + $expected = 'Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).'; + self::assertTrue( + str_starts_with($output, $expected), + "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250) + ); + + } }