From 5192781494793efcc68c82bb95ad4c1d3fddc0a6 Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Sun, 9 Mar 2025 02:08:26 +0100 Subject: [PATCH] Use Wikipedia API for getting descriptions from Wikipedia rather than parsing HTML in Wikidata fetcher Thanks @awinkler --- src/NodaWikidataFetcher.php | 407 ++++++------------------------ tests/NodaWikidataFetcherTest.php | 219 ---------------- 2 files changed, 75 insertions(+), 551 deletions(-) diff --git a/src/NodaWikidataFetcher.php b/src/NodaWikidataFetcher.php index 19a7805..3e03d71 100644 --- a/src/NodaWikidataFetcher.php +++ b/src/NodaWikidataFetcher.php @@ -43,25 +43,6 @@ final class NodaWikidataFetcher { "orcid" => "P496", ]; - private const WIKIPEDIA_REMOVE_LITERALS = [ - "

Si vous disposez d'ouvrages ou d'articles de référence ou si vous ", - '

En pratique : Quelles sources sont attendu', - '', - '

Géolocalisation sur la carte', - '

Koordinaatit:', - '

', - //'

', - '

', - '

', - '

', - '

', - '

- if ($pStartPos = strpos($input, '')) { - $input = substr($input, 0, $pEndPos + 4); - } - - $doc = new DOMDocument(); - try { - libxml_use_internal_errors(true); - $doc->loadXML('

' . trim($input) . '
'); - libxml_use_internal_errors(false); - } - catch (Exception $e) { - throw new Exception("Failed to load DOMDocument." . PHP_EOL . $e->getMessage() . PHP_EOL . PHP_EOL . '---' . $input . '---'); - } - - $list = $doc->getElementsByTagName("style"); - while ($list->length > 0) { - $p = $list->item(0); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - $list = $doc->getElementsByTagName("table"); - while ($list->length > 0) { - $p = $list->item(0); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - $list = $doc->getElementsByTagName("ol"); - while ($list->length > 0) { - $p = $list->item(0); - if ($p === null || $p->parentNode === null) break; - $p->parentNode->removeChild($p); - } - - if (($firstP = $doc->getElementsByTagName("p")->item(0)) !== null) { - if (($firstPhtml = $doc->saveHTML($firstP)) !== false) { - if (strpos($firstPhtml, 'geohack') !== false) { - if ($firstP->parentNode !== null) $firstP->parentNode->removeChild($firstP); - } - } - } - - $output = []; - foreach ($doc->getElementsByTagName("p") as $p) { - $output[] = trim($p->textContent); - } - - /* - if (strpos($doc->saveHTML(), 'Coordinates:') !== false) { - echo $doc->saveHTML(); - exit; - } - */ - return str_replace(PHP_EOL, PHP_EOL . PHP_EOL, trim(implode(PHP_EOL, $output))); - - } - - /** - * Cleans brackets ([1], [2]) off description text. - * - * @param string $input Input string. - * - * @return string - */ - private static function _cleanSourceBracketsOffTranslation(string $input):string { - - $bracketsToRemove = []; - for ($i = 0; $i < 100; $i++) { - $bracketsToRemove["[$i]"] = ""; - } - return strtr($input, $bracketsToRemove); - - } - - /** - * Cleans contents parsed from Wikipedia. - * - * @param string $input Input string. 
- * - * @return string - */ - private static function _cleanWikidataInput(string $input):string { - - $input = trim($input, '"'); - foreach (self::WIKIPEDIA_REMOVE_LITERALS as $tToRemove) $input = str_replace($tToRemove, "", $input); - - if (substr($input, 0, strlen('<')) === '<') { - - $input = self::_cleanWikidataInputHtml($input); - - if (mb_strlen($input) > 600) { - if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { - $input = substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600)); - } - } - - $input = self::_cleanSourceBracketsOffTranslation($input); - - $input = str_replace("\t", " ", $input); - - // Remove newlines with ensuing spaces - while (strpos($input, PHP_EOL . " ") !== false) { - $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); - } - - // Remove double newlines - while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { - $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); - } - return MD_STD_IN::sanitize_text($input); - - } - - $input = str_replace(PHP_EOL, '', $input); - - if (empty($input)) return ""; - - // Remove infobox tables specifically - $firstParagraphPosition = strpos($input, '"); - if ($currentSearchPos !== false && $currentSearchPos < $firstParagraphPosition) { - if (($tableEndPos = strpos($input, "")) !== false) { - if (($pStartPos = strpos($input, '", "', '

' . PHP_EOL . PHP_EOL . PHP_EOL, $input); - # $input = str_replace('?/i', '', $input); - $input = strip_tags($input); - - # for ($i = 150; $i < 1000; $i++) $input = str_replace("&#$i;", " ", $input); - $i = 0; - while (strpos($input, ".mw-parser-output") !== false and strpos($input, "}", strpos($input, ".mw-parser-output")) !== false) { - $part1 = substr($input, 0, strpos($input, ".mw-parser-output")); - $part2 = substr($input, strpos($input, "}", strpos($input, ".mw-parser-output")) + 1); - $input = $part1 . $part2; - ++$i; - if ($i === 30) break; - } - - $input = self::_cleanSourceBracketsOffTranslation($input); - - $input = str_replace("\t", " ", $input); - - // Remove double whitespaces - while (strpos($input, " ") !== false) { - $input = str_replace(" ", " ", $input); - } - - // Remove newlines with ensuing spaces - while (strpos($input, PHP_EOL . " ") !== false) { - $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); - } - - // Remove double newlines - while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { - $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); - } - - $stableToRemove = [ - "Vous pouvez partager vos connaissances en l’améliorant (comment ?) selon les recommandations des projets correspondants.", - ]; - foreach ($stableToRemove as $tToRemove) $input = str_replace($tToRemove, "", $input); - - $endings = [ - "StubDenne artikel om et vandløb ", - ]; - foreach ($endings as $ending) { - if (strpos($input, $ending) !== false) $input = substr($input, 0, strpos($input, $ending)); - } - - $input = trim($input); - - // Cut off overly long articles - if (mb_strlen($input) > 600) { - if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { - $input = trim(substr($input, 0, strpos($input, PHP_EOL . 
PHP_EOL, 600))); - } - } - - if (empty($input)) return ''; - - $input = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $input)); - - $input = html_entity_decode($input); - - return MD_STD_IN::sanitize_text($input); - - } - - /** - * Wrapper around _cleanWikidataInput for testing. - * - * @param string $input Input string. - * - * @return string - */ - public static function cleanWikidataInput(string $input):string { - - if (PHP_SAPI !== 'cli') throw new Exception("Use this function only for testing"); - return self::_cleanWikidataInput($input); + return strtr( + trim(MD_STD_IN::sanitize_text($input)), + [ + PHP_EOL => PHP_EOL . PHP_EOL, + PHP_EOL . PHP_EOL . PHP_EOL => PHP_EOL . PHP_EOL, + ] + ); } @@ -815,25 +580,20 @@ final class NodaWikidataFetcher { $wikilink = $wikilinks[$lang]; if (!empty($contents[$lang])) { - $fromWikipedia = json_decode($contents[$lang], true)['parse']; - $titleFromWikipedia = $fromWikipedia['title']; - $descFromWiki = $fromWikipedia['text']['*']; - - # Process data retrieved from wikipedia - - if ($descFromWiki !== null) $tDescription = (string)$descFromWiki; - else $tDescription = ""; + $titleFromWikipedia = $data['sitelinks'][$lang . 'wiki']['title']; + $tDescription = self::_getCleanedWikipediaSnippet($lang, $titleFromWikipedia); } else { $tDescription = ""; } - if (!empty($titleFromWikipedia) && !empty($tDescription) && !empty($desc_cleaned = self::_cleanWikidataInput($tDescription))) { + if (!empty($titleFromWikipedia) && !empty($tDescription)) { + # $descs[$lang] = $tDescription; $output[$lang] = [ 'label' => $titleFromWikipedia, - 'description' => '"' . $desc_cleaned . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')', + 'description' => '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . 
')', 'link' => $wikilink, ]; } @@ -841,8 +601,8 @@ final class NodaWikidataFetcher { else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) { $output[$lang] = [ - 'label' => self::_cleanWikidataInput($data['labels'][$lang]['value']), - 'description' => self::_cleanWikidataInput($data['descriptions'][$lang]['value']), + 'label' => self::_cleanInputSimple($data['labels'][$lang]['value']), + 'description' => self::_cleanInputSimple($data['descriptions'][$lang]['value']), 'link' => "", ]; @@ -853,8 +613,8 @@ final class NodaWikidataFetcher { else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) { $output[$lang] = [ - 'label' => self::_cleanWikidataInput($data['labels'][$lang]['value']), - 'description' => self::_cleanWikidataInput($data['descriptions'][$lang]['value']), + 'label' => self::_cleanInputSimple($data['labels'][$lang]['value']), + 'description' => self::_cleanInputSimple($data['descriptions'][$lang]['value']), 'link' => "", ]; @@ -1070,6 +830,51 @@ final class NodaWikidataFetcher { } + /** + * Function for retrieving information. + * + * @param string $lang The user's selected used language. + * @param array $data Data fetched from wikidata. + * @param array $wikilinks Links to wikipedia APIs. 
+ * + * @return array{}|array{lang: string, desc: string, source: 'wikidata'|'wikipedia'} + */ + private static function _getDescriptionFromWikidataAndWikipediaLinks(string $lang, array $data, array $wikilinks):array { + + // Try the current user language for retrieving wikipedia texts + if (isset($wikilinks[$lang])) { + # Process data retrieved from wikipedia + if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) { + return ['lang' => $lang, 'desc' => $datafromwiki, 'source' => 'wikipedia']; + } + + } + + // Try the alternative languages for retrieving wikipedia texts; the title belongs to the $cur_lang wiki, so that wiki must be queried + foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { + + if ($lang === $cur_lang || !isset($wikilinks[$cur_lang])) continue; + + if ($datafromwiki = self::_getCleanedWikipediaSnippet($cur_lang, $wikilinks[$cur_lang]['title'])) { + return ['lang' => $cur_lang, 'desc' => $datafromwiki, 'source' => 'wikipedia']; + } + + } + + // If the description still has not been entered, try retrieving it from wikidata. + if (!empty($data['descriptions'][$lang])) { + return ['lang' => $lang, 'desc' => $data['descriptions'][$lang]['value'], 'source' => 'wikidata']; + } + else if (!empty($data['descriptions'])) { + $tLang = (string)array_keys($data['descriptions'])[0]; + $desc = $data['descriptions'][$tLang]; + return ['lang' => $tLang, 'desc' => (string)$desc['value'], 'source' => 'wikidata']; + } + + return []; + + } + /** * Function for retrieving information. 
* @@ -1087,24 +892,8 @@ final class NodaWikidataFetcher { // Get links to wikipedia $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data); - $alreadyEntered = false; - - if (isset($wikilinks[$lang])) { - # Process data retrieved from wikipedia - if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) { - $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, $lang, $erfasst_von); - } - - } - - foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { - - if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue; - - if ($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$cur_lang]['title'])) { - $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, "$cur_lang", $erfasst_von); - } - + if (!empty($desc = self::_getDescriptionFromWikidataAndWikipediaLinks($lang, $data, $wikilinks))) { + $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $desc['desc'], $lang, $desc['lang'], $erfasst_von); } $this->enterPersinstBirthDeathDatesFromWikidata($data, $persinst_id); @@ -1386,30 +1175,8 @@ final class NodaWikidataFetcher { } $cur_place_desc = $this->getPlaceDescription($onum); - $alreadyEntered = false; - - if (!empty($wikilinks[$lang])) { - - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000); - $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; - - if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $lang, $onum, $erfasst_von); - } - } - - foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { - - //if ($alreadyEntered === true) break; - if ($alreadyEntered === true) break; - if (!isset($wikilinks[$cur_lang]['url'])) continue; - - $datafromwiki = 
MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000); - $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; - if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $cur_lang, $onum, $erfasst_von); - } - + if (!empty($desc = self::_getDescriptionFromWikidataAndWikipediaLinks($lang, $data, $wikilinks))) { + $this->enterPlaceDescFromWikidata($cur_place_desc, $desc['desc'], $lang, $desc['lang'], $onum, $erfasst_von); } if (isset($data['claims']['P1566'])) $geonames_id = filter_var($data['claims']['P1566'][0]['mainsnak']['datavalue']['value'], FILTER_VALIDATE_INT); @@ -1611,32 +1378,8 @@ final class NodaWikidataFetcher { $wikilinks = self::_getWikipediaLinksFromWikidataOutput($data); - $alreadyEntered = false; - - if (isset($wikilinks[$lang])) { - - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000); - $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; - - # Process data retrieved from wikipedia - if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) { - $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $lang, $erfasst_von); - } - - } - - foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { - - if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue; - - $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000); - $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; - - # Process data retrieved from wikipedia - if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { - $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $cur_lang, $erfasst_von); - } - + if (!empty($desc = 
self::_getDescriptionFromWikidataAndWikipediaLinks($lang, $data, $wikilinks))) { + $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $desc['desc'], $lang, $desc['lang'], $erfasst_von); } if (!empty($nodaLinks = $this->_getNodaLinksFromWikidataResult('tag', $wikidata_id, $data))) { diff --git a/tests/NodaWikidataFetcherTest.php b/tests/NodaWikidataFetcherTest.php index 0eec33e..502428c 100644 --- a/tests/NodaWikidataFetcherTest.php +++ b/tests/NodaWikidataFetcherTest.php @@ -93,225 +93,6 @@ final class NodaWikidataFetcherTest extends TestCase { } - /** - * Test for cleaning wikidata info. - * - * @group ValidOutput - * - * @return void - */ - public function testCleanWikidataInput():void { - - $testStr = '"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Werbowez (Kossiw) -
Вербовець -
Wappen fehlt - - -
Werbowez (Kossiw) (Ukraine)
Werbowez (Kossiw) (Ukraine)
-
Werbowez (Kossiw)
-
Basisdaten -
Oblast:Oblast Iwano-Frankiwsk -
Rajon:Rajon Kossiw -
Höhe:369 m -
Fläche:18,77 km² -
Einwohner:3.395 (2001) -
Bevölkerungsdichte: -181 Einwohner je km² -
Postleitzahlen:78605 -
Vorwahl:+380 3478 -
Geographische Lage:48° 21′ N, 25° 8′ OKoordinaten: 48° 20′ 32″ N, 25° 8′ 0″ O -
KATOTTH: -UA26100010030094355 -
KOATUU: -2623682401 -
Verwaltungsgliederung: -1 Dorf -
Adresse: -вул. Миру, буд. 15
78605 с. Вербовець -
Website: -Offizielle Webseite -
Statistische Informationen -
- - -
Werbowez (Kossiw) (Oblast Iwano-Frankiwsk)
Werbowez (Kossiw) (Oblast Iwano-Frankiwsk)
-
Werbowez (Kossiw)
i1 -
-

Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).[1] -

-
Blick auf das Dorf
-

Das um 1650 erstmals schriftlich erwähnte Dorf[2] liegt im Osten der historischen Landschaft Galizien am Ufer der Rybnyzja (Рибниця), einem 56 km langen Nebenfluss des Pruth 7 km nordöstlich vom Rajonzentrum Kossiw und 95 km südlich vom Oblastzentrum Iwano-Frankiwsk. Südlich der Ortschaft verläuft die Territorialstraße T–09–09. -

Am 12. Juni 2020 wurde das Dorf ein Teil der neu gegründeten Stadtgemeinde Kossiw im Rajon Kossiw[3], bis dahin bildete es zusammen mit dem Dorf Staryj Kossiw (Старий Косів) die Landratsgemeinde Werbowez (Вербовецька сільська рада/Werbowezka silska rada) im Osten des Rajons. -

-
    -
  1. Ortswebseite auf der offiziellen Webpräsenz der Werchowna Rada; abgerufen am 14. November 2017 (ukrainisch) -
  2. -
  3. Ortsgeschichte Werbowez in der Geschichte der Städte und Dörfer der Ukrainischen SSR; abgerufen am 14. November 2017 (ukrainisch) -
  4. -
  5. Кабінет Міністрів України Розпорядження від 12 червня 2020 р. № 714-р "Про визначення адміністративних центрів та затвердження територій територіальних громад Івано-Франківської області" -
  6. -
- - -
" - (de.wikipedia.org 31.08.2023)'; - - $output = NodaWikidataFetcher::cleanWikidataInput($testStr); - $expected = 'Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).'; - self::assertTrue( - str_starts_with($output, $expected), - "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250) - ); - - $output = NodaWikidataFetcher::cleanWikidataInput('
-

坐标48°20′32″N 25°8′0″E / 48.34222°N 25.13333°E / 48.34222; 25.13333 -

韋爾博韋齊烏克蘭語Вербовець),是烏克蘭的村落,位於該國西部伊萬諾-弗蘭科夫斯克州,由科索夫區負責管轄,始建於1456年,面積18.77平方公里,2001年人口3,395。 -

- - -
'); - $expected = '韋爾博韋齊(烏克蘭語:Вербовець),是烏克蘭的村落,位於該國西部伊萬諾-弗蘭科夫斯克州,由科索夫區負責管轄,始建於1456年,面積18.77平方公里,2001年人口3,3'; - self::assertTrue( - str_starts_with($output, $expected), - "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250) - ); - - } - - /** - * Test for cleaning wikidata info. - * - * @group ValidOutput - * - * @return void - */ - public function testCleanWikidataInputWithoutHtml():void { - - $output = NodaWikidataFetcher::cleanWikidataInput('Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).[1]'); - $expected = 'Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).'; - self::assertTrue( - str_starts_with($output, $expected), - "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250) - ); - - } - /** * Data provider for an actor that has a wikidata link and a Telugu translation. *