- '
- '
- if ($pStartPos = strpos($input, '')) {
- $input = substr($input, 0, $pEndPos + 4);
- }
- $doc = new DOMDocument();
- try {
- libxml_use_internal_errors(true);
- $doc->loadXML('');
- libxml_use_internal_errors(false);
- }
- catch (Exception $e) {
- throw new Exception("Failed to load DOMDocument." . PHP_EOL . $e->getMessage() . PHP_EOL . PHP_EOL . '---' . $input . '---');
- }
- $list = $doc->getElementsByTagName("style");
- while ($list->length > 0) {
- $p = $list->item(0);
- if ($p === null || $p->parentNode === null) break;
- $p->parentNode->removeChild($p);
- }
- $list = $doc->getElementsByTagName("table");
- while ($list->length > 0) {
- $p = $list->item(0);
- if ($p === null || $p->parentNode === null) break;
- $p->parentNode->removeChild($p);
- }
- $list = $doc->getElementsByTagName("ol");
- while ($list->length > 0) {
- $p = $list->item(0);
- if ($p === null || $p->parentNode === null) break;
- $p->parentNode->removeChild($p);
- }
- if (($firstP = $doc->getElementsByTagName("p")->item(0)) !== null) {
- if (($firstPhtml = $doc->saveHTML($firstP)) !== false) {
- if (strpos($firstPhtml, 'geohack') !== false) {
- if ($firstP->parentNode !== null) $firstP->parentNode->removeChild($firstP);
- }
- }
- }
- $output = [];
- foreach ($doc->getElementsByTagName("p") as $p) {
- $output[] = trim($p->textContent);
- }
- /*
- if (strpos($doc->saveHTML(), 'Coordinates:') !== false) {
- echo $doc->saveHTML();
- exit;
- }
- */
- return str_replace(PHP_EOL, PHP_EOL . PHP_EOL, trim(implode(PHP_EOL, $output)));
- }
- /**
- * Cleans brackets ([1], [2]) off description text.
- *
- * @param string $input Input string.
- *
- * @return string
- */
- private static function _cleanSourceBracketsOffTranslation(string $input):string {
- $bracketsToRemove = [];
- for ($i = 0; $i < 100; $i++) {
- $bracketsToRemove["[$i]"] = "";
- }
- return strtr($input, $bracketsToRemove);
- }
- /**
- * Cleans contents parsed from Wikipedia.
- *
- * @param string $input Input string.
- *
- * @return string
- */
- private static function _cleanWikidataInput(string $input):string {
- $input = trim($input, '"');
- foreach (self::WIKIPEDIA_REMOVE_LITERALS as $tToRemove) $input = str_replace($tToRemove, "", $input);
- if (substr($input, 0, strlen('<')) === '<') {
- $input = self::_cleanWikidataInputHtml($input);
- if (mb_strlen($input) > 600) {
- if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) {
- $input = substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600));
- }
- }
- $input = self::_cleanSourceBracketsOffTranslation($input);
- $input = str_replace("\t", " ", $input);
- // Remove newlines with ensuing spaces
- while (strpos($input, PHP_EOL . " ") !== false) {
- $input = str_replace(PHP_EOL . " ", PHP_EOL, $input);
- }
- // Remove double newlines
- while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) {
- $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input);
- }
- return MD_STD_IN::sanitize_text($input);
- }
- $input = str_replace(PHP_EOL, '', $input);
- if (empty($input)) return "";
- // Remove infobox tables specifically
- $firstParagraphPosition = strpos($input, '
- if ($currentSearchPos !== false && $currentSearchPos < $firstParagraphPosition) {
- if (($tableEndPos = strpos($input, "")) !== false) {
- if (($pStartPos = strpos($input, '
", "
', '
' . PHP_EOL . PHP_EOL . PHP_EOL, $input);
- # $input = str_replace('?/i', '', $input);
- $input = strip_tags($input);
- # for ($i = 150; $i < 1000; $i++) $input = str_replace("$i;", " ", $input);
- $i = 0;
- while (strpos($input, ".mw-parser-output") !== false and strpos($input, "}", strpos($input, ".mw-parser-output")) !== false) {
- $part1 = substr($input, 0, strpos($input, ".mw-parser-output"));
- $part2 = substr($input, strpos($input, "}", strpos($input, ".mw-parser-output")) + 1);
- $input = $part1 . $part2;
- ++$i;
- if ($i === 30) break;
- }
- $input = self::_cleanSourceBracketsOffTranslation($input);
- $input = str_replace("\t", " ", $input);
- // Remove double whitespaces
- while (strpos($input, " ") !== false) {
- $input = str_replace(" ", " ", $input);
- }
- // Remove newlines with ensuing spaces
- while (strpos($input, PHP_EOL . " ") !== false) {
- $input = str_replace(PHP_EOL . " ", PHP_EOL, $input);
- }
- // Remove double newlines
- while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) {
- $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input);
- }
- $stableToRemove = [
- "Vous pouvez partager vos connaissances en l’améliorant (comment ?) selon les recommandations des projets correspondants.",
- ];
- foreach ($stableToRemove as $tToRemove) $input = str_replace($tToRemove, "", $input);
- $endings = [
- "StubDenne artikel om et vandløb ",
- ];
- foreach ($endings as $ending) {
- if (strpos($input, $ending) !== false) $input = substr($input, 0, strpos($input, $ending));
- }
- $input = trim($input);
- // Cut off overly long articles
- if (mb_strlen($input) > 600) {
- if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) {
- $input = trim(substr($input, 0, strpos($input, PHP_EOL . PHP_EOL, 600)));
- }
- }
- if (empty($input)) return '';
- $input = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $input));
- $input = html_entity_decode($input);
- return MD_STD_IN::sanitize_text($input);
- }
- /**
- * Wrapper around _cleanWikidataInput for testing.
- *
- * @param string $input Input string.
- *
- * @return string
- */
- public static function cleanWikidataInput(string $input):string {
- if (PHP_SAPI !== 'cli') throw new Exception("Use this function only for testing");
- return self::_cleanWikidataInput($input);
+ return strtr(
+ trim(MD_STD_IN::sanitize_text($input)),
+ [
+ ]
+ );
@@ -815,25 +580,20 @@ final class NodaWikidataFetcher {
$wikilink = $wikilinks[$lang];
if (!empty($contents[$lang])) {
- $fromWikipedia = json_decode($contents[$lang], true)['parse'];
- $titleFromWikipedia = $fromWikipedia['title'];
- $descFromWiki = $fromWikipedia['text']['*'];
- # Process data retrieved from wikipedia
- if ($descFromWiki !== null) $tDescription = (string)$descFromWiki;
- else $tDescription = "";
+ $titleFromWikipedia = $data['sitelinks'][$lang . 'wiki']['title'];
+ $tDescription = self::_getCleanedWikipediaSnippet($lang, $titleFromWikipedia);
else {
$tDescription = "";
- if (!empty($titleFromWikipedia) && !empty($tDescription) && !empty($desc_cleaned = self::_cleanWikidataInput($tDescription))) {
+ if (!empty($titleFromWikipedia) && !empty($tDescription)) {
# $descs[$lang] = $tDescription;
$output[$lang] = [
'label' => $titleFromWikipedia,
- 'description' => '"' . $desc_cleaned . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')',
+ 'description' => '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')',
'link' => $wikilink,
@@ -841,8 +601,8 @@ final class NodaWikidataFetcher {
else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) {
$output[$lang] = [
- 'label' => self::_cleanWikidataInput($data['labels'][$lang]['value']),
- 'description' => self::_cleanWikidataInput($data['descriptions'][$lang]['value']),
+ 'label' => self::_cleanInputSimple($data['labels'][$lang]['value']),
+ 'description' => self::_cleanInputSimple($data['descriptions'][$lang]['value']),
'link' => "",
@@ -853,8 +613,8 @@ final class NodaWikidataFetcher {
else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) {
$output[$lang] = [
- 'label' => self::_cleanWikidataInput($data['labels'][$lang]['value']),
- 'description' => self::_cleanWikidataInput($data['descriptions'][$lang]['value']),
+ 'label' => self::_cleanInputSimple($data['labels'][$lang]['value']),
+ 'description' => self::_cleanInputSimple($data['descriptions'][$lang]['value']),
'link' => "",
@@ -1070,6 +830,51 @@ final class NodaWikidataFetcher {
+ /**
+ * Picks a description: Wikipedia snippet in $lang, then in a fallback language, then a Wikidata description.
+ *
+ * @param string $lang The user's selected language; it is tried first for each source.
+ * @param array $data Data fetched from wikidata; only its 'descriptions' entry is read here.
+ * @param array $wikilinks Wikipedia links keyed by language code, each with a 'title' entry.
+ *
+ * @return array{}|array{lang: string, desc: string, source: 'wikidata'|'wikipedia'}
+ */
+ private static function _getDescriptionFromWikidataAndWikipediaLinks(string $lang, array $data, array $wikilinks):array {
+ // Try the current user language for retrieving wikipedia texts
+ if (isset($wikilinks[$lang])) {
+ # Process data retrieved from wikipedia
+ if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) {
+ return ['lang' => $lang, 'desc' => $datafromwiki, 'source' => 'wikipedia'];
+ }
+ }
+ // Try the alternative languages; the title belongs to $cur_lang's wikipedia, so that wiki is queried
+ foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
+ if ($lang === $cur_lang || !isset($wikilinks[$cur_lang])) continue;
+ if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($cur_lang, $wikilinks[$cur_lang]['title']))) {
+ return ['lang' => $cur_lang, 'desc' => $datafromwiki, 'source' => 'wikipedia'];
+ }
+ }
+ // If no wikipedia text could be retrieved, fall back to the description from wikidata.
+ if (!empty($data['descriptions'][$lang])) {
+ return ['lang' => $lang, 'desc' => $data['descriptions'][$lang]['value'], 'source' => 'wikidata'];
+ }
+ else if (!empty($data['descriptions'])) {
+ $tLang = (string)array_keys($data['descriptions'])[0];
+ $desc = $data['descriptions'][$tLang];
+ return ['lang' => $tLang, 'desc' => (string)$desc['value'], 'source' => 'wikidata'];
+ }
+ return [];
+ }
* Function for retrieving information.
@@ -1087,24 +892,8 @@ final class NodaWikidataFetcher {
// Get links to wikipedia
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
- $alreadyEntered = false;
- if (isset($wikilinks[$lang])) {
- # Process data retrieved from wikipedia
- if (!empty($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$lang]['title']))) {
- $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, $lang, $erfasst_von);
- }
- }
- foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
- if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
- if ($datafromwiki = self::_getCleanedWikipediaSnippet($lang, $wikilinks[$cur_lang]['title'])) {
- $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $lang, "$cur_lang", $erfasst_von);
- }
+ if (!empty($desc = self::_getDescriptionFromWikidataAndWikipediaLinks($lang, $data, $wikilinks))) {
+ $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $desc['desc'], $lang, $desc['lang'], $erfasst_von);
$this->enterPersinstBirthDeathDatesFromWikidata($data, $persinst_id);
@@ -1386,30 +1175,8 @@ final class NodaWikidataFetcher {
$cur_place_desc = $this->getPlaceDescription($onum);
- $alreadyEntered = false;
- if (!empty($wikilinks[$lang])) {
- $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
- $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
- if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
- $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $lang, $onum, $erfasst_von);
- }
- }
- foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
- //if ($alreadyEntered === true) break;
- if ($alreadyEntered === true) break;
- if (!isset($wikilinks[$cur_lang]['url'])) continue;
- $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
- $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
- if (!empty($datafromwiki) and !empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
- $alreadyEntered = $this->enterPlaceDescFromWikidata($cur_place_desc, $datafromwiki, $lang, $cur_lang, $onum, $erfasst_von);
- }
+ if (!empty($desc = self::_getDescriptionFromWikidataAndWikipediaLinks($lang, $data, $wikilinks))) {
+ $this->enterPlaceDescFromWikidata($cur_place_desc, $desc['desc'], $lang, $desc['lang'], $onum, $erfasst_von);
if (isset($data['claims']['P1566'])) $geonames_id = filter_var($data['claims']['P1566'][0]['mainsnak']['datavalue']['value'], FILTER_VALIDATE_INT);
@@ -1611,32 +1378,8 @@ final class NodaWikidataFetcher {
$wikilinks = self::_getWikipediaLinksFromWikidataOutput($data);
- $alreadyEntered = false;
- if (isset($wikilinks[$lang])) {
- $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinks[$lang]['title']), 10000);
- $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
- # Process data retrieved from wikipedia
- if (!empty($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki))) {
- $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $lang, $erfasst_von);
- }
- }
- foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) {
- if ($alreadyEntered === true || !isset($wikilinks[$cur_lang])) continue;
- $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinks[$cur_lang]['title']), 10000);
- $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*'];
- # Process data retrieved from wikipedia
- if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) {
- $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $datafromwiki, $lang, $cur_lang, $erfasst_von);
- }
+ if (!empty($desc = self::_getDescriptionFromWikidataAndWikipediaLinks($lang, $data, $wikilinks))) {
+ $alreadyEntered = $this->retrieveTagDescFromWikipedia($tag_id, $desc['desc'], $lang, $desc['lang'], $erfasst_von);
if (!empty($nodaLinks = $this->_getNodaLinksFromWikidataResult('tag', $wikidata_id, $data))) {
diff --git a/tests/NodaWikidataFetcherTest.php b/tests/NodaWikidataFetcherTest.php
index 0eec33e..502428c 100644
--- a/tests/NodaWikidataFetcherTest.php
+++ b/tests/NodaWikidataFetcherTest.php
@@ -93,225 +93,6 @@ final class NodaWikidataFetcherTest extends TestCase {
- /**
- * Test for cleaning wikidata info.
- *
- * @group ValidOutput
- *
- * @return void
- */
- public function testCleanWikidataInput():void {
- $testStr = '"" - (de.wikipedia.org 31.08.2023)';
- $output = NodaWikidataFetcher::cleanWikidataInput($testStr);
- $expected = 'Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).';
- self::assertTrue(
- str_starts_with($output, $expected),
- "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250)
- );
- $output = NodaWikidataFetcher::cleanWikidataInput('');
- $expected = '韋爾博韋齊(烏克蘭語:Вербовець),是烏克蘭的村落,位於該國西部伊萬諾-弗蘭科夫斯克州,由科索夫區負責管轄,始建於1456年,面積18.77平方公里,2001年人口3,3';
- self::assertTrue(
- str_starts_with($output, $expected),
- "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250)
- );
- }
- /**
- * Test for cleaning wikidata info.
- *
- * @group ValidOutput
- *
- * @return void
- */
- public function testCleanWikidataInputWithoutHtml():void {
- $output = NodaWikidataFetcher::cleanWikidataInput('Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).[1]');
- $expected = 'Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).';
- self::assertTrue(
- str_starts_with($output, $expected),
- "Start of parsed Wikipedia text should be:" . PHP_EOL . PHP_EOL . $expected . PHP_EOL . PHP_EOL . 'Real start text is: ' . PHP_EOL . PHP_EOL . substr($output, 0, 250)
- );
- }
* Data provider for an actor that has a wikidata link and a Telugu translation.