diff --git a/src/NodaWikidataFetcher.php b/src/NodaWikidataFetcher.php index 504a647..d90179e 100644 --- a/src/NodaWikidataFetcher.php +++ b/src/NodaWikidataFetcher.php @@ -98,6 +98,256 @@ final class NodaWikidataFetcher { /** @var MDMysqli */ private MDMysqli $_mysqli_noda; + /** + * Returns the API link to Wikipedia's API for getting information on a page. + * + * @param string $lang Language / wikipedia version to fetch. + * @param string $searchTerm Search term. + * + * @return non-empty-string + */ + private static function _getWikipediaApiLink(string $lang, string $searchTerm):string { + + return "https://" . urlencode($lang) . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($searchTerm) . "&prop=text&section=0&format=json"; + + } + + /** + * Cleans basic tags off Wikidata input. + * + * @param string $input Input string. + * + * @return string + */ + private static function _cleanWikidataInputHtml(string $input):string { + + // Clean off anything before first
+ if ($pStartPos = strpos($input, '
')) {
+ $input = substr($input, 0, $pEndPos + 4);
+ }
+
+ $doc = new DOMDocument();
+ try {
+ $doc->loadXML('
"); + if ($currentSearchPos !== false && $currentSearchPos < $firstParagraphPosition) { + if (($tableEndPos = strpos($input, "")) !== false) { + if (($pStartPos = strpos($input, '
", "', '
' . PHP_EOL . PHP_EOL . PHP_EOL, $input); + # $input = str_replace('?/i', '', $input); + $input = strip_tags($input); + + # for ($i = 150; $i < 1000; $i++) $input = str_replace("$i;", " ", $input); + $i = 0; + while (strpos($input, ".mw-parser-output") !== false and strpos($input, "}", strpos($input, ".mw-parser-output")) !== false) { + $part1 = substr($input, 0, strpos($input, ".mw-parser-output")); + $part2 = substr($input, strpos($input, "}", strpos($input, ".mw-parser-output")) + 1); + $input = $part1 . $part2; + ++$i; + if ($i === 30) break; + } + + $input = self::_cleanSourceBracketsOffTranslation($input); + + $input = str_replace("\t", " ", $input); + + // Remove double whitespaces + while (strpos($input, " ") !== false) { + $input = str_replace(" ", " ", $input); + } + + // Remove newlines with ensuing spaces + while (strpos($input, PHP_EOL . " ") !== false) { + $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); + } + + // Remove double newlines + while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { + $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); + } + + $stableToRemove = [ + "Vous pouvez partager vos connaissances en l’améliorant (comment ?) selon les recommandations des projets correspondants.", + ]; + foreach ($stableToRemove as $tToRemove) $input = str_replace($tToRemove, "", $input); + + $endings = [ + "StubDenne artikel om et vandløb ", + ]; + foreach ($endings as $ending) { + if (strpos($input, $ending) !== false) $input = substr($input, 0, strpos($input, $ending)); + } + + $input = trim($input); + + // Cut off overly long articles + if (mb_strlen($input) > 600) { + if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { + $input = trim(substr($input, 0, strpos($input, PHP_EOL . 
PHP_EOL, 600))); + } + } + + if (empty($input)) return ''; + + $input = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $input)); + + $input = html_entity_decode($input); + + return MD_STD_IN::sanitize_text($input); + + } + + /** + * Wrapper around _cleanWikidataInput for testing. + * + * @param string $input Input string. + * + * @return string + */ + public static function cleanWikidataInput(string $input):string { + + if (PHP_SAPI !== 'cli') throw new Exception("Use this function only for testing"); + return self::_cleanWikidataInput($input); + + } + /** * Sets the retrieval mode. * @@ -343,7 +593,7 @@ final class NodaWikidataFetcher { if (isset($wikilink)) { - $languagesToFetch[$lang] = "https://" . $lang . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm) . "&prop=text&section=0&format=json"; + $languagesToFetch[$lang] = self::_getWikipediaApiLink($lang, $wikilinkterm); $wikilinks[$lang] = $wikilink; } @@ -379,6 +629,7 @@ final class NodaWikidataFetcher { $output = []; + $descs = []; foreach ($checkagainstLanguage as $lang) { if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki']) && !empty($wikilinks[$lang])) { @@ -393,18 +644,22 @@ final class NodaWikidataFetcher { if ($descFromWiki !== null) $tDescription = (string)$descFromWiki; else $tDescription = ""; - $tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')'; - } else { $tDescription = ""; } - $output[$lang] = [ - 'label' => self::_cleanWikidataInput((string)$data['labels'][$lang]['value']), - 'description' => self::_cleanWikidataInput($tDescription), - 'link' => $wikilink, - ]; + if ($tDescription !== '') { + $descs[$lang] = $tDescription; + $desc_cleaned = self::_cleanWikidataInput($tDescription); + if ($desc_cleaned !== '') { + $output[$lang] = [ + 'label' => self::_cleanWikidataInput((string)$data['labels'][$lang]['value']), + 'description' => '"' . 
$desc_cleaned . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')', + 'link' => $wikilink, + ]; + } + } } // echo ''); - if ($first_mention_of_paragraph !== false) $input = substr($input, $first_mention_of_paragraph, (strrpos($input, '
') ?: strlen($input)) - $first_mention_of_paragraph); - - // Remove infobox tables specifically - $removeFirstParagraph = false; - if (empty($input)) return ""; - $firstParagraphPosition = strpos($input, '"); - if ($currentSearchPos !== false && $currentSearchPos < $firstParagraphPosition) { - if (($tableEndPos = strpos($input, "")) !== false) { - if (($pStartPos = strpos($input, '
", "', '
' . PHP_EOL . PHP_EOL . PHP_EOL, $input); - # $input = str_replace('?/i', '', $input); - $input = strip_tags($input); - - # for ($i = 150; $i < 1000; $i++) $input = str_replace("$i;", " ", $input); - $i = 0; - while (strpos($input, ".mw-parser-output") !== false and strpos($input, "}", strpos($input, ".mw-parser-output")) !== false) { - $part1 = substr($input, 0, strpos($input, ".mw-parser-output")); - $part2 = substr($input, strpos($input, "}", strpos($input, ".mw-parser-output")) + 1); - $input = $part1 . $part2; - ++$i; - if ($i === 30) break; - } - - $bracketsToRemove = []; - for ($i = 0; $i < 100; $i++) { - $bracketsToRemove["[$i]"] = ""; - } - $input = strtr($input, $bracketsToRemove); - - $input = str_replace("\t", " ", $input); - - // Remove double whitespaces - while (strpos($input, " ") !== false) { - $input = str_replace(" ", " ", $input); - } - - // Remove newlines with ensuing spaces - while (strpos($input, PHP_EOL . " ") !== false) { - $input = str_replace(PHP_EOL . " ", PHP_EOL, $input); - } - - // Remove double newlines - while (strpos($input, PHP_EOL . PHP_EOL . PHP_EOL) !== false) { - $input = str_replace(PHP_EOL . PHP_EOL . PHP_EOL, PHP_EOL . PHP_EOL, $input); - } - - $stableToRemove = [ - "Vous pouvez partager vos connaissances en l’améliorant (comment ?) selon les recommandations des projets correspondants.", - ]; - foreach ($stableToRemove as $tToRemove) $input = str_replace($tToRemove, "", $input); - - $endings = [ - "StubDenne artikel om et vandløb ", - ]; - foreach ($endings as $ending) { - if (strpos($input, $ending) !== false) $input = substr($input, 0, strpos($input, $ending)); - } - - $input = trim($input); - - // Cut off overly long articles - if (mb_strlen($input) > 600) { - if (strpos($input, PHP_EOL . PHP_EOL, 600) !== false) { - $input = trim(substr($input, 0, strpos($input, PHP_EOL . 
PHP_EOL, 600))); - } - } - - if (empty($input)) return ''; - - $input = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $input)); - - $input = html_entity_decode($input); - - return $input; - - } - /** * Function for fetching description from Wikipedia * @@ -855,7 +921,7 @@ final class NodaWikidataFetcher { if (isset($wikilink[$lang]) and isset($wikilinkterm[$lang]) and is_string($wikilinkterm[$lang])) { - $datafromwiki = MD_STD::runCurl("https://" . $lang . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode($wikilinkterm[$lang]) . "&prop=text&section=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($lang, $wikilinkterm[$lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia @@ -865,17 +931,17 @@ final class NodaWikidataFetcher { } - foreach (self::LANGUAGES_MAIN_DESC as $sprache) { + foreach (self::LANGUAGES_MAIN_DESC as $cur_lang) { if ($alreadyEntered === true) break; - if (!isset($wikilink[$sprache]) || !isset($wikilinkterm[$sprache]) || !is_string($wikilinkterm[$sprache])) continue; + if (!isset($wikilink[$cur_lang]) || !isset($wikilinkterm[$cur_lang]) || !is_string($wikilinkterm[$cur_lang])) continue; - $datafromwiki = MD_STD::runCurl("https://" . $sprache . ".wikipedia.org/w/api.php?action=parse&page=" . urlencode((string)$wikilinkterm[$sprache]) . 
"&prop=text&section=0&format=json", 10000); + $datafromwiki = MD_STD::runCurl(self::_getWikipediaApiLink($cur_lang, $wikilinkterm[$cur_lang]), 10000); $datafromwiki = json_decode($datafromwiki, true)['parse']['text']['*']; # Process data retrieved from wikipedia if ($datafromwiki = self::_cleanWikidataInput((string)$datafromwiki)) { - $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$sprache], $lang, "$sprache", $erfasst_von); + $alreadyEntered = $this->retrievePersinstDescFromWikipedia($persinst_id, $wikidata_id, $datafromwiki, $wikilink[$cur_lang], $lang, "$cur_lang", $erfasst_von); } } @@ -940,33 +1006,54 @@ final class NodaWikidataFetcher { } + /** + * Gets the current description of a place. + * + * @param integer $onum Place ID. + * + * @return string + */ + private function getPlaceDescription(int $onum):string { + + $currentPlaceResult = $this->_mysqli_noda->query_by_stmt("SELECT `ort_anmerkung` + FROM `orte` + WHERE `ort_id` = ?", "i", $onum); + + if (!($curPlaceInfo = $currentPlaceResult->fetch_row())) { + $currentPlaceResult->close(); + throw new Exception("This place does not exist"); + } + $currentPlaceResult->close(); + + return $curPlaceInfo[0]; + + } + /** * Function for entering base information about a place from wikidata. * - * @param mysqli_result $currentPlaceResult Mysqli result pointing to the current place. - * @param string $datafromwiki Data parsed from wikidata. - * @param arrayThere is already an entry for description ...
' . nl2br($curPlaceInfo['ort_anmerkung']) . '
+' . nl2br($cur_place_desc) . '
Werbowez (Kossiw) + | ||
Вербовець + | ||
+ | +||
Basisdaten + | ||
---|---|---|
Oblast: | +Oblast Iwano-Frankiwsk + | |
Rajon: | +Rajon Kossiw + | |
Höhe: | +369 m + | |
Fläche: | +18,77 km² + | |
Einwohner: | +3.395 (2001) + | |
Bevölkerungsdichte: + | +181 Einwohner je km² + | |
Postleitzahlen: | +78605 + | |
Vorwahl: | ++380 3478 + | |
Geographische Lage: | +48° 21′ N, 25° 8′ O + | |
KATOTTH: + | +UA26100010030094355 + | |
KOATUU: + | +2623682401 + | |
Verwaltungsgliederung: + | +1 Dorf + | |
Adresse: + | +вул. Миру, буд. 15 78605 с. Вербовець + | |
Website: + | +Offizielle Webseite + | |
Statistische Informationen + | ||
+ |
Werbowez (ukrainisch Вербовець; russisch Вербовец, polnisch Wierzbowiec; rumänisch Verboveț) ist ein Dorf in der ukrainischen Oblast Iwano-Frankiwsk mit etwa 3400 Einwohnern (2001).[1] +
+ +Das um 1650 erstmals schriftlich erwähnte Dorf[2] liegt im Osten der historischen Landschaft Galizien am Ufer der Rybnyzja (Рибниця), einem 56 km langen Nebenfluss des Pruth 7 km nordöstlich vom Rajonzentrum Kossiw und 95 km südlich vom Oblastzentrum Iwano-Frankiwsk. Südlich der Ortschaft verläuft die Territorialstraße T–09–09. +
Am 12. Juni 2020 wurde das Dorf ein Teil der neu gegründeten Stadtgemeinde Kossiw im Rajon Kossiw[3], bis dahin bildete es zusammen mit dem Dorf Staryj Kossiw (Старий Косів) die Landratsgemeinde Werbowez (Вербовецька сільська рада/Werbowezka silska rada) im Osten des Rajons. +
+坐标:48°20′32″N 25°8′0″E / 48.34222°N 25.13333°E +
韋爾博韋齊(烏克蘭語:Вербовець),是烏克蘭的村落,位於該國西部伊萬諾-弗蘭科夫斯克州,由科索夫區負責管轄,始建於1456年,面積18.77平方公里,2001年人口3,395。 +
+ + +