From b318b5b4711da7d809d9a5854f8c0e7a57e120e6 Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Mon, 14 Nov 2022 00:51:56 +0100 Subject: [PATCH] Better modularize NodaWikidataFetcher's loading of translations --- src/NodaWikidataFetcher.php | 280 +++++++++++++----------------------- 1 file changed, 100 insertions(+), 180 deletions(-) diff --git a/src/NodaWikidataFetcher.php b/src/NodaWikidataFetcher.php index e4ccea3..f5b50fe 100644 --- a/src/NodaWikidataFetcher.php +++ b/src/NodaWikidataFetcher.php @@ -17,8 +17,8 @@ final class NodaWikidataFetcher { 'Accept: application/sparql-results+json', ]; - const LANGUAGES_MAIN_DESC = ['de', 'da', 'en', 'es', 'fr', 'hu', 'it', 'jp', 'nl', 'pt', 'ru', 'sv', 'zh']; - const LANGUAGES_TO_CHECK = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh']; + const LANGUAGES_MAIN_DESC = ['de', 'da', 'en', 'es', 'fr', 'hu', 'it', 'jp', 'nl', 'pt', 'ru', 'sv', 'uk', 'zh']; + const LANGUAGES_TO_CHECK = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh']; const LANGUAGES_TO_CAPITALIZE = ["cs", "da", "de", "en", "es", "fr", "fi", "id", "it", "nl", "pl", "pt", "ru", "sv", "tl", "tr"]; @@ -354,6 +354,76 @@ final class NodaWikidataFetcher { } + /** + * Loads translations from Wikipedia pages through wikidata and then merges + * them with Wikidata's own translations into a usable array. + * + * @param array $checkagainstLanguage The language to check against. + * @param array $data Data fetched from Wikidata. + * + * @return array + */ + public static function listTranslationsFromWikidataWikipedia(array $checkagainstLanguage, array $data):array { + + list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data); + if (empty($languagesToFetch)) { + return []; + } + + try { + $contents = MD_STD::runCurlMulti($languagesToFetch, 10000); + } + catch (TypeError $e) { + throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again."); + } + + $output = []; + + foreach ($checkagainstLanguage as $lang) { + + if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki']) && !empty($wikilinks[$lang])) { + + $wikilink = $wikilinks[$lang]; + if (!empty($contents[$lang])) { + + $descFromWiki = json_decode($contents[$lang], true)['parse']['text']['*']; + + # Process data retrieved from wikipedia + + if ($descFromWiki !== null) $tDescription = (string)$descFromWiki; + else $tDescription = ""; + + $tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')'; + + } + else { + $tDescription = ""; + } + + $output[$lang] = [ + 'label' => self::_cleanWikidataInput((string)$data['labels'][$lang]['value']), + 'description' => self::_cleanWikidataInput($tDescription), + 'link' => $wikilink, + ]; + + } + // echo '
Wikipedia Links fehlen'; + else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) { + + $output[$lang] = [ + 'label' => self::_cleanWikidataInput($data['labels'][$lang]['value']), + 'description' => self::_cleanWikidataInput($data['descriptions'][$lang]['value']), + 'link' => "", + ]; + + } + + } + + return $output; + + } + /** * Cleans contents parsed from Wikipedia. * @@ -875,68 +945,23 @@ final class NodaWikidataFetcher { */ public function getWikidataTranslationsForPersinst(array $data, int $persinst_id):void { - $checkagainstLanguage = self::LANGUAGES_TO_CHECK; - - list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data); - if (empty($languagesToFetch)) { + if (empty($translations = self::listTranslationsFromWikidataWikipedia(self::LANGUAGES_TO_CHECK, $data))) { return; } - try { - $contents = MD_STD::runCurlMulti($languagesToFetch, 10000); - } - catch (TypeError $e) { - throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again."); - } - $insertStmt = $this->_mysqli_noda->do_prepare("CALL nodaInsertPersinstTranslation(?, ?, ?, ?, ?)"); $this->_mysqli_noda->autocommit(false); - foreach ($checkagainstLanguage as $lang) { + foreach ($translations as $lang => $values) { - if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki'])) { - - $wikilink = $wikilinks[$lang]; - if (!empty($contents[$lang])) { - - $descFromWiki = $contents[$lang]; - $descFromWiki = json_decode($descFromWiki, true)['parse']['text']['*']; - - # Process data retrieved from wikipedia - - if ($descFromWiki !== null) $tDescription = self::_cleanWikidataInput((string)$descFromWiki); - else $tDescription = ""; - - if (substr($tDescription, -1) === chr(10)) $tDescription = substr($tDescription, 0, strlen($tDescription) - 1); - - $tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')'; - // Inhalt erster Absatz jeweilige Wikipedia: ' . $tDescription - // dies enthält den ersten Absatz der jeweiligen Wikipedia - - } - else { - $tDescription = ""; - } - - $tLang = self::_cleanWikidataInput((string)$data['labels'][$lang]['language']); - $tLabel = self::_cleanWikidataInput((string)$data['labels'][$lang]['value']); - - try { - $insertStmt->bind_param("issss", $persinst_id, $tLang, $tLabel, $tDescription, $wikilink); - $insertStmt->execute(); - } - catch (MDMysqliInvalidEncodingError $e) { - } - - } - // echo '
Wikipedia Links fehlen'; - else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) { - - $wikilink = ""; - $insertStmt->bind_param("issss", $persinst_id, $data['labels'][$lang]['language'], $data['labels'][$lang]['value'], $data['descriptions'][$lang]['value'], $wikilink); + try { + $insertStmt->bind_param("issss", $persinst_id, $lang, + $values['label'], $values['description'], $values['link']); $insertStmt->execute(); } + catch (MDMysqliInvalidEncodingError $e) { + } } @@ -944,7 +969,6 @@ final class NodaWikidataFetcher { $this->_mysqli_noda->autocommit(true); $insertStmt->close(); - unset($insertStmt); } @@ -1248,75 +1272,23 @@ final class NodaWikidataFetcher { */ public function getWikidataTranslationsForPlace(array $data, int $ort_id) { - $checkagainstLanguage = self::LANGUAGES_TO_CHECK; - - list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data); - if (empty($languagesToFetch)) { + if (empty($translations = self::listTranslationsFromWikidataWikipedia(self::LANGUAGES_TO_CHECK, $data))) { return; } - try { - $contents = MD_STD::runCurlMulti($languagesToFetch, 10000); - } - catch (TypeError $e) { - throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again."); - } - $insertStmt = $this->_mysqli_noda->do_prepare("CALL `nodaInsertOrtTranslation`(?, ?, ?, ?, ?)"); $this->_mysqli_noda->autocommit(false); - foreach ($checkagainstLanguage as $lang) { + foreach ($translations as $lang => $values) { - if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki'])) { - - $wikilink = $wikilinks[$lang]; - if (!empty($contents[$lang])) { - - $descFromWiki = $contents[$lang]; - - if (!($wikiDataDecoded = json_decode($descFromWiki, true))) { - continue; - } - $tLabel = $wikiDataDecoded['parse']['title']; - $descFromWiki = $wikiDataDecoded['parse']['text']['*']; - - # Process data retrieved from wikipedia - if (empty($descFromWiki)) $tDescription = ""; - else { - - $tDescription = self::_cleanWikidataInput((string)$descFromWiki); - - if (substr($tDescription, -1) === chr(10)) $tDescription = substr($tDescription, 0, strlen($tDescription) - 1); - $tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')'; - $tDescription = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $tDescription)); - // echo '
Inhalt erster Absatz jeweilige Wikipedia: ' . $tDescription; // dies enthält den ersten Absatz der jeweiligen Wikipedia - - } - - } - else { - $tDescription = ""; - } - - $tLang = self::_cleanWikidataInput((string)$data['labels'][$lang]['language']); - if (empty($tLabel)) $tLabel = self::_cleanWikidataInput((string)$data['labels'][$lang]['value']); - - try { - $insertStmt->bind_param("issss", $ort_id, $tLang, $tLabel, $tDescription, $wikilink); - $insertStmt->execute(); - } - catch (MDMysqliInvalidEncodingError $e) { - $_SESSION["editHistory"] = ["changesStored", "Error adding translation for language $tLang"]; - } - - } - else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) { - - $wikilink = ""; - $insertStmt->bind_param("issss", $ort_id, $data['labels'][$lang]['language'], $data['labels'][$lang]['value'], $data['descriptions'][$lang]['value'], $wikilink); + try { + $insertStmt->bind_param("issss", $ort_id, $lang, + $values['label'], $values['description'], $values['link']); $insertStmt->execute(); } + catch (MDMysqliInvalidEncodingError $e) { + } } @@ -1573,84 +1545,32 @@ final class NodaWikidataFetcher { */ public function getWikidataTranslationsForTag(array $data, int $tag_id) { - $checkagainstLanguage = self::LANGUAGES_TO_CHECK; - - list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data); - if (empty($languagesToFetch)) { + if (empty($translations = self::listTranslationsFromWikidataWikipedia(self::LANGUAGES_TO_CHECK, $data))) { return; } - try { - $contents = MD_STD::runCurlMulti($languagesToFetch, 10000); - } - catch (TypeError $e) { - throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again."); - } - $insertStmt = $this->_mysqli_noda->do_prepare("CALL nodaInsertTagTranslation(?, ?, ?, ?, ?)"); $this->_mysqli_noda->autocommit(false); - foreach ($checkagainstLanguage as $lang) { - - if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki'])) { - - $wikilink = $wikilinks[$lang]; - if (!empty($contents[$lang])) { - - $descFromWiki = $contents[$lang]; - $descFromWiki = json_decode($descFromWiki, true)['parse']['text']['*']; - - if (!empty($descFromWiki)) { - - # Process data retrieved from wikipedia - $tDescription = self::_cleanWikidataInput((string)$descFromWiki); - - if (substr($tDescription, -1) === chr(10)) { - $tDescription = substr($tDescription, 0, strlen($tDescription) - 1); - } - - $tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')'; - $tDescription = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $tDescription)); - - } - else { - $tDescription = ""; - } - - } - else { - $tDescription = ""; - } - - $tLang = self::_cleanWikidataInput((string)$data['labels'][$lang]['language']); - $tLabel = self::_cleanWikidataInput((string)$data['labels'][$lang]['value']); - - if (in_array($tLang, self::LANGUAGES_TO_CAPITALIZE, true)) { - $tLabel = ucfirst(trim($tLabel)); - $tDescription = ucfirst(trim($tDescription)); - } - - try { - $insertStmt->bind_param("issss", $tag_id, $tLang, $tLabel, $tDescription, $wikilink); - $insertStmt->execute(); - } - catch (MDMysqliInvalidEncodingError $e) { - } + foreach ($translations as $lang => $values) { + if (in_array($lang, self::LANGUAGES_TO_CAPITALIZE, true)) { + $label = ucfirst($values['label']); + $description = ucfirst($values['description']); + } + else { + $label = $values['label']; + $description = $values['description']; } - else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) { - $wikilink = ""; - - if (in_array($lang, self::LANGUAGES_TO_CAPITALIZE, true)) { - $data['labels'][$lang]['value'] = ucfirst(trim($data['labels'][$lang]['value'])); - $data['descriptions'][$lang]['value'] = ucfirst(trim($data['descriptions'][$lang]['value'])); - } - - $insertStmt->bind_param("issss", $tag_id, $data['labels'][$lang]['language'], $data['labels'][$lang]['value'], $data['descriptions'][$lang]['value'], $wikilink); + try { + $insertStmt->bind_param("issss", $tag_id, $lang, + $label, $description, $values['link']); $insertStmt->execute(); } + catch (MDMysqliInvalidEncodingError $e) { + } }