Better modularize NodaWikidataFetcher's loading of translations

This commit is contained in:
Joshua Ramon Enslin 2022-11-14 00:51:56 +01:00
parent 511304b6f2
commit b318b5b471
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE

View File

@ -17,8 +17,8 @@ final class NodaWikidataFetcher {
'Accept: application/sparql-results+json', 'Accept: application/sparql-results+json',
]; ];
const LANGUAGES_MAIN_DESC = ['de', 'da', 'en', 'es', 'fr', 'hu', 'it', 'jp', 'nl', 'pt', 'ru', 'sv', 'zh']; const LANGUAGES_MAIN_DESC = ['de', 'da', 'en', 'es', 'fr', 'hu', 'it', 'jp', 'nl', 'pt', 'ru', 'sv', 'uk', 'zh'];
const LANGUAGES_TO_CHECK = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh']; const LANGUAGES_TO_CHECK = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh'];
const LANGUAGES_TO_CAPITALIZE = ["cs", "da", "de", "en", "es", "fr", "fi", "id", "it", "nl", "pl", "pt", "ru", "sv", "tl", "tr"]; const LANGUAGES_TO_CAPITALIZE = ["cs", "da", "de", "en", "es", "fr", "fi", "id", "it", "nl", "pl", "pt", "ru", "sv", "tl", "tr"];
@ -354,6 +354,76 @@ final class NodaWikidataFetcher {
} }
/**
 * Loads translations from Wikipedia pages through wikidata and then merges
 * them with Wikidata's own translations into a usable array.
 *
 * For every requested language, one of two sources is used:
 * - If a Wikipedia page is available for the language (a fetch target, a
 *   sitelink and a wiki link all exist), the label is taken from Wikidata and
 *   the description is the fetched Wikipedia text, quoted and followed by a
 *   source note ("xx.wikipedia.org" and the current date).
 * - Otherwise, if Wikidata itself provides both a label and a description for
 *   the language, those are used and the link is left empty.
 * Languages matching neither case are omitted from the result.
 *
 * @param array<string> $checkagainstLanguage Language codes to check.
 * @param array<mixed>  $data                 Data fetched from Wikidata.
 *
 * @throws MDExpectedException If running the requests fails with a TypeError.
 *
 * @return array<string, array{label: string, description: string, link: string}>
 */
public static function listTranslationsFromWikidataWikipedia(array $checkagainstLanguage, array $data):array {

    [$languagesToFetch, $wikilinks] = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data);

    // NOTE(review): If there is nothing to fetch from Wikipedia, no translations
    // are returned at all - not even those Wikidata itself could provide.
    // This matches the previous behavior of the callers; confirm it is intended.
    if (empty($languagesToFetch)) {
        return [];
    }

    try {
        $contents = MD_STD::runCurlMulti($languagesToFetch, 10000);
    }
    catch (TypeError $e) {
        throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again.");
    }

    // The retrieval date is the same for all languages; compute it only once.
    $retrievalDate = date('d.m.Y');

    $output = [];
    foreach ($checkagainstLanguage as $lang) {

        if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki']) && !empty($wikilinks[$lang])) {

            $tDescription = "";
            if (!empty($contents[$lang])) {

                // The response may be invalid JSON or lack the expected keys
                // (e.g. in case of an API error). `??` covers all of these
                // cases without raising warnings.
                $decoded = json_decode($contents[$lang], true);
                $descFromWiki = $decoded['parse']['text']['*'] ?? null;

                // Only add quotes and the source note if there actually is a
                // text. Otherwise the description would consist of nothing
                // but the source note around an empty quote.
                if (is_string($descFromWiki) && $descFromWiki !== '') {
                    // Only the sitelink has been checked above; a label for
                    // the language may be missing. Fall back to the language code.
                    $sourceLang = $data['labels'][$lang]['language'] ?? $lang;
                    $tDescription = '"' . $descFromWiki . '" - (' . $sourceLang . '.wikipedia.org ' . $retrievalDate . ')';
                }

            }

            $output[$lang] = [
                'label'       => self::_cleanWikidataInput((string)($data['labels'][$lang]['value'] ?? '')),
                'description' => self::_cleanWikidataInput($tDescription),
                'link'        => $wikilinks[$lang],
            ];

        }
        // No Wikipedia page is available: Use Wikidata's own label and description.
        else if (!empty($data['labels'][$lang]['value']) && !empty($data['descriptions'][$lang]['value'])) {

            $output[$lang] = [
                'label'       => self::_cleanWikidataInput((string)$data['labels'][$lang]['value']),
                'description' => self::_cleanWikidataInput((string)$data['descriptions'][$lang]['value']),
                'link'        => "",
            ];

        }

    }

    return $output;

}
/** /**
* Cleans contents parsed from Wikipedia. * Cleans contents parsed from Wikipedia.
* *
@ -875,68 +945,23 @@ final class NodaWikidataFetcher {
*/ */
public function getWikidataTranslationsForPersinst(array $data, int $persinst_id):void { public function getWikidataTranslationsForPersinst(array $data, int $persinst_id):void {
$checkagainstLanguage = self::LANGUAGES_TO_CHECK; if (empty($translations = self::listTranslationsFromWikidataWikipedia(self::LANGUAGES_TO_CHECK, $data))) {
list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data);
if (empty($languagesToFetch)) {
return; return;
} }
try {
$contents = MD_STD::runCurlMulti($languagesToFetch, 10000);
}
catch (TypeError $e) {
throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again.");
}
$insertStmt = $this->_mysqli_noda->do_prepare("CALL nodaInsertPersinstTranslation(?, ?, ?, ?, ?)"); $insertStmt = $this->_mysqli_noda->do_prepare("CALL nodaInsertPersinstTranslation(?, ?, ?, ?, ?)");
$this->_mysqli_noda->autocommit(false); $this->_mysqli_noda->autocommit(false);
foreach ($checkagainstLanguage as $lang) { foreach ($translations as $lang => $values) {
if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki'])) { try {
$insertStmt->bind_param("issss", $persinst_id, $lang,
$wikilink = $wikilinks[$lang]; $values['label'], $values['description'], $values['link']);
if (!empty($contents[$lang])) {
$descFromWiki = $contents[$lang];
$descFromWiki = json_decode($descFromWiki, true)['parse']['text']['*'];
# Process data retrieved from wikipedia
if ($descFromWiki !== null) $tDescription = self::_cleanWikidataInput((string)$descFromWiki);
else $tDescription = "";
if (substr($tDescription, -1) === chr(10)) $tDescription = substr($tDescription, 0, strlen($tDescription) - 1);
$tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')';
// Inhalt erster Absatz jeweilige Wikipedia: ' . $tDescription
// dies enthält den ersten Absatz der jeweiligen Wikipedia
}
else {
$tDescription = "";
}
$tLang = self::_cleanWikidataInput((string)$data['labels'][$lang]['language']);
$tLabel = self::_cleanWikidataInput((string)$data['labels'][$lang]['value']);
try {
$insertStmt->bind_param("issss", $persinst_id, $tLang, $tLabel, $tDescription, $wikilink);
$insertStmt->execute();
}
catch (MDMysqliInvalidEncodingError $e) {
}
}
// echo '<br><b style="color: cc0000;">Wikipedia Links fehlen</b>';
else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) {
$wikilink = "";
$insertStmt->bind_param("issss", $persinst_id, $data['labels'][$lang]['language'], $data['labels'][$lang]['value'], $data['descriptions'][$lang]['value'], $wikilink);
$insertStmt->execute(); $insertStmt->execute();
} }
catch (MDMysqliInvalidEncodingError $e) {
}
} }
@ -944,7 +969,6 @@ final class NodaWikidataFetcher {
$this->_mysqli_noda->autocommit(true); $this->_mysqli_noda->autocommit(true);
$insertStmt->close(); $insertStmt->close();
unset($insertStmt);
} }
@ -1248,75 +1272,23 @@ final class NodaWikidataFetcher {
*/ */
public function getWikidataTranslationsForPlace(array $data, int $ort_id) { public function getWikidataTranslationsForPlace(array $data, int $ort_id) {
$checkagainstLanguage = self::LANGUAGES_TO_CHECK; if (empty($translations = self::listTranslationsFromWikidataWikipedia(self::LANGUAGES_TO_CHECK, $data))) {
list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data);
if (empty($languagesToFetch)) {
return; return;
} }
try {
$contents = MD_STD::runCurlMulti($languagesToFetch, 10000);
}
catch (TypeError $e) {
throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again.");
}
$insertStmt = $this->_mysqli_noda->do_prepare("CALL `nodaInsertOrtTranslation`(?, ?, ?, ?, ?)"); $insertStmt = $this->_mysqli_noda->do_prepare("CALL `nodaInsertOrtTranslation`(?, ?, ?, ?, ?)");
$this->_mysqli_noda->autocommit(false); $this->_mysqli_noda->autocommit(false);
foreach ($checkagainstLanguage as $lang) { foreach ($translations as $lang => $values) {
if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki'])) { try {
$insertStmt->bind_param("issss", $ort_id, $lang,
$wikilink = $wikilinks[$lang]; $values['label'], $values['description'], $values['link']);
if (!empty($contents[$lang])) {
$descFromWiki = $contents[$lang];
if (!($wikiDataDecoded = json_decode($descFromWiki, true))) {
continue;
}
$tLabel = $wikiDataDecoded['parse']['title'];
$descFromWiki = $wikiDataDecoded['parse']['text']['*'];
# Process data retrieved from wikipedia
if (empty($descFromWiki)) $tDescription = "";
else {
$tDescription = self::_cleanWikidataInput((string)$descFromWiki);
if (substr($tDescription, -1) === chr(10)) $tDescription = substr($tDescription, 0, strlen($tDescription) - 1);
$tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')';
$tDescription = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $tDescription));
// echo '<br>Inhalt erster Absatz jeweilige Wikipedia: ' . $tDescription; // dies enthält den ersten Absatz der jeweiligen Wikipedia
}
}
else {
$tDescription = "";
}
$tLang = self::_cleanWikidataInput((string)$data['labels'][$lang]['language']);
if (empty($tLabel)) $tLabel = self::_cleanWikidataInput((string)$data['labels'][$lang]['value']);
try {
$insertStmt->bind_param("issss", $ort_id, $tLang, $tLabel, $tDescription, $wikilink);
$insertStmt->execute();
}
catch (MDMysqliInvalidEncodingError $e) {
$_SESSION["editHistory"] = ["changesStored", "Error adding translation for language $tLang"];
}
}
else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) {
$wikilink = "";
$insertStmt->bind_param("issss", $ort_id, $data['labels'][$lang]['language'], $data['labels'][$lang]['value'], $data['descriptions'][$lang]['value'], $wikilink);
$insertStmt->execute(); $insertStmt->execute();
} }
catch (MDMysqliInvalidEncodingError $e) {
}
} }
@ -1573,84 +1545,32 @@ final class NodaWikidataFetcher {
*/ */
public function getWikidataTranslationsForTag(array $data, int $tag_id) { public function getWikidataTranslationsForTag(array $data, int $tag_id) {
$checkagainstLanguage = self::LANGUAGES_TO_CHECK; if (empty($translations = self::listTranslationsFromWikidataWikipedia(self::LANGUAGES_TO_CHECK, $data))) {
list($languagesToFetch, $wikilinks) = self::getWikidataWikipediaTranslationSources($checkagainstLanguage, $data);
if (empty($languagesToFetch)) {
return; return;
} }
try {
$contents = MD_STD::runCurlMulti($languagesToFetch, 10000);
}
catch (TypeError $e) {
throw new MDExpectedException("Failed to initialize a request. Try pressing F5 to run the requests again.");
}
$insertStmt = $this->_mysqli_noda->do_prepare("CALL nodaInsertTagTranslation(?, ?, ?, ?, ?)"); $insertStmt = $this->_mysqli_noda->do_prepare("CALL nodaInsertTagTranslation(?, ?, ?, ?, ?)");
$this->_mysqli_noda->autocommit(false); $this->_mysqli_noda->autocommit(false);
foreach ($checkagainstLanguage as $lang) { foreach ($translations as $lang => $values) {
if (!empty($languagesToFetch[$lang]) && !empty($data['sitelinks'][$lang . 'wiki'])) {
$wikilink = $wikilinks[$lang];
if (!empty($contents[$lang])) {
$descFromWiki = $contents[$lang];
$descFromWiki = json_decode($descFromWiki, true)['parse']['text']['*'];
if (!empty($descFromWiki)) {
# Process data retrieved from wikipedia
$tDescription = self::_cleanWikidataInput((string)$descFromWiki);
if (substr($tDescription, -1) === chr(10)) {
$tDescription = substr($tDescription, 0, strlen($tDescription) - 1);
}
$tDescription = '"' . $tDescription . '" - (' . $data['labels'][$lang]['language'] . '.wikipedia.org ' . date('d.m.Y') . ')';
$tDescription = str_replace("'", "´", MD_STD::preg_replace_str("/\&\#91\;[0-9]\&\#93\;/", '', $tDescription));
}
else {
$tDescription = "";
}
}
else {
$tDescription = "";
}
$tLang = self::_cleanWikidataInput((string)$data['labels'][$lang]['language']);
$tLabel = self::_cleanWikidataInput((string)$data['labels'][$lang]['value']);
if (in_array($tLang, self::LANGUAGES_TO_CAPITALIZE, true)) {
$tLabel = ucfirst(trim($tLabel));
$tDescription = ucfirst(trim($tDescription));
}
try {
$insertStmt->bind_param("issss", $tag_id, $tLang, $tLabel, $tDescription, $wikilink);
$insertStmt->execute();
}
catch (MDMysqliInvalidEncodingError $e) {
}
if (in_array($lang, self::LANGUAGES_TO_CAPITALIZE, true)) {
$label = ucfirst($values['label']);
$description = ucfirst($values['description']);
}
else {
$label = $values['label'];
$description = $values['description'];
} }
else if (!empty($data['labels'][$lang]['value']) and !empty($data['descriptions'][$lang])) {
$wikilink = ""; try {
$insertStmt->bind_param("issss", $tag_id, $lang,
if (in_array($lang, self::LANGUAGES_TO_CAPITALIZE, true)) { $label, $description, $values['link']);
$data['labels'][$lang]['value'] = ucfirst(trim($data['labels'][$lang]['value']));
$data['descriptions'][$lang]['value'] = ucfirst(trim($data['descriptions'][$lang]['value']));
}
$insertStmt->bind_param("issss", $tag_id, $data['labels'][$lang]['language'], $data['labels'][$lang]['value'], $data['descriptions'][$lang]['value'], $wikilink);
$insertStmt->execute(); $insertStmt->execute();
} }
catch (MDMysqliInvalidEncodingError $e) {
}
} }