Add functions for automatically moving country names into brackets at the end of place names, based on pre-generated lists
This commit is contained in:
Joshua Ramon Enslin 2023-11-26 00:54:14 +01:00
parent f6409322e5
commit e610723107
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
110 changed files with 255 additions and 0 deletions

View File

@ -0,0 +1,71 @@
<?PHP
/**
* This file contains tools for fetching data from Wikidata.
*
* @file
* @author Joshua Ramon Enslin <joshua@museum-digital.de>
*/
declare(strict_types = 1);
require_once __DIR__ . '/../src/NodaWikidataFetcher.php';
require_once __DIR__ . '/../../MD_STD/src/MD_STD.php';
/**
 * Asks Wikidata for every item that is an instance of the given Q-ID,
 * either directly or through a chain of subclasses (wdt:P31/wdt:P279*),
 * together with the item's label.
 *
 * @param string $lang       Preferred label language; the query falls back to
 *                           [AUTO_LANGUAGE] and then to English.
 * @param string $instanceOf Q-ID the returned items must be instances of.
 *
 * @return array<mixed>
 */
function query(string $lang, string $instanceOf):array {

    // Placeholders: %1$s is the Q-ID, %2$s the preferred label language.
    $template = 'SELECT ?item ?itemLabel
WHERE
{
?item wdt:P31/wdt:P279* wd:%1$s.
SERVICE wikibase:label { bd:serviceParam wikibase:language "%2$s,[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
}';

    return NodaWikidataFetcher::sparqlQuery(sprintf($template, $instanceOf, $lang));

}
/**
 * Extracts the item labels from a Wikidata SPARQL result set.
 *
 * Reads every entry below $data['results']['bindings'] and collects its
 * ['itemLabel']['value'] into a freshly indexed list.
 *
 * @param array<mixed> $data Wikidata output values.
 *
 * @return array<string>
 */
function getNames(array $data):array {

    $labels = [];

    // Nested destructuring pulls the label straight out of each binding.
    foreach ($data['results']['bindings'] as ['itemLabel' => ['value' => $label]]) {
        $labels[] = $label;
    }

    return $labels;

}
// Wikidata Q-IDs to fetch, mapped to the base name of the list file each one
// is written to (static/<base name>.<lang>.json).
// Q6256 => country
$targets = [
    'Q6256' => 'countries',
    'Q3024240' => 'historical_countries',
    'Q10864048' => 'first_lvl_administrative_units',
];

// Languages for which a list is generated per target.
$langs = ['ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'ha', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'ka', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'sw', 'ta', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'zh'];

foreach ($langs as $lang) {
    foreach ($targets as $qid => $filename) {

        $regionNames = getNames(query($lang, $qid));
        $targetFile = __DIR__ . '/../static/' . $filename . '.' . $lang . '.json';

        // Without JSON_THROW_ON_ERROR a failed encode returns false, which
        // file_put_contents() would silently write out as an empty file.
        $encoded = json_encode($regionNames, JSON_THROW_ON_ERROR);

        if (file_put_contents($targetFile, $encoded) === false) {
            throw new RuntimeException("Failed to write $targetFile");
        }

        echo "Fetched $lang : $filename ($qid)" . PHP_EOL;

    }
}

View File

@ -64,6 +64,11 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
'XX' => '20',
];
/**
 * Decoded name lists, keyed by the file name they were loaded from.
 * Read and filled by _loadJsonList(), and only when PHP runs under the
 * CLI SAPI.
 *
 * @var array<string, list<string>>
 */
private static $_placeNameListCaches = [];
/**
* Rewrites indicators for narrower locations paired with a superordinate location
* into the format "Narrower (Broader)".
@ -181,6 +186,75 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
}
/**
 * Loads a JSON file and returns its decoded contents as an array.
 *
 * When running on the CLI, where the same list is expected to be reused,
 * the decoded list is kept in self::$_placeNameListCaches and returned
 * from there on subsequent calls. Outside the CLI the file is read and
 * decoded on every call.
 *
 * @param non-empty-string $filename File name to load.
 *
 * @return list<string>
 *
 * @throws Exception If the file contents do not decode to an array.
 */
private static function _loadJsonList(string $filename):array {

    $useCache = (PHP_SAPI === 'cli');

    if ($useCache && isset(self::$_placeNameListCaches[$filename])) {
        return self::$_placeNameListCaches[$filename];
    }

    $output = json_decode(MD_STD::file_get_contents($filename), true);

    // json_decode() reports failure with null, never with false. Checking
    // for an array also rejects valid JSON scalars, which would otherwise
    // violate the declared return type (and be cached on the CLI).
    if (!is_array($output)) {
        throw new Exception("Failed to get list");
    }

    if ($useCache) {
        self::$_placeNameListCaches[$filename] = $output;
    }

    return $output;

}
/**
 * Moves names of regions to brackets using pre-generated lists of countries,
 * historical country names, etc.
 *
 * A name is only rewritten if it contains exactly one occurrence of a
 * separator ("-" or ", ") and exactly one of the two resulting parts is a
 * listed (current or historical) country name; the result then has the form
 * "Other part (Country)". In every other case the name is returned unchanged.
 *
 * @param string $lang Instance language.
 * @param string $name Input string to clean.
 *
 * @return string
 */
private static function _move_region_names_to_brackets(string $lang, string $name):string {

    $separators = ['-', ', '];

    // Loaded lazily: names without any usable separator never touch the lists.
    $countryNames = null;

    foreach ($separators as $separator) {

        if (substr_count($name, $separator) !== 1) continue;

        // Exactly one separator means exactly two parts.
        [$first, $second] = explode($separator, $name);
        $first  = trim($first);
        $second = trim($second);

        // A leading or trailing separator leaves nothing to put before or
        // inside the brackets.
        if ($first === '' || $second === '') continue;

        // The lists must be concatenated with array_merge(): the "+" operator
        // is a union by key and would drop every historical country whose
        // numeric index already exists in the list of current countries.
        $countryNames ??= array_merge(
            self::_loadJsonList(__DIR__ . "/../static/countries.$lang.json"),
            self::_loadJsonList(__DIR__ . "/../static/historical_countries.$lang.json"),
        );

        // Hyphenated names that are themselves countries (e.g. "Guinea-Bissau")
        // must not be torn apart into "Bissau (Guinea)".
        if (in_array(trim($name), $countryNames, true)) {
            return $name;
        }

        $firstIsCountry  = in_array($first, $countryNames, true);
        $secondIsCountry = in_array($second, $countryNames, true);

        if ($firstIsCountry && !$secondIsCountry) {
            return $second . ' (' . $first . ')';
        }
        if (!$firstIsCountry && $secondIsCountry) {
            return $first . ' (' . $second . ')';
        }

        // Both or neither part is a country: ambiguous, try the next separator.

    }

    return $name;

}
/**
* Cleans a place name by trimming etc. Also removes uncertainty indicators.
*
@ -203,6 +277,8 @@ final class NodaConsolidatedNamesForPlaces extends NodaConsolidatedNamesAbstract
default => $ort_name,
};
$ort_name = self::_move_region_names_to_brackets($lang, $ort_name);
return $ort_name;
}

1
static/countries.ar.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.bg.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.bn.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.cs.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.da.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.de.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.el.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.en.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.es.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.fa.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.fi.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.fr.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ha.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.he.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.hi.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.hu.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.id.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.it.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ja.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ka.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ko.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.nl.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.pl.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.pt.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ro.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ru.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.sv.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.sw.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ta.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.th.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.tl.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.tr.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.uk.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.ur.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.vi.json Normal file

File diff suppressed because one or more lines are too long

1
static/countries.zh.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More