Fix erroneous splitting of Hungarian time names with additions

This commit is contained in:
Joshua Ramon Enslin 2023-12-09 11:56:15 +01:00
parent 3e9f675fdc
commit 4a49c7a4e7
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
2 changed files with 73 additions and 12 deletions

View File

@ -42,7 +42,7 @@ final class NodaTimeSplitter {
]; ];
const MONTH_NAMES_HUNGARIAN = [ const MONTH_NAMES_HUNGARIAN = [
"01" => ['január', 'jan'], "01" => ['január', 'januar', 'jan'],
"02" => ['február', 'feb'], "02" => ['február', 'feb'],
"03" => ['március', 'mar.', 'már.'], "03" => ['március', 'mar.', 'már.'],
"04" => ['április', 'apr.', 'ápr.'], "04" => ['április', 'apr.', 'ápr.'],
@ -521,20 +521,25 @@ final class NodaTimeSplitter {
// Skip, if dates are too long and do not contain spaces (= no translatable names) // Skip, if dates are too long and do not contain spaces (= no translatable names)
if (str_contains($datum, " ") === false && strlen($datum) > 12) return []; if (str_contains($datum, " ") === false && strlen($datum) > 12) return [];
$unparsed = trim(strtolower(str_replace($year, '', $datum)), ' ,.');
foreach (self::MONTH_NAMES_HUNGARIAN as $monthVal => $monthValidNames) { foreach (self::MONTH_NAMES_HUNGARIAN as $monthVal => $monthValidNames) {
if (self::stri_occurs($datum, $monthValidNames)) { if (self::stri_occurs($datum, $monthValidNames)) {
$monat = (string)$monthVal; $monat = (string)$monthVal;
foreach ($monthValidNames as $name) {
$unparsed = str_replace($name, '', $unparsed);
}
break; break;
} }
} }
if (strlen($unparsed) > 5) {
return [];
}
if (empty($monat) and self::is_numeric((string)\substr($datum, 5, 2))) $monat = \substr($datum, 5, 2); if (empty($monat) and self::is_numeric((string)\substr($datum, 5, 2))) $monat = \substr($datum, 5, 2);
else if (empty($monat) and self::is_numeric((string)\substr($datum, 6, 2))) $monat = \substr($datum, 6, 2); else if (empty($monat) and self::is_numeric((string)\substr($datum, 6, 2))) $monat = \substr($datum, 6, 2);
// Last four characters must contain at least one space or one dot // Last four characters must contain at least one space or one dot
$lastChars = substr($datum, -4);
if (str_contains($lastChars, '.') === false && str_contains($lastChars, ' ') === false) return [];
$day = self::validateDateSubstr($datum, -2); $day = self::validateDateSubstr($datum, -2);
if (empty($day)) $day = self::validateDateSubstr($datum, -3, 2); if (empty($day)) $day = self::validateDateSubstr($datum, -3, 2);
if (empty($day)) $day = self::validateDateSubstr($datum, -4, 2); if (empty($day)) $day = self::validateDateSubstr($datum, -4, 2);
@ -550,6 +555,14 @@ final class NodaTimeSplitter {
} }
} }
if ($datum === '1978. július 7 elött') {
throw new Exception(var_export($monat, true));
}
if (!empty($monat) && empty($day) && preg_match('~[0-9]+~', substr($datum, -3))) {
return [];
}
if (!empty($monat) and !empty($day)) { if (!empty($monat) and !empty($day)) {
return [$year, $year, $monat, $day, '+', ""]; return [$year, $year, $monat, $day, '+', ""];
} }
@ -932,6 +945,12 @@ final class NodaTimeSplitter {
return $output; return $output;
} }
} }
if (!empty(\preg_match("/^[0-9]{4}ig$/", $datum))) {
if ($output = self::attempt_splitting(\substr($datum, 0, 4))) {
$output[0] = "?";
return $output;
}
}
return []; return [];
@ -1102,7 +1121,7 @@ final class NodaTimeSplitter {
// Hungarian year and month until month // Hungarian year and month until month
// 2005.01.-02. => 2005.01.-2005.02. // 2005.01.-02. => 2005.01.-2005.02.
if ($inputLength === 12 && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\.\-[0-1][0-9]\.$/", $datum)) !== false) { if ($inputLength === 12 && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\.\-[0-1][0-9]\.$/", $datum))) {
$reconstituted = substr($datum, 0, 8) . '-'; $reconstituted = substr($datum, 0, 8) . '-';
$reconstituted .= substr($datum, 0, 4) . '.' . substr($datum, -3); $reconstituted .= substr($datum, 0, 4) . '.' . substr($datum, -3);
return $reconstituted; return $reconstituted;
@ -1111,7 +1130,7 @@ final class NodaTimeSplitter {
// Hungarian year and month until month without a dot after the first YYYY-MM // Hungarian year and month until month without a dot after the first YYYY-MM
// 2005.01-02. => 2005.01.-2005.02. // 2005.01-02. => 2005.01.-2005.02.
if (in_array($inputLength, [10, 11], true) && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\-[0-1][0-9](\.|)$/", $datum)) !== false) { if (in_array($inputLength, [10, 11], true) && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\-[0-1][0-9](\.|)$/", $datum))) {
$reconstituted = substr($datum, 0, 7) . '.-'; $reconstituted = substr($datum, 0, 7) . '.-';
$reconstituted .= substr($datum, 0, 4) . '.' . substr(rtrim($datum, '.'), -2) . '.'; $reconstituted .= substr($datum, 0, 4) . '.' . substr(rtrim($datum, '.'), -2) . '.';
return $reconstituted; return $reconstituted;
@ -1120,7 +1139,7 @@ final class NodaTimeSplitter {
// Hungarian year, month and day until month and day // Hungarian year, month and day until month and day
// 2005.01.01.-02.02. => 2005.01.01-2005.02.02. // 2005.01.01.-02.02. => 2005.01.01-2005.02.02.
// 2005.01.01-02.02 => 2005.01.01-2005.02.02. // 2005.01.01-02.02 => 2005.01.01-2005.02.02.
if ($inputLength >= 16 && $inputLength <= 18 && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\.[0-3][0-9](\.|)\-[0-1][0-9]\.[0-3][0-9](\.|)$/", $datum)) !== false) { if ($inputLength >= 16 && $inputLength <= 18 && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\.[0-3][0-9](\.|)\-[0-1][0-9]\.[0-3][0-9](\.|)$/", $datum))) {
$parts = explode('-', $datum); $parts = explode('-', $datum);
if (count($parts) !== 2) return ''; if (count($parts) !== 2) return '';
$reconstituted = substr($datum, 0, 10) . '.-'; $reconstituted = substr($datum, 0, 10) . '.-';
@ -1129,7 +1148,7 @@ final class NodaTimeSplitter {
} }
// Hungarian; without trailing dots: YYYY.MM.DD-DD // Hungarian; without trailing dots: YYYY.MM.DD-DD
if ($inputLength >= 13 && $inputLength <= 15 && (\preg_match("/^[0-9]{4}\.[0-1][0-9]\.[0-3][0-9](\.|)\-[0-3][0-9](\.|)$/", $datum)) !== false) { if ($inputLength >= 13 && $inputLength <= 15 && \preg_match("/^[0-9]{4}\.[0-1][0-9]\.[0-3][0-9](\.|)\-[0-3][0-9](\.|)$/", $datum)) {
$parts = explode('-', $datum); $parts = explode('-', $datum);
if (count($parts) !== 2) return ''; if (count($parts) !== 2) return '';
$reconstituted = substr($datum, 0, 10) . '.-'; $reconstituted = substr($datum, 0, 10) . '.-';
@ -1154,8 +1173,8 @@ final class NodaTimeSplitter {
// If es évek / as évek is contained in the string (e.g. 1880-1990-es évek), there // If es évek / as évek is contained in the string (e.g. 1880-1990-es évek), there
// will be more than one hyphens // will be more than one hyphens
if (MD_STD::stri_contains_any($datum, ['-as évek', '-es évek'])) { if (MD_STD::stri_contains_any($datum, ['-as évek', '-es-évek', '-es évek'])) {
return strtr($datum, ['-as évek' => ' as évek', '-es évek' => ' es évek']); return strtr($datum, ['-as évek' => ' as évek', '-es-évek' => ' es évek', '-es évek' => ' es évek']);
} }
// 1981. július-augusztus > 1981.07-08 // 1981. július-augusztus > 1981.07-08

View File

@ -610,14 +610,14 @@ final class NodaTimeSplitterTest extends TestCase {
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1925); self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1925);
$output = NodaTimeSplitter::attempt_splitting("2020. Januar"); $output = NodaTimeSplitter::attempt_splitting("2020. Januar");
self::assertEquals($output, [ self::assertEquals([
0 => "2020", 0 => "2020",
1 => "2020", 1 => "2020",
2 => "01", 2 => "01",
3 => "00", 3 => "00",
4 => "+", 4 => "+",
5 => "", 5 => "",
]); ], $output);
self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Januar 2020"); self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Januar 2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020); self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
@ -729,6 +729,45 @@ final class NodaTimeSplitterTest extends TestCase {
self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Bis 1801 v. Chr."); self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Bis 1801 v. Chr.");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1801); self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1801);
/*
* TODO
$output = NodaTimeSplitter::attempt_splitting("1900 előtt");
self::assertEquals($output, [
0 => "?",
1 => "1899",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Vor 1900");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1899);
*/
$output = NodaTimeSplitter::attempt_splitting("1900-ig");
self::assertEquals($output, [
0 => "?",
1 => "1900",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Bis 1900");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1900);
$output = NodaTimeSplitter::attempt_splitting("1900ig");
self::assertEquals($output, [
0 => "?",
1 => "1900",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals(NodaTimeSplitter::timePartsToTimeName($output), "Bis 1900");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1900);
} }
/** /**
@ -881,6 +920,9 @@ final class NodaTimeSplitterTest extends TestCase {
$output = NodaTimeSplitter::attempt_splitting("1978. július7"); $output = NodaTimeSplitter::attempt_splitting("1978. július7");
self::assertEmpty($output); self::assertEmpty($output);
$output = NodaTimeSplitter::attempt_splitting("1978. július 7 elött");
self::assertEmpty($output);
$output = NodaTimeSplitter::attempt_splitting("Anfang September 1903"); $output = NodaTimeSplitter::attempt_splitting("Anfang September 1903");
self::assertEmpty($output); self::assertEmpty($output);