Cover more edge cases for splitting time names

This commit is contained in:
Joshua Ramon Enslin 2025-01-15 11:49:20 +01:00
parent 9c2eaa2929
commit 51fe9a5e45
Signed by: jrenslin
GPG Key ID: 46016F84501B70AE
3 changed files with 281 additions and 25 deletions

View File

@ -140,7 +140,7 @@ final class NodaTimeSplitter {
"decemberig",
];
private const REGEX_CENTURIES = '(\ |)(Jh\.|Jhd(|\.)|Jhdt(|\.)|Jahrhundert|sz|század|th century|ст|ст\.)';
private const REGEX_CENTURIES = '(\ |)(Jh|Jh\.|Jhd(|\.)|Jhdt(|\.)|Jahrhundert|sz|század|th century|ст|ст\.)';
private const REGEX_DECADES = '(s|er|er\ Jahre|(\-|\ )es\ évek|(\-|\ )as\ \évek|\ évek|\-es\ években|\-ті)';
/**
@ -350,11 +350,20 @@ final class NodaTimeSplitter {
}
}
if (\preg_match("/^[0-9][0-9][0-9][0-9]\ bis [0-9][0-9][0-9][0-9]$/", $datum)) {
if (\preg_match("/^[0-9]{4}\ bis\ [0-9]{4}$/", $datum)) {
$start = \substr($datum, 0, 4);
$end = \substr($datum, -4);
return new NodaSplitTime($start, $end);
}
// "YYYY und YYYY" / "YYYY oder YYYY" / "YYYY od. YYYY": two years named as
// alternatives. The dot of the abbreviation "od." is escaped so that it only
// matches a literal dot and not an arbitrary character.
if (\preg_match("/^[0-9]{4}\ (und|oder|od\.)\ [0-9]{4}$/", $datum)) {
    // The regex guarantees a four-digit year at both ends of the string.
    $start = \substr($datum, 0, 4);
    $end = \substr($datum, -4);
    $startInt = (int)$start;
    $endInt = (int)$end;
    // Only directly consecutive years are turned into a range; any other
    // pair falls through to the checks that follow.
    if ($startInt === $endInt - 1) {
        return new NodaSplitTime($start, $end);
    }
}
$datum = \str_replace(". ", ".", $datum);
@ -551,7 +560,7 @@ final class NodaTimeSplitter {
}
// German TT.MM.JJJJ / TT.MM.JJJ / TT.MM.JJ / TT.MM.J
if (\preg_match("/^[0-9][0-9]\.[0-9][0-9]\.([0-9][0-9][0-9][0-9]|[0-9][0-9][0-9]|[0-9][0-9]|[0-9])$/", $datum)) { // German T.MM.JJJJ
if (\preg_match("/^[0-9]{2}\.[0-9]{2}\.([0-9]{4}|[0-9]{3}|[0-9]{2}|[0-9])$/", $datum)) { // German T.MM.JJJJ
$year = \substr($datum, 6, 4);
$month = \substr($datum, 3, 2);
$day = \substr($datum, 0, 2);
@ -559,7 +568,7 @@ final class NodaTimeSplitter {
}
// German TT.M.JJJJ / TT.M.JJJ / TT.M.JJ / TT.M.J
if (\preg_match("/^[0-9][0-9]\.[0-9]\.([0-9][0-9][0-9][0-9]|[0-9][0-9][0-9]|[0-9][0-9]|[0-9])$/", $datum)) { // German T.MM.JJJJ
if (\preg_match("/^[0-9]{2}\.[0-9]\.([0-9]{4}|[0-9]{3}|[0-9]{2}|[0-9])$/", $datum)) { // German T.MM.JJJJ
$year = \substr($datum, 5, 4);
$month = "0" . \substr($datum, 3, 1);
$day = \substr($datum, 0, 2);
@ -707,35 +716,60 @@ final class NodaTimeSplitter {
$datum = self::clean_input($datum);
if (\preg_match("/^[0-9]{4}\.[0-9]{2}\.[0-9]{2}(\.|)\-$/", $datum)) { // YYYY.MM.DD.
$year = \substr($datum, 0, 4);
$month = \substr($datum, 5, 2);
$day = \substr($datum, 8, 2);
$inpDateWoSpaces = str_replace(" ", "", $datum);
if (\preg_match("/^[0-9]{4}\.[0-9]{2}\.[0-9]{2}(\.|)\-$/", $inpDateWoSpaces)) { // YYYY.MM.DD.
$year = \substr($inpDateWoSpaces, 0, 4);
$month = \substr($inpDateWoSpaces, 5, 2);
$day = \substr($inpDateWoSpaces, 8, 2);
return NodaSplitTime::genExactDate($year, $month, $day, NodaTimeBeforeAfterIndicator::since);
}
if (\preg_match("/^[0-9]{4}\.[0-9]{2}(\.|)\-$/", $datum)) { // YYYY.MM.-
$start = \substr($datum, 0, 4);
$month = \substr($datum, 5, 2);
if (\preg_match("/^[0-9]{4}\.[0-9]{2}(\.|)\-$/", $inpDateWoSpaces)) { // YYYY.MM.-
$start = \substr($inpDateWoSpaces, 0, 4);
$month = \substr($inpDateWoSpaces, 5, 2);
return new NodaSplitTime($start, '?', $month, before_after_indicator: NodaTimeBeforeAfterIndicator::since);
}
if (\preg_match("/^[0-9]{4}\-$/", $datum)) { // YYYY-
$start = \substr($datum, 0, 4);
if (\preg_match("/^[0-9]{4}\-$/", $inpDateWoSpaces)) { // YYYY-
$start = \substr($inpDateWoSpaces, 0, 4);
return new NodaSplitTime($start, '?', before_after_indicator: NodaTimeBeforeAfterIndicator::since);
}
if (\preg_match("/^\-[0-9]{4}\.[0-9]{2}\.[0-9]{2}$/", $datum)) { // Hungarian Y-m
$year = \substr($datum, 1, 4);
$month = \substr($datum, 6, 2);
$day = \substr($datum, 9, 2);
// "?.M.YYYY" / "?.MM.YYYY" (e.g. ?.6.2024): the day position holds a question
// mark, only month and year are given.
if (\preg_match("/^\?\.([0-9]|[0-9]{2})\.[0-9]{4}$/", $inpDateWoSpaces)) {
// The pattern ends with four digits, so the year is the last four characters.
$year = \substr($inpDateWoSpaces, -4);
// Two characters follow the leading "?."; for a one-digit month the second
// one is the separator dot, which is trimmed away again.
$month = trim(\substr($inpDateWoSpaces, 2, 2), '. ');
// The same year is passed as first and second argument, followed by the month.
return new NodaSplitTime($year, $year, $month);
}
// "?.?.YYYY" (e.g. ?.?.2024): day and month positions both hold a question
// mark, only the year is given.
if (\preg_match("/^\?\.\?\.[0-9]{4}$/", $inpDateWoSpaces)) {
// The pattern ends with four digits, so the year is the last four characters.
$year = \substr($inpDateWoSpaces, -4);
return new NodaSplitTime($year, $year);
}
// A four-digit year surrounded by stray characters: dots, spaces, "?", "!",
// round or square brackets and "X" are stripped from both ends (e.g. "?2020").
// Characters inside the string are not touched by trim().
if (\preg_match("/^[0-9]{4}$/", \trim($inpDateWoSpaces, '. ?!()[]X'))) {
$year = \trim($inpDateWoSpaces, '. ?!()[]X');
return new NodaSplitTime($year, $year);
}
// A four-digit year that remains after removing "-0" / "0-" (e.g. "0-2020",
// "2020-0") and replacing a lowercase "o" by the digit zero. strtr() applies
// these replacements anywhere in the string, not only at its ends.
if (\preg_match("/^[0-9]{4}$/", \strtr($inpDateWoSpaces, ['-0' => '', '0-' => '', 'o' => '0']))) {
$year = \strtr($inpDateWoSpaces, ['-0' => '', '0-' => '', 'o' => '0']);
return new NodaSplitTime($year, $year);
}
if (\preg_match("/^\-[0-9]{4}\.[0-9]{2}\.[0-9]{2}$/", $inpDateWoSpaces)) { // Hungarian Y-m
$year = \substr($inpDateWoSpaces, 1, 4);
$month = \substr($inpDateWoSpaces, 6, 2);
$day = \substr($inpDateWoSpaces, 9, 2);
return NodaSplitTime::genExactDate($year, $month, $day, NodaTimeBeforeAfterIndicator::until);
}
if (\preg_match("/^\-[0-9]{4}\.[0-9]{2}$/", $datum)) { // Hungarian Y-m
$year = \substr($datum, 1, 4);
$month = \substr($datum, 6, 2);
if (\preg_match("/^\-[0-9]{4}\.[0-9]{2}$/", $inpDateWoSpaces)) { // Hungarian Y-m
$year = \substr($inpDateWoSpaces, 1, 4);
$month = \substr($inpDateWoSpaces, 6, 2);
return new NodaSplitTime('?', $year, $month, before_after_indicator: NodaTimeBeforeAfterIndicator::until);
}
if (\preg_match("/^\-[0-9]{4}$/", $datum)) { // Hungarian -Y
$year = \substr($datum, 1, 4);
if (\preg_match("/^\-[0-9]{4}$/", $inpDateWoSpaces)) { // Hungarian -Y
$year = \substr($inpDateWoSpaces, 1, 4);
return new NodaSplitTime('?', $year, before_after_indicator: NodaTimeBeforeAfterIndicator::until);
}
@ -907,7 +941,7 @@ final class NodaTimeSplitter {
}
// 1. Jahrhundert
if (\preg_match("/^[0-9]\.\ (Jh\.|Jahrhundert|sz|század)$/", $datum)) {
// "Jh" and "Jh." are separate alternatives. The pipe between them must not be
// escaped: "\|" is a literal pipe character in PCRE and would make the
// alternative match only the text "Jh|Jh.".
if (\preg_match("/^[0-9]\.\ (Jh|Jh\.|Jahrhundert|sz|század)$/", $datum)) {
if ($centuryNo = \intval(\substr($datum, 0, 1))) {
$centuryNo--;
return new NodaSplitTime((string)$centuryNo . "01", \strval($centuryNo + 1) . '00');
@ -915,7 +949,7 @@ final class NodaTimeSplitter {
}
// 17.-18. Jahrhundert
if (\preg_match("/^[0-9]{2}(\.|)(|\ Jh\.||\ Jahrhundert||\ sz||\ század)(\-|\/)[0-9]{2}\.\ (Jh\.|Jahrhundert|sz|század)$/", $datum)) {
if (\preg_match("/^[0-9]{2}(\.|)(|\ Jh|\ Jh\.|\ Jahrhundert|\ sz|\ század)(\-|\/)[0-9]{2}\.\ (Jh\.|Jahrhundert|sz|század)$/", $datum)) {
if (\strpos($datum, '/') !== false) {
$datum = str_replace('/', '-', $datum);
}
@ -1099,6 +1133,33 @@ final class NodaTimeSplitter {
return $reconstituted;
}
// German T.-T.MM.JJJJ / T.-T.MM.JJJ / T.-T.MM.JJ / T.-T.MM.J
// (month with one or two digits), e.g. "3.-7.9.1819".
// The parts are taken from capture groups instead of fixed offsets, so years
// with fewer than four digits are read correctly, and the dot after the first
// day is escaped so it only matches a literal dot.
if (\preg_match('/^([0-9])\.\-([0-9])\.([0-9]{1,2})\.([0-9]{1,4})$/', $datum, $dayRangeParts)) {
    // Both days are single digits here and get a leading zero.
    $firstday = '0' . $dayRangeParts[1];
    $day = '0' . $dayRangeParts[2];
    // Month and year are passed on exactly as they were written.
    $month = $dayRangeParts[3];
    $year = $dayRangeParts[4];
    // Rewrite the day range as two full dates joined by a hyphen.
    return "$firstday.$month.$year-$day.$month.$year";
}
// German T.-TT.MM.JJJJ / T.-TT.MM.JJJ / T.-TT.MM.JJ / T.-TT.MM.J
// (month with one or two digits), e.g. "3.-15.9.1819".
// The parts are taken from capture groups instead of fixed offsets, so years
// with fewer than four digits are read correctly, and the dot after the first
// day is escaped so it only matches a literal dot.
if (\preg_match('/^([0-9])\.\-([0-9]{2})\.([0-9]{1,2})\.([0-9]{1,4})$/', $datum, $dayRangeParts)) {
    // Only the single-digit first day needs a leading zero.
    $firstday = '0' . $dayRangeParts[1];
    $day = $dayRangeParts[2];
    // Month and year are passed on exactly as they were written.
    $month = $dayRangeParts[3];
    $year = $dayRangeParts[4];
    // Rewrite the day range as two full dates joined by a hyphen.
    return "$firstday.$month.$year-$day.$month.$year";
}
// German TT.-TT.MM.JJJJ / TT.-TT.MM.JJJ / TT.-TT.MM.JJ / TT.-TT.MM.J
// (month with one or two digits), e.g. "14.-15.03.2019".
// The parts are taken from capture groups instead of fixed offsets, so years
// with fewer than four digits are read correctly, and the dot after the first
// day is escaped so it only matches a literal dot.
if (\preg_match('/^([0-9]{2})\.\-([0-9]{2})\.([0-9]{1,2})\.([0-9]{1,4})$/', $datum, $dayRangeParts)) {
    // Both days already have two digits.
    $firstday = $dayRangeParts[1];
    $day = $dayRangeParts[2];
    // Month and year are passed on exactly as they were written.
    $month = $dayRangeParts[3];
    $year = $dayRangeParts[4];
    // Rewrite the day range as two full dates joined by a hyphen.
    return "$firstday.$month.$year-$day.$month.$year";
}
// 17-19. Jahrhundert
if (\preg_match("/^[0-9]{2}(\.|)\-[0-9]{2}(\.|)" . self::REGEX_CENTURIES . "$/", $datum)) {
$parts = explode('-', $datum);
@ -1225,7 +1286,7 @@ final class NodaTimeSplitter {
*/
private static function _runBasicNameCleanup(string $input):string {
$input = trim(trim($input), ',;');
$input = ltrim(trim(trim($input), ',;'), ' .');
// Clean away duplicate inputs
// 1440-1440

View File

@ -61,6 +61,7 @@ final class NodaUncertaintyHelper {
"(?)",
"?",
" [vermutlich]",
" vermutlich",
" [verm.]",
" [wahrscheinlich]",
];
@ -100,6 +101,7 @@ final class NodaUncertaintyHelper {
"c. ",
"ca ",
"ca. ",
"ca.",
"Ca ",
"Ca. ",
"za. ",
@ -141,6 +143,7 @@ final class NodaUncertaintyHelper {
" [circa]",
" (verm.)",
" (vermutl.)",
" vermutlich",
" körül",
", um",
" (um)",

View File

@ -118,6 +118,97 @@ final class NodaTimeSplitterTest extends TestCase {
self::assertEquals($output->toTimeName(), "02.01.2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("?.1.2020");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "01",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "Januar 2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("?.11.2020");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "11",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "November 2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting(".2020");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("0-2020");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("2020-0");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("? 2020");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("?.?.2020");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "2020",
1 => "2020",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "2020");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 2020);
$output = NodaTimeSplitter::attempt_splitting("1920-1929");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
@ -463,6 +554,32 @@ final class NodaTimeSplitterTest extends TestCase {
self::assertEquals($output->toTimeName(), "1945-1948");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1947);
$output = NodaTimeSplitter::attempt_splitting("1945 und 1946");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "1945",
1 => "1946",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "1945-1946");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1946);
$output = NodaTimeSplitter::attempt_splitting("1945 oder 1946");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
0 => "1945",
1 => "1946",
2 => "00",
3 => "00",
4 => "+",
5 => "",
]);
self::assertEquals($output->toTimeName(), "1945-1946");
self::assertEquals(NodaTimeSplitter::timePartsToCountingYear($output), 1946);
$output = NodaTimeSplitter::attempt_splitting("20.-19. Jahrhundert v. Chr.");
self::assertNotEmpty($output);
self::assertEquals($output->toOldFormat(), [
@ -1016,6 +1133,81 @@ final class NodaTimeSplitterTest extends TestCase {
"counting_time_bcce" => "+",
]);
$output = NodaTimeSplitter::attempt_splitting_from_till("14.-15.03.2019");
self::assertNotEmpty($output);
self::assertEquals($output, [
'start_name' => "14.03.2019",
'end_name' => "15.03.2019",
"start_year" => '2019',
"end_year" => '2019',
'start_date' => '2019-03-14',
'end_date' => '2019-03-15',
"counting_time_year" => "2019",
"counting_time_month" => "03",
"counting_time_day" => "15",
"counting_time_bcce" => "+",
]);
$output = NodaTimeSplitter::attempt_splitting_from_till("3.-7.9.1819");
self::assertNotEmpty($output);
self::assertEquals($output, [
'start_name' => "03.09.1819",
'end_name' => "07.09.1819",
"start_year" => '1819',
"end_year" => '1819',
'start_date' => '1819-09-03',
'end_date' => '1819-09-07',
"counting_time_year" => "1819",
"counting_time_month" => "09",
"counting_time_day" => "05",
"counting_time_bcce" => "+",
]);
$output = NodaTimeSplitter::attempt_splitting_from_till("3.-15.9.1819");
self::assertNotEmpty($output);
self::assertEquals($output, [
'start_name' => "03.09.1819",
'end_name' => "15.09.1819",
"start_year" => '1819',
"end_year" => '1819',
'start_date' => '1819-09-03',
'end_date' => '1819-09-15',
"counting_time_year" => "1819",
"counting_time_month" => "09",
"counting_time_day" => "09",
"counting_time_bcce" => "+",
]);
$output = NodaTimeSplitter::attempt_splitting_from_till("14.-15.9.1819");
self::assertNotEmpty($output);
self::assertEquals($output, [
'start_name' => "14.09.1819",
'end_name' => "15.09.1819",
"start_year" => '1819',
"end_year" => '1819',
'start_date' => '1819-09-14',
'end_date' => '1819-09-15',
"counting_time_year" => "1819",
"counting_time_month" => "09",
"counting_time_day" => "15",
"counting_time_bcce" => "+",
]);
$output = NodaTimeSplitter::attempt_splitting_from_till("14.-15.11.1819");
self::assertNotEmpty($output);
self::assertEquals($output, [
'start_name' => "14.11.1819",
'end_name' => "15.11.1819",
"start_year" => '1819',
"end_year" => '1819',
'start_date' => '1819-11-14',
'end_date' => '1819-11-15',
"counting_time_year" => "1819",
"counting_time_month" => "11",
"counting_time_day" => "15",
"counting_time_bcce" => "+",
]);
$output = NodaTimeSplitter::attempt_splitting_from_till("2019.03.14-15");
self::assertNotEmpty($output);
self::assertEquals($output, [