Rewrite incomplete time span spellings to extend parsable and splittable time names
This commit is contained in:
		| @@ -48,7 +48,7 @@ final class NodaTimeSplitter { | ||||
|         "04" => ['április', 'apr.', 'ápr.'], | ||||
|         "05" => ['május', 'maj.', 'máj.'], | ||||
|         "06" => ['június', 'jun.', 'jún'], | ||||
|         "07" => ['július', 'jul.', 'júl.'], | ||||
|         "07" => ['július', 'julius', 'jul.', 'júl.'], | ||||
|         "08" => ['augusztus', 'aug.'], | ||||
|         "09" => ['szeptember', 'szp.'], | ||||
|         "10" => ['október', 'okt.'], | ||||
| @@ -132,6 +132,8 @@ final class NodaTimeSplitter { | ||||
|         "decemberig", | ||||
|     ]; | ||||
|  | ||||
|     private const REGEX_CENTURIES = '(\ |)(Jh\.|Jhd(|\.)|Jhdt(|\.)|Jahrhundert|sz|század|th century|ст|ст\.)'; | ||||
|  | ||||
|     /** | ||||
|      * Cleans input strings by trimming obsolete stuff. | ||||
|      * | ||||
| @@ -395,7 +397,7 @@ final class NodaTimeSplitter { | ||||
|      * | ||||
|      * @param string $datum Date. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function is_valid_date(string $datum):array { | ||||
|  | ||||
| @@ -472,7 +474,7 @@ final class NodaTimeSplitter { | ||||
|      * | ||||
|      * @param string $datum Date. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function is_valid_date_hungarian(string $datum):array { | ||||
|  | ||||
| @@ -491,6 +493,8 @@ final class NodaTimeSplitter { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Example: 2009-tol 2010-ig | ||||
|         // From 2009 to 2010 | ||||
|         if (\preg_match("/^[0-9][0-9][0-9][0-9]\-t(ő|ó)l(\ |\-)[0-9][0-9][0-9][0-9]\-ig$/", $datum)) { | ||||
|             $start = \substr($datum, 0, 4); | ||||
|             $end = \substr($datum, -7, 4); | ||||
| @@ -576,7 +580,7 @@ final class NodaTimeSplitter { | ||||
|      * | ||||
|      * @param string $datum Input date. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function is_timespan(string $datum):array { | ||||
|  | ||||
| @@ -670,13 +674,19 @@ final class NodaTimeSplitter { | ||||
|             $month = "0" . \substr($datum, 0, 1); | ||||
|             return [$start, $start, $month, "00", "+", ""]; | ||||
|         } | ||||
|         if (\preg_match("/^[0-9]{4}\.[0-9]{2}\.[0-9]{1,2}(\.|)$/", $datum)) { // Hungarian Y-m-d | ||||
|         if (\preg_match("/^[0-9]{4}\.[0-3][0-9]\.[0-9]{1,2}(\.|)$/", $datum)) { // Hungarian Y-m-d | ||||
|             $start = \substr($datum, 0, 4); | ||||
|             $month = \substr($datum, 5, 2); | ||||
|             $day = self::pad_to_two(\substr($datum, 8, 2)); | ||||
|             $day = self::pad_to_two(\rtrim(\substr($datum, 8, 2), '.')); | ||||
|             if (\intval($month) < 13) return [$start, $start, $month, $day, "+", ""]; | ||||
|         } | ||||
|         if (\preg_match("/^[0-9]{4}\.[0-9]{2}(\.|)$/", $datum)) { // Hungarian Y-m | ||||
|         if (\preg_match("/^[0-9]{4}\.[0-9]\.[0-9]{1,2}\.$/", $datum)) { // Hungarian Y-m-d > 2005.1.1. | ||||
|             $start = \substr($datum, 0, 4); | ||||
|             $month = self::pad_to_two(\substr($datum, 5, 1)); | ||||
|             $day = self::pad_to_two(\rtrim(\substr($datum, 7, 2), '.')); | ||||
|             if (\intval($month) < 13) return [$start, $start, $month, $day, "+", ""]; | ||||
|         } | ||||
|         if (\preg_match("/^[0-9]{4}\.[0-3][0-9](\.|)$/", $datum)) { // Hungarian Y-m | ||||
|             $start = \substr($datum, 0, 4); | ||||
|             $month = \substr($datum, 5, 2); | ||||
|             if (\intval($month) < 13) return [$start, $start, $month, "00", "+", ""]; | ||||
| @@ -750,7 +760,7 @@ final class NodaTimeSplitter { | ||||
|      * | ||||
|      * @param string $datum Input date. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function is_incomplete_date(string $datum):array { | ||||
|  | ||||
| @@ -860,7 +870,7 @@ final class NodaTimeSplitter { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Endings beginning with a space | ||||
|         // Endings beginning with a dash | ||||
|         if (\preg_match("/(\-től|\-tól)$/", $datum)) { | ||||
|             if (($spacePos = strrpos($datum, "-")) === false) { | ||||
|                 return []; | ||||
| @@ -880,8 +890,25 @@ final class NodaTimeSplitter { | ||||
|                 return $output; | ||||
|             } | ||||
|         } | ||||
|         // Endings that are extensions of an existing word | ||||
|         if (\preg_match("/évektől$/", $datum)) { | ||||
|             if ($output = self::attempt_splitting(\substr($datum, 0, -4))) { | ||||
|                 $output[1] = "?"; | ||||
|                 return $output; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Endings beginning with a space | ||||
|         // Endings beginning with a space (after) | ||||
|         if (\preg_match("/ (utantól|utántól)$/", $datum)) { | ||||
|             if (($spacePos = strrpos($datum, " ")) === false) { | ||||
|                 return []; | ||||
|             } | ||||
|             if ($output = self::attempt_splitting(\substr($datum, 0, $spacePos))) { | ||||
|                 $output[1] = "?"; | ||||
|                 return $output; | ||||
|             } | ||||
|         } | ||||
|         // Endings beginning with a space (until) | ||||
|         if (\preg_match("/ (\(bis)$/", $datum)) { | ||||
|             if (($spacePos = strrpos($datum, " ")) === false) { | ||||
|                 return []; | ||||
| @@ -891,6 +918,7 @@ final class NodaTimeSplitter { | ||||
|                 return $output; | ||||
|             } | ||||
|         } | ||||
|         // Endings beginning with a hyphen | ||||
|         if (\preg_match("/\-ig(\.|)$/", $datum)) { | ||||
|             if (($spacePos = strrpos($datum, "-")) === false) { | ||||
|                 return []; | ||||
| @@ -931,23 +959,18 @@ final class NodaTimeSplitter { | ||||
|      * | ||||
|      * @param string $datum Input date. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function is_century(string $datum):array { | ||||
|  | ||||
|         $datum = self::clean_input($datum); | ||||
|         $bcBceIndicator = '+'; | ||||
|  | ||||
|         // 17. Jahrhundert | ||||
|         if (\preg_match("/^[0-9]{2}\.\ (Jh\.|Jhd(|\.)|Jhdt(|\.)|Jahrhundert|sz|század)$/", $datum)) { | ||||
|             if ($centuryNo = \intval(\substr($datum, 0, 2))) { | ||||
|                 $centuryNo--; | ||||
|                 return [(string)$centuryNo . "01", \strval($centuryNo + 1) . "00", "00", "00", $bcBceIndicator, ""]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // 17th century | ||||
|         if (\preg_match("/^[0-9]{2}th century$/", $datum)) { | ||||
|         // TODO: Check if this is duplicate | ||||
|  | ||||
|         // 17. Jahrhundert | ||||
|         if (\preg_match("/^[0-9]{2}(\.|)" . self::REGEX_CENTURIES ."$/", $datum)) { | ||||
|             if ($centuryNo = \intval(\substr($datum, 0, 2))) { | ||||
|                 $centuryNo--; | ||||
|                 return [(string)$centuryNo . "01", \strval($centuryNo + 1) . "00", "00", "00", $bcBceIndicator, ""]; | ||||
| @@ -1000,20 +1023,22 @@ final class NodaTimeSplitter { | ||||
|      * | ||||
|      * @param string $datum Input date. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function is_decade(string $datum):array { | ||||
|  | ||||
|         $datum = self::clean_input($datum); | ||||
|         $bcBceIndicator = '+'; | ||||
|  | ||||
|         // 20er Jahre | ||||
|         if (\preg_match("/^[0-9]0(er|er\ Jahre|\-es\ évek|\-as\ \évek)$/", $datum)) { | ||||
|             $start = "19" . \substr($datum, 0, 2); | ||||
|             $ende = (string)(\intval($start) + 9); | ||||
|             return [$start, $ende, "00", "00", $bcBceIndicator, ""]; | ||||
|         } | ||||
|  | ||||
|         if (\preg_match("/^[0-9]{3}0(s|er|er\ Jahre|\-es\ évek|\-as\ \évek)$/", $datum)) { | ||||
|         // 1920er Jahre | ||||
|         if (\preg_match("/^[0-9]{3}0(s|er|er\ Jahre|(\-|\ )es\ évek|(\-|\ )as\ \évek)$/", $datum)) { | ||||
|             $start = \substr($datum, 0, 4); | ||||
|             $ende = (string)(\intval($start) + 9); | ||||
|             return [$start, $ende, "00", "00", $bcBceIndicator, ""]; | ||||
| @@ -1032,10 +1057,19 @@ final class NodaTimeSplitter { | ||||
|      */ | ||||
|     public static function check_is_timespan_from_till(string $datum):array { | ||||
|  | ||||
|         if (substr_count($datum, '-') !== 1) return []; | ||||
|         if (substr_count($datum, '-') !== 1) { | ||||
|             return []; | ||||
|         } | ||||
|  | ||||
|         list($start_str, $end_str) = explode('-', $datum); | ||||
|  | ||||
|         if (strlen($end_str) < 4 && strlen($end_str) < strlen($start_str)) { | ||||
|             return []; | ||||
|         } | ||||
|         if (strlen($start_str) < 4 && strlen($start_str) < strlen($end_str)) { | ||||
|             return []; | ||||
|         } | ||||
|  | ||||
|         if (empty($start = self::attempt_splitting($start_str))) { | ||||
|             return []; | ||||
|         } | ||||
| @@ -1048,6 +1082,107 @@ final class NodaTimeSplitter { | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Contains special rules for incorrectly or incompletely spelled out timespan names. | ||||
|      * To be called by self::attempt_splitting_from_till(). | ||||
|      * | ||||
|      * Each rule rewrites an abbreviated from-till notation into a more explicit one. | ||||
|      * The first matching rule wins. | ||||
|      * | ||||
|      * @param string $datum Date. | ||||
|      * | ||||
|      * @return string Rewritten time span, or an empty string if no rule applies. | ||||
|      */ | ||||
|     public static function _attempt_rewriting_special_cases_from_till(string $datum):string { | ||||
|  | ||||
|         if (empty($datum)) return ''; | ||||
|  | ||||
|         // Byte length; the length-checked patterns below consist of ASCII characters only. | ||||
|         $inputLength = strlen($datum); | ||||
|  | ||||
|         // Hungarian year and month until month | ||||
|         // 2005.01.-02. => 2005.01.-2005.02. | ||||
|         if ($inputLength === 12 && \preg_match("/^[0-9]{4}\.[0-1][0-9]\.\-[0-1][0-9]\.$/", $datum)) { | ||||
|             $reconstituted = substr($datum, 0, 8) . '-'; | ||||
|             $reconstituted .= substr($datum, 0, 4) . '.' . substr($datum, -3); | ||||
|             return $reconstituted; | ||||
|         } | ||||
|  | ||||
|         // Hungarian year and month until month without a dot after the first YYYY-MM | ||||
|         // 2005.01-02. => 2005.01.-2005.02. | ||||
|         if (in_array($inputLength, [10, 11], true) && \preg_match("/^[0-9]{4}\.[0-1][0-9]\-[0-1][0-9](\.|)$/", $datum)) { | ||||
|             $reconstituted = substr($datum, 0, 7) . '.-'; | ||||
|             $reconstituted .= substr($datum, 0, 4) . '.' . substr(rtrim($datum, '.'), -2) . '.'; | ||||
|             return $reconstituted; | ||||
|         } | ||||
|  | ||||
|         // Hungarian year, month and day until month and day of the same year | ||||
|         // 2005.01.01.-02.02. => 2005.01.01.-2005.02.02. | ||||
|         // 2005.01.01-02.02 => 2005.01.01.-2005.02.02. | ||||
|         if ($inputLength >= 16 && $inputLength <= 18 && \preg_match("/^[0-9]{4}\.[0-1][0-9]\.[0-3][0-9](\.|)\-[0-1][0-9]\.[0-3][0-9](\.|)$/", $datum)) { | ||||
|             $parts = explode('-', $datum); | ||||
|             if (count($parts) !== 2) return ''; | ||||
|             $reconstituted = substr($datum, 0, 10) . '.-'; | ||||
|             $reconstituted .= substr($datum, 0, 4) . '.' . rtrim($parts[1], '.') . '.'; | ||||
|             return $reconstituted; | ||||
|         } | ||||
|  | ||||
|         // Hungarian year, month and day until day of the same month (dots after the days are optional) | ||||
|         // 2005.01.01-02 => 2005.01.01.-2005.01.02 | ||||
|         if ($inputLength >= 13 && $inputLength <= 15 && \preg_match("/^[0-9]{4}\.[0-1][0-9]\.[0-3][0-9](\.|)\-[0-3][0-9](\.|)$/", $datum)) { | ||||
|             $parts = explode('-', $datum); | ||||
|             if (count($parts) !== 2) return ''; | ||||
|             $reconstituted = substr($datum, 0, 10) . '.-'; | ||||
|             $reconstituted .= substr($datum, 0, 7) . '.' . substr(rtrim($parts[1], '.'), -2); | ||||
|             return $reconstituted; | ||||
|         } | ||||
|  | ||||
|         // 17-19. Jahrhundert => 1601-19. Jahrhundert | ||||
|         // The start century is turned into its first year, the end century is kept as a century name. | ||||
|         if (\preg_match("/^[0-9]{2}(\.|)\-[0-9]{2}(\.|)" . self::REGEX_CENTURIES . "$/", $datum)) { | ||||
|             $parts = explode('-', $datum); | ||||
|             $reconstituted  = ((int)substr($parts[0] ?? "", 0, 2) - 1) . '01-'; | ||||
|             $reconstituted .= substr($parts[1] ?? "", 0, 2) . '. Jahrhundert'; | ||||
|             return $reconstituted; | ||||
|         } | ||||
|  | ||||
|         // 1950-60 as évek => 1950-60er Jahre | ||||
|         // The hyphenated spelling (1950-60-as évek) first passes through the | ||||
|         // hyphen-to-space rule below and then reaches this rule on the next attempt. | ||||
|         if (\preg_match("/^[0-9]{4}\-[0-9]{2} (a|e)s évek$/", $datum)) { | ||||
|             $reconstituted  = substr($datum, 0, 4) . '-'; | ||||
|             $reconstituted .= substr($datum, 5, 2) . 'er Jahre'; | ||||
|             return $reconstituted; | ||||
|         } | ||||
|  | ||||
|         // If es évek / as évek is contained in the string (e.g. 1880-1990-es évek), there | ||||
|         // will be more than one hyphens | ||||
|         if (MD_STD::stri_contains_any($datum, ['-as évek', '-es évek'])) { | ||||
|             $decadeRewrite = strtr($datum, ['-as évek' => ' as évek', '-es évek' => ' es évek']); | ||||
|             // strtr() is case-sensitive while the check above is presumably case-insensitive | ||||
|             // (TODO confirm against MD_STD). Only report a rewrite if the string actually | ||||
|             // changed; returning the unchanged input would make the caller retry forever. | ||||
|             if ($decadeRewrite !== $datum) { | ||||
|                 return $decadeRewrite; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // 1981. július-augusztus > 1981.07-08 | ||||
|         if (is_numeric(substr($datum, 0, 4)) && substr($datum, 4, 2) === '. ') { | ||||
|  | ||||
|             // Build a lookup of month name => month number across all supported languages. | ||||
|             $monthNames = []; | ||||
|             foreach (self::MONTH_NAMES_ENGLISH as $month => $names) { | ||||
|                 foreach ($names as $name) $monthNames[$name] = $month; | ||||
|             } | ||||
|             foreach (self::MONTH_NAMES_GERMAN as $month => $names) { | ||||
|                 foreach ($names as $name) $monthNames[$name] = $month; | ||||
|             } | ||||
|             foreach (self::MONTH_NAMES_HUNGARIAN as $month => $names) { | ||||
|                 foreach ($names as $name) $monthNames[$name] = $month; | ||||
|             } | ||||
|  | ||||
|             $rewrite = strtr($datum, $monthNames); | ||||
|             if ($rewrite !== $datum) { | ||||
|                 // Spaces become dots; collapse the double dot this creates after the year. | ||||
|                 return str_replace('..', '.', str_replace(" ", ".", $rewrite)); | ||||
|             } | ||||
|  | ||||
|         } | ||||
|  | ||||
|         // A comma is treated as a from-till separator: replace it with a hyphen. | ||||
|         if (str_contains($datum, ',')) { | ||||
|             return str_replace(',', '-', $datum); | ||||
|         } | ||||
|  | ||||
|         return ''; | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Checks if the string is a time span with given start and end dates. | ||||
|      * | ||||
| @@ -1062,6 +1197,9 @@ final class NodaTimeSplitter { | ||||
|         if (strlen($datum) === 9 and substr($datum, 4, 1) !== '-') return []; | ||||
|  | ||||
|         if (empty($startEnd = self::check_is_timespan_from_till($datum))) { | ||||
|             if ($rewritten = self::_attempt_rewriting_special_cases_from_till($datum)) { | ||||
|                 return self::attempt_splitting_from_till($rewritten); | ||||
|             } | ||||
|             return []; | ||||
|         } | ||||
|         list($start, $end) = $startEnd; | ||||
| @@ -1105,38 +1243,76 @@ final class NodaTimeSplitter { | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Wrapper to check if any splitting command works. | ||||
|      * Cleans invalid outputs from splitting. | ||||
|      * | ||||
|      * @param string $datum Input date. | ||||
|      * @param array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} $moda Split time to check. | ||||
|      * | ||||
|      * @return array<string> | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function attempt_splitting(string $datum):array { | ||||
|     private static function validate_split_time(array $moda):array { | ||||
|  | ||||
|         $moda = NodaTimeSplitter::is_timespan($datum); | ||||
|         if (!$moda) { | ||||
|             $moda = NodaTimeSplitter::is_incomplete_date($datum); | ||||
|         } | ||||
|         if (!$moda) { | ||||
|             $moda = NodaTimeSplitter::is_valid_date($datum); | ||||
|         } | ||||
|         if (!$moda) { | ||||
|             $moda = NodaTimeSplitter::is_valid_date_hungarian($datum); | ||||
|         } | ||||
|         if (!$moda) { | ||||
|             $moda = NodaTimeSplitter::is_century($datum); | ||||
|         } | ||||
|         if (!$moda) { | ||||
|             $moda = NodaTimeSplitter::is_decade($datum); | ||||
|         if (empty($moda)) return []; | ||||
|  | ||||
|         if ((int)$moda[2] > 12 || (int)$moda[3] > 31) { | ||||
|             return []; | ||||
|         } | ||||
|  | ||||
|         if (!empty($moda)) { | ||||
|             if ((int)$moda[2] > 12 || (int)$moda[3] > 31) { | ||||
|                 return []; | ||||
|             } | ||||
|  | ||||
|         $month_no_zero = strtr($moda[2], ["0" => "", "1" => "", "2" => "", "3" => "", "4" => "", "5" => "", "6" => "", "7" => "", "8" => "", "9" => ""]); | ||||
|         $day_no_zero = strtr($moda[3], ["0" => "", "1" => "", "2" => "", "3" => "", "4" => "", "5" => "", "6" => "", "7" => "", "8" => "", "9" => ""]); | ||||
|         if (!empty($month_no_zero)) { | ||||
|             throw new Exception("Invalid split month: " . var_export($moda, true)); | ||||
|         } | ||||
|         if (!empty($day_no_zero)) { | ||||
|             throw new Exception("Invalid split day: " . var_export($moda, true)); | ||||
|         } | ||||
|  | ||||
|         return $moda; | ||||
|  | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Wrapper to check if any splitting command works. | ||||
|      * | ||||
|      * The parsers are tried in a fixed order. The first non-empty result is passed | ||||
|      * through self::validate_split_time() and returned. If none matches and the input | ||||
|      * is purely numeric once spaces and dots are removed (e.g. "2015. 05."), the | ||||
|      * parsers are retried on the input with its spaces removed. | ||||
|      * | ||||
|      * @param string $datum Input date. | ||||
|      * | ||||
|      * @return array{0: string, 1: string, 2: string, 3: string, 4: '+'|'-'|'', 5: string}|array{} | ||||
|      */ | ||||
|     public static function attempt_splitting(string $datum):array { | ||||
|  | ||||
|         if (!empty($moda = self::is_timespan($datum))) { | ||||
|             return self::validate_split_time($moda); | ||||
|         } | ||||
|  | ||||
|         if (!empty($moda = self::is_incomplete_date($datum))) { | ||||
|             return self::validate_split_time($moda); | ||||
|         } | ||||
|  | ||||
|         if (!empty($moda = self::is_valid_date($datum))) { | ||||
|             return self::validate_split_time($moda); | ||||
|         } | ||||
|  | ||||
|         if (!empty($moda = self::is_valid_date_hungarian($datum))) { | ||||
|             return self::validate_split_time($moda); | ||||
|         } | ||||
|  | ||||
|         if (!empty($moda = self::is_century($datum))) { | ||||
|             return self::validate_split_time($moda); | ||||
|         } | ||||
|  | ||||
|         if (!empty($moda = self::is_decade($datum))) { | ||||
|             return self::validate_split_time($moda); | ||||
|         } | ||||
|  | ||||
|         // 2015. 05. | ||||
|         if (str_contains($datum, ' ')) { | ||||
|             $rewrite = str_replace(' ', '', $datum); | ||||
|             // Test the space-stripped string: the raw input still contains the | ||||
|             // space, so "2015 05" would never count as numeric. | ||||
|             if (is_numeric(str_replace('.', '', $rewrite))) { | ||||
|                 // $rewrite contains no spaces, so this recursion ends after one level. | ||||
|                 return self::attempt_splitting($rewrite); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         return []; | ||||
|  | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user