Compare commits

2 Commits

Author SHA1 Message Date
06ef0932d2 Make link checker in MD_STD_RSS_TEST non-final 2026-01-31 14:30:25 +01:00
68e1221215 Accept remote statements on content type if the file to check is large
and the domain is whitelisted
2025-12-09 12:11:00 +01:00
2 changed files with 60 additions and 5 deletions

View File

@@ -775,13 +775,55 @@ final class MD_STD {
throw new Exception("Failed to get temporary file location");
}
$fp = \fopen($tmp_file, 'w');
// Check remote headers
if (!($ch = \curl_init($url))) {
if (!($ch_headers = \curl_init())) {
throw new Exception("Failed to initialize curl for $url");
};
\curl_setopt($ch, CURLOPT_FILE, $fp);
\curl_setopt_array($ch_headers, [
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_CONNECTTIMEOUT_MS => 10000,
CURLOPT_TIMEOUT_MS => 10000,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_AUTOREFERER => true,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0,
CURLOPT_TCP_FASTOPEN => true,
CURLOPT_HEADER => true,
CURLOPT_NOBODY => true,
]);
\curl_exec($ch_headers);
// If the content length is too high and the request went out to a trusted source,
// return the content type as stated by the remote server.
if (curl_getinfo($ch_headers, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T) > 2000000) {
if (($url_parsed = parse_url($url)) && str_contains($url_parsed['host'], 'archive.org') && !empty($remote_content_type = curl_getinfo($ch_headers, CURLINFO_CONTENT_TYPE))) {
return $remote_content_type;
}
}
// Validate that the remote file really is of the correct content type.
$fp = \fopen($tmp_file, 'w');
if (!($ch = \curl_init())) {
throw new Exception("Failed to initialize curl for $url");
};
\curl_setopt_array($ch, [
CURLOPT_URL => $url,
CURLOPT_CONNECTTIMEOUT_MS => 10000,
CURLOPT_TIMEOUT_MS => 10000,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_AUTOREFERER => true,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_2_0,
CURLOPT_TCP_FASTOPEN => true,
CURLOPT_FILE => $fp,
]);
\curl_exec($ch);
$mime_type = MD_STD::mime_content_type($tmp_file);

View File

@@ -25,9 +25,22 @@ abstract class MD_STD_RSS_TEST extends TestCase {
$domDoc = new DomDocument();
self::assertTrue($domDoc->loadXML($this->feed));
self::assertTrue($domDoc->schemaValidate(__DIR__ . "/../../assets/xsd/Rss2.xsd"));
// Validate against the schema, but ignore any validation message that only
// reports an unexpected <updated/> element, which the schema does not specify.
libxml_use_internal_errors(true);
$domDoc->schemaValidate(__DIR__ . "/../../assets/xsd/Rss2.xsd");
$errors = libxml_get_errors();
unset($domDoc);
foreach ($errors as $e) {
if (str_contains((string)$e->message, '\'updated\': This element is not expected')) {
continue;
}
throw new Exception($e->message);
}
}
/**
@@ -35,7 +48,7 @@ abstract class MD_STD_RSS_TEST extends TestCase {
*
* @return void
*/
final public function testRssFeedLinksAndEnclosure() {
public function testRssFeedLinksAndEnclosure() {
if (!($xmlData = simplexml_load_string($this->feed))) {
throw new Exception("Failed to load RSS feed string to SimpleXML element");