From 88458df949866b985c1f98dfbfce2ad321ab556a Mon Sep 17 00:00:00 2001 From: Joshua Ramon Enslin Date: Fri, 18 Aug 2023 15:09:58 +0200 Subject: [PATCH] Add general abstract classes for tests, starting with test classes for RSS feeds --- assets/xsd/Rss2.xsd | 500 ++++++++++++++++++++++++++++++++ src/MD_STD.php | 38 +++ src/testing/MD_STD_RSS_TEST.php | 60 ++++ 3 files changed, 598 insertions(+) create mode 100644 assets/xsd/Rss2.xsd create mode 100644 src/testing/MD_STD_RSS_TEST.php diff --git a/assets/xsd/Rss2.xsd b/assets/xsd/Rss2.xsd new file mode 100644 index 0000000..cbcff4b --- /dev/null +++ b/assets/xsd/Rss2.xsd @@ -0,0 +1,500 @@ + + + + + XML Schema for RSS v2.0 feed files. + Project home: http://www.codeplex.com/rss2schema/ + Based on the RSS 2.0 specification document at http://cyber.law.harvard.edu/rss/rss.html + Author: Jorgen Thelin + Revision: 16 + Date: 01-Nov-2008 + Feedback to: http://www.codeplex.com/rss2schema/WorkItem/List.aspx + + + + + + + + + + + + + + An item may represent a "story" -- much like a story in a newspaper or magazine; if so its description is a synopsis of the story, and the link points to the full story. An item may also be complete in itself, if so, the description contains the text (entity-encoded HTML is allowed), and the link and title may be omitted. + + + + + + The title of the item. + + + + + The item synopsis. + + + + + The URL of the item. + + + + + Email address of the author of the item. + + + + + Includes the item in one or more categories. + + + + + URL of a page for comments relating to the item. + + + + + Describes a media object that is attached to the item. + + + + + guid or permalink URL for this entry + + + + + Indicates when the item was published. + + + + + The RSS channel that the item came from. + + + + + Extensibility element. + + + + + + + + + + + + The name of the channel. It's how people refer to your service. 
If you have an HTML website that contains the same information as your RSS file, the title of your channel should be the same as the title of your website. + + + + + The URL to the HTML website corresponding to the channel. + + + + + Phrase or sentence describing the channel. + + + + + The language the channel is written in. This allows aggregators to group all Italian language sites, for example, on a single page. A list of allowable values for this element, as provided by Netscape, is here. You may also use values defined by the W3C. + + + + + Copyright notice for content in the channel. + + + + + Email address for person responsible for editorial content. + + + + + Email address for person responsible for technical issues relating to channel. + + + + + The publication date for the content in the channel. All date-times in RSS conform to the Date and Time Specification of RFC 822, with the exception that the year may be expressed with two characters or four characters (four preferred). + + + + + The last time the content of the channel changed. + + + + + Specify one or more categories that the channel belongs to. + + + + + A string indicating the program used to generate the channel. + + + + + A URL that points to the documentation for the format used in the RSS file. It's probably a pointer to this page. It's for people who might stumble across an RSS file on a Web server 25 years from now and wonder what it is. + + + + + Allows processes to register with a cloud to be notified of updates to the channel, implementing a lightweight publish-subscribe protocol for RSS feeds. + + + + + ttl stands for time to live. It's a number of minutes that indicates how long a channel can be cached before refreshing from the source. + + + + + Specifies a GIF, JPEG or PNG image that can be displayed with the channel. + + + + + The PICS rating for the channel. + + + + + Specifies a text input box that can be displayed with the channel. 
+ + + + + A hint for aggregators telling them which hours they can skip. + + + + + A hint for aggregators telling them which days they can skip. + + + + + Extensibility element. + + + + + + + + + Extensibility element. + + + + + + + + A time in GMT when aggregators should not request the channel data. The hour beginning at midnight is hour zero. + + + + + + + + + + + + + + A day when aggregators should not request the channel data. + + + + + + + + + + + + + + + + A time in GMT, when aggregators should not request the channel data. The hour beginning at midnight is hour zero. + + + + + + + + + + + + + + + + The URL of the image file. + + + + + Describes the image, it's used in the ALT attribute of the HTML <img> tag when the channel is rendered in HTML. + + + + + The URL of the site, when the channel is rendered, the image is a link to the site. (Note, in practice the image <title> and <link> should have the same value as the channel's <title> and <link>. + + + + + The width of the image in pixels. + + + + + The height of the image in pixels. + + + + + Text that is included in the TITLE attribute of the link formed around the image in the HTML rendering. + + + + + + + The height of the image in pixels. + + + + + + + + The width of the image in pixels. + + + + + + + + Specifies a web service that supports the rssCloud interface which can be implemented in HTTP-POST, XML-RPC or SOAP 1.1. Its purpose is to allow processes to register with a cloud to be notified of updates to the channel, implementing a lightweight publish-subscribe protocol for RSS feeds. + + + + + + + + + + + + + + + + + The purpose of this element is something of a mystery! You can use it to specify a search engine box. Or to allow a reader to provide feedback. Most aggregators ignore it. + + + + + The label of the Submit button in the text input area. + + + + + Explains the text input area. + + + + + The name of the text object in the text input area. 
/**
 * Check if a URL is reachable (HTTP 200) using curl.
 *
 * Sends a body-less request (CURLOPT_NOBODY), following at most one
 * redirect, and reports whether the resulting status code is exactly 200.
 * A transfer that fails (timeout, connection error, ...) does not yield
 * status 200 and therefore results in `false` rather than an exception.
 *
 * @param string $url URL to validate.
 *
 * @throws MDInvalidUrl If $url does not pass FILTER_VALIDATE_URL.
 *
 * @return bool
 */
public static function checkUrlIsReachable(string $url):bool {

    if (filter_var($url, FILTER_VALIDATE_URL) === false) {
        throw new MDInvalidUrl("URL to check (" . $url . ") does not seem to be a valid URL");
    }

    // curl_init() returns false if no handle could be created. Without a
    // handle nothing can be probed, so the URL counts as unreachable.
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_ENCODING       => "",
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 1,
        // Only the status is of interest, so skip downloading the body.
        CURLOPT_NOBODY         => true,
        CURLOPT_TIMEOUT        => 5,
        // It's very important to let other webmasters know who's probing their servers.
        // NOTE(review): the user agent below imitates an old Firefox release
        // and does not identify this tool - consider an identifying string.
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2) Gecko/20100101 Firefox/10.0.2',
        CURLOPT_HTTP_VERSION   => CURL_HTTP_VERSION_2_0,
        CURLOPT_TCP_FASTOPEN   => true,
    ]);

    // The return value of curl_exec() is not needed: the status code read
    // below is the only thing evaluated.
    curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return $code === 200;

}
use PHPUnit\Framework\TestCase;

/**
 * Abstract base test case for RSS feeds.
 *
 * Provides ready-made tests that validate the feed XML against the bundled
 * RSS 2.0 schema and check that the URLs referenced by the channel image,
 * the first item and its enclosure are reachable.
 */
abstract class MD_STD_RSS_TEST extends TestCase {

    /**
     * Raw XML of the RSS feed under test. This class never assigns it, so it
     * is expected to be populated elsewhere before the tests run
     * (presumably by the extending test class - verify against subclasses).
     */
    protected string $feed;

    /**
     * Ensures that the feed is well-formed XML and validates against the
     * RSS 2.0 XSD shipped in assets/xsd.
     *
     * @return void
     */
    final public function testRssFeedValidity():void {

        $domDoc = new DOMDocument();
        self::assertTrue($domDoc->loadXML($this->feed));
        self::assertTrue($domDoc->schemaValidate(__DIR__ . "/../../assets/xsd/Rss2.xsd"));

    }

    /**
     * Checks for the availability of RSS feed links and enclosures.
     *
     * Only the channel image and the first item (with its enclosure) are probed.
     *
     * @return void
     */
    final public function testRssFeedLinksAndEnclosure():void {

        // Compare strictly against false: a successfully parsed element
        // without children evaluates to false in a boolean context.
        $xmlData = simplexml_load_string($this->feed);
        if ($xmlData === false) {
            throw new Exception("Failed to load RSS feed string to SimpleXML element");
        }

        $channel = $xmlData->channel;
        self::assertNotEmpty((string)$channel->title);

        $imageUrl = (string)$channel->image->url;
        self::assertTrue(MD_STD::checkUrlIsReachable($imageUrl), "Path " . $imageUrl . " does not appear to be a reachable URL");

        $imageLink = (string)$channel->image->link;
        self::assertTrue(MD_STD::checkUrlIsReachable($imageLink), "Path " . $imageLink . " does not appear to be a reachable URL");

        $itemLink = (string)$channel->item->link;
        self::assertTrue(MD_STD::checkUrlIsReachable($itemLink), "Path " . $itemLink . " does not appear to be a reachable URL");

        // SimpleXML hands back an empty element (not null) for a missing
        // child, so existence has to be checked using isset().
        if (!isset($channel->item->enclosure)) {
            throw new Exception("First item does not seem to have an enclosure");
        }
        $firstEntryImgAttr = $channel->item->enclosure->attributes();
        if ($firstEntryImgAttr === null) {
            throw new Exception("First enclosure does not seem to have attributes");
        }

        $enclosureUrl = (string)$firstEntryImgAttr->url;
        self::assertTrue(MD_STD::checkUrlIsReachable($enclosureUrl), "First enclosure does not appear to be a reachable URL (" . $enclosureUrl . ")");

    }
}