This DTO (data transfer object) holds information about an article. DTOs contain only data fields — they do not contain any application logic.
<?php
namespace spokenWikipedia;
/**
* Bean class that holds data on a single article.
* @author Michael Angstadt
*/
class Article{
/**
* The title of the article.
* @var string
*/
public $title;
/**
* The URL of the article.
* (Note: the screen-scraper populates this with the URL of the article's *audio recording*, not the article page itself.)
* @var string
*/
public $url;
/**
* The dates that the article has been updated with an audio recording.
* Expected to be in ascending order (the marshaller sorts them that way when scraping).
* @var array(DateTime)
*/
public $dates = array();
/**
* Whether or not the article is a featured article.
* @var boolean
*/
public $featured;
} This class is a collection of static methods that scrape the HTML of my Wikipedia user page, as well as save and load the scraped data to the XML cache file. I put this code in its own class to make it easy to unit test.
<?php
namespace spokenWikipedia;
use \DateTime;
use \DOMDocument;
use \DOMXPath;
/**
* Marshalls the information on my Spoken Wikipedia articles to/from various formats.
* @author Michael Angstadt
*/
class SpokenWikipediaMarshaller{
	/**
	 * Screen-scrapes the data from my Wikipedia user page.
	 * @param string $url the URL of the website (or path to a file)
	 * @return SpokenWikipedia the screen-scraped data (empty if the page could not be loaded)
	 */
	public static function loadFromWebsite($url){
		$spokenWikipedia = new SpokenWikipedia();

		//load the HTML into a DOM
		$html = file_get_contents($url);
		if ($html === false){
			//the page could not be loaded--return an empty result instead of trying to parse nothing
			return $spokenWikipedia;
		}
		$dom = new DOMDocument();
		@$dom->loadHTML($html); //"@" suppresses warnings caused by malformed HTML
		$xpath = new DOMXPath($dom);

		//article information is within <table> elements
		//each article occupies a pair of <td> cells: the title/audio cell, then the update-dates cell
		$cellNodes = $xpath->query('//table[@class="wikitable articlestable"]/tbody/tr/td');
		for ($i = 0; $i < $cellNodes->length; $i += 2){
			$article = new Article();
			$titleNode = $cellNodes->item($i);

			//get the title of the article from the first link
			$linkNodes = $xpath->query('.//a', $titleNode);
			if ($linkNodes->length > 0){
				$n = $linkNodes->item(0);
				//a featured article's "star" icon is wrapped in a link whose "title" attribute
				//is "Featured article", so skip it and use the next link
				//(guard against links that have no "title" attribute at all)
				$titleAttr = $n->attributes->getNamedItem('title');
				if ($titleAttr !== null && $titleAttr->value == 'Featured article'){
					$n = ($linkNodes->length > 1) ? $linkNodes->item(1) : null;
				}
				if ($n !== null){
					$article->title = $n->textContent;
				}
			}

			//get the link to the sound file
			//example markup: <audio ...><source src="//upload.wikimedia.org/wikipedia/commons/1/1e/HTTP_cookie.ogg" ...></source>...</audio>
			$sourceNodes = $xpath->query('.//audio/source', $titleNode);
			if ($sourceNodes->length > 0){
				$srcAttr = $sourceNodes->item(0)->attributes->getNamedItem('src');
				if ($srcAttr !== null){
					//the "src" attribute is protocol-relative (it starts with "//")
					$article->url = 'http:' . $srcAttr->value;
				}
			}

			//is the article a featured article?
			//the title is bolded if it is featured
			$boldNodes = $xpath->query('.//b', $titleNode);
			$article->featured = ($boldNodes->length > 0);

			//get the update date(s) for the article
			$updatedNode = $cellNodes->item($i + 1);
			$dateNodes = $xpath->query('del | p', $updatedNode);
			if ($dateNodes->length > 0){
				//if there are multiple dates, then they are nested within <del> and <p> tags
				foreach ($dateNodes as $n){
					$article->dates[] = new DateTime($n->textContent);
				}

				//make sure the dates are in ascending order
				//they are already in this order on the page, but the XPath "|" operator
				//does not necessarily return the combined node sets in a guaranteed order
				usort($article->dates, function($a, $b){
					//null dates should go at the end
					if ($a == null){
						return 1;
					}
					if ($b == null){
						return -1;
					}
					return $a->getTimestamp() - $b->getTimestamp();
				});
			} else {
				//if there is only one date, then it is nested directly inside the <td> tag
				$article->dates[] = new DateTime($updatedNode->textContent);
			}

			$spokenWikipedia->articles[] = $article;
		}
		return $spokenWikipedia;
	}

	/**
	 * Loads the data from an XML cache file.
	 * @param string $filePath the path to the XML file
	 * @return SpokenWikipedia the cached data (empty if the file could not be parsed)
	 */
	public static function loadFromXml($filePath){
		$spokenWikipedia = new SpokenWikipedia();
		$xml = simplexml_load_file($filePath);
		if ($xml === false){
			//the file is missing or is not well-formed XML--return an empty result
			return $spokenWikipedia;
		}
		foreach ($xml->article as $xmlArticle){
			$article = new Article();
			$article->title = (string)$xmlArticle['title'];
			$article->url = (string)$xmlArticle['url'];
			$article->featured = (string)$xmlArticle['featured'] == 'true';
			foreach ($xmlArticle->date as $xmlDate){
				$article->dates[] = new DateTime((string)$xmlDate);
			}
			$spokenWikipedia->articles[] = $article;
		}
		return $spokenWikipedia;
	}

	/**
	 * Saves the data to an XML cache file.
	 * @param SpokenWikipedia $spokenWikipedia the data to save
	 * @param string $filePath the path to the XML file
	 */
	public static function saveToXml($spokenWikipedia, $filePath){
		//create the XML document
		$xml = simplexml_load_string('<spokenWikipedia/>');
		foreach ($spokenWikipedia->articles as $article){
			$node = $xml->addChild('article');
			$node->addAttribute('url', $article->url);
			$node->addAttribute('title', $article->title);
			$node->addAttribute('featured', $article->featured ? 'true' : 'false');
			foreach ($article->dates as $date){
				//only the date portion is significant, so the time of day is not saved
				$node->addChild('date', $date->format('Y-m-d'));
			}
		}

		//save the XML to disk
		$xml->asXML($filePath);
	}
}
The unit test class for the SpokenWikipediaMarshaller class. It uses a copy of my Wikipedia user page, which I saved to a file. The test uses this saved file instead of loading the page from Wikipedia because I don't want my unit test to fail if I change my Wikipedia user page (or if my Internet connection is down). I also test the caching functionality by saving the cache file, loading the cache file, and then comparing the data I saved with the data I loaded to make sure everything is the same.
<?php
namespace spokenWikipedia;
use \DateTime;
/**
* Tests the SpokenWikipediaMarshaller class.
* @author mangst
*/
class SpokenWikipediaMarshallerTest extends \PHPUnit_Framework_TestCase{
	/**
	 * Tests the screen-scraping against a saved copy of the Wikipedia user page.
	 */
	public function testLoadFromWebsite(){
		$result = SpokenWikipediaMarshaller::loadFromWebsite(__DIR__ . '/sample.html');
		$articles = $result->articles;

		//each entry: url, title, featured flag, update dates (ascending)
		$expected = array(
			//three dates
			//example: <td><del>4/6/2009</del> <p><del>1/24/2010</del></p> <p>9/26/2010</p> </td>"
			array('http://upload.wikimedia.org/wikipedia/commons/1/1e/HTTP_cookie.ogg', 'HTTP cookie', false, array('2009-04-06', '2010-01-24', '2010-09-26')),
			//two dates
			//example: "<td><del>1/3/2010</del> <p>10/18/2010</p> </td>"
			array('http://upload.wikimedia.org/wikipedia/commons/5/52/Iapetus_%28moon%29.ogg', 'Iapetus (moon)', false, array('2010-01-03', '2010-10-18')),
			//featured article
			array('http://upload.wikimedia.org/wikipedia/commons/c/c6/Saturn.ogg', 'Saturn', true, array('2010-02-10', '2010-10-10')),
			//one date
			//example: "<td>1/31/2009</td>"
			array('http://upload.wikimedia.org/wikipedia/commons/0/09/Battle_of_the_Somme.ogg', 'Battle of the Somme', false, array('2009-01-31')),
			//escaped character in the title
			array('http://upload.wikimedia.org/wikipedia/commons/f/fa/ToeJam_and_Earl.ogg', 'ToeJam & Earl', false, array('2009-11-14')),
		);

		$this->assertEquals(count($expected), count($articles));
		foreach ($expected as $index => $row){
			list($url, $title, $featured, $dates) = $row;
			$article = $articles[$index];
			$this->assertEquals($url, $article->url);
			$this->assertEquals($title, $article->title);
			$this->assertEquals($featured, $article->featured);
			$this->assertEquals(count($dates), count($article->dates));
			foreach ($dates as $j => $dateStr){
				$this->assertEquals(strtotime($dateStr), $article->dates[$j]->getTimestamp());
			}
		}
	}

	/**
	 * Saves an object to a file, then loads it back from the file and verifies
	 * that the round trip preserves all the data.
	 */
	public function testSaveAndLoadFromXml(){
		$orig = new SpokenWikipedia();
		$orig->articles[] = $this->buildArticle('Zero Dates', 'http://en.wikipedia.org/wiki/Zero_Dates', false, array());
		$orig->articles[] = $this->buildArticle('One Date', 'http://en.wikipedia.org/wiki/One_Date', false, array('2009-01-31'));
		$orig->articles[] = $this->buildArticle('Two Dates', 'http://en.wikipedia.org/wiki/Two_Dates', false, array('2010-01-03', '2010-10-18'));
		$orig->articles[] = $this->buildArticle('Featured', 'http://en.wikipedia.org/wiki/Featured', true, array());
		$orig->articles[] = $this->buildArticle('Escaped & Character', 'http://en.wikipedia.org/wiki/Escaped_%26_Character', false, array());

		$file = tempnam(sys_get_temp_dir(), 'mangst');
		SpokenWikipediaMarshaller::saveToXml($orig, $file);
		$loaded = SpokenWikipediaMarshaller::loadFromXml($file);
		unlink($file);

		$this->assertEquals(count($orig->articles), count($loaded->articles));
		foreach ($orig->articles as $i => $origArticle){
			$loadedArticle = $loaded->articles[$i];
			$this->assertEquals($origArticle->title, $loadedArticle->title);
			$this->assertEquals($origArticle->url, $loadedArticle->url);
			$this->assertEquals($origArticle->featured, $loadedArticle->featured);
			$this->assertEquals(count($origArticle->dates), count($loadedArticle->dates));
			foreach ($origArticle->dates as $j => $origDate){
				$this->assertEquals($origDate->getTimestamp(), $loadedArticle->dates[$j]->getTimestamp());
			}
		}
	}

	/**
	 * Builds an Article object for testing.
	 * @param string $title the article title
	 * @param string $url the article URL
	 * @param boolean $featured whether the article is featured
	 * @param array(string) $dates the update dates, as date strings
	 * @return Article the article
	 */
	private function buildArticle($title, $url, $featured, array $dates){
		$article = new Article();
		$article->title = $title;
		$article->url = $url;
		$article->featured = $featured;
		foreach ($dates as $date){
			$article->dates[] = new DateTime($date);
		}
		return $article;
	}
}
This is a DTO (data transfer object) that contains the list of Articles from my Wikipedia user page. This class doesn't do much — I created it in case I need to scrape more information from my user page at some point in the future.
<?php
namespace spokenWikipedia;
/**
* Bean class that holds the scraped data.
* @author Michael Angstadt
*/
class SpokenWikipedia{
/**
* Contains all the articles that were scraped from the Wikipedia user page.
* @var array(Article)
*/
public $articles = array();
} This DAO (data access object) pulls data from the cache or refreshes the cache if the cache is stale.
<?php
namespace spokenWikipedia;
/**
* Makes use of a cache to save the scraped data to instead of scraping the website every time data is requested.
* Refreshes the cache every so often by re-scraping the page.
* @author Michael Angstadt
*/
class SpokenWikipediaCacheDao extends SpokenWikipediaNoCacheDao{
	/**
	 * The path to the cache file.
	 * @var string
	 */
	private $cachePath;

	/**
	 * The number of seconds old the cache file is allowed to be before it is refreshed.
	 * @var integer
	 */
	private $refreshRate;

	/**
	 * Constructor.
	 * @param string $url the URL of the website
	 * @param string $cachePath the path to the cache file
	 * @param integer $refreshRate the number of seconds old the cache file is allowed to be before it is refreshed
	 */
	public function __construct($url, $cachePath, $refreshRate){
		parent::__construct($url);
		$this->cachePath = $cachePath;
		$this->refreshRate = $refreshRate;
	}

	/**
	 * Gets the list of all recorded Wikipedia articles in no particular order.
	 * Loads the data from the cache file if the file exists and is fresh; otherwise,
	 * re-scrapes the website and rebuilds the cache.
	 * (Overrides SpokenWikipediaNoCacheDao::getArticles.)
	 * @return array(Article) the articles
	 */
	public function getArticles(){
		if ($this->isCacheFresh()){
			$spokenWikipedia = SpokenWikipediaMarshaller::loadFromXml($this->cachePath);
		} else {
			//(re)build the cache, in case any changes were made to the website
			$spokenWikipedia = SpokenWikipediaMarshaller::loadFromWebsite($this->url);
			SpokenWikipediaMarshaller::saveToXml($spokenWikipedia, $this->cachePath);
		}
		return $spokenWikipedia->articles;
	}

	/**
	 * Determines whether the cache file exists and was modified recently enough to be used.
	 * @return boolean true if the cached data can be used, false if the cache must be (re)built
	 */
	private function isCacheFresh(){
		if (!file_exists($this->cachePath)){
			return false;
		}
		//determine how long ago the file was last modified
		$age = time() - filemtime($this->cachePath);
		return $age <= $this->refreshRate;
	}
}
This DAO (data access object) always scrapes its data from the Wikipedia page and never uses a cache.
<?php
namespace spokenWikipedia;
/**
* Screen-scrapes the Wikipedia page every time data is requested.
* @author Michael Angstadt
*/
class SpokenWikipediaNoCacheDao implements SpokenWikipediaDao{
/**
* The URL of the Wikipedia page.
* @var string
*/
protected $url;
/**
* Constructor.
* @param string $url the URL of the webpage to load
*/
public function __construct($url){
$this->url = $url;
//Wikipedia requires that all requests contain a User-Agent header, or else a HTTP 403 response will be returned
//see http://meta.wikimedia.org/wiki/User-Agent_policy
//NOTE(review): this changes a process-wide ini setting as a constructor side effect,
//so it affects every subsequent HTTP request made through PHP's stream functions, not just this object's
ini_set('user_agent', 'Michael Angstadt\'s Cool Screen-Scraper (+http://www.mangst.com)');
}
/**
* Gets the list of all recorded Wikipedia articles in no particular order.
* The webpage is re-scraped on every call (no caching).
* @return array(Article) the articles
*/
public function getArticles(){
return SpokenWikipediaMarshaller::loadFromWebsite($this->url)->articles;
}
/**
* Gets the list of all recorded Wikipedia articles, sorted by each article's most recent
* update date (newest first).  Articles with no dates are placed at the end.
* NOTE(review): assumes each article's "dates" array is in ascending order (the marshaller
* sorts it that way), so the last element is the latest update--confirm if the data can
* come from another source.
* @return array(Article) the articles
*/
public function getArticlesSortedByLatestUpdated(){
$articles = $this->getArticles();
usort($articles, function ($a, $b){
//articles with no dates go at the end
if (count($a->dates) == 0){
return 1;
}
if (count($b->dates) == 0){
return -1;
}
//compare each article's latest (last) date, in descending order
$dateA = $a->dates[count($a->dates)-1];
$dateB = $b->dates[count($b->dates)-1];
return $dateB->getTimestamp() - $dateA->getTimestamp();
});
return $articles;
}
} This interface defines the methods that each DAO implementation must implement. By having an interface like this, I can create additional DAO implementations that get their data from differences sources. For example, I could create a database DAO that pulls my Wikipedia information from a database. Or, I could create a mock implementation for testing purposes that just returns hard-coded data.
<?php
namespace spokenWikipedia;
/**
* Defines the interface for screen-scraping my Wikipedia user page.
* @author Michael Angstadt
*/
interface SpokenWikipediaDao{
	/**
	 * Gets every recorded Wikipedia article, in no particular order.
	 * @return array(Article) the articles
	 */
	public function getArticles();

	/**
	 * Gets every recorded Wikipedia article, ordered by the last time it was updated.
	 * @return array(Article) the articles
	 */
	public function getArticlesSortedByLatestUpdated();
}