This DTO (data transfer object) holds information about an article. DTOs contain only data fields — they do not contain any application logic.
<?php
namespace spokenWikipedia;
/**
* Bean class that holds data on a single article.
* @author Michael Angstadt
*/
class Article{
/**
* The title of the article.
* @var string
*/
public $title;
/**
* The URL of the article.
* (Note: the screen-scraper populates this with the URL of the article's *audio recording*, not the article page itself.)
* @var string
*/
public $url;
/**
* The dates that the article has been updated with an audio recording.
* Expected to be in ascending order (the marshaller sorts them that way when scraping).
* @var array(DateTime)
*/
public $dates = array();
/**
* Whether or not the article is a featured article.
* @var boolean
*/
public $featured;
} This class is a collection of static methods that scrape the HTML of my Wikipedia user page, as well as save and load the scraped data to the XML cache file. I put this code in its own class to make it easy to unit test.
<?php
namespace spokenWikipedia;
use \DateTime;
use \DOMDocument;
use \DOMXPath;
/**
* Marshalls the information on my Spoken Wikipedia articles to/from various formats.
* @author Michael Angstadt
*/
class SpokenWikipediaMarshaller{
	/**
	 * Screen-scrapes the data from my Wikipedia user page.
	 * @param string $url the URL of the website (or path to a file)
	 * @return SpokenWikipedia the screen-scraped data (empty if the page could not be loaded)
	 */
	public static function loadFromWebsite($url){
		$spokenWikipedia = new SpokenWikipedia();

		//load the HTML into a DOM
		$html = file_get_contents($url);
		if ($html === false){
			//the page could not be loaded--return an empty result instead of trying to parse nothing
			return $spokenWikipedia;
		}
		$dom = new DOMDocument();
		@$dom->loadHTML($html); //"@" suppresses warnings caused by malformed HTML
		$xpath = new DOMXPath($dom);

		//article information is within <table> elements
		//each article occupies a pair of <td> cells: the title/audio cell, then the update-dates cell
		$cellNodes = $xpath->query('//table[@class="wikitable articlestable"]/tbody/tr/td');
		for ($i = 0; $i < $cellNodes->length; $i += 2){
			$article = new Article();
			$titleNode = $cellNodes->item($i);

			//get the title of the article from the first link
			$linkNodes = $xpath->query('.//a', $titleNode);
			if ($linkNodes->length > 0){
				$n = $linkNodes->item(0);
				//a featured article's "star" icon is wrapped in a link whose "title" attribute
				//is "Featured article", so skip it and use the next link
				//(guard against links that have no "title" attribute at all)
				$titleAttr = $n->attributes->getNamedItem('title');
				if ($titleAttr !== null && $titleAttr->value == 'Featured article'){
					$n = ($linkNodes->length > 1) ? $linkNodes->item(1) : null;
				}
				if ($n !== null){
					$article->title = $n->textContent;
				}
			}

			//get the link to the sound file
			//example markup: <audio ...><source src="//upload.wikimedia.org/wikipedia/commons/1/1e/HTTP_cookie.ogg" ...></source>...</audio>
			$sourceNodes = $xpath->query('.//audio/source', $titleNode);
			if ($sourceNodes->length > 0){
				$srcAttr = $sourceNodes->item(0)->attributes->getNamedItem('src');
				if ($srcAttr !== null){
					//the "src" attribute is protocol-relative (it starts with "//")
					$article->url = 'http:' . $srcAttr->value;
				}
			}

			//is the article a featured article?
			//the title is bolded if it is featured
			$boldNodes = $xpath->query('.//b', $titleNode);
			$article->featured = ($boldNodes->length > 0);

			//get the update date(s) for the article
			$updatedNode = $cellNodes->item($i + 1);
			$dateNodes = $xpath->query('del | p', $updatedNode);
			if ($dateNodes->length > 0){
				//if there are multiple dates, then they are nested within <del> and <p> tags
				foreach ($dateNodes as $n){
					$article->dates[] = new DateTime($n->textContent);
				}

				//make sure the dates are in ascending order
				//they are already in this order on the page, but the XPath "|" operator
				//does not necessarily return the combined node sets in a guaranteed order
				usort($article->dates, function($a, $b){
					//null dates should go at the end
					if ($a == null){
						return 1;
					}
					if ($b == null){
						return -1;
					}
					return $a->getTimestamp() - $b->getTimestamp();
				});
			} else {
				//if there is only one date, then it is nested directly inside the <td> tag
				$article->dates[] = new DateTime($updatedNode->textContent);
			}

			$spokenWikipedia->articles[] = $article;
		}
		return $spokenWikipedia;
	}

	/**
	 * Loads the data from an XML cache file.
	 * @param string $filePath the path to the XML file
	 * @return SpokenWikipedia the cached data (empty if the file could not be parsed)
	 */
	public static function loadFromXml($filePath){
		$spokenWikipedia = new SpokenWikipedia();
		$xml = simplexml_load_file($filePath);
		if ($xml === false){
			//the file is missing or is not well-formed XML--return an empty result
			return $spokenWikipedia;
		}
		foreach ($xml->article as $xmlArticle){
			$article = new Article();
			$article->title = (string)$xmlArticle['title'];
			$article->url = (string)$xmlArticle['url'];
			$article->featured = (string)$xmlArticle['featured'] == 'true';
			foreach ($xmlArticle->date as $xmlDate){
				$article->dates[] = new DateTime((string)$xmlDate);
			}
			$spokenWikipedia->articles[] = $article;
		}
		return $spokenWikipedia;
	}

	/**
	 * Saves the data to an XML cache file.
	 * @param SpokenWikipedia $spokenWikipedia the data to save
	 * @param string $filePath the path to the XML file
	 */
	public static function saveToXml($spokenWikipedia, $filePath){
		//create the XML document
		$xml = simplexml_load_string('<spokenWikipedia/>');
		foreach ($spokenWikipedia->articles as $article){
			$node = $xml->addChild('article');
			$node->addAttribute('url', $article->url);
			$node->addAttribute('title', $article->title);
			$node->addAttribute('featured', $article->featured ? 'true' : 'false');
			foreach ($article->dates as $date){
				//only the date portion is significant, so the time of day is not saved
				$node->addChild('date', $date->format('Y-m-d'));
			}
		}

		//save the XML to disk
		$xml->asXML($filePath);
	}
}
The unit test class for the SpokenWikipediaMarshaller class. It uses a copy of my Wikipedia user page, which I saved to a file. The test uses this saved file instead of loading the page from Wikipedia because I don't want my unit test to fail if I change my Wikipedia user page (or if my Internet connection is down). I also test the caching functionality by saving the cache file, loading the cache file, and then comparing the data I saved with the data I loaded to make sure everything is the same.
<?php
namespace spokenWikipedia;
use \DateTime;
/**
* Tests the SpokenWikipediaMarshaller class.
* @author mangst
*/
class SpokenWikipediaMarshallerTest extends \PHPUnit_Framework_TestCase{
	/**
	 * Tests the screen-scraping against a saved copy of the Wikipedia user page.
	 */
	public function testLoadFromWebsite(){
		$result = SpokenWikipediaMarshaller::loadFromWebsite(__DIR__ . '/sample.html');
		$articles = $result->articles;

		//each entry: url, title, featured flag, update dates (ascending)
		$expected = array(
			//three dates
			//example: <td><del>4/6/2009</del> <p><del>1/24/2010</del></p> <p>9/26/2010</p> </td>"
			array('http://upload.wikimedia.org/wikipedia/commons/1/1e/HTTP_cookie.ogg', 'HTTP cookie', false, array('2009-04-06', '2010-01-24', '2010-09-26')),
			//two dates
			//example: "<td><del>1/3/2010</del> <p>10/18/2010</p> </td>"
			array('http://upload.wikimedia.org/wikipedia/commons/5/52/Iapetus_%28moon%29.ogg', 'Iapetus (moon)', false, array('2010-01-03', '2010-10-18')),
			//featured article
			array('http://upload.wikimedia.org/wikipedia/commons/c/c6/Saturn.ogg', 'Saturn', true, array('2010-02-10', '2010-10-10')),
			//one date
			//example: "<td>1/31/2009</td>"
			array('http://upload.wikimedia.org/wikipedia/commons/0/09/Battle_of_the_Somme.ogg', 'Battle of the Somme', false, array('2009-01-31')),
			//escaped character in the title
			array('http://upload.wikimedia.org/wikipedia/commons/f/fa/ToeJam_and_Earl.ogg', 'ToeJam & Earl', false, array('2009-11-14')),
		);

		$this->assertEquals(count($expected), count($articles));
		foreach ($expected as $index => $row){
			list($url, $title, $featured, $dates) = $row;
			$article = $articles[$index];
			$this->assertEquals($url, $article->url);
			$this->assertEquals($title, $article->title);
			$this->assertEquals($featured, $article->featured);
			$this->assertEquals(count($dates), count($article->dates));
			foreach ($dates as $j => $dateStr){
				$this->assertEquals(strtotime($dateStr), $article->dates[$j]->getTimestamp());
			}
		}
	}

	/**
	 * Saves an object to a file, then loads it back from the file and verifies
	 * that the round trip preserves all the data.
	 */
	public function testSaveAndLoadFromXml(){
		$orig = new SpokenWikipedia();
		$orig->articles[] = $this->buildArticle('Zero Dates', 'http://en.wikipedia.org/wiki/Zero_Dates', false, array());
		$orig->articles[] = $this->buildArticle('One Date', 'http://en.wikipedia.org/wiki/One_Date', false, array('2009-01-31'));
		$orig->articles[] = $this->buildArticle('Two Dates', 'http://en.wikipedia.org/wiki/Two_Dates', false, array('2010-01-03', '2010-10-18'));
		$orig->articles[] = $this->buildArticle('Featured', 'http://en.wikipedia.org/wiki/Featured', true, array());
		$orig->articles[] = $this->buildArticle('Escaped & Character', 'http://en.wikipedia.org/wiki/Escaped_%26_Character', false, array());

		$file = tempnam(sys_get_temp_dir(), 'mangst');
		SpokenWikipediaMarshaller::saveToXml($orig, $file);
		$loaded = SpokenWikipediaMarshaller::loadFromXml($file);
		unlink($file);

		$this->assertEquals(count($orig->articles), count($loaded->articles));
		foreach ($orig->articles as $i => $origArticle){
			$loadedArticle = $loaded->articles[$i];
			$this->assertEquals($origArticle->title, $loadedArticle->title);
			$this->assertEquals($origArticle->url, $loadedArticle->url);
			$this->assertEquals($origArticle->featured, $loadedArticle->featured);
			$this->assertEquals(count($origArticle->dates), count($loadedArticle->dates));
			foreach ($origArticle->dates as $j => $origDate){
				$this->assertEquals($origDate->getTimestamp(), $loadedArticle->dates[$j]->getTimestamp());
			}
		}
	}

	/**
	 * Builds an Article object for testing.
	 * @param string $title the article title
	 * @param string $url the article URL
	 * @param boolean $featured whether the article is featured
	 * @param array(string) $dates the update dates, as date strings
	 * @return Article the article
	 */
	private function buildArticle($title, $url, $featured, array $dates){
		$article = new Article();
		$article->title = $title;
		$article->url = $url;
		$article->featured = $featured;
		foreach ($dates as $date){
			$article->dates[] = new DateTime($date);
		}
		return $article;
	}
}
This is a DTO (data transfer object) that contains the list of Articles from my Wikipedia user page. This class doesn't do much — I created it in case I need to scrape more information from my user page at some point in the future.
<?php
namespace spokenWikipedia;
/**
* Bean class that holds the scraped data.
* @author Michael Angstadt
*/
class SpokenWikipedia{
/**
* Contains all the articles that were scraped from the Wikipedia user page.
* @var array(Article)
*/
public $articles = array();
} This DAO (data access object) pulls data from the cache or refreshes the cache if the cache is stale.
<?php
namespace spokenWikipedia;
/**
* Makes use of a cache to save the scraped data to instead of scraping the website every time data is requested.
* Refreshes the cache every so often by re-scraping the page.
* @author Michael Angstadt
*/
class SpokenWikipediaCacheDao extends SpokenWikipediaNoCacheDao{
	/**
	 * The path to the cache file.
	 * @var string
	 */
	private $cachePath;

	/**
	 * The number of seconds old the cache file is allowed to be before it is refreshed.
	 * @var integer
	 */
	private $refreshRate;

	/**
	 * Constructor.
	 * @param string $url the URL of the website
	 * @param string $cachePath the path to the cache file
	 * @param integer $refreshRate the number of seconds old the cache file is allowed to be before it is refreshed
	 */
	public function __construct($url, $cachePath, $refreshRate){
		parent::__construct($url);
		$this->cachePath = $cachePath;
		$this->refreshRate = $refreshRate;
	}

	/**
	 * Gets the list of all recorded Wikipedia articles in no particular order.
	 * Loads the data from the cache file if the file exists and is fresh; otherwise,
	 * re-scrapes the website and rebuilds the cache.
	 * (Overrides SpokenWikipediaNoCacheDao::getArticles.)
	 * @return array(Article) the articles
	 */
	public function getArticles(){
		if ($this->isCacheFresh()){
			$spokenWikipedia = SpokenWikipediaMarshaller::loadFromXml($this->cachePath);
		} else {
			//(re)build the cache, in case any changes were made to the website
			$spokenWikipedia = SpokenWikipediaMarshaller::loadFromWebsite($this->url);
			SpokenWikipediaMarshaller::saveToXml($spokenWikipedia, $this->cachePath);
		}
		return $spokenWikipedia->articles;
	}

	/**
	 * Determines whether the cache file exists and was modified recently enough to be used.
	 * @return boolean true if the cached data can be used, false if the cache must be (re)built
	 */
	private function isCacheFresh(){
		if (!file_exists($this->cachePath)){
			return false;
		}
		//determine how long ago the file was last modified
		$age = time() - filemtime($this->cachePath);
		return $age <= $this->refreshRate;
	}
}
This DAO (data access object) always scrapes its data from the Wikipedia page and never uses a cache.
<?php
namespace spokenWikipedia;
/**
* Screen-scrapes the Wikipedia page every time data is requested.
* @author Michael Angstadt
*/
class SpokenWikipediaNoCacheDao implements SpokenWikipediaDao{
/**
* The URL of the Wikipedia page.
* @var string
*/
protected $url;
/**
* Constructor.
* @param string $url the URL of the webpage to load
*/
public function __construct($url){
$this->url = $url;
//Wikipedia requires that all requests contain a User-Agent header, or else a HTTP 403 response will be returned
//see http://meta.wikimedia.org/wiki/User-Agent_policy
//NOTE(review): this changes a process-wide ini setting as a constructor side effect,
//so it affects every subsequent HTTP request made through PHP's stream functions, not just this object's
ini_set('user_agent', 'Michael Angstadt\'s Cool Screen-Scraper (+http://www.mangst.com)');
}
/**
* Gets the list of all recorded Wikipedia articles in no particular order.
* The webpage is re-scraped on every call (no caching).
* @return array(Article) the articles
*/
public function getArticles(){
return SpokenWikipediaMarshaller::loadFromWebsite($this->url)->articles;
}
/**
* Gets the list of all recorded Wikipedia articles, sorted by each article's most recent
* update date (newest first).  Articles with no dates are placed at the end.
* NOTE(review): assumes each article's "dates" array is in ascending order (the marshaller
* sorts it that way), so the last element is the latest update--confirm if the data can
* come from another source.
* @return array(Article) the articles
*/
public function getArticlesSortedByLatestUpdated(){
$articles = $this->getArticles();
usort($articles, function ($a, $b){
//articles with no dates go at the end
if (count($a->dates) == 0){
return 1;
}
if (count($b->dates) == 0){
return -1;
}
//compare each article's latest (last) date, in descending order
$dateA = $a->dates[count($a->dates)-1];
$dateB = $b->dates[count($b->dates)-1];
return $dateB->getTimestamp() - $dateA->getTimestamp();
});
return $articles;
}
} This interface defines the methods that each DAO implementation must implement. By having an interface like this, I can create additional DAO implementations that get their data from differences sources. For example, I could create a database DAO that pulls my Wikipedia information from a database. Or, I could create a mock implementation for testing purposes that just returns hard-coded data.
<?php
namespace spokenWikipedia;
/**
* Defines the interface for screen-scraping my Wikipedia user page.
* @author Michael Angstadt
*/
interface SpokenWikipediaDao{
	/**
	 * Gets every recorded Wikipedia article, in no particular order.
	 * @return array(Article) the articles
	 */
	public function getArticles();

	/**
	 * Gets every recorded Wikipedia article, ordered by the last time it was updated.
	 * @return array(Article) the articles
	 */
	public function getArticlesSortedByLatestUpdated();
}