C# code that uses the generated extractor:
// Run the generated extractor against the discussion list URL
// (query parameters size=100, page=0) and print every thread with its posts.
var discussion = new CodeplexDiscussion("http://phalanger.codeplex.com/discussions?size=100&page=0");

foreach (var t in discussion.threads) {
    Console.WriteLine("\n{0}:", t.title);

    // The extraction pattern matches some posts twice or more, so consecutive
    // duplicates are skipped. The "previous post" state is reset for every
    // thread: otherwise the first post of a thread would be dropped whenever
    // its date equals the last printed date of the previous thread.
    // Date AND username are compared so that two different users posting at
    // the same displayed time are not mistaken for a duplicate.
    // NOTE(review): html is deliberately not compared - it is unconfirmed that
    // the duplicated matches capture identical html.
    string lastdate = null;
    string lastuser = null;

    foreach (var p in t.posts) {
        if (lastdate == p.date && lastuser == p.username) {
            continue;
        }

        lastdate = p.date;
        lastuser = p.username;

        Console.WriteLine("{0}, {1}: {2}", p.date, p.username, p.html);
    }
}

The .webml file describing the extraction process:
/* CodeplexDiscussion.webml */
// One discussion thread.
//   title - text captured from the thread link (see parseThreads).
//   url   - href captured from the thread link, passed through htmldecode (see parseThreads).
//   posts - posts appended by parseThread after opening `url`.
class Thread {
	string title;
	string url;
	Post[] posts;
}

// One post inside a thread; every field is filled by parseThread from a single pattern match.
//   username - text of the user profile link.
//   userurl  - href of the user profile link (stored as captured, not decoded).
//   date     - text of the "smartDate" span; kept as a plain string, not parsed.
//   html     - content captured from the post body cell.
class Post {
	string username;
	string userurl;
	
	string date;
	string html;
}

// Helper C# methods.
// Returns `str` URL-encoded via System.Web.HttpUtility.UrlEncode.
// The body between @' and ' is C# code.
// NOTE(review): no extraction method visible in this script calls urlencode - confirm it is still needed.
string urlencode(string str) c# @'
	return System.Web.HttpUtility.UrlEncode(str);
'
// Returns `str` with HTML entities decoded via System.Web.HttpUtility.HtmlDecode.
// The body between @' and ' is C# code. Used by parseThreads on the captured thread URL.
string htmldecode(string str) c# @'
	return System.Web.HttpUtility.HtmlDecode(str);
'

// Extraction methods.
// Entry point of the extraction.
//   threads       - collection that parseThreads appends the extracted threads to.
//   discussionUrl - URL that is opened before parseThreads runs.
// parseThreads is invoked inside the [open(discussionUrl)] block.
main(Thread[] threads, string discussionUrl) {
	[open(discussionUrl)]
	{
		parseThreads(threads);
	}
}

// Builds one Thread per match of the "ThreadTitleLink" anchor pattern and appends it to `threads`.
// For every match: the href capture (_threadUrl) is html-decoded into Thread.url, the link text
// (_threadTitle) becomes Thread.title, parseThread is called with the thread URL to fill t.posts,
// and the thread is then appended to `threads`.
// Called by main inside its [open(discussionUrl)] block.
// NOTE(review): the URL is passed through htmldecode but the title is not - confirm whether
// titles can contain HTML entities that should be decoded as well.
parseThreads(Thread[] threads) {
	foreach(xmlmatch(@'
		<a id="ThreadTitleLink" class="ThreadLink" href="~@_threadUrl@~">~@_threadTitle@~</a>
	'))
	{
		Thread t = Thread(url=htmldecode(_threadUrl),title=_threadTitle);
		parseThread(t.url, t.posts);
		threads[] = t;
	}
}

// Opens `url` and appends one Post to `posts` for every match of the "PostPanel" row pattern.
// Captures taken from each match:
//   _userUrl  - href of the "UserProfileLink" anchor  -> Post.userurl
//   _userName - text of the "UserProfileLink" anchor  -> Post.username
//   _postDate - text of the "smartDate" span          -> Post.date
//   _postHtml - content of the post body cell         -> Post.html
// The usage example for the generated extractor notes that this pattern matches some posts
// twice or more, so `posts` may contain duplicates that callers have to skip.
// NOTE(review): _userUrl is stored as captured, while parseThreads html-decodes the thread URL -
// confirm whether the same decoding is wanted here.
parseThread(string url, Post[] posts) {
	[open(url)]
	foreach(xmlmatch(@'
	<tr id="PostPanel" class="Post">
		<td>
			<div class="Details">
				<div class="UserName"><a class="UserProfileLink" href="~@_userUrl@~">~@_userName@~</a></div>
				<div class="SubText"><span class="smartDate">~@_postDate@~</span></div>
			</div>
		</td>
		<td class="discussionListContent "><html><body>~@_postHtml@~</body></html></td>
	</tr>
	'))
	{
		posts[] = Post( username=_userName, userurl=_userUrl, date=_postDate, html=_postHtml );
	}
}

Last edited Jan 5, 2012 at 3:31 PM by jakub, version 1

Comments

No comments yet.