I've more or less completed the script that converts the old news into Atom entry documents:
xquery version "1.0";
declare namespace xmldb="http://exist-db.org/xquery/xmldb";
declare namespace html="http://www.w3.org/1999/xhtml";
declare namespace xs="http://www.w3.org/2001/XMLSchema";
declare namespace atom="http://www.w3.org/2005/Atom";
declare namespace text="http://exist-db.org/xquery/text";
(:
 : Formats a number as a string, prefixing a single "0" when the value is
 : below 10 so that one-digit days and months come out two characters wide.
 :
 : $n       the number to format
 : returns  string($n), preceded by "0" if $n is less than 10
 :)
declare function local:leading-zero($n as xs:decimal) as xs:string {
  if ($n lt 10)
  then concat("0", string($n))
  else string($n)
};
(:
 : Converts a date heading into a timestamp string of the form
 : YYYY-MM-DDT07:00:31-05:00.
 :
 : The parsing assumes the heading looks like
 : "weekday, Month day, year", optionally followed by a parenthesized suffix,
 : for example "Friday, March 13, 2009 (Permalink)". The weekday before the
 : first comma is ignored.
 :
 : $date    the heading text to parse
 : returns  the timestamp string; the time-of-day and UTC offset are fixed
 :          placeholders, not values read from the heading
 :
 : A month name that is not one of the twelve English names makes index-of
 : return the empty sequence, which local:leading-zero rejects with a type
 : error.
 :)
declare function local:parse-date($date as xs:string) as xs:string {
  (: Everything after the first comma, i.e. the heading without the weekday. :)
  let $string-date := normalize-space(substring-after($date, ","))
  let $y1 := normalize-space(substring-after($string-date, ","))
  (: strip permalink :)
  let $year := if (contains($y1, "("))
               then normalize-space(substring-before($y1, "("))
               else $y1
  let $month-day := normalize-space(substring-before($string-date, ","))
  let $months := ("January", "February", "March", "April", "May", "June",
                  "July", "August", "September", "October", "November", "December")
  let $month := substring-before($month-day, " ")
  let $day-of-month := local:leading-zero(xs:decimal(substring-after($month-day, " ")))
  (: Position of the month name in $months is its 1-based month number. :)
  let $monthnum := local:leading-zero(index-of($months, $month))
  (: I don't necessarily know the time so I'll pick something vaguely plausible. :)
  return concat($year, "-", $monthnum, "-", $day-of-month, "T07:00:31-05:00")
};
(:
 : Extracts the first sentence of a block of text, for use as an entry title.
 :
 : Whitespace is normalized first, so a period followed by a line break counts
 : as a sentence end just like a period followed by a space. A sentence is
 : taken to end at the first ". " unless the period follows a single
 : character preceded by a space (an initial such as the "R." in
 : "Elliotte R. Harold"); in that case scanning continues with the text after
 : that period.
 :
 : $text    the text to scan
 : returns  the first sentence including its closing period, or the whole
 :          normalized text when it contains no ". " at all
 :)
declare function local:first-sentence($text as xs:string) as xs:string {
  let $normalized := normalize-space($text)
  return
    if (not(contains($normalized, ". ")))
    then $normalized
    else
      let $head := substring-before($normalized, ". ")
      let $rest := substring-after($normalized, ". ")
      (: The character before the last one; a space here means $head ends
         in a lone letter, which is treated as an initial, not a sentence end. :)
      let $penultimate := substring($head, string-length($head) - 1, 1)
      return
        if ($penultimate = " ")
        then concat($head, ". ", local:first-sentence($rest))
        else concat($head, ".")
};
(:
 : Builds an identifier of the form Month_DD_YYYY_N for a news item that has
 : no id of its own.
 :
 : The parsing assumes the heading looks like
 : "weekday, Month day, year", optionally followed by a parenthesized suffix.
 : The month name is used verbatim and the day is zero-padded to two digits.
 :
 : $date      the date heading the item appears under
 : $position  a number distinguishing this item from others under the same
 :            heading; appended as the last component
 : returns    the identifier string, e.g. "March_13_2009_2"
 :)
declare function local:make-id($date as xs:string, $position as xs:integer) as xs:string {
  (: Everything after the first comma, i.e. the heading without the weekday. :)
  let $string-date := normalize-space(substring-after($date, ","))
  let $y1 := normalize-space(substring-after($string-date, ","))
  (: strip permalink :)
  let $year := if (contains($y1, "("))
               then normalize-space(substring-before($y1, "("))
               else $y1
  let $month-day := normalize-space(substring-before($string-date, ","))
  let $month := substring-before($month-day, " ")
  let $day-of-month := local:leading-zero(xs:decimal(substring-after($month-day, " ")))
  return concat($month, "_", $day-of-month, "_", $year, "_", $position)
};
(:
 : Builds the date fragment used in archive page file names by concatenating
 : year, month name and day with no separators, e.g. "2009March13".
 :
 : The parsing assumes the heading looks like
 : "weekday, Month day, year", optionally followed by a parenthesized suffix.
 : The day is passed through xs:decimal, so it is emitted without any
 : leading zero.
 :
 : $date    the date heading to convert
 : returns  the concatenated year, month name and day
 :)
declare function local:permalink-date($date as xs:string) as xs:string {
  (: Everything after the first comma, i.e. the heading without the weekday. :)
  let $string-date := normalize-space(substring-after($date, ","))
  let $y1 := normalize-space(substring-after($string-date, ","))
  (: strip permalink :)
  let $year := if (contains($y1, "("))
               then normalize-space(substring-before($y1, "("))
               else $y1
  let $month-day := normalize-space(substring-before($string-date, ","))
  let $month := substring-before($month-day, " ")
  let $day-of-month := xs:decimal(substring-after($month-day, " "))
  return concat($year, $month, $day-of-month)
};
(:
 : Main query: walks the yearly news pages for 1998 through 2009 and emits
 : one atom:entry per news item.
 :
 : Each page is expected to hold a definition list in which every dt carries
 : a date heading and the dd that follows it carries one div per news item.
 : Items are numbered in reverse document order within a day, so the last
 : div under a heading gets sequence number 1.
 :
 : An item's own id attribute is used when present; otherwise an id is
 : generated from the date heading and the sequence number.
 :
 : NOTE(review): the timestamp is taken from the item's class attribute when
 : one is present; presumably the source pages store a timestamp there.
 : Confirm against the input files.
 :
 : NOTE(review): rel="permalink" is kept as written, but it is not one of the
 : link relations defined by RFC 4287, which expects other relations to be
 : IRIs. Confirm whether downstream templates rely on this value.
 :)
for $newsyear in (1998 to 2009)
return
  for $dt in doc(concat("file:///Users/elharo/cafe%20con%20Leche/news", $newsyear ,".html"))/html:html/html:body/html:dl/html:dt
  let $dd := $dt/following-sibling::html:dd[1]
  let $date := string($dt)
  let $itemstoday := count($dd/html:div)
  return
    for $item at $count in $dd/html:div
    let $sequence := $itemstoday - $count + 1
    let $id := if ($item/@id)
               then string($item/@id)
               else local:make-id($date, $sequence)
    let $published := if ($item/@class)
                      then string($item/@class)
                      else local:parse-date($date)
    let $link := concat("http://www.cafeconleche.org/#", $id)
    (: Archive page for the day; the fragment is added only for items with their own id. :)
    let $archive-page := concat("http://www.cafeconleche.org/oldnews/news", local:permalink-date($date), ".html")
    let $permalink := if ($item/@id)
                      then concat($archive-page, "#", $item/@id)
                      else $archive-page
    (: Selected here, outside the constructor below, so the default namespace
       declared on the wrapper div cannot affect this path expression. :)
    let $body := $item/node()
    return
      <atom:entry xml:id="{$id}">
        <atom:author>
          <atom:name>Elliotte Rusty Harold</atom:name>
          <atom:uri>http://www.elharo.com/</atom:uri>
        </atom:author>
        <atom:id>{$link}</atom:id>
        <atom:title>{local:first-sentence(string($item))}</atom:title>
        <atom:updated>{$published}</atom:updated>
        <atom:content type="xhtml" xml:lang="en"
                      xml:base="http://www.cafeconleche.org/">
          <div xmlns="http://www.w3.org/1999/xhtml">{$body}</div>
        </atom:content>
        <atom:link rel="alternate" href="{$link}"/>
        <atom:link rel="permalink" href="{$permalink}"/>
      </atom:entry>
I should probably figure out how to remove some of the duplicate date parsing code, but it's basically a one-off migration script so I may not bother.
I think I have enough in place now that I can start setting up the templates for the main index.html page and the quote and news archives. Then I can start exploring the authoring half of the equation.