XML News from Tuesday, January 26, 2010

I've more or less completed the script that converts the old news into Atom entry documents:

xquery version "1.0";

declare namespace xmldb="http://exist-db.org/xquery/xmldb";
declare namespace html="http://www.w3.org/1999/xhtml";
declare namespace xs="http://www.w3.org/2001/XMLSchema";
declare namespace atom="http://www.w3.org/2005/Atom";
declare namespace text="http://exist-db.org/xquery/text";

(: Renders a number as a string, prefixing a single "0" when the number
   is below 10 (7 becomes "07", 12 stays "12"). :)
declare function local:leading-zero($n as xs:decimal) as xs:string {
    if ($n lt 10)
    then concat("0", string($n))
    else string($n)
};

(: Turns a heading of the form "Weekday, Month D, YYYY", optionally followed
   by parenthesised text, into a timestamp string "YYYY-MM-DDT07:00:31-05:00".
   The time-of-day and offset are a fixed placeholder, not derived from the
   input. :)
declare function local:parse-date($date as xs:string) as xs:string {
    let $month-names := ("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")

    (: Everything after the weekday, e.g. "January 26, 2010". :)
    let $after-weekday := normalize-space(substring-after($date, ","))
    let $month-and-day := normalize-space(substring-before($after-weekday, ","))
    let $year-part := normalize-space(substring-after($after-weekday, ","))

    (: Drop anything from the first "(" onward (the permalink text). :)
    let $year :=
        if (not(contains($year-part, "(")))
        then $year-part
        else normalize-space(substring-before($year-part, "("))

    let $month-name := substring-before($month-and-day, " ")
    let $dd := local:leading-zero(xs:decimal(substring-after($month-and-day, " ")))
    (: The 1-based position of the month name in the list is its month number. :)
    let $mm := local:leading-zero(index-of($month-names, $month-name))

    return concat($year, "-", $mm, "-", $dd, "T07:00:31-05:00")
};


(: Returns the first sentence of $text, used as the entry title.

   The text is whitespace-normalized and cut at the first ". ". If the word
   just before that period is a single character preceded by a space (for
   example the middle initial in "Elliotte R. Harold"), the period is treated
   as part of a name and the search continues in the remainder of the text.
   If the text contains no ". " at all, the whole normalized text is
   returned. :)
declare function local:first-sentence($text as xs:string) as xs:string {
    let $normalized := normalize-space($text)
    return
        if (not(contains($normalized, '. ')))
        then $normalized
        else
            let $head := substring-before($normalized, '. ')
            let $tail := substring-after($normalized, '. ')
            (: Second-to-last character of $head; a space here means the
               last "word" is a single character, i.e. probably an initial. :)
            let $penultimate := substring($head, string-length($head) - 1, 1)
            return
                if ($penultimate != " ")
                then concat($head, ".")
                else concat($head, ". ", local:first-sentence($tail))
};

(: Builds an identifier of the form "Month_DD_YYYY_position" from a heading
   such as "Tuesday, January 26, 2010" (optionally followed by parenthesised
   text) and the item's position; e.g. position 2 on that date gives
   "January_26_2010_2". The day of the month is zero-padded to two digits. :)
declare function local:make-id($date as xs:string, $position as xs:integer) as xs:string {
    (: Everything after the weekday, e.g. "January 26, 2010". :)
    let $string-date := normalize-space(substring-after($date, ","))
    let $y1 := normalize-space(substring-after($string-date, ","))
    (: strip permalink :)
    let $year := if (contains($y1, "("))
                 then normalize-space(substring-before($y1, "("))
                 else $y1
    let $month-day := normalize-space(substring-before($string-date, ","))
    let $month := substring-before($month-day, " ")
    let $day-of-month := local:leading-zero(xs:decimal(substring-after($month-day, " ")))
    return concat($month, "_", $day-of-month, "_", $year, "_", $position)
};


(: Produces the date fragment used in archive file names: the year, the
   month name and the day of the month run together with no separators and
   no zero padding, e.g. "Tuesday, January 26, 2010" gives "2010January26".
   Parenthesised text after the year is ignored. :)
declare function local:permalink-date($date as xs:string) as xs:string {
    (: Everything after the weekday, e.g. "January 26, 2010". :)
    let $after-weekday := normalize-space(substring-after($date, ","))
    let $month-and-day := normalize-space(substring-before($after-weekday, ","))
    let $year-part := normalize-space(substring-after($after-weekday, ","))
    (: Drop anything from the first "(" onward (the permalink text). :)
    let $year :=
        if (not(contains($year-part, "(")))
        then $year-part
        else normalize-space(substring-before($year-part, "("))
    return concat(
        $year,
        substring-before($month-and-day, " "),
        (: Casting through xs:decimal validates the day as a number. :)
        xs:decimal(substring-after($month-and-day, " "))
    )
};

(: Main query: for each yearly news file from 1998 to 2009, walk every
   dt/dd pair in the page's definition list and emit one atom:entry per
   div inside the dd. The dt text is the date heading for all items in the
   following dd. :)
for $newsyear in (1998 to 2009)
for $dt in doc(concat("file:///Users/elharo/cafe%20con%20Leche/news", $newsyear, ".html"))/html:html/html:body/html:dl/html:dt
let $dd := $dt/following-sibling::html:dd[1]
let $date := string($dt)
let $itemstoday := count($dd/html:div)
for $item at $count in $dd/html:div
(: Number items in reverse document order: the last div of the day is 1. :)
let $sequence := $itemstoday - $count + 1
(: Prefer the item's own id attribute; otherwise synthesize one. :)
let $id := if ($item/@id)
           then string($item/@id)
           else local:make-id($date, $sequence)
(: An item's class attribute, when present, is used verbatim as its
   timestamp; otherwise the timestamp is derived from the date heading. :)
let $published := if ($item/@class)
                  then string($item/@class)
                  else local:parse-date($date)
let $link := concat("http://www.cafeconleche.org/#", $id)
let $archive-page := concat("http://www.cafeconleche.org/oldnews/news", local:permalink-date($date), ".html")
(: Point at the item's anchor within the archive page only when the item
   carries its own id. :)
let $permalink := if ($item/@id)
                  then concat($archive-page, "#", $item/@id)
                  else $archive-page
return
    <atom:entry xml:id="{$id}">
        <atom:author>
         <atom:name>Elliotte Rusty Harold</atom:name>
         <atom:uri>http://www.elharo.com/</atom:uri>
       </atom:author>
       <atom:id>{$link}</atom:id>
       <atom:title>{local:first-sentence(string($item))}</atom:title>
       <atom:updated>{$published}</atom:updated>
       <!-- type="xhtml" content must be a single XHTML div (RFC 4287, 4.1.3.3);
            the default namespace declared here puts the wrapper div in XHTML. -->
       <atom:content type="xhtml" xml:lang="en" 
           xml:base="http://www.cafeconleche.org/"
           xmlns="http://www.w3.org/1999/xhtml"><div>{$item/node()}</div></atom:content>
       <!-- Links carry the atom prefix: an unprefixed element here would be in
            no namespace, because the XHTML default above is scoped to atom:content. -->
       <atom:link rel="alternate" href="{$link}"/>
       <atom:link rel="permalink" href="{$permalink}"/>
    </atom:entry>

I should probably figure out how to remove some of the duplicate date parsing code, but it's basically a one-off migration script so I may not bother.

I think I have enough in place now that I can start setting up the templates for the main index.html page and the quote and news archives. Then I can start exploring the authoring half of the equation.