XML News from Tuesday, January 19, 2010

Today I went from merely splitting the quotes files apart into individual quotes to actually storing them back into the database:

xquery version "1.0";
(:
   Splits /db/quoteshtml/quotes2009.html into one <quote> document per
   dt/dd pair and stores each one in the "quotes" collection under the
   name "quote_" + the dt's id attribute. Returns one <store-result>
   element per stored quote.

   NOTE: XQuery comments are only placed outside the direct element
   constructors below; inside element content they would be literal text.
:)
declare namespace xmldb="http://exist-db.org/xquery/xmldb";
declare namespace html="http://www.w3.org/1999/xhtml";

for $dt in doc("/db/quoteshtml/quotes2009.html")/html:html/html:body/html:dl/html:dt
let $id := string($dt/@id)
(: The dt's text is the date the quote was posted. :)
let $date := string($dt)
(: The dd immediately following the dt holds the quote itself. :)
let $dd := $dt/following-sibling::html:dd[1]
let $quote := $dd/html:blockquote
let $cite := string($quote/@cite)
(: Whatever follows the blockquote is the attribution; its text after "--"
   is what gets picked apart below. :)
let $source := $quote/following-sibling::*
let $sourcetext := normalize-space(substring-after($source, "--"))
(: Author is the text before "Read the" if present, otherwise before "on the".
   Split on the same marker that was tested for (so a name containing "Read"
   is not cut short) and trim the trailing space substring-before leaves. :)
let $author := normalize-space(
                  if (contains($sourcetext, "Read the"))
                  then substring-before($sourcetext, "Read the")
                  else substring-before($sourcetext, "on the"))
(: Location is the link inside the attribution when there is one; otherwise
   the text after "on the", trimmed of the leading space substring-after leaves. :)
let $location := if ($source/html:a)
               then $source/html:a
               else normalize-space(substring-after($sourcetext, "on the"))
(: Anything after "list," is taken to be the date of the quoted message. :)
let $quotedate := if (contains($sourcetext, "list,"))
               then  normalize-space(substring-after($sourcetext, "list,"))
               else ""
(: When the location still carries the "list," suffix, keep only the part
   between "on the" and the first comma. :)
let $justlocation := if (contains($location, "list,"))
               then  normalize-space(substring-after(substring-before($sourcetext, ","), "on the"))
               else $location
let $singlequote := <quote>
   <id>{$id}</id>
   <postdate>{$date}</postdate>
   <content>{$quote}</content>
   <cite>{$cite}</cite>
   <author>{$author}</author>
   <location>{$justlocation}</location>
   {
     (: Emit <quotedate> only when a date was found; the empty sequence adds nothing. :)
     if ($quotedate) 
     then <quotedate>{$quotedate}</quotedate>
     else ()
   }
</quote>

let $name := concat("quote_", $id)

(: Store the new document; the value returned by xmldb:store is reported below. :)
let $store-return := xmldb:store("quotes", $name, $singlequote)

return
<store-result>
   <store>{$store-return}</store>
   <documentname>{$name}</documentname>
</store-result>

I suspect the next thing I should do is work on improving the dates somewhat since I'll likely want to sort and query by them. Right now they're human readable but not so easy to process. E.g.

<postdate>Monday, April 27, 2009</postdate>

I should try to turn this into

<postdate>
  <day>Monday</day> 
  <date>2009-04-27</date>
</postdate>

Time to read up on the XQuery date and time functions. Hmm, looks like it's going to be regular expressions after all.