XQuery JAXP メモ - gomiwikiの日記

# RSS+ATOMアグリゲータの例：　[Source] [Result]
… RSSフィードとATOMフィードを統合した例です。 ZDNET, MYCOM, Google NEWS(US), Goo Sportsから，実行時点での最新50件のフィードを表示しました。
簡単にRSSアグリゲータのようなアプリケーションを生成できました。Goo Sportsからは阪神の話題だけをピックアップ！

xquery version "1.0" encoding "UTF-8";

declare default element namespace "http://db-www.naist.jp/~makoto-y/";

declare namespace rss = "http://purl.org/rss/1.0/";
declare namespace rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
declare namespace atom = "http://purl.org/atom/ns#";
declare namespace dc = "http://purl.org/dc/elements/1.1/";

declare variable $mycom := doc("http://pcweb.mycom.co.jp/haishin/rss/index.rdf");
declare variable $gnews := doc("http://news.google.co.jp/news?ned=us&output=atom&head=t");
declare variable $zdnet := doc("http://japan.zdnet.com/rss/news/index.rdf");
declare variable $goo_sports := doc("http://news.goo.ne.jp/news/rss/topics/sports/npb/index.rdf");

(: Lists the items of an RSS 1.0 (RDF) feed document.
   For every rss:item under the rdf:RDF root of $site it emits the text of
   the item's rss:title, the text of its dc:date (labelled "last modifiled:")
   and the text of the feed's rss:channel/rss:title (labelled "source:").
   $site is the feed document to read; it is navigated from rdf:RDF downwards.
   NOTE(review): the content after `return` is not wrapped in an element
   constructor in this copy, so the markup tags appear to have been stripped.
   Code elsewhere in the file reads `modified` children and an `rdf:about`
   attribute from the returned items; confirm against the original query.
   NOTE(review): "modifiled" looks like a typo for "modified". :)
declare function local:list-rss($site)
{
for $item in $site/rdf:RDF/rss:item
return

{ $item/rss:title/text() }
last modifiled: { $item/dc:date/text() }
source: { $site/rdf:RDF/rss:channel/rss:title/text() }

};

(: Lists the entries of an Atom feed document (namespace http://purl.org/atom/ns#).
   For every atom:entry under the atom:feed root of $site it emits the text of
   the entry's atom:title, the text of its atom:modified (labelled
   "last modifiled:") and the text of the feed's own atom:title (labelled "source:").
   $site is the feed document to read; it is navigated from atom:feed downwards.
   NOTE(review): the content after `return` is not wrapped in an element
   constructor in this copy, so the markup tags appear to have been stripped;
   confirm against the original query.
   NOTE(review): "modifiled" looks like a typo for "modified". :)
declare function local:list-atom($site)
{
for $entry in $site/atom:feed/atom:entry
return

{ $entry/atom:title/text() }
last modifiled: { $entry/atom:modified/text() }
source: { $site/atom:feed/atom:title/text() }

};

(: Filter that picks out only the Hanshin (Tigers) items: a member of $item
   is kept when its rdf:about attribute contains the string "tigers".
   Members without that attribute are dropped; input order is preserved. :)
declare function local:hanshin_filter($item)
{
  $item[fn:contains(@rdf:about, "tigers")]
};

(: Collects the listed entries of every configured feed into one sequence:
   the two RSS feeds ($mycom, $zdnet) first, then the Atom feed ($gnews),
   then the $goo_sports RSS items that pass local:hanshin_filter. :)
declare function local:resources()
{
  let $mycom_items := local:list-rss($mycom)
  let $zdnet_items := local:list-rss($zdnet)
  let $gnews_items := local:list-atom($gnews)
  let $tigers_items := local:hanshin_filter(local:list-rss($goo_sports))
  return ($mycom_items, $zdnet_items, $gnews_items, $tigers_items)
};

{
(: Main query: merge every feed entry from local:resources(), sort the
   entries newest-first on their `modified` child (entries without one sort
   last because of `empty least` under a descending order), and emit the
   content of the first 50.
   The sort lives in an inner FLWOR on purpose: a positional variable bound
   with `at` numbers the tuples before `order by` is applied, so $pos must be
   taken over the already-sorted sequence.
   The cut-off is `<= 50` so that exactly fifty entries are emitted; the
   previous `< 50` stopped at forty-nine.
   NOTE(review): the bare brace pairs look like the content of element
   constructors whose tags are missing from this copy; confirm against the
   original query. :)
for $e at $pos in (for $i in local:resources()
order by $i/modified descending empty least
return { $i/node() } )
where $pos <= 50
return $e/node()
}

# SigmodRecordから興味のある記事をピックアップ：　[Source] [Result]
… ACM SigmodRecordの過去十年分の記事から目的のキーワードを含む記事を抜き出します。
動的に読み出すデータソースが決まるのがミソです。pre-processingによる高速化が難しい例です。

(: A program to extract articles related to declared keywords from SigmodRecord :)

(: Base location of the SigmodRecord XML files; the index and the issue pages are addressed relative to it. :)
declare variable $baseuri := "http://www.sigmod.org/record/xml/";
(: URI of the index document: $baseuri followed by "index.xml". :)
declare variable $target := fn:concat($baseuri, "index.xml");
(: The index document itself, loaded from $target. :)
declare variable $sigmod := fn:doc($target);
(: Year component of the current date. :)
declare variable $this_year := fn:year-from-date(fn:current-date());
(: Ten years before $this_year; the query keeps issues whose year is greater than this value. :)
declare variable $year_after := $this_year - 10;
(: A title must contain every one of these keywords. Matching is done against the
   lower-cased title, so keywords are written in lower case. :)
declare variable $every_keys := ("xml");
(: A title must contain at least one of these keywords. :)
declare variable $some_keys := ("integration", "view");
(: A title containing any of these keywords is rejected; empty here, so nothing is excluded. :)
declare variable $except_keys := ();

Extract research paper information from SigmodRecord.

Published { (: the issue filter uses this value as an exclusive lower bound :) $year_after } - { $this_year } .

Choosing keywords:

Every - {{{ $every_keys }}}, Some - {{{ $some_keys }}}, Except - {{{ $except_keys }}}

{ (: For every issue listed in the index ($sigmod) whose toIssue/year is greater than
     $year_after, load the issue page it points to and walk its articles. An article is
     selected when its lower-cased title contains every keyword in $every_keys, at least
     one keyword in $some_keys and none of the keywords in $except_keys.
     NOTE(review): the constructor that followed the final `return` is missing from this
     copy (tags stripped); confirm against the original query.
     NOTE(review): attribute nodes have no text() children in the XPath data model, so
     `@href/text()` likely yields the empty sequence, and $baseuri already ends in "/",
     so the extra "/" doubles the slash. Confirm how the issue URI is meant to be built. :)
for $issue in $sigmod/HomePage/volumes/issuesByYear/issues/issue let $ref := fn:doc(fn:concat($baseuri, "/", $issue/toIssue/@href/text()))/OrdinaryIssuePage where $issue/toIssue/year > $year_after return for $article in $ref/sections/section/articles/article let $title := $article/title, $t := fn:lower-case($title) where (every $e in $every_keys satisfies fn:contains($t, $e)) and (some $s in $some_keys satisfies fn:contains($t, $s)) and (every $n in $except_keys satisfies fn:not(fn:contains($t, $n))) return }

title	year	authors	sources
{ $article/title/text() }	{ $ref/year/text() }	{ for $a at $pos in $article/authors/author/text() return if($pos = 1) then $a else (", ", $a) }	{ for $src at $pos in $article/availableSources/toAvailableSource return if($pos = 1) then { $src/format/text() } else (", ", { $src/format/text() } ) }

(: Stylus Studio meta-information - (c) 2004-2005. Progress Software Corporation. All rights reserved.

:)

# Table of XQuery Lexical States：　[Source] [Result]
… Scott Boag氏の素晴らしい仕事を加工してます。
Lexical StateとPatternでまとめてます。XQueryのLexer書くために書きました。

declare default element namespace "http://www.w3.org/1999/xhtml";
declare namespace eg = "http://db-www.naist.jp/~makoto-y/xquery/eg";

declare variable $spec := fn:doc("http://www.w3.org/TR/xquery-xpath-parsing/");

(: Returns the distinct atomic values of $arg1 that are not equal to any value of $arg2.
   $arg1 holds the candidate values and $arg2 the values to exclude.
   The general comparison `. = $arg2` is true when the context value equals at least one
   member of $arg2, so an empty $arg2 excludes nothing.
   The order of the result is whatever fn:distinct-values produces, which the function
   library leaves implementation-dependent.
   The types are written as xs:anyAtomicType, the name used by the XQuery 1.0
   Recommendation; the earlier xdt: prefix comes from the working drafts and is not
   declared in this query. :)
declare function eg:value-except($arg1 as xs:anyAtomicType*, $arg2 as xs:anyAtomicType*) as xs:anyAtomicType*
{
fn:distinct-values($arg1[not(. = $arg2)])
};

XQuery Lexical States

Target version:
{ $spec/html/body/div/h2[a/@name = "w3c-doctype"]/text() }

Generated Date: { fn:current-date() }

{ (: Builds the rows of the lexical-state table from the parsing document in $spec.
     1. $row: every table row found under the dl/dd/table/tbody of the div that has a
        child element containing an anchor named "XQuery-lexical-states".
     2. $res: one record per comma-separated pattern in the first cell of a row. The
        state name comes from the nearest dt preceding the dd that encloses the table
        (four parent steps up from the cell), with its first four characters and the
        " State" suffix removed. The transition is the normalized string of each tbody
        nested in the following cells.
     3. $resset: keeps a record only when no earlier record has the same pattern and
        transition, and collects the distinct states of the records from that position
        onwards that share them.
     4. $rr: regroups $resset by the text of the first td, in fn:distinct-values order.
     NOTE(review): the constructors after the `return` of steps 2 and 3 have lost their
     tags in this copy. The later paths (state, pattern, transition, td[1]) hint at the
     element names; confirm against the original query. :)
for $doc in $spec let $lex_states := $doc//div[*/a/@name = "XQuery-lexical-states"] let $tables := $lex_states/dl let $row := $tables/dd/table/tbody/tr let $res := (for $ptns in (for $p in $row/td[1] return $p) let $s := $ptns/../../../../preceding-sibling::dt[1]/text() let $state := fn:substring-before(fn:substring($s, 5), " State") let $trans := (for $t in $ptns/following-sibling::td/table/tbody return fn:normalize-space(fn:string($t))) let $p := (for $ptn in fn:tokenize(fn:string($ptns), ",\s+") return { $state }{ fn:normalize-space($ptn) }{ $trans }) return $p) let $resset := (for $r at $pos in $res let $ptn := $r/pattern/text(), $trans := $r/transition/text() (: filter redundant evaluation on for-loop :) let $after_state := fn:subsequence($res, $pos) let $state := fn:distinct-values($after_state[pattern/text() = $ptn and transition/text() = $trans]/state/text()) let $before_state := fn:subsequence($res, 1, $pos - 1) where every $bs in $before_state satisfies not($bs/pattern/text() = $ptn and $bs/transition/text() = $trans) return ) let $gkey := fn:distinct-values($resset/td[1]/text()) let $rr := (for $g in $gkey return $resset[td[1]/text() = $g]) return $rr }

Lexical State	Pattern	Transition To State
{ $state }	{ $ptn }	{ $trans }

SKIP

{
  (: Lists the lexical states left over after removing the ones named in the
     "whitespace is not ignored in these states:" paragraph, plus
     OCCURRENCEINDICATOR, from the full set of state headings. :)
  for $doc in $spec
  let $sections := $doc/html/body/div[@class = "body"]/div[@class = "div1"]/div[@class = "div2"]
  let $note := $sections/div/p[contains(text()[1], "whitespace is not ignored in these states:")]
  (: Text after "states:", with the final ", and" turned into a plain comma, cut at the first full stop. :)
  let $after_label := fn:substring-after(fn:string($note), "states:")
  let $comma_only := fn:replace($after_label, ", and", ",")
  let $name_list := fn:substring-before($comma_only, ".")
  let $listed_states :=
    for $name in fn:tokenize($name_list, ",")
    return fn:normalize-space($name)
  (: Each dt heading loses its first four characters and the " State" suffix. :)
  let $every_state :=
    for $heading in $sections/div[@class = "div3"]/dl/dt/text()
    return fn:normalize-space(fn:substring-before(fn:substring($heading, 5), " State"))
  return eg:value-except($every_state, ($listed_states, "OCCURRENCEINDICATOR"))
}

(: Stylus Studio meta-information - (c) 2004-2005. Progress Software Corporation. All rights reserved.

:)

Scripting the addition of XML files to the eXist XQuery database

Saxon is great for getting to know XQuery syntax (see part one and part two of my "Getting Started with XQuery" articles in XML.com for more on this), but it reads all of the data to query into memory, and much of the point of XQuery is to work with large, indexed, disk-based collections of XML that won't fit into memory. I've started playing with the open-source eXist XML database for this.

After starting up the eXist server, you can start up the interactive client and load files from there, but if the client has any problems loading the files, it doesn't show any error messages that I could find—all I knew was that the file I tried to load wasn't showing up in the client's list of loaded files. If you want to load a lot of files, you don't want an interactive client, anyway; you want to create a script that does it for you. Apparently, the documentation and sample perl/python/java scripts that come with eXist are a bit behind the development of the system itself, so they don't always work. I finally found a simple way to load files using an eXist extension to XQuery, demonstrated by the code below.

(: Load the files temp2a.xml, temp2b.xml, temp2c.xml
from c:\temp into the eXist database. :)

xquery version "1.0";

declare namespace xmldb="http://exist-db.org/xquery/xmldb";

{
(: We'll load each file into the coll1 collection as the administrator. :)
(: The collection is opened once, before the loop. "admin" and "" are presumably the
   user name and an empty password; confirm against the eXist xmldb module docs. :)
let $collection := xmldb:collection("xmldb:exist:///db/coll1", "admin", "")
(: Base names of the files to load, without the ".xml" extension. :)
for $dataFilename in ("temp2a","temp2b","temp2c")
let $name := $dataFilename
(: Source location of the file: file:///c:/temp/ followed by the base name and ".xml". :)
let $URI := xs:anyURI(concat("file:///c:/temp/",$name,".xml"))
(: Hands the collection, the base name and the source URI to xmldb:store and keeps what
   the call returns. Presumably the base name becomes the stored resource name; verify. :)
let $retCode := xmldb:store($collection, $name, $URI)
return

{$retCode}

}

With eXist stored in c:\bin\eXist on a Windows machine and its server up and running, storing the XQuery script above as C:\bin\eXist\webapp\xquery\loadfiles.xq and then sending a browser to http://localhost:8080/exist/xquery/loadfiles.xq ran the query, loaded the files, and displayed the return codes in the browser.

After getting this to work with simple dummy files, I found what was wrong with the file I was originally having problems with: "The document is too complex/irregularily structured to be mapped into eXist's numbering scheme." As a dayjob-related file, I can't describe it in much detail, but this reaction to it didn't surprise me. Still, I have plenty of ideas for eXist apps to build around less complex XML.

For standalone Java(TM) applications to work with the Java Web Services Developer Pack (Java WSDP), the JAXP classes built into the JDK must be overridden. Overriding these classes is not necessary in order to run the Web container or the samples, since the provided scripts set "java.endorsed.dirs".

Set the java.endorsed.dirs system property to:

C:\Sun\jwsdp-1.6\jaxp\lib;C:\Sun\jwsdp-1.6\jaxp\lib\endorsed

Alternatively, create the directory:

<JAVA_HOME>\jre\lib\endorsed

and then copy the files in the following directories to the newly created directory.

C:\Sun\jwsdp-1.6\jaxp\lib
C:\Sun\jwsdp-1.6\jaxp\lib\endorsed

XQuery Lexical States

Target version: { $spec/html/body/div/h2[a/@name = "w3c-doctype"]/text() } Generated Date: { fn:current-date() }

Target version:
{ $spec/html/body/div/h2[a/@name = "w3c-doctype"]/text() }

Generated Date: { fn:current-date() }