XQuery/获取压缩的 XML 文件
您想处理来自网络的 XML 文档,这些文档包含在一个 zip 文件中。
此脚本使用 eXist 压缩模块中的解压缩函数。该函数使用高阶函数来过滤压缩文件的所需组件并处理每个组件。
解压缩函数有五个输入参数,其中两个是传递给解压缩函数的 XQuery 函数。这两个函数反过来都有参数。
以下是解压缩函数 compression:unzip 的一般形式:
compression:unzip($zip-data as xs:base64Binary, $entry-filter as function, $entry-filter-param as xs:anyType*, $entry-data as function, $entry-data-param as xs:anyType*) as item()*
解压缩所提供数据中的所有资源/文件夹,并通过调用用户定义的函数来决定如何存储每个资源/文件夹。
- $zip-data zip 文件数据
- $entry-filter 用于从 zip 文件中过滤资源的用户定义函数。该函数接受 3 个参数,例如 user:unzip-entry-filter($path as xs:string, $data-type as xs:string, $param as item()*) as xs:boolean。$data-type 的取值为字符串 'resource'(资源)或 'folder'(文件夹)。$param 是一个包含任何其他参数的序列,例如已提取文件的列表。如果函数返回 true(),则表示应处理该条目并将其传递给 entry-data 函数,否则跳过该资源。
- $entry-filter-param 用于过滤函数的包含额外参数的序列。
- $entry-data 用于存储从 zip 文件中提取的资源的用户定义函数。该函数接受 4 个参数,例如 user:unzip-entry-data($path as xs:string, $data-type as xs:string, $data as item()?, $param as item()*)。$data-type 的取值为字符串 'resource'(资源)或 'folder'(文件夹)。$param 是一个包含任何其他参数的序列。
- $entry-data-param 用于存储函数的包含额外参数的序列。
在第一个示例中,我们知道只有一个 XML 文件,并且我们打算在脚本中处理该 XML 文件。后面的示例将存储该文件或这些文件以供以后处理。
declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";

(:~
 : Entry-filter callback for compression:unzip; accepts every entry.
 :)
declare function fw:filter($path as xs:string, $type as xs:string, $param as item()*) as xs:boolean {
    true()
};

(:~
 : Entry-data callback for compression:unzip; hands the entry content
 : straight back to the caller.
 :)
declare function fw:process($path as xs:string, $type as xs:string, $data as item()?, $param as item()*) {
    $data
};

(: The zip location comes from the "uri" request parameter, defaulting to the ISO 3166-1 list. :)
let $source-uri := request:get-parameter("uri", "http://www.iso.org/iso/iso_3166-1_list_en.zip")
(: Body text of the HTTP response, passed to compression:unzip as the zip data. :)
let $archive := httpclient:get(xs:anyURI($source-uri), true(), ())/httpclient:body/text()
let $fw-ns := "http://www.cems.uwe.ac.uk/xmlwiki/fw"
(: Function handles for the two callbacks, looked up by QName and arity. :)
let $accept := util:function(QName($fw-ns, "fw:filter"), 3)
let $extract := util:function(QName($fw-ns, "fw:process"), 4)
return
    compression:unzip($archive, $accept, (), $extract, ())
<ISO_3166-1_List_en xml:lang="en">
<ISO_3166-1_Entry>
<ISO_3166-1_Country_name>AFGHANISTAN</ISO_3166-1_Country_name>
<ISO_3166-1_Alpha-2_Code_element>AF</ISO_3166-1_Alpha-2_Code_element>
</ISO_3166-1_Entry>
<ISO_3166-1_Entry>
<ISO_3166-1_Country_name>ÅLAND ISLANDS</ISO_3166-1_Country_name>
<ISO_3166-1_Alpha-2_Code_element>AX</ISO_3166-1_Alpha-2_Code_element>
</ISO_3166-1_Entry>
...
</ISO_3166-1_List_en>
compression:unzip() 函数为它找到的 zip 存档中的每个组件调用 process 函数。这被称为回调函数。您可以在 process 函数中放置任何有效的 XQuery 代码来执行您对每个输入文件想要执行的操作,例如列出或存储它。
例如,以下 process 函数将列出 zip 文件中的所有条目及其路径和类型;如果条目是 XML 文件,还会列出其根元素的名称。
(:~
 : Entry-data callback that describes one zip entry instead of storing it.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip
 : @param $data  entry content; may be empty
 : @param $param extra parameters (unused here)
 : @return an item element carrying the path and type; its text is the name of
 :         the root element when the content is an XML node, otherwise it is empty
 :)
declare function t:process($path as xs:string, $type as xs:string, $data as item()? , $param as item()*) {
    (: return a list of the items in the zip file. :)
    <item path="{$path}" type="{$type}">{
        (: a path step is only valid on nodes; guard so that non-XML content
           yields an empty item instead of a type error :)
        if ($data instance of node()) then name($data/*) else ()
    }</item>
};
在 Office Open XML 文件上运行此命令将返回以下内容
<item path="[Content_Types].xml" type="resource">Types</item>
<item path="_rels/.rels" type="resource">Relationships</item>
<item path="word/_rels/document.xml.rels" type="resource">Relationships</item>
<item path="word/document.xml" type="resource">w:document</item>
<item path="word/theme/theme1.xml" type="resource">a:theme</item>
<item path="word/settings.xml" type="resource">w:settings</item>
<item path="word/fontTable.xml" type="resource">w:fonts</item>
<item path="word/webSettings.xml" type="resource">w:webSettings</item>
<item path="docProps/app.xml" type="resource">Properties</item>
<item path="docProps/core.xml" type="resource">cp:coreProperties</item>
<item path="word/styles.xml" type="resource">w:styles</item>
您可能希望将解压缩的文档存储在数据库中。我们可以修改 process 函数来执行此操作。我们可以使用 compression:unzip 的第五个参数($entry-data-param,它会作为 process 函数的 $param 参数传入)来传递存储每个文件的集合。此外,我们还需要创建一个集合来保存解压缩后的文件。
declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";
(:~
 : Entry-filter callback for compression:unzip.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip
 : @param $param extra filter parameters (unused)
 : @return always true(), so no entry is skipped
 :)
declare function fw:filter($path as xs:string, $type as xs:string, $param as item()*) as xs:boolean {
    (: nothing is filtered out :)
    true()
};
(:~
 : Entry-data callback for compression:unzip: stores the entry content under
 : its zip path in the collection named by $param/@directory.
 :
 : @param $path  path of the entry inside the zip file; used as the resource name
 : @param $type  entry type as reported by compression:unzip (unused)
 : @param $data  entry content
 : @param $param element whose @directory attribute names the target collection
 : @return whatever xmldb:store returns
 :)
declare function fw:process($path as xs:string,$type as xs:string, $data as item()? , $param as item()*) {
    let $target := $param/@directory
    return
        xmldb:store($target, $path, $data)
};
(: Main query: fetch a zip file over HTTP and store every entry in a database collection. :)
(: Parent collection for the unzip target; the trailing "/" lets the target name be appended directly. :)
let $baseCollection := "/db/apps/zip/data/"
(: Zip location from the "uri" request parameter; defaults to the ISO 3166-1 list. :)
(: NOTE(review): the value is fetched as given, with no validation - confirm this is acceptable. :)
let $uri := request:get-parameter("uri","http://www.iso.org/iso/iso_3166-1_list_en.zip")
(: Name of the target collection from the "dir" request parameter; defaults to "temp". :)
let $unzipCollection := request:get-parameter("dir","temp")
(: Body text of the HTTP response; passed to compression:unzip as the zip data. :)
let $zip := httpclient:get(xs:anyURI($uri), true(), ())/httpclient:body/text()
(: Function handles for the two callbacks, looked up by QName and arity. :)
let $filter := util:function(QName("http://www.cems.uwe.ac.uk/xmlwiki/fw","fw:filter"),3)
let $process := util:function(QName("http://www.cems.uwe.ac.uk/xmlwiki/fw","fw:process"),4)
(: Log in before the collection is created and the entries are stored. :)
(: NOTE(review): hard-coded admin credentials - replace before using outside a tutorial. :)
let $login := xmldb:login("/db","admin","password")
let $fullPath := concat($baseCollection, $unzipCollection)
(: Create the target collection only when it does not exist yet. :)
let $mkdir :=
if (xmldb:collection-available($fullPath)) then ()
else xmldb:create-collection($baseCollection, $unzipCollection)
(: The target collection path reaches fw:process as $param/@directory. :)
let $store := compression:unzip($zip,$filter,(),$process,<param directory="{$fullPath}"/>)
return $store
zip 文件通常包含多个文件。特别是 Microsoft Word 的 .docx 文件和 Excel 的 .xlsx 文件,它们都是 XML 文件的压缩集合,这些 XML 文件共同定义了文档或电子表格。
当文档存储在 eXist 数据库中时,MIME 类型(媒体类型)是使用 mime-types.xml 文件从文件后缀推断出来的。或者,可以在存储文档时显式设置 MIME 类型。
我们在此假设 zip 文件中的文件名很简单(不含目录)。如果存在目录结构,则需要编写额外的代码来处理。
declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";
(:~
 : Entry-filter callback for compression:unzip.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip
 : @param $param extra filter parameters (unused)
 : @return always true(), so no entry is skipped
 :)
declare function fw:filter($path as xs:string, $type as xs:string, $param as item()*) as xs:boolean {
    (: nothing is filtered out :)
    true()
};
(:~
 : Entry-data callback for compression:unzip: stores the entry content in the
 : collection named by $param/@directory.
 :
 : The entry path is passed through xmldb:encode first, because names such as
 : [Content_Types].xml contain characters that are illegal in resource names.
 : Entries ending in '.rels' are XML, so they are stored with an explicit
 : 'application/xml' mime type; adding the suffix to the mime-types.xml
 : configuration file would be an alternative.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip (unused)
 : @param $data  entry content
 : @param $param element whose @directory attribute names the target collection
 : @return whatever xmldb:store returns
 :)
declare function fw:process($path as xs:string,$type as xs:string, $data as item()? , $param as item()*) {
    let $resource-name := xmldb:encode($path)
    let $target := $param/@directory
    return
        if (not(ends-with($resource-name, '.rels'))) then
            (: mime type is taken from the file suffix :)
            xmldb:store($target, $resource-name, $data)
        else
            xmldb:store($target, $resource-name, $data, 'application/xml')
};
(: Main query: fetch a zip file over HTTP, store every entry in a database collection and report what was stored. :)
(: Parent collection for the unzip target; the trailing "/" lets the target name be appended directly. :)
let $baseCollection := "/db/apps/zip/data/"
(: Zip location from the "uri" request parameter; defaults to the ISO 3166-1 list. :)
(: NOTE(review): the value is fetched as given, with no validation - confirm this is acceptable. :)
let $uri := request:get-parameter("uri","http://www.iso.org/iso/iso_3166-1_list_en.zip")
(: Name of the target collection from the "dir" request parameter; defaults to "temp". :)
let $unzipCollection := request:get-parameter("dir","temp")
(: Body text of the HTTP response; passed to compression:unzip as the zip data. :)
let $zip := httpclient:get(xs:anyURI($uri), true(), ())/httpclient:body/text()
(: Function handles for the two callbacks, looked up by QName and arity. :)
let $filter := util:function(QName("http://www.cems.uwe.ac.uk/xmlwiki/fw","fw:filter"),3)
let $process := util:function(QName("http://www.cems.uwe.ac.uk/xmlwiki/fw","fw:process"),4)
(: Log in before the collection is created and the entries are stored. :)
(: NOTE(review): hard-coded admin credentials - replace before using outside a tutorial. :)
let $login := xmldb:login("/db","admin","password")
let $fullPath := concat($baseCollection, $unzipCollection)
(: Create the target collection only when it does not exist yet. :)
let $mkdir :=
if (xmldb:collection-available($fullPath))
then ()
else xmldb:create-collection($baseCollection, $unzipCollection)
(: The target collection path reaches fw:process as $param/@directory. :)
let $store := compression:unzip($zip,$filter,(),$process,<param directory="{$fullPath}"/>)
(: Wrap each item returned by the process callback in a file element. :)
return
<result>
{for $file in $store
return
<file>{$file}</file>
}
</result>
大多数 zip 文件包含由多个文件组成的目录树。在解压缩文件时,需要在数据库中重新创建此目录结构。我们可以修改 process 函数,使其根据需要创建数据库集合(这里假设上级目录总是先于其子目录被引用)。
declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";
(:~
 : Entry-filter callback for compression:unzip.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip (unused)
 : @param $param extra filter parameters (unused)
 : @return false() for entries whose path ends in ".bin", true() for all others
 :)
declare function fw:filter($path as xs:string, $type as xs:string, $param as item()*) as xs:boolean {
    (: skip the files that are not required :)
    not(ends-with($path, ".bin"))
};
(:~
 : Entry-data callback for compression:unzip: stores one zip entry below the
 : collection named by $param/@collection, re-creating the directory part of
 : the entry path as a sub-collection.
 :
 : @param $path  path of the entry inside the zip file, using "/" separators
 : @param $type  entry type as reported by compression:unzip ("resource" or "folder")
 : @param $data  entry content; may be empty
 : @param $param element whose @collection attribute names the base collection
 : @return whatever xmldb:store returns for a resource entry; the empty
 :         sequence for a folder entry
 :)
declare function fw:process($path as xs:string,$type as xs:string, $data as item()? , $param as item()*) {
    (: parse the path and create a collection if necessary :)
    let $steps := tokenize($path,"/")
    let $nsteps := count($steps)
    let $filename := $steps[$nsteps]
    let $collection := string-join(subsequence($steps,1,$nsteps - 1 ),"/")
    let $baseCollection := string($param/@collection)
    (: an entry at the root of the zip has no directory part; use the base
       collection as is rather than appending a dangling "/" :)
    let $fullCollection :=
        if ($collection eq "") then $baseCollection
        else concat($baseCollection,"/",$collection)
    let $mkdir :=
        if ($collection eq "" or xmldb:collection-available($fullCollection)) then ()
        else xmldb:create-collection($baseCollection, $collection)
    return
        (: a folder entry such as "word/" has an empty last step and no content,
           so only its collection is created and nothing is stored :)
        if ($type eq "folder") then ()
        else xmldb:store($fullCollection, xmldb:encode($filename), $data)
};
(: Main query: fetch a zip file over HTTP, store its entries below a database collection and report what was stored. :)
(: Parent collection for the unzip target; the trailing "/" lets the target name be appended directly. :)
let $baseCollection := "/db/apps/zip/data/"
(: Zip location from the "path" request parameter; defaults to the ISO 3166-1 list. :)
(: NOTE(review): the value is fetched as given, with no validation - confirm this is acceptable. :)
let $path := request:get-parameter("path","http://www.iso.org/iso/iso_3166-1_list_en.zip")
(: Name of the target collection from the "dir" request parameter; defaults to "temp". :)
let $unzipCollection := request:get-parameter("dir","temp")
(: Body text of the HTTP response; passed to compression:unzip as the zip data. :)
let $zip := httpclient:get(xs:anyURI($path), true(), ())/httpclient:body/text()
(: Function handles for the two callbacks, looked up by QName and arity. :)
let $filter := util:function(QName("http://www.cems.uwe.ac.uk/xmlwiki/fw","fw:filter"),3)
let $process := util:function(QName("http://www.cems.uwe.ac.uk/xmlwiki/fw","fw:process"),4)
(: Log in before the collection is created and the entries are stored. :)
(: NOTE(review): hard-coded admin credentials - replace before using outside a tutorial. :)
let $login := xmldb:login("/db","admin","password")
let $collection := concat($baseCollection, $unzipCollection)
(: Create the target collection only when it does not exist yet. :)
let $mkdir :=
if (xmldb:collection-available($collection)) then ()
else xmldb:create-collection($baseCollection, $unzipCollection)
(: The target collection path reaches fw:process as $param/@collection. :)
let $store := compression:unzip($zip,$filter,(),$process,<param collection="{$collection}"/>)
(: Wrap each item returned by the process callback in a file element. :)
return
<result>
{for $file in $store
return
<file>{$file}</file>
}
</result>
在解压缩 zip 文件之前,将它们作为二进制资源存储在数据库中可能很有用。默认情况下,以 .zip 为后缀的文件被存储为二进制数据。要在 eXist 中存储 .docx 和 .xlsx 文件,您需要将这些后缀添加到 $EXIST_HOME/mime-types.xml 配置文件的相应条目中。
更改
<mime-type name="application/zip" type="binary">
<description>ZIP archive</description>
<extensions>.zip</extensions>
</mime-type>
为
<mime-type name="application/zip" type="binary">
<description>ZIP archive and Office Open XML</description>
<extensions>.zip,.docx,.xlsx,.pptx</extensions>
</mime-type>
您将需要重新启动服务器以使此更改生效。
基本脚本保持不变,只是进行了细微的修改
let $path := request:get-parameter("path","http://www.iso.org/iso/iso_3166-1_list_en.zip")
let $unzipCollection := request:get-parameter("dir","temp")
let $zip :=
if (starts-with($path,"http"))
then httpclient:get(xs:anyURI($path), true(), ())/httpclient:body/text()
else util:binary-doc($path)