annotate src/clojure/contrib/lazy_xml.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
rev   line source
rlm@10 1 ; Copyright (c) Chris Houser, Dec 2008. All rights reserved.
rlm@10 2 ; The use and distribution terms for this software are covered by the
rlm@10 3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
rlm@10 4 ; which can be found in the file epl-v10.html at the root of this distribution.
rlm@10 5 ; By using this software in any fashion, you are agreeing to be bound by
rlm@10 6 ; the terms of this license.
rlm@10 7 ; You must not remove this notice, or any other, from this software.
rlm@10 8
rlm@10 9 ; Functions to parse xml lazily and emit back to text.
rlm@10 10
rlm@10 11 (ns
rlm@10 12 ^{:author "Chris Houser",
rlm@10 13 :doc "Functions to parse xml lazily and emit back to text."}
rlm@10 14 clojure.contrib.lazy-xml
rlm@10 15 (:use [clojure.xml :as xml :only []]
rlm@10 16 [clojure.contrib.seq :only [fill-queue]])
rlm@10 17 (:import (org.xml.sax Attributes InputSource)
rlm@10 18 (org.xml.sax.helpers DefaultHandler)
rlm@10 19 (javax.xml.parsers SAXParserFactory)
rlm@10 20 (java.util.concurrent LinkedBlockingQueue TimeUnit)
rlm@10 21 (java.lang.ref WeakReference)
rlm@10 22 (java.io Reader)))
rlm@10 23
rlm@10 24 (defstruct node :type :name :attrs :str)
rlm@10 25
rlm@10 26 ; http://www.extreme.indiana.edu/xgws/xsoap/xpp/
rlm@10 27 (def has-pull false)
rlm@10 28 (defn- parse-seq-pull [& _])
rlm@10 29 (try
rlm@10 30 (load "lazy_xml/with_pull")
rlm@10 31 (catch Exception e
rlm@10 32 (when-not (re-find #"XmlPullParser" (str e))
rlm@10 33 (throw e))))
rlm@10 34
rlm@10 35 (defn startparse-sax [s ch]
rlm@10 36 (.. SAXParserFactory newInstance newSAXParser (parse s ch)))
rlm@10 37
rlm@10 38 (defn parse-seq
rlm@10 39 "Parses the source s, which can be a File, InputStream or String
rlm@10 40 naming a URI. Returns a lazy sequence of maps with two or more of
rlm@10 41 the keys :type, :name, :attrs, and :str. Other SAX-compatible
rlm@10 42 parsers can be supplied by passing startparse, a fn taking a source
rlm@10 43 and a ContentHandler and returning a parser. If a parser is
rlm@10 44 specified, it will be run in a separate thread and be allowed to get
rlm@10 45 ahead by queue-size items, which defaults to maxint. If no parser
rlm@10 46 is specified and org.xmlpull.v1.XmlPullParser is in the classpath,
rlm@10 47 this superior pull parser will be used."
rlm@10 48 ([s] (if has-pull
rlm@10 49 (parse-seq-pull s)
rlm@10 50 (parse-seq s startparse-sax)))
rlm@10 51 ([s startparse] (parse-seq s startparse Integer/MAX_VALUE))
rlm@10 52 ([s startparse queue-size]
rlm@10 53 (let [s (if (instance? Reader s) (InputSource. s) s)
rlm@10 54 f (fn filler-func [fill]
rlm@10 55 (startparse s (proxy [DefaultHandler] []
rlm@10 56 (startElement [uri local-name q-name ^Attributes atts]
rlm@10 57 ;(prn :start-element q-name)(flush)
rlm@10 58 (let [attrs (into {} (for [i (range (.getLength atts))]
rlm@10 59 [(keyword (.getQName atts i))
rlm@10 60 (.getValue atts i)]))]
rlm@10 61 (fill (struct node :start-element (keyword q-name) attrs))))
rlm@10 62 (endElement [uri local-name q-name]
rlm@10 63 ;(prn :end-element q-name)(flush)
rlm@10 64 (fill (struct node :end-element (keyword q-name))))
rlm@10 65 (characters [ch start length]
rlm@10 66 ;(prn :characters)(flush)
rlm@10 67 (let [st (String. ch start length)]
rlm@10 68 (when (seq (.trim st))
rlm@10 69 (fill (struct node :characters nil nil st))))))))]
rlm@10 70 (fill-queue f :queue-size queue-size))))
rlm@10 71
rlm@10 72
rlm@10 73 (defstruct element :tag :attrs :content)
rlm@10 74 (declare mktree)
rlm@10 75
rlm@10 76 (defn- siblings [coll]
rlm@10 77 (lazy-seq
rlm@10 78 (when-let [s (seq coll)]
rlm@10 79 (let [event (first s)]
rlm@10 80 (condp = (:type event)
rlm@10 81 :characters (cons (:str event) (siblings (rest s)))
rlm@10 82 :start-element (let [t (mktree s)]
rlm@10 83 (cons (first t) (siblings (rest t))))
rlm@10 84 :end-element [(rest s)])))))
rlm@10 85
rlm@10 86 (defn- mktree
rlm@10 87 [[elem & events]]
rlm@10 88 (lazy-seq
rlm@10 89 (let [sibs (siblings events)]
rlm@10 90 ;(prn :elem elem)
rlm@10 91 (cons
rlm@10 92 (struct element (:name elem) (:attrs elem) (drop-last sibs))
rlm@10 93 (lazy-seq (last sibs))))))
rlm@10 94
rlm@10 95 (defn parse-trim
rlm@10 96 "Parses the source s, which can be a File, InputStream or String
rlm@10 97 naming a URI. Returns a lazy tree of the clojure.xml/element
rlm@10 98 struct-map, which has the keys :tag, :attrs, and :content and
rlm@10 99 accessor fns tag, attrs, and content, with the whitespace trimmed
rlm@10 100 from around each content string. This format is compatible with what
rlm@10 101 clojure.xml/parse produces, except :content is a lazy seq instead of
rlm@10 102 a vector. Other SAX-compatible parsers can be supplied by passing
rlm@10 103 startparse, a fn taking a source and a ContentHandler and returning
rlm@10 104 a parser. If a parser is specified, it will be run in a separate
rlm@10 105 thread and be allowed to get ahead by queue-size items, which
rlm@10 106 defaults to maxing. If no parser is specified and
rlm@10 107 org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull
rlm@10 108 parser will be used."
rlm@10 109 ([s] (first (mktree (parse-seq s))))
rlm@10 110 ([s startparse queue-size]
rlm@10 111 (first (mktree (parse-seq s startparse queue-size)))))
rlm@10 112
rlm@10 113 (defn attributes [e]
rlm@10 114 (let [v (vec (:attrs e))]
rlm@10 115 (reify org.xml.sax.Attributes
rlm@10 116 (getLength [_] (count v))
rlm@10 117 (getURI [_ i] (namespace (key (v i))))
rlm@10 118 (getLocalName [_ i] (name (key (v i))))
rlm@10 119 (getQName [_ i] (name (key (v i))))
rlm@10 120 (getValue [_ uri name] (get (:attrs e) name))
rlm@10 121 (^String getValue [_ ^int i] (val (v i)))
rlm@10 122 (^String getType [_ ^int i] "CDATA"))))
rlm@10 123
rlm@10 124 (defn- emit-element
rlm@10 125 "Recursively prints as XML text the element struct e. To have it
rlm@10 126 print extra whitespace like clojure.xml/emit, use the :pad true
rlm@10 127 option."
rlm@10 128 [e ^org.xml.sax.ContentHandler ch]
rlm@10 129 (if (instance? String e)
rlm@10 130 (.characters ch (.toCharArray ^String e) 0 (count e))
rlm@10 131 (let [nspace (namespace (:tag e))
rlm@10 132 qname (name (:tag e))]
rlm@10 133 (.startElement ch (or nspace "") qname qname (attributes e))
rlm@10 134 (doseq [c (:content e)]
rlm@10 135 (emit-element c ch))
rlm@10 136 (.endElement ch (or nspace "") qname qname))))
rlm@10 137
rlm@10 138
rlm@10 139 (defn emit
rlm@10 140 [e & {:as opts}]
rlm@10 141 (let [content-handler (atom nil)
rlm@10 142 trans (-> (javax.xml.transform.TransformerFactory/newInstance)
rlm@10 143 .newTransformer)]
rlm@10 144
rlm@10 145 (when (:indent opts)
rlm@10 146 (.setOutputProperty trans "indent" "yes")
rlm@10 147 (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount"
rlm@10 148 (str (:indent opts))))
rlm@10 149
rlm@10 150 (when (contains? opts :xml-declaration)
rlm@10 151 (.setOutputProperty trans "omit-xml-declaration"
rlm@10 152 (if (:xml-declaration opts) "no" "yes")))
rlm@10 153
rlm@10 154 (when (:encoding opts)
rlm@10 155 (.setOutputProperty trans "encoding" (:encoding opts)))
rlm@10 156
rlm@10 157 (.transform
rlm@10 158 trans
rlm@10 159 (javax.xml.transform.sax.SAXSource.
rlm@10 160 (reify org.xml.sax.XMLReader
rlm@10 161 (getContentHandler [_] @content-handler)
rlm@10 162 (setDTDHandler [_ handler])
rlm@10 163 (setFeature [_ name value])
rlm@10 164 (setProperty [_ name value])
rlm@10 165 (setContentHandler [_ ch] (reset! content-handler ch))
rlm@10 166 (^void parse [_ ^org.xml.sax.InputSource _]
rlm@10 167 (when @content-handler
rlm@10 168 (.startDocument @content-handler)
rlm@10 169 (emit-element e @content-handler)
rlm@10 170 (.endDocument @content-handler))))
rlm@10 171 (org.xml.sax.InputSource.))
rlm@10 172 (javax.xml.transform.stream.StreamResult. *out*))))
rlm@10 173
rlm@10 174 (comment
rlm@10 175
rlm@10 176 (def atomstr "<?xml version='1.0' encoding='UTF-8'?>
rlm@10 177 <feed xmlns='http://www.w3.org/2005/Atom'>
rlm@10 178 <id>tag:blogger.com,1999:blog-28403206</id>
rlm@10 179 <updated>2008-02-14T08:00:58.567-08:00</updated>
rlm@10 180 <title type='text'>n01senet</title>
rlm@10 181 <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/>
rlm@10 182 <entry xmlns:foo='http://foo' xmlns:bar='http://bar'>
rlm@10 183 <id>1</id>
rlm@10 184 <published>2008-02-13</published>
rlm@10 185 <title type='text'>clojure is the best lisp yet</title>
rlm@10 186 <author><name>Chouser</name></author>
rlm@10 187 </entry>
rlm@10 188 <entry>
rlm@10 189 <id>2</id>
rlm@10 190 <published>2008-02-07</published>
rlm@10 191 <title type='text'>experimenting with vnc</title>
rlm@10 192 <author><name>agriffis</name></author>
rlm@10 193 </entry>
rlm@10 194 </feed>
rlm@10 195 ")
rlm@10 196
rlm@10 197 (def tree (parse-trim (java.io.StringReader. atomstr)
rlm@10 198 startparse-sax
rlm@10 199 1))
rlm@10 200 (println "\nsax")
rlm@10 201 (emit tree)
rlm@10 202
rlm@10 203 (def tree (parse-trim (java.io.StringReader. atomstr)))
rlm@10 204 (println "\ndefault")
rlm@10 205 (emit tree)
rlm@10 206
rlm@10 207 (def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr))))
rlm@10 208 (println "\norig")
rlm@10 209 (emit tree)
rlm@10 210
rlm@10 211 ; When used with zip and zip-filter, you can get do queries like this
rlm@10 212 ; without parsing more than the first few tags:
rlm@10 213 ; (zip/node (first (xml-> (zip/xml-zip tree) :id)))
rlm@10 214
rlm@10 215 )