rlm@10
|
1 ; Copyright (c) Chris Houser, Dec 2008. All rights reserved.
|
rlm@10
|
2 ; The use and distribution terms for this software are covered by the
|
rlm@10
|
3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
|
rlm@10
|
4 ; which can be found in the file epl-v10.html at the root of this distribution.
|
rlm@10
|
5 ; By using this software in any fashion, you are agreeing to be bound by
|
rlm@10
|
6 ; the terms of this license.
|
rlm@10
|
7 ; You must not remove this notice, or any other, from this software.
|
rlm@10
|
8
|
rlm@10
|
9 ; Functions to parse xml lazily and emit back to text.
|
rlm@10
|
10
|
rlm@10
|
11 (ns
|
rlm@10
|
12 ^{:author "Chris Houser",
|
rlm@10
|
13 :doc "Functions to parse xml lazily and emit back to text."}
|
rlm@10
|
14 clojure.contrib.lazy-xml
|
rlm@10
|
15 (:use [clojure.xml :as xml :only []]
|
rlm@10
|
16 [clojure.contrib.seq :only [fill-queue]])
|
rlm@10
|
17 (:import (org.xml.sax Attributes InputSource)
|
rlm@10
|
18 (org.xml.sax.helpers DefaultHandler)
|
rlm@10
|
19 (javax.xml.parsers SAXParserFactory)
|
rlm@10
|
20 (java.util.concurrent LinkedBlockingQueue TimeUnit)
|
rlm@10
|
21 (java.lang.ref WeakReference)
|
rlm@10
|
22 (java.io Reader)))
|
rlm@10
|
23
|
rlm@10
|
24 (defstruct node :type :name :attrs :str)
|
rlm@10
|
25
|
rlm@10
|
26 ; http://www.extreme.indiana.edu/xgws/xsoap/xpp/
|
rlm@10
|
27 (def has-pull false)
|
rlm@10
|
28 (defn- parse-seq-pull [& _])
|
rlm@10
|
29 (try
|
rlm@10
|
30 (load "lazy_xml/with_pull")
|
rlm@10
|
31 (catch Exception e
|
rlm@10
|
32 (when-not (re-find #"XmlPullParser" (str e))
|
rlm@10
|
33 (throw e))))
|
rlm@10
|
34
|
rlm@10
|
35 (defn startparse-sax [s ch]
|
rlm@10
|
36 (.. SAXParserFactory newInstance newSAXParser (parse s ch)))
|
rlm@10
|
37
|
rlm@10
|
38 (defn parse-seq
|
rlm@10
|
39 "Parses the source s, which can be a File, InputStream or String
|
rlm@10
|
40 naming a URI. Returns a lazy sequence of maps with two or more of
|
rlm@10
|
41 the keys :type, :name, :attrs, and :str. Other SAX-compatible
|
rlm@10
|
42 parsers can be supplied by passing startparse, a fn taking a source
|
rlm@10
|
43 and a ContentHandler and returning a parser. If a parser is
|
rlm@10
|
44 specified, it will be run in a separate thread and be allowed to get
|
rlm@10
|
45 ahead by queue-size items, which defaults to maxint. If no parser
|
rlm@10
|
46 is specified and org.xmlpull.v1.XmlPullParser is in the classpath,
|
rlm@10
|
47 this superior pull parser will be used."
|
rlm@10
|
48 ([s] (if has-pull
|
rlm@10
|
49 (parse-seq-pull s)
|
rlm@10
|
50 (parse-seq s startparse-sax)))
|
rlm@10
|
51 ([s startparse] (parse-seq s startparse Integer/MAX_VALUE))
|
rlm@10
|
52 ([s startparse queue-size]
|
rlm@10
|
53 (let [s (if (instance? Reader s) (InputSource. s) s)
|
rlm@10
|
54 f (fn filler-func [fill]
|
rlm@10
|
55 (startparse s (proxy [DefaultHandler] []
|
rlm@10
|
56 (startElement [uri local-name q-name ^Attributes atts]
|
rlm@10
|
57 ;(prn :start-element q-name)(flush)
|
rlm@10
|
58 (let [attrs (into {} (for [i (range (.getLength atts))]
|
rlm@10
|
59 [(keyword (.getQName atts i))
|
rlm@10
|
60 (.getValue atts i)]))]
|
rlm@10
|
61 (fill (struct node :start-element (keyword q-name) attrs))))
|
rlm@10
|
62 (endElement [uri local-name q-name]
|
rlm@10
|
63 ;(prn :end-element q-name)(flush)
|
rlm@10
|
64 (fill (struct node :end-element (keyword q-name))))
|
rlm@10
|
65 (characters [ch start length]
|
rlm@10
|
66 ;(prn :characters)(flush)
|
rlm@10
|
67 (let [st (String. ch start length)]
|
rlm@10
|
68 (when (seq (.trim st))
|
rlm@10
|
69 (fill (struct node :characters nil nil st))))))))]
|
rlm@10
|
70 (fill-queue f :queue-size queue-size))))
|
rlm@10
|
71
|
rlm@10
|
72
|
rlm@10
|
73 (defstruct element :tag :attrs :content)
|
rlm@10
|
74 (declare mktree)
|
rlm@10
|
75
|
rlm@10
|
76 (defn- siblings [coll]
|
rlm@10
|
77 (lazy-seq
|
rlm@10
|
78 (when-let [s (seq coll)]
|
rlm@10
|
79 (let [event (first s)]
|
rlm@10
|
80 (condp = (:type event)
|
rlm@10
|
81 :characters (cons (:str event) (siblings (rest s)))
|
rlm@10
|
82 :start-element (let [t (mktree s)]
|
rlm@10
|
83 (cons (first t) (siblings (rest t))))
|
rlm@10
|
84 :end-element [(rest s)])))))
|
rlm@10
|
85
|
rlm@10
|
86 (defn- mktree
|
rlm@10
|
87 [[elem & events]]
|
rlm@10
|
88 (lazy-seq
|
rlm@10
|
89 (let [sibs (siblings events)]
|
rlm@10
|
90 ;(prn :elem elem)
|
rlm@10
|
91 (cons
|
rlm@10
|
92 (struct element (:name elem) (:attrs elem) (drop-last sibs))
|
rlm@10
|
93 (lazy-seq (last sibs))))))
|
rlm@10
|
94
|
rlm@10
|
95 (defn parse-trim
|
rlm@10
|
96 "Parses the source s, which can be a File, InputStream or String
|
rlm@10
|
97 naming a URI. Returns a lazy tree of the clojure.xml/element
|
rlm@10
|
98 struct-map, which has the keys :tag, :attrs, and :content and
|
rlm@10
|
99 accessor fns tag, attrs, and content, with the whitespace trimmed
|
rlm@10
|
100 from around each content string. This format is compatible with what
|
rlm@10
|
101 clojure.xml/parse produces, except :content is a lazy seq instead of
|
rlm@10
|
102 a vector. Other SAX-compatible parsers can be supplied by passing
|
rlm@10
|
103 startparse, a fn taking a source and a ContentHandler and returning
|
rlm@10
|
104 a parser. If a parser is specified, it will be run in a separate
|
rlm@10
|
105 thread and be allowed to get ahead by queue-size items, which
|
rlm@10
|
106 defaults to maxing. If no parser is specified and
|
rlm@10
|
107 org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull
|
rlm@10
|
108 parser will be used."
|
rlm@10
|
109 ([s] (first (mktree (parse-seq s))))
|
rlm@10
|
110 ([s startparse queue-size]
|
rlm@10
|
111 (first (mktree (parse-seq s startparse queue-size)))))
|
rlm@10
|
112
|
rlm@10
|
113 (defn attributes [e]
|
rlm@10
|
114 (let [v (vec (:attrs e))]
|
rlm@10
|
115 (reify org.xml.sax.Attributes
|
rlm@10
|
116 (getLength [_] (count v))
|
rlm@10
|
117 (getURI [_ i] (namespace (key (v i))))
|
rlm@10
|
118 (getLocalName [_ i] (name (key (v i))))
|
rlm@10
|
119 (getQName [_ i] (name (key (v i))))
|
rlm@10
|
120 (getValue [_ uri name] (get (:attrs e) name))
|
rlm@10
|
121 (^String getValue [_ ^int i] (val (v i)))
|
rlm@10
|
122 (^String getType [_ ^int i] "CDATA"))))
|
rlm@10
|
123
|
rlm@10
|
124 (defn- emit-element
|
rlm@10
|
125 "Recursively prints as XML text the element struct e. To have it
|
rlm@10
|
126 print extra whitespace like clojure.xml/emit, use the :pad true
|
rlm@10
|
127 option."
|
rlm@10
|
128 [e ^org.xml.sax.ContentHandler ch]
|
rlm@10
|
129 (if (instance? String e)
|
rlm@10
|
130 (.characters ch (.toCharArray ^String e) 0 (count e))
|
rlm@10
|
131 (let [nspace (namespace (:tag e))
|
rlm@10
|
132 qname (name (:tag e))]
|
rlm@10
|
133 (.startElement ch (or nspace "") qname qname (attributes e))
|
rlm@10
|
134 (doseq [c (:content e)]
|
rlm@10
|
135 (emit-element c ch))
|
rlm@10
|
136 (.endElement ch (or nspace "") qname qname))))
|
rlm@10
|
137
|
rlm@10
|
138
|
rlm@10
|
139 (defn emit
|
rlm@10
|
140 [e & {:as opts}]
|
rlm@10
|
141 (let [content-handler (atom nil)
|
rlm@10
|
142 trans (-> (javax.xml.transform.TransformerFactory/newInstance)
|
rlm@10
|
143 .newTransformer)]
|
rlm@10
|
144
|
rlm@10
|
145 (when (:indent opts)
|
rlm@10
|
146 (.setOutputProperty trans "indent" "yes")
|
rlm@10
|
147 (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount"
|
rlm@10
|
148 (str (:indent opts))))
|
rlm@10
|
149
|
rlm@10
|
150 (when (contains? opts :xml-declaration)
|
rlm@10
|
151 (.setOutputProperty trans "omit-xml-declaration"
|
rlm@10
|
152 (if (:xml-declaration opts) "no" "yes")))
|
rlm@10
|
153
|
rlm@10
|
154 (when (:encoding opts)
|
rlm@10
|
155 (.setOutputProperty trans "encoding" (:encoding opts)))
|
rlm@10
|
156
|
rlm@10
|
157 (.transform
|
rlm@10
|
158 trans
|
rlm@10
|
159 (javax.xml.transform.sax.SAXSource.
|
rlm@10
|
160 (reify org.xml.sax.XMLReader
|
rlm@10
|
161 (getContentHandler [_] @content-handler)
|
rlm@10
|
162 (setDTDHandler [_ handler])
|
rlm@10
|
163 (setFeature [_ name value])
|
rlm@10
|
164 (setProperty [_ name value])
|
rlm@10
|
165 (setContentHandler [_ ch] (reset! content-handler ch))
|
rlm@10
|
166 (^void parse [_ ^org.xml.sax.InputSource _]
|
rlm@10
|
167 (when @content-handler
|
rlm@10
|
168 (.startDocument @content-handler)
|
rlm@10
|
169 (emit-element e @content-handler)
|
rlm@10
|
170 (.endDocument @content-handler))))
|
rlm@10
|
171 (org.xml.sax.InputSource.))
|
rlm@10
|
172 (javax.xml.transform.stream.StreamResult. *out*))))
|
rlm@10
|
173
|
rlm@10
|
174 (comment
|
rlm@10
|
175
|
rlm@10
|
176 (def atomstr "<?xml version='1.0' encoding='UTF-8'?>
|
rlm@10
|
177 <feed xmlns='http://www.w3.org/2005/Atom'>
|
rlm@10
|
178 <id>tag:blogger.com,1999:blog-28403206</id>
|
rlm@10
|
179 <updated>2008-02-14T08:00:58.567-08:00</updated>
|
rlm@10
|
180 <title type='text'>n01senet</title>
|
rlm@10
|
181 <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/>
|
rlm@10
|
182 <entry xmlns:foo='http://foo' xmlns:bar='http://bar'>
|
rlm@10
|
183 <id>1</id>
|
rlm@10
|
184 <published>2008-02-13</published>
|
rlm@10
|
185 <title type='text'>clojure is the best lisp yet</title>
|
rlm@10
|
186 <author><name>Chouser</name></author>
|
rlm@10
|
187 </entry>
|
rlm@10
|
188 <entry>
|
rlm@10
|
189 <id>2</id>
|
rlm@10
|
190 <published>2008-02-07</published>
|
rlm@10
|
191 <title type='text'>experimenting with vnc</title>
|
rlm@10
|
192 <author><name>agriffis</name></author>
|
rlm@10
|
193 </entry>
|
rlm@10
|
194 </feed>
|
rlm@10
|
195 ")
|
rlm@10
|
196
|
rlm@10
|
197 (def tree (parse-trim (java.io.StringReader. atomstr)
|
rlm@10
|
198 startparse-sax
|
rlm@10
|
199 1))
|
rlm@10
|
200 (println "\nsax")
|
rlm@10
|
201 (emit tree)
|
rlm@10
|
202
|
rlm@10
|
203 (def tree (parse-trim (java.io.StringReader. atomstr)))
|
rlm@10
|
204 (println "\ndefault")
|
rlm@10
|
205 (emit tree)
|
rlm@10
|
206
|
rlm@10
|
207 (def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr))))
|
rlm@10
|
208 (println "\norig")
|
rlm@10
|
209 (emit tree)
|
rlm@10
|
210
|
rlm@10
|
211 ; When used with zip and zip-filter, you can get do queries like this
|
rlm@10
|
212 ; without parsing more than the first few tags:
|
rlm@10
|
213 ; (zip/node (first (xml-> (zip/xml-zip tree) :id)))
|
rlm@10
|
214
|
rlm@10
|
215 )
|