comparison src/clojure/contrib/lazy_xml.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
comparison
equal deleted inserted replaced
9:35cf337adfcf 10:ef7dbbd6452c
1 ; Copyright (c) Chris Houser, Dec 2008. All rights reserved.
2 ; The use and distribution terms for this software are covered by the
3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
4 ; which can be found in the file epl-v10.html at the root of this distribution.
5 ; By using this software in any fashion, you are agreeing to be bound by
6 ; the terms of this license.
7 ; You must not remove this notice, or any other, from this software.
8
9 ; Functions to parse xml lazily and emit back to text.
10
11 (ns
12 ^{:author "Chris Houser",
13 :doc "Functions to parse xml lazily and emit back to text."}
14 clojure.contrib.lazy-xml
15 (:use [clojure.xml :as xml :only []]
16 [clojure.contrib.seq :only [fill-queue]])
17 (:import (org.xml.sax Attributes InputSource)
18 (org.xml.sax.helpers DefaultHandler)
19 (javax.xml.parsers SAXParserFactory)
20 (java.util.concurrent LinkedBlockingQueue TimeUnit)
21 (java.lang.ref WeakReference)
22 (java.io Reader)))
23
24 (defstruct node :type :name :attrs :str)
25
26 ; http://www.extreme.indiana.edu/xgws/xsoap/xpp/
27 (def has-pull false)
28 (defn- parse-seq-pull [& _])
29 (try
30 (load "lazy_xml/with_pull")
31 (catch Exception e
32 (when-not (re-find #"XmlPullParser" (str e))
33 (throw e))))
34
35 (defn startparse-sax [s ch]
36 (.. SAXParserFactory newInstance newSAXParser (parse s ch)))
37
38 (defn parse-seq
39 "Parses the source s, which can be a File, InputStream or String
40 naming a URI. Returns a lazy sequence of maps with two or more of
41 the keys :type, :name, :attrs, and :str. Other SAX-compatible
42 parsers can be supplied by passing startparse, a fn taking a source
43 and a ContentHandler and returning a parser. If a parser is
44 specified, it will be run in a separate thread and be allowed to get
45 ahead by queue-size items, which defaults to maxint. If no parser
46 is specified and org.xmlpull.v1.XmlPullParser is in the classpath,
47 this superior pull parser will be used."
48 ([s] (if has-pull
49 (parse-seq-pull s)
50 (parse-seq s startparse-sax)))
51 ([s startparse] (parse-seq s startparse Integer/MAX_VALUE))
52 ([s startparse queue-size]
53 (let [s (if (instance? Reader s) (InputSource. s) s)
54 f (fn filler-func [fill]
55 (startparse s (proxy [DefaultHandler] []
56 (startElement [uri local-name q-name ^Attributes atts]
57 ;(prn :start-element q-name)(flush)
58 (let [attrs (into {} (for [i (range (.getLength atts))]
59 [(keyword (.getQName atts i))
60 (.getValue atts i)]))]
61 (fill (struct node :start-element (keyword q-name) attrs))))
62 (endElement [uri local-name q-name]
63 ;(prn :end-element q-name)(flush)
64 (fill (struct node :end-element (keyword q-name))))
65 (characters [ch start length]
66 ;(prn :characters)(flush)
67 (let [st (String. ch start length)]
68 (when (seq (.trim st))
69 (fill (struct node :characters nil nil st))))))))]
70 (fill-queue f :queue-size queue-size))))
71
72
73 (defstruct element :tag :attrs :content)
74 (declare mktree)
75
76 (defn- siblings [coll]
77 (lazy-seq
78 (when-let [s (seq coll)]
79 (let [event (first s)]
80 (condp = (:type event)
81 :characters (cons (:str event) (siblings (rest s)))
82 :start-element (let [t (mktree s)]
83 (cons (first t) (siblings (rest t))))
84 :end-element [(rest s)])))))
85
86 (defn- mktree
87 [[elem & events]]
88 (lazy-seq
89 (let [sibs (siblings events)]
90 ;(prn :elem elem)
91 (cons
92 (struct element (:name elem) (:attrs elem) (drop-last sibs))
93 (lazy-seq (last sibs))))))
94
95 (defn parse-trim
96 "Parses the source s, which can be a File, InputStream or String
97 naming a URI. Returns a lazy tree of the clojure.xml/element
98 struct-map, which has the keys :tag, :attrs, and :content and
99 accessor fns tag, attrs, and content, with the whitespace trimmed
100 from around each content string. This format is compatible with what
101 clojure.xml/parse produces, except :content is a lazy seq instead of
102 a vector. Other SAX-compatible parsers can be supplied by passing
103 startparse, a fn taking a source and a ContentHandler and returning
104 a parser. If a parser is specified, it will be run in a separate
105 thread and be allowed to get ahead by queue-size items, which
106 defaults to maxing. If no parser is specified and
107 org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull
108 parser will be used."
109 ([s] (first (mktree (parse-seq s))))
110 ([s startparse queue-size]
111 (first (mktree (parse-seq s startparse queue-size)))))
112
113 (defn attributes [e]
114 (let [v (vec (:attrs e))]
115 (reify org.xml.sax.Attributes
116 (getLength [_] (count v))
117 (getURI [_ i] (namespace (key (v i))))
118 (getLocalName [_ i] (name (key (v i))))
119 (getQName [_ i] (name (key (v i))))
120 (getValue [_ uri name] (get (:attrs e) name))
121 (^String getValue [_ ^int i] (val (v i)))
122 (^String getType [_ ^int i] "CDATA"))))
123
124 (defn- emit-element
125 "Recursively prints as XML text the element struct e. To have it
126 print extra whitespace like clojure.xml/emit, use the :pad true
127 option."
128 [e ^org.xml.sax.ContentHandler ch]
129 (if (instance? String e)
130 (.characters ch (.toCharArray ^String e) 0 (count e))
131 (let [nspace (namespace (:tag e))
132 qname (name (:tag e))]
133 (.startElement ch (or nspace "") qname qname (attributes e))
134 (doseq [c (:content e)]
135 (emit-element c ch))
136 (.endElement ch (or nspace "") qname qname))))
137
138
139 (defn emit
140 [e & {:as opts}]
141 (let [content-handler (atom nil)
142 trans (-> (javax.xml.transform.TransformerFactory/newInstance)
143 .newTransformer)]
144
145 (when (:indent opts)
146 (.setOutputProperty trans "indent" "yes")
147 (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount"
148 (str (:indent opts))))
149
150 (when (contains? opts :xml-declaration)
151 (.setOutputProperty trans "omit-xml-declaration"
152 (if (:xml-declaration opts) "no" "yes")))
153
154 (when (:encoding opts)
155 (.setOutputProperty trans "encoding" (:encoding opts)))
156
157 (.transform
158 trans
159 (javax.xml.transform.sax.SAXSource.
160 (reify org.xml.sax.XMLReader
161 (getContentHandler [_] @content-handler)
162 (setDTDHandler [_ handler])
163 (setFeature [_ name value])
164 (setProperty [_ name value])
165 (setContentHandler [_ ch] (reset! content-handler ch))
166 (^void parse [_ ^org.xml.sax.InputSource _]
167 (when @content-handler
168 (.startDocument @content-handler)
169 (emit-element e @content-handler)
170 (.endDocument @content-handler))))
171 (org.xml.sax.InputSource.))
172 (javax.xml.transform.stream.StreamResult. *out*))))
173
174 (comment
175
176 (def atomstr "<?xml version='1.0' encoding='UTF-8'?>
177 <feed xmlns='http://www.w3.org/2005/Atom'>
178 <id>tag:blogger.com,1999:blog-28403206</id>
179 <updated>2008-02-14T08:00:58.567-08:00</updated>
180 <title type='text'>n01senet</title>
181 <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/>
182 <entry xmlns:foo='http://foo' xmlns:bar='http://bar'>
183 <id>1</id>
184 <published>2008-02-13</published>
185 <title type='text'>clojure is the best lisp yet</title>
186 <author><name>Chouser</name></author>
187 </entry>
188 <entry>
189 <id>2</id>
190 <published>2008-02-07</published>
191 <title type='text'>experimenting with vnc</title>
192 <author><name>agriffis</name></author>
193 </entry>
194 </feed>
195 ")
196
197 (def tree (parse-trim (java.io.StringReader. atomstr)
198 startparse-sax
199 1))
200 (println "\nsax")
201 (emit tree)
202
203 (def tree (parse-trim (java.io.StringReader. atomstr)))
204 (println "\ndefault")
205 (emit tree)
206
207 (def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr))))
208 (println "\norig")
209 (emit tree)
210
211 ; When used with zip and zip-filter, you can get do queries like this
212 ; without parsing more than the first few tags:
213 ; (zip/node (first (xml-> (zip/xml-zip tree) :id)))
214
215 )