Mercurial > lasercutter
comparison src/clojure/contrib/lazy_xml.clj @ 10:ef7dbbd6452c
added clojure source goodness
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Sat, 21 Aug 2010 06:25:44 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:35cf337adfcf | 10:ef7dbbd6452c |
---|---|
1 ; Copyright (c) Chris Houser, Dec 2008. All rights reserved. | |
2 ; The use and distribution terms for this software are covered by the | |
3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) | |
4 ; which can be found in the file epl-v10.html at the root of this distribution. | |
5 ; By using this software in any fashion, you are agreeing to be bound by | |
6 ; the terms of this license. | |
7 ; You must not remove this notice, or any other, from this software. | |
8 | |
9 ; Functions to parse xml lazily and emit back to text. | |
10 | |
11 (ns | |
12 ^{:author "Chris Houser", | |
13 :doc "Functions to parse xml lazily and emit back to text."} | |
14 clojure.contrib.lazy-xml | |
15 (:use [clojure.xml :as xml :only []] | |
16 [clojure.contrib.seq :only [fill-queue]]) | |
17 (:import (org.xml.sax Attributes InputSource) | |
18 (org.xml.sax.helpers DefaultHandler) | |
19 (javax.xml.parsers SAXParserFactory) | |
20 (java.util.concurrent LinkedBlockingQueue TimeUnit) | |
21 (java.lang.ref WeakReference) | |
22 (java.io Reader))) | |
23 | |
24 (defstruct node :type :name :attrs :str) | |
25 | |
26 ; http://www.extreme.indiana.edu/xgws/xsoap/xpp/ | |
27 (def has-pull false) | |
28 (defn- parse-seq-pull [& _]) | |
29 (try | |
30 (load "lazy_xml/with_pull") | |
31 (catch Exception e | |
32 (when-not (re-find #"XmlPullParser" (str e)) | |
33 (throw e)))) | |
34 | |
35 (defn startparse-sax [s ch] | |
36 (.. SAXParserFactory newInstance newSAXParser (parse s ch))) | |
37 | |
38 (defn parse-seq | |
39 "Parses the source s, which can be a File, InputStream or String | |
40 naming a URI. Returns a lazy sequence of maps with two or more of | |
41 the keys :type, :name, :attrs, and :str. Other SAX-compatible | |
42 parsers can be supplied by passing startparse, a fn taking a source | |
43 and a ContentHandler and returning a parser. If a parser is | |
44 specified, it will be run in a separate thread and be allowed to get | |
45 ahead by queue-size items, which defaults to maxint. If no parser | |
46 is specified and org.xmlpull.v1.XmlPullParser is in the classpath, | |
47 this superior pull parser will be used." | |
48 ([s] (if has-pull | |
49 (parse-seq-pull s) | |
50 (parse-seq s startparse-sax))) | |
51 ([s startparse] (parse-seq s startparse Integer/MAX_VALUE)) | |
52 ([s startparse queue-size] | |
53 (let [s (if (instance? Reader s) (InputSource. s) s) | |
54 f (fn filler-func [fill] | |
55 (startparse s (proxy [DefaultHandler] [] | |
56 (startElement [uri local-name q-name ^Attributes atts] | |
57 ;(prn :start-element q-name)(flush) | |
58 (let [attrs (into {} (for [i (range (.getLength atts))] | |
59 [(keyword (.getQName atts i)) | |
60 (.getValue atts i)]))] | |
61 (fill (struct node :start-element (keyword q-name) attrs)))) | |
62 (endElement [uri local-name q-name] | |
63 ;(prn :end-element q-name)(flush) | |
64 (fill (struct node :end-element (keyword q-name)))) | |
65 (characters [ch start length] | |
66 ;(prn :characters)(flush) | |
67 (let [st (String. ch start length)] | |
68 (when (seq (.trim st)) | |
69 (fill (struct node :characters nil nil st))))))))] | |
70 (fill-queue f :queue-size queue-size)))) | |
71 | |
72 | |
73 (defstruct element :tag :attrs :content) | |
74 (declare mktree) | |
75 | |
76 (defn- siblings [coll] | |
77 (lazy-seq | |
78 (when-let [s (seq coll)] | |
79 (let [event (first s)] | |
80 (condp = (:type event) | |
81 :characters (cons (:str event) (siblings (rest s))) | |
82 :start-element (let [t (mktree s)] | |
83 (cons (first t) (siblings (rest t)))) | |
84 :end-element [(rest s)]))))) | |
85 | |
86 (defn- mktree | |
87 [[elem & events]] | |
88 (lazy-seq | |
89 (let [sibs (siblings events)] | |
90 ;(prn :elem elem) | |
91 (cons | |
92 (struct element (:name elem) (:attrs elem) (drop-last sibs)) | |
93 (lazy-seq (last sibs)))))) | |
94 | |
95 (defn parse-trim | |
96 "Parses the source s, which can be a File, InputStream or String | |
97 naming a URI. Returns a lazy tree of the clojure.xml/element | |
98 struct-map, which has the keys :tag, :attrs, and :content and | |
99 accessor fns tag, attrs, and content, with the whitespace trimmed | |
100 from around each content string. This format is compatible with what | |
101 clojure.xml/parse produces, except :content is a lazy seq instead of | |
102 a vector. Other SAX-compatible parsers can be supplied by passing | |
103 startparse, a fn taking a source and a ContentHandler and returning | |
104 a parser. If a parser is specified, it will be run in a separate | |
105 thread and be allowed to get ahead by queue-size items, which | |
106 defaults to maxing. If no parser is specified and | |
107 org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull | |
108 parser will be used." | |
109 ([s] (first (mktree (parse-seq s)))) | |
110 ([s startparse queue-size] | |
111 (first (mktree (parse-seq s startparse queue-size))))) | |
112 | |
113 (defn attributes [e] | |
114 (let [v (vec (:attrs e))] | |
115 (reify org.xml.sax.Attributes | |
116 (getLength [_] (count v)) | |
117 (getURI [_ i] (namespace (key (v i)))) | |
118 (getLocalName [_ i] (name (key (v i)))) | |
119 (getQName [_ i] (name (key (v i)))) | |
120 (getValue [_ uri name] (get (:attrs e) name)) | |
121 (^String getValue [_ ^int i] (val (v i))) | |
122 (^String getType [_ ^int i] "CDATA")))) | |
123 | |
124 (defn- emit-element | |
125 "Recursively prints as XML text the element struct e. To have it | |
126 print extra whitespace like clojure.xml/emit, use the :pad true | |
127 option." | |
128 [e ^org.xml.sax.ContentHandler ch] | |
129 (if (instance? String e) | |
130 (.characters ch (.toCharArray ^String e) 0 (count e)) | |
131 (let [nspace (namespace (:tag e)) | |
132 qname (name (:tag e))] | |
133 (.startElement ch (or nspace "") qname qname (attributes e)) | |
134 (doseq [c (:content e)] | |
135 (emit-element c ch)) | |
136 (.endElement ch (or nspace "") qname qname)))) | |
137 | |
138 | |
139 (defn emit | |
140 [e & {:as opts}] | |
141 (let [content-handler (atom nil) | |
142 trans (-> (javax.xml.transform.TransformerFactory/newInstance) | |
143 .newTransformer)] | |
144 | |
145 (when (:indent opts) | |
146 (.setOutputProperty trans "indent" "yes") | |
147 (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount" | |
148 (str (:indent opts)))) | |
149 | |
150 (when (contains? opts :xml-declaration) | |
151 (.setOutputProperty trans "omit-xml-declaration" | |
152 (if (:xml-declaration opts) "no" "yes"))) | |
153 | |
154 (when (:encoding opts) | |
155 (.setOutputProperty trans "encoding" (:encoding opts))) | |
156 | |
157 (.transform | |
158 trans | |
159 (javax.xml.transform.sax.SAXSource. | |
160 (reify org.xml.sax.XMLReader | |
161 (getContentHandler [_] @content-handler) | |
162 (setDTDHandler [_ handler]) | |
163 (setFeature [_ name value]) | |
164 (setProperty [_ name value]) | |
165 (setContentHandler [_ ch] (reset! content-handler ch)) | |
166 (^void parse [_ ^org.xml.sax.InputSource _] | |
167 (when @content-handler | |
168 (.startDocument @content-handler) | |
169 (emit-element e @content-handler) | |
170 (.endDocument @content-handler)))) | |
171 (org.xml.sax.InputSource.)) | |
172 (javax.xml.transform.stream.StreamResult. *out*)))) | |
173 | |
174 (comment | |
175 | |
176 (def atomstr "<?xml version='1.0' encoding='UTF-8'?> | |
177 <feed xmlns='http://www.w3.org/2005/Atom'> | |
178 <id>tag:blogger.com,1999:blog-28403206</id> | |
179 <updated>2008-02-14T08:00:58.567-08:00</updated> | |
180 <title type='text'>n01senet</title> | |
181 <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/> | |
182 <entry xmlns:foo='http://foo' xmlns:bar='http://bar'> | |
183 <id>1</id> | |
184 <published>2008-02-13</published> | |
185 <title type='text'>clojure is the best lisp yet</title> | |
186 <author><name>Chouser</name></author> | |
187 </entry> | |
188 <entry> | |
189 <id>2</id> | |
190 <published>2008-02-07</published> | |
191 <title type='text'>experimenting with vnc</title> | |
192 <author><name>agriffis</name></author> | |
193 </entry> | |
194 </feed> | |
195 ") | |
196 | |
197 (def tree (parse-trim (java.io.StringReader. atomstr) | |
198 startparse-sax | |
199 1)) | |
200 (println "\nsax") | |
201 (emit tree) | |
202 | |
203 (def tree (parse-trim (java.io.StringReader. atomstr))) | |
204 (println "\ndefault") | |
205 (emit tree) | |
206 | |
207 (def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr)))) | |
208 (println "\norig") | |
209 (emit tree) | |
210 | |
211 ; When used with zip and zip-filter, you can get do queries like this | |
212 ; without parsing more than the first few tags: | |
213 ; (zip/node (first (xml-> (zip/xml-zip tree) :id))) | |
214 | |
215 ) |