Mercurial > lasercutter
diff src/clojure/contrib/lazy_xml.clj @ 10:ef7dbbd6452c
added clojure source goodness
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Sat, 21 Aug 2010 06:25:44 -0400 |
parents | |
children |
line wrap: on
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/clojure/contrib/lazy_xml.clj Sat Aug 21 06:25:44 2010 -0400 1.3 @@ -0,0 +1,215 @@ 1.4 +; Copyright (c) Chris Houser, Dec 2008. All rights reserved. 1.5 +; The use and distribution terms for this software are covered by the 1.6 +; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) 1.7 +; which can be found in the file epl-v10.html at the root of this distribution. 1.8 +; By using this software in any fashion, you are agreeing to be bound by 1.9 +; the terms of this license. 1.10 +; You must not remove this notice, or any other, from this software. 1.11 + 1.12 +; Functions to parse xml lazily and emit back to text. 1.13 + 1.14 +(ns 1.15 + ^{:author "Chris Houser", 1.16 + :doc "Functions to parse xml lazily and emit back to text."} 1.17 + clojure.contrib.lazy-xml 1.18 + (:use [clojure.xml :as xml :only []] 1.19 + [clojure.contrib.seq :only [fill-queue]]) 1.20 + (:import (org.xml.sax Attributes InputSource) 1.21 + (org.xml.sax.helpers DefaultHandler) 1.22 + (javax.xml.parsers SAXParserFactory) 1.23 + (java.util.concurrent LinkedBlockingQueue TimeUnit) 1.24 + (java.lang.ref WeakReference) 1.25 + (java.io Reader))) 1.26 + 1.27 +(defstruct node :type :name :attrs :str) 1.28 + 1.29 +; http://www.extreme.indiana.edu/xgws/xsoap/xpp/ 1.30 +(def has-pull false) 1.31 +(defn- parse-seq-pull [& _]) 1.32 +(try 1.33 + (load "lazy_xml/with_pull") 1.34 + (catch Exception e 1.35 + (when-not (re-find #"XmlPullParser" (str e)) 1.36 + (throw e)))) 1.37 + 1.38 +(defn startparse-sax [s ch] 1.39 + (.. SAXParserFactory newInstance newSAXParser (parse s ch))) 1.40 + 1.41 +(defn parse-seq 1.42 + "Parses the source s, which can be a File, InputStream or String 1.43 + naming a URI. Returns a lazy sequence of maps with two or more of 1.44 + the keys :type, :name, :attrs, and :str. Other SAX-compatible 1.45 + parsers can be supplied by passing startparse, a fn taking a source 1.46 + and a ContentHandler and returning a parser. If a parser is 1.47 + specified, it will be run in a separate thread and be allowed to get 1.48 + ahead by queue-size items, which defaults to maxint. If no parser 1.49 + is specified and org.xmlpull.v1.XmlPullParser is in the classpath, 1.50 + this superior pull parser will be used." 1.51 + ([s] (if has-pull 1.52 + (parse-seq-pull s) 1.53 + (parse-seq s startparse-sax))) 1.54 + ([s startparse] (parse-seq s startparse Integer/MAX_VALUE)) 1.55 + ([s startparse queue-size] 1.56 + (let [s (if (instance? Reader s) (InputSource. s) s) 1.57 + f (fn filler-func [fill] 1.58 + (startparse s (proxy [DefaultHandler] [] 1.59 + (startElement [uri local-name q-name ^Attributes atts] 1.60 + ;(prn :start-element q-name)(flush) 1.61 + (let [attrs (into {} (for [i (range (.getLength atts))] 1.62 + [(keyword (.getQName atts i)) 1.63 + (.getValue atts i)]))] 1.64 + (fill (struct node :start-element (keyword q-name) attrs)))) 1.65 + (endElement [uri local-name q-name] 1.66 + ;(prn :end-element q-name)(flush) 1.67 + (fill (struct node :end-element (keyword q-name)))) 1.68 + (characters [ch start length] 1.69 + ;(prn :characters)(flush) 1.70 + (let [st (String. ch start length)] 1.71 + (when (seq (.trim st)) 1.72 + (fill (struct node :characters nil nil st))))))))] 1.73 + (fill-queue f :queue-size queue-size)))) 1.74 + 1.75 + 1.76 +(defstruct element :tag :attrs :content) 1.77 +(declare mktree) 1.78 + 1.79 +(defn- siblings [coll] 1.80 + (lazy-seq 1.81 + (when-let [s (seq coll)] 1.82 + (let [event (first s)] 1.83 + (condp = (:type event) 1.84 + :characters (cons (:str event) (siblings (rest s))) 1.85 + :start-element (let [t (mktree s)] 1.86 + (cons (first t) (siblings (rest t)))) 1.87 + :end-element [(rest s)]))))) 1.88 + 1.89 +(defn- mktree 1.90 + [[elem & events]] 1.91 + (lazy-seq 1.92 + (let [sibs (siblings events)] 1.93 + ;(prn :elem elem) 1.94 + (cons 1.95 + (struct element (:name elem) (:attrs elem) (drop-last sibs)) 1.96 + (lazy-seq (last sibs)))))) 1.97 + 1.98 +(defn parse-trim 1.99 + "Parses the source s, which can be a File, InputStream or String 1.100 + naming a URI. Returns a lazy tree of the clojure.xml/element 1.101 + struct-map, which has the keys :tag, :attrs, and :content and 1.102 + accessor fns tag, attrs, and content, with the whitespace trimmed 1.103 + from around each content string. This format is compatible with what 1.104 + clojure.xml/parse produces, except :content is a lazy seq instead of 1.105 + a vector. Other SAX-compatible parsers can be supplied by passing 1.106 + startparse, a fn taking a source and a ContentHandler and returning 1.107 + a parser. If a parser is specified, it will be run in a separate 1.108 + thread and be allowed to get ahead by queue-size items, which 1.109 + defaults to maxing. If no parser is specified and 1.110 + org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull 1.111 + parser will be used." 1.112 + ([s] (first (mktree (parse-seq s)))) 1.113 + ([s startparse queue-size] 1.114 + (first (mktree (parse-seq s startparse queue-size))))) 1.115 + 1.116 +(defn attributes [e] 1.117 + (let [v (vec (:attrs e))] 1.118 + (reify org.xml.sax.Attributes 1.119 + (getLength [_] (count v)) 1.120 + (getURI [_ i] (namespace (key (v i)))) 1.121 + (getLocalName [_ i] (name (key (v i)))) 1.122 + (getQName [_ i] (name (key (v i)))) 1.123 + (getValue [_ uri name] (get (:attrs e) name)) 1.124 + (^String getValue [_ ^int i] (val (v i))) 1.125 + (^String getType [_ ^int i] "CDATA")))) 1.126 + 1.127 +(defn- emit-element 1.128 + "Recursively prints as XML text the element struct e. To have it 1.129 + print extra whitespace like clojure.xml/emit, use the :pad true 1.130 + option." 1.131 + [e ^org.xml.sax.ContentHandler ch] 1.132 + (if (instance? String e) 1.133 + (.characters ch (.toCharArray ^String e) 0 (count e)) 1.134 + (let [nspace (namespace (:tag e)) 1.135 + qname (name (:tag e))] 1.136 + (.startElement ch (or nspace "") qname qname (attributes e)) 1.137 + (doseq [c (:content e)] 1.138 + (emit-element c ch)) 1.139 + (.endElement ch (or nspace "") qname qname)))) 1.140 + 1.141 + 1.142 +(defn emit 1.143 + [e & {:as opts}] 1.144 + (let [content-handler (atom nil) 1.145 + trans (-> (javax.xml.transform.TransformerFactory/newInstance) 1.146 + .newTransformer)] 1.147 + 1.148 + (when (:indent opts) 1.149 + (.setOutputProperty trans "indent" "yes") 1.150 + (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount" 1.151 + (str (:indent opts)))) 1.152 + 1.153 + (when (contains? opts :xml-declaration) 1.154 + (.setOutputProperty trans "omit-xml-declaration" 1.155 + (if (:xml-declaration opts) "no" "yes"))) 1.156 + 1.157 + (when (:encoding opts) 1.158 + (.setOutputProperty trans "encoding" (:encoding opts))) 1.159 + 1.160 + (.transform 1.161 + trans 1.162 + (javax.xml.transform.sax.SAXSource. 1.163 + (reify org.xml.sax.XMLReader 1.164 + (getContentHandler [_] @content-handler) 1.165 + (setDTDHandler [_ handler]) 1.166 + (setFeature [_ name value]) 1.167 + (setProperty [_ name value]) 1.168 + (setContentHandler [_ ch] (reset! content-handler ch)) 1.169 + (^void parse [_ ^org.xml.sax.InputSource _] 1.170 + (when @content-handler 1.171 + (.startDocument @content-handler) 1.172 + (emit-element e @content-handler) 1.173 + (.endDocument @content-handler)))) 1.174 + (org.xml.sax.InputSource.)) 1.175 + (javax.xml.transform.stream.StreamResult. *out*)))) 1.176 + 1.177 +(comment 1.178 + 1.179 +(def atomstr "<?xml version='1.0' encoding='UTF-8'?> 1.180 +<feed xmlns='http://www.w3.org/2005/Atom'> 1.181 + <id>tag:blogger.com,1999:blog-28403206</id> 1.182 + <updated>2008-02-14T08:00:58.567-08:00</updated> 1.183 + <title type='text'>n01senet</title> 1.184 + <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/> 1.185 + <entry xmlns:foo='http://foo' xmlns:bar='http://bar'> 1.186 + <id>1</id> 1.187 + <published>2008-02-13</published> 1.188 + <title type='text'>clojure is the best lisp yet</title> 1.189 + <author><name>Chouser</name></author> 1.190 + </entry> 1.191 + <entry> 1.192 + <id>2</id> 1.193 + <published>2008-02-07</published> 1.194 + <title type='text'>experimenting with vnc</title> 1.195 + <author><name>agriffis</name></author> 1.196 + </entry> 1.197 +</feed> 1.198 +") 1.199 + 1.200 +(def tree (parse-trim (java.io.StringReader. atomstr) 1.201 + startparse-sax 1.202 + 1)) 1.203 +(println "\nsax") 1.204 +(emit tree) 1.205 + 1.206 +(def tree (parse-trim (java.io.StringReader. atomstr))) 1.207 +(println "\ndefault") 1.208 +(emit tree) 1.209 + 1.210 +(def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr)))) 1.211 +(println "\norig") 1.212 +(emit tree) 1.213 + 1.214 +; When used with zip and zip-filter, you can get do queries like this 1.215 +; without parsing more than the first few tags: 1.216 +; (zip/node (first (xml-> (zip/xml-zip tree) :id))) 1.217 + 1.218 +)