Mercurial > lasercutter
view src/clojure/contrib/lazy_xml.clj @ 10:ef7dbbd6452c
added clojure source goodness
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Sat, 21 Aug 2010 06:25:44 -0400 |
parents | |
children |
line wrap: on
line source
1 ; Copyright (c) Chris Houser, Dec 2008. All rights reserved.2 ; The use and distribution terms for this software are covered by the3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)4 ; which can be found in the file epl-v10.html at the root of this distribution.5 ; By using this software in any fashion, you are agreeing to be bound by6 ; the terms of this license.7 ; You must not remove this notice, or any other, from this software.9 ; Functions to parse xml lazily and emit back to text.11 (ns12 ^{:author "Chris Houser",13 :doc "Functions to parse xml lazily and emit back to text."}14 clojure.contrib.lazy-xml15 (:use [clojure.xml :as xml :only []]16 [clojure.contrib.seq :only [fill-queue]])17 (:import (org.xml.sax Attributes InputSource)18 (org.xml.sax.helpers DefaultHandler)19 (javax.xml.parsers SAXParserFactory)20 (java.util.concurrent LinkedBlockingQueue TimeUnit)21 (java.lang.ref WeakReference)22 (java.io Reader)))24 (defstruct node :type :name :attrs :str)26 ; http://www.extreme.indiana.edu/xgws/xsoap/xpp/27 (def has-pull false)28 (defn- parse-seq-pull [& _])29 (try30 (load "lazy_xml/with_pull")31 (catch Exception e32 (when-not (re-find #"XmlPullParser" (str e))33 (throw e))))35 (defn startparse-sax [s ch]36 (.. SAXParserFactory newInstance newSAXParser (parse s ch)))38 (defn parse-seq39 "Parses the source s, which can be a File, InputStream or String40 naming a URI. Returns a lazy sequence of maps with two or more of41 the keys :type, :name, :attrs, and :str. Other SAX-compatible42 parsers can be supplied by passing startparse, a fn taking a source43 and a ContentHandler and returning a parser. If a parser is44 specified, it will be run in a separate thread and be allowed to get45 ahead by queue-size items, which defaults to maxint. If no parser46 is specified and org.xmlpull.v1.XmlPullParser is in the classpath,47 this superior pull parser will be used."48 ([s] (if has-pull49 (parse-seq-pull s)50 (parse-seq s startparse-sax)))51 ([s startparse] (parse-seq s startparse Integer/MAX_VALUE))52 ([s startparse queue-size]53 (let [s (if (instance? Reader s) (InputSource. s) s)54 f (fn filler-func [fill]55 (startparse s (proxy [DefaultHandler] []56 (startElement [uri local-name q-name ^Attributes atts]57 ;(prn :start-element q-name)(flush)58 (let [attrs (into {} (for [i (range (.getLength atts))]59 [(keyword (.getQName atts i))60 (.getValue atts i)]))]61 (fill (struct node :start-element (keyword q-name) attrs))))62 (endElement [uri local-name q-name]63 ;(prn :end-element q-name)(flush)64 (fill (struct node :end-element (keyword q-name))))65 (characters [ch start length]66 ;(prn :characters)(flush)67 (let [st (String. ch start length)]68 (when (seq (.trim st))69 (fill (struct node :characters nil nil st))))))))]70 (fill-queue f :queue-size queue-size))))73 (defstruct element :tag :attrs :content)74 (declare mktree)76 (defn- siblings [coll]77 (lazy-seq78 (when-let [s (seq coll)]79 (let [event (first s)]80 (condp = (:type event)81 :characters (cons (:str event) (siblings (rest s)))82 :start-element (let [t (mktree s)]83 (cons (first t) (siblings (rest t))))84 :end-element [(rest s)])))))86 (defn- mktree87 [[elem & events]]88 (lazy-seq89 (let [sibs (siblings events)]90 ;(prn :elem elem)91 (cons92 (struct element (:name elem) (:attrs elem) (drop-last sibs))93 (lazy-seq (last sibs))))))95 (defn parse-trim96 "Parses the source s, which can be a File, InputStream or String97 naming a URI. Returns a lazy tree of the clojure.xml/element98 struct-map, which has the keys :tag, :attrs, and :content and99 accessor fns tag, attrs, and content, with the whitespace trimmed100 from around each content string. This format is compatible with what101 clojure.xml/parse produces, except :content is a lazy seq instead of102 a vector. Other SAX-compatible parsers can be supplied by passing103 startparse, a fn taking a source and a ContentHandler and returning104 a parser. If a parser is specified, it will be run in a separate105 thread and be allowed to get ahead by queue-size items, which106 defaults to maxing. If no parser is specified and107 org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull108 parser will be used."109 ([s] (first (mktree (parse-seq s))))110 ([s startparse queue-size]111 (first (mktree (parse-seq s startparse queue-size)))))113 (defn attributes [e]114 (let [v (vec (:attrs e))]115 (reify org.xml.sax.Attributes116 (getLength [_] (count v))117 (getURI [_ i] (namespace (key (v i))))118 (getLocalName [_ i] (name (key (v i))))119 (getQName [_ i] (name (key (v i))))120 (getValue [_ uri name] (get (:attrs e) name))121 (^String getValue [_ ^int i] (val (v i)))122 (^String getType [_ ^int i] "CDATA"))))124 (defn- emit-element125 "Recursively prints as XML text the element struct e. To have it126 print extra whitespace like clojure.xml/emit, use the :pad true127 option."128 [e ^org.xml.sax.ContentHandler ch]129 (if (instance? String e)130 (.characters ch (.toCharArray ^String e) 0 (count e))131 (let [nspace (namespace (:tag e))132 qname (name (:tag e))]133 (.startElement ch (or nspace "") qname qname (attributes e))134 (doseq [c (:content e)]135 (emit-element c ch))136 (.endElement ch (or nspace "") qname qname))))139 (defn emit140 [e & {:as opts}]141 (let [content-handler (atom nil)142 trans (-> (javax.xml.transform.TransformerFactory/newInstance)143 .newTransformer)]145 (when (:indent opts)146 (.setOutputProperty trans "indent" "yes")147 (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount"148 (str (:indent opts))))150 (when (contains? opts :xml-declaration)151 (.setOutputProperty trans "omit-xml-declaration"152 (if (:xml-declaration opts) "no" "yes")))154 (when (:encoding opts)155 (.setOutputProperty trans "encoding" (:encoding opts)))157 (.transform158 trans159 (javax.xml.transform.sax.SAXSource.160 (reify org.xml.sax.XMLReader161 (getContentHandler [_] @content-handler)162 (setDTDHandler [_ handler])163 (setFeature [_ name value])164 (setProperty [_ name value])165 (setContentHandler [_ ch] (reset! content-handler ch))166 (^void parse [_ ^org.xml.sax.InputSource _]167 (when @content-handler168 (.startDocument @content-handler)169 (emit-element e @content-handler)170 (.endDocument @content-handler))))171 (org.xml.sax.InputSource.))172 (javax.xml.transform.stream.StreamResult. *out*))))174 (comment176 (def atomstr "<?xml version='1.0' encoding='UTF-8'?>177 <feed xmlns='http://www.w3.org/2005/Atom'>178 <id>tag:blogger.com,1999:blog-28403206</id>179 <updated>2008-02-14T08:00:58.567-08:00</updated>180 <title type='text'>n01senet</title>181 <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/>182 <entry xmlns:foo='http://foo' xmlns:bar='http://bar'>183 <id>1</id>184 <published>2008-02-13</published>185 <title type='text'>clojure is the best lisp yet</title>186 <author><name>Chouser</name></author>187 </entry>188 <entry>189 <id>2</id>190 <published>2008-02-07</published>191 <title type='text'>experimenting with vnc</title>192 <author><name>agriffis</name></author>193 </entry>194 </feed>195 ")197 (def tree (parse-trim (java.io.StringReader. atomstr)198 startparse-sax199 1))200 (println "\nsax")201 (emit tree)203 (def tree (parse-trim (java.io.StringReader. atomstr)))204 (println "\ndefault")205 (emit tree)207 (def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr))))208 (println "\norig")209 (emit tree)211 ; When used with zip and zip-filter, you can get do queries like this212 ; without parsing more than the first few tags:213 ; (zip/node (first (xml-> (zip/xml-zip tree) :id)))215 )