diff src/clojure/contrib/lazy_xml.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/clojure/contrib/lazy_xml.clj	Sat Aug 21 06:25:44 2010 -0400
     1.3 @@ -0,0 +1,215 @@
     1.4 +;   Copyright (c) Chris Houser, Dec 2008. All rights reserved.
     1.5 +;   The use and distribution terms for this software are covered by the
     1.6 +;   Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
     1.7 +;   which can be found in the file epl-v10.html at the root of this distribution.
     1.8 +;   By using this software in any fashion, you are agreeing to be bound by
     1.9 +;   the terms of this license.
    1.10 +;   You must not remove this notice, or any other, from this software.
    1.11 +
    1.12 +; Functions to parse xml lazily and emit back to text.
    1.13 +
    1.14 +(ns 
    1.15 +    ^{:author "Chris Houser",
    1.16 +       :doc "Functions to parse xml lazily and emit back to text."}
    1.17 +    clojure.contrib.lazy-xml
    1.18 +    (:use [clojure.xml :as xml :only []]
    1.19 +          [clojure.contrib.seq :only [fill-queue]])
    1.20 +    (:import (org.xml.sax Attributes InputSource)
    1.21 +             (org.xml.sax.helpers DefaultHandler)
    1.22 +             (javax.xml.parsers SAXParserFactory)
    1.23 +             (java.util.concurrent LinkedBlockingQueue TimeUnit)
    1.24 +             (java.lang.ref WeakReference)
    1.25 +             (java.io Reader)))
    1.26 +
    1.27 +(defstruct node :type :name :attrs :str)
    1.28 +
    1.29 +; http://www.extreme.indiana.edu/xgws/xsoap/xpp/
    1.30 +(def has-pull false)
    1.31 +(defn- parse-seq-pull [& _])
    1.32 +(try
    1.33 +  (load "lazy_xml/with_pull")
    1.34 +  (catch Exception e
    1.35 +    (when-not (re-find #"XmlPullParser" (str e))
    1.36 +      (throw e))))
    1.37 +
    1.38 +(defn startparse-sax [s ch]
    1.39 +  (.. SAXParserFactory newInstance newSAXParser (parse s ch)))
    1.40 +
    1.41 +(defn parse-seq
    1.42 +  "Parses the source s, which can be a File, InputStream or String
    1.43 +  naming a URI. Returns a lazy sequence of maps with two or more of
    1.44 +  the keys :type, :name, :attrs, and :str. Other SAX-compatible
    1.45 +  parsers can be supplied by passing startparse, a fn taking a source
    1.46 +  and a ContentHandler and returning a parser. If a parser is
    1.47 +  specified, it will be run in a separate thread and be allowed to get
    1.48 +  ahead by queue-size items, which defaults to maxint.  If no parser
    1.49 +  is specified and org.xmlpull.v1.XmlPullParser is in the classpath,
    1.50 +  this superior pull parser will be used."
    1.51 +  ([s] (if has-pull
    1.52 +         (parse-seq-pull s)
    1.53 +         (parse-seq s startparse-sax)))
    1.54 +  ([s startparse] (parse-seq s startparse Integer/MAX_VALUE))
    1.55 +  ([s startparse queue-size]
    1.56 +   (let [s (if (instance? Reader s) (InputSource. s) s)
    1.57 +         f (fn filler-func [fill]
    1.58 +             (startparse s (proxy [DefaultHandler] []
    1.59 +               (startElement [uri local-name q-name ^Attributes atts]
    1.60 +                 ;(prn :start-element q-name)(flush)
    1.61 +                 (let [attrs (into {} (for [i (range (.getLength atts))]
    1.62 +                                           [(keyword (.getQName atts i))
    1.63 +                                            (.getValue atts i)]))]
    1.64 +                   (fill (struct node :start-element (keyword q-name) attrs))))
    1.65 +               (endElement [uri local-name q-name]
    1.66 +                 ;(prn :end-element q-name)(flush)
    1.67 +                 (fill (struct node :end-element (keyword q-name))))
    1.68 +               (characters [ch start length]
    1.69 +                 ;(prn :characters)(flush)
    1.70 +                 (let [st (String. ch start length)]
    1.71 +                   (when (seq (.trim st))
    1.72 +                     (fill (struct node :characters nil nil st))))))))]
    1.73 +     (fill-queue f :queue-size queue-size))))
    1.74 +
    1.75 +
    1.76 +(defstruct element :tag :attrs :content)
    1.77 +(declare mktree)
    1.78 +
    1.79 +(defn- siblings [coll]
    1.80 +  (lazy-seq
    1.81 +    (when-let [s (seq coll)]
    1.82 +      (let [event (first s)]
    1.83 +        (condp = (:type event)
    1.84 +          :characters    (cons (:str event) (siblings (rest s)))
    1.85 +          :start-element (let [t (mktree s)]
    1.86 +                           (cons (first t) (siblings (rest t))))
    1.87 +          :end-element   [(rest s)])))))
    1.88 +
    1.89 +(defn- mktree
    1.90 +  [[elem & events]]
    1.91 +    (lazy-seq
    1.92 +      (let [sibs (siblings events)]
    1.93 +        ;(prn :elem elem)
    1.94 +        (cons
    1.95 +          (struct element (:name elem) (:attrs elem) (drop-last sibs))
    1.96 +          (lazy-seq (last sibs))))))
    1.97 +
    1.98 +(defn parse-trim
    1.99 +  "Parses the source s, which can be a File, InputStream or String
   1.100 +  naming a URI. Returns a lazy tree of the clojure.xml/element
   1.101 +  struct-map, which has the keys :tag, :attrs, and :content and
   1.102 +  accessor fns tag, attrs, and content, with the whitespace trimmed
   1.103 +  from around each content string. This format is compatible with what
   1.104 +  clojure.xml/parse produces, except :content is a lazy seq instead of
   1.105 +  a vector.  Other SAX-compatible parsers can be supplied by passing
   1.106 +  startparse, a fn taking a source and a ContentHandler and returning
   1.107 +  a parser. If a parser is specified, it will be run in a separate
   1.108 +  thread and be allowed to get ahead by queue-size items, which
   1.109 +  defaults to maxing.  If no parser is specified and
   1.110 +  org.xmlpull.v1.XmlPullParser is in the classpath, this superior pull
   1.111 +  parser will be used."
   1.112 +  ([s] (first (mktree (parse-seq s))))
   1.113 +  ([s startparse queue-size]
   1.114 +    (first (mktree (parse-seq s startparse queue-size)))))
   1.115 +
   1.116 +(defn attributes [e]
   1.117 +  (let [v (vec (:attrs e))]
   1.118 +    (reify org.xml.sax.Attributes
   1.119 +      (getLength [_] (count v))
   1.120 +      (getURI [_ i] (namespace (key (v i))))
   1.121 +      (getLocalName [_ i] (name (key (v i))))
   1.122 +      (getQName [_ i] (name (key (v i))))
   1.123 +      (getValue [_ uri name] (get (:attrs e) name))
   1.124 +      (^String getValue [_ ^int i] (val (v i)))
   1.125 +      (^String getType [_ ^int i] "CDATA"))))
   1.126 +
   1.127 +(defn- emit-element
   1.128 +  "Recursively prints as XML text the element struct e.  To have it
   1.129 +  print extra whitespace like clojure.xml/emit, use the :pad true
   1.130 +  option."
   1.131 +  [e ^org.xml.sax.ContentHandler ch]
   1.132 +  (if (instance? String e)
   1.133 +    (.characters ch (.toCharArray ^String e) 0 (count e))
   1.134 +    (let [nspace (namespace (:tag e))
   1.135 +          qname (name (:tag e))]
   1.136 +      (.startElement ch (or nspace "") qname qname (attributes e))
   1.137 +      (doseq [c (:content e)]
   1.138 +        (emit-element c ch))
   1.139 +      (.endElement ch (or nspace "") qname qname))))
   1.140 +  
   1.141 +
   1.142 +(defn emit
   1.143 +  [e & {:as opts}]
   1.144 +  (let [content-handler (atom nil)
   1.145 +        trans (-> (javax.xml.transform.TransformerFactory/newInstance)
   1.146 +                .newTransformer)]
   1.147 +
   1.148 +    (when (:indent opts)
   1.149 +      (.setOutputProperty trans "indent" "yes")
   1.150 +      (.setOutputProperty trans "{http://xml.apache.org/xslt}indent-amount"
   1.151 +                          (str (:indent opts))))
   1.152 +
   1.153 +    (when (contains? opts :xml-declaration)
   1.154 +      (.setOutputProperty trans "omit-xml-declaration" 
   1.155 +                          (if (:xml-declaration opts) "no" "yes")))
   1.156 +
   1.157 +    (when (:encoding opts)
   1.158 +      (.setOutputProperty trans "encoding" (:encoding opts)))
   1.159 +
   1.160 +    (.transform
   1.161 +      trans
   1.162 +      (javax.xml.transform.sax.SAXSource.
   1.163 +        (reify org.xml.sax.XMLReader
   1.164 +          (getContentHandler [_] @content-handler)
   1.165 +          (setDTDHandler [_ handler])
   1.166 +          (setFeature [_ name value])
   1.167 +          (setProperty [_ name value])
   1.168 +          (setContentHandler [_ ch] (reset! content-handler ch))
   1.169 +          (^void parse [_ ^org.xml.sax.InputSource _]
   1.170 +            (when @content-handler
   1.171 +              (.startDocument @content-handler)
   1.172 +              (emit-element e @content-handler)
   1.173 +              (.endDocument @content-handler))))
   1.174 +        (org.xml.sax.InputSource.))
   1.175 +      (javax.xml.transform.stream.StreamResult. *out*))))
   1.176 +
   1.177 +(comment
   1.178 +
   1.179 +(def atomstr "<?xml version='1.0' encoding='UTF-8'?>
   1.180 +<feed xmlns='http://www.w3.org/2005/Atom'>
   1.181 +  <id>tag:blogger.com,1999:blog-28403206</id>
   1.182 +  <updated>2008-02-14T08:00:58.567-08:00</updated>
   1.183 +  <title type='text'>n01senet</title>
   1.184 +  <link rel='alternate' type='text/html' href='http://n01senet.blogspot.com/'/>
   1.185 +  <entry xmlns:foo='http://foo' xmlns:bar='http://bar'>
   1.186 +    <id>1</id>
   1.187 +    <published>2008-02-13</published>
   1.188 +    <title type='text'>clojure is the best lisp yet</title>
   1.189 +    <author><name>Chouser</name></author>
   1.190 +  </entry>
   1.191 +  <entry>
   1.192 +    <id>2</id>
   1.193 +    <published>2008-02-07</published>
   1.194 +    <title type='text'>experimenting with vnc</title>
   1.195 +    <author><name>agriffis</name></author>
   1.196 +  </entry>
   1.197 +</feed>
   1.198 +")
   1.199 +
   1.200 +(def tree (parse-trim (java.io.StringReader. atomstr)
   1.201 +                 startparse-sax
   1.202 +                 1))
   1.203 +(println "\nsax")
   1.204 +(emit tree)
   1.205 +
   1.206 +(def tree (parse-trim (java.io.StringReader. atomstr)))
   1.207 +(println "\ndefault")
   1.208 +(emit tree)
   1.209 +
   1.210 +(def tree (xml/parse (org.xml.sax.InputSource. (java.io.StringReader. atomstr))))
   1.211 +(println "\norig")
   1.212 +(emit tree)
   1.213 +
   1.214 +; When used with zip and zip-filter, you can get do queries like this
   1.215 +; without parsing more than the first few tags:
   1.216 +; (zip/node (first (xml-> (zip/xml-zip tree) :id)))
   1.217 +
   1.218 +)