annotate src/clojure/xml.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
rev   line source
rlm@10 1 ; Copyright (c) Rich Hickey. All rights reserved.
rlm@10 2 ; The use and distribution terms for this software are covered by the
rlm@10 3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
rlm@10 4 ; which can be found in the file epl-v10.html at the root of this distribution.
rlm@10 5 ; By using this software in any fashion, you are agreeing to be bound by
rlm@10 6 ; the terms of this license.
rlm@10 7 ; You must not remove this notice, or any other, from this software.
rlm@10 8
rlm@10 9 (ns ^{:doc "XML reading/writing."
rlm@10 10 :author "Rich Hickey"}
rlm@10 11 clojure.xml
rlm@10 12 (:import (org.xml.sax ContentHandler Attributes SAXException)
rlm@10 13 (javax.xml.parsers SAXParser SAXParserFactory)))
rlm@10 14
rlm@10 15 (def *stack*)
rlm@10 16 (def *current*)
rlm@10 17 (def *state*) ; :element :chars :between
rlm@10 18 (def *sb*)
rlm@10 19
rlm@10 20 (defstruct element :tag :attrs :content)
rlm@10 21
rlm@10 22 (def tag (accessor element :tag))
rlm@10 23 (def attrs (accessor element :attrs))
rlm@10 24 (def content (accessor element :content))
rlm@10 25
rlm@10 26 (def content-handler
rlm@10 27 (let [push-content (fn [e c]
rlm@10 28 (assoc e :content (conj (or (:content e) []) c)))
rlm@10 29 push-chars (fn []
rlm@10 30 (when (and (= *state* :chars)
rlm@10 31 (some (complement #(Character/isWhitespace (char %))) (str *sb*)))
rlm@10 32 (set! *current* (push-content *current* (str *sb*)))))]
rlm@10 33 (new clojure.lang.XMLHandler
rlm@10 34 (proxy [ContentHandler] []
rlm@10 35 (startElement [uri local-name q-name ^Attributes atts]
rlm@10 36 (let [attrs (fn [ret i]
rlm@10 37 (if (neg? i)
rlm@10 38 ret
rlm@10 39 (recur (assoc ret
rlm@10 40 (clojure.lang.Keyword/intern (symbol (.getQName atts i)))
rlm@10 41 (.getValue atts (int i)))
rlm@10 42 (dec i))))
rlm@10 43 e (struct element
rlm@10 44 (. clojure.lang.Keyword (intern (symbol q-name)))
rlm@10 45 (when (pos? (.getLength atts))
rlm@10 46 (attrs {} (dec (.getLength atts)))))]
rlm@10 47 (push-chars)
rlm@10 48 (set! *stack* (conj *stack* *current*))
rlm@10 49 (set! *current* e)
rlm@10 50 (set! *state* :element))
rlm@10 51 nil)
rlm@10 52 (endElement [uri local-name q-name]
rlm@10 53 (push-chars)
rlm@10 54 (set! *current* (push-content (peek *stack*) *current*))
rlm@10 55 (set! *stack* (pop *stack*))
rlm@10 56 (set! *state* :between)
rlm@10 57 nil)
rlm@10 58 (characters [^chars ch start length]
rlm@10 59 (when-not (= *state* :chars)
rlm@10 60 (set! *sb* (new StringBuilder)))
rlm@10 61 (let [^StringBuilder sb *sb*]
rlm@10 62 (.append sb ch (int start) (int length))
rlm@10 63 (set! *state* :chars))
rlm@10 64 nil)
rlm@10 65 (setDocumentLocator [locator])
rlm@10 66 (startDocument [])
rlm@10 67 (endDocument [])
rlm@10 68 (startPrefixMapping [prefix uri])
rlm@10 69 (endPrefixMapping [prefix])
rlm@10 70 (ignorableWhitespace [ch start length])
rlm@10 71 (processingInstruction [target data])
rlm@10 72 (skippedEntity [name])
rlm@10 73 ))))
rlm@10 74
rlm@10 75 (defn startparse-sax [s ch]
rlm@10 76 (.. SAXParserFactory (newInstance) (newSAXParser) (parse s ch)))
rlm@10 77
rlm@10 78 (defn parse
rlm@10 79 "Parses and loads the source s, which can be a File, InputStream or
rlm@10 80 String naming a URI. Returns a tree of the xml/element struct-map,
rlm@10 81 which has the keys :tag, :attrs, and :content. and accessor fns tag,
rlm@10 82 attrs, and content. Other parsers can be supplied by passing
rlm@10 83 startparse, a fn taking a source and a ContentHandler and returning
rlm@10 84 a parser"
rlm@10 85 {:added "1.0"}
rlm@10 86 ([s] (parse s startparse-sax))
rlm@10 87 ([s startparse]
rlm@10 88 (binding [*stack* nil
rlm@10 89 *current* (struct element)
rlm@10 90 *state* :between
rlm@10 91 *sb* nil]
rlm@10 92 (startparse s content-handler)
rlm@10 93 ((:content *current*) 0))))
rlm@10 94
rlm@10 95 (defn emit-element [e]
rlm@10 96 (if (instance? String e)
rlm@10 97 (println e)
rlm@10 98 (do
rlm@10 99 (print (str "<" (name (:tag e))))
rlm@10 100 (when (:attrs e)
rlm@10 101 (doseq [attr (:attrs e)]
rlm@10 102 (print (str " " (name (key attr)) "='" (val attr)"'"))))
rlm@10 103 (if (:content e)
rlm@10 104 (do
rlm@10 105 (println ">")
rlm@10 106 (doseq [c (:content e)]
rlm@10 107 (emit-element c))
rlm@10 108 (println (str "</" (name (:tag e)) ">")))
rlm@10 109 (println "/>")))))
rlm@10 110
rlm@10 111 (defn emit [x]
rlm@10 112 (println "<?xml version='1.0' encoding='UTF-8'?>")
rlm@10 113 (emit-element x))
rlm@10 114
rlm@10 115 ;(export '(tag attrs content parse element emit emit-element))
rlm@10 116
rlm@10 117 ;(load-file "/Users/rich/dev/clojure/src/xml.clj")
rlm@10 118 ;(def x (xml/parse "http://arstechnica.com/journals.rssx"))