rlm@10
|
1 ; Copyright (c) Rich Hickey. All rights reserved.
|
rlm@10
|
2 ; The use and distribution terms for this software are covered by the
|
rlm@10
|
3 ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
|
rlm@10
|
4 ; which can be found in the file epl-v10.html at the root of this distribution.
|
rlm@10
|
5 ; By using this software in any fashion, you are agreeing to be bound by
|
rlm@10
|
6 ; the terms of this license.
|
rlm@10
|
7 ; You must not remove this notice, or any other, from this software.
|
rlm@10
|
8
|
rlm@10
|
9 (ns ^{:doc "XML reading/writing."
|
rlm@10
|
10 :author "Rich Hickey"}
|
rlm@10
|
11 clojure.xml
|
rlm@10
|
12 (:import (org.xml.sax ContentHandler Attributes SAXException)
|
rlm@10
|
13 (javax.xml.parsers SAXParser SAXParserFactory)))
|
rlm@10
|
14
|
rlm@10
|
15 (def *stack*)
|
rlm@10
|
16 (def *current*)
|
rlm@10
|
17 (def *state*) ; :element :chars :between
|
rlm@10
|
18 (def *sb*)
|
rlm@10
|
19
|
rlm@10
|
20 (defstruct element :tag :attrs :content)
|
rlm@10
|
21
|
rlm@10
|
22 (def tag (accessor element :tag))
|
rlm@10
|
23 (def attrs (accessor element :attrs))
|
rlm@10
|
24 (def content (accessor element :content))
|
rlm@10
|
25
|
rlm@10
|
26 (def content-handler
|
rlm@10
|
27 (let [push-content (fn [e c]
|
rlm@10
|
28 (assoc e :content (conj (or (:content e) []) c)))
|
rlm@10
|
29 push-chars (fn []
|
rlm@10
|
30 (when (and (= *state* :chars)
|
rlm@10
|
31 (some (complement #(Character/isWhitespace (char %))) (str *sb*)))
|
rlm@10
|
32 (set! *current* (push-content *current* (str *sb*)))))]
|
rlm@10
|
33 (new clojure.lang.XMLHandler
|
rlm@10
|
34 (proxy [ContentHandler] []
|
rlm@10
|
35 (startElement [uri local-name q-name ^Attributes atts]
|
rlm@10
|
36 (let [attrs (fn [ret i]
|
rlm@10
|
37 (if (neg? i)
|
rlm@10
|
38 ret
|
rlm@10
|
39 (recur (assoc ret
|
rlm@10
|
40 (clojure.lang.Keyword/intern (symbol (.getQName atts i)))
|
rlm@10
|
41 (.getValue atts (int i)))
|
rlm@10
|
42 (dec i))))
|
rlm@10
|
43 e (struct element
|
rlm@10
|
44 (. clojure.lang.Keyword (intern (symbol q-name)))
|
rlm@10
|
45 (when (pos? (.getLength atts))
|
rlm@10
|
46 (attrs {} (dec (.getLength atts)))))]
|
rlm@10
|
47 (push-chars)
|
rlm@10
|
48 (set! *stack* (conj *stack* *current*))
|
rlm@10
|
49 (set! *current* e)
|
rlm@10
|
50 (set! *state* :element))
|
rlm@10
|
51 nil)
|
rlm@10
|
52 (endElement [uri local-name q-name]
|
rlm@10
|
53 (push-chars)
|
rlm@10
|
54 (set! *current* (push-content (peek *stack*) *current*))
|
rlm@10
|
55 (set! *stack* (pop *stack*))
|
rlm@10
|
56 (set! *state* :between)
|
rlm@10
|
57 nil)
|
rlm@10
|
58 (characters [^chars ch start length]
|
rlm@10
|
59 (when-not (= *state* :chars)
|
rlm@10
|
60 (set! *sb* (new StringBuilder)))
|
rlm@10
|
61 (let [^StringBuilder sb *sb*]
|
rlm@10
|
62 (.append sb ch (int start) (int length))
|
rlm@10
|
63 (set! *state* :chars))
|
rlm@10
|
64 nil)
|
rlm@10
|
65 (setDocumentLocator [locator])
|
rlm@10
|
66 (startDocument [])
|
rlm@10
|
67 (endDocument [])
|
rlm@10
|
68 (startPrefixMapping [prefix uri])
|
rlm@10
|
69 (endPrefixMapping [prefix])
|
rlm@10
|
70 (ignorableWhitespace [ch start length])
|
rlm@10
|
71 (processingInstruction [target data])
|
rlm@10
|
72 (skippedEntity [name])
|
rlm@10
|
73 ))))
|
rlm@10
|
74
|
rlm@10
|
75 (defn startparse-sax [s ch]
|
rlm@10
|
76 (.. SAXParserFactory (newInstance) (newSAXParser) (parse s ch)))
|
rlm@10
|
77
|
rlm@10
|
78 (defn parse
|
rlm@10
|
79 "Parses and loads the source s, which can be a File, InputStream or
|
rlm@10
|
80 String naming a URI. Returns a tree of the xml/element struct-map,
|
rlm@10
|
81 which has the keys :tag, :attrs, and :content. and accessor fns tag,
|
rlm@10
|
82 attrs, and content. Other parsers can be supplied by passing
|
rlm@10
|
83 startparse, a fn taking a source and a ContentHandler and returning
|
rlm@10
|
84 a parser"
|
rlm@10
|
85 {:added "1.0"}
|
rlm@10
|
86 ([s] (parse s startparse-sax))
|
rlm@10
|
87 ([s startparse]
|
rlm@10
|
88 (binding [*stack* nil
|
rlm@10
|
89 *current* (struct element)
|
rlm@10
|
90 *state* :between
|
rlm@10
|
91 *sb* nil]
|
rlm@10
|
92 (startparse s content-handler)
|
rlm@10
|
93 ((:content *current*) 0))))
|
rlm@10
|
94
|
rlm@10
|
95 (defn emit-element [e]
|
rlm@10
|
96 (if (instance? String e)
|
rlm@10
|
97 (println e)
|
rlm@10
|
98 (do
|
rlm@10
|
99 (print (str "<" (name (:tag e))))
|
rlm@10
|
100 (when (:attrs e)
|
rlm@10
|
101 (doseq [attr (:attrs e)]
|
rlm@10
|
102 (print (str " " (name (key attr)) "='" (val attr)"'"))))
|
rlm@10
|
103 (if (:content e)
|
rlm@10
|
104 (do
|
rlm@10
|
105 (println ">")
|
rlm@10
|
106 (doseq [c (:content e)]
|
rlm@10
|
107 (emit-element c))
|
rlm@10
|
108 (println (str "</" (name (:tag e)) ">")))
|
rlm@10
|
109 (println "/>")))))
|
rlm@10
|
110
|
rlm@10
|
111 (defn emit [x]
|
rlm@10
|
112 (println "<?xml version='1.0' encoding='UTF-8'?>")
|
rlm@10
|
113 (emit-element x))
|
rlm@10
|
114
|
rlm@10
|
115 ;(export '(tag attrs content parse element emit emit-element))
|
rlm@10
|
116
|
rlm@10
|
117 ;(load-file "/Users/rich/dev/clojure/src/xml.clj")
|
rlm@10
|
118 ;(def x (xml/parse "http://arstechnica.com/journals.rssx"))
|