comparison src/clojure/contrib/str_utils2.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
comparison
equal deleted inserted replaced
9:35cf337adfcf 10:ef7dbbd6452c
1 ;;; str_utils2.clj -- functional string utilities for Clojure
2
3 ;; by Stuart Sierra, http://stuartsierra.com/
4 ;; August 19, 2009
5
6 ;; Copyright (c) Stuart Sierra, 2009. All rights reserved. The use
7 ;; and distribution terms for this software are covered by the Eclipse
8 ;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
9 ;; which can be found in the file epl-v10.html at the root of this
10 ;; distribution. By using this software in any fashion, you are
11 ;; agreeing to be bound by the terms of this license. You must not
12 ;; remove this notice, or any other, from this software.
13
14 ;; DEPRECATED in 1.2: Promoted to clojure.java.string. Note that
15 ;; many function names and semantics have changed
16
17 (ns ^{:author "Stuart Sierra"
18 :deprecated "1.2"
19 :doc "This is a library of string manipulation functions. It
20 is intented as a replacement for clojure.contrib.str-utils.
21
22 You cannot (use 'clojure.contrib.str-utils2) because it defines
23 functions with the same names as functions in clojure.core.
24 Instead, do (require '[clojure.contrib.str-utils2 :as s])
25 or something similar.
26
27 Goals:
28 1. Be functional
29 2. String argument first, to work with ->
30 3. Performance linear in string length
31
32 Some ideas are borrowed from
33 http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
34 clojure.contrib.str-utils2
35 (:refer-clojure :exclude (take replace drop butlast partition
36 contains? get repeat reverse partial))
37 (:import (java.util.regex Pattern)))
38
39
40 (defmacro dochars
41 "bindings => [name string]
42
43 Repeatedly executes body, with name bound to each character in
44 string. Does NOT handle Unicode supplementary characters (above
45 U+FFFF)."
46 [bindings & body]
47 (assert (vector bindings))
48 (assert (= 2 (count bindings)))
49 ;; This seems to be the fastest way to iterate over characters.
50 `(let [^String s# ~(second bindings)]
51 (dotimes [i# (.length s#)]
52 (let [~(first bindings) (.charAt s# i#)]
53 ~@body))))
54
55
56 (defmacro docodepoints
57 "bindings => [name string]
58
59 Repeatedly executes body, with name bound to the integer code point
60 of each Unicode character in the string. Handles Unicode
61 supplementary characters (above U+FFFF) correctly."
62 [bindings & body]
63 (assert (vector bindings))
64 (assert (= 2 (count bindings)))
65 (let [character (first bindings)
66 string (second bindings)]
67 `(let [^String s# ~string
68 len# (.length s#)]
69 (loop [i# 0]
70 (when (< i# len#)
71 (let [~character (.charAt s# i#)]
72 (if (Character/isHighSurrogate ~character)
73 (let [~character (.codePointAt s# i#)]
74 ~@body
75 (recur (+ 2 i#)))
76 (let [~character (int ~character)]
77 ~@body
78 (recur (inc i#))))))))))
79
80 (defn codepoints
81 "Returns a sequence of integer Unicode code points in s. Handles
82 Unicode supplementary characters (above U+FFFF) correctly."
83 [^String s]
84 (let [len (.length s)
85 f (fn thisfn [^String s i]
86 (when (< i len)
87 (let [c (.charAt s i)]
88 (if (Character/isHighSurrogate c)
89 (cons (.codePointAt s i) (thisfn s (+ 2 i)))
90 (cons (int c) (thisfn s (inc i)))))))]
91 (lazy-seq (f s 0))))
92
93 (defn ^String escape
94 "Returns a new String by applying cmap (a function or a map) to each
95 character in s. If cmap returns nil, the original character is
96 added to the output unchanged."
97 [^String s cmap]
98 (let [buffer (StringBuilder. (.length s))]
99 (dochars [c s]
100 (if-let [r (cmap c)]
101 (.append buffer r)
102 (.append buffer c)))
103 (.toString buffer)))
104
105 (defn blank?
106 "True if s is nil, empty, or contains only whitespace."
107 [^String s]
108 (every? (fn [^Character c] (Character/isWhitespace c)) s))
109
110 (defn ^String take
111 "Take first n characters from s, up to the length of s.
112
113 Note the argument order is the opposite of clojure.core/take; this
114 is to keep the string as the first argument for use with ->"
115 [^String s n]
116 (if (< (count s) n)
117 s
118 (.substring s 0 n)))
119
120 (defn ^String drop
121 "Drops first n characters from s. Returns an empty string if n is
122 greater than the length of s.
123
124 Note the argument order is the opposite of clojure.core/drop; this
125 is to keep the string as the first argument for use with ->"
126 [^String s n]
127 (if (< (count s) n)
128 ""
129 (.substring s n)))
130
131 (defn ^String butlast
132 "Returns s without the last n characters. Returns an empty string
133 if n is greater than the length of s.
134
135 Note the argument order is the opposite of clojure.core/butlast;
136 this is to keep the string as the first argument for use with ->"
137 [^String s n]
138 (if (< (count s) n)
139 ""
140 (.substring s 0 (- (count s) n))))
141
142 (defn ^String tail
143 "Returns the last n characters of s."
144 [^String s n]
145 (if (< (count s) n)
146 s
147 (.substring s (- (count s) n))))
148
149 (defn ^String repeat
150 "Returns a new String containing s repeated n times."
151 [^String s n]
152 (apply str (clojure.core/repeat n s)))
153
154 (defn ^String reverse
155 "Returns s with its characters reversed."
156 [^String s]
157 (.toString (.reverse (StringBuilder. s))))
158
159 (defmulti
160 ^{:doc "Replaces all instances of pattern in string with replacement.
161
162 Allowed argument types for pattern and replacement are:
163 1. String and String
164 2. Character and Character
165 3. regex Pattern and String
166 (Uses java.util.regex.Matcher.replaceAll)
167 4. regex Pattern and function
168 (Calls function with re-groups of each match, uses return
169 value as replacement.)"
170 :arglists '([string pattern replacement])
171 :tag String}
172 replace
173 (fn [^String string pattern replacement]
174 [(class pattern) (class replacement)]))
175
176 (defmethod replace [String String] [^String s ^String a ^String b]
177 (.replace s a b))
178
179 (defmethod replace [Character Character] [^String s ^Character a ^Character b]
180 (.replace s a b))
181
182 (defmethod replace [Pattern String] [^String s re replacement]
183 (.replaceAll (re-matcher re s) replacement))
184
185 (defmethod replace [Pattern clojure.lang.IFn] [^String s re replacement]
186 (let [m (re-matcher re s)]
187 (let [buffer (StringBuffer. (.length s))]
188 (loop []
189 (if (.find m)
190 (do (.appendReplacement m buffer (replacement (re-groups m)))
191 (recur))
192 (do (.appendTail m buffer)
193 (.toString buffer)))))))
194
195 (defmulti
196 ^{:doc "Replaces the first instance of pattern in s with replacement.
197
198 Allowed argument types for pattern and replacement are:
199 1. String and String
200 2. regex Pattern and String
201 (Uses java.util.regex.Matcher.replaceAll)
202 3. regex Pattern and function
203 "
204 :arglists '([s pattern replacement])
205 :tag String}
206 replace-first
207 (fn [s pattern replacement]
208 [(class pattern) (class replacement)]))
209
210 (defmethod replace-first [String String] [^String s pattern replacement]
211 (.replaceFirst (re-matcher (Pattern/quote pattern) s) replacement))
212
213 (defmethod replace-first [Pattern String] [^String s re replacement]
214 (.replaceFirst (re-matcher re s) replacement))
215
216 (defmethod replace-first [Pattern clojure.lang.IFn] [^String s ^Pattern re f]
217 (let [m (re-matcher re s)]
218 (let [buffer (StringBuffer.)]
219 (if (.find m)
220 (let [rep (f (re-groups m))]
221 (.appendReplacement m buffer rep)
222 (.appendTail m buffer)
223 (str buffer))))))
224
225 (defn partition
226 "Splits the string into a lazy sequence of substrings, alternating
227 between substrings that match the patthern and the substrings
228 between the matches. The sequence always starts with the substring
229 before the first match, or an empty string if the beginning of the
230 string matches.
231
232 For example: (partition \"abc123def\" #\"[a-z]+\")
233 returns: (\"\" \"abc\" \"123\" \"def\")"
234 [^String s ^Pattern re]
235 (let [m (re-matcher re s)]
236 ((fn step [prevend]
237 (lazy-seq
238 (if (.find m)
239 (cons (.subSequence s prevend (.start m))
240 (cons (re-groups m)
241 (step (+ (.start m) (count (.group m))))))
242 (when (< prevend (.length s))
243 (list (.subSequence s prevend (.length s)))))))
244 0)))
245
246 (defn ^String join
247 "Returns a string of all elements in coll, separated by
248 separator. Like Perl's join."
249 [^String separator coll]
250 (apply str (interpose separator coll)))
251
252 (defn ^String chop
253 "Removes the last character of string, does nothing on a zero-length
254 string."
255 [^String s]
256 (let [size (count s)]
257 (if (zero? size)
258 s
259 (subs s 0 (dec (count s))))))
260
261 (defn ^String chomp
262 "Removes all trailing newline \\n or return \\r characters from
263 string. Note: String.trim() is similar and faster."
264 [^String s]
265 (replace s #"[\r\n]+$" ""))
266
267 (defn title-case [^String s]
268 (throw (Exception. "title-case not implemeted yet")))
269
270 (defn ^String swap-case
271 "Changes upper case characters to lower case and vice-versa.
272 Handles Unicode supplementary characters correctly. Uses the
273 locale-sensitive String.toUpperCase() and String.toLowerCase()
274 methods."
275 [^String s]
276 (let [buffer (StringBuilder. (.length s))
277 ;; array to make a String from one code point
278 ^"[I" array (make-array Integer/TYPE 1)]
279 (docodepoints [c s]
280 (aset-int array 0 c)
281 (if (Character/isLowerCase c)
282 ;; Character.toUpperCase is not locale-sensitive, but
283 ;; String.toUpperCase is; so we use a String.
284 (.append buffer (.toUpperCase (String. array 0 1)))
285 (.append buffer (.toLowerCase (String. array 0 1)))))
286 (.toString buffer)))
287
288 (defn ^String capitalize
289 "Converts first character of the string to upper-case, all other
290 characters to lower-case."
291 [^String s]
292 (if (< (count s) 2)
293 (.toUpperCase s)
294 (str (.toUpperCase ^String (subs s 0 1))
295 (.toLowerCase ^String (subs s 1)))))
296
297 (defn ^String ltrim
298 "Removes whitespace from the left side of string."
299 [^String s]
300 (replace s #"^\s+" ""))
301
302 (defn ^String rtrim
303 "Removes whitespace from the right side of string."
304 [^String s]
305 (replace s #"\s+$" ""))
306
307 (defn split-lines
308 "Splits s on \\n or \\r\\n."
309 [^String s]
310 (seq (.split #"\r?\n" s)))
311
312 ;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
313 (defn ^String map-str
314 "Apply f to each element of coll, concatenate all results into a
315 String."
316 [f coll]
317 (apply str (map f coll)))
318
319 ;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
320 (defn grep
321 "Filters elements of coll by a regular expression. The String
322 representation (with str) of each element is tested with re-find."
323 [re coll]
324 (filter (fn [x] (re-find re (str x))) coll))
325
326 (defn partial
327 "Like clojure.core/partial for functions that take their primary
328 argument first.
329
330 Takes a function f and its arguments, NOT INCLUDING the first
331 argument. Returns a new function whose first argument will be the
332 first argument to f.
333
334 Example: (str-utils2/partial str-utils2/take 2)
335 ;;=> (fn [s] (str-utils2/take s 2))"
336 [f & args]
337 (fn [s & more] (apply f s (concat args more))))
338
339
340 ;;; WRAPPERS
341
342 ;; The following functions are simple wrappers around java.lang.String
343 ;; functions. They are included here for completeness, and for use
344 ;; when mapping over a collection of strings.
345
346 (defn ^String upper-case
347 "Converts string to all upper-case."
348 [^String s]
349 (.toUpperCase s))
350
351 (defn ^String lower-case
352 "Converts string to all lower-case."
353 [^String s]
354 (.toLowerCase s))
355
356 (defn split
357 "Splits string on a regular expression. Optional argument limit is
358 the maximum number of splits."
359 ([^String s ^Pattern re] (seq (.split re s)))
360 ([^String s ^Pattern re limit] (seq (.split re s limit))))
361
362 (defn ^String trim
363 "Removes whitespace from both ends of string."
364 [^String s]
365 (.trim s))
366
367 (defn ^String contains?
368 "True if s contains the substring."
369 [^String s substring]
370 (.contains s substring))
371
372 (defn ^String get
373 "Gets the i'th character in string."
374 [^String s i]
375 (.charAt s i))
376