rlm@10
|
1 ;;; string.clj -- functional string utilities for Clojure
|
rlm@10
|
2
|
rlm@10
|
3 ;; by Stuart Sierra, http://stuartsierra.com/
|
rlm@10
|
4 ;; January 26, 2010
|
rlm@10
|
5
|
rlm@10
|
6 ;; Copyright (c) Stuart Sierra, 2010. All rights reserved. The use
|
rlm@10
|
7 ;; and distribution terms for this software are covered by the Eclipse
|
rlm@10
|
8 ;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
|
rlm@10
|
9 ;; which can be found in the file epl-v10.html at the root of this
|
rlm@10
|
10 ;; distribution. By using this software in any fashion, you are
|
rlm@10
|
11 ;; agreeing to be bound by the terms of this license. You must not
|
rlm@10
|
12 ;; remove this notice, or any other, from this software.
|
rlm@10
|
13
|
rlm@10
|
14 ;; DEPRECATED in 1.2: Many functions have moved to clojure.string.
|
rlm@10
|
15
|
rlm@10
|
16 (ns ^{:author "Stuart Sierra"
|
rlm@10
|
17 :doc "This is a library of string manipulation functions. It
|
rlm@10
|
18 is intended as a replacement for clojure.contrib.string.
|
rlm@10
|
19
|
rlm@10
|
20 You cannot (use 'clojure.contrib.string) because it defines
|
rlm@10
|
21 functions with the same names as functions in clojure.core.
|
rlm@10
|
22 Instead, do (require '[clojure.contrib.string :as s])
|
rlm@10
|
23 or something similar.
|
rlm@10
|
24
|
rlm@10
|
25 Goals:
|
rlm@10
|
26 1. Be functional
|
rlm@10
|
27 2. Most significant argument LAST, to work with ->>
|
rlm@10
|
28 3. At least O(n) performance for Strings of length n
|
rlm@10
|
29
|
rlm@10
|
30 Some ideas are borrowed from
|
rlm@10
|
31 http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
|
rlm@10
|
32 clojure.contrib.string
|
rlm@10
|
33 (:refer-clojure :exclude (take replace drop butlast partition
|
rlm@10
|
34 contains? get repeat reverse partial))
|
rlm@10
|
35 (:import (java.util.regex Pattern)))
|
rlm@10
|
36
|
rlm@10
|
37
|
rlm@10
|
(defmacro dochars
  "bindings => [name string]

  Repeatedly executes body, with name bound to each character in
  string, in order.  Iterates over UTF-16 chars, so it does NOT
  handle Unicode supplementary characters (above U+FFFF); see
  docodepoints for that.  Returns nil."
  [bindings & body]
  ;; vector? (not vector, which always returns a truthy value) so that a
  ;; malformed binding form is really rejected at macroexpansion time.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  ;; This seems to be the fastest way to iterate over characters.
  `(let [^String s# ~(second bindings)]
     (dotimes [i# (.length s#)]
       (let [~(first bindings) (.charAt s# i#)]
         ~@body))))
|
rlm@10
|
52
|
rlm@10
|
53
|
rlm@10
|
(defmacro docodepoints
  "bindings => [name string]

  Repeatedly executes body, with name bound to the integer code point
  of each Unicode character in the string.  Handles Unicode
  supplementary characters (above U+FFFF) correctly.  An unpaired
  surrogate is passed to body as its own code point and the char
  that follows it is still visited.  Returns nil."
  [bindings & body]
  ;; vector? (not vector, which is always truthy) so the check is real.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [character (first bindings)
        string (second bindings)]
    `(let [^String s# ~string
           len# (.length s#)]
       (loop [i# 0]
         (when (< i# len#)
           ;; codePointAt combines a valid surrogate pair and otherwise
           ;; returns the single char; charCount says how far to advance,
           ;; so a lone high surrogate no longer swallows the next char.
           (let [cp# (.codePointAt s# i#)
                 ~character cp#]
             ~@body
             (recur (+ i# (Character/charCount cp#)))))))))
|
rlm@10
|
77
|
rlm@10
|
(defn codepoints
  "Returns a lazy sequence of the integer Unicode code points in s.
  Handles Unicode supplementary characters (above U+FFFF) correctly.
  An unpaired surrogate is returned as its own code point and the
  char that follows it is still included."
  [^String s]
  (let [len (.length s)
        ;; Every step is wrapped in lazy-seq, so the sequence is built
        ;; on demand and long strings do not recurse on the stack.
        step (fn step [i]
               (lazy-seq
                 (when (< i len)
                   (let [cp (.codePointAt s (int i))]
                     ;; charCount is 2 only for a real surrogate pair.
                     (cons cp (step (+ i (Character/charCount cp))))))))]
    (step 0)))
|
rlm@10
|
90
|
rlm@10
|
(defn ^String escape
  "Builds a new String from s by looking up each character in cmap
  (a function or a map).  A truthy result from cmap is appended in
  place of the character; otherwise the character itself is kept."
  {:deprecated "1.2"}
  [cmap ^String s]
  (let [n (.length s)
        out (StringBuilder. n)]
    ;; Index-based walk over the chars of s.
    (dotimes [i n]
      (let [ch (.charAt s i)
            replacement (cmap ch)]
        (if replacement
          (.append out replacement)
          (.append out ch))))
    (.toString out)))
|
rlm@10
|
103
|
rlm@10
|
(defn blank?
  "Returns true when s is nil, the empty string, or made up solely of
  whitespace characters (per Character/isWhitespace); false otherwise."
  {:deprecated "1.2"}
  [^String s]
  (loop [remaining (seq s)]
    (if (nil? remaining)
      true
      (let [^Character ch (first remaining)]
        (if (Character/isWhitespace ch)
          (recur (next remaining))
          false)))))
|
rlm@10
|
109
|
rlm@10
|
(defn ^String take
  "Returns the first n characters of s.  When s has fewer than n
  characters, s is returned as is."
  [n ^String s]
  (let [len (count s)]
    (if (>= len n)
      (.substring s 0 n)
      s)))
|
rlm@10
|
116
|
rlm@10
|
(defn ^String drop
  "Returns s without its first n characters.  When n exceeds the
  length of s, the empty string is returned."
  [n ^String s]
  (let [len (count s)]
    (if (>= len n)
      (.substring s n)
      "")))
|
rlm@10
|
124
|
rlm@10
|
(defn ^String butlast
  "Returns s with its last n characters removed.  When n exceeds the
  length of s, the empty string is returned."
  [n ^String s]
  (let [len (count s)]
    (if (>= len n)
      (.substring s 0 (- len n))
      "")))
|
rlm@10
|
132
|
rlm@10
|
(defn ^String tail
  "Returns the final n characters of s.  When s has fewer than n
  characters, s is returned as is."
  [n ^String s]
  (let [len (count s)]
    (if (>= len n)
      (.substring s (- len n))
      s)))
|
rlm@10
|
139
|
rlm@10
|
(defn ^String repeat
  "Returns a new String made of n copies of s concatenated together."
  [n ^String s]
  (->> s
       (clojure.core/repeat n)
       (apply str)))
|
rlm@10
|
144
|
rlm@10
|
(defn ^String reverse
  "Returns a String holding the characters of s in reverse order, as
  produced by StringBuilder.reverse."
  {:deprecated "1.2"}
  [^String s]
  (-> (StringBuilder. s)
      (.reverse)
      (.toString)))
|
rlm@10
|
150
|
rlm@10
|
(defn replace-str
  "Returns s with every occurrence of the substring a replaced by b.
  Both a and b are treated literally (String.replace), not as regular
  expressions."
  {:deprecated "1.2"}
  [^String a ^String b ^String s]
  (. s (replace a b)))
|
rlm@10
|
156
|
rlm@10
|
(defn replace-char
  "Returns s with every occurrence of the character a replaced by the
  character b."
  {:deprecated "1.2"}
  [^Character a ^Character b ^String s]
  (. s (replace a b)))
|
rlm@10
|
162
|
rlm@10
|
(defn replace-re
  "Returns s with every match of the regular expression re replaced
  by replacement.  The replacement is handed to Matcher.replaceAll."
  {:deprecated "1.2"}
  [re replacement ^String s]
  (-> (re-matcher re s)
      (.replaceAll replacement)))
|
rlm@10
|
168
|
rlm@10
|
(defn replace-by
  "Replaces all matches of re in s with the result of
  (f (re-groups the-match)).  f must return a String; that String is
  inserted literally, so any $ or \\ it contains is NOT interpreted
  as a group reference or escape."
  {:deprecated "1.2"}
  [re f ^String s]
  (let [m (re-matcher re s)
        buffer (StringBuffer. (.length s))]
    (loop []
      (if (.find m)
        ;; appendReplacement treats $ and \ specially; quote the result
        ;; of f so it is copied into the output verbatim.
        (do (.appendReplacement
              m buffer
              (java.util.regex.Matcher/quoteReplacement (f (re-groups m))))
            (recur))
        (do (.appendTail m buffer)
            (.toString buffer))))))
|
rlm@10
|
182
|
rlm@10
|
(defn replace-first-str
  "Replaces the first occurrence of substring a with b in s.  Both a
  and b are treated literally.  Returns s unchanged when a does not
  occur in it."
  {:deprecated "1.2"}
  [^String a ^String b ^String s]
  ;; Plain index arithmetic instead of a regex: Pattern/quote returns a
  ;; String, which re-matcher cannot accept, and a regex replacement
  ;; would also give $ and \ in b a special meaning.
  (let [i (.indexOf s a)]
    (if (neg? i)
      s
      (str (.substring s 0 i)
           b
           (.substring s (+ i (.length a)))))))
|
rlm@10
|
188
|
rlm@10
|
(defn replace-first-re
  "Returns s with the first match of the regular expression re
  replaced by replacement (via Matcher.replaceFirst)."
  {:deprecated "1.2"}
  [^Pattern re ^String replacement ^String s]
  (-> (re-matcher re s)
      (.replaceFirst replacement)))
|
rlm@10
|
194
|
rlm@10
|
(defn replace-first-by
  "Replace first match of re in s with the result of
  (f (re-groups the-match)).  f must return a String; that String is
  inserted literally ($ and \\ are not interpreted).  Returns s
  unchanged when re does not match."
  {:deprecated "1.2"}
  [^Pattern re f ^String s]
  (let [m (re-matcher re s)]
    (if (.find m)
      (let [buffer (StringBuffer. (.length s))
            ;; Quote so appendReplacement copies the text verbatim.
            rep (java.util.regex.Matcher/quoteReplacement (f (re-groups m)))]
        (.appendReplacement m buffer rep)
        (.appendTail m buffer)
        (str buffer))
      ;; No match: hand back the input rather than nil.
      s)))
|
rlm@10
|
207
|
rlm@10
|
(defn partition
  "Breaks s into a lazy sequence of substrings that alternates between
  the text lying between matches of the pattern and the matches
  themselves (as returned by re-groups).  The first element is always
  the text before the first match, which is the empty string when s
  begins with a match.  Trailing text after the last match is
  included only when it is non-empty.

  For example: (partition #\"[a-z]+\" \"abc123def\")
  returns: (\"\" \"abc\" \"123\" \"def\")"
  [^Pattern re ^String s]
  (let [m (re-matcher re s)
        total (.length s)
        step (fn step [prevend]
               (lazy-seq
                 (if (.find m)
                   (let [from (.start m)
                         to (.end m)]
                     (cons (.subSequence s prevend from)
                           (cons (re-groups m)
                                 (step to))))
                   (when (< prevend total)
                     (list (.subSequence s prevend total))))))]
    (step 0)))
|
rlm@10
|
228
|
rlm@10
|
(defn ^String join
  "Concatenates the elements of coll into one String, placing
  separator between consecutive elements.  Like Perl's join."
  {:deprecated "1.2"}
  [^String separator coll]
  (->> coll
       (interpose separator)
       (apply str)))
|
rlm@10
|
235
|
rlm@10
|
(defn ^String chop
  "Returns s without its final character.  A zero-length string is
  returned unchanged."
  [^String s]
  (let [n (count s)]
    (if (pos? n)
      (subs s 0 (dec n))
      s)))
|
rlm@10
|
244
|
rlm@10
|
(defn ^String chomp
  "Strips every trailing newline \\n or return \\r character from the
  end of the string.  Note: String.trim() is similar and faster.
  Deprecated in 1.2. Use clojure.string/trim-newline"
  {:deprecated "1.2"}
  [^String s]
  (-> (re-matcher #"[\r\n]+$" s)
      (.replaceAll "")))
|
rlm@10
|
252
|
rlm@10
|
(defn ^String swap-case
  "Returns s with lower-case characters turned into upper case and all
  other characters passed through lower-casing.  Works code point by
  code point, so Unicode supplementary characters are handled.  Case
  conversion goes through the locale-sensitive String.toUpperCase()
  and String.toLowerCase() methods."
  [^String s]
  (let [out (StringBuilder. (.length s))
        ;; one-slot scratch array used to build a String from a code point
        ^"[I" cp-holder (make-array Integer/TYPE 1)]
    (docodepoints [cp s]
      (aset-int cp-holder 0 cp)
      ;; Character.toUpperCase is not locale-sensitive, whereas
      ;; String.toUpperCase is; hence the detour through a String.
      (let [one-char (String. cp-holder 0 1)]
        (.append out (if (Character/isLowerCase cp)
                       (.toUpperCase one-char)
                       (.toLowerCase one-char)))))
    (.toString out)))
|
rlm@10
|
270
|
rlm@10
|
(defn ^String capitalize
  "Returns s with its first character upper-cased and every remaining
  character lower-cased.  Strings shorter than two characters are
  simply upper-cased."
  {:deprecated "1.2"}
  [^String s]
  (let [len (count s)]
    (if (>= len 2)
      (let [^String head (subs s 0 1)
            ^String remainder (subs s 1)]
        (str (.toUpperCase head)
             (.toLowerCase remainder)))
      (.toUpperCase s))))
|
rlm@10
|
280
|
rlm@10
|
(defn ^String ltrim
  "Strips leading whitespace (regex \\s) from the string.
  Deprecated in 1.2. Use clojure.string/triml."
  {:deprecated "1.2"}
  [^String s]
  (-> (re-matcher #"^\s+" s)
      (.replaceAll "")))
|
rlm@10
|
287
|
rlm@10
|
(defn ^String rtrim
  "Strips trailing whitespace (regex \\s) from the string.
  Deprecated in 1.2. Use clojure.string/trimr."
  {:deprecated "1.2"}
  [^String s]
  (-> (re-matcher #"\s+$" s)
      (.replaceAll "")))
|
rlm@10
|
294
|
rlm@10
|
(defn split-lines
  "Returns a seq of the pieces of s obtained by splitting it on \\n or
  \\r\\n line terminators."
  {:deprecated "1.2"}
  [^String s]
  (-> #"\r?\n"
      (.split s)
      (seq)))
|
rlm@10
|
300
|
rlm@10
|
301 ;; borrowed from compojure.string, by James Reeves, EPL 1.0
|
rlm@10
|
(defn ^String map-str
  "Maps f over coll and concatenates the results (via str) into a
  single String."
  [f coll]
  (->> coll
       (map f)
       (apply str)))
|
rlm@10
|
307
|
rlm@10
|
308 ;; borrowed from compojure.string, by James Reeves, EPL 1.0
|
rlm@10
|
(defn grep
  "Returns the elements of coll whose String representation (obtained
  with str) contains a match for the regular expression re, as tested
  by re-find."
  [re coll]
  (filter #(re-find re (str %)) coll))
|
rlm@10
|
314
|
rlm@10
|
(defn as-str
  "Works like clojure.core/str, except that a keyword or symbol
  argument contributes its name rather than its printed form.

  Example:
    (str :foo :bar)     ;;=> \":foo:bar\"
    (as-str :foo :bar)  ;;=> \"foobar\"

  Only top-level arguments are treated this way; keywords or symbols
  nested inside data structures are rendered exactly as str would.

  Example:
    (str {:foo :bar})     ;;=> \"{:foo :bar}\"
    (as-str {:foo :bar})  ;;=> \"{:foo :bar}\" "
  ([] "")
  ([x]
     (if (instance? clojure.lang.Named x)
       (name x)
       (str x)))
  ([x & ys]
     ;; Accumulate each converted argument in one StringBuilder.
     (let [sb (new StringBuilder ^String (as-str x))]
       (loop [more ys]
         (if more
           (do (.append sb (as-str (first more)))
               (recur (next more)))
           (str sb))))))
|
rlm@10
|
339
|
rlm@10
|
340
|
rlm@10
|
341 ;;; WRAPPERS
|
rlm@10
|
342
|
rlm@10
|
343 ;; The following functions are simple wrappers around java.lang.String
|
rlm@10
|
344 ;; functions. They are included here for completeness, and for use
|
rlm@10
|
345 ;; when mapping over a collection of strings.
|
rlm@10
|
346
|
rlm@10
|
(defn ^String upper-case
  "Returns the string converted to upper case (String.toUpperCase)."
  {:deprecated "1.2"}
  [^String s]
  (. s (toUpperCase)))
|
rlm@10
|
352
|
rlm@10
|
(defn ^String lower-case
  "Returns the string converted to lower case (String.toLowerCase)."
  {:deprecated "1.2"}
  [^String s]
  (. s (toLowerCase)))
|
rlm@10
|
358
|
rlm@10
|
(defn split
  "Splits the string around matches of the regular expression re and
  returns the pieces as a seq.  The optional limit argument is passed
  straight to Pattern.split, where it caps the number of splits."
  {:deprecated "1.2"}
  ([^Pattern re ^String s]
     (seq (. re (split s))))
  ([^Pattern re limit ^String s]
     (seq (. re (split s limit)))))
|
rlm@10
|
365
|
rlm@10
|
(defn ^String trim
  "Returns the string with whitespace stripped from both ends, as
  defined by String.trim."
  {:deprecated "1.2"}
  [^String s]
  (. s (trim)))
|
rlm@10
|
371
|
rlm@10
|
;; No ^String return tag: this predicate returns a boolean, and a
;; String tag would mislead the compiler at interop call sites.
(defn substring?
  "True if s contains the substring, false otherwise."
  [substring ^String s]
  (.contains s substring))
|
rlm@10
|
376
|
rlm@10
|
;; No ^String return tag: String.charAt yields a character, and a
;; String tag would mislead the compiler at interop call sites.
(defn get
  "Gets the i'th character (zero-based) in string.  Note that, unlike
  most functions in this namespace, the string is the FIRST argument.
  Index validity is checked by String.charAt."
  {:deprecated "1.2"}
  [^String s i]
  (.charAt s i))
|
rlm@10
|
382
|