rlm@10
|
1 ;;; str_utils2.clj -- functional string utilities for Clojure
|
rlm@10
|
2
|
rlm@10
|
3 ;; by Stuart Sierra, http://stuartsierra.com/
|
rlm@10
|
4 ;; August 19, 2009
|
rlm@10
|
5
|
rlm@10
|
6 ;; Copyright (c) Stuart Sierra, 2009. All rights reserved. The use
|
rlm@10
|
7 ;; and distribution terms for this software are covered by the Eclipse
|
rlm@10
|
8 ;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
|
rlm@10
|
9 ;; which can be found in the file epl-v10.html at the root of this
|
rlm@10
|
10 ;; distribution. By using this software in any fashion, you are
|
rlm@10
|
11 ;; agreeing to be bound by the terms of this license. You must not
|
rlm@10
|
12 ;; remove this notice, or any other, from this software.
|
rlm@10
|
13
|
rlm@10
|
14 ;; DEPRECATED in 1.2: Promoted to clojure.java.string. Note that
|
rlm@10
|
15 ;; many function names and semantics have changed
|
rlm@10
|
16
|
rlm@10
|
17 (ns ^{:author "Stuart Sierra"
|
rlm@10
|
18 :deprecated "1.2"
|
rlm@10
|
19 :doc "This is a library of string manipulation functions. It
|
rlm@10
|
20 is intended as a replacement for clojure.contrib.str-utils.
|
rlm@10
|
21
|
rlm@10
|
22 You cannot (use 'clojure.contrib.str-utils2) because it defines
|
rlm@10
|
23 functions with the same names as functions in clojure.core.
|
rlm@10
|
24 Instead, do (require '[clojure.contrib.str-utils2 :as s])
|
rlm@10
|
25 or something similar.
|
rlm@10
|
26
|
rlm@10
|
27 Goals:
|
rlm@10
|
28 1. Be functional
|
rlm@10
|
29 2. String argument first, to work with ->
|
rlm@10
|
30 3. Performance linear in string length
|
rlm@10
|
31
|
rlm@10
|
32 Some ideas are borrowed from
|
rlm@10
|
33 http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
|
rlm@10
|
34 clojure.contrib.str-utils2
|
rlm@10
|
35 (:refer-clojure :exclude (take replace drop butlast partition
|
rlm@10
|
36 contains? get repeat reverse partial))
|
rlm@10
|
37 (:import (java.util.regex Pattern)))
|
rlm@10
|
38
|
rlm@10
|
39
|
rlm@10
|
(defmacro dochars
  "bindings => [name string]

  Repeatedly executes body, with name bound to each character in
  string.  Does NOT handle Unicode supplementary characters (above
  U+FFFF): each UTF-16 code unit is visited separately.

  Throws an AssertionError at macro-expansion time unless bindings is
  a vector of exactly two forms."
  [bindings & body]
  ;; `vector?` is the predicate; `(vector bindings)` merely wraps its
  ;; argument and is always truthy, so it never rejected anything.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  ;; This seems to be the fastest way to iterate over characters.
  `(let [^String s# ~(second bindings)]
     (dotimes [i# (.length s#)]
       (let [~(first bindings) (.charAt s# i#)]
         ~@body))))
|
rlm@10
|
54
|
rlm@10
|
55
|
rlm@10
|
(defmacro docodepoints
  "bindings => [name string]

  Repeatedly executes body, with name bound to the integer code point
  of each Unicode character in the string.  Handles Unicode
  supplementary characters (above U+FFFF) correctly.  An unpaired
  surrogate is passed to body as its own code point and the character
  that follows it is still visited.

  Throws an AssertionError at macro-expansion time unless bindings is
  a vector of exactly two forms."
  [bindings & body]
  ;; `vector?` is the predicate; `(vector bindings)` is always truthy.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [character (first bindings)
        string (second bindings)]
    `(let [^String s# ~string
           len# (.length s#)]
       (loop [i# 0]
         (when (< i# len#)
           ;; codePointAt combines a valid surrogate pair and otherwise
           ;; returns the single code unit; charCount then says how many
           ;; UTF-16 units to advance (1 or 2).  The code point is kept
           ;; in its own gensym so body cannot disturb the stepping.
           (let [cp# (.codePointAt s# i#)
                 ~character cp#]
             ~@body
             (recur (+ i# (Character/charCount cp#)))))))))
|
rlm@10
|
79
|
rlm@10
|
(defn codepoints
  "Returns a lazy sequence of integer Unicode code points in s.
  Handles Unicode supplementary characters (above U+FFFF) correctly.
  An unpaired surrogate is returned as its own code point and the
  character that follows it is not skipped."
  [^String s]
  (let [len (.length s)
        ;; Each step is wrapped in its own lazy-seq, so the sequence is
        ;; produced on demand and long strings do not consume one stack
        ;; frame per character.
        step (fn step [i]
               (lazy-seq
                 (when (< i len)
                   (let [cp (.codePointAt s (int i))]
                     ;; charCount is 2 only for a real surrogate pair
                     (cons cp (step (+ i (Character/charCount cp))))))))]
    (step 0)))
|
rlm@10
|
92
|
rlm@10
|
(defn ^String escape
  "Builds a new String from s by passing each character to cmap (a
  function or a map).  When cmap yields a truthy value, that value is
  appended in place of the character; otherwise the character itself
  is kept."
  [^String s cmap]
  (let [out (StringBuilder. (.length s))]
    (dochars [ch s]
      (if-let [substitute (cmap ch)]
        (.append out substitute)
        (.append out ch)))
    (str out)))
|
rlm@10
|
104
|
rlm@10
|
(defn blank?
  "Returns true when s is nil, the empty string, or made up solely of
  whitespace characters (per Character/isWhitespace); false otherwise."
  [^String s]
  (not-any? (fn [^Character ch]
              (not (Character/isWhitespace ch)))
            s))
|
rlm@10
|
109
|
rlm@10
|
(defn ^String take
  "Returns the first n characters of s, or s itself when n exceeds
  the length of s.

  The string comes first (the reverse of clojure.core/take) so that
  this function composes with ->"
  [^String s n]
  (if (> n (count s))
    s
    (.substring s 0 n)))
|
rlm@10
|
119
|
rlm@10
|
(defn ^String drop
  "Returns s without its first n characters, or the empty string when
  n exceeds the length of s.

  The string comes first (the reverse of clojure.core/drop) so that
  this function composes with ->"
  [^String s n]
  (if (> n (count s))
    ""
    (.substring s n)))
|
rlm@10
|
130
|
rlm@10
|
(defn ^String butlast
  "Returns s with its last n characters removed, or the empty string
  when n exceeds the length of s.

  The string comes first (the reverse of clojure.core/butlast) so
  that this function composes with ->"
  [^String s n]
  (let [len (count s)]
    (if (> n len)
      ""
      (.substring s 0 (- len n)))))
|
rlm@10
|
141
|
rlm@10
|
(defn ^String tail
  "Returns the final n characters of s, or s itself when n exceeds
  the length of s."
  [^String s n]
  (let [len (count s)]
    (if (> n len)
      s
      (.substring s (- len n)))))
|
rlm@10
|
148
|
rlm@10
|
(defn ^String repeat
  "Concatenates n copies of s into a single new String."
  [^String s n]
  (->> s
       (clojure.core/repeat n)
       (apply str)))
|
rlm@10
|
153
|
rlm@10
|
(defn ^String reverse
  "Returns a String holding the characters of s in reverse order, as
  produced by StringBuilder.reverse()."
  [^String s]
  (-> (StringBuilder. s)
      (.reverse)
      (.toString)))
|
rlm@10
|
158
|
rlm@10
|
(defmulti
  ^{:doc "Replaces all instances of pattern in string with replacement.

  Allowed argument types for pattern and replacement are:
   1. String and String
      (Both are taken literally; uses String.replace)
   2. Character and Character
   3. regex Pattern and String
      (Uses java.util.regex.Matcher.replaceAll, so $ and \\ in the
      replacement have their usual regex-replacement meaning)
   4. regex Pattern and function
      (Calls function with re-groups of each match, uses return
      value, taken literally, as replacement.)"
    :arglists '([string pattern replacement])
    :tag String}
  replace
  ;; Dispatch on the classes of the pattern and the replacement.
  (fn [^String string pattern replacement]
    [(class pattern) (class replacement)]))

(defmethod replace [String String] [^String s ^String a ^String b]
  (.replace s a b))

(defmethod replace [Character Character] [^String s ^Character a ^Character b]
  (.replace s a b))

(defmethod replace [Pattern String] [^String s re replacement]
  (.replaceAll (re-matcher re s) replacement))

(defmethod replace [Pattern clojure.lang.IFn] [^String s re replacement]
  (let [m (re-matcher re s)
        buffer (StringBuffer. (.length s))]
    (while (.find m)
      (let [^String rep (replacement (re-groups m))]
        ;; appendReplacement interprets $ and \ in its argument.  The
        ;; function already receives the groups, so its result is
        ;; quoted and inserted verbatim instead of throwing on, or
        ;; mangling, text such as "$5".
        (.appendReplacement m buffer
                            (java.util.regex.Matcher/quoteReplacement rep))))
    (.appendTail m buffer)
    (.toString buffer)))
|
rlm@10
|
194
|
rlm@10
|
(defmulti
  ^{:doc "Replaces the first instance of pattern in s with replacement.

  Allowed argument types for pattern and replacement are:
   1. String and String
      (Both are taken literally.)
   2. regex Pattern and String
      (Uses java.util.regex.Matcher.replaceFirst, so $ and \\ in the
      replacement have their usual regex-replacement meaning)
   3. regex Pattern and function
      (Calls function with re-groups of the first match, uses return
      value, taken literally, as replacement.  Returns s unchanged
      when there is no match.)"
    :arglists '([s pattern replacement])
    :tag String}
  replace-first
  ;; Dispatch on the classes of the pattern and the replacement.
  (fn [s pattern replacement]
    [(class pattern) (class replacement)]))

(defmethod replace-first [String String] [^String s ^String pattern ^String replacement]
  ;; Pattern/quote returns a String, not a Pattern, so it has to be
  ;; compiled before re-matcher can use it.  The replacement is quoted
  ;; as well so that both arguments are literal, like the
  ;; String/String method of replace.
  (.replaceFirst (re-matcher (re-pattern (Pattern/quote pattern)) s)
                 (java.util.regex.Matcher/quoteReplacement replacement)))

(defmethod replace-first [Pattern String] [^String s re replacement]
  (.replaceFirst (re-matcher re s) replacement))

(defmethod replace-first [Pattern clojure.lang.IFn] [^String s ^Pattern re f]
  (let [m (re-matcher re s)]
    (if (.find m)
      (let [buffer (StringBuffer. (.length s))
            ^String rep (f (re-groups m))]
        ;; Quote the result so $ and \ in it are inserted verbatim.
        (.appendReplacement m buffer
                            (java.util.regex.Matcher/quoteReplacement rep))
        (.appendTail m buffer)
        (str buffer))
      ;; No match: hand back the input instead of nil.
      s)))
|
rlm@10
|
224
|
rlm@10
|
(defn partition
  "Breaks s into a lazy sequence that alternates between the text
  lying between matches of re and the matches themselves (as returned
  by re-groups).  The first element is always the text preceding the
  first match, which is the empty string when s begins with a match.
  Any text left after the last match is the final element.

  For example: (partition \"abc123def\" #\"[a-z]+\")
  returns: (\"\" \"abc\" \"123\" \"def\")"
  [^String s ^Pattern re]
  (let [matcher (re-matcher re s)
        walk (fn walk [from]
               (lazy-seq
                 (if (.find matcher)
                   ;; Read everything off the matcher before the next
                   ;; step can advance it.
                   (let [gap (.subSequence s from (.start matcher))
                         hit (re-groups matcher)
                         resume (.end matcher)]
                     (cons gap (cons hit (walk resume))))
                   (when (< from (.length s))
                     (list (.subSequence s from (.length s)))))))]
    (walk 0)))
|
rlm@10
|
245
|
rlm@10
|
(defn ^String join
  "Concatenates the elements of coll into one string, placing
  separator between consecutive elements.  Like Perl's join."
  [^String separator coll]
  (->> coll
       (interpose separator)
       (apply str)))
|
rlm@10
|
251
|
rlm@10
|
(defn ^String chop
  "Returns s without its last character.  A zero-length string is
  returned as is."
  [^String s]
  (let [len (count s)]
    (if (pos? len)
      (subs s 0 (dec len))
      s)))
|
rlm@10
|
260
|
rlm@10
|
(defn ^String chomp
  "Strips every trailing newline \\n and return \\r character from the
  end of s.  Note: String.trim() is similar and faster."
  [^String s]
  (-> s
      (replace #"[\r\n]+$" "")))
|
rlm@10
|
266
|
rlm@10
|
(defn title-case
  "Placeholder: not implemented.  Always throws an Exception."
  [^String s]
  (throw (new Exception "title-case not implemeted yet")))
|
rlm@10
|
269
|
rlm@10
|
(defn ^String swap-case
  "Returns s with lower-case characters converted to upper case and
  all other characters converted to lower case.  Works code point by
  code point, so Unicode supplementary characters are handled.  The
  conversion goes through the locale-sensitive String.toUpperCase()
  and String.toLowerCase() methods."
  [^String s]
  (let [out (StringBuilder. (.length s))
        ;; one-slot int array used to build a String from a code point
        ^"[I" slot (make-array Integer/TYPE 1)]
    (docodepoints [cp s]
      (aset-int slot 0 cp)
      ;; Character.toUpperCase is not locale-sensitive, but
      ;; String.toUpperCase is; so the code point is converted as a
      ;; one-character String.
      (let [^String single (String. slot 0 1)]
        (.append out (if (Character/isLowerCase cp)
                       (.toUpperCase single)
                       (.toLowerCase single)))))
    (str out)))
|
rlm@10
|
287
|
rlm@10
|
(defn ^String capitalize
  "Returns s with its first character upper-cased and every remaining
  character lower-cased."
  [^String s]
  (if (>= (count s) 2)
    (let [^String head (subs s 0 1)
          ^String remainder (subs s 1)]
      (str (.toUpperCase head) (.toLowerCase remainder)))
    ;; zero or one character: nothing to lower-case
    (.toUpperCase s)))
|
rlm@10
|
296
|
rlm@10
|
(defn ^String ltrim
  "Strips leading whitespace (regex \\s) from s."
  [^String s]
  (-> s
      (replace #"^\s+" "")))
|
rlm@10
|
301
|
rlm@10
|
(defn ^String rtrim
  "Strips trailing whitespace (regex \\s) from s."
  [^String s]
  (-> s
      (replace #"\s+$" "")))
|
rlm@10
|
306
|
rlm@10
|
(defn split-lines
  "Returns a seq of the pieces of s separated by \\n or \\r\\n."
  [^String s]
  (-> #"\r?\n"
      (.split s)
      seq))
|
rlm@10
|
311
|
rlm@10
|
312 ;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
|
rlm@10
|
(defn ^String map-str
  "Calls f on every element of coll and concatenates the results, in
  order, into one String."
  [f coll]
  (->> coll
       (map f)
       (apply str)))
|
rlm@10
|
318
|
rlm@10
|
319 ;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
|
rlm@10
|
(defn grep
  "Returns the elements of coll whose String representation (via str)
  contains a match for the regular expression re, as found by
  re-find."
  [re coll]
  (filter #(re-find re (str %)) coll))
|
rlm@10
|
325
|
rlm@10
|
(defn partial
  "A variant of clojure.core/partial for functions whose primary
  argument comes first.

  Accepts a function f followed by every argument EXCEPT the first.
  The returned function takes that first argument (plus any further
  trailing arguments) and calls f with it placed in front of the
  stored arguments.

  Example: (str-utils2/partial str-utils2/take 2)
           ;;=> (fn [s] (str-utils2/take s 2))"
  [f & args]
  (fn [subject & extra]
    (apply f (cons subject (concat args extra)))))
|
rlm@10
|
338
|
rlm@10
|
339
|
rlm@10
|
340 ;;; WRAPPERS
|
rlm@10
|
341
|
rlm@10
|
342 ;; The following functions are simple wrappers around java.lang.String
|
rlm@10
|
343 ;; functions. They are included here for completeness, and for use
|
rlm@10
|
344 ;; when mapping over a collection of strings.
|
rlm@10
|
345
|
rlm@10
|
(defn ^String upper-case
  "Returns s with all characters converted to upper case, as done by
  String.toUpperCase()."
  [^String s]
  (. s (toUpperCase)))
|
rlm@10
|
350
|
rlm@10
|
(defn ^String lower-case
  "Returns s with all characters converted to lower case, as done by
  String.toLowerCase()."
  [^String s]
  (. s (toLowerCase)))
|
rlm@10
|
355
|
rlm@10
|
(defn split
  "Returns a seq of the pieces of s separated by matches of the
  regular expression re.  The optional limit is handed to
  Pattern.split and caps the number of pieces."
  ([^String s ^Pattern re]
   (-> re (.split s) seq))
  ([^String s ^Pattern re limit]
   (-> re (.split s limit) seq)))
|
rlm@10
|
361
|
rlm@10
|
(defn ^String trim
  "Returns s with leading and trailing whitespace removed, as done by
  String.trim()."
  [^String s]
  (. s (trim)))
|
rlm@10
|
366
|
rlm@10
|
;; The return value is a boolean, so the var is tagged Boolean; the
;; previous String tag could mislead callers doing interop on the result.
(defn ^Boolean contains?
  "True if s contains the substring, as determined by
  String.contains()."
  [^String s substring]
  (.contains s substring))
|
rlm@10
|
371
|
rlm@10
|
;; The return value is a character, so the var is tagged Character; the
;; previous String tag could mislead callers doing interop on the result.
(defn ^Character get
  "Gets the i'th character (zero-based, via String.charAt) in string."
  [^String s i]
  (.charAt s i))
|
rlm@10
|
376
|