annotate src/clojure/contrib/string.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
rev   line source
rlm@10 1 ;;; string.clj -- functional string utilities for Clojure
rlm@10 2
rlm@10 3 ;; by Stuart Sierra, http://stuartsierra.com/
rlm@10 4 ;; January 26, 2010
rlm@10 5
rlm@10 6 ;; Copyright (c) Stuart Sierra, 2010. All rights reserved. The use
rlm@10 7 ;; and distribution terms for this software are covered by the Eclipse
rlm@10 8 ;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
rlm@10 9 ;; which can be found in the file epl-v10.html at the root of this
rlm@10 10 ;; distribution. By using this software in any fashion, you are
rlm@10 11 ;; agreeing to be bound by the terms of this license. You must not
rlm@10 12 ;; remove this notice, or any other, from this software.
rlm@10 13
rlm@10 14 ;; DEPRECATED in 1.2: Many functions have moved to clojure.string.
rlm@10 15
rlm@10 16 (ns ^{:author "Stuart Sierra"
rlm@10 17 :doc "This is a library of string manipulation functions. It
rlm@10 18 is intended as a replacement for clojure.contrib.string.
rlm@10 19
rlm@10 20 You cannot (use 'clojure.contrib.string) because it defines
rlm@10 21 functions with the same names as functions in clojure.core.
rlm@10 22 Instead, do (require '[clojure.contrib.string :as s])
rlm@10 23 or something similar.
rlm@10 24
rlm@10 25 Goals:
rlm@10 26 1. Be functional
rlm@10 27 2. Most significant argument LAST, to work with ->>
rlm@10 28 3. At least O(n) performance for Strings of length n
rlm@10 29
rlm@10 30 Some ideas are borrowed from
rlm@10 31 http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
rlm@10 32 clojure.contrib.string
rlm@10 33 (:refer-clojure :exclude (take replace drop butlast partition
rlm@10 34 contains? get repeat reverse partial))
rlm@10 35 (:import (java.util.regex Pattern)))
rlm@10 36
rlm@10 37
(defmacro dochars
  "bindings => [name string]

  Repeatedly executes body, with name bound to each character in
  string.  Iterates with .charAt over indices 0..length-1, so it does
  NOT handle Unicode supplementary characters (above U+FFFF).

  The bindings form must be a vector of exactly two elements; both
  conditions are asserted when the macro is expanded."
  [bindings & body]
  ;; Use the predicate vector?, not the constructor vector: the
  ;; constructor always returns a truthy value, which would turn this
  ;; assertion into a no-op.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [[char-name string-expr] bindings]
    ;; This seems to be the fastest way to iterate over characters.
    `(let [^String s# ~string-expr]
       (dotimes [i# (.length s#)]
         (let [~char-name (.charAt s# i#)]
           ~@body)))))
rlm@10 52
rlm@10 53
(defmacro docodepoints
  "bindings => [name string]

  Repeatedly executes body, with name bound to the integer code point
  of each Unicode character in the string.  Handles Unicode
  supplementary characters (above U+FFFF) correctly: a valid surrogate
  pair is delivered as one code point.  An unpaired surrogate is
  delivered as its own value and does not cause the following
  character to be skipped.

  The bindings form must be a vector of exactly two elements; both
  conditions are asserted when the macro is expanded."
  [bindings & body]
  ;; vector? (predicate), not vector (constructor, always truthy).
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [[character string] bindings]
    `(let [^String s# ~string
           len# (.length s#)]
       (loop [i# 0]
         (when (< i# len#)
           ;; codePointAt combines a surrogate pair when one is present;
           ;; charCount then says how many chars (1 or 2) were consumed,
           ;; so the index never jumps over an unrelated character.
           (let [cp# (.codePointAt s# i#)
                 ~character cp#]
             ~@body
             (recur (+ i# (Character/charCount cp#)))))))))
rlm@10 77
(defn codepoints
  "Returns a lazy sequence of integer Unicode code points in s.
  Handles Unicode supplementary characters (above U+FFFF) correctly:
  a valid surrogate pair yields one code point.  An unpaired surrogate
  yields its own value and does not cause the following character to
  be skipped.  Returns an empty sequence for the empty string."
  [^String s]
  (let [len (.length s)
        ;; Each step is wrapped in lazy-seq so the sequence is produced
        ;; on demand; recursing eagerly would consume one stack frame
        ;; per character and overflow on long strings.
        step (fn step [i]
               (lazy-seq
                 (when (< i len)
                   (let [cp (.codePointAt s i)]
                     ;; charCount is 2 for a combined pair, else 1.
                     (cons cp (step (+ i (Character/charCount cp))))))))]
    (step 0)))
rlm@10 90
(defn ^String escape
  "Builds a new String from s by looking up every character in cmap
  (a function or a map).  A truthy result is appended to the output in
  place of the character; when cmap yields nil (or false) the original
  character is appended unchanged."
  {:deprecated "1.2"}
  [cmap ^String s]
  (let [out (StringBuilder. (.length s))]
    (dochars [ch s]
      (let [mapped (cmap ch)]
        (if mapped
          (.append out mapped)
          (.append out ch))))
    (.toString out)))
rlm@10 103
(defn blank?
  "Returns true when s is nil, is empty, or consists solely of
  characters for which Character/isWhitespace is true."
  {:deprecated "1.2"}
  [^String s]
  (let [whitespace? (fn [^Character ch] (Character/isWhitespace ch))]
    (every? whitespace? s)))
rlm@10 109
(defn ^String take
  "Returns the first n characters of s.  When s has fewer than n
  characters, s itself is returned."
  [n ^String s]
  (if (>= (count s) n)
    (.substring s 0 n)
    s))
rlm@10 116
(defn ^String drop
  "Returns s without its first n characters.  When n exceeds the
  length of s the result is the empty string."
  [n ^String s]
  (if (>= (count s) n)
    (.substring s n)
    ""))
rlm@10 124
(defn ^String butlast
  "Returns s with its last n characters removed.  When n exceeds the
  length of s the result is the empty string."
  [n ^String s]
  (let [len (count s)]
    (if (>= len n)
      (.substring s 0 (- len n))
      "")))
rlm@10 132
(defn ^String tail
  "Returns the final n characters of s.  When s has fewer than n
  characters, s itself is returned."
  [n ^String s]
  (let [len (count s)]
    (if (>= len n)
      (.substring s (- len n))
      s)))
rlm@10 139
(defn ^String repeat
  "Concatenates n copies of s into a single new String."
  [n ^String s]
  (->> s
       (clojure.core/repeat n)
       (apply str)))
rlm@10 144
(defn ^String reverse
  "Returns a String holding the characters of s in reverse order, as
  produced by StringBuilder.reverse()."
  {:deprecated "1.2"}
  [^String s]
  (-> (StringBuilder. s)
      (.reverse)
      (.toString)))
rlm@10 150
(defn replace-str
  "Returns s with every occurrence of the literal substring a replaced
  by b (delegates to String.replace)."
  {:deprecated "1.2"}
  [^String a ^String b ^String s]
  (. s (replace a b)))
rlm@10 156
(defn replace-char
  "Returns s with every occurrence of character a replaced by
  character b (delegates to String.replace)."
  {:deprecated "1.2"}
  [^Character a ^Character b ^String s]
  (. s (replace a b)))
rlm@10 162
(defn replace-re
  "Returns s with every match of the pattern re replaced by
  replacement (delegates to Matcher.replaceAll)."
  {:deprecated "1.2"}
  [re replacement ^String s]
  (let [matcher (re-matcher re s)]
    (.replaceAll matcher replacement)))
rlm@10 168
(defn replace-by
  "Replaces all matches of re in s with the result of
  (f (re-groups the-match)).

  f must return a String.  The returned string is inserted literally:
  any '$' or '\\' characters in it are NOT interpreted as group
  references or escapes.  Returns s's content unchanged when re does
  not match."
  {:deprecated "1.2"}
  [re f ^String s]
  (let [m (re-matcher re s)
        buffer (StringBuffer. (.length s))]
    (while (.find m)
      ;; appendReplacement treats $ and \ specially; quote the result
      ;; of f so it is used verbatim instead of throwing or expanding
      ;; group references.
      (.appendReplacement m buffer
                          (java.util.regex.Matcher/quoteReplacement
                            (f (re-groups m)))))
    ;; Copy whatever follows the last match.
    (.appendTail m buffer)
    (.toString buffer)))
rlm@10 182
(defn replace-first-str
  "Replaces the first occurrence of substring a with b in s.  Both a
  and b are treated literally: regex metacharacters in a and '$' or
  '\\' in b have no special meaning.  Returns s's content unchanged
  when a does not occur."
  {:deprecated "1.2"}
  [^String a ^String b ^String s]
  ;; Pattern/quote returns a String, which re-matcher cannot accept;
  ;; compile a literal Pattern instead.
  (let [literal (Pattern/compile a Pattern/LITERAL)]
    (.replaceFirst (re-matcher literal s)
                   ;; Keep $ and \ in b from being read as group refs.
                   (java.util.regex.Matcher/quoteReplacement b))))
rlm@10 188
(defn replace-first-re
  "Returns s with only the first match of re replaced by replacement
  (delegates to Matcher.replaceFirst)."
  {:deprecated "1.2"}
  [^Pattern re ^String replacement ^String s]
  (let [matcher (re-matcher re s)]
    (.replaceFirst matcher replacement)))
rlm@10 194
(defn replace-first-by
  "Replaces the first match of re in s with the result of
  (f (re-groups the-match)).

  f must return a String, which is inserted literally ('$' and '\\'
  are not interpreted).  When re does not match, s is returned
  unchanged."
  {:deprecated "1.2"}
  [^Pattern re f ^String s]
  (let [m (re-matcher re s)]
    (if (.find m)
      (let [buffer (StringBuffer. (.length s))
            ;; Quote so appendReplacement uses f's result verbatim.
            rep (java.util.regex.Matcher/quoteReplacement
                  (f (re-groups m)))]
        (.appendReplacement m buffer rep)
        (.appendTail m buffer)
        (str buffer))
      ;; No match: nothing to replace, so hand back the input rather
      ;; than nil.
      s)))
rlm@10 207
(defn partition
  "Splits the string into a lazy sequence of substrings, alternating
  between the text between matches and the matches of the pattern
  themselves (as returned by re-groups).  The sequence always begins
  with the text before the first match, which is an empty string when
  the string starts with a match.  Trailing text after the last match
  is included only when it is non-empty.

  For example: (partition #\"[a-z]+\" \"abc123def\")
  returns: (\"\" \"abc\" \"123\" \"def\")"
  [^Pattern re ^String s]
  (let [m (re-matcher re s)
        total (.length s)
        ;; from is the index just past the previous match.
        walk (fn walk [from]
               (lazy-seq
                 (if (.find m)
                   (let [between (.subSequence s from (.start m))
                         matched (re-groups m)]
                     (cons between
                           (cons matched (walk (.end m)))))
                   (when (< from total)
                     (list (.subSequence s from total))))))]
    (walk 0)))
rlm@10 228
(defn ^String join
  "Concatenates the elements of coll into one string, placing
  separator between consecutive elements.  Like Perl's join."
  {:deprecated "1.2"}
  [^String separator coll]
  (->> coll
       (interpose separator)
       (apply str)))
rlm@10 235
(defn ^String chop
  "Returns string without its final character.  A zero-length string
  is returned as is."
  [^String s]
  (let [len (count s)]
    (if (pos? len)
      (subs s 0 (dec len))
      s)))
rlm@10 244
(defn ^String chomp
  "Strips every trailing newline \\n or return \\r character from
  string.  Note: String.trim() is similar and faster.
  Deprecated in 1.2.  Use clojure.string/trim-newline"
  {:deprecated "1.2"}
  [^String s]
  (let [trailing-breaks (re-matcher #"[\r\n]+$" s)]
    (.replaceAll trailing-breaks "")))
rlm@10 252
(defn ^String swap-case
  "Returns s with lower-case characters converted to upper case and
  all other characters converted to lower case.  Works per code point,
  so Unicode supplementary characters are handled.  Uses the
  locale-sensitive String.toUpperCase() and String.toLowerCase()
  methods."
  [^String s]
  (let [out (StringBuilder. (.length s))
        ;; one-element int array used to build a String from a single
        ;; code point
        ^"[I" cp-holder (make-array Integer/TYPE 1)]
    (docodepoints [cp s]
      (aset-int cp-holder 0 cp)
      ;; Character.toUpperCase is not locale-sensitive, but
      ;; String.toUpperCase is; so the case change goes through a
      ;; String.
      (let [one (String. cp-holder 0 1)]
        (.append out (if (Character/isLowerCase cp)
                       (.toUpperCase one)
                       (.toLowerCase one)))))
    (.toString out)))
rlm@10 270
(defn ^String capitalize
  "Upper-cases the first character of the string and lower-cases the
  rest.  Strings shorter than two characters are simply upper-cased."
  {:deprecated "1.2"}
  [^String s]
  (if (< (count s) 2)
    (.toUpperCase s)
    (let [^String head (subs s 0 1)
          ^String remainder (subs s 1)]
      (str (.toUpperCase head)
           (.toLowerCase remainder)))))
rlm@10 280
(defn ^String ltrim
  "Strips leading whitespace (regex \\s) from string.
  Deprecated in 1.2.  Use clojure.string/triml."
  {:deprecated "1.2"}
  [^String s]
  (let [leading-space (re-matcher #"^\s+" s)]
    (.replaceAll leading-space "")))
rlm@10 287
(defn ^String rtrim
  "Strips trailing whitespace (regex \\s) from string.
  Deprecated in 1.2.  Use clojure.string/trimr."
  {:deprecated "1.2"}
  [^String s]
  (let [trailing-space (re-matcher #"\s+$" s)]
    (.replaceAll trailing-space "")))
rlm@10 294
(defn split-lines
  "Breaks s into a seq of lines, splitting on \\n or \\r\\n."
  {:deprecated "1.2"}
  [^String s]
  (-> #"\r?\n"
      (.split s)
      seq))
rlm@10 300
rlm@10 301 ;; borrowed from compojure.string, by James Reeves, EPL 1.0
(defn ^String map-str
  "Calls f on every element of coll and concatenates the results into
  one String."
  [f coll]
  (->> coll
       (map f)
       (apply str)))
rlm@10 307
rlm@10 308 ;; borrowed from compojure.string, by James Reeves, EPL 1.0
(defn grep
  "Keeps the elements of coll whose String representation (via str)
  contains a match for the regular expression re, tested with
  re-find."
  [re coll]
  (let [matches? (fn [item] (re-find re (str item)))]
    (filter matches? coll)))
rlm@10 314
(defn as-str
  "Like clojure.core/str, except that an argument which is a keyword
  or symbol (any clojure.lang.Named) contributes its name rather than
  its literal representation.

  Example:
  (str :foo :bar) ;;=> \":foo:bar\"
  (as-str :foo :bar) ;;=> \"foobar\"

  Only top-level arguments are treated this way; keywords or symbols
  nested inside data structures are rendered as with str.

  Example:
  (str {:foo :bar}) ;;=> \"{:foo :bar}\"
  (as-str {:foo :bar}) ;;=> \"{:foo :bar}\" "
  ([] "")
  ([x]
   (if (instance? clojure.lang.Named x)
     (name x)
     (str x)))
  ([x & ys]
   ;; Accumulate the single-argument rendering of each argument in one
   ;; StringBuilder.
   (loop [sb (StringBuilder. ^String (as-str x))
          remaining ys]
     (if remaining
       (recur (.append sb ^String (as-str (first remaining)))
              (next remaining))
       (str sb)))))
rlm@10 339
rlm@10 340
rlm@10 341 ;;; WRAPPERS
rlm@10 342
rlm@10 343 ;; The following functions are simple wrappers around java.lang.String
rlm@10 344 ;; functions. They are included here for completeness, and for use
rlm@10 345 ;; when mapping over a collection of strings.
rlm@10 346
(defn ^String upper-case
  "Returns string converted to upper case (String.toUpperCase)."
  {:deprecated "1.2"}
  [^String s]
  (. s toUpperCase))
rlm@10 352
(defn ^String lower-case
  "Returns string converted to lower case (String.toLowerCase)."
  {:deprecated "1.2"}
  [^String s]
  (. s toLowerCase))
rlm@10 358
(defn split
  "Splits string on the regular expression re and returns the pieces
  as a seq.  The optional limit argument is passed through to
  Pattern.split as its limit."
  {:deprecated "1.2"}
  ([^Pattern re ^String s]
   (-> re (.split s) seq))
  ([^Pattern re limit ^String s]
   (-> re (.split s limit) seq)))
rlm@10 365
(defn ^String trim
  "Returns string with whitespace removed from both ends
  (String.trim)."
  {:deprecated "1.2"}
  [^String s]
  (. s trim))
rlm@10 371
;; No ^String return tag: this predicate returns a boolean, and a
;; String tag would make hinted interop on the result fail with a
;; ClassCastException.
(defn substring?
  "True if s contains the substring (String.contains), false
  otherwise."
  [substring ^String s]
  (.contains s substring))
rlm@10 376
;; No ^String return tag: String.charAt returns a char, so a String
;; tag would make hinted interop on the result fail with a
;; ClassCastException.
(defn get
  "Gets the i'th character in string (String.charAt); i is a
  zero-based index.  Throws StringIndexOutOfBoundsException when i is
  outside the string."
  {:deprecated "1.2"}
  [^String s i]
  (.charAt s i))
rlm@10 382