annotate src/clojure/contrib/str_utils2.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
rev   line source
rlm@10 1 ;;; str_utils2.clj -- functional string utilities for Clojure
rlm@10 2
rlm@10 3 ;; by Stuart Sierra, http://stuartsierra.com/
rlm@10 4 ;; August 19, 2009
rlm@10 5
rlm@10 6 ;; Copyright (c) Stuart Sierra, 2009. All rights reserved. The use
rlm@10 7 ;; and distribution terms for this software are covered by the Eclipse
rlm@10 8 ;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
rlm@10 9 ;; which can be found in the file epl-v10.html at the root of this
rlm@10 10 ;; distribution. By using this software in any fashion, you are
rlm@10 11 ;; agreeing to be bound by the terms of this license. You must not
rlm@10 12 ;; remove this notice, or any other, from this software.
rlm@10 13
rlm@10 14 ;; DEPRECATED in 1.2: Promoted to clojure.java.string. Note that
rlm@10 15 ;; many function names and semantics have changed
rlm@10 16
rlm@10 17 (ns ^{:author "Stuart Sierra"
rlm@10 18 :deprecated "1.2"
rlm@10 19 :doc "This is a library of string manipulation functions. It
rlm@10 20 is intended as a replacement for clojure.contrib.str-utils.
rlm@10 21
rlm@10 22 You cannot (use 'clojure.contrib.str-utils2) because it defines
rlm@10 23 functions with the same names as functions in clojure.core.
rlm@10 24 Instead, do (require '[clojure.contrib.str-utils2 :as s])
rlm@10 25 or something similar.
rlm@10 26
rlm@10 27 Goals:
rlm@10 28 1. Be functional
rlm@10 29 2. String argument first, to work with ->
rlm@10 30 3. Performance linear in string length
rlm@10 31
rlm@10 32 Some ideas are borrowed from
rlm@10 33 http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
rlm@10 34 clojure.contrib.str-utils2
rlm@10 35 (:refer-clojure :exclude (take replace drop butlast partition
rlm@10 36 contains? get repeat reverse partial))
rlm@10 37 (:import (java.util.regex Pattern)))
rlm@10 38
rlm@10 39
(defmacro dochars
  "bindings => [name string]

  Repeatedly executes body, with name bound to each character in
  string, in order.  Iterates over UTF-16 code units, so it does NOT
  handle Unicode supplementary characters (above U+FFFF); use
  docodepoints for those.

  Throws an AssertionError at macro-expansion time if bindings is not
  a vector of exactly two forms."
  [bindings & body]
  ;; vector? is a predicate; the previous (vector bindings) built a
  ;; one-element vector, which is always truthy, so the check never fired.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  ;; Indexing with .charAt on a hinted String avoids seq allocation and
  ;; boxing of each character.
  `(let [^String s# ~(second bindings)]
     (dotimes [i# (.length s#)]
       (let [~(first bindings) (.charAt s# i#)]
         ~@body))))
rlm@10 54
rlm@10 55
(defmacro docodepoints
  "bindings => [name string]

  Repeatedly executes body, with name bound to the integer code point
  of each Unicode character in the string.  Handles Unicode
  supplementary characters (above U+FFFF) correctly: a valid surrogate
  pair yields one code point.  An unpaired surrogate yields its own
  code unit value and does not consume the character that follows it.

  Throws an AssertionError at macro-expansion time if bindings is not
  a vector of exactly two forms."
  [bindings & body]
  ;; vector? is the predicate; (vector bindings) was always truthy.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [character (first bindings)
        string (second bindings)]
    `(let [^String s# ~string
           len# (.length s#)]
       (loop [i# 0]
         (when (< i# len#)
           ;; codePointAt already combines a high/low surrogate pair and
           ;; returns the lone code unit otherwise, so body is emitted once.
           (let [~character (.codePointAt s# i#)]
             ~@body
             ;; charCount is 2 only for a real supplementary code point, so
             ;; an unpaired high surrogate no longer skips the next char.
             (recur (+ i# (Character/charCount ~character)))))))))
rlm@10 79
(defn codepoints
  "Returns a lazy sequence of the integer Unicode code points in s.
  Handles Unicode supplementary characters (above U+FFFF) correctly:
  a valid surrogate pair yields one code point.  An unpaired surrogate
  yields its own code unit value and does not consume the character
  that follows it.

  The sequence is realized one element at a time, so long strings do
  not cause deep recursion."
  [^String s]
  (let [len (.length s)]
    ((fn step [i]
       ;; lazy-seq wraps every step, not just the first call; otherwise the
       ;; recursion is eager and its depth grows with the string length.
       (lazy-seq
         (when (< i len)
           (let [cp (.codePointAt s (int i))]
             ;; charCount is 1 or 2, depending on whether cp is supplementary.
             (cons cp (step (+ i (Character/charCount cp))))))))
     0)))
rlm@10 92
(defn ^String escape
  "Builds a new String from s by looking up every character in cmap
  (a function or a map).  A truthy result is appended in place of the
  character; when cmap yields nil the character itself is kept."
  [^String s cmap]
  (let [n (.length s)
        out (StringBuilder. n)]
    (loop [idx 0]
      (if (< idx n)
        (let [ch (.charAt s idx)]
          (if-let [mapped (cmap ch)]
            (.append out mapped)
            (.append out ch))
          (recur (inc idx)))
        (.toString out)))))
rlm@10 104
(defn blank?
  "Returns true when s is nil, the empty string, or made up solely of
  whitespace characters; false otherwise."
  [^String s]
  (not-any? (fn [^Character ch] (not (Character/isWhitespace ch))) s))
rlm@10 109
(defn ^String take
  "Returns the leading n characters of s, or all of s when it is
  shorter than n.

  The string comes first (the reverse of clojure.core/take) so the
  function threads naturally with ->"
  [^String s n]
  (if (>= (count s) n)
    (.substring s 0 n)
    s))
rlm@10 119
(defn ^String drop
  "Returns s with its leading n characters removed.  When n exceeds
  the length of s the result is the empty string.

  The string comes first (the reverse of clojure.core/drop) so the
  function threads naturally with ->"
  [^String s n]
  (if (>= (count s) n)
    (.substring s n)
    ""))
rlm@10 130
(defn ^String butlast
  "Returns s with its trailing n characters removed.  When n exceeds
  the length of s the result is the empty string.

  The string comes first (the reverse of clojure.core/butlast) so the
  function threads naturally with ->"
  [^String s n]
  (let [len (count s)]
    (if (>= len n)
      (.substring s 0 (- len n))
      "")))
rlm@10 141
(defn ^String tail
  "Returns the trailing n characters of s, or all of s when it is
  shorter than n."
  [^String s n]
  (let [len (count s)]
    (if (>= len n)
      (.substring s (- len n))
      s)))
rlm@10 148
(defn ^String repeat
  "Concatenates n copies of s into a single new String."
  [^String s n]
  (->> s
       (clojure.core/repeat n)
       (apply str)))
rlm@10 153
(defn ^String reverse
  "Returns a new String holding the characters of s in reverse order
  (delegates to StringBuilder.reverse)."
  [^String s]
  (let [sb (StringBuilder. s)]
    (.reverse sb)
    (.toString sb)))
rlm@10 158
(defmulti
  ^{:doc "Replaces all instances of pattern in string with replacement.

  Dispatches on the classes of pattern and replacement.  Allowed
  argument types for pattern and replacement are:
  1. String and String (literal text, no regex interpretation)
  2. Character and Character
  3. regex Pattern and String
     (Uses java.util.regex.Matcher.replaceAll, so $1, $2 ... in the
     replacement refer to capture groups.)
  4. regex Pattern and function
     (Calls function with re-groups of each match and inserts its
     String return value literally; $ and \\ in the returned string
     are NOT treated as group references.)"
    :arglists '([string pattern replacement])
    :tag String}
  replace
  (fn [^String string pattern replacement]
    [(class pattern) (class replacement)]))

(defmethod replace [String String] [^String s ^String a ^String b]
  (.replace s a b))

(defmethod replace [Character Character] [^String s ^Character a ^Character b]
  (.replace s a b))

(defmethod replace [Pattern String] [^String s re replacement]
  (.replaceAll (re-matcher re s) replacement))

(defmethod replace [Pattern clojure.lang.IFn] [^String s re replacement]
  (let [m (re-matcher re s)
        buffer (StringBuffer. (.length s))]
    (loop []
      (if (.find m)
        ;; appendReplacement interprets $ and \ in its argument, so the
        ;; function's result is quoted: text such as "$5" would otherwise
        ;; throw or be substituted as a group reference.
        (let [^String rep (replacement (re-groups m))]
          (.appendReplacement m buffer
                              (java.util.regex.Matcher/quoteReplacement rep))
          (recur))
        (do (.appendTail m buffer)
            (.toString buffer))))))
rlm@10 194
(defmulti
  ^{:doc "Replaces the first instance of pattern in s with replacement.
  Returns s unchanged when pattern does not occur.

  Dispatches on the classes of pattern and replacement.  Allowed
  argument types for pattern and replacement are:
  1. String and String (both treated as literal text)
  2. regex Pattern and String
     (Uses java.util.regex.Matcher.replaceFirst, so $1, $2 ... in the
     replacement refer to capture groups.)
  3. regex Pattern and function
     (Calls function with re-groups of the first match and inserts its
     String return value literally.)
  "
    :arglists '([s pattern replacement])
    :tag String}
  replace-first
  (fn [s pattern replacement]
    [(class pattern) (class replacement)]))

(defmethod replace-first [String String] [^String s ^String pattern ^String replacement]
  ;; Pattern/quote returns a String, not a Pattern, so it must be compiled
  ;; before re-matcher can use it.  The replacement is quoted as well so
  ;; both arguments are literal text, like String/String `replace`.
  (.replaceFirst (re-matcher (Pattern/compile (Pattern/quote pattern)) s)
                 (java.util.regex.Matcher/quoteReplacement replacement)))

(defmethod replace-first [Pattern String] [^String s re replacement]
  (.replaceFirst (re-matcher re s) replacement))

(defmethod replace-first [Pattern clojure.lang.IFn] [^String s ^Pattern re f]
  (let [m (re-matcher re s)]
    (if (.find m)
      (let [buffer (StringBuffer. (.length s))
            ^String rep (f (re-groups m))]
        ;; Quote the function's result so $ and \ in it are literal.
        (.appendReplacement m buffer
                            (java.util.regex.Matcher/quoteReplacement rep))
        (.appendTail m buffer)
        (str buffer))
      ;; No match: return the input rather than nil.
      s)))
rlm@10 224
(defn partition
  "Breaks s into a lazy sequence that alternates between the text
  lying between matches of re and the matches themselves (as returned
  by re-groups).  The first element is always the text before the
  first match, which is the empty string when s begins with a match.
  Any text left after the last match is the final element.

  For example: (partition \"abc123def\" #\"[a-z]+\")
  returns: (\"\" \"abc\" \"123\" \"def\")"
  [^String s ^Pattern re]
  (let [m (re-matcher re s)
        len (.length s)
        step (fn step [from]
               (lazy-seq
                 (if (.find m)
                   ;; Read the gap, the match and the match end in order
                   ;; while the matcher still points at this match.
                   (let [gap (.subSequence s from (.start m))
                         hit (re-groups m)
                         nxt (.end m)]
                     (cons gap (cons hit (step nxt))))
                   (when (< from len)
                     (list (.subSequence s from len))))))]
    (step 0)))
rlm@10 245
(defn ^String join
  "Concatenates the elements of coll into one string, placing
  separator between adjacent elements.  Like Perl's join."
  [^String separator coll]
  (->> coll
       (interpose separator)
       (apply str)))
rlm@10 251
(defn ^String chop
  "Returns s without its final character; a zero-length string is
  returned as is."
  [^String s]
  (let [size (count s)]
    (if (pos? size)
      (subs s 0 (dec size))
      s)))
rlm@10 260
(defn ^String chomp
  "Strips every trailing newline \\n or return \\r character from the
  end of string.  Note: String.trim() is similar and faster."
  [^String s]
  ;; Equivalent to the Pattern/String case of `replace`: a regex replaceAll.
  (let [m (re-matcher #"[\r\n]+$" s)]
    (.replaceAll m "")))
rlm@10 266
(defn title-case
  "Placeholder for converting s to title case.  Not implemented:
  every call throws UnsupportedOperationException (a subclass of
  Exception, so handlers that catch Exception are unaffected)."
  [^String s]
  (throw (UnsupportedOperationException. "title-case not implemented yet")))
rlm@10 269
(defn ^String swap-case
  "Returns s with lower-case characters converted to upper case and
  all other characters converted to lower case.  Works code point by
  code point, so Unicode supplementary characters are handled.  The
  conversion goes through the locale-sensitive String.toUpperCase()
  and String.toLowerCase() methods."
  [^String s]
  (let [out (StringBuilder. (.length s))
        ;; one-slot scratch array used to turn a code point into a String
        ^"[I" cell (int-array 1)]
    (docodepoints [cp s]
      (aset-int cell 0 cp)
      ;; Character.toUpperCase ignores the locale while String.toUpperCase
      ;; honours it, hence the detour through a one-code-point String.
      (let [piece (String. cell 0 1)]
        (if (Character/isLowerCase cp)
          (.append out (.toUpperCase piece))
          (.append out (.toLowerCase piece)))))
    (.toString out)))
rlm@10 287
(defn ^String capitalize
  "Returns s with its first character upper-cased and every remaining
  character lower-cased."
  [^String s]
  (if (< (count s) 2)
    (.toUpperCase s)
    (let [^String head (subs s 0 1)
          ^String remainder (subs s 1)]
      (str (.toUpperCase head)
           (.toLowerCase remainder)))))
rlm@10 296
(defn ^String ltrim
  "Strips leading whitespace from string."
  [^String s]
  ;; Equivalent to the Pattern/String case of `replace`: a regex replaceAll.
  (let [m (re-matcher #"^\s+" s)]
    (.replaceAll m "")))
rlm@10 301
(defn ^String rtrim
  "Strips trailing whitespace from string."
  [^String s]
  ;; Equivalent to the Pattern/String case of `replace`: a regex replaceAll.
  (let [m (re-matcher #"\s+$" s)]
    (.replaceAll m "")))
rlm@10 306
(defn split-lines
  "Returns a seq of the lines of s, breaking on \\n or \\r\\n."
  [^String s]
  (-> #"\r?\n"
      (.split s)
      seq))
rlm@10 311
rlm@10 312 ;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
(defn ^String map-str
  "Calls f on every element of coll and concatenates the results into
  one String."
  [f coll]
  (->> coll
       (map f)
       (apply str)))
rlm@10 318
rlm@10 319 ;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
(defn grep
  "Keeps the elements of coll whose String representation (via str)
  contains a match for the regular expression re, as tested with
  re-find."
  [re coll]
  (let [matches? (fn [item] (re-find re (str item)))]
    (filter matches? coll)))
rlm@10 325
(defn partial
  "A variant of clojure.core/partial for functions whose primary
  argument comes first.

  Given f and the arguments that follow f's first argument, returns a
  new function.  That function's first argument is passed to f as its
  first argument, followed by the pre-supplied args and then any
  further arguments.

  Example: (str-utils2/partial str-utils2/take 2)
  ;;=> (fn [s] (str-utils2/take s 2))"
  [f & args]
  (fn [s & more]
    (let [remaining (concat args more)]
      (apply f (cons s remaining)))))
rlm@10 338
rlm@10 339
rlm@10 340 ;;; WRAPPERS
rlm@10 341
rlm@10 342 ;; The following functions are simple wrappers around java.lang.String
rlm@10 343 ;; functions. They are included here for completeness, and for use
rlm@10 344 ;; when mapping over a collection of strings.
rlm@10 345
(defn ^String upper-case
  "Returns string with every character converted to upper case
  (String.toUpperCase)."
  [^String s]
  (. s (toUpperCase)))
rlm@10 350
(defn ^String lower-case
  "Returns string with every character converted to lower case
  (String.toLowerCase)."
  [^String s]
  (. s (toLowerCase)))
rlm@10 355
(defn split
  "Returns a seq of the pieces of string s separated by matches of
  the regular expression re.  The optional limit is handed straight to
  java.util.regex.Pattern.split as its limit argument."
  ([^String s ^Pattern re]
   (-> re (.split s) seq))
  ([^String s ^Pattern re limit]
   (-> re (.split s limit) seq)))
rlm@10 361
(defn ^String trim
  "Returns string with leading and trailing whitespace removed
  (String.trim)."
  [^String s]
  (. s (trim)))
rlm@10 366
(defn contains?
  "True if s contains the substring, false otherwise.  Thin wrapper
  around String.contains."
  ;; The var is no longer tagged ^String: the function returns a boolean,
  ;; and the wrong tag tells the compiler to treat the result as a String
  ;; in interop calls on it.
  [^String s ^CharSequence substring]
  (.contains s substring))
rlm@10 371
(defn get
  "Gets the i'th character in string (zero-based).  Thin wrapper
  around String.charAt, which throws StringIndexOutOfBoundsException
  when i is outside the string."
  ;; The var is no longer tagged ^String: the function returns a
  ;; character, and the wrong tag tells the compiler to treat the result
  ;; as a String in interop calls on it.
  [^String s i]
  (.charAt s i))
rlm@10 376