diff src/clojure/contrib/str_utils2.clj @ 10:ef7dbbd6452c

added clojure source goodness
author Robert McIntyre <rlm@mit.edu>
date Sat, 21 Aug 2010 06:25:44 -0400
parents
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/clojure/contrib/str_utils2.clj	Sat Aug 21 06:25:44 2010 -0400
     1.3 @@ -0,0 +1,376 @@
     1.4 +;;; str_utils2.clj -- functional string utilities for Clojure
     1.5 +
     1.6 +;; by Stuart Sierra, http://stuartsierra.com/
     1.7 +;; August 19, 2009
     1.8 +
     1.9 +;; Copyright (c) Stuart Sierra, 2009. All rights reserved.  The use
    1.10 +;; and distribution terms for this software are covered by the Eclipse
    1.11 +;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
    1.12 +;; which can be found in the file epl-v10.html at the root of this
    1.13 +;; distribution.  By using this software in any fashion, you are
    1.14 +;; agreeing to be bound by the terms of this license.  You must not
    1.15 +;; remove this notice, or any other, from this software.
    1.16 +
    1.17 +;; DEPRECATED in 1.2: Promoted to clojure.java.string. Note that
    1.18 +;; many function names and semantics have changed
    1.19 +
    1.20 +(ns ^{:author "Stuart Sierra"
    1.21 +      :deprecated "1.2"
    1.22 +      :doc "This is a library of string manipulation functions.  It
    1.23 +    is intended as a replacement for clojure.contrib.str-utils.
    1.24 +
    1.25 +    You cannot (use 'clojure.contrib.str-utils2) because it defines
    1.26 +    functions with the same names as functions in clojure.core.
    1.27 +    Instead, do (require '[clojure.contrib.str-utils2 :as s]) 
    1.28 +    or something similar.
    1.29 +
    1.30 +    Goals:
    1.31 +      1. Be functional
    1.32 +      2. String argument first, to work with ->
    1.33 +      3. Performance linear in string length
    1.34 +
    1.35 +    Some ideas are borrowed from
    1.36 +    http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
    1.37 + clojure.contrib.str-utils2
    1.38 + (:refer-clojure :exclude (take replace drop butlast partition
    1.39 +                           contains? get repeat reverse partial))
    1.40 + (:import (java.util.regex Pattern)))
    1.41 +
    1.42 +
    1.43 +(defmacro dochars
    1.44 +  "bindings => [name string]
    1.45 +
    1.46 +  Repeatedly executes body, with name bound to each character in
    1.47 +  string.  Does NOT handle Unicode supplementary characters (above
    1.48 +  U+FFFF)."
    1.49 +  [bindings & body]
    1.50 +  (assert (vector? bindings)) ; reject non-vector binding forms at expansion time
    1.51 +  (assert (= 2 (count bindings)))
    1.52 +  ;; This seems to be the fastest way to iterate over characters.
    1.53 +  `(let [^String s# ~(second bindings)]
    1.54 +     (dotimes [i# (.length s#)]
    1.55 +       (let [~(first bindings) (.charAt s# i#)]
    1.56 +         ~@body))))
    1.57 +
    1.58 +
    1.59 +(defmacro docodepoints
    1.60 +  "bindings => [name string]
    1.61 +
    1.62 +  Repeatedly executes body, with name bound to the integer code point
    1.63 +  of each Unicode character in the string.  Handles Unicode
    1.64 +  supplementary characters (above U+FFFF) correctly."
    1.65 +  [bindings & body]
    1.66 +  (assert (vector? bindings)) ; reject non-vector binding forms
    1.67 +  (assert (= 2 (count bindings)))
    1.68 +  (let [character (first bindings)
    1.69 +        string (second bindings)]
    1.70 +    `(let [^String s# ~string
    1.71 +           len# (.length s#)]
    1.72 +       (loop [i# 0]
    1.73 +         (when (< i# len#)
    1.74 +           ;; codePointAt yields the whole code point for a valid
    1.75 +           ;; surrogate pair and the single char value otherwise;
    1.76 +           ;; charCount is how many chars that code point spans, so
    1.77 +           ;; an unpaired high surrogate never skips its neighbour.
    1.78 +           (let [cp# (.codePointAt s# i#)
    1.79 +                 ~character cp#]
    1.80 +             ~@body
    1.81 +             (recur (+ i# (Character/charCount cp#)))))))))
    1.82 +
    1.83 +(defn codepoints
    1.84 +  "Returns a lazy sequence of integer Unicode code points in s.  Handles
    1.85 +  Unicode supplementary characters (above U+FFFF) correctly."
    1.86 +  [^String s]
    1.87 +  (let [len (.length s)
    1.88 +        ;; lazy-seq wraps every step, so long strings do not recurse deeply
    1.89 +        step (fn step [i]
    1.90 +               (lazy-seq
    1.91 +                (when (< i len)
    1.92 +                  (let [cp (.codePointAt s (int i))]
    1.93 +                    (cons cp (step (+ i (Character/charCount cp))))))))]
    1.94 +    (step 0)))
    1.95 +
    1.96 +(defn ^String escape
    1.97 +  "Builds a new String from s by passing each character to cmap (a
    1.98 +   map or a function).  A truthy result is appended in place of the
    1.99 +   character; otherwise the character itself is kept."
   1.100 +  [^String s cmap]
   1.101 +  (let [out (StringBuilder. (.length s))]
   1.102 +    (dotimes [idx (.length s)]
   1.103 +      (if-let [sub (cmap (.charAt s idx))]
   1.104 +        (.append out sub)
   1.105 +        (.append out (.charAt s idx))))
   1.106 +    (.toString out)))
   1.107 +
   1.108 +(defn blank?
   1.109 +  "Returns true when s is nil, the empty string, or whitespace only."
   1.110 +  [^String s]
   1.111 +  (not-any? (fn [^Character ch] (not (Character/isWhitespace ch))) s))
   1.112 +
   1.113 +(defn ^String take
   1.114 +  "Returns the first n characters of s, or all of s if it is shorter.
   1.115 +
   1.116 +  Unlike clojure.core/take the string is the first argument, so the
   1.117 +  function can be threaded with ->"
   1.118 +  [^String s n]
   1.119 +  (if-not (< (count s) n)
   1.120 +    (.substring s 0 n)
   1.121 +    s))
   1.122 +
   1.123 +(defn ^String drop
   1.124 +  "Returns s with its first n characters removed; the result is the
   1.125 +  empty string when n exceeds the length of s.
   1.126 +
   1.127 +  Unlike clojure.core/drop the string is the first argument, so the
   1.128 +  function can be threaded with ->"
   1.129 +  [^String s n]
   1.130 +  (if-not (< (count s) n)
   1.131 +    (.substring s n)
   1.132 +    ""))
   1.133 +
   1.134 +(defn ^String butlast
   1.135 +  "Returns s with its last n characters removed; the result is the
   1.136 +  empty string when n exceeds the length of s.
   1.137 +
   1.138 +  Unlike clojure.core/butlast the string is the first argument, so
   1.139 +  the function can be threaded with ->"
   1.140 +  [^String s n]
   1.141 +  (let [len (count s)]
   1.142 +    (cond (< len n) ""
   1.143 +          :else (.substring s 0 (- len n)))))
   1.144 +
   1.145 +(defn ^String tail
   1.146 +  "Returns the final n characters of s, or all of s if it is shorter."
   1.147 +  [^String s n]
   1.148 +  (let [len (count s)]
   1.149 +    (cond (< len n) s
   1.150 +          :else (.substring s (- len n)))))
   1.151 +
   1.152 +(defn ^String repeat
   1.153 +  "Returns a new String made of n copies of s placed end to end."
   1.154 +  [^String s n]
   1.155 +  (apply str (clojure.core/take n (clojure.core/repeat s))))
   1.156 +
   1.157 +(defn ^String reverse
   1.158 +  "Returns a String holding the characters of s in reverse order."
   1.159 +  [^String s]
   1.160 +  (-> (StringBuilder. s) .reverse .toString))
   1.161 +
   1.162 +(defmulti
   1.163 +  ^{:doc "Replaces every occurrence of pattern in string with replacement.
   1.164 +
   1.165 +  The accepted (pattern, replacement) type combinations are:
   1.166 +   1. String and String
   1.167 +   2. Character and Character
   1.168 +   3. regex Pattern and String
   1.169 +      (delegates to java.util.regex.Matcher.replaceAll)
   1.170 +   4. regex Pattern and function
   1.171 +      (the function receives re-groups of each match and its
   1.172 +       return value is used as the replacement.)"
   1.173 +     :arglists '([string pattern replacement])
   1.174 +     :tag String}
   1.175 +  replace
   1.176 +  (fn [_ pattern replacement]
   1.177 +    (vec (map class [pattern replacement]))))
   1.178 +
   1.179 +(defmethod replace [String String] [^String s ^String target ^String substitute]
   1.180 +  (. s replace target substitute))
   1.181 +
   1.182 +(defmethod replace [Character Character] [^String s ^Character from ^Character to]
   1.183 +  (.replace s (.charValue from) (.charValue to)))
   1.184 +
   1.185 +(defmethod replace [Pattern String] [^String s re replacement]
   1.186 +  (-> (re-matcher re s) (.replaceAll replacement)))
   1.187 +
   1.188 +(defmethod replace [Pattern clojure.lang.IFn] [^String s re replacement]
   1.189 +  ;; Feed each match's function result to appendReplacement, then append
   1.190 +  ;; whatever follows the last match.
   1.191 +  (let [m (re-matcher re s)
   1.192 +        out (StringBuffer. (.length s))]
   1.193 +    (while (.find m)
   1.194 +      (.appendReplacement m out (replacement (re-groups m))))
   1.195 +    (.appendTail m out)
   1.196 +    (.toString out)))
   1.197 +
   1.198 +(defmulti
   1.199 +  ^{:doc "Replaces the first instance of pattern in s with replacement.
   1.200 +
   1.201 +  Allowed argument types for pattern and replacement are:
   1.202 +   1. String and String
   1.203 +   2. regex Pattern and String
   1.204 +      (Uses java.util.regex.Matcher.replaceFirst)
   1.205 +   3. regex Pattern and function
   1.206 +      (Calls function with re-groups of the first match.)"
   1.207 +     :arglists '([s pattern replacement])
   1.208 +     :tag String}
   1.209 +  replace-first
   1.210 +  (fn [s pattern replacement]
   1.211 +    [(class pattern) (class replacement)]))
   1.212 +
   1.213 +(defmethod replace-first [String String] [^String s ^String pattern ^String replacement]
   1.214 +  ;; Pattern/quote returns a String, so it goes to String.replaceFirst; both sides literal.
   1.215 +  (.replaceFirst s (Pattern/quote pattern) (java.util.regex.Matcher/quoteReplacement replacement)))
   1.216 +
   1.217 +(defmethod replace-first [Pattern String] [^String s re replacement]
   1.218 +  (.replaceFirst (re-matcher re s) replacement))
   1.219 +
   1.220 +(defmethod replace-first [Pattern clojure.lang.IFn] [^String s ^Pattern re f]
   1.221 +  (let [m (re-matcher re s)]
   1.222 +    (if-not (.find m)
   1.223 +      s ; no match: return the input unchanged instead of nil
   1.224 +      (let [buffer (StringBuffer. (.length s))]
   1.225 +        (.appendReplacement m buffer (f (re-groups m)))
   1.226 +        (str (.appendTail m buffer))))))
   1.227 +
   1.228 +(defn partition
   1.229 +  "Returns a lazy sequence of substrings of s that alternates between
   1.230 +  the text lying between matches of re and the matches themselves.
   1.231 +  The first element is always the text preceding the first match,
   1.232 +  which is the empty string when s begins with a match.  Matches
   1.233 +  appear as returned by re-groups.
   1.234 +
   1.235 +  For example: (partition \"abc123def\" #\"[a-z]+\")
   1.236 +  returns: (\"\" \"abc\" \"123\" \"def\")"
   1.237 +  [^String s ^Pattern re]
   1.238 +  (let [m (re-matcher re s)
   1.239 +        total (.length s)
   1.240 +        walk (fn walk [from]
   1.241 +               (lazy-seq
   1.242 +                (if-not (.find m)
   1.243 +                  (when (< from total) (list (.subSequence s from total)))
   1.244 +                  (let [gap (.subSequence s from (.start m))
   1.245 +                        stop (.end m)]
   1.246 +                    (cons gap (cons (re-groups m) (walk stop)))))))]
   1.247 +    (walk 0)))
   1.248 +
   1.249 +(defn ^String join
   1.250 +  "Concatenates the elements of coll into one string, placing
   1.251 +  separator between neighbouring elements.  Like Perl's join."
   1.252 +  [^String separator coll]
   1.253 +  (apply str (rest (interleave (clojure.core/repeat separator) coll))))
   1.254 +
   1.255 +(defn ^String chop
   1.256 +  "Returns s without its final character; a zero-length string is
   1.257 +  returned as is."
   1.258 +  [^String s]
   1.259 +  (let [n (count s)]
   1.260 +    (if (pos? n)
   1.261 +      (subs s 0 (dec n))
   1.262 +      s)))
   1.263 +
   1.264 +(defn ^String chomp
   1.265 +  "Strips every trailing newline \\n and carriage return \\r from
   1.266 +  the end of s.  Note: String.trim() is similar and faster."
   1.267 +  [^String s]
   1.268 +  (-> s (replace #"[\r\n]+$" "")))
   1.269 +
   1.270 +(defn title-case "Not implemented yet; every call throws." [^String s]
   1.271 +  (throw (UnsupportedOperationException. "title-case not implemented yet")))
   1.272 +
   1.273 +(defn ^String swap-case
   1.274 +  "Returns s with lower-case characters made upper-case and all other
   1.275 +  characters made lower-case, one code point at a time via
   1.276 +  docodepoints.  Case conversion goes through String.toUpperCase()
   1.277 +  and String.toLowerCase()."
   1.278 +  [^String s]
   1.279 +  (let [out (StringBuilder. (.length s))
   1.280 +        ;; one-slot int array reused to turn a code point into a String
   1.281 +        ^ints cell (int-array 1)]
   1.282 +    (docodepoints [cp s]
   1.283 +      (aset-int cell 0 cp)
   1.284 +      ;; go through String: Character.toUpperCase is not locale-sensitive
   1.285 +      (let [single (String. cell 0 1)]
   1.286 +        (.append out (if (Character/isLowerCase cp)
   1.287 +                       (.toUpperCase single)
   1.288 +                       (.toLowerCase single)))))
   1.289 +    (.toString out)))
   1.290 +
   1.291 +(defn ^String capitalize
   1.292 +  "Returns s with its first character in upper case and every
   1.293 +  remaining character in lower case."
   1.294 +  [^String s]
   1.295 +  (if (< (count s) 2)
   1.296 +    (.toUpperCase s)
   1.297 +    (let [^String head (subs s 0 1), ^String more (subs s 1)]
   1.298 +      (str (.toUpperCase head) (.toLowerCase more)))))
   1.299 +
   1.300 +(defn ^String ltrim
   1.301 +  "Returns s with any leading whitespace removed."
   1.302 +  [^String s]
   1.303 +  (-> s (replace #"^\s+" "")))
   1.304 +
   1.305 +(defn ^String rtrim
   1.306 +  "Returns s with any trailing whitespace removed."
   1.307 +  [^String s]
   1.308 +  (-> s (replace #"\s+$" "")))
   1.309 +
   1.310 +(defn split-lines
   1.311 +  "Returns a seq of the pieces of s separated by \\n or \\r\\n."
   1.312 +  [^String s]
   1.313 +  (-> #"\r?\n" (.split s) seq))
   1.314 +
   1.315 +;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
   1.316 +(defn ^String map-str
   1.317 +  "Calls f on every element of coll and concatenates the results
   1.318 +  into a single String."
   1.319 +  [f coll]
   1.320 +  (apply str (for [item coll] (f item))))
   1.321 +
   1.322 +;; borrowed from compojure.str-utils, by James Reeves, EPL 1.0
   1.323 +(defn grep
   1.324 +  "Keeps the elements of coll whose String form (via str) contains
   1.325 +  a match for the regular expression re, as tested with re-find."
   1.326 +  [re coll]
   1.327 +  (for [item coll :when (re-find re (str item))] item))
   1.328 +
   1.329 +(defn partial
   1.330 +  "A variant of clojure.core/partial for functions whose primary
   1.331 +  argument comes first.
   1.332 +
   1.333 +  Given a function f and every argument EXCEPT the first, returns a
   1.334 +  new function.  Calling that function with s (and optionally more
   1.335 +  trailing arguments) invokes f with s, then args, then the rest.
   1.336 +
   1.337 +  Example: (str-utils2/partial str-utils2/take 2)
   1.338 +           ;;=> (fn [s] (str-utils2/take s 2))"
   1.339 +  [f & args]
   1.340 +  (fn [s & more] (apply f (cons s (concat args more)))))
   1.341 +
   1.342 +
   1.343 +;;; WRAPPERS
   1.344 +
   1.345 +;; The following functions are simple wrappers around java.lang.String
   1.346 +;; functions.  They are included here for completeness, and for use
   1.347 +;; when mapping over a collection of strings.
   1.348 +
   1.349 +(defn ^String upper-case
   1.350 +  "Returns s with every character converted to upper case."
   1.351 +  [^String s]
   1.352 +  (. s toUpperCase))
   1.353 +
   1.354 +(defn ^String lower-case
   1.355 +  "Returns s with every character converted to lower case."
   1.356 +  [^String s]
   1.357 +  (. s toLowerCase))
   1.358 +
   1.359 +(defn split
   1.360 +  "Splits s around matches of re.  The optional limit is handed
   1.361 +  unchanged to java.util.regex.Pattern.split."
   1.362 +  ([^String s ^Pattern re] (-> re (.split s) seq))
   1.363 +  ([^String s ^Pattern re limit] (-> re (.split s limit) seq)))
   1.364 +
   1.365 +(defn ^String trim
   1.366 +  "Returns s with whitespace stripped from its start and its end."
   1.367 +  [^String s]
   1.368 +  (. s trim))
   1.369 +
   1.370 +(defn ^Boolean contains? ; tagged Boolean: .contains yields true/false, not a String
   1.371 +  "True if s contains the substring."
   1.372 +  [^String s substring]
   1.373 +  (.contains s substring))
   1.374 +
   1.375 +(defn ^Character get ; tagged Character: .charAt yields a char, not a String
   1.376 +  "Gets the i'th character in string."
   1.377 +  [^String s i]
   1.378 +  (.charAt s i))
   1.379 +