;;; string.clj -- functional string utilities for Clojure

;; by Stuart Sierra, http://stuartsierra.com/
;; January 26, 2010

;; Copyright (c) Stuart Sierra, 2010. All rights reserved.  The use
;; and distribution terms for this software are covered by the Eclipse
;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php)
;; which can be found in the file epl-v10.html at the root of this
;; distribution.  By using this software in any fashion, you are
;; agreeing to be bound by the terms of this license.  You must not
;; remove this notice, or any other, from this software.

;; DEPRECATED in 1.2: Many functions have moved to clojure.string.

(ns ^{:author "Stuart Sierra"
      :doc "This is a library of string manipulation functions.  It
  is intended as a replacement for clojure.contrib.string.

  You cannot (use 'clojure.contrib.string) because it defines
  functions with the same names as functions in clojure.core.
  Instead, do (require '[clojure.contrib.string :as s])
  or something similar.

  Goals:
    1. Be functional
    2. Most significant argument LAST, to work with ->>
    3. At least O(n) performance for Strings of length n

  Some ideas are borrowed from
  http://github.com/francoisdevlin/devlinsf-clojure-utils/"}
  clojure.contrib.string
  (:refer-clojure :exclude (take replace drop butlast partition
                            contains? get repeat reverse partial))
  (:import (java.util.regex Matcher Pattern)))


(defmacro dochars
  "bindings => [name string]

  Repeatedly executes body, with name bound to each character in
  string.  Does NOT handle Unicode supplementary characters (above
  U+FFFF)."
  [bindings & body]
  ;; vector? (not vector, which is always truthy) so that a malformed
  ;; binding form is rejected at macroexpansion time.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  ;; This seems to be the fastest way to iterate over characters.
  `(let [^String s# ~(second bindings)]
     (dotimes [i# (.length s#)]
       (let [~(first bindings) (.charAt s# i#)]
         ~@body))))


(defmacro docodepoints
  "bindings => [name string]

  Repeatedly executes body, with name bound to the integer code point
  of each Unicode character in the string.  Handles Unicode
  supplementary characters (above U+FFFF) correctly."
  [bindings & body]
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [character (first bindings)
        string (second bindings)]
    `(let [^String s# ~string
           len# (.length s#)]
       (loop [i# 0]
         (when (< i# len#)
           (let [cp# (.codePointAt s# i#)]
             (let [~character cp#]
               ~@body)
             ;; Advance by the number of chars the code point really
             ;; occupies (1 or 2), so an unpaired high surrogate does
             ;; not swallow the character that follows it.
             (recur (+ i# (Character/charCount cp#)))))))))

(defn codepoints
  "Returns a lazy sequence of integer Unicode code points in s.  Handles
  Unicode supplementary characters (above U+FFFF) correctly."
  [^String s]
  (let [len (.length s)
        ;; Each step is wrapped in lazy-seq so that long strings are
        ;; walked lazily instead of by deep eager recursion.
        step (fn step [i]
               (lazy-seq
                 (when (< i len)
                   (let [cp (.codePointAt s (int i))]
                     (cons cp (step (+ i (Character/charCount cp))))))))]
    (step 0)))

(defn ^String escape
  "Returns a new String by applying cmap (a function or a map) to each
  character in s.  If cmap returns nil, the original character is
  added to the output unchanged."
  {:deprecated "1.2"}
  [cmap ^String s]
  (let [buffer (StringBuilder. (.length s))]
    (dochars [c s]
      (if-let [r (cmap c)]
        (.append buffer r)
        (.append buffer c)))
    (.toString buffer)))

(defn blank?
  "True if s is nil, empty, or contains only whitespace."
  {:deprecated "1.2"}
  [^String s]
  (every? (fn [^Character c] (Character/isWhitespace c)) s))

(defn ^String take
  "Take first n characters from s, up to the length of s."
  [n ^String s]
  (if (< (count s) n)
    s
    (.substring s 0 n)))

(defn ^String drop
  "Drops first n characters from s.  Returns an empty string if n is
  greater than the length of s."
  [n ^String s]
  (if (< (count s) n)
    ""
    (.substring s n)))

(defn ^String butlast
  "Returns s without the last n characters.  Returns an empty string
  if n is greater than the length of s."
  [n ^String s]
  (if (< (count s) n)
    ""
    (.substring s 0 (- (count s) n))))

(defn ^String tail
  "Returns the last n characters of s."
  [n ^String s]
  (if (< (count s) n)
    s
    (.substring s (- (count s) n))))

(defn ^String repeat
  "Returns a new String containing s repeated n times."
  [n ^String s]
  (apply str (clojure.core/repeat n s)))

(defn ^String reverse
  "Returns s with its characters reversed."
  {:deprecated "1.2"}
  [^String s]
  (.toString (.reverse (StringBuilder. s))))

(defn replace-str
  "Replaces all instances of substring a with b in s."
  {:deprecated "1.2"}
  [^String a ^String b ^String s]
  (.replace s a b))

(defn replace-char
  "Replaces all instances of character a with character b in s."
  {:deprecated "1.2"}
  [^Character a ^Character b ^String s]
  (.replace s a b))

(defn replace-re
  "Replaces all matches of re with replacement in s."
  {:deprecated "1.2"}
  [re replacement ^String s]
  (.replaceAll (re-matcher re s) replacement))

(defn replace-by
  "Replaces all matches of re in s with the result of
  (f (re-groups the-match)).  The result of f is inserted literally;
  '$' and '\\' in it have no special meaning."
  {:deprecated "1.2"}
  [re f ^String s]
  (let [m (re-matcher re s)
        buffer (StringBuffer. (.length s))]
    (loop []
      (if (.find m)
        (let [^String rep (f (re-groups m))]
          ;; quoteReplacement keeps '$' and '\' in rep from being
          ;; interpreted as group references by appendReplacement.
          (.appendReplacement m buffer (Matcher/quoteReplacement rep))
          (recur))
        (do (.appendTail m buffer)
            (.toString buffer))))))

(defn replace-first-str
  "Replace first occurrence of substring a with b in s.  Both a and b
  are treated as literal text."
  {:deprecated "1.2"}
  [^String a ^String b ^String s]
  ;; Pattern/quote only returns a quoted String; it must be compiled
  ;; into a Pattern before re-matcher can use it.
  (.replaceFirst (re-matcher (Pattern/compile (Pattern/quote a)) s)
                 (Matcher/quoteReplacement b)))

(defn replace-first-re
  "Replace first match of re in s."
  {:deprecated "1.2"}
  [^Pattern re ^String replacement ^String s]
  (.replaceFirst (re-matcher re s) replacement))

(defn replace-first-by
  "Replace first match of re in s with the result of
  (f (re-groups the-match)).  Returns s unchanged if re does not
  match.  The result of f is inserted literally."
  {:deprecated "1.2"}
  [^Pattern re f ^String s]
  (let [m (re-matcher re s)]
    (if (.find m)
      (let [buffer (StringBuffer. (.length s))
            ^String rep (f (re-groups m))]
        (.appendReplacement m buffer (Matcher/quoteReplacement rep))
        (.appendTail m buffer)
        (str buffer))
      ;; No match: hand back the input rather than nil.
      s)))

(defn partition
  "Splits the string into a lazy sequence of substrings, alternating
  between substrings that match the pattern and the substrings
  between the matches.  The sequence always starts with the substring
  before the first match, or an empty string if the beginning of the
  string matches.

  For example: (partition #\"[a-z]+\" \"abc123def\")
  returns: (\"\" \"abc\" \"123\" \"def\")"
  [^Pattern re ^String s]
  (let [m (re-matcher re s)]
    ((fn step [prevend]
       (lazy-seq
         (if (.find m)
           (cons (.subSequence s prevend (.start m))
                 (cons (re-groups m)
                       (step (.end m))))
           (when (< prevend (.length s))
             (list (.subSequence s prevend (.length s)))))))
     0)))

(defn ^String join
  "Returns a string of all elements in coll, separated by
  separator.  Like Perl's join."
  {:deprecated "1.2"}
  [^String separator coll]
  (apply str (interpose separator coll)))

(defn ^String chop
  "Removes the last character of string, does nothing on a zero-length
  string."
  [^String s]
  (let [size (count s)]
    (if (zero? size)
      s
      (subs s 0 (dec size)))))

(defn ^String chomp
  "Removes all trailing newline \\n or return \\r characters from
  string.  Note: String.trim() is similar and faster.
  Deprecated in 1.2. Use clojure.string/trim-newline"
  {:deprecated "1.2"}
  [^String s]
  (replace-re #"[\r\n]+$" "" s))

(defn ^String swap-case
  "Changes upper case characters to lower case and vice-versa.
  Handles Unicode supplementary characters correctly.  Uses the
  locale-sensitive String.toUpperCase() and String.toLowerCase()
  methods."
  [^String s]
  (let [buffer (StringBuilder. (.length s))
        ;; array to make a String from one code point
        ^"[I" array (make-array Integer/TYPE 1)]
    (docodepoints [c s]
      (aset-int array 0 c)
      (if (Character/isLowerCase c)
        ;; Character.toUpperCase is not locale-sensitive, but
        ;; String.toUpperCase is; so we use a String.
        (.append buffer (.toUpperCase (String. array 0 1)))
        (.append buffer (.toLowerCase (String. array 0 1)))))
    (.toString buffer)))

(defn ^String capitalize
  "Converts first character of the string to upper-case, all other
  characters to lower-case."
  {:deprecated "1.2"}
  [^String s]
  (if (< (count s) 2)
    (.toUpperCase s)
    (str (.toUpperCase ^String (subs s 0 1))
         (.toLowerCase ^String (subs s 1)))))

(defn ^String ltrim
  "Removes whitespace from the left side of string.
  Deprecated in 1.2. Use clojure.string/triml."
  {:deprecated "1.2"}
  [^String s]
  (replace-re #"^\s+" "" s))

(defn ^String rtrim
  "Removes whitespace from the right side of string.
  Deprecated in 1.2. Use clojure.string/trimr."
  {:deprecated "1.2"}
  [^String s]
  (replace-re #"\s+$" "" s))

(defn split-lines
  "Splits s on \\n or \\r\\n."
  {:deprecated "1.2"}
  [^String s]
  (seq (.split #"\r?\n" s)))

;; borrowed from compojure.string, by James Reeves, EPL 1.0
(defn ^String map-str
  "Apply f to each element of coll, concatenate all results into a
  String."
  [f coll]
  (apply str (map f coll)))

;; borrowed from compojure.string, by James Reeves, EPL 1.0
(defn grep
  "Filters elements of coll by a regular expression.  The String
  representation (with str) of each element is tested with re-find."
  [re coll]
  (filter (fn [x] (re-find re (str x))) coll))

(defn as-str
  "Like clojure.core/str, but if an argument is a keyword or symbol,
  its name will be used instead of its literal representation.

  Example:
     (str :foo :bar)     ;;=> \":foo:bar\"
     (as-str :foo :bar)  ;;=> \"foobar\"

  Note that this does not apply to keywords or symbols nested within
  data structures; they will be rendered as with str.

  Example:
     (str {:foo :bar})     ;;=> \"{:foo :bar}\"
     (as-str {:foo :bar})  ;;=> \"{:foo :bar}\" "
  ([] "")
  ([x] (if (instance? clojure.lang.Named x)
         (name x)
         (str x)))
  ([x & ys]
     ((fn [^StringBuilder sb more]
        (if more
          (recur (. sb (append (as-str (first more)))) (next more))
          (str sb)))
      (new StringBuilder ^String (as-str x)) ys)))


;;; WRAPPERS

;; The following functions are simple wrappers around java.lang.String
;; functions.  They are included here for completeness, and for use
;; when mapping over a collection of strings.

(defn ^String upper-case
  "Converts string to all upper-case."
  {:deprecated "1.2"}
  [^String s]
  (.toUpperCase s))

(defn ^String lower-case
  "Converts string to all lower-case."
  {:deprecated "1.2"}
  [^String s]
  (.toLowerCase s))

(defn split
  "Splits string on a regular expression.  Optional argument limit is
  the maximum number of splits."
  {:deprecated "1.2"}
  ([^Pattern re ^String s] (seq (.split re s)))
  ([^Pattern re limit ^String s] (seq (.split re s limit))))

(defn ^String trim
  "Removes whitespace from both ends of string."
  {:deprecated "1.2"}
  [^String s]
  (.trim s))

;; No ^String return tag here: the result is a boolean.
(defn substring?
  "True if s contains the substring."
  [substring ^String s]
  (.contains s substring))

;; No ^String return tag here: the result is a character.
(defn get
  "Gets the i'th character in string."
  {:deprecated "1.2"}
  [^String s i]
  (.charAt s i))