Mercurial > coderloop
diff src/top_five.clj @ 0:307a81e46071 tip
initial committ
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Tue, 18 Oct 2011 01:17:49 -0700 |
parents | |
children |
line wrap: on
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/src/top_five.clj Tue Oct 18 01:17:49 2011 -0700 1.3 @@ -0,0 +1,171 @@ 1.4 +(ns coderloop.top-five) 1.5 + 1.6 + 1.7 +(use 'rlm.shell-inspect) 1.8 +(use 'clojure.contrib.profile) 1.9 +(use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]]) 1.10 +(import '(java.nio ByteBuffer CharBuffer) 1.11 + '(java.io PushbackReader InputStream InputStreamReader 1.12 + FileInputStream)) 1.13 + 1.14 + 1.15 + 1.16 +;; ^{:doc "Name of the default encoding to use when reading & writing. 1.17 +;; Default is UTF-8." 1.18 +;; :tag "java.lang.String"} 1.19 +;; *default-encoding* "US-ASCII") 1.20 +;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII") 1.21 + 1.22 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1.23 + 1.24 + (def a (file-str "/home/r/coderloop-test/topfive-a.in")) 1.25 + (def b (file-str "/home/r/coderloop-test/topfive-b.in")) 1.26 + (def c (file-str "/home/r/coderloop-test/topfive-c.in")) 1.27 + (def d (file-str "/home/r/coderloop-test/topfive-d.in")) 1.28 + (def e (file-str "/home/r/coderloop-test/topfive-e.in")) 1.29 + (def f (file-str "/home/r/coderloop-test/topfive-f.in")) 1.30 + 1.31 +(defn get-query-slow [s] 1.32 + (nth (re-matches #"^.*, query=(.*)]$" s) 1)) 1.33 + 1.34 +(defn #^java.lang.String get-query [#^java.lang.String s] 1.35 + (.substring s (clojure.core/+ (.lastIndexOf s "query=") 6) (clojure.core/- (.length s) 1))) 1.36 + 1.37 + 1.38 +(defn chuncked-pmap 1.39 + "helps with paralleization of functions that don't take 1.40 + very much time to execute" 1.41 + [n f coll] 1.42 + (apply concat 1.43 + (pmap 1.44 + (fn [coll] 1.45 + (doall (map f coll))) 1.46 + (partition-all n coll)))) 1.47 + 1.48 + 1.49 +;; (def *main-map* (atom {})) 1.50 + 1.51 +;; (declare parse-lines tally!) 1.52 + 1.53 +;; (defn parse-lines [lines] 1.54 +;; (map get-query lines)) 1.55 + 1.56 +;; (defn tally! [state parsed-lines] 1.57 +;; (doseq [element parsed-lines] 1.58 +;; (bump! state element 1))) 1.59 + 1.60 + 1.61 +;; (defn analyze-log [file] 1.62 +;; (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024))) 1.63 +;; state (atom {})] 1.64 +;; (dorun 1.65 +;; (pmap (fn [[idx [start end]]] 1.66 +;; (println (str "Chunk " idx "/" chunk-count 1.67 +;; " (" start " -> " end ")")) 1.68 +;; (->> (read-lines-range file start end) 1.69 +;; (parse-lines) 1.70 +;; (tally! state))) 1.71 +;; (indexed (chunk-file file chunk-count)))) 1.72 +;; state)) 1.73 + 1.74 + 1.75 + 1.76 + 1.77 +;; (defn preduce-full 1.78 +;; "a parallel reduce. Because it is parallel, the reducing 1.79 +;; function must make sense in a tree-reducing context" 1.80 +;; [num-chunks mapping-function f coll] 1.81 +;; (loop [prev coll] 1.82 +;; (let [new-coll 1.83 +;; (concat 1.84 +;; (mapping-function 1.85 +;; (partial apply f) 1.86 +;; (partition-all num-chunks prev)))] 1.87 +;; (if (not (next new-coll)) 1.88 +;; (first new-coll) 1.89 +;; (recur new-coll))))) 1.90 + 1.91 + 1.92 + 1.93 + 1.94 + 1.95 +;;; correctness before speed: 1.96 +(defn top-five-0 [file] 1.97 + (binding [*default-encoding* "US-ASCII"] 1.98 + 1.99 + (map 1.100 + first 1.101 + (take 1.102 + 5 1.103 + (reverse 1.104 + (sort-by second 1.105 + (reduce 1.106 + (partial merge-with +) 1.107 + (map frequencies 1.108 + (partition-all 2 (map get-query (read-lines file))))))))))) 1.109 +"Elapsed time: 34440.153889 msecs" 1.110 +"Elapsed time: 21072.141885 msecs" 1.111 +"Elapsed time: 21041.711202 msecs" 1.112 +;; now let's do speed: 1.113 +(defn top-five-1 [file] 1.114 + (map 1.115 + first 1.116 + (take 1.117 + 5 1.118 + (sort-by (comp - second) 1.119 + (reduce 1.120 + (partial merge-with +) 1.121 + (map frequencies 1.122 + (partition-all 1.123 + 800 (map get-query (read-lines file))))))))) 1.124 + 1.125 + 1.126 +(defn top-five-3 [file] 1.127 + (map 1.128 + first 1.129 + (take 5 1.130 + (sort-by (comp - second) 1.131 + (frequencies 1.132 + (map get-query (read-lines file))))))) 1.133 + 1.134 + 1.135 + 1.136 +"Elapsed time: 12877.194194 msecs" 1.137 +"Elapsed time: 12282.975191 msecs" 1.138 +;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap. 1.139 +;; still too slow.... 1.140 + 1.141 +;;; let's try the "make it all one big function" approach 1.142 +;;(set! *warn-on-reflection* true) 1.143 + 1.144 +(defn top-five-2 [file] 1.145 + 1.146 + (let [*top-5* (atom (vector :a :b :c :d :e)) 1.147 + *data* (atom {})] 1.148 + 1.149 + (letfn [(top-ten-helper 1.150 + [line] 1.151 + (let [query (get-query line)] 1.152 + (swap! *data* #(assoc % query (inc (get @*data* query 0)))) 1.153 + (if (> (@*data* query 0) (get @*data* (nth @*top-5* 0) 0)) 1.154 + (do 1.155 + 1.156 + (if-not (contains? (set @*top-5*) query) 1.157 + (do 1.158 + (swap! *top-5* #(assoc % 0 query)) 1.159 + (swap! *top-5* 1.160 + (fn [v] (vec (sort-by #(get @*data* % 0) v))))))))))] 1.161 + 1.162 + (dorun (chuncked-pmap 800 top-ten-helper (read-lines file))) 1.163 + (swap! *top-5* 1.164 + (fn [v] (vec (sort-by #(get @*data* % 0) v)))) 1.165 + (reverse @*top-5*)))) 1.166 +"Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap 1.167 + 1.168 + 1.169 + 1.170 + 1.171 +(if (command-line?) 1.172 + (do 1.173 + (dorun (map println (top-five-3 (file-str (first *command-line-args*))))) 1.174 + (System/exit 0)))