Mercurial > coderloop
view src/top_five.clj @ 0:307a81e46071 tip
initial committ
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Tue, 18 Oct 2011 01:17:49 -0700 |
parents | |
children |
line wrap: on
line source
(ns coderloop.top-five)

(use 'rlm.shell-inspect)
(use 'clojure.contrib.profile)
(use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]])
(import '(java.nio ByteBuffer CharBuffer)
        '(java.io PushbackReader InputStream InputStreamReader
                  FileInputStream))

;; ^{:doc "Name of the default encoding to use when reading & writing.
;; Default is UTF-8."
;; :tag "java.lang.String"}
;; *default-encoding* "US-ASCII")
;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Sample input files used while developing at the REPL.
(def a (file-str "/home/r/coderloop-test/topfive-a.in"))
(def b (file-str "/home/r/coderloop-test/topfive-b.in"))
(def c (file-str "/home/r/coderloop-test/topfive-c.in"))
(def d (file-str "/home/r/coderloop-test/topfive-d.in"))
(def e (file-str "/home/r/coderloop-test/topfive-e.in"))
(def f (file-str "/home/r/coderloop-test/topfive-f.in"))

(defn get-query-slow
  "Regex-based query extraction: returns the text captured between
   \", query=\" and the closing \"]\" at the end of line s, or nil when
   the line does not match that shape."
  [s]
  (nth (re-matches #"^.*, query=(.*)]$" s) 1))

(defn ^String get-query
  "Returns the text that follows the last \"query=\" in s, excluding the
   final character of the line (the closing \"]\" in the log format that
   get-query-slow matches).

   Returns nil when s contains no \"query=\" marker or is too short to
   hold a value after it, instead of throwing
   StringIndexOutOfBoundsException or returning an unrelated slice."
  [^String s]
  (let [marker (.lastIndexOf s "query=")
        start  (clojure.core/+ marker 6)
        end    (clojure.core/- (.length s) 1)]
    ;; lastIndexOf yields -1 when the marker is absent; start > end means
    ;; nothing follows the marker except (at most) the closing character.
    (when (and (<= 0 marker) (<= start end))
      (.substring s start end))))

(defn chuncked-pmap
  "Helps with parallelization of functions that don't take very much
   time to execute: coll is split into chunks of n elements, each chunk
   is mapped eagerly with f inside a single pmap task, and the per-chunk
   results are concatenated back in order."
  [n f coll]
  (apply concat
         (pmap
          (fn [chunk]
            ;; doall forces the work to happen inside the pmap task.
            (doall (map f chunk)))
          (partition-all n coll))))

;; (def *main-map* (atom {}))

;; (declare parse-lines tally!)

;; (defn parse-lines [lines]
;;   (map get-query lines))

;; (defn tally! [state parsed-lines]
;;   (doseq [element parsed-lines]
;;     (bump! state element 1)))

;; (defn analyze-log [file]
;;   (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024)))
;;         state (atom {})]
;;     (dorun
;;      (pmap (fn [[idx [start end]]]
;;              (println (str "Chunk " idx "/" chunk-count
;;                            " (" start " -> " end ")"))
;;              (->> (read-lines-range file start end)
;;                   (parse-lines)
;;                   (tally! state)))
;;            (indexed (chunk-file file chunk-count))))
;;     state))

;; (defn preduce-full
;;   "a parallel reduce. Because it is parallel, the reducing
;;    function must make sense in a tree-reducing context"
;;   [num-chunks mapping-function f coll]
;;   (loop [prev coll]
;;     (let [new-coll
;;           (concat
;;            (mapping-function
;;             (partial apply f)
;;             (partition-all num-chunks prev)))]
;;       (if (not (next new-coll))
;;         (first new-coll)
;;         (recur new-coll)))))

;;; correctness before speed:
(defn top-five-0
  "Baseline implementation. Reads file with *default-encoding* bound to
   US-ASCII, counts queries two lines at a time, merges the partial
   counts with +, and returns up to five queries, most frequent first.
   Lines from which get-query extracts nothing are skipped."
  [file]
  (binding [*default-encoding* "US-ASCII"]
    (->> (read-lines file)
         (keep get-query)
         (partition-all 2)
         (map frequencies)
         (reduce (partial merge-with +))
         (sort-by second)
         reverse
         (take 5)
         (map first))))
;; "Elapsed time: 34440.153889 msecs"
;; "Elapsed time: 21072.141885 msecs"
;; "Elapsed time: 21041.711202 msecs"

;; now let's do speed:
(defn top-five-1
  "Like top-five-0 but counts in chunks of 800 lines and sorts by
   negated count instead of sorting and reversing. Returns up to five
   queries, most frequent first. Lines from which get-query extracts
   nothing are skipped."
  [file]
  (->> (read-lines file)
       (keep get-query)
       (partition-all 800)
       (map frequencies)
       (reduce (partial merge-with +))
       (sort-by (comp - second))
       (take 5)
       (map first)))

(defn top-five-3
  "Counts every query in file with a single call to frequencies and
   returns up to five queries, most frequent first. Lines from which
   get-query extracts nothing are skipped. This is the variant used by
   the command-line entry point at the bottom of the file."
  [file]
  (->> (read-lines file)
       (keep get-query)
       frequencies
       (sort-by (comp - second))
       (take 5)
       (map first)))

;; "Elapsed time: 12877.194194 msecs"
;; "Elapsed time: 12282.975191 msecs"
;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap.
;; still too slow....

;;; let's try the "make it all one big function" approach
;;(set! *warn-on-reflection* true)

(defn top-five-2
  "Counts queries in parallel (chunks of 800 lines via chuncked-pmap)
   into a single atom, then ranks once at the end. Returns up to five
   queries, most frequent first.

   Each increment is one atomic swap! that depends only on the value
   swap! passes in, so concurrent workers cannot lose updates. Ranking
   after counting replaces the earlier incrementally maintained top-5
   vector, which was only re-sorted on insertion (so its first slot
   stopped being the minimum once a member's count grew) and which
   returned placeholder keywords when the file held fewer than five
   distinct queries."
  [file]
  (let [counts (atom {})
        tally! (fn [line]
                 (when-let [query (get-query line)]
                   (swap! counts update-in [query] (fnil inc 0))))]
    (dorun (chuncked-pmap 800 tally! (read-lines file)))
    (->> @counts
         (sort-by (comp - second))
         (take 5)
         (map first))))
;; "Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap
;; (timing recorded for the original incremental top-5 implementation)

;; Script entry point: print the five most frequent queries of the file
;; named by the first command-line argument, one per line, then exit.
(when (command-line?)
  (doseq [query (top-five-3 (file-str (first *command-line-args*)))]
    (println query))
  (System/exit 0))