annotate src/top_five.clj @ 0:307a81e46071 tip

initial committ
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 01:17:49 -0700
parents
children
rev   line source
rlm@0 1 (ns coderloop.top-five)
rlm@0 2
rlm@0 3
rlm@0 4 (use 'rlm.shell-inspect)
rlm@0 5 (use 'clojure.contrib.profile)
rlm@0 6 (use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]])
rlm@0 7 (import '(java.nio ByteBuffer CharBuffer)
rlm@0 8 '(java.io PushbackReader InputStream InputStreamReader
rlm@0 9 FileInputStream))
rlm@0 10
rlm@0 11
rlm@0 12
rlm@0 13 ;; ^{:doc "Name of the default encoding to use when reading & writing.
rlm@0 14 ;; Default is UTF-8."
rlm@0 15 ;; :tag "java.lang.String"}
rlm@0 16 ;; *default-encoding* "US-ASCII")
rlm@0 17 ;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII")
rlm@0 18
rlm@0 19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
rlm@0 20
rlm@0 21 (def a (file-str "/home/r/coderloop-test/topfive-a.in"))
rlm@0 22 (def b (file-str "/home/r/coderloop-test/topfive-b.in"))
rlm@0 23 (def c (file-str "/home/r/coderloop-test/topfive-c.in"))
rlm@0 24 (def d (file-str "/home/r/coderloop-test/topfive-d.in"))
rlm@0 25 (def e (file-str "/home/r/coderloop-test/topfive-e.in"))
rlm@0 26 (def f (file-str "/home/r/coderloop-test/topfive-f.in"))
rlm@0 27
rlm@0 28 (defn get-query-slow [s]
rlm@0 29 (nth (re-matches #"^.*, query=(.*)]$" s) 1))
rlm@0 30
rlm@0 31 (defn #^java.lang.String get-query [#^java.lang.String s]
rlm@0 32 (.substring s (clojure.core/+ (.lastIndexOf s "query=") 6) (clojure.core/- (.length s) 1)))
rlm@0 33
rlm@0 34
rlm@0 35 (defn chuncked-pmap
rlm@0 36 "helps with paralleization of functions that don't take
rlm@0 37 very much time to execute"
rlm@0 38 [n f coll]
rlm@0 39 (apply concat
rlm@0 40 (pmap
rlm@0 41 (fn [coll]
rlm@0 42 (doall (map f coll)))
rlm@0 43 (partition-all n coll))))
rlm@0 44
rlm@0 45
rlm@0 46 ;; (def *main-map* (atom {}))
rlm@0 47
rlm@0 48 ;; (declare parse-lines tally!)
rlm@0 49
rlm@0 50 ;; (defn parse-lines [lines]
rlm@0 51 ;; (map get-query lines))
rlm@0 52
rlm@0 53 ;; (defn tally! [state parsed-lines]
rlm@0 54 ;; (doseq [element parsed-lines]
rlm@0 55 ;; (bump! state element 1)))
rlm@0 56
rlm@0 57
rlm@0 58 ;; (defn analyze-log [file]
rlm@0 59 ;; (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024)))
rlm@0 60 ;; state (atom {})]
rlm@0 61 ;; (dorun
rlm@0 62 ;; (pmap (fn [[idx [start end]]]
rlm@0 63 ;; (println (str "Chunk " idx "/" chunk-count
rlm@0 64 ;; " (" start " -> " end ")"))
rlm@0 65 ;; (->> (read-lines-range file start end)
rlm@0 66 ;; (parse-lines)
rlm@0 67 ;; (tally! state)))
rlm@0 68 ;; (indexed (chunk-file file chunk-count))))
rlm@0 69 ;; state))
rlm@0 70
rlm@0 71
rlm@0 72
rlm@0 73
rlm@0 74 ;; (defn preduce-full
rlm@0 75 ;; "a parallel reduce. Because it is parallel, the reducing
rlm@0 76 ;; function must make sense in a tree-reducing context"
rlm@0 77 ;; [num-chunks mapping-function f coll]
rlm@0 78 ;; (loop [prev coll]
rlm@0 79 ;; (let [new-coll
rlm@0 80 ;; (concat
rlm@0 81 ;; (mapping-function
rlm@0 82 ;; (partial apply f)
rlm@0 83 ;; (partition-all num-chunks prev)))]
rlm@0 84 ;; (if (not (next new-coll))
rlm@0 85 ;; (first new-coll)
rlm@0 86 ;; (recur new-coll)))))
rlm@0 87
rlm@0 88
rlm@0 89
rlm@0 90
rlm@0 91
rlm@0 92 ;;; correctness before speed:
rlm@0 93 (defn top-five-0 [file]
rlm@0 94 (binding [*default-encoding* "US-ASCII"]
rlm@0 95
rlm@0 96 (map
rlm@0 97 first
rlm@0 98 (take
rlm@0 99 5
rlm@0 100 (reverse
rlm@0 101 (sort-by second
rlm@0 102 (reduce
rlm@0 103 (partial merge-with +)
rlm@0 104 (map frequencies
rlm@0 105 (partition-all 2 (map get-query (read-lines file)))))))))))
rlm@0 106 "Elapsed time: 34440.153889 msecs"
rlm@0 107 "Elapsed time: 21072.141885 msecs"
rlm@0 108 "Elapsed time: 21041.711202 msecs"
rlm@0 109 ;; now let's do speed:
rlm@0 110 (defn top-five-1 [file]
rlm@0 111 (map
rlm@0 112 first
rlm@0 113 (take
rlm@0 114 5
rlm@0 115 (sort-by (comp - second)
rlm@0 116 (reduce
rlm@0 117 (partial merge-with +)
rlm@0 118 (map frequencies
rlm@0 119 (partition-all
rlm@0 120 800 (map get-query (read-lines file)))))))))
rlm@0 121
rlm@0 122
rlm@0 123 (defn top-five-3 [file]
rlm@0 124 (map
rlm@0 125 first
rlm@0 126 (take 5
rlm@0 127 (sort-by (comp - second)
rlm@0 128 (frequencies
rlm@0 129 (map get-query (read-lines file)))))))
rlm@0 130
rlm@0 131
rlm@0 132
rlm@0 133 "Elapsed time: 12877.194194 msecs"
rlm@0 134 "Elapsed time: 12282.975191 msecs"
rlm@0 135 ;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap.
rlm@0 136 ;; still too slow....
rlm@0 137
rlm@0 138 ;;; let's try the "make it all one big function" approach
rlm@0 139 ;;(set! *warn-on-reflection* true)
rlm@0 140
rlm@0 141 (defn top-five-2 [file]
rlm@0 142
rlm@0 143 (let [*top-5* (atom (vector :a :b :c :d :e))
rlm@0 144 *data* (atom {})]
rlm@0 145
rlm@0 146 (letfn [(top-ten-helper
rlm@0 147 [line]
rlm@0 148 (let [query (get-query line)]
rlm@0 149 (swap! *data* #(assoc % query (inc (get @*data* query 0))))
rlm@0 150 (if (> (@*data* query 0) (get @*data* (nth @*top-5* 0) 0))
rlm@0 151 (do
rlm@0 152
rlm@0 153 (if-not (contains? (set @*top-5*) query)
rlm@0 154 (do
rlm@0 155 (swap! *top-5* #(assoc % 0 query))
rlm@0 156 (swap! *top-5*
rlm@0 157 (fn [v] (vec (sort-by #(get @*data* % 0) v))))))))))]
rlm@0 158
rlm@0 159 (dorun (chuncked-pmap 800 top-ten-helper (read-lines file)))
rlm@0 160 (swap! *top-5*
rlm@0 161 (fn [v] (vec (sort-by #(get @*data* % 0) v))))
rlm@0 162 (reverse @*top-5*))))
rlm@0 163 "Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap
rlm@0 164
rlm@0 165
rlm@0 166
rlm@0 167
rlm@0 168 (if (command-line?)
rlm@0 169 (do
rlm@0 170 (dorun (map println (top-five-3 (file-str (first *command-line-args*)))))
rlm@0 171 (System/exit 0)))