view src/top_five.clj @ 0:307a81e46071 tip

initial committ
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 01:17:49 -0700
parents
children
line wrap: on
line source
1 (ns coderloop.top-five)
4 (use 'rlm.shell-inspect)
5 (use 'clojure.contrib.profile)
6 (use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]])
7 (import '(java.nio ByteBuffer CharBuffer)
8 '(java.io PushbackReader InputStream InputStreamReader
9 FileInputStream))
13 ;; ^{:doc "Name of the default encoding to use when reading & writing.
14 ;; Default is UTF-8."
15 ;; :tag "java.lang.String"}
16 ;; *default-encoding* "US-ASCII")
17 ;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII")
19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Sample input files for the top-five problem, one var per data set.
;; All of them are absolute paths under /home/r/coderloop-test.
(def a
  (file-str "/home/r/coderloop-test/topfive-a.in"))

(def b
  (file-str "/home/r/coderloop-test/topfive-b.in"))

(def c
  (file-str "/home/r/coderloop-test/topfive-c.in"))

(def d
  (file-str "/home/r/coderloop-test/topfive-d.in"))

(def e
  (file-str "/home/r/coderloop-test/topfive-e.in"))

(def f
  (file-str "/home/r/coderloop-test/topfive-f.in"))
(defn get-query-slow
  "Regex-based extraction of the query text from a log line.
  The whole line has to look like `<anything>, query=<text>]`; the
  captured <text> is returned, or nil when the line does not match."
  [s]
  ;; re-matches yields [whole-match group-1] or nil; destructuring nil
  ;; simply binds query to nil.
  (let [[_ query] (re-matches #"^.*, query=(.*)]$" s)]
    query))
(defn #^java.lang.String get-query
  "Fast extraction of the query text from a log line.

  Returns everything between the last occurrence of `query=` and the
  final character of the line (the final character itself is dropped;
  it is expected to be the closing bracket of the record).

  Returns nil when the line contains no `query=` marker, or when nothing
  but the marker is left to cut from, so that malformed lines yield the
  same nil that get-query-slow yields instead of an arbitrary substring
  or a StringIndexOutOfBoundsException."
  [#^java.lang.String s]
  (let [marker (.lastIndexOf s "query=")
        ;; 6 is the length of the marker text itself
        start  (clojure.core/+ marker 6)
        end    (clojure.core/- (.length s) 1)]
    ;; lastIndexOf reports a missing marker as -1; without this guard the
    ;; substring would start at index 5 and return unrelated text.
    (when (and (clojure.core/>= marker 0)
               (clojure.core/<= start end))
      (.substring s start end))))
(defn chuncked-pmap
  "Parallel map for functions that are cheap to call.
  coll is cut into chunks of at most n items; every chunk is mapped with
  f eagerly inside a single pmap task, and the per-chunk results are
  concatenated back together in their original order."
  [n f coll]
  (let [map-chunk (fn [chunk] (doall (map f chunk)))]
    (->> coll
         (partition-all n)
         (pmap map-chunk)
         (apply concat))))
46 ;; (def *main-map* (atom {}))
48 ;; (declare parse-lines tally!)
50 ;; (defn parse-lines [lines]
51 ;; (map get-query lines))
53 ;; (defn tally! [state parsed-lines]
54 ;; (doseq [element parsed-lines]
55 ;; (bump! state element 1)))
58 ;; (defn analyze-log [file]
59 ;; (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024)))
60 ;; state (atom {})]
61 ;; (dorun
62 ;; (pmap (fn [[idx [start end]]]
63 ;; (println (str "Chunk " idx "/" chunk-count
64 ;; " (" start " -> " end ")"))
65 ;; (->> (read-lines-range file start end)
66 ;; (parse-lines)
67 ;; (tally! state)))
68 ;; (indexed (chunk-file file chunk-count))))
69 ;; state))
74 ;; (defn preduce-full
75 ;; "a parallel reduce. Because it is parallel, the reducing
76 ;; function must make sense in a tree-reducing context"
77 ;; [num-chunks mapping-function f coll]
78 ;; (loop [prev coll]
79 ;; (let [new-coll
80 ;; (concat
81 ;; (mapping-function
82 ;; (partial apply f)
83 ;; (partition-all num-chunks prev)))]
84 ;; (if (not (next new-coll))
85 ;; (first new-coll)
86 ;; (recur new-coll)))))
92 ;;; correctness before speed:
(defn top-five-0
  "Reference implementation: returns the (up to) five most frequent
  queries found in file, most frequent first.
  The file is read with *default-encoding* bound to US-ASCII. Lines are
  tallied two at a time with frequencies and the partial tallies are
  merged by addition."
  [file]
  (binding [*default-encoding* "US-ASCII"]
    (let [tallies (->> (read-lines file)
                       (map get-query)
                       (partition-all 2)
                       (map frequencies)
                       (reduce (partial merge-with +)))
          ;; ascending sort by count, then flipped so the largest is first
          ranked  (reverse (sort-by second tallies))]
      (map first (take 5 ranked)))))
106 "Elapsed time: 34440.153889 msecs"
107 "Elapsed time: 21072.141885 msecs"
108 "Elapsed time: 21041.711202 msecs"
109 ;; now let's do speed:
(defn top-five-1
  "Returns the (up to) five most frequent queries found in file, most
  frequent first. Lines are tallied in batches of 800 with frequencies
  and the per-batch tallies are merged by addition."
  [file]
  (let [tallies (->> (read-lines file)
                     (map get-query)
                     (partition-all 800)
                     (map frequencies)
                     (reduce (partial merge-with +)))]
    (->> tallies
         ;; negate the count so the ascending sort puts the largest first
         (sort-by (comp - second))
         (take 5)
         (map first))))
(defn top-five-3
  "Returns the (up to) five most frequent queries found in file, most
  frequent first, using a single frequencies pass over every line."
  [file]
  (->> (read-lines file)
       (map get-query)
       (frequencies)
       ;; negate the count so the ascending sort puts the largest first
       (sort-by (comp - second))
       (take 5)
       (map first)))
133 "Elapsed time: 12877.194194 msecs"
134 "Elapsed time: 12282.975191 msecs"
135 ;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap.
136 ;; still too slow....
138 ;;; let's try the "make it all one big function" approach
139 ;;(set! *warn-on-reflection* true)
(defn top-five-2
  "Single-pass variant: returns the (up to) five most frequent queries
  found in file, most frequent first.

  Query extraction runs in parallel through chuncked-pmap (batches of
  800 lines); the tally itself is a sequential reduce over the extracted
  queries, so the running counts and the current leaders are always
  updated together and no shared mutable state is involved.

  When the file holds fewer than five distinct queries only those
  queries are returned."
  [file]
  (letfn [(admit [counts top query]
            ;; top holds at most five queries with the highest counts so
            ;; far. Only query's count changed in this step, so query is
            ;; the only candidate that can newly enter top.
            (cond
              (some #(= query %) top) top
              (< (count top) 5)       (conj top query)
              :else
              (let [weakest (apply min-key #(get counts % 0) top)]
                ;; Evict the member that currently has the lowest count,
                ;; wherever it sits in the vector.
                (if (> (get counts query 0) (get counts weakest 0))
                  (replace {weakest query} top)
                  top))))
          (tally [[counts top] query]
            (let [counts (assoc counts query (inc (get counts query 0)))]
              [counts (admit counts top query)]))]
    (let [[counts top] (reduce tally
                               [{} []]
                               (chuncked-pmap 800 get-query (read-lines file)))]
      ;; ascending sort by final count, then flipped: most frequent first
      (reverse (sort-by #(get counts % 0) top)))))
163 "Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap
;; Script entry point: when run from the command line, print the top
;; five queries of the file named by the first argument (one per line)
;; and exit with status 0.
(when (command-line?)
  (doseq [query (top-five-3 (file-str (first *command-line-args*)))]
    (println query))
  (System/exit 0))