rlm@0
|
1 (ns coderloop.top-five)
|
rlm@0
|
2
|
rlm@0
|
3
|
rlm@0
|
4 (use 'rlm.shell-inspect)
|
rlm@0
|
5 (use 'clojure.contrib.profile)
|
rlm@0
|
6 (use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]])
|
rlm@0
|
7 (import '(java.nio ByteBuffer CharBuffer)
|
rlm@0
|
8 '(java.io PushbackReader InputStream InputStreamReader
|
rlm@0
|
9 FileInputStream))
|
rlm@0
|
10
|
rlm@0
|
11
|
rlm@0
|
12
|
rlm@0
|
13 ;; ^{:doc "Name of the default encoding to use when reading & writing.
|
rlm@0
|
14 ;; Default is UTF-8."
|
rlm@0
|
15 ;; :tag "java.lang.String"}
|
rlm@0
|
16 ;; *default-encoding* "US-ASCII")
|
rlm@0
|
17 ;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII")
|
rlm@0
|
18
|
rlm@0
|
19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
rlm@0
|
20
|
rlm@0
|
21 (def a (file-str "/home/r/coderloop-test/topfive-a.in"))
|
rlm@0
|
22 (def b (file-str "/home/r/coderloop-test/topfive-b.in"))
|
rlm@0
|
23 (def c (file-str "/home/r/coderloop-test/topfive-c.in"))
|
rlm@0
|
24 (def d (file-str "/home/r/coderloop-test/topfive-d.in"))
|
rlm@0
|
25 (def e (file-str "/home/r/coderloop-test/topfive-e.in"))
|
rlm@0
|
26 (def f (file-str "/home/r/coderloop-test/topfive-f.in"))
|
rlm@0
|
27
|
rlm@0
|
28 (defn get-query-slow [s]
|
rlm@0
|
29 (nth (re-matches #"^.*, query=(.*)]$" s) 1))
|
rlm@0
|
30
|
rlm@0
|
31 (defn #^java.lang.String get-query [#^java.lang.String s]
|
rlm@0
|
32 (.substring s (clojure.core/+ (.lastIndexOf s "query=") 6) (clojure.core/- (.length s) 1)))
|
rlm@0
|
33
|
rlm@0
|
34
|
rlm@0
|
35 (defn chuncked-pmap
|
rlm@0
|
36 "helps with paralleization of functions that don't take
|
rlm@0
|
37 very much time to execute"
|
rlm@0
|
38 [n f coll]
|
rlm@0
|
39 (apply concat
|
rlm@0
|
40 (pmap
|
rlm@0
|
41 (fn [coll]
|
rlm@0
|
42 (doall (map f coll)))
|
rlm@0
|
43 (partition-all n coll))))
|
rlm@0
|
44
|
rlm@0
|
45
|
rlm@0
|
46 ;; (def *main-map* (atom {}))
|
rlm@0
|
47
|
rlm@0
|
48 ;; (declare parse-lines tally!)
|
rlm@0
|
49
|
rlm@0
|
50 ;; (defn parse-lines [lines]
|
rlm@0
|
51 ;; (map get-query lines))
|
rlm@0
|
52
|
rlm@0
|
53 ;; (defn tally! [state parsed-lines]
|
rlm@0
|
54 ;; (doseq [element parsed-lines]
|
rlm@0
|
55 ;; (bump! state element 1)))
|
rlm@0
|
56
|
rlm@0
|
57
|
rlm@0
|
58 ;; (defn analyze-log [file]
|
rlm@0
|
59 ;; (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024)))
|
rlm@0
|
60 ;; state (atom {})]
|
rlm@0
|
61 ;; (dorun
|
rlm@0
|
62 ;; (pmap (fn [[idx [start end]]]
|
rlm@0
|
63 ;; (println (str "Chunk " idx "/" chunk-count
|
rlm@0
|
64 ;; " (" start " -> " end ")"))
|
rlm@0
|
65 ;; (->> (read-lines-range file start end)
|
rlm@0
|
66 ;; (parse-lines)
|
rlm@0
|
67 ;; (tally! state)))
|
rlm@0
|
68 ;; (indexed (chunk-file file chunk-count))))
|
rlm@0
|
69 ;; state))
|
rlm@0
|
70
|
rlm@0
|
71
|
rlm@0
|
72
|
rlm@0
|
73
|
rlm@0
|
74 ;; (defn preduce-full
|
rlm@0
|
75 ;; "a parallel reduce. Because it is parallel, the reducing
|
rlm@0
|
76 ;; function must make sense in a tree-reducing context"
|
rlm@0
|
77 ;; [num-chunks mapping-function f coll]
|
rlm@0
|
78 ;; (loop [prev coll]
|
rlm@0
|
79 ;; (let [new-coll
|
rlm@0
|
80 ;; (concat
|
rlm@0
|
81 ;; (mapping-function
|
rlm@0
|
82 ;; (partial apply f)
|
rlm@0
|
83 ;; (partition-all num-chunks prev)))]
|
rlm@0
|
84 ;; (if (not (next new-coll))
|
rlm@0
|
85 ;; (first new-coll)
|
rlm@0
|
86 ;; (recur new-coll)))))
|
rlm@0
|
87
|
rlm@0
|
88
|
rlm@0
|
89
|
rlm@0
|
90
|
rlm@0
|
91
|
rlm@0
|
92 ;;; correctness before speed:
|
rlm@0
|
93 (defn top-five-0 [file]
|
rlm@0
|
94 (binding [*default-encoding* "US-ASCII"]
|
rlm@0
|
95
|
rlm@0
|
96 (map
|
rlm@0
|
97 first
|
rlm@0
|
98 (take
|
rlm@0
|
99 5
|
rlm@0
|
100 (reverse
|
rlm@0
|
101 (sort-by second
|
rlm@0
|
102 (reduce
|
rlm@0
|
103 (partial merge-with +)
|
rlm@0
|
104 (map frequencies
|
rlm@0
|
105 (partition-all 2 (map get-query (read-lines file)))))))))))
|
rlm@0
|
106 "Elapsed time: 34440.153889 msecs"
|
rlm@0
|
107 "Elapsed time: 21072.141885 msecs"
|
rlm@0
|
108 "Elapsed time: 21041.711202 msecs"
|
rlm@0
|
109 ;; now let's do speed:
|
rlm@0
|
110 (defn top-five-1 [file]
|
rlm@0
|
111 (map
|
rlm@0
|
112 first
|
rlm@0
|
113 (take
|
rlm@0
|
114 5
|
rlm@0
|
115 (sort-by (comp - second)
|
rlm@0
|
116 (reduce
|
rlm@0
|
117 (partial merge-with +)
|
rlm@0
|
118 (map frequencies
|
rlm@0
|
119 (partition-all
|
rlm@0
|
120 800 (map get-query (read-lines file)))))))))
|
rlm@0
|
121
|
rlm@0
|
122
|
rlm@0
|
123 (defn top-five-3 [file]
|
rlm@0
|
124 (map
|
rlm@0
|
125 first
|
rlm@0
|
126 (take 5
|
rlm@0
|
127 (sort-by (comp - second)
|
rlm@0
|
128 (frequencies
|
rlm@0
|
129 (map get-query (read-lines file)))))))
|
rlm@0
|
130
|
rlm@0
|
131
|
rlm@0
|
132
|
rlm@0
|
133 "Elapsed time: 12877.194194 msecs"
|
rlm@0
|
134 "Elapsed time: 12282.975191 msecs"
|
rlm@0
|
135 ;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap.
|
rlm@0
|
136 ;; still too slow....
|
rlm@0
|
137
|
rlm@0
|
138 ;;; let's try the "make it all one big function" approach
|
rlm@0
|
139 ;;(set! *warn-on-reflection* true)
|
rlm@0
|
140
|
rlm@0
|
141 (defn top-five-2 [file]
|
rlm@0
|
142
|
rlm@0
|
143 (let [*top-5* (atom (vector :a :b :c :d :e))
|
rlm@0
|
144 *data* (atom {})]
|
rlm@0
|
145
|
rlm@0
|
146 (letfn [(top-ten-helper
|
rlm@0
|
147 [line]
|
rlm@0
|
148 (let [query (get-query line)]
|
rlm@0
|
149 (swap! *data* #(assoc % query (inc (get @*data* query 0))))
|
rlm@0
|
150 (if (> (@*data* query 0) (get @*data* (nth @*top-5* 0) 0))
|
rlm@0
|
151 (do
|
rlm@0
|
152
|
rlm@0
|
153 (if-not (contains? (set @*top-5*) query)
|
rlm@0
|
154 (do
|
rlm@0
|
155 (swap! *top-5* #(assoc % 0 query))
|
rlm@0
|
156 (swap! *top-5*
|
rlm@0
|
157 (fn [v] (vec (sort-by #(get @*data* % 0) v))))))))))]
|
rlm@0
|
158
|
rlm@0
|
159 (dorun (chuncked-pmap 800 top-ten-helper (read-lines file)))
|
rlm@0
|
160 (swap! *top-5*
|
rlm@0
|
161 (fn [v] (vec (sort-by #(get @*data* % 0) v))))
|
rlm@0
|
162 (reverse @*top-5*))))
|
rlm@0
|
163 "Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap
|
rlm@0
|
164
|
rlm@0
|
165
|
rlm@0
|
166
|
rlm@0
|
167
|
rlm@0
|
168 (if (command-line?)
|
rlm@0
|
169 (do
|
rlm@0
|
170 (dorun (map println (top-five-3 (file-str (first *command-line-args*)))))
|
rlm@0
|
171 (System/exit 0)))
|