Mercurial > coderloop
comparison src/top_five.clj @ 0:307a81e46071 tip
initial committ
author | Robert McIntyre <rlm@mit.edu> |
---|---|
date | Tue, 18 Oct 2011 01:17:49 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:307a81e46071 |
---|---|
1 (ns coderloop.top-five) | |
2 | |
3 | |
4 (use 'rlm.shell-inspect) | |
5 (use 'clojure.contrib.profile) | |
6 (use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]]) | |
7 (import '(java.nio ByteBuffer CharBuffer) | |
8 '(java.io PushbackReader InputStream InputStreamReader | |
9 FileInputStream)) | |
10 | |
11 | |
12 | |
13 ;; ^{:doc "Name of the default encoding to use when reading & writing. | |
14 ;; Default is UTF-8." | |
15 ;; :tag "java.lang.String"} | |
16 ;; *default-encoding* "US-ASCII") | |
17 ;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII") | |
18 | |
19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
20 | |
;; Sample input logs used when exercising the top-five implementations
;; from the REPL.  The paths are machine-specific.
(def a (file-str "/home/r/coderloop-test/topfive-a.in"))
(def b (file-str "/home/r/coderloop-test/topfive-b.in"))
(def c (file-str "/home/r/coderloop-test/topfive-c.in"))
(def d (file-str "/home/r/coderloop-test/topfive-d.in"))
(def e (file-str "/home/r/coderloop-test/topfive-e.in"))
(def f (file-str "/home/r/coderloop-test/topfive-f.in"))
27 | |
(defn get-query-slow
  "Regex-based extraction of the query text from a log line.
  Returns the captured group of the pattern, or nil when the line does
  not match it."
  [s]
  ;; re-matches yields [whole-match group] on success and nil otherwise;
  ;; destructuring nil simply binds `query` to nil.
  (let [[_ query] (re-matches #"^.*, query=(.*)]$" s)]
    query))
30 | |
(defn ^java.lang.String get-query
  "Fast, regex-free extraction of the query text from a log line.

  Returns the text between the last \"query=\" in `s` and the final
  character of `s` (which is dropped, as the closing bracket of the
  record).  Returns nil when `s` contains no \"query=\" marker or when
  nothing follows the marker, mirroring the nil that `get-query-slow`
  gives for a line that does not match."
  [^java.lang.String s]
  (let [marker "query="
        idx    (.lastIndexOf s marker)
        ;; Arithmetic stays fully qualified, as in the original code.
        start  (clojure.core/+ idx (.length marker))
        end    (clojure.core/- (.length s) 1)]
    ;; lastIndexOf answers -1 for a missing marker; without this guard the
    ;; substring call would either throw or hand back unrelated text.
    (when (and (not (neg? idx))
               (clojure.core/<= start end))
      (.substring s start end))))
33 | |
34 | |
(defn chuncked-pmap
  "Like map, but parallelized over chunks of `n` elements: each chunk is
  mapped with `f` eagerly inside a single pmap task, and the per-chunk
  results are concatenated in order.  Helps with parallelization of
  functions that don't take very much time to execute."
  [n f coll]
  (->> (partition-all n coll)
       (pmap (fn [chunk]
               ;; force the whole chunk inside the worker
               (doall (map f chunk))))
       (apply concat)))
44 | |
45 | |
46 ;; (def *main-map* (atom {})) | |
47 | |
48 ;; (declare parse-lines tally!) | |
49 | |
50 ;; (defn parse-lines [lines] | |
51 ;; (map get-query lines)) | |
52 | |
53 ;; (defn tally! [state parsed-lines] | |
54 ;; (doseq [element parsed-lines] | |
55 ;; (bump! state element 1))) | |
56 | |
57 | |
58 ;; (defn analyze-log [file] | |
59 ;; (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024))) | |
60 ;; state (atom {})] | |
61 ;; (dorun | |
62 ;; (pmap (fn [[idx [start end]]] | |
63 ;; (println (str "Chunk " idx "/" chunk-count | |
64 ;; " (" start " -> " end ")")) | |
65 ;; (->> (read-lines-range file start end) | |
66 ;; (parse-lines) | |
67 ;; (tally! state))) | |
68 ;; (indexed (chunk-file file chunk-count)))) | |
69 ;; state)) | |
70 | |
71 | |
72 | |
73 | |
74 ;; (defn preduce-full | |
75 ;; "a parallel reduce. Because it is parallel, the reducing | |
76 ;; function must make sense in a tree-reducing context" | |
77 ;; [num-chunks mapping-function f coll] | |
78 ;; (loop [prev coll] | |
79 ;; (let [new-coll | |
80 ;; (concat | |
81 ;; (mapping-function | |
82 ;; (partial apply f) | |
83 ;; (partition-all num-chunks prev)))] | |
84 ;; (if (not (next new-coll)) | |
85 ;; (first new-coll) | |
86 ;; (recur new-coll))))) | |
87 | |
88 | |
89 | |
90 | |
91 | |
92 ;;; correctness before speed: | |
(defn top-five-0
  "Reference implementation: returns up to five queries from the log
  `file`, highest count first.  The file is read with *default-encoding*
  bound to US-ASCII; queries are tallied two lines at a time and the
  partial tallies are merged by addition."
  [file]
  (binding [*default-encoding* "US-ASCII"]
    (->> (read-lines file)
         (map get-query)
         (partition-all 2)
         (map frequencies)
         (reduce (partial merge-with +))
         ;; ascending by count, then flipped so the largest come first
         (sort-by second)
         reverse
         (take 5)
         (map first))))
106 "Elapsed time: 34440.153889 msecs" | |
107 "Elapsed time: 21072.141885 msecs" | |
108 "Elapsed time: 21041.711202 msecs" | |
109 ;; now let's do speed: | |
(defn top-five-1
  "Returns up to five queries from the log `file`, highest count first.
  Same approach as top-five-0 but tallies 800 lines per partial
  frequency map and sorts directly on the negated count."
  [file]
  (let [queries  (map get-query (read-lines file))
        partials (map frequencies (partition-all 800 queries))
        totals   (reduce (partial merge-with +) partials)]
    (->> totals
         (sort-by (comp - second))
         (take 5)
         (map first))))
121 | |
122 | |
(defn top-five-3
  "Returns up to five queries from the log `file`, highest count first,
  using a single frequencies pass over every extracted query."
  [file]
  (->> (read-lines file)
       (map get-query)
       frequencies
       ;; negate the count so the default ascending sort ranks descending
       (sort-by (comp - second))
       (take 5)
       (map first)))
130 | |
131 | |
132 | |
133 "Elapsed time: 12877.194194 msecs" | |
134 "Elapsed time: 12282.975191 msecs" | |
135 ;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap. | |
136 ;; still too slow.... | |
137 | |
138 ;;; let's try the "make it all one big function" approach | |
139 ;;(set! *warn-on-reflection* true) | |
140 | |
(defn top-five-2
  "Returns up to five queries from the log `file`, highest count first,
  counting lines in parallel via chuncked-pmap.

  Counts live in a single atom that is updated with one atomic
  read-modify-write per line.  The ranking is computed once, after all
  lines have been counted.  The earlier incrementally maintained top-5
  vector was dropped because it was only re-sorted on insertion (so its
  first slot was not reliably the minimum), it was updated with a
  non-atomic check-then-act across threads, and its placeholder entries
  leaked into the result when the log held fewer than five distinct
  queries."
  [file]
  (let [counts (atom {})
        tally! (fn [line]
                 ;; The update function works only on the value swap! passes
                 ;; in, so retries under contention stay consistent.
                 (swap! counts update-in [(get-query line)] (fnil inc 0)))]
    ;; dorun forces every chunk so all lines are counted before ranking.
    (dorun (chuncked-pmap 800 tally! (read-lines file)))
    (map first
         (take 5
               (sort-by (comp - second) @counts)))))
163 "Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap | |
164 | |
165 | |
166 | |
167 | |
;; Script entry point: when run from the command line, print the top five
;; queries of the file named by the first argument, one per line, then exit.
(when (command-line?)
  (doseq [query (top-five-3 (file-str (first *command-line-args*)))]
    (println query))
  (System/exit 0))