diff src/top_five.clj @ 0:307a81e46071 tip

initial committ
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 01:17:49 -0700
parents
children
line wrap: on
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/top_five.clj	Tue Oct 18 01:17:49 2011 -0700
     1.3 @@ -0,0 +1,171 @@
     1.4 +(ns coderloop.top-five)
     1.5 +
     1.6 +
     1.7 +(use 'rlm.shell-inspect)
     1.8 +(use 'clojure.contrib.profile)
     1.9 +(use '[clojure.contrib [duck-streams :only [file-str read-lines *default-encoding*]]])
    1.10 +(import '(java.nio ByteBuffer CharBuffer)
    1.11 +	'(java.io PushbackReader InputStream InputStreamReader
    1.12 +		  FileInputStream))
    1.13 +
    1.14 +
    1.15 +
    1.16 +;;  ^{:doc "Name of the default encoding to use when reading & writing.
    1.17 +;;   Default is UTF-8."
    1.18 +;;     :tag "java.lang.String"}
    1.19 +;;  *default-encoding* "US-ASCII")
    1.20 +;;(set! clojure.contrib.duck-streams/*default-encoding* "US-ASCII")
    1.21 +
    1.22 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    1.23 +
    1.24 + (def a (file-str "/home/r/coderloop-test/topfive-a.in"))
    1.25 + (def b (file-str "/home/r/coderloop-test/topfive-b.in"))
    1.26 + (def c (file-str "/home/r/coderloop-test/topfive-c.in"))
    1.27 + (def d (file-str "/home/r/coderloop-test/topfive-d.in"))
    1.28 + (def e (file-str "/home/r/coderloop-test/topfive-e.in"))
    1.29 + (def f (file-str "/home/r/coderloop-test/topfive-f.in"))
    1.30 +
    1.31 +(defn get-query-slow [s]
    1.32 +     (nth (re-matches #"^.*, query=(.*)]$" s) 1))
    1.33 +
    1.34 +(defn #^java.lang.String get-query [#^java.lang.String s]
    1.35 +   (.substring s (clojure.core/+ (.lastIndexOf s "query=") 6) (clojure.core/- (.length s) 1)))
    1.36 +
    1.37 +
    1.38 +(defn chuncked-pmap
    1.39 +  "helps with paralleization of functions that don't take
    1.40 +   very much time to execute"  
    1.41 +  [n f coll]
    1.42 +  (apply concat
    1.43 +   (pmap
    1.44 +    (fn [coll]
    1.45 +      (doall (map f coll)))
    1.46 +    (partition-all n coll))))
    1.47 +
    1.48 +
    1.49 +;; (def *main-map* (atom {}))
    1.50 +
    1.51 +;; (declare parse-lines tally!)
    1.52 +
    1.53 +;; (defn parse-lines [lines]
    1.54 +;;   (map get-query lines))
    1.55 +
    1.56 +;; (defn tally! [state parsed-lines]
    1.57 +;;   (doseq [element parsed-lines]
    1.58 +;;     (bump! state element 1)))
    1.59 +
    1.60 +
    1.61 +;; (defn analyze-log [file]
    1.62 +;;   (let [chunk-count (int (/ (.length #^java.io.File file) (* 32 1024 1024)))
    1.63 +;; 	state (atom {})]
    1.64 +;;     (dorun
    1.65 +;;      (pmap (fn [[idx [start end]]]
    1.66 +;; 	     (println (str "Chunk " idx "/" chunk-count
    1.67 +;; 			   " (" start " -> " end ")"))
    1.68 +;; 	     (->> (read-lines-range file start end)
    1.69 +;; 		  (parse-lines)
    1.70 +;; 		  (tally! state)))
    1.71 +;; 	   (indexed (chunk-file file chunk-count))))
    1.72 +;;     state))
    1.73 +
    1.74 +
    1.75 +
    1.76 +
    1.77 +;; (defn preduce-full
    1.78 +;;   "a parallel reduce. Because it is parallel, the reducing
    1.79 +;;    function must make sense in a tree-reducing context"
    1.80 +;;   [num-chunks mapping-function f coll]
    1.81 +;;   (loop [prev coll]
    1.82 +;;     (let [new-coll
    1.83 +;; 	  (concat
    1.84 +;; 	   (mapping-function 
    1.85 +;; 	    (partial apply f)
    1.86 +;; 	    (partition-all num-chunks prev)))]
    1.87 +;;       (if (not (next new-coll))
    1.88 +;; 	(first new-coll)
    1.89 +;; 	(recur new-coll)))))
    1.90 +
    1.91 +
    1.92 +
    1.93 +
    1.94 +
    1.95 +;;; correctness before speed:
    1.96 +(defn top-five-0 [file]
    1.97 +  (binding [*default-encoding* "US-ASCII"]
    1.98 +    
    1.99 +    (map
   1.100 +     first
   1.101 +     (take
   1.102 +      5
   1.103 +      (reverse
   1.104 +       (sort-by second
   1.105 +		(reduce
   1.106 +		 (partial merge-with +)
   1.107 +		 (map frequencies
   1.108 +		      (partition-all 2 (map get-query (read-lines file)))))))))))
   1.109 +"Elapsed time: 34440.153889 msecs"
   1.110 +"Elapsed time: 21072.141885 msecs"
   1.111 +"Elapsed time: 21041.711202 msecs"
   1.112 +;; now let's do speed:
   1.113 +(defn top-five-1 [file]
   1.114 +  (map
   1.115 +   first
   1.116 +   (take
   1.117 +    5
   1.118 +    (sort-by (comp - second)
   1.119 +	     (reduce
   1.120 +	      (partial merge-with +)
   1.121 +	      (map frequencies
   1.122 +		   (partition-all
   1.123 +		    800 (map get-query (read-lines file)))))))))
   1.124 +
   1.125 +
   1.126 +(defn top-five-3 [file]
   1.127 +  (map
   1.128 +   first
   1.129 +   (take 5
   1.130 +	 (sort-by (comp - second)
   1.131 +		  (frequencies
   1.132 +		   (map get-query (read-lines file)))))))
   1.133 +
   1.134 +
   1.135 +
   1.136 +"Elapsed time: 12877.194194 msecs"
   1.137 +"Elapsed time: 12282.975191 msecs"
   1.138 +;; got a factor of 2x increase in speed from a larger partition and chuncked-pmap.
   1.139 +;; still too slow....
   1.140 +
   1.141 +;;; let's try the "make it all one big function" approach
   1.142 +;;(set! *warn-on-reflection* true)
   1.143 +
   1.144 +(defn top-five-2 [file]
   1.145 +
   1.146 +  (let [*top-5* (atom (vector :a :b :c :d :e))
   1.147 +	*data* (atom {})]
   1.148 +    
   1.149 +    (letfn [(top-ten-helper
   1.150 +	     [line]
   1.151 +	     (let [query (get-query line)]
   1.152 +	       (swap! *data* #(assoc % query (inc (get @*data* query 0))))
   1.153 +	       (if (> (@*data* query 0) (get @*data* (nth @*top-5* 0) 0))
   1.154 +		 (do
   1.155 +
   1.156 +		   (if-not (contains? (set @*top-5*) query)
   1.157 +		     (do 
   1.158 +		       (swap! *top-5* #(assoc % 0 query))
   1.159 +		       (swap! *top-5*
   1.160 +			      (fn [v] (vec (sort-by #(get @*data* % 0) v))))))))))]
   1.161 +
   1.162 +      (dorun (chuncked-pmap 800 top-ten-helper (read-lines file)))
   1.163 +      (swap! *top-5*
   1.164 +	     (fn [v] (vec (sort-by #(get @*data* % 0) v))))
   1.165 +      (reverse @*top-5*))))
   1.166 +"Elapsed time: 10735.897831 msecs" ;; with chuncked-pmap
   1.167 +
   1.168 +	
   1.169 +    
   1.170 +  
   1.171 +(if (command-line?)
   1.172 +  (do
   1.173 +    (dorun (map println (top-five-3 (file-str (first *command-line-args*)))))
   1.174 +    (System/exit 0)))