Mercurial > rlm
diff src/rlm/pikasemechu.clj @ 0:78a630e650d2
initial import
author:   Robert McIntyre <rlm@mit.edu>
date:     Tue, 18 Oct 2011 00:57:08 -0700
parents:  (none)
children: (none)
line wrap: on
line diff
;; src/rlm/pikasemechu.clj — scrape web comics (smackjeeves, mangareader)
;; into local directories.
(ns rlm.pikasemechu
  (:use rlm.ns-rlm))
(rlm.ns-rlm/ns-clone rlm.light-base)
(use 'clojure.java.io)
(use 'rlm.sanitize-file)
(import org.htmlcleaner.HtmlCleaner)
(import org.htmlcleaner.TagNode)
(import java.io.File)
(import java.net.URL)

;; First page of the smackjeeves comic's archive.
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
;; Local directory that downloaded smackjeeves pages are saved under.
(def base-directory (file-str "~/pikasemechu"))

(defn tags-by-name
  "Seq of every element named `element` anywhere beneath `node`
   (recursive search)."
  [^TagNode node ^String element]
  (seq (.getElementListByName node element true)))

(defn-memo parse
  "Fetch and parse the page at `url` with HtmlCleaner.
   Memoized, so each URL is fetched at most once per session."
  [^URL url]
  (.clean (HtmlCleaner.) (input-stream url)))

(defn attributes
  "The attributes of `node` as a Clojure map of string -> string."
  [^TagNode node]
  (into {} (.getAttributes node)))

(defn next-url
  "Follow the \"Next\" link on the page at `url`.
   Returns nil on the last page, which links \"Next\" back to itself."
  [^URL url]
  (let [anchors (tags-by-name (parse url) "a")
        nxt     (first (filter #(re-matches #".*Next.*" (.getText %)) anchors))
        target  (URL. ((attributes nxt) "href"))]
    (when-not (= target url) target)))

;; First page of the mangareader comic.
(def poke-first-comic
  (URL.
   "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

;; Local directory that downloaded mangareader pages are saved under.
(def poke-base-directory (file-str "~/poke"))

(defn poke-next-url
  "Like next-url, but for mangareader, whose \"Next\" hrefs are
   site-relative and must be prefixed with the site root."
  [^URL url]
  (let [anchors (tags-by-name (parse url) "a")
        nxt     (first (filter #(re-matches #".*Next.*" (.getText %)) anchors))
        target  (URL. (str "http://www.mangareader.net"
                           ((attributes nxt) "href")))]
    (when-not (= target url) target)))

(defn poke-comic-img
  "URL of the comic image on a mangareader page: the last child of the
   first anchor inside the div whose id is \"imgholder\"."
  [^URL url]
  (let [tree   (parse url)
        holder (first (filter (fn [div] (= ((attributes div) "id")
                                           "imgholder"))
                              (tags-by-name tree "div")))]
    (URL.
     ((attributes
       (last (.getChildren
              (first (tags-by-name holder "a"))))) "src"))))

(defn poke-comic-name
  "Return a stateful naming fn that ignores its url argument and yields
   1.jpg, 2.jpg, ... under poke-base-directory on successive calls."
  []
  (let [counter (atom 0)]
    (fn [url]
      (File. poke-base-directory (str (swap! counter inc) ".jpg")))))

(defn comic-img
  "URL of the comic image on a smackjeeves page: the img element whose
   id attribute is \"comic_image\"."
  [^URL url]
  (let [tree      (parse url)
        img-attrs (map attributes (tags-by-name tree "img"))
        the-comic (first (filter #(= "comic_image" (% "id")) img-attrs))]
    (URL. (the-comic "src"))))

(defn comic-name
  "File (under base-directory) named after the page's title, read from
   the selected option of the page's navigation box."
  [^URL url]
  (let [tree   (parse url)
        ;; NOTE(review): `select` is bound but never used in the original;
        ;; kept for an identical evaluation trace.
        select (first (tags-by-name tree "select"))
        choice (first (filter #(= "selected" ((attributes %) "selected"))
                              (tags-by-name tree "option")))]
    (File. base-directory
           (sanitize-file-name (str (.getText choice) ".jpg")))))

(defn download-url
  "Copy the contents of `url` into `file`, unless `file` already exists."
  [^URL url ^File file]
  (if-not (.exists file)
    (do
      (println "Copying " (.getPath url) " to " (.getCanonicalPath file))
      (copy (input-stream url) file))
    (println (.getCanonicalPath file) " already exists.")))

(defn download-comic
  "Starting at `first-comic`, chain pages via `next-url` (which yields
   nil after the last page) and download the first `n` comic images,
   naming each with `comic-name`."
  ([first-comic next-url comic-img comic-name n]
     ;; lazy, nil-terminated chain of page URLs
     (let [pages (take-while #(not (nil? %)) (iterate next-url first-comic))]
       (dorun (take n (map download-url
                           (map comic-img pages)
                           (map comic-name pages)))))))

;; first stab at making it more general
(defn download-comic-archive
  "Download every page in `comics` (a seq of page URLs), extracting the
   image with `comic-img` and naming the file with `comic-name`."
  [comic-img comic-name comics]
  (doseq [page comics]
    (download-url (comic-img page) (comic-name page))))

;; (defn download-comic-chain
;;   [comic-img comic-name next-url first-comic]
;;   [comics (take-while (comp not nil?) (iterate next-url first-comic))]
;;   (download-comics comics comic-img comic-name))

(defn tests
  "Smoke test against the live site: print the name, image URL, and
   next page for the first comic."
  []
  (println (comic-name first-comic))
  (println (comic-img first-comic))
  (println (next-url first-comic)))