annotate src/rlm/pikasemechu.clj @ 0:78a630e650d2

initial import
author Robert McIntyre <rlm@mit.edu>
date Tue, 18 Oct 2011 00:57:08 -0700
(ns rlm.pikasemechu
  (:use rlm.ns-rlm))
(rlm.ns-rlm/ns-clone rlm.light-base)
(use 'clojure.java.io)
(use 'rlm.sanitize-file)
(import org.htmlcleaner.HtmlCleaner)
(import org.htmlcleaner.TagNode)
(import java.io.File)
(import java.net.URL)

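;; Note: file-str (which expands the leading ~ to the home directory) and
;; defn-memo (a memoizing defn) are assumed to be provided by the cloned
;; rlm.light-base namespace; input-stream and copy come from clojure.java.io.
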
(def first-comic (URL. "http://pikasemechu.smackjeeves.com/comics/165598/cover/"))
(def base-directory (file-str "~/pikasemechu"))

(defn tags-by-name
  "return a seq of every `element` tag beneath node, searching recursively"
  [#^TagNode node #^String element]
  (seq (.getElementListByName node element true)))

(defn-memo parse
  "parse a web page using HtmlCleaner"
  [#^URL url]
  (.clean (HtmlCleaner.) (input-stream url)))

(defn attributes
  "get a hash map of the attributes of an element"
  [#^TagNode node]
  (into {} (.getAttributes node)))

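;; A minimal sketch of how these three helpers compose (the URL is the real
;; one defined above; the bindings are illustrative):
(comment
  (let [page  (parse first-comic)           ; TagNode root of the document
        links (tags-by-name page "a")       ; every <a> element in the page
        hrefs (map #((attributes %) "href") links)]
    ;; hrefs is a seq of href strings, one per link
    (take 3 hrefs)))
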
(defn next-url
  "get the next url to visit from the current page we are visiting"
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))
        next-page (URL. ((attributes next-node) "href"))]
    ;; return nil if this is the last page
    (if (= next-page url) nil next-page)))

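;; Because next-url returns nil on the last page, the whole archive can be
;; walked lazily with iterate/take-while -- the same idiom download-comic
;; uses below:
(comment
  (take 5 (take-while (comp not nil?) (iterate next-url first-comic))))
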
(def poke-first-comic
  (URL. "http://www.mangareader.net/458-28199-1/pokemon-adventures/chapter-1.html"))

(def poke-base-directory (file-str "~/poke"))

(defn poke-next-url
  "get the next url to visit from the current page we are visiting"
  [#^URL url]
  (let [;; extract all links
        links (tags-by-name (parse url) "a")
        ;; extract the "Next" link
        next-node (first (filter #(re-matches #".*Next.*" (.getText %)) links))
        ;; mangareader uses relative hrefs, so prepend the site root
        next-page (URL. (str "http://www.mangareader.net"
                             ((attributes next-node) "href")))]
    ;; return nil if this is the last page
    (if (= next-page url) nil next-page)))

(defn poke-comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab the div that holds the comic image
        container (first (filter (fn [node] (= ((attributes node) "id")
                                               "imgholder"))
                                 (tags-by-name node "div")))]
    ;; extract the comic
    (URL.
     ((attributes
       (last (.getChildren
              (first
               (tags-by-name container "a"))))) "src"))))

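;; The walk above relies on mangareader's markup: the <div id="imgholder">
;; is assumed to wrap a single <a> whose last child is the <img> holding the
;; scan, so its "src" attribute points at the image to download.
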
(def poke-comic-name
  ;; a closure over a counter, so it can be passed directly as the
  ;; comic-name argument of download-comic below
  (let [a (atom 0)]
    (fn [url]
      (File. poke-base-directory (str (swap! a inc) ".jpg")))))

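;; poke-comic-name is stateful: the atom it closes over numbers pages in
;; download order, so successive calls (with any url) yield 1.jpg, 2.jpg, ...
(comment
  (poke-comic-name poke-first-comic)  ; => ~/poke/1.jpg
  (poke-comic-name poke-first-comic)) ; => ~/poke/2.jpg
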
(defn comic-img
  "get the file which we want to download from the webpage"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab all img tags
        images (map attributes (tags-by-name node "img"))
        ;; extract the comic
        comic (first (filter #(= "comic_image" (% "id")) images))]
    (URL. (comic "src"))))

(defn comic-name
  "get the comic's name"
  [#^URL url]
  (let [;; parse the HTML
        node (parse url)
        ;; grab the select box
        select (first (tags-by-name node "select"))
        ;; the selected option's text is the name of the comic
        selected (first (filter #(= "selected" ((attributes %) "selected"))
                                (tags-by-name select "option")))]
    (File. base-directory
           (sanitize-file-name (str (.getText selected) ".jpg")))))

(defn download-url
  [#^URL url #^File file]
  (if (.exists file)
    (println (.getCanonicalPath file) "already exists.")
    (do
      (println "Copying" (.getPath url) "to" (.getCanonicalPath file))
      (copy (input-stream url) file))))

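;; Usage sketch (the url/file pair here is hypothetical):
(comment
  (download-url (URL. "http://example.com/1.jpg")
                (File. poke-base-directory "1.jpg")))
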
(defn download-comic
  [first-comic next-url comic-img comic-name n]
  ;; create a lazy sequence of web urls by chaining next-url
  (let [urls (take-while (comp not nil?) (iterate next-url first-comic))]
    (dorun (take n (map download-url
                        (map comic-img urls)
                        (map comic-name urls))))))

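;; Tying everything together: fetch the first five pages of each archive
;; with the site-specific helpers defined above (n bounds the downloads):
(comment
  (download-comic first-comic next-url comic-img comic-name 5)
  (download-comic poke-first-comic poke-next-url
                  poke-comic-img poke-comic-name 5))
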
;; first stab at making it more general
(defn download-comic-archive
  [comic-img comic-name comics]
  (dorun (map download-url
              (map comic-img comics)
              (map comic-name comics))))

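;; The generalized version takes an explicit, finite seq of pages instead of
;; deriving one from a next-url function:
(comment
  (download-comic-archive
   comic-img comic-name
   (take 5 (take-while (comp not nil?) (iterate next-url first-comic)))))
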
;; (defn download-comic-chain
;;   [comic-img comic-name next-url first-comic]
;;   (let [comics (take-while (comp not nil?) (iterate next-url first-comic))]
;;     (download-comic-archive comic-img comic-name comics)))

(defn tests []
  (println (comic-name first-comic))
  (println (comic-img first-comic))
  (println (next-url first-comic)))